├── test ├── logger_test.cc ├── thread_test.cc ├── slice_test.cc ├── CMakeLists.txt ├── histogram.h ├── block_test.cc ├── testutil.h ├── testutil.cc ├── mutex_test.cc ├── db_test.cc ├── file_test.cc ├── node_test.cc ├── random.h ├── skiplist_test.cc ├── rwlock_test.cc ├── table_test.cc ├── dbimpl_test.cc ├── histogram.cc └── db_bench.cc ├── db ├── comparator.h ├── options.h ├── db_impl.h └── db_impl.cc ├── include └── yodb │ └── db.h ├── sys ├── rwlock.h ├── condition.h ├── thread.h ├── mutex.h ├── thread.cc └── rwlock.cc ├── util ├── timestamp.h ├── timestamp.cc ├── arena.h ├── log_stream.h ├── arena.cc ├── logger.cc ├── log_stream.cc ├── logger.h ├── slice.h ├── block.cc └── block.h ├── CMakeLists.txt ├── tree ├── buffer_tree.h ├── msg.h ├── buffer_tree.cc ├── msg.cc ├── node.h ├── skiplist.h └── node.cc ├── fs ├── env.h ├── file.h ├── table.h ├── file.cc └── table.cc ├── cache ├── cache.h └── cache.cc └── README.md /test/logger_test.cc: -------------------------------------------------------------------------------- 1 | #include "util/logger.h" 2 | 3 | int main() 4 | { 5 | LOG_INFO << "here we are"; 6 | LOG_TRACE << "trace logger"; 7 | } 8 | -------------------------------------------------------------------------------- /test/thread_test.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include "sys/thread.h" 3 | 4 | using namespace yodb; 5 | 6 | int count1; 7 | static void thr_fn() 8 | { 9 | count1 += 1; 10 | } 11 | 12 | TEST(Thread, run) 13 | { 14 | Thread thr(thr_fn); 15 | count1 = 0; 16 | thr.run(); 17 | thr.join(); 18 | EXPECT_EQ(count1, 1); 19 | } 20 | -------------------------------------------------------------------------------- /db/comparator.h: -------------------------------------------------------------------------------- 1 | #ifndef _YODB_COMPARATOR_H_ 2 | #define _YODB_COMPARATOR_H_ 3 | 4 | #include "util/slice.h" 5 | 6 | namespace yodb { 7 | 8 | class Comparator { 9 | public: 10 | virtual int 
compare(const Slice& a, const Slice& b) const = 0; 11 | }; 12 | 13 | class BytewiseComparator : public Comparator { 14 | public: 15 | int compare(const Slice& a, const Slice& b) const { return a.compare(b); } 16 | }; 17 | 18 | } // namespace yodb 19 | 20 | #endif // _YODB_COMPARATOR_H_ 21 | -------------------------------------------------------------------------------- /include/yodb/db.h: -------------------------------------------------------------------------------- 1 | #ifndef _YODB_DB_H_ 2 | #define _YODB_DB_H_ 3 | 4 | #include "db/comparator.h" 5 | #include "db/options.h" 6 | #include "fs/env.h" 7 | #include "util/slice.h" 8 | 9 | namespace yodb { 10 | 11 | class DB { 12 | public: 13 | static DB* open(const std::string& dbname, const Options& opts); 14 | 15 | virtual bool put(Slice key, Slice value) = 0; 16 | virtual bool get(Slice key, Slice& value) = 0; 17 | virtual bool del(Slice key) = 0; 18 | }; 19 | 20 | } // namespace yodb 21 | 22 | #endif // _YODB_DB_H_ 23 | -------------------------------------------------------------------------------- /test/slice_test.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include "util/slice.h" 3 | #include "util/logger.h" 4 | #include 5 | 6 | using namespace yodb; 7 | 8 | TEST(Slice, constructor) 9 | { 10 | EXPECT_EQ(Slice().size(), 0U); 11 | EXPECT_LT(Slice(), Slice("a")); 12 | EXPECT_LT(Slice("a"), Slice("ab")); 13 | EXPECT_TRUE(Slice("a") == Slice("ab", 1)); 14 | EXPECT_EQ(Slice(std::string("ab")), Slice("ab")); 15 | } 16 | 17 | TEST(Slice, empty) 18 | { 19 | EXPECT_TRUE(Slice().empty()); 20 | EXPECT_TRUE(Slice("").empty()); 21 | EXPECT_FALSE(Slice("a").empty()); 22 | } 23 | 24 | TEST(Slice, clear) 25 | { 26 | Slice ab("ab"); 27 | ab.clear(); 28 | EXPECT_EQ(ab, Slice()); 29 | } 30 | -------------------------------------------------------------------------------- /db/options.h: -------------------------------------------------------------------------------- 1 | 
#ifndef _YODB_OPTIONS_H_ 2 | #define _YODB_OPTIONS_H_ 3 | 4 | #include "db/comparator.h" 5 | #include "fs/env.h" 6 | 7 | namespace yodb { 8 | 9 | class Options { 10 | public: 11 | Options() 12 | { 13 | comparator = NULL; 14 | env = NULL; 15 | max_node_child_number = 16; 16 | max_node_msg_count = 10240; 17 | cache_limited_memory = 1 << 28; 18 | cache_dirty_node_expire = 1; 19 | } 20 | Comparator* comparator; 21 | Env* env; 22 | 23 | size_t max_node_child_number; 24 | size_t max_node_msg_count; 25 | size_t cache_limited_memory; 26 | size_t cache_dirty_node_expire; 27 | 28 | }; 29 | 30 | } // namespace yodb 31 | 32 | #endif // _YODB_OPTIONS_H_ 33 | -------------------------------------------------------------------------------- /sys/rwlock.h: -------------------------------------------------------------------------------- 1 | #ifndef _YODB_RWLOCK_H_ 2 | #define _YODB_RWLOCK_H_ 3 | 4 | #include "sys/mutex.h" 5 | #include "sys/condition.h" 6 | #include 7 | 8 | namespace yodb { 9 | 10 | class RWLock : boost::noncopyable { 11 | public: 12 | RWLock(); 13 | 14 | bool try_read_lock(); 15 | void read_lock(); 16 | void read_unlock(); 17 | 18 | bool try_write_lock(); 19 | void write_lock(); 20 | void write_unlock(); 21 | 22 | private: 23 | Mutex mutex_; 24 | CondVar cond_wait_read_; 25 | CondVar cond_wait_write_; 26 | 27 | size_t readers_; 28 | size_t writers_; 29 | size_t want_readers_; 30 | size_t want_writers_; 31 | }; 32 | 33 | } // namespace yodb 34 | 35 | #endif // _YODB_RWLOCK_H_ 36 | -------------------------------------------------------------------------------- /sys/condition.h: -------------------------------------------------------------------------------- 1 | #ifndef _YODB_COND_H_ 2 | #define _YODB_COND_H_ 3 | 4 | #include "sys/mutex.h" 5 | #include 6 | #include 7 | 8 | namespace yodb { 9 | 10 | class CondVar : boost::noncopyable { 11 | public: 12 | CondVar(Mutex& mutex) 13 | : mutex_(mutex) 14 | { 15 | pthread_cond_init(&cond_, NULL); 16 | } 17 | 18 | ~CondVar() { 
pthread_cond_destroy(&cond_); } 19 | 20 | void wait() { pthread_cond_wait(&cond_, mutex_.mutex()); } 21 | void notify() { pthread_cond_signal(&cond_); } 22 | void notify_all() { pthread_cond_broadcast(&cond_); } 23 | 24 | private: 25 | Mutex& mutex_; 26 | pthread_cond_t cond_; 27 | }; 28 | 29 | } // namespace yodb 30 | 31 | #endif // _YODB_COND_H_ 32 | -------------------------------------------------------------------------------- /db/db_impl.h: -------------------------------------------------------------------------------- 1 | #ifndef _YODB_DB_IMPL_H_ 2 | #define _YODB_DB_IMPL_H_ 3 | 4 | #include "yodb/db.h" 5 | #include "db/options.h" 6 | #include "tree/buffer_tree.h" 7 | 8 | namespace yodb { 9 | 10 | class DBImpl : public DB { 11 | public: 12 | DBImpl(const std::string& name, const Options& opts) 13 | : name_(name), opts_(opts), file_(NULL), 14 | table_(NULL), cache_(NULL), tree_(NULL) 15 | { 16 | } 17 | 18 | ~DBImpl(); 19 | 20 | bool init(); 21 | 22 | bool put(Slice key, Slice value); 23 | bool del(Slice key); 24 | bool get(Slice key, Slice& value); 25 | 26 | private: 27 | std::string name_; 28 | Options opts_; 29 | 30 | AIOFile* file_; 31 | Table* table_; 32 | Cache* cache_; 33 | BufferTree* tree_; 34 | }; 35 | 36 | } // namespace yodb 37 | 38 | #endif // _YODB_DB_IMPL_H_ 39 | -------------------------------------------------------------------------------- /test/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(GSOURCE 2 | thread_test.cc 3 | # node_test.cc 4 | slice_test.cc 5 | block_test.cc 6 | ) 7 | 8 | # add_executable(test ${GSOURCE}) 9 | # target_link_libraries(test yodb gtest_main gtest) 10 | # 11 | # add_executable(logger logger_test.cc) 12 | # target_link_libraries(logger yodb) 13 | # 14 | # add_executable(mutex mutex_test.cc) 15 | # target_link_libraries(mutex yodb) 16 | # 17 | # add_executable(rwlock rwlock_test.cc) 18 | # target_link_libraries(rwlock yodb) 19 | # 20 | add_executable(dbimpl 
dbimpl_test.cc) 21 | target_link_libraries(dbimpl yodb) 22 | # 23 | # add_executable(file file_test.cc) 24 | # target_link_libraries(file yodb) 25 | # 26 | # add_executable(table table_test.cc) 27 | # target_link_libraries(table yodb) 28 | 29 | add_executable(db db_test.cc) 30 | target_link_libraries(db yodb) 31 | 32 | add_executable(skiplist skiplist_test.cc) 33 | target_link_libraries(skiplist yodb) 34 | 35 | add_executable(benchmark db_bench.cc histogram.cc testutil.cc) 36 | target_link_libraries(benchmark yodb) 37 | -------------------------------------------------------------------------------- /util/timestamp.h: -------------------------------------------------------------------------------- 1 | #ifndef _YODB_TIMESTAMP_H_ 2 | #define _YODB_TIMESTAMP_H_ 3 | 4 | #include 5 | #include 6 | 7 | namespace yodb { 8 | 9 | class Timestamp { 10 | public: 11 | Timestamp(); 12 | Timestamp(int64_t microsec_since_epoch); 13 | 14 | std::string to_string(); 15 | static Timestamp now(); 16 | int64_t microseconds() const; 17 | 18 | static const int kMicroPerSecond = 1000 * 1000; 19 | private: 20 | int64_t microseconds_; 21 | }; 22 | 23 | inline double time_interval(const Timestamp& x, const Timestamp& y) 24 | { 25 | int64_t diff_microsec = x.microseconds() - y.microseconds(); 26 | return static_cast(diff_microsec) / Timestamp::kMicroPerSecond; 27 | } 28 | 29 | inline bool operator<(const Timestamp& x, const Timestamp& y) 30 | { 31 | return x.microseconds() < y.microseconds(); 32 | } 33 | 34 | inline bool operator>(const Timestamp& x, const Timestamp& y) 35 | { 36 | return x.microseconds() > y.microseconds(); 37 | } 38 | 39 | } // namespace yodb 40 | 41 | #endif // _YODB_TIMESTAMP_H_ 42 | -------------------------------------------------------------------------------- /util/timestamp.cc: -------------------------------------------------------------------------------- 1 | #include "util/timestamp.h" 2 | #include 3 | #include 4 | #define __STDC_FORMAT_MACROS 5 | #include 6 | 
#undef __STDC_FORMAT_MACROS 7 | #include 8 | 9 | using namespace yodb; 10 | 11 | Timestamp::Timestamp() 12 | : microseconds_(0) 13 | { 14 | } 15 | 16 | Timestamp::Timestamp(int64_t microsec_since_epoch) 17 | : microseconds_(microsec_since_epoch) 18 | { 19 | } 20 | 21 | Timestamp Timestamp::now() 22 | { 23 | struct timeval tv; 24 | int64_t seconds; 25 | 26 | gettimeofday(&tv, NULL); 27 | seconds = tv.tv_sec; 28 | 29 | return Timestamp(seconds * kMicroPerSecond + tv.tv_usec); 30 | } 31 | 32 | int64_t Timestamp::microseconds() const 33 | { 34 | return microseconds_; 35 | } 36 | 37 | std::string Timestamp::to_string() 38 | { 39 | char time[32]; 40 | memset(time, 0, sizeof(time)); 41 | 42 | int64_t sec = microseconds_ / kMicroPerSecond; 43 | int64_t microsec = microseconds_ % kMicroPerSecond; 44 | 45 | snprintf(time, sizeof(time), "%" PRId64 ".%06" PRId64 "", sec, microsec); 46 | 47 | return time; 48 | } 49 | -------------------------------------------------------------------------------- /test/histogram.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2011 The LevelDB Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style license that can be 3 | // found in the LICENSE file. See the AUTHORS file for names of contributors. 
4 | 5 | // This file is copied from LevelDB and modified a little 6 | // to add LevelDB style benchmark 7 | 8 | #ifndef _BENCH_HISTOGRAM_H_ 9 | #define _BENCH_HISTOGRAM_H_ 10 | 11 | #include 12 | 13 | namespace yodb { 14 | 15 | class Histogram { 16 | public: 17 | Histogram() { } 18 | ~Histogram() { } 19 | 20 | void Clear(); 21 | void Add(double value); 22 | void Merge(const Histogram& other); 23 | 24 | std::string ToString() const; 25 | 26 | private: 27 | double min_; 28 | double max_; 29 | double num_; 30 | double sum_; 31 | double sum_squares_; 32 | 33 | enum { kNumBuckets = 154 }; 34 | static const double kBucketLimit[kNumBuckets]; 35 | double buckets_[kNumBuckets]; 36 | 37 | double Median() const; 38 | double Percentile(double p) const; 39 | double Average() const; 40 | double StandardDeviation() const; 41 | }; 42 | 43 | } // namespace yodb 44 | #endif 45 | -------------------------------------------------------------------------------- /sys/thread.h: -------------------------------------------------------------------------------- 1 | #ifndef _YODB_THREAD_H_ 2 | #define _YODB_THREAD_H_ 3 | 4 | #include 5 | #include 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | using namespace std; 13 | 14 | namespace yodb { 15 | 16 | class Thread : boost::noncopyable { 17 | public: 18 | typedef boost::function Function; 19 | 20 | explicit Thread(const Function&, const string& name = string()); 21 | ~Thread(); 22 | 23 | void run(); 24 | void join(); 25 | 26 | pid_t get_tid() 27 | { 28 | return static_cast(*tid_); 29 | } 30 | 31 | bool is_main_thread() 32 | { 33 | return get_tid() == getpid(); 34 | } 35 | private: 36 | bool alive_; 37 | bool joined_; 38 | pthread_t tidp_; 39 | string name_; 40 | boost::shared_ptr tid_; 41 | Function thread_fn_; 42 | }; 43 | 44 | namespace current_thread { 45 | 46 | extern __thread const char* thread_name; 47 | extern __thread pid_t cached_tid; 48 | extern pid_t get_tid(); 49 | 50 | } // namespace current_thread 51 | 52 | } 
// namespace yodb 53 | 54 | #endif // _YODB_THREAD_H_ 55 | -------------------------------------------------------------------------------- /util/arena.h: -------------------------------------------------------------------------------- 1 | #ifndef _YODB_ARENA_H_ 2 | #define _YODB_ARENA_H_ 3 | 4 | #include "sys/mutex.h" 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | namespace yodb { 13 | 14 | const int kBlockSize = 4096; 15 | const int kAlignedSize = 8; 16 | 17 | class Arena : boost::noncopyable { 18 | public: 19 | Arena(); 20 | ~Arena(); 21 | 22 | // Return a newly allocated memory block. 23 | char* alloc(size_t bytes); 24 | 25 | // Return a newly allocated memory block with address alligned by kAlignedSize. 26 | char* alloc_aligned(size_t bytes); 27 | 28 | // Estimate of the total memory usage of data allocated by arena. 29 | size_t usage() const; 30 | 31 | // Clear all the memory allocated, reset all the class members. 32 | void clear(); 33 | 34 | private: 35 | char* alloc_fallback(size_t bytes); 36 | char* alloc_new_block(size_t block_bytes); 37 | 38 | char* alloc_ptr_; 39 | size_t remaining_; 40 | 41 | mutable Mutex mutex_; 42 | 43 | std::vector blocks_; 44 | size_t blocks_size_; 45 | }; 46 | 47 | } // namespace yodb 48 | 49 | #endif // _YODB_ARENA_H_ 50 | -------------------------------------------------------------------------------- /test/block_test.cc: -------------------------------------------------------------------------------- 1 | #include "util/block.h" 2 | #include 3 | #include 4 | 5 | using namespace yodb; 6 | 7 | TEST(Block, RW) 8 | { 9 | char buf[1024]; 10 | Block block(Slice(buf, 1024)); 11 | BlockWriter writer(block); 12 | 13 | bool v = true; 14 | uint8_t u8 = 1; 15 | uint16_t u16 = 1000; 16 | uint32_t u32 = 100000; 17 | uint64_t u64 = 100000000; 18 | 19 | writer << v << u8 << u16 << u32 << u64; 20 | EXPECT_TRUE(writer.ok()); 21 | 22 | Slice s1("abcedf"); 23 | Slice s2("cdefgh"); 24 | Slice empty1; 25 | 26 | 
writer << empty1 << s1 << s2; 27 | EXPECT_TRUE(writer.ok()); 28 | 29 | bool rv; 30 | uint8_t ru8; 31 | uint16_t ru16; 32 | uint32_t ru32; 33 | uint64_t ru64; 34 | 35 | BlockReader reader(block); 36 | 37 | reader >> rv >> ru8 >> ru16 >> ru32 >> ru64; 38 | EXPECT_TRUE(reader.ok()); 39 | EXPECT_EQ(rv, v); 40 | EXPECT_EQ(ru8, u8); 41 | EXPECT_EQ(ru16, u16); 42 | EXPECT_EQ(ru32, u32); 43 | EXPECT_EQ(ru64, u64); 44 | 45 | Slice s3, s4, empty2; 46 | reader >> empty2 >> s3 >> s4; 47 | EXPECT_EQ(s1, s3); 48 | EXPECT_EQ(s2, s4); 49 | EXPECT_EQ(empty1, empty2); 50 | } 51 | -------------------------------------------------------------------------------- /test/testutil.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2011 The LevelDB Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style license that can be 3 | // found in the LICENSE file. See the AUTHORS file for names of contributors. 4 | 5 | // This file is copied from LevelDB and modifed a little 6 | // to add LevelDB style benchmark 7 | 8 | #ifndef _BENCH_TESTUTIL_H_ 9 | #define _BENCH_TESTUTIL_H_ 10 | 11 | #include 12 | #include "util/slice.h" 13 | #include "random.h" 14 | 15 | namespace yodb { 16 | 17 | // Store in *dst a random string of length "len" and return a Slice that 18 | // references the generated data. 19 | extern Slice RandomSlice(Random* rnd, size_t len, std::string* dst); 20 | 21 | // Return a random key with the specified length that may contain interesting 22 | // characters (e.g. \x00, \xff, etc.). 23 | extern std::string RandomKey(Random* rnd, size_t len); 24 | 25 | // Store in *dst a string of length "len" that will compress to 26 | // "N*compressed_fraction" bytes and return a Slice that references 27 | // the generated data. 
28 | extern Slice CompressibleSlice(Random* rnd, double compressed_fraction, 29 | size_t len, std::string* dst); 30 | 31 | } 32 | 33 | #endif 34 | -------------------------------------------------------------------------------- /sys/mutex.h: -------------------------------------------------------------------------------- 1 | #ifndef _YODB_MUTEX_H_ 2 | #define _YODB_MUTEX_H_ 3 | 4 | #include "sys/thread.h" 5 | #include 6 | #include 7 | 8 | namespace yodb { 9 | 10 | class Mutex : boost::noncopyable { 11 | public: 12 | explicit Mutex() 13 | : holder_(0) 14 | { 15 | assert(holder_ == 0); 16 | pthread_mutex_init(&mutex_, NULL); 17 | } 18 | 19 | ~Mutex() 20 | { 21 | pthread_mutex_destroy(&mutex_); 22 | } 23 | 24 | void lock() 25 | { 26 | pthread_mutex_lock(&mutex_); 27 | holder_ = current_thread::get_tid(); 28 | } 29 | 30 | void unlock() 31 | { 32 | holder_ = 0; 33 | pthread_mutex_unlock(&mutex_); 34 | } 35 | 36 | bool is_locked_by_this_thread() const 37 | { 38 | return holder_ == current_thread::get_tid(); 39 | } 40 | 41 | pthread_mutex_t* mutex() { return &mutex_; } 42 | private: 43 | pid_t holder_; 44 | pthread_mutex_t mutex_; 45 | }; 46 | 47 | class ScopedMutex : boost::noncopyable { 48 | public: 49 | explicit ScopedMutex(Mutex& mutex) 50 | : mutex_(mutex) 51 | { 52 | mutex_.lock(); 53 | } 54 | 55 | ~ScopedMutex() 56 | { 57 | mutex_.unlock(); 58 | } 59 | 60 | private: 61 | Mutex& mutex_; 62 | }; 63 | 64 | } // namespace yodb 65 | 66 | #endif // _YODB_MUTEX_H_ 67 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.6) 2 | 3 | project(yodb CXX) 4 | 5 | if(NOT CMAKE_BUILD_TYPE) 6 | set(CMAKE_BUILD_TYPE "Debug") 7 | endif() 8 | 9 | set(CXX_FLAGS 10 | -g 11 | -Wall 12 | -march=native 13 | -rdynamic 14 | -std=c++0x 15 | ) 16 | 17 | string(REPLACE ";" " " CMAKE_CXX_FLAGS "${CXX_FLAGS}") 18 | 19 | 
set(CMAKE_CXX_COMPILER "g++") 20 | set(CMAKE_CXX_FLAGS_DEBUG "-O2") 21 | 22 | string(TOUPPER ${CMAKE_BUILD_TYPE} BUILD_TYPE) 23 | message(STATUS "CXX_FLAGS = " ${CMAKE_CXX_FLAGS} " " ${CMAKE_CXX_FLAGS_${BUILD_TYPE}}) 24 | 25 | find_package(Boost REQUIRED) 26 | find_package(GTest REQUIRED) 27 | 28 | include_directories(${Boost_INCLUDE_DIRS}) 29 | include_directories(${PROJECT_SOURCE_DIR}) 30 | include_directories(${PROJECT_SOURCE_DIR}/include) 31 | 32 | set(LIBRARY_OUTPUT_PATH ${PROJECT_SOURCE_DIR}/lib) 33 | set(EXECUTABLE_OUTPUT_PATH ${PROJECT_SOURCE_DIR}/bin) 34 | 35 | file(GLOB_RECURSE SOURCE 36 | ${CMAKE_CURRENT_SOURCE_DIR} 37 | ${CMAKE_CURRENT_SOURCE_DIR}/db/*.cc 38 | ${CMAKE_CURRENT_SOURCE_DIR}/sys/*.cc 39 | ${CMAKE_CURRENT_SOURCE_DIR}/fs/*.cc 40 | ${CMAKE_CURRENT_SOURCE_DIR}/util/*.cc 41 | ${CMAKE_CURRENT_SOURCE_DIR}/tree/*.cc 42 | ${CMAKE_CURRENT_SOURCE_DIR}/cache/*.cc 43 | ) 44 | 45 | add_library(yodb ${SOURCE}) 46 | target_link_libraries(yodb pthread rt aio) 47 | set_target_properties(yodb PROPERTIES OUTPUT_NAME yodb) 48 | install(TARGETS yodb DESTINATION lib) 49 | 50 | add_subdirectory(test) 51 | -------------------------------------------------------------------------------- /tree/buffer_tree.h: -------------------------------------------------------------------------------- 1 | #ifndef _YODB_BUFFER_TREE_H_ 2 | #define _YODB_BUFFER_TREE_H_ 3 | 4 | #include "db/options.h" 5 | #include "fs/table.h" 6 | #include "cache/cache.h" 7 | #include "util/slice.h" 8 | #include "tree/node.h" 9 | #include "sys/mutex.h" 10 | 11 | #include 12 | #include 13 | 14 | namespace yodb { 15 | 16 | class Node; 17 | 18 | class BufferTree { 19 | public: 20 | BufferTree(const std::string name, Options& opts, Cache* cache, Table* table); 21 | ~BufferTree(); 22 | 23 | bool init(); 24 | void grow_up(Node* root); 25 | 26 | bool put(const Slice& key, const Slice& value); 27 | bool del(const Slice& key); 28 | bool get(const Slice& key, Slice& value); 29 | 30 | // Create a newly node 
without known the nid. 31 | Node* create_node(); 32 | 33 | // This kind of create_node with nid is usually called by cache, 34 | // typically because nid have already exists, so no need to produce a new one. 35 | Node* create_node(nid_t nid); 36 | 37 | Node* get_node_by_nid(nid_t nid); 38 | void lock_path(const Slice& key, std::vector& path); 39 | 40 | private: 41 | friend class Node; 42 | 43 | std::string name_; 44 | Options options_; 45 | Cache* cache_; 46 | Table* table_; 47 | Node* root_; 48 | nid_t node_count_; 49 | std::map node_map_; 50 | Mutex mutex_; 51 | Mutex mutex_lock_path_; 52 | }; 53 | 54 | } // namespace yodb 55 | 56 | #endif // _YODB_BUFFER_TREE_H_ 57 | -------------------------------------------------------------------------------- /fs/env.h: -------------------------------------------------------------------------------- 1 | #ifndef _YODB_ENV_H_ 2 | #define _YODB_ENV_H_ 3 | 4 | #include "fs/file.h" 5 | #include 6 | #include 7 | #include 8 | 9 | namespace yodb { 10 | 11 | class Env : boost::noncopyable { 12 | public: 13 | Env(const std::string& dirname) 14 | : dirname_(dirname) 15 | { 16 | } 17 | 18 | bool file_exists(const std::string& filename) 19 | { 20 | struct stat st; 21 | memset(&st, 0, sizeof(st)); 22 | 23 | if (stat(full_path(filename).c_str(), &st) == -1) { 24 | LOG_ERROR << "stat file: " << filename << ", error: " << strerror(errno); 25 | return false; 26 | } 27 | 28 | return S_ISREG(st.st_mode); 29 | } 30 | 31 | size_t file_length(const std::string& filename) 32 | { 33 | struct stat st; 34 | memset(&st, 0, sizeof(st)); 35 | 36 | if (stat(full_path(filename).c_str(), &st) == -1) { 37 | LOG_ERROR << "stat file: " << filename << " error" << strerror(errno); 38 | return 0; 39 | } 40 | 41 | return (size_t)(st.st_size); 42 | } 43 | 44 | AIOFile* open_aio_file(const std::string& filename) 45 | { 46 | AIOFile* faio = new AIOFile(full_path(filename)); 47 | 48 | if (faio && faio->open()) 49 | return faio; 50 | 51 | delete faio; 52 | return NULL; 
53 | } 54 | 55 | std::string full_path(const std::string& filename) 56 | { 57 | return dirname_ + "/" + filename; 58 | } 59 | private: 60 | std::string dirname_; 61 | }; 62 | 63 | } // namespace yodb 64 | 65 | #endif // _YODB_ENV_H_ 66 | -------------------------------------------------------------------------------- /test/testutil.cc: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2011 The LevelDB Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style license that can be 3 | // found in the LICENSE file. See the AUTHORS file for names of contributors. 4 | 5 | // This file is copied from LevelDB and modifed a little 6 | // to add LevelDB style benchmark 7 | 8 | #include "testutil.h" 9 | 10 | #include 11 | #include 12 | 13 | namespace yodb { 14 | 15 | Slice RandomSlice(Random* rnd, size_t len, std::string* dst) { 16 | dst->resize(len); 17 | for (size_t i = 0; i < len; i++) { 18 | (*dst)[i] = static_cast(' ' + rnd->Uniform(95)); // ' ' .. '~' 19 | } 20 | return Slice(*dst); 21 | } 22 | 23 | std::string RandomKey(Random* rnd, size_t len) { 24 | // Make sure to generate a wide variety of characters so we 25 | // test the boundary conditions for short-key optimizations. 
26 | static const char kTestChars[] = { 27 | '\0', '\1', 'a', 'b', 'c', 'd', 'e', '\xfd', '\xfe', '\xff' 28 | }; 29 | std::string result; 30 | for (size_t i = 0; i < len; i++) { 31 | result += kTestChars[rnd->Uniform(sizeof(kTestChars))]; 32 | } 33 | return result; 34 | } 35 | 36 | extern Slice CompressibleSlice(Random* rnd, double compressed_fraction, 37 | size_t len, std::string* dst) { 38 | size_t raw = static_cast(len * compressed_fraction); 39 | if (raw < 1) raw = 1; 40 | std::string raw_data; 41 | RandomSlice(rnd, raw, &raw_data); 42 | 43 | // Duplicate the random data until we have filled "len" bytes 44 | dst->clear(); 45 | while (dst->size() < len) { 46 | dst->append(raw_data); 47 | } 48 | dst->resize(len); 49 | return Slice(*dst); 50 | } 51 | 52 | } 53 | -------------------------------------------------------------------------------- /db/db_impl.cc: -------------------------------------------------------------------------------- 1 | #include "db/db_impl.h" 2 | 3 | using namespace yodb; 4 | 5 | DBImpl::~DBImpl() 6 | { 7 | delete tree_; 8 | delete cache_; 9 | delete table_; 10 | delete file_; 11 | } 12 | 13 | bool DBImpl::init() 14 | { 15 | if (opts_.comparator == NULL) { 16 | LOG_ERROR << "Comparator must be set"; 17 | return false; 18 | } 19 | 20 | Env* env = opts_.env; 21 | if (env == NULL) { 22 | LOG_ERROR << "File environment must be set"; 23 | return false; 24 | } 25 | 26 | size_t size = 0; 27 | bool create = true; 28 | 29 | if (env->file_exists(name_)) { 30 | size = env->file_length(name_); 31 | if (size > 0) 32 | create = false; 33 | } 34 | 35 | file_ = env->open_aio_file(name_); 36 | 37 | table_ = new Table(file_, size); 38 | if (!table_->init(create)) { 39 | LOG_ERROR << "init table error"; 40 | return false; 41 | } 42 | 43 | cache_ = new Cache(opts_); 44 | if (!cache_->init()) { 45 | LOG_ERROR << "init cache error"; 46 | return false; 47 | } 48 | 49 | tree_ = new BufferTree(name_, opts_, cache_, table_); 50 | if (!tree_->init()) { 51 | LOG_ERROR 
<< "init buffer tree error"; 52 | return false; 53 | } 54 | 55 | return true; 56 | } 57 | 58 | bool DBImpl::put(Slice key, Slice value) 59 | { 60 | return tree_->put(key, value); 61 | } 62 | 63 | bool DBImpl::del(Slice key) 64 | { 65 | return tree_->del(key); 66 | } 67 | 68 | bool DBImpl::get(Slice key, Slice& value) 69 | { 70 | return tree_->get(key, value); 71 | } 72 | 73 | DB* yodb::DB::open(const std::string& dbname, const Options& opts) 74 | { 75 | DBImpl* db = new DBImpl(dbname, opts); 76 | 77 | if (!db->init()) { 78 | delete db; 79 | return NULL; 80 | } 81 | 82 | return db; 83 | } 84 | -------------------------------------------------------------------------------- /test/mutex_test.cc: -------------------------------------------------------------------------------- 1 | #include "sys/thread.h" 2 | #include "sys/mutex.h" 3 | #include "util/timestamp.h" 4 | #include 5 | #include 6 | 7 | using namespace yodb; 8 | 9 | const int kCount = 10 * 1000 * 1000; 10 | 11 | Mutex g_mutex; 12 | std::vector g_container; 13 | 14 | void thr_fn() __attribute__((__noinline__)); 15 | void thr_fn() 16 | { 17 | for (int i = 0; i < kCount; i++) { 18 | ScopedMutex lock(g_mutex); 19 | g_container.push_back(i); 20 | } 21 | } 22 | 23 | int main() 24 | { 25 | { 26 | // basic test for ScopedMutex 27 | ScopedMutex lock(g_mutex); 28 | assert(g_mutex.is_locked_by_this_thread()); 29 | } 30 | 31 | Timestamp start, finish; 32 | int thread_count = 8; 33 | 34 | // g_container.reserve(kCount * thread_count); 35 | 36 | start = Timestamp::now(); 37 | for (int i = 0; i < kCount; i++) { 38 | g_container.push_back(i); 39 | } 40 | printf("single thread without lock %f\n", 41 | time_interval(Timestamp::now(), start)); 42 | 43 | start = Timestamp::now(); 44 | g_container.clear(); 45 | thr_fn(); 46 | printf("single thread with lock %f\n", 47 | time_interval(Timestamp::now(), start)); 48 | 49 | for (int i = 1; i < thread_count; i++) { 50 | boost::ptr_vector threads; 51 | g_container.clear(); 52 | 53 | start 
= Timestamp::now(); 54 | for (int j = 0; j < i; j++) { 55 | threads.push_back(new Thread(&thr_fn)); 56 | threads.back().run(); 57 | } 58 | for (int j = 0; j < i; j++) { 59 | threads[j].join(); 60 | } 61 | finish = Timestamp::now(); 62 | 63 | double time = time_interval(finish, start); 64 | printf("%d thread(s) with lock %f, average=%f\n", 65 | i, time, time / (double)i); 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /util/log_stream.h: -------------------------------------------------------------------------------- 1 | #ifndef _YODB_LOGSTREAM_H_ 2 | #define _YODB_LOGSTREAM_H_ 3 | 4 | #include "util/block.h" 5 | #include "util/slice.h" 6 | #include 7 | 8 | namespace yodb { 9 | 10 | class LogStream : boost::noncopyable { 11 | public: 12 | LogStream() 13 | : log_buffer_() 14 | { 15 | } 16 | 17 | Slice get_stream_data() 18 | { 19 | return log_buffer_.buffer(); 20 | } 21 | 22 | typedef LogStream self; 23 | 24 | self& operator<< (bool v) 25 | { 26 | log_buffer_.append(v ? 
"1" : "0", 1); 27 | return *this; 28 | } 29 | 30 | self& operator<< (short); 31 | self& operator<< (unsigned short); 32 | self& operator<< (int); 33 | self& operator<< (unsigned int); 34 | self& operator<< (long); 35 | self& operator<< (unsigned long); 36 | self& operator<< (long long); 37 | self& operator<< (unsigned long long); 38 | 39 | 40 | self& operator<< (const char* s) 41 | { 42 | log_buffer_.append(s, strlen(s)); 43 | return *this; 44 | } 45 | 46 | self& operator<< (char c) 47 | { 48 | log_buffer_.append(&c, 1); 49 | return *this; 50 | } 51 | 52 | self& operator<< (const std::string& s) 53 | { 54 | log_buffer_.append(s.c_str(), s.size()); 55 | return *this; 56 | } 57 | 58 | inline void append(const char* s, size_t length) 59 | { 60 | log_buffer_.append(Slice(s, length)); 61 | } 62 | private: 63 | template void format_integer(T); 64 | FixedBlock log_buffer_; 65 | }; 66 | 67 | class Fmt { 68 | public: 69 | template Fmt(const char*, T); 70 | 71 | const char* data() const { return buf_; } 72 | size_t length() const { return length_; } 73 | private: 74 | char buf_[32]; 75 | size_t length_; 76 | }; 77 | 78 | inline LogStream& operator<< (LogStream& s, const Fmt& fmt) 79 | { 80 | s.append(fmt.data(), fmt.length()); 81 | return s; 82 | } 83 | 84 | } // namespace yodb 85 | 86 | #endif // _YODB_LOGSTREAM_H_ 87 | 88 | -------------------------------------------------------------------------------- /sys/thread.cc: -------------------------------------------------------------------------------- 1 | #include "sys/thread.h" 2 | #include 3 | 4 | namespace yodb { 5 | namespace current_thread { 6 | 7 | __thread pid_t cached_tid = 0; 8 | __thread const char* thread_name = "unknown"; 9 | 10 | pid_t get_tid() 11 | { 12 | if (!cached_tid) { 13 | cached_tid = static_cast(syscall(SYS_gettid)); 14 | } 15 | return cached_tid; 16 | } 17 | 18 | } // namespace current_thread 19 | } // namespace yodb 20 | 21 | using namespace yodb; 22 | 23 | struct ThreadData { 24 | typedef 
boost::function Function; 25 | 26 | Function fn_; 27 | std::string name_; 28 | boost::weak_ptr weak_tid_; 29 | 30 | ThreadData(Function& fn, std::string& name, boost::shared_ptr tid) 31 | : fn_(fn), name_(name), weak_tid_(tid) 32 | { 33 | } 34 | 35 | void run_in_data() 36 | { 37 | boost::shared_ptr shared_tid = weak_tid_.lock(); 38 | 39 | if (shared_tid) { 40 | *shared_tid = current_thread::get_tid(); 41 | shared_tid.reset(); 42 | } 43 | 44 | current_thread::thread_name = name_.c_str(); 45 | fn_(); 46 | current_thread::thread_name = "thread finished"; 47 | } 48 | }; 49 | 50 | void* thread_start_fn(void* arg) 51 | { 52 | ThreadData* data = (ThreadData*)arg; 53 | data->run_in_data(); 54 | delete data; 55 | 56 | return NULL; 57 | } 58 | 59 | Thread::Thread(const Function& fn, const std::string& name) 60 | : alive_(false), joined_(false), tidp_(0), 61 | name_(name), tid_(new pid_t(0)), thread_fn_(fn) 62 | { 63 | } 64 | 65 | Thread::~Thread() 66 | { 67 | if (alive_ && !joined_) 68 | pthread_detach(tidp_); 69 | } 70 | 71 | void Thread::run() 72 | { 73 | ThreadData* data = new ThreadData(thread_fn_, name_, tid_); 74 | 75 | alive_ = true; 76 | if (pthread_create(&tidp_, NULL, &thread_start_fn, (void*)data)) { 77 | alive_ = false; 78 | delete data; 79 | } 80 | } 81 | 82 | void Thread::join() 83 | { 84 | assert(alive_); 85 | assert(!joined_); 86 | 87 | joined_ = true; 88 | pthread_join(tidp_, NULL); 89 | } 90 | -------------------------------------------------------------------------------- /test/db_test.cc: -------------------------------------------------------------------------------- 1 | #include "db/options.h" 2 | #include "db/comparator.h" 3 | #include "db/db_impl.h" 4 | #include "fs/env.h" 5 | #include "util/logger.h" 6 | #include "util/timestamp.h" 7 | 8 | using namespace yodb; 9 | 10 | const uint64_t kMaxCount = 10 * 1000 * 1000; 11 | 12 | void write(DBImpl* db) 13 | { 14 | for (uint64_t i = 0; i < kMaxCount; i++) { 15 | char buffer[16] = {0}; 16 | sprintf(buffer, 
"%08ld", i); 17 | 18 | Slice value(buffer, strlen(buffer)); 19 | Slice key = value; 20 | 21 | db->put(key, value); 22 | } 23 | } 24 | 25 | void read(DBImpl* db) 26 | { 27 | uint64_t read_failed_count = 0; 28 | 29 | for (uint64_t i = 0; i < kMaxCount; i++) { 30 | char buffer[16] = {0}; 31 | sprintf(buffer, "%08ld", i); 32 | 33 | Slice key(buffer, strlen(buffer)); 34 | Slice value; 35 | 36 | db->get(key, value); 37 | 38 | if (((i+1) % 10000) == 0) 39 | LOG_INFO << "Read 10000 success"; 40 | 41 | if (key.compare(value) != 0) 42 | read_failed_count++; 43 | 44 | value.release(); 45 | } 46 | 47 | LOG_INFO << Fmt("%zu read failed", read_failed_count); 48 | } 49 | 50 | int main() 51 | { 52 | Options opts; 53 | opts.comparator = new BytewiseComparator(); 54 | opts.max_node_child_number = 16; 55 | opts.max_node_msg_count = 102400; 56 | opts.cache_limited_memory = 1 << 28; 57 | opts.env = new Env("/home/kedebug/develop/yodb/bin"); 58 | 59 | DBImpl* db = new DBImpl("third", opts); 60 | db->init(); 61 | 62 | Timestamp begin = Timestamp::now(); 63 | 64 | write(db); 65 | 66 | Timestamp end = Timestamp::now(); 67 | LOG_INFO << "write: " << kMaxCount 68 | << Fmt(", key, costs time: %f", time_interval(end, begin)); 69 | 70 | delete db; 71 | 72 | db = new DBImpl("third", opts); 73 | db->init(); 74 | read(db); 75 | 76 | end = Timestamp::now(); 77 | LOG_INFO << "read: " << kMaxCount 78 | << Fmt(", key, costs time: %f", time_interval(end, begin)); 79 | 80 | delete db; 81 | delete opts.env; 82 | delete opts.comparator; 83 | } 84 | -------------------------------------------------------------------------------- /cache/cache.h: -------------------------------------------------------------------------------- 1 | #ifndef _YODB_CACHE_H_ 2 | #define _YODB_CACHE_H_ 3 | 4 | #include "db/options.h" 5 | #include "fs/table.h" 6 | #include "fs/file.h" 7 | #include "sys/thread.h" 8 | #include "sys/rwlock.h" 9 | #include "sys/mutex.h" 10 | #include "tree/node.h" 11 | 12 | #include 13 | 14 | namespace 
yodb {

class BufferTree;

// Node cache: owns every in-memory Node, bounds total memory usage, and
// writes dirty nodes back to disk from a single background thread.
class Cache {
public:
    Cache(const Options& opts);
    ~Cache();

    bool init();

    // Integrate cache with our buffer tree and Table storage.
    void integrate(BufferTree* tree, Table* table);

    // When we invoke BufferTree::create_node(),
    // the newly created node should put into cache.
    void put(nid_t nid, Node* node);

    // All nodes are managed by our Cache System,
    // if the node is not in the cache, then we will
    // invoke Table::read() to get node buffer from disk.
    Node* get(nid_t nid);

    // Flush cached state to disk (presumably all dirty nodes -- confirm
    // against the implementation in cache.cc).
    void flush();

    Timestamp last_checkpoint_timestamp;
private:
    // There is a single thread to write the memory node
    // to disk, the frequency is defined by Options.
    void write_back();
    void write_complete_handler(Node* node, Slice buffer, Status status);

    void flush_ready_nodes(std::vector& nodes);

    // Eviction pair: maybe_eviction() checks the limit,
    // evict_from_memory() does the actual removal.
    void maybe_eviction();
    void evict_from_memory();

private:
    Options options_;
    size_t cache_size_;       // current cached size (units per cache.cc)
    Mutex cache_size_mutex_;  // guards cache_size_

    bool alive_;              // background thread keeps running while true
    Thread* worker_;          // the write-back thread

    Table* table_;
    BufferTree* tree_;

    typedef std::map NodeMap;
    NodeMap nodes_;           // nid -> Node: the cache proper
    RWLock lock_nodes_;       // guards nodes_
};

// Orders nodes by the time they first became dirty (oldest first);
// used with write-back candidate selection.
class FirstWriteComparator {
public:
    bool operator()(Node* x, Node* y)
    {
        return x->get_first_write_timestamp() < y->get_first_write_timestamp();
    }
};

// Orders nodes by last access time (least recently used first);
// used with eviction candidate selection.
class LRUComparator {
public:
    bool operator()(Node* x, Node* y)
    {
        return x->get_last_used_timestamp() < y->get_last_used_timestamp();
    }
};


} // namespace yodb

#endif // _YODB_CACHE_H_

// /test/file_test.cc
#include "fs/file.h"
#include "fs/env.h"
#include
"sys/mutex.h" 4 | #include "util/slice.h" 5 | 6 | #include 7 | #include 8 | #include 9 | 10 | #include 11 | #include 12 | 13 | using namespace yodb; 14 | 15 | const size_t kMaxCount = 2000; 16 | const size_t kBoundarySize = 4096; 17 | const size_t kBlockSize = 4096; 18 | 19 | Slice alloc_aligned_buffer(size_t size) 20 | { 21 | void* ptr = memalign(kBoundarySize, size); 22 | return Slice((char*)ptr, size); 23 | } 24 | 25 | void free_aligned_buffer(Slice& buffer) 26 | { 27 | free((void*)buffer.data()); 28 | } 29 | 30 | Mutex g_mutex; 31 | std::map g_map; 32 | 33 | void complete(int* id, Status stat) 34 | { 35 | ScopedMutex lock(g_mutex); 36 | g_map[*id] = stat; 37 | } 38 | 39 | int main() 40 | { 41 | Env env("./"); 42 | AIOFile* file = env.open_aio_file("aio_file_test"); 43 | assert(file); 44 | 45 | int id[kMaxCount]; 46 | Slice buffer[kMaxCount]; 47 | 48 | for (size_t i = 0; i < kMaxCount; i++) { 49 | id[i] = i; 50 | buffer[i] = alloc_aligned_buffer(kBlockSize); 51 | memset((void*)buffer[i].data(), i & 0xFF, kBlockSize); 52 | 53 | file->async_write(i * kBlockSize, buffer[i], 54 | boost::bind(&complete, id + i, _1)); 55 | } 56 | 57 | while (g_map.size() != kMaxCount) 58 | usleep(10000); 59 | 60 | for (size_t i = 0; i < kMaxCount; i++) { 61 | assert(g_map[i].succ); 62 | assert(g_map[i].size == kBlockSize); 63 | } 64 | 65 | g_map.clear(); 66 | 67 | for (size_t i = 0; i < kMaxCount; i++) { 68 | id[i] = i; 69 | file->async_read(i * kBlockSize, buffer[i], 70 | boost::bind(&complete, id + i, _1)); 71 | } 72 | 73 | while (g_map.size() != kMaxCount) 74 | usleep(10000); 75 | 76 | for (size_t i = 0; i < kMaxCount; i++) { 77 | assert(g_map[i].succ); 78 | assert(g_map[i].size == kBlockSize); 79 | 80 | char dst[kBlockSize]; 81 | memset((void*)dst, i & 0xFF, kBlockSize); 82 | assert(memcmp(buffer[i].data(), dst, kBlockSize) == 0); 83 | 84 | free_aligned_buffer(buffer[i]); 85 | } 86 | 87 | } 88 | -------------------------------------------------------------------------------- 
// -------------------------------------------------------------------
// /sys/rwlock.cc
//
// Writer-preferring readers/writer lock built from one mutex and two
// condition variables: blocked readers wait on cond_wait_read_,
// blocked writers on cond_wait_write_.  New readers yield not only to
// an active writer but also to *waiting* writers, so writers cannot
// be starved by a continuous stream of readers.
// -------------------------------------------------------------------
#include "sys/rwlock.h"

using namespace yodb;

RWLock::RWLock()
    : mutex_(),
      cond_wait_read_(mutex_),
      cond_wait_write_(mutex_),
      readers_(0),       // readers currently holding the lock
      writers_(0),       // writers currently holding the lock (0 or 1)
      want_readers_(0),  // readers blocked waiting for the lock
      want_writers_(0)   // writers blocked waiting for the lock
{
}

// Non-blocking read acquire: fails if a writer holds or wants the lock.
bool RWLock::try_read_lock()
{
    ScopedMutex lock(mutex_);

    if (writers_ || want_writers_)
        return false;

    ++readers_;
    return true;
}

void RWLock::read_lock()
{
    ScopedMutex lock(mutex_);

    // Block while a writer is active OR any writer is queued
    // (writer preference).
    if (writers_ || want_writers_) {
        ++want_readers_;

        while (writers_ || want_writers_)
            cond_wait_read_.wait();

        // it's our turn now
        assert(writers_ == 0);
        assert(want_readers_ > 0);

        --want_readers_;
    }
    ++readers_;
}

void RWLock::read_unlock()
{
    ScopedMutex lock(mutex_);

    assert(readers_ > 0);
    assert(writers_ == 0);

    --readers_;

    // Last reader out hands the lock to a waiting writer, if any.
    if (readers_ == 0 && want_writers_)
        cond_wait_write_.notify();
}

// Non-blocking write acquire: fails if anyone holds the lock.
bool RWLock::try_write_lock()
{
    ScopedMutex lock(mutex_);

    if (readers_ || writers_)
        return false;

    ++writers_;
    return true;
}

void RWLock::write_lock()
{
    ScopedMutex lock(mutex_);

    if (readers_ || writers_) {
        ++want_writers_;

        while (readers_ || writers_)
            cond_wait_write_.wait();

        // it's our turn now
        assert(readers_ == 0);
        assert(want_writers_ > 0);

        --want_writers_;
    }
    ++writers_;
}

void RWLock::write_unlock()
{
    ScopedMutex lock(mutex_);

    assert(readers_ == 0);
    assert(writers_ == 1);

    --writers_;

    // writer first: hand off to the next writer if one is queued;
    // otherwise wake ALL waiting readers, since any number may proceed.
    if (want_writers_)
        cond_wait_write_.notify();
    else if (want_readers_)
        cond_wait_read_.notify_all();
}
-------------------------------------------------------------------------------- /test/node_test.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include "tree/buffer_tree.h" 3 | 4 | using namespace yodb; 5 | 6 | TEST(BufferTree, bootstrap) 7 | { 8 | Options opts; 9 | opts.comparator = new LexicalComparator(); 10 | opts.max_node_msg_count = 4; 11 | opts.max_node_child_number = 2; 12 | 13 | BufferTree* tree = new BufferTree("", opts); 14 | tree->init(); 15 | Node* n1 = tree->root_; 16 | 17 | n1->put("a", "1"); 18 | n1->put("b", "1"); 19 | n1->put("c", "1"); 20 | n1->put("d", "1"); 21 | 22 | EXPECT_EQ(n1->pivots_.size(), 1); 23 | EXPECT_EQ(n1->pivots_[0].msgbuf->msg_count(), 4); 24 | 25 | n1->put("e", "1"); 26 | // split msgbuf 27 | EXPECT_EQ(n1->pivots_.size(), 2); 28 | EXPECT_EQ(n1->pivots_[0].msgbuf->msg_count(), 2); 29 | EXPECT_EQ(n1->pivots_[1].msgbuf->msg_count(), 3); 30 | 31 | n1->put("f", "1"); 32 | EXPECT_EQ(n1->pivots_[1].msgbuf->msg_count(), 4); 33 | n1->put("aa", "1"); 34 | EXPECT_EQ(n1->pivots_[0].msgbuf->msg_count(), 3); 35 | 36 | n1->put("g", "1"); 37 | // split msgbuf, add pivot, then split node 38 | Node* n2 = tree->root_; 39 | EXPECT_EQ(n2->pivots_.size(), 2); 40 | 41 | Node* n2_1 = tree->get_node_by_nid(n2->pivots_[0].child_nid); 42 | Node* n2_2 = tree->get_node_by_nid(n2->pivots_[1].child_nid); 43 | 44 | EXPECT_EQ(n2_1->pivots_.size(), 1); 45 | EXPECT_EQ(n2_2->pivots_.size(), 2); 46 | 47 | n2->put("bb", "1"); 48 | EXPECT_EQ(n2->pivots_[0].msgbuf->msg_count(), 1); 49 | n2->put("a", "1"); 50 | n2->put("aa", "1"); 51 | n2->put("b", "1"); 52 | n2->put("bb", "2"); 53 | EXPECT_EQ(n2->pivots_[0].msgbuf->msg_count(), 4); 54 | n2->put("bc", "1"); 55 | // n2's msgbuf should push down to n2_1 56 | EXPECT_EQ(n2_1->pivots_[0].msgbuf->msg_count(), 2); 57 | EXPECT_EQ(n2_1->pivots_[1].msgbuf->msg_count(), 3); 58 | 59 | n2->put("e", "1"); 60 | n2->put("f", "1"); 61 | n2->put("g", "1"); 62 | n2->put("h", "1"); 
// -----------------------------------------------------------------------------
// /test/random.h
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// Copied from LevelDB (and modified slightly) to support a LevelDB-style
// benchmark.
// -----------------------------------------------------------------------------

#ifndef _BENCH_RANDOM_H_
#define _BENCH_RANDOM_H_

#include <stdint.h>

namespace yodb {

// A tiny "minimal standard" linear-congruential generator:
//     seed' = seed * 16807 mod (2^31 - 1)
// Not a source of high-quality randomness -- just cheap, portable and
// reproducible, which is all the benchmarks need.
class Random {
private:
    uint32_t seed_;
public:
    explicit Random(uint32_t s) : seed_(s & 0x7fffffffu) { }

    uint32_t Next() {
        static const uint32_t M = 2147483647L; // modulus 2^31 - 1
        static const uint64_t A = 16807;       // multiplier

        // seed_ must never be 0 or M, otherwise every later value
        // collapses to 0 or M; any other seed cycles through [1, M-1].
        uint64_t prod = seed_ * A;

        // Reduce prod mod M without a division, using the identity
        // ((x << 31) % M) == x: fold the high 33 bits onto the low 31.
        seed_ = static_cast<uint32_t>((prod >> 31) + (prod & M));

        // The fold can overshoot by at most one multiple of M; M itself
        // is unreachable, so a single '>' test suffices.
        if (seed_ > M) {
            seed_ -= M;
        }
        return seed_;
    }

    // Uniformly distributed value in [0, n-1].  REQUIRES: n > 0.
    uint32_t Uniform(int n) { return Next() % n; }

    // True about once every n calls, false otherwise.  REQUIRES: n > 0.
    bool OneIn(int n) { return (Next() % n) == 0; }

    // Exponentially skewed value: draw `base` uniformly from [0, max_log],
    // then return `base` random bits -- i.e. a value in [0, 2^max_log - 1]
    // biased towards small numbers.
    uint32_t Skewed(int max_log) {
        return Uniform(1 << Uniform(max_log + 1));
    }
};

} // namespace yodb

#endif // _BENCH_RANDOM_H_
0 : kAlignedSize - cut); 42 | size_t fixed_size = bytes + slop; 43 | char* result; 44 | 45 | if (fixed_size <= remaining_) { 46 | result = alloc_ptr_ + slop; 47 | alloc_ptr_ += fixed_size; 48 | remaining_ -= fixed_size; 49 | } else { 50 | result = alloc_fallback(bytes); 51 | } 52 | 53 | assert((reinterpret_cast(result) & (kAlignedSize - 1)) == 0); 54 | 55 | return result; 56 | } 57 | 58 | char* Arena::alloc_fallback(size_t bytes) 59 | { 60 | if (bytes > kBlockSize / 4) 61 | return alloc_new_block(bytes); 62 | 63 | alloc_ptr_ = alloc_new_block(kBlockSize); 64 | remaining_ = kBlockSize; 65 | 66 | char* result = alloc_ptr_; 67 | 68 | alloc_ptr_ += bytes; 69 | remaining_ -= bytes; 70 | 71 | return result; 72 | } 73 | 74 | char* Arena::alloc_new_block(size_t block_bytes) 75 | { 76 | char* result = new char[block_bytes]; 77 | 78 | blocks_size_ += block_bytes; 79 | blocks_.push_back(result); 80 | 81 | return result; 82 | } 83 | 84 | size_t Arena::usage() const 85 | { 86 | ScopedMutex lock(mutex_); 87 | return blocks_size_ + blocks_.capacity() * (sizeof(char*)); 88 | } 89 | 90 | void Arena::clear() 91 | { 92 | ScopedMutex lock(mutex_); 93 | 94 | for (size_t i = 0; i < blocks_.size(); i++) 95 | delete[] blocks_[i]; 96 | 97 | blocks_size_ = 0; 98 | alloc_ptr_ = NULL; 99 | remaining_ = 0; 100 | 101 | blocks_.clear(); 102 | } 103 | -------------------------------------------------------------------------------- /util/logger.cc: -------------------------------------------------------------------------------- 1 | #include "util/logger.h" 2 | #include "sys/thread.h" 3 | #include 4 | 5 | namespace yodb { 6 | 7 | LogLevel init_logger_level() 8 | { 9 | char* env = getenv("YODB_LOG_LEVEL"); 10 | 11 | if (env == NULL) 12 | return INFO; 13 | else if (strcmp(env, "TRACE") == 0) 14 | return TRACE; 15 | else if (strcmp(env, "DEBUG") == 0) 16 | return DEBUG; 17 | else if (strcmp(env, "INFO") == 0) 18 | return INFO; 19 | else if (strcmp(env, "WARN") == 0) 20 | return WARN; 21 | else if 
(strcmp(env, "ERROR") == 0) 22 | return ERROR; 23 | 24 | return INFO; 25 | } 26 | 27 | void default_output(const char* msg, int len) 28 | { 29 | fwrite(msg, 1, len, stdout); 30 | } 31 | 32 | void default_flush() 33 | { 34 | fflush(stdout); 35 | } 36 | 37 | LogLevel g_logger_level = init_logger_level(); 38 | Logger::output_fn g_output = default_output; 39 | Logger::flush_fn g_flush = default_flush; 40 | 41 | const char* LogLevelName[] = { 42 | "TRACE ", "DEBUG ", "INFO ", "WARN ", "ERROR ", 43 | }; 44 | 45 | } // namespace yodb 46 | 47 | 48 | using namespace yodb; 49 | 50 | Impl::Impl(LogLevel level, const SourceFile& file, int line) 51 | : stream_(), level_(level), filename_(file), line_(line) 52 | { 53 | stream_ << filename_.data() << ':' << line_ << ' '; 54 | stream_ << LogLevelName[level] << ": "; 55 | stream_ << Fmt("tid=%d, ", current_thread::get_tid()); 56 | } 57 | 58 | void Impl::finish() 59 | { 60 | stream_ << '\n'; 61 | } 62 | 63 | Logger::Logger(SourceFile file, int line) 64 | : Impl(INFO, file, line) 65 | { 66 | } 67 | 68 | Logger::Logger(SourceFile file, int line, LogLevel level, const char* fn_name) 69 | : Impl(level, file, line) 70 | { 71 | stream() << '*' << fn_name << "* "; 72 | } 73 | 74 | Logger::Logger(SourceFile file, int line, LogLevel level) 75 | : Impl(level, file, line) 76 | { 77 | } 78 | 79 | LogLevel Logger::logger_level() 80 | { 81 | return g_logger_level; 82 | } 83 | 84 | void Logger::set_flush(flush_fn fn) 85 | { 86 | g_flush = fn; 87 | } 88 | 89 | void Logger::set_output(output_fn fn) 90 | { 91 | g_output = fn; 92 | } 93 | 94 | void Logger::set_logger_level(LogLevel level) 95 | { 96 | g_logger_level = level; 97 | } 98 | 99 | Logger::~Logger() 100 | { 101 | finish(); 102 | Slice slice = get_logger_data(); 103 | g_output(slice.data(), slice.size()); 104 | } 105 | -------------------------------------------------------------------------------- /util/log_stream.cc: 
-------------------------------------------------------------------------------- 1 | #include "util/log_stream.h" 2 | #include 3 | #include 4 | #include 5 | 6 | using namespace yodb; 7 | 8 | template 9 | void LogStream::format_integer(T value) 10 | { 11 | if (log_buffer_.avail() < 32) 12 | return; 13 | 14 | char buf[64]; 15 | char* p = buf; 16 | T v = value > 0 ? value : -value; 17 | 18 | while (v) { 19 | int lsd = static_cast(v % 10); 20 | v /= 10; 21 | *p++ = lsd + '0'; 22 | } 23 | 24 | if (value < 0) *p++ = '-'; 25 | else if (value == 0) *p++ = '0'; 26 | *p = '\0'; 27 | std::reverse(buf, p); 28 | 29 | log_buffer_.append(buf, p - buf); 30 | } 31 | 32 | LogStream& LogStream::operator<< (short v) 33 | { 34 | *this << static_cast(v); 35 | return *this; 36 | } 37 | 38 | LogStream& LogStream::operator<< (unsigned short v) 39 | { 40 | *this << static_cast(v); 41 | return *this; 42 | } 43 | 44 | LogStream& LogStream::operator<< (int v) 45 | { 46 | format_integer(v); 47 | return *this; 48 | } 49 | 50 | LogStream& LogStream::operator<< (unsigned int v) 51 | { 52 | format_integer(v); 53 | return *this; 54 | } 55 | 56 | LogStream& LogStream::operator<< (long v) 57 | { 58 | format_integer(v); 59 | return *this; 60 | } 61 | 62 | LogStream& LogStream::operator<< (unsigned long v) 63 | { 64 | format_integer(v); 65 | return *this; 66 | } 67 | 68 | LogStream& LogStream::operator<< (long long v) 69 | { 70 | format_integer(v); 71 | return *this; 72 | } 73 | 74 | LogStream& LogStream::operator<< (unsigned long long v) 75 | { 76 | format_integer(v); 77 | return *this; 78 | } 79 | 80 | template 81 | Fmt::Fmt(const char* fmt, T val) 82 | { 83 | length_ = snprintf(buf_, sizeof(buf_), fmt, val); 84 | assert(static_cast(length_) < sizeof(buf_)); 85 | } 86 | 87 | template Fmt::Fmt(const char* fmt, char); 88 | 89 | template Fmt::Fmt(const char* fmt, short); 90 | template Fmt::Fmt(const char* fmt, unsigned short); 91 | template Fmt::Fmt(const char* fmt, int); 92 | template Fmt::Fmt(const 
char* fmt, unsigned int); 93 | template Fmt::Fmt(const char* fmt, long); 94 | template Fmt::Fmt(const char* fmt, unsigned long); 95 | template Fmt::Fmt(const char* fmt, long long); 96 | template Fmt::Fmt(const char* fmt, unsigned long long); 97 | 98 | template Fmt::Fmt(const char* fmt, float); 99 | template Fmt::Fmt(const char* fmt, double); 100 | 101 | -------------------------------------------------------------------------------- /util/logger.h: -------------------------------------------------------------------------------- 1 | #ifndef _YODB_LOGGER_H_ 2 | #define _YODB_LOGGER_H_ 3 | 4 | #include "util/log_stream.h" 5 | #include 6 | #include 7 | 8 | namespace yodb { 9 | 10 | enum LogLevel { 11 | TRACE, DEBUG, INFO, WARN, ERROR, 12 | }; 13 | 14 | class SourceFile { 15 | public: 16 | template 17 | inline SourceFile(const char (&array)[N]) 18 | : data_(array), size_(N-1) 19 | { 20 | const char* slash = strrchr(data_, '/'); 21 | if (slash) { 22 | data_ = slash + 1; 23 | size_ -= static_cast(data_ - array); 24 | } 25 | } 26 | 27 | explicit SourceFile(const char* filename) 28 | : data_(filename) 29 | { 30 | const char* slash = strrchr(filename, '/'); 31 | if (slash) data_ = slash + 1; 32 | size_ = static_cast(strlen(data_)); 33 | } 34 | 35 | const char* data() { return data_; } 36 | private: 37 | const char* data_; 38 | int size_; 39 | }; 40 | 41 | class Impl { 42 | public: 43 | explicit Impl(LogLevel level, const SourceFile& file, int line); 44 | void finish(); 45 | LogStream& stream() { return stream_; } 46 | 47 | private: 48 | LogStream stream_; 49 | LogLevel level_; 50 | SourceFile filename_; 51 | int line_; 52 | }; 53 | 54 | class Logger : public Impl { 55 | public: 56 | Logger(SourceFile file, int line); 57 | Logger(SourceFile file, int line, LogLevel level); 58 | Logger(SourceFile file, int line, LogLevel level, const char* fn); 59 | ~Logger(); 60 | 61 | Slice get_logger_data() 62 | { 63 | return stream().get_stream_data(); 64 | } 65 | 66 | static LogLevel 
logger_level(); 67 | static void set_logger_level(LogLevel level); 68 | 69 | typedef void (* output_fn)(const char* msg, int len); 70 | typedef void (* flush_fn)(); 71 | 72 | static void set_output(output_fn); 73 | static void set_flush(flush_fn); 74 | }; 75 | 76 | #define LOG_TRACE if (yodb::Logger::logger_level() <= yodb::TRACE) \ 77 | yodb::Logger(__FILE__, __LINE__, yodb::TRACE, __func__).stream() 78 | #define LOG_DEBUG if (yodb::Logger::logger_level() <= yodb::DEBUG) \ 79 | yodb::Logger(__FILE__, __LINE__, yodb::DEBUG, __func__).stream() 80 | #define LOG_INFO if (yodb::Logger::logger_level() <= yodb::INFO) \ 81 | yodb::Logger(__FILE__, __LINE__).stream() 82 | #define LOG_WARN yodb::Logger(__FILE__, __LINE__, yodb::WARN).stream() 83 | #define LOG_ERROR yodb::Logger(__FILE__, __LINE__, yodb::ERROR).stream() 84 | 85 | } // namespace yodb 86 | 87 | #endif // _YODB_LOGGER_H_ 88 | -------------------------------------------------------------------------------- /tree/msg.h: -------------------------------------------------------------------------------- 1 | #ifndef _YODB_MSG_H_ 2 | #define _YODB_MSG_H_ 3 | 4 | #include "db/comparator.h" 5 | #include "util/slice.h" 6 | #include "util/logger.h" 7 | #include "sys/mutex.h" 8 | #include "tree/skiplist.h" 9 | 10 | #include 11 | 12 | namespace yodb { 13 | 14 | enum MsgType { 15 | _Nop, Put, Del, 16 | }; 17 | 18 | class Msg { 19 | public: 20 | Msg() : type_(_Nop) {} 21 | Msg(MsgType type, Slice key, Slice value = Slice()) 22 | : type_(type), key_(key), value_(value) {} 23 | 24 | size_t size() const 25 | { 26 | size_t size = 0; 27 | 28 | size += 1; // MsgType->uint8_t 29 | size += 4 + key_.size(); // Slice->(see BlockWriter<<(Slice)) 30 | if (type_ == Put) 31 | size += 4 + value_.size(); // Same as key_ 32 | 33 | return size; 34 | } 35 | 36 | void release() 37 | { 38 | key_.release(); 39 | if (value_.size()) 40 | value_.release(); 41 | } 42 | 43 | Slice key() const { return key_; } 44 | Slice value() const { return value_; } 
45 | MsgType type() const { return type_; } 46 | 47 | private: 48 | MsgType type_; 49 | Slice key_; 50 | Slice value_; 51 | }; 52 | 53 | class Compare { 54 | public: 55 | Compare(Comparator* comparator) 56 | : comparator_(comparator) {} 57 | 58 | int operator()(const Msg& a, const Msg& b) const 59 | { 60 | return comparator_->compare(a.key(), b.key()); 61 | } 62 | private: 63 | Comparator* comparator_; 64 | }; 65 | 66 | class MsgTable { 67 | public: 68 | typedef SkipList List; 69 | typedef List::Iterator Iterator; 70 | 71 | MsgTable(Comparator* comparator); 72 | ~MsgTable(); 73 | 74 | size_t count(); 75 | 76 | size_t size(); 77 | 78 | size_t memory_usage(); 79 | 80 | // Clear the Msg, but not delete the memory they allocated. 81 | void clear(); 82 | 83 | // you must lock hold the lock before use it 84 | bool find(Slice key, Msg& msg); 85 | 86 | void insert(const Msg& msg); 87 | 88 | bool constrcutor(BlockReader& reader); 89 | bool destructor(BlockWriter& writer); 90 | 91 | // resize the msgbuf, release but not delete the truncated Msg 92 | void resize(size_t size); 93 | 94 | void lock() { mutex_.lock(); } 95 | void unlock() { mutex_.unlock(); } 96 | 97 | List* skiplist() { return &list_; } 98 | private: 99 | List list_; 100 | Comparator* comparator_; 101 | Mutex mutex_; 102 | size_t size_; 103 | }; 104 | 105 | } // namespace yodb 106 | 107 | #endif // _YODB_MSG_H_ 108 | -------------------------------------------------------------------------------- /util/slice.h: -------------------------------------------------------------------------------- 1 | #ifndef _YODB_SLICE_H_ 2 | #define _YODB_SLICE_H_ 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | namespace yodb { 9 | 10 | class Slice { 11 | public: 12 | Slice() 13 | : data_(""), size_(0), is_self_alloc_(false) {} 14 | Slice(const char* s) 15 | : data_(s), size_(strlen(s)), is_self_alloc_(false) {} 16 | Slice(const char* s, size_t size, bool is_self_alloc = false) 17 | : data_(s), size_(size), 
is_self_alloc_(is_self_alloc) {} 18 | Slice(const std::string& s) 19 | : data_(s.data()), size_(s.size()), is_self_alloc_(false) {} 20 | 21 | const char* data() const { return data_; } 22 | size_t size() const { return size_; } 23 | bool empty() const { return size_ == 0; } 24 | 25 | std::string to_string() const { return std::string(data_, size_); } 26 | 27 | void clear() 28 | { 29 | data_ = ""; 30 | size_ = 0; 31 | is_self_alloc_ = false; 32 | } 33 | 34 | char operator[] (size_t i) const 35 | { 36 | assert(i < size()); 37 | return data_[i]; 38 | } 39 | 40 | int compare(const Slice& slice) const 41 | { 42 | int prefix_len = (size_ < slice.size()) ? size_ : slice.size(); 43 | int res = memcmp(data_, slice.data(), prefix_len); 44 | if (res == 0) { 45 | if (size_ < slice.size()) res = -1; 46 | else if (size_ > slice.size()) res = 1; 47 | } 48 | return res; 49 | } 50 | 51 | Slice clone() const 52 | { 53 | if (size_ == 0) 54 | return Slice("", 0, false); 55 | 56 | char* s = new char[size_]; 57 | assert(s); 58 | memcpy(s, data_, size_); 59 | return Slice(s, size_, true); 60 | } 61 | 62 | static Slice alloc(size_t size) 63 | { 64 | assert(size); 65 | char* s = new char[size]; 66 | return Slice(s, size, true); 67 | } 68 | 69 | void release() 70 | { 71 | assert(is_self_alloc_); 72 | 73 | if (size_ == 0) return; 74 | 75 | delete[] data_; 76 | clear(); 77 | } 78 | 79 | private: 80 | const char* data_; 81 | size_t size_; 82 | bool is_self_alloc_; 83 | }; 84 | 85 | inline bool operator== (const Slice& x, const Slice& y) 86 | { 87 | return x.compare(y) == 0; 88 | } 89 | 90 | inline bool operator!= (const Slice& x, const Slice& y) 91 | { 92 | return !(x == y); 93 | } 94 | 95 | inline bool operator< (const Slice& x, const Slice& y) 96 | { 97 | return x.compare(y) < 0; 98 | } 99 | 100 | } // namespace yodb 101 | 102 | #endif // _YODB_SLICE_H_ 103 | -------------------------------------------------------------------------------- /test/skiplist_test.cc: 
-------------------------------------------------------------------------------- 1 | #include "util/logger.h" 2 | #include "tree/skiplist.h" 3 | #include 4 | 5 | using namespace yodb; 6 | 7 | typedef uint64_t Key; 8 | 9 | struct Comparator { 10 | int operator()(const Key& a, const Key& b) const 11 | { 12 | if (a < b) return -1; 13 | else if (a > b) return 1; 14 | else return 0; 15 | } 16 | }; 17 | 18 | void test_empty() 19 | { 20 | Comparator cmp; 21 | SkipList list(cmp); 22 | 23 | assert(!list.contains(10)); 24 | 25 | SkipList::Iterator iter(&list); 26 | 27 | assert(!iter.valid()); 28 | iter.seek_to_first(); 29 | assert(!iter.valid()); 30 | iter.seek_to_middle(); 31 | assert(!iter.valid()); 32 | iter.seek_to_last(); 33 | assert(!iter.valid()); 34 | } 35 | 36 | void test_insert_erase() 37 | { 38 | const size_t N = 100; 39 | Comparator cmp; 40 | SkipList list(cmp); 41 | 42 | for (size_t i = 0; i < N; i += 2) 43 | list.insert(i); 44 | for (size_t i = 1; i < N; i += 2) 45 | list.insert(i); 46 | 47 | // test repeat 48 | for (size_t i = 0; i < N; i += 2) 49 | list.insert(i); 50 | for (size_t i = 1; i < N; i += 2) 51 | list.insert(i); 52 | 53 | assert(list.count() == N); 54 | 55 | SkipList::Iterator iter(&list); 56 | 57 | iter.seek_to_first(); 58 | assert(cmp(iter.key(), 0) == 0); 59 | iter.seek_to_middle(); 60 | assert(cmp(iter.key(), 50) == 0); 61 | 62 | list.erase(50); 63 | iter.seek_to_middle(); 64 | assert(cmp(iter.key(), 49) == 0); 65 | 66 | list.insert(50); 67 | iter.seek_to_middle(); 68 | assert(cmp(iter.key(), 50) == 0); 69 | 70 | std::vector v1, v2; 71 | 72 | iter.seek_to_first(); 73 | while (iter.valid()) { 74 | v1.push_back(iter.key()); 75 | iter.next(); 76 | } 77 | 78 | iter.seek_to_last(); 79 | while (iter.valid()) { 80 | v2.push_back(iter.key()); 81 | iter.prev(); 82 | } 83 | 84 | assert(v1.size() == v2.size()); 85 | size_t size = v1.size(); 86 | for (size_t i = 0; i < size; i++) 87 | assert(v1[i] == v2[size-1-i]); 88 | 89 | // test resize 90 | 91 | 
LOG_INFO << Fmt("before memory usage=%zu", list.memory_usage()); 92 | 93 | assert(list.count() == N); 94 | list.resize(N / 2); 95 | 96 | LOG_INFO << Fmt("after memory usage=%zu", list.memory_usage()); 97 | 98 | iter.seek_to_first(); 99 | for (size_t i = 0; i < list.count(); i++) { 100 | assert(iter.key() == i); 101 | iter.next(); 102 | } 103 | } 104 | 105 | int main() 106 | { 107 | test_empty(); 108 | test_insert_erase(); 109 | } 110 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | YoDB [![Total views](https://sourcegraph.com/api/repos/github.com/kedebug/yodb/counters/views.png)](https://sourcegraph.com/github.com/kedebug/yodb) 2 | ==== 3 | 4 | A lightweight and fast key-value storage engine based on the buffer tree. 5 | 6 | ## Purpose 7 | There have already been several KV databases like [tokudb](https://github.com/Tokutek/ft-index) and [cascadb](https://github.com/weicao/cascadb), which are using buffer tree as the underlying data structure to optimize written operation. But their code seems to be a little complex and hard for the __beginners__ to understand the core idea of how the tree really works in multi-threaded environment. 8 | 9 | So I write this storage engine which named __yodb__ (__yo__ just helps for funny pronunciation), try to meet the need of those guys who want to have a quick introspect of this beautiful algorithm. Yodb has an excellent performance that can handle millions of read/written requests at a time with only 6K source lines of code, and also of course has a detailed notation. 10 | 11 | ## Performance 12 | #### Setup 13 | We use a database with a million entries. Each entry has a 16 byte key, and a 100 byte value. 
14 | ``` 15 | yodb: version 0.1 16 | Date: Tue Dec 17 15:00:09 2013 17 | CPU: 4 * Intel(R) Core(TM)2 Quad CPU Q8300 @ 2.50GHz 18 | CPUCache: 2048 KB 19 | Keys: 16 bytes each 20 | Values: 100 bytes each 21 | Entries: 1000000 22 | RawSize: 110.6 MB (estimated) 23 | FileSize: 110.6 MB (estimated, compression disabled) 24 | ``` 25 | #### Write performance 26 | ``` 27 | fillseq : 4.989 micros/op; 22.2 MB/s 28 | fillrandom : 5.223 micros/op; 21.2 MB/s 29 | ``` 30 | Each "op" above corresponds to a read/write of a single key/value pair. I.e., a random write benchmark goes at approximately 200,000 writes per second. 31 | 32 | #### Read performance 33 | ``` 34 | readseq : 2.653 micros/op; 41.7 MB/s 35 | readrandom : 7.804 micros/op; 6.4 MB/s 36 | readhot : 2.662 micros/op; 41.6 MB/s 37 | ``` 38 | 39 | 40 | ## Usage 41 | #### Include to your header 42 | ```cpp 43 | #include 44 | 45 | using namespace yodb; 46 | 47 | Options opts; 48 | opts.comparator = new BytewiseComparator(); 49 | opts.env = new Env("/your/database/path"); 50 | 51 | DB* db = new DB("your_db_name", opts); 52 | if (!db->init()) { 53 | fprintf(stderr, "error initialize database\n"); 54 | } 55 | ``` 56 | #### Write 57 | ```cpp 58 | if (!db->put("Shanghai", "Minhang part")) { 59 | fprintf(stderr, "insert error\n"); 60 | } 61 | ``` 62 | #### Read 63 | ```cpp 64 | Slice value; 65 | if (!db->get("Guangzhou", value)) { 66 | fprintf(stderr, "read error\n"); 67 | } 68 | ``` 69 | #### Delete 70 | ```cpp 71 | if (!db->del("Beijing")) { 72 | fprintf(stderr, "delete error\n"); 73 | } 74 | ``` 75 | #### Exit 76 | ```cpp 77 | delete db; 78 | delete opts.comparator; 79 | delete opts.env; 80 | ``` 81 | 82 | ## Further work 83 | - Make lock independent with the tree. 84 | - Add bloom filter to accelerate read operation. 
85 | - Add memory table just like leveldb 86 | -------------------------------------------------------------------------------- /util/block.cc: -------------------------------------------------------------------------------- 1 | #include "util/block.h" 2 | 3 | using namespace yodb; 4 | 5 | BlockReader& BlockReader::operator>>(uint8_t& v) 6 | { 7 | if (!succ_) return *this; 8 | 9 | succ_ = read_uint(v); 10 | return *this; 11 | } 12 | 13 | BlockReader& BlockReader::operator>>(uint16_t& v) 14 | { 15 | if (!succ_) return *this; 16 | 17 | succ_ = read_uint(v); 18 | return *this; 19 | } 20 | 21 | BlockReader& BlockReader::operator>>(uint32_t& v) 22 | { 23 | if (!succ_) return *this; 24 | 25 | succ_ = read_uint(v); 26 | return *this; 27 | } 28 | 29 | BlockReader& BlockReader::operator>>(uint64_t& v) 30 | { 31 | if (!succ_) return *this; 32 | 33 | succ_ = read_uint(v); 34 | return *this; 35 | } 36 | 37 | BlockReader& BlockReader::operator>>(Slice& s) 38 | { 39 | if (!succ_) return *this; 40 | 41 | uint32_t size; 42 | succ_ = read_uint(size); 43 | 44 | if (succ_) { 45 | assert(offset_ <= block_.size()); 46 | if (offset_ + size <= block_.size()) { 47 | s = Slice(block_.data() + offset_, size).clone(); 48 | offset_ += size; 49 | } else { 50 | succ_ = false; 51 | } 52 | } 53 | return *this; 54 | } 55 | 56 | template 57 | bool BlockReader::read_uint(T& v) 58 | { 59 | assert(offset_ <= block_.size()); 60 | 61 | if (offset_ + sizeof(T) <= block_.size()) { 62 | v = *(T*)(block_.data() + offset_); 63 | offset_ += sizeof(T); 64 | return true; 65 | } else { 66 | return false; 67 | } 68 | } 69 | 70 | BlockWriter& BlockWriter::operator<<(uint8_t v) 71 | { 72 | if (!succ_) return *this; 73 | 74 | succ_ = write_uint(v); 75 | return *this; 76 | } 77 | 78 | BlockWriter& BlockWriter::operator<<(uint16_t v) 79 | { 80 | if (!succ_) return *this; 81 | 82 | succ_ = write_uint(v); 83 | return *this; 84 | } 85 | 86 | BlockWriter& BlockWriter::operator<<(uint32_t v) 87 | { 88 | if (!succ_) return 
*this; 89 | 90 | succ_ = write_uint(v); 91 | return *this; 92 | } 93 | 94 | BlockWriter& BlockWriter::operator<<(uint64_t v) 95 | { 96 | if (!succ_) return *this; 97 | 98 | succ_ = write_uint(v); 99 | return *this; 100 | } 101 | 102 | BlockWriter& BlockWriter::operator<<(const Slice& s) 103 | { 104 | if (!succ_) return *this; 105 | 106 | uint32_t size = s.size(); 107 | succ_ = write_uint(size); 108 | 109 | if (succ_) { 110 | assert(offset_ <= block_.size()); 111 | if (offset_ + size <= block_.size()) { 112 | memcpy((char*)block_.data() + offset_, s.data(), size); 113 | offset_ += size; 114 | } else { 115 | succ_ = false; 116 | } 117 | } 118 | return *this; 119 | } 120 | 121 | template 122 | bool BlockWriter::write_uint(T v) 123 | { 124 | assert(offset_ <= block_.size()); 125 | if (offset_ + sizeof(T) <= block_.size()) { 126 | *(T*)(block_.data() + offset_) = v; 127 | offset_ += sizeof(T); 128 | return true; 129 | } else { 130 | return false; 131 | } 132 | } 133 | -------------------------------------------------------------------------------- /fs/file.h: -------------------------------------------------------------------------------- 1 | #ifndef _YODB_FILE_H_ 2 | #define _YODB_FILE_H_ 3 | 4 | #include "sys/thread.h" 5 | #include "sys/mutex.h" 6 | #include "sys/condition.h" 7 | #include "util/slice.h" 8 | #include "util/logger.h" 9 | 10 | #include 11 | #include 12 | #include 13 | 14 | namespace yodb { 15 | 16 | #define MAX_AIO_EVENTS 100 17 | 18 | struct Status { 19 | Status() : succ(false), size(0) {} 20 | bool succ; 21 | size_t size; 22 | }; 23 | 24 | // blocking i/o request 25 | class BIORequest { 26 | public: 27 | BIORequest() 28 | : mutex(), cond(mutex), status() {} 29 | 30 | void complete(Status stat) 31 | { 32 | mutex.lock(); 33 | status = stat; 34 | mutex.unlock(); 35 | cond.notify(); 36 | } 37 | 38 | Mutex mutex; 39 | CondVar cond; 40 | Status status; 41 | }; 42 | 43 | // asynchronous i/o request 44 | class AIORequest { 45 | public: 46 | typedef 
boost::function Callback; 47 | 48 | size_t size; 49 | Callback callback; 50 | 51 | virtual void complete(int result) = 0; 52 | 53 | virtual ~AIORequest() {} 54 | }; 55 | 56 | class AIOReadRequest : public AIORequest, boost::noncopyable { 57 | public: 58 | void complete(int result) 59 | { 60 | Status status; 61 | 62 | if (result < 0) { 63 | LOG_ERROR << "AIOFile::async_read error: " << strerror(-result); 64 | status.succ = false; 65 | } else { 66 | status.succ = true; 67 | status.size = result; 68 | } 69 | 70 | callback(status); 71 | } 72 | }; 73 | 74 | class AIOWriteRequest : public AIORequest, boost::noncopyable { 75 | public: 76 | void complete(int result) 77 | { 78 | Status status; 79 | 80 | if (result < 0) { 81 | LOG_ERROR << "AIOFile::async_write error: " << strerror(-result); 82 | status.succ = false; 83 | } else if (result < static_cast(size)) { 84 | LOG_ERROR << "AIOFile::async_write incomplete, " 85 | << Fmt("expected=%zu, ", size) 86 | << Fmt("actually=%zu.", result); 87 | status.succ = false; 88 | status.size = result; 89 | } else { 90 | status.succ = true; 91 | status.size = result; 92 | } 93 | 94 | callback(status); 95 | } 96 | }; 97 | 98 | class AIOFile : boost::noncopyable { 99 | public: 100 | AIOFile(const std::string& path); 101 | ~AIOFile(); 102 | 103 | bool open(); 104 | void close(); 105 | void truncate(uint64_t offset); 106 | 107 | Status read(uint64_t offset, Slice& buffer); 108 | Status write(uint64_t offset, const Slice& buffer); 109 | 110 | typedef AIORequest::Callback Callback; 111 | 112 | void async_read(uint64_t offset, Slice& buffer, Callback cb); 113 | void async_write(uint64_t offset, const Slice& buffer, Callback cb); 114 | 115 | private: 116 | void handle_io_complete(); 117 | 118 | std::string path_; 119 | int fd_; 120 | bool closed_; 121 | io_context_t ioctx_; 122 | Thread* thread_; 123 | }; 124 | 125 | } // namespace yodb 126 | 127 | #endif // _YODB_FILE_H_ 128 | 
-------------------------------------------------------------------------------- /tree/buffer_tree.cc: -------------------------------------------------------------------------------- 1 | #include "tree/buffer_tree.h" 2 | 3 | using namespace yodb; 4 | 5 | BufferTree::BufferTree(const std::string name, Options& opts, 6 | Cache* cache, Table* table) 7 | : name_(name), options_(opts), 8 | cache_(cache), table_(table), 9 | root_(NULL), node_count_(0), 10 | node_map_(), mutex_(), mutex_lock_path_() 11 | { 12 | } 13 | 14 | BufferTree::~BufferTree() 15 | { 16 | // root_ is always referenced 17 | if (root_) { 18 | root_->dec_ref(); 19 | assert(root_->refs() == 0); 20 | table_->set_root_nid(root_->nid()); 21 | } 22 | 23 | LOG_INFO << Fmt("%zu nodes created", node_count_); 24 | 25 | cache_->flush(); 26 | 27 | LOG_INFO << "BufferTree destructor finished"; 28 | } 29 | 30 | bool BufferTree::init() 31 | { 32 | cache_->integrate(this, table_); 33 | 34 | nid_t root_nid = table_->get_root_nid(); 35 | node_count_ = table_->get_node_count(); 36 | 37 | root_ = get_node_by_nid(root_nid); 38 | 39 | if (root_nid == NID_NIL) { 40 | assert(root_ == NULL); 41 | assert(node_count_ == 0); 42 | 43 | root_ = create_node(); 44 | root_->set_leaf(true); 45 | root_->create_first_pivot(); 46 | } 47 | 48 | return root_ != NULL; 49 | } 50 | 51 | void BufferTree::grow_up(Node* root) 52 | { 53 | ScopedMutex lock(mutex_); 54 | 55 | root_->dec_ref(); 56 | root_ = root; 57 | table_->set_root_nid(root_->nid()); 58 | } 59 | 60 | Node* BufferTree::create_node() 61 | { 62 | { 63 | ScopedMutex lock(mutex_); 64 | ++node_count_; 65 | } 66 | 67 | nid_t nid = node_count_; 68 | Node* node = new Node(this, nid); 69 | 70 | cache_->put(nid, node); 71 | 72 | return node; 73 | } 74 | 75 | Node* BufferTree::create_node(nid_t nid) 76 | { 77 | return new Node(this, nid); 78 | } 79 | 80 | Node* BufferTree::get_node_by_nid(nid_t nid) 81 | { 82 | return cache_->get(nid); 83 | } 84 | 85 | void BufferTree::lock_path(const Slice& 
key, std::vector& path) 86 | { 87 | ScopedMutex lock(mutex_lock_path_); 88 | 89 | Node* root = root_; 90 | root->inc_ref(); 91 | root->write_lock(); 92 | 93 | if (root != root_) { 94 | // Tree maybe grow up after we get the lock, 95 | // so we just give up if we miss this action. 96 | root->write_unlock(); 97 | root->dec_ref(); 98 | } else { 99 | root->lock_path(key, path); 100 | } 101 | } 102 | 103 | bool BufferTree::put(const Slice& key, const Slice& value) 104 | { 105 | assert(root_); 106 | 107 | // Tree maybe grow up after we insert a kv, 108 | // so we should use the copy of the root_ to 109 | // ensure dec_ref() right processed.(same as below) 110 | Node* root = root_; 111 | root->inc_ref(); 112 | bool succ = root->put(key, value); 113 | root->dec_ref(); 114 | 115 | return succ; 116 | } 117 | 118 | bool BufferTree::del(const Slice& key) 119 | { 120 | assert(root_); 121 | 122 | Node* root = root_; 123 | root->inc_ref(); 124 | bool succ = root->del(key); 125 | root->dec_ref(); 126 | 127 | return succ; 128 | } 129 | 130 | bool BufferTree::get(const Slice& key, Slice& value) 131 | { 132 | assert(root_); 133 | 134 | Node* root = root_; 135 | root->inc_ref(); 136 | bool succ = root->get(key, value); 137 | root->dec_ref(); 138 | 139 | return succ; 140 | } 141 | -------------------------------------------------------------------------------- /tree/msg.cc: -------------------------------------------------------------------------------- 1 | #include "tree/msg.h" 2 | 3 | using namespace yodb; 4 | 5 | MsgTable::MsgTable(Comparator* comparator) 6 | : list_(Compare(comparator)), 7 | comparator_(comparator), 8 | mutex_(), size_(0) 9 | { 10 | } 11 | 12 | MsgTable::~MsgTable() 13 | { 14 | Iterator iter(&list_); 15 | iter.seek_to_first(); 16 | 17 | while (iter.valid()) { 18 | Msg msg = iter.key(); 19 | msg.release(); 20 | iter.next(); 21 | } 22 | 23 | list_.clear(); 24 | } 25 | 26 | size_t MsgTable::count() 27 | { 28 | return list_.count(); 29 | } 30 | 31 | size_t 
MsgTable::size() 32 | { 33 | return 4 + size_; 34 | } 35 | 36 | size_t MsgTable::memory_usage() 37 | { 38 | return list_.memory_usage() + sizeof(MsgTable); 39 | } 40 | 41 | void MsgTable::clear() 42 | { 43 | assert(mutex_.is_locked_by_this_thread()); 44 | 45 | list_.clear(); 46 | size_ = 0; 47 | } 48 | 49 | void MsgTable::insert(const Msg& msg) 50 | { 51 | assert(mutex_.is_locked_by_this_thread()); 52 | 53 | Iterator iter(&list_); 54 | Msg got; 55 | bool release = false; 56 | iter.seek(msg); 57 | 58 | if (iter.valid()) { 59 | got = iter.key(); 60 | 61 | if (got.key() == msg.key()) { 62 | size_ -= got.size(); 63 | release = true; 64 | } 65 | } 66 | 67 | list_.insert(msg); 68 | size_ += msg.size(); 69 | 70 | if (release) 71 | got.release(); 72 | } 73 | 74 | void MsgTable::resize(size_t size) 75 | { 76 | assert(mutex_.is_locked_by_this_thread()); 77 | list_.resize(size); 78 | 79 | size_ = 0; 80 | Iterator iter(&list_); 81 | iter.seek_to_first(); 82 | 83 | while (iter.valid()) { 84 | size_ += iter.key().size(); 85 | iter.next(); 86 | } 87 | } 88 | 89 | bool MsgTable::find(Slice key, Msg& msg) 90 | { 91 | assert(mutex_.is_locked_by_this_thread()); 92 | 93 | Msg fake(_Nop, key); 94 | Iterator iter(&list_); 95 | 96 | iter.seek(fake); 97 | 98 | if (iter.valid() && iter.key().key() == key) { 99 | msg = iter.key(); 100 | return true; 101 | } 102 | 103 | return false; 104 | } 105 | 106 | bool MsgTable::constrcutor(BlockReader& reader) 107 | { 108 | assert(reader.ok()); 109 | 110 | ScopedMutex lock(mutex_); 111 | 112 | uint32_t count = 0; 113 | reader >> count; 114 | 115 | if (count == 0) return true; 116 | 117 | for (size_t i = 0; i < count; i++) { 118 | uint8_t type; 119 | Slice key, value; 120 | 121 | reader >> type >> key; 122 | if (type == Put) 123 | reader >> value; 124 | 125 | Msg msg((MsgType)type, key, value); 126 | list_.insert(msg); 127 | size_ += msg.size(); 128 | } 129 | 130 | return reader.ok(); 131 | } 132 | 133 | bool MsgTable::destructor(BlockWriter& writer) 
134 | { 135 | assert(writer.ok()); 136 | 137 | ScopedMutex lock(mutex_); 138 | 139 | uint32_t count = list_.count(); 140 | writer << count; 141 | 142 | Iterator iter(&list_); 143 | iter.seek_to_first(); 144 | 145 | while (iter.valid()) { 146 | Msg msg = iter.key(); 147 | uint8_t type = msg.type(); 148 | 149 | writer << type << msg.key(); 150 | if (type == Put) 151 | writer << msg.value(); 152 | 153 | count--; 154 | iter.next(); 155 | } 156 | assert(count == 0); 157 | 158 | return writer.ok(); 159 | } 160 | -------------------------------------------------------------------------------- /tree/node.h: -------------------------------------------------------------------------------- 1 | #ifndef _YODB_NODE_H_ 2 | #define _YODB_NODE_H_ 3 | 4 | #include "tree/msg.h" 5 | #include "sys/rwlock.h" 6 | #include "util/timestamp.h" 7 | #include "util/slice.h" 8 | #include "util/block.h" 9 | #include "util/logger.h" 10 | 11 | #include 12 | #include 13 | 14 | namespace yodb { 15 | 16 | typedef uint64_t nid_t; 17 | 18 | #define NID_NIL ((nid_t)0) 19 | 20 | class BufferTree; 21 | 22 | class Pivot { 23 | public: 24 | Pivot() {} 25 | Pivot(nid_t child, MsgTable* mbuf, Slice key = Slice()) 26 | : table(mbuf), child_nid(child), left_most_key(key) {} 27 | 28 | MsgTable* table; 29 | nid_t child_nid; 30 | Slice left_most_key; 31 | }; 32 | 33 | class Node { 34 | public: 35 | Node(BufferTree* tree, nid_t self); 36 | ~Node(); 37 | 38 | void create_first_pivot(); 39 | 40 | bool get(const Slice& key, Slice& value, Node* parent = NULL); 41 | 42 | bool put(const Slice& key, const Slice& value); 43 | 44 | bool del(const Slice& key); 45 | 46 | bool write(const Msg& msg); 47 | 48 | size_t size(); 49 | size_t write_back_size(); 50 | 51 | nid_t nid(); 52 | void set_nid(nid_t nid); 53 | void set_leaf(bool leaf); 54 | 55 | void read_lock() { rwlock_.read_lock(); } 56 | void read_unlock() { rwlock_.read_unlock(); } 57 | 58 | void write_lock() { rwlock_.write_lock(); } 59 | void write_unlock() { 
rwlock_.write_unlock(); } 60 | 61 | bool try_read_lock() { return rwlock_.try_read_lock(); } 62 | bool try_write_lock() { return rwlock_.try_write_lock(); } 63 | 64 | void set_dirty(bool modified); 65 | bool dirty(); 66 | 67 | void set_flushing(bool flushing); 68 | bool flushing(); 69 | 70 | size_t refs(); 71 | void inc_ref(); 72 | void dec_ref(); 73 | 74 | Timestamp get_first_write_timestamp(); 75 | Timestamp get_last_used_timestamp(); 76 | 77 | bool constrcutor(BlockReader& reader); 78 | bool destructor(BlockWriter& writer); 79 | 80 | void lock_path(const Slice& key, std::vector& path); 81 | 82 | private: 83 | // when the leaf node's number of pivot is out of limit, 84 | // it then will split the node and push up the split operation. 85 | void try_split_node(std::vector& path); 86 | 87 | // find which pivot matches the key 88 | size_t find_pivot(Slice key); 89 | 90 | void add_pivot(nid_t child, MsgTable* table, Slice key); 91 | 92 | // maybe push down or split the table 93 | void maybe_push_down_or_split(); 94 | 95 | // internal node would push down the table when it is full 96 | void push_down(MsgTable* table, Node* parent); 97 | 98 | // only the leaf node would split table when it is full 99 | void split_table(MsgTable* table); 100 | 101 | void insert_msg(size_t index, const Msg& msg); 102 | 103 | typedef std::vector Container; 104 | 105 | void push_down_locked(MsgTable* table, Node* parent); 106 | 107 | void optional_lock() { is_leaf_ ? write_lock() : read_lock(); } 108 | void optional_unlock() { is_leaf_ ? 
write_unlock() : read_unlock(); } 109 | 110 | private: 111 | BufferTree* tree_; 112 | nid_t self_nid_; 113 | bool is_leaf_; 114 | size_t refcnt_; 115 | 116 | Container pivots_; 117 | Mutex pivots_mutex_; 118 | 119 | RWLock rwlock_; 120 | 121 | Mutex mutex_; 122 | bool dirty_; 123 | bool flushing_; 124 | Timestamp first_write_timestamp_; 125 | Timestamp last_used_timestamp_; 126 | }; 127 | 128 | } // namespace yodb 129 | 130 | #endif // _YODB_NODE_H_ 131 | -------------------------------------------------------------------------------- /util/block.h: -------------------------------------------------------------------------------- 1 | #ifndef _YODB_BLOCK_H_ 2 | #define _YODB_BLOCK_H_ 3 | 4 | #include "util/slice.h" 5 | #include 6 | #include 7 | #include 8 | 9 | namespace yodb { 10 | 11 | const int kSmallBuffer = 4000; 12 | const int kLargeBuffer = 4000 * 1000; 13 | 14 | template 15 | class FixedBlock { 16 | public: 17 | FixedBlock() 18 | : offset_(0) {} 19 | 20 | size_t avail() 21 | { 22 | return SIZE - offset_; 23 | } 24 | 25 | void append(const Slice& slice) 26 | { 27 | assert(avail() > slice.size()); 28 | memcpy(buffer_ + offset_, slice.data(), slice.size()); 29 | offset_ += slice.size(); 30 | } 31 | 32 | void append(const char* s, size_t len) 33 | { 34 | assert(avail() > len); 35 | memcpy(buffer_ + offset_, s, len); 36 | offset_ += len; 37 | } 38 | 39 | Slice buffer() 40 | { 41 | assert(offset_ < SIZE); 42 | return Slice(buffer_, offset_); 43 | } 44 | private: 45 | char buffer_[SIZE]; 46 | size_t offset_; 47 | }; 48 | 49 | class Block : boost::noncopyable { 50 | public: 51 | Block(Slice slice) 52 | : buffer_(slice), offset_(0), size_(slice.size()) {} 53 | 54 | // a block maybe only a part of the slice 55 | Block(Slice slice, size_t offset, size_t size) 56 | : buffer_(slice), offset_(offset), size_(size) 57 | { 58 | assert(size <= slice.size()); 59 | assert(offset + size <= slice.size()); 60 | } 61 | 62 | // return data in the range of the block 63 | const char* 
data() { return buffer_.data() + offset_; } 64 | size_t size() { return size_; } 65 | size_t avail() { return size_ - offset_; } 66 | Slice buffer() { return buffer_; } 67 | 68 | private: 69 | Slice buffer_; 70 | size_t offset_; 71 | size_t size_; 72 | }; 73 | 74 | class BlockReader : boost::noncopyable { 75 | public: 76 | explicit BlockReader(Block& block) 77 | : block_(block), offset_(0), succ_(true) 78 | { 79 | } 80 | 81 | bool ok() { return succ_; } 82 | 83 | typedef BlockReader self; 84 | 85 | self& operator>>(bool& v) { 86 | uint8_t val; 87 | 88 | *this >> val; 89 | if (val) v = true; 90 | else v = false; 91 | 92 | return *this; 93 | } 94 | 95 | self& operator>>(uint8_t& v); 96 | self& operator>>(uint16_t& v); 97 | self& operator>>(uint32_t& v); 98 | self& operator>>(uint64_t& v); 99 | self& operator>>(Slice& s); 100 | 101 | private: 102 | template 103 | bool read_uint(T& v); 104 | 105 | Block& block_; 106 | size_t offset_; 107 | bool succ_; 108 | }; 109 | 110 | class BlockWriter : boost::noncopyable { 111 | public: 112 | explicit BlockWriter(Block& block) 113 | : block_(block), offset_(0), succ_(true) 114 | { 115 | } 116 | 117 | bool ok() { return succ_; } 118 | 119 | typedef BlockWriter self; 120 | 121 | self& operator<<(bool v) { 122 | uint8_t val; 123 | 124 | if (v) val = 1; 125 | else val = 0; 126 | 127 | *this << val; 128 | return *this; 129 | } 130 | 131 | self& operator<<(uint8_t v); 132 | self& operator<<(uint16_t v); 133 | self& operator<<(uint32_t v); 134 | self& operator<<(uint64_t v); 135 | self& operator<<(const Slice& s); 136 | 137 | private: 138 | template 139 | bool write_uint(T v); 140 | 141 | Block& block_; 142 | size_t offset_; 143 | bool succ_; 144 | }; 145 | 146 | } // namespace yodb 147 | 148 | #endif // _YODB_BLOCK_H_ 149 | -------------------------------------------------------------------------------- /test/rwlock_test.cc: -------------------------------------------------------------------------------- 1 | #include "sys/rwlock.h" 2 | 
#include "sys/thread.h" 3 | #include "util/timestamp.h" 4 | 5 | #include 6 | #include 7 | 8 | #include 9 | 10 | using namespace yodb; 11 | 12 | static const int kCount = 1000 * 10; 13 | static const int kMaxThreads = 4; 14 | 15 | RWLock rwlock; 16 | Mutex g_mutex; 17 | pthread_rwlock_t g_lock; 18 | 19 | void rwlock_read() 20 | { 21 | for (int i = 0; i < kCount; i++) { 22 | rwlock.read_lock(); 23 | usleep(100); 24 | rwlock.read_unlock(); 25 | } 26 | } 27 | 28 | void rwlock_write() 29 | { 30 | for (int i = 0; i < kCount; i++) { 31 | rwlock.write_lock(); 32 | usleep(100); 33 | rwlock.write_unlock(); 34 | } 35 | } 36 | 37 | void pthread_rwlock_read() 38 | { 39 | for (int i = 0; i < kCount; i++) { 40 | pthread_rwlock_rdlock(&g_lock); 41 | usleep(100); 42 | pthread_rwlock_unlock(&g_lock); 43 | } 44 | } 45 | 46 | void pthread_rwlock_write() 47 | { 48 | for (int i = 0; i < kCount; i++) { 49 | pthread_rwlock_wrlock(&g_lock); 50 | usleep(100); 51 | pthread_rwlock_unlock(&g_lock); 52 | } 53 | } 54 | 55 | void mutex_read() 56 | { 57 | for (int i = 0; i < kCount; i++) { 58 | ScopedMutex lock(g_mutex); 59 | usleep(100); 60 | } 61 | } 62 | 63 | void mutex_write() 64 | { 65 | for (int i = 0; i < kCount; i++) { 66 | ScopedMutex lock(g_mutex); 67 | usleep(100); 68 | } 69 | } 70 | 71 | int main() 72 | { 73 | Timestamp start, finish; 74 | boost::ptr_vector threads; 75 | 76 | // pthread_rwlock_t 77 | pthread_rwlock_init(&g_lock, NULL); 78 | 79 | for (int i = 0; i < kMaxThreads / 2; i++) 80 | threads.push_back(new Thread(&pthread_rwlock_read)); 81 | for (int i = kMaxThreads / 2; i < kMaxThreads; i++) 82 | threads.push_back(new Thread(&pthread_rwlock_write)); 83 | 84 | start = Timestamp::now(); 85 | for (int i = 0; i < kMaxThreads; i++) 86 | threads[i].run(); 87 | for (int i = 0; i < kMaxThreads; i++) 88 | threads[i].join(); 89 | finish = Timestamp::now(); 90 | 91 | double time = time_interval(finish, start); 92 | printf("POSIX: %d threads, %d readers, %d writers, costs = %f\n", 93 | 
kMaxThreads, kMaxThreads / 2, kMaxThreads - kMaxThreads / 2, time); 94 | 95 | pthread_rwlock_destroy(&g_lock); 96 | 97 | // Mutex 98 | threads.clear(); 99 | 100 | for (int i = 0; i < kMaxThreads / 2; i++) 101 | threads.push_back(new Thread(&mutex_read)); 102 | for (int i = kMaxThreads / 2; i < kMaxThreads; i++) 103 | threads.push_back(new Thread(&mutex_write)); 104 | 105 | start = Timestamp::now(); 106 | for (int i = 0; i < kMaxThreads; i++) 107 | threads[i].run(); 108 | for (int i = 0; i < kMaxThreads; i++) 109 | threads[i].join(); 110 | finish = Timestamp::now(); 111 | 112 | time = time_interval(finish, start); 113 | printf("Mutex: %d threads, %d readers, %d writers, costs = %f\n", 114 | kMaxThreads, kMaxThreads / 2, kMaxThreads - kMaxThreads / 2, time); 115 | 116 | // RWLock 117 | threads.clear(); 118 | 119 | for (int i = 0; i < kMaxThreads / 2; i++) 120 | threads.push_back(new Thread(&rwlock_read)); 121 | for (int i = kMaxThreads / 2; i < kMaxThreads; i++) 122 | threads.push_back(new Thread(&rwlock_write)); 123 | 124 | start = Timestamp::now(); 125 | for (int i = 0; i < kMaxThreads; i++) 126 | threads[i].run(); 127 | for (int i = 0; i < kMaxThreads; i++) 128 | threads[i].join(); 129 | finish = Timestamp::now(); 130 | 131 | time = time_interval(finish, start); 132 | printf("RWLock: %d threads, %d readers, %d writers, costs = %f\n", 133 | kMaxThreads, kMaxThreads / 2, kMaxThreads - kMaxThreads / 2, time); 134 | } 135 | -------------------------------------------------------------------------------- /test/table_test.cc: -------------------------------------------------------------------------------- 1 | #include "fs/file.h" 2 | #include "fs/table.h" 3 | #include "fs/env.h" 4 | #include "sys/mutex.h" 5 | #include "sys/thread.h" 6 | 7 | #include 8 | #include 9 | #include 10 | 11 | using namespace yodb; 12 | 13 | const size_t min_page_size = 1; 14 | const size_t max_page_size = 32 * 1024; 15 | const uint64_t kMaxCount = 1000; 16 | 17 | Table* table; 18 | std::map 
g_result; 19 | std::map g_blocks; 20 | Mutex g_result_mutex; 21 | Mutex g_blocks_mutex; 22 | 23 | void async_write_handler(nid_t nid, Status status) 24 | { 25 | ScopedMutex lock(g_result_mutex); 26 | g_result[nid] = status.succ; 27 | } 28 | 29 | void async_write_test(size_t count) 30 | { 31 | srand(0); 32 | for (nid_t i = kMaxCount * count; i < kMaxCount * (count + 1); i++) { 33 | size_t size = min_page_size + rand() % (max_page_size - min_page_size); 34 | Slice buffer = table->self_alloc(size); 35 | Block* block = new Block(buffer, 0, size); 36 | 37 | { 38 | ScopedMutex lock(g_blocks_mutex); 39 | g_blocks[i] = block; 40 | } 41 | BlockWriter writer(*block); 42 | 43 | for (size_t j = 0; j < size; j++) 44 | writer << (uint8_t)(i & 0xFF); 45 | 46 | table->async_write(i, *block, size, boost::bind(&async_write_handler, i, _1)); 47 | } 48 | } 49 | 50 | void read_test(size_t n) 51 | { 52 | for (nid_t i = 0; i < n; i++) { 53 | Block* block = table->read(i); 54 | assert(block); 55 | assert(g_blocks[i]->size() == block->size()); 56 | assert(memcmp(g_blocks[i]->data(), block->data(), block->size()) == 0); 57 | table->self_dealloc(block->buffer()); 58 | } 59 | } 60 | 61 | void release() 62 | { 63 | std::map::iterator iter; 64 | for (iter = g_blocks.begin(); iter != g_blocks.end(); iter++) { 65 | table->self_dealloc(iter->second->buffer()); 66 | delete iter->second; 67 | } 68 | g_blocks.clear(); 69 | } 70 | 71 | int main() 72 | { 73 | Env env("./"); 74 | AIOFile* file = env.open_aio_file("table_test"); 75 | 76 | table = new Table(file, 0); 77 | table->init(true); 78 | 79 | ///////////////////////////////////////////////////// 80 | // 1st test 81 | async_write_test(0); 82 | while (g_result.size() != kMaxCount) 83 | usleep(1000); 84 | 85 | for (size_t i = 0; i < g_result.size(); i++) 86 | assert(g_result[i]); 87 | 88 | uint64_t file_size = table->size(); 89 | delete table; 90 | 91 | table = new Table(file, file_size); 92 | table->init(false); 93 | read_test(kMaxCount); 94 | 95 | 
///////////////////////////////////////////////////// 96 | // 2nd test 97 | release(); 98 | 99 | g_result.clear(); 100 | 101 | async_write_test(0); 102 | while (g_result.size() != kMaxCount) 103 | usleep(1000); 104 | 105 | for (size_t i = 0; i < g_result.size(); i++) 106 | assert(g_result[i]); 107 | 108 | table->flush_right_now(); 109 | read_test(kMaxCount); 110 | 111 | file_size = table->size(); 112 | release(); 113 | delete table; 114 | 115 | ///////////////////////////////////////////////////// 116 | // multithreading environment test 117 | table = new Table(file, file_size); 118 | table->init(false); 119 | 120 | Thread thr1(boost::bind(&async_write_test, 0)); 121 | Thread thr2(boost::bind(&async_write_test, 1)); 122 | 123 | thr1.run(); 124 | thr2.run(); 125 | 126 | table->flush_right_now(); 127 | thr1.join(); 128 | thr2.join(); 129 | 130 | while (g_result.size() != 2 * kMaxCount) 131 | usleep(1000); 132 | 133 | for (size_t i = 0; i < g_result.size(); i++) 134 | assert(g_result[i]); 135 | 136 | assert(table->get_node_count() == 2 * kMaxCount); 137 | read_test(2 * kMaxCount); 138 | 139 | release(); 140 | delete table; 141 | } 142 | -------------------------------------------------------------------------------- /test/dbimpl_test.cc: -------------------------------------------------------------------------------- 1 | #include "db/db_impl.h" 2 | #include "db/options.h" 3 | #include "sys/thread.h" 4 | #include "util/slice.h" 5 | #include "util/timestamp.h" 6 | #include "util/logger.h" 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | using namespace yodb; 13 | 14 | const uint64_t kCount = 1000 * 1000; 15 | 16 | DBImpl* g_db; 17 | 18 | void thr_put1() 19 | { 20 | for (uint64_t i = 0; i < kCount; i++) { 21 | char buffer[100]; 22 | sprintf(buffer, "%16ld", i); 23 | 24 | Slice value(buffer, strlen(buffer)); 25 | Slice key = value; 26 | 27 | g_db->put(key, value); 28 | } 29 | } 30 | 31 | void thr_put2() 32 | { 33 | for (uint64_t i = kCount; i < kCount * 2; 
i++) { 34 | char buffer[100]; 35 | sprintf(buffer, "%16ld", i); 36 | 37 | Slice value(buffer, strlen(buffer)); 38 | Slice key = value; 39 | 40 | g_db->put(key, value); 41 | } 42 | } 43 | 44 | void thr_put3() 45 | { 46 | for (uint64_t i = kCount * 2; i < kCount * 3; i++) { 47 | char buffer[16] = {0}; 48 | sprintf(buffer, "%08ld", i); 49 | 50 | Slice value(buffer, strlen(buffer)); 51 | Slice key = value; 52 | 53 | g_db->put(key, value); 54 | } 55 | } 56 | 57 | void thr_put4() 58 | { 59 | for (uint64_t i = kCount * 3; i < kCount * 4; i++) { 60 | char buffer[16] = {0}; 61 | sprintf(buffer, "%08ld", i); 62 | 63 | Slice value(buffer, strlen(buffer)); 64 | Slice key = value; 65 | 66 | g_db->put(key, value); 67 | } 68 | } 69 | 70 | void get_thr(uint64_t x, uint64_t y) 71 | { 72 | uint64_t count = 0; 73 | for (uint64_t i = x; i < y; i++) { 74 | char buffer[16] = {0}; 75 | sprintf(buffer, "%08ld", i); 76 | 77 | Slice target_value(buffer, strlen(buffer)); 78 | Slice key = target_value; 79 | 80 | Slice get_value; 81 | g_db->get(key, get_value); 82 | 83 | if (target_value.compare(get_value) != 0) 84 | count++; 85 | 86 | if (get_value.size()) 87 | get_value.release(); 88 | } 89 | 90 | LOG_INFO << Fmt("read %zu records, ", y - x) << Fmt("%zu failed", count); 91 | } 92 | 93 | int main() 94 | { 95 | Options opts; 96 | opts.comparator = new BytewiseComparator(); 97 | opts.max_node_child_number = 16; 98 | opts.max_node_msg_count = 10240; 99 | opts.cache_limited_memory = 1 << 28; 100 | opts.env = new Env("/home/kedebug/develop/yodb/bin"); 101 | 102 | g_db = new DBImpl("third", opts); 103 | g_db->init(); 104 | 105 | boost::ptr_vector threads; 106 | threads.push_back(new Thread(&thr_put1)); 107 | // threads.push_back(new Thread(&thr_put2)); 108 | // threads.push_back(new Thread(&thr_put3)); 109 | // threads.push_back(new Thread(&thr_put4)); 110 | 111 | Timestamp start, finish; 112 | 113 | start = Timestamp::now(); 114 | for (size_t i = 0; i < threads.size(); i++) 115 | threads[i].run(); 
116 | 117 | for (size_t i = 0; i < threads.size(); i++) 118 | threads[i].join(); 119 | finish = Timestamp::now(); 120 | 121 | LOG_INFO << Fmt("%ld puts, ", threads.size() * kCount) 122 | << Fmt("cost = %f", time_interval(finish, start)); 123 | 124 | threads.clear(); 125 | delete g_db; 126 | 127 | g_db = new DBImpl("third", opts); 128 | g_db->init(); 129 | 130 | threads.push_back(new Thread(boost::bind(&get_thr, 0, kCount))); 131 | // threads.push_back(new Thread(boost::bind(&get_thr, kCount, kCount * 2))); 132 | // threads.push_back(new Thread(boost::bind(&get_thr, kCount * 2, kCount * 3))); 133 | // threads.push_back(new Thread(boost::bind(&get_thr, kCount * 3, kCount * 4))); 134 | 135 | start = Timestamp::now(); 136 | for (size_t i = 0; i < threads.size(); i++) 137 | threads[i].run(); 138 | 139 | for (size_t i = 0; i < threads.size(); i++) 140 | threads[i].join(); 141 | finish = Timestamp::now(); 142 | 143 | LOG_INFO << Fmt("%ld get, ", threads.size() * kCount) 144 | << Fmt("cost = %f", time_interval(finish, start)); 145 | 146 | delete g_db; 147 | delete opts.env; 148 | delete opts.comparator; 149 | } 150 | -------------------------------------------------------------------------------- /fs/table.h: -------------------------------------------------------------------------------- 1 | #ifndef _YODB_TABLE_H_ 2 | #define _YODB_TABLE_H_ 3 | 4 | #include "fs/file.h" 5 | #include "tree/node.h" 6 | #include "util/block.h" 7 | 8 | #include 9 | #include 10 | #include 11 | 12 | #include 13 | #include 14 | 15 | namespace yodb { 16 | 17 | #define PAGE_SIZE 4096 18 | 19 | #define PAGE_ROUND_UP(x) (((x) + PAGE_SIZE - 1) & (~(PAGE_SIZE - 1))) 20 | #define PAGE_ROUND_DOWN(x) ((x) & (~(PAGE_SIZE - 1))) 21 | #define PAGE_ROUNDED(x) ((x) == (PAGE_ROUND_DOWN(x))) 22 | 23 | #define BOOTSTRAP_SIZE PAGE_SIZE 24 | 25 | struct BlockHandle { 26 | BlockHandle() : offset(0), size(0) {} 27 | 28 | uint64_t offset; 29 | uint32_t size; 30 | }; 31 | 32 | class Bootstrap { 33 | public: 34 | 
Bootstrap() : header(), root_nid(NID_NIL) {} 35 | 36 | BlockHandle header; 37 | nid_t root_nid; 38 | }; 39 | 40 | // Table for permanent storage 41 | class Table : boost::noncopyable { 42 | public: 43 | Table(AIOFile* file, uint64_t file_size); 44 | ~Table(); 45 | 46 | bool init(bool create = false); 47 | 48 | // Get node's block information marked by nid. 49 | Block* read(nid_t nid); 50 | 51 | typedef boost::function Callback; 52 | 53 | // Asynchoronous write file, this will be always called by Cache module. 54 | void async_write(nid_t nid, Block& block, Callback cb); 55 | 56 | bool flush_bootstrap(); 57 | bool load_bootstrap(); 58 | 59 | bool flush_header(); 60 | bool load_header(); 61 | 62 | // Flush all the buffers to file. 63 | bool flush(); 64 | bool flush_immediately(); 65 | 66 | void init_holes(); 67 | 68 | // Holes are unused parts of our file, we collect it for further usage. 69 | void add_hole(uint64_t offset, uint32_t size); 70 | 71 | // Whether we can get suitable room from hole list. 72 | // This will be always called by find_space(). 73 | bool get_hole(uint32_t size, uint64_t& offset); 74 | 75 | // Fly holes are collected from asynchoronous write calls: 76 | // new rooms allocated and old rooms become fly holes. 77 | // There will still be some readers since we are in multithreading environment. 78 | // Fly holes will be able to use if we invoke flush_fly_holes(). 79 | void add_fly_hole(uint64_t offset, uint32_t size); 80 | 81 | void flush_fly_holes(size_t fly_holes); 82 | 83 | nid_t get_root_nid() { return bootstrap_.root_nid; } 84 | void set_root_nid(nid_t nid) { bootstrap_.root_nid = nid; } 85 | 86 | size_t get_node_count() 87 | { 88 | ScopedMutex lock(block_entry_mutex_); 89 | return block_entry_.size(); 90 | } 91 | 92 | // Get size of all the block handle, this will be always called by flush_index(). 93 | uint32_t block_header_size(); 94 | 95 | // Give a block handle, returns the the block you needed. 
96 | Block* read_block(const BlockHandle* handle); 97 | 98 | // Get suitable room for size, return the offset of our file. 99 | uint64_t find_space(uint32_t size); 100 | 101 | // Synchoronous read file 102 | bool read_file(uint64_t offset, Slice& buffer); 103 | // Synchoronous write file 104 | bool write_file(uint64_t offset, const Slice& buffer); 105 | 106 | void truncate(); 107 | 108 | // We use posix_memalign() to allocate aligned buffer, this is efficiency 109 | // since we are using asynchoronous read/write functions. 110 | Slice self_alloc(size_t size); 111 | 112 | // posix_memalign() allocated buffer should use free() to deallocate. 113 | void self_dealloc(Slice alloc_ptr); 114 | 115 | // size() function is seldom used, we remain it for debug reason. 116 | uint64_t size() { return file_size_; } 117 | 118 | private: 119 | AIOFile* file_; 120 | uint64_t file_size_; 121 | uint64_t offset_; 122 | Mutex mutex_; 123 | 124 | Bootstrap bootstrap_; 125 | 126 | typedef std::map BlockEntry; 127 | BlockEntry block_entry_; 128 | Mutex block_entry_mutex_; 129 | 130 | struct AsyncWriteContext { 131 | nid_t nid; 132 | Callback callback; 133 | BlockHandle handle; 134 | }; 135 | 136 | void async_write_handler(AsyncWriteContext* context, Status status); 137 | 138 | struct Hole { 139 | uint64_t offset; 140 | uint32_t size; 141 | }; 142 | 143 | uint32_t fly_readers_; 144 | uint32_t fly_writers_; 145 | 146 | typedef std::deque HoleList; 147 | // the holes is sorted by offset 148 | HoleList hole_list_; 149 | HoleList fly_hole_list_; 150 | 151 | Mutex hole_list_mutex_; 152 | Mutex fly_hole_list_mutex_; 153 | }; 154 | 155 | } // namespace yodb 156 | 157 | #endif // _YODB_TABLE_H_ 158 | -------------------------------------------------------------------------------- /fs/file.cc: -------------------------------------------------------------------------------- 1 | #include "fs/file.h" 2 | #include 3 | #include 4 | 5 | using namespace yodb; 6 | 7 | AIOFile::AIOFile(const std::string& 
path) 8 | : path_(path), fd_(-1), 9 | closed_(false), ioctx_(0), thread_(NULL) 10 | { 11 | } 12 | 13 | AIOFile::~AIOFile() 14 | { 15 | close(); 16 | } 17 | 18 | bool AIOFile::open() 19 | { 20 | fd_ = ::open(path_.c_str(), O_RDWR | O_DIRECT | O_CREAT, 0644); 21 | if (fd_ == -1) { 22 | LOG_ERROR << "open file " << path_ << " error: " << strerror(errno); 23 | return false; 24 | } 25 | 26 | int status = io_setup(MAX_AIO_EVENTS, &ioctx_); 27 | if (status < 0) { 28 | LOG_ERROR << "io_setup error: " << strerror(status); 29 | ::close(fd_); 30 | return false; 31 | } 32 | 33 | thread_ = new Thread(boost::bind(&AIOFile::handle_io_complete, this)); 34 | assert(thread_); 35 | thread_->run(); 36 | 37 | return true; 38 | } 39 | 40 | void AIOFile::close() 41 | { 42 | if (closed_ == false) { 43 | closed_ = true; 44 | thread_->join(); 45 | delete thread_; 46 | 47 | int status = io_destroy(ioctx_); 48 | if (status < 0) 49 | LOG_ERROR << "io_destroy error: " << strerror(status); 50 | 51 | ::close(fd_); 52 | } 53 | } 54 | 55 | void AIOFile::truncate(uint64_t offset) 56 | { 57 | if (ftruncate(fd_, offset) < 0) 58 | LOG_ERROR << "ftruncate error: " << strerror(errno); 59 | } 60 | 61 | Status AIOFile::read(uint64_t offset, Slice& buffer) 62 | { 63 | BIORequest* request = new BIORequest(); 64 | 65 | request->mutex.lock(); 66 | async_read(offset, buffer, boost::bind(&BIORequest::complete, request, _1)); 67 | assert(request->mutex.is_locked_by_this_thread()); 68 | request->cond.wait(); 69 | 70 | Status stat = request->status; 71 | request->mutex.unlock(); 72 | 73 | delete request; 74 | return stat; 75 | } 76 | 77 | Status AIOFile::write(uint64_t offset, const Slice& buffer) 78 | { 79 | BIORequest* request = new BIORequest(); 80 | 81 | request->mutex.lock(); 82 | async_write(offset, buffer, boost::bind(&BIORequest::complete, request, _1)); 83 | assert(request->mutex.is_locked_by_this_thread()); 84 | request->cond.wait(); 85 | 86 | Status stat = request->status; 87 | request->mutex.unlock(); 
88 | 89 | delete request; 90 | return stat; 91 | } 92 | 93 | void AIOFile::async_read(uint64_t offset, Slice& buffer, Callback cb) 94 | { 95 | struct iocb iocb; 96 | struct iocb* iocbs = &iocb; 97 | 98 | AIORequest* request = new AIOReadRequest(); 99 | request->size = buffer.size(); 100 | request->callback = cb; 101 | 102 | io_prep_pread(&iocb, fd_, (void*)buffer.data(), buffer.size(), offset); 103 | iocb.data = request; 104 | 105 | int status; 106 | do { 107 | status = io_submit(ioctx_, 1, &iocbs); 108 | 109 | if (-status == EAGAIN) { 110 | usleep(1000); 111 | } else if (status < 0) { 112 | LOG_ERROR << "io_submit error: " << strerror(-status); 113 | request->complete(status); 114 | delete request; 115 | break; 116 | } 117 | } while (status < 0); 118 | } 119 | 120 | void AIOFile::async_write(uint64_t offset, const Slice& buffer, Callback cb) 121 | { 122 | struct iocb iocb; 123 | struct iocb* iocbs = &iocb; 124 | 125 | AIORequest* request = new AIOWriteRequest(); 126 | request->size = buffer.size(); 127 | request->callback = cb; 128 | 129 | io_prep_pwrite(&iocb, fd_, (void*)buffer.data(), buffer.size(), offset); 130 | iocb.data = request; 131 | 132 | int status; 133 | do { 134 | status = io_submit(ioctx_, 1, &iocbs); 135 | 136 | if (-status == EAGAIN) { 137 | usleep(1000); 138 | } else if (status < 0) { 139 | LOG_ERROR << "io_submit error: " << strerror(-status); 140 | request->complete(status); 141 | delete request; 142 | break; 143 | } 144 | } while (status < 0); 145 | } 146 | 147 | void AIOFile::handle_io_complete() 148 | { 149 | while (!closed_) { 150 | struct io_event events[MAX_AIO_EVENTS]; 151 | memset(events, 0, sizeof(io_event) * MAX_AIO_EVENTS); 152 | 153 | struct timespec timeout; 154 | timeout.tv_sec = 0; 155 | timeout.tv_nsec = 100000000; 156 | 157 | int num_events = io_getevents(ioctx_, 1, MAX_AIO_EVENTS, events, &timeout); 158 | 159 | if (num_events < 0) { 160 | if (-num_events != EINTR) { 161 | LOG_ERROR << "io_getevents error: " << 
strerror(-num_events); 162 | break; 163 | } else { 164 | continue; 165 | } 166 | } 167 | 168 | for (int i = 0; i < num_events; i++) { 169 | AIORequest* req = static_cast(events[i].data); 170 | req->complete(events[i].res); 171 | delete req; 172 | } 173 | } 174 | } 175 | -------------------------------------------------------------------------------- /test/histogram.cc: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2011 The LevelDB Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style license that can be 3 | // found in the LICENSE file. See the AUTHORS file for names of contributors. 4 | 5 | // This file is copied from LevelDB and modifed a little 6 | // to add LevelDB style benchmark 7 | 8 | #include 9 | #include 10 | #include "histogram.h" 11 | 12 | namespace yodb { 13 | 14 | const double Histogram::kBucketLimit[kNumBuckets] = { 15 | 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 16, 18, 20, 25, 30, 35, 40, 45, 16 | 50, 60, 70, 80, 90, 100, 120, 140, 160, 180, 200, 250, 300, 350, 400, 450, 17 | 500, 600, 700, 800, 900, 1000, 1200, 1400, 1600, 1800, 2000, 2500, 3000, 18 | 3500, 4000, 4500, 5000, 6000, 7000, 8000, 9000, 10000, 12000, 14000, 19 | 16000, 18000, 20000, 25000, 30000, 35000, 40000, 45000, 50000, 60000, 20 | 70000, 80000, 90000, 100000, 120000, 140000, 160000, 180000, 200000, 21 | 250000, 300000, 350000, 400000, 450000, 500000, 600000, 700000, 800000, 22 | 900000, 1000000, 1200000, 1400000, 1600000, 1800000, 2000000, 2500000, 23 | 3000000, 3500000, 4000000, 4500000, 5000000, 6000000, 7000000, 8000000, 24 | 9000000, 10000000, 12000000, 14000000, 16000000, 18000000, 20000000, 25 | 25000000, 30000000, 35000000, 40000000, 45000000, 50000000, 60000000, 26 | 70000000, 80000000, 90000000, 100000000, 120000000, 140000000, 160000000, 27 | 180000000, 200000000, 250000000, 300000000, 350000000, 400000000, 28 | 450000000, 500000000, 600000000, 700000000, 800000000, 900000000, 29 | 
1000000000, 1200000000, 1400000000, 1600000000, 1800000000, 2000000000, 30 | 2500000000.0, 3000000000.0, 3500000000.0, 4000000000.0, 4500000000.0, 31 | 5000000000.0, 6000000000.0, 7000000000.0, 8000000000.0, 9000000000.0, 32 | 1e200, 33 | }; 34 | 35 | void Histogram::Clear() { 36 | min_ = kBucketLimit[kNumBuckets-1]; 37 | max_ = 0; 38 | num_ = 0; 39 | sum_ = 0; 40 | sum_squares_ = 0; 41 | for (int i = 0; i < kNumBuckets; i++) { 42 | buckets_[i] = 0; 43 | } 44 | } 45 | 46 | void Histogram::Add(double value) { 47 | // Linear search is fast enough for our usage in db_bench 48 | int b = 0; 49 | while (b < kNumBuckets - 1 && kBucketLimit[b] <= value) { 50 | b++; 51 | } 52 | buckets_[b] += 1.0; 53 | if (min_ > value) min_ = value; 54 | if (max_ < value) max_ = value; 55 | num_++; 56 | sum_ += value; 57 | sum_squares_ += (value * value); 58 | } 59 | 60 | void Histogram::Merge(const Histogram& other) { 61 | if (other.min_ < min_) min_ = other.min_; 62 | if (other.max_ > max_) max_ = other.max_; 63 | num_ += other.num_; 64 | sum_ += other.sum_; 65 | sum_squares_ += other.sum_squares_; 66 | for (int b = 0; b < kNumBuckets; b++) { 67 | buckets_[b] += other.buckets_[b]; 68 | } 69 | } 70 | 71 | double Histogram::Median() const { 72 | return Percentile(50.0); 73 | } 74 | 75 | double Histogram::Percentile(double p) const { 76 | double threshold = num_ * (p / 100.0); 77 | double sum = 0; 78 | for (int b = 0; b < kNumBuckets; b++) { 79 | sum += buckets_[b]; 80 | if (sum >= threshold) { 81 | // Scale linearly within this bucket 82 | double left_point = (b == 0) ? 
0 : kBucketLimit[b-1]; 83 | double right_point = kBucketLimit[b]; 84 | double left_sum = sum - buckets_[b]; 85 | double right_sum = sum; 86 | double pos = (threshold - left_sum) / (right_sum - left_sum); 87 | double r = left_point + (right_point - left_point) * pos; 88 | if (r < min_) r = min_; 89 | if (r > max_) r = max_; 90 | return r; 91 | } 92 | } 93 | return max_; 94 | } 95 | 96 | double Histogram::Average() const { 97 | if (num_ == 0.0) return 0; 98 | return sum_ / num_; 99 | } 100 | 101 | double Histogram::StandardDeviation() const { 102 | if (num_ == 0.0) return 0; 103 | double variance = (sum_squares_ * num_ - sum_ * sum_) / (num_ * num_); 104 | return sqrt(variance); 105 | } 106 | 107 | std::string Histogram::ToString() const { 108 | std::string r; 109 | char buf[200]; 110 | snprintf(buf, sizeof(buf), 111 | "Count: %.0f Average: %.4f StdDev: %.2f\n", 112 | num_, Average(), StandardDeviation()); 113 | r.append(buf); 114 | snprintf(buf, sizeof(buf), 115 | "Min: %.4f Median: %.4f Max: %.4f\n", 116 | (num_ == 0.0 ? 0.0 : min_), Median(), max_); 117 | r.append(buf); 118 | r.append("------------------------------------------------------\n"); 119 | const double mult = 100.0 / num_; 120 | double sum = 0; 121 | for (int b = 0; b < kNumBuckets; b++) { 122 | if (buckets_[b] <= 0.0) continue; 123 | sum += buckets_[b]; 124 | snprintf(buf, sizeof(buf), 125 | "[ %7.0f, %7.0f ) %7.0f %7.3f%% %7.3f%% ", 126 | ((b == 0) ? 0.0 : kBucketLimit[b-1]), // left 127 | kBucketLimit[b], // right 128 | buckets_[b], // count 129 | mult * buckets_[b], // percentage 130 | mult * sum); // cumulative percentage 131 | r.append(buf); 132 | 133 | // Add hash marks based on percentage; 20 marks for 100%. 
134 | int marks = static_cast(20*(buckets_[b] / num_) + 0.5); 135 | r.append(marks, '#'); 136 | r.push_back('\n'); 137 | } 138 | return r; 139 | } 140 | 141 | } // namespace leveldb 142 | -------------------------------------------------------------------------------- /tree/skiplist.h: -------------------------------------------------------------------------------- 1 | #ifndef _YODB_SKIPLIST_H_ 2 | #define _YODB_SKIPLIST_H_ 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | #include 9 | #include 10 | 11 | #include "util/arena.h" 12 | 13 | namespace yodb { 14 | 15 | class Arena; 16 | 17 | template 18 | class SkipList : boost::noncopyable { 19 | private: 20 | struct Node; 21 | public: 22 | explicit SkipList(Comparator cmp); 23 | 24 | void insert(const Key& key); 25 | bool contains(const Key& key) const; 26 | void erase(const Key& key); 27 | void resize(size_t size); 28 | void clear(); 29 | 30 | size_t count() const { return count_; } 31 | size_t memory_usage() const { return arena_.usage(); } 32 | 33 | class Iterator { 34 | public: 35 | explicit Iterator(const SkipList* list); 36 | 37 | bool valid() const; 38 | const Key& key() const; 39 | void next(); 40 | void prev(); 41 | 42 | void seek(const Key& target); 43 | void seek_to_first(); 44 | void seek_to_middle(); 45 | void seek_to_last(); 46 | 47 | private: 48 | const SkipList* list_; 49 | Node* node_; 50 | }; 51 | 52 | private: 53 | enum { kMaxHeight = 17 }; 54 | 55 | Arena arena_; 56 | Node* head_; 57 | size_t max_height_; 58 | size_t count_; 59 | Comparator compare_; 60 | 61 | int seed_; 62 | 63 | int random_height(); 64 | bool equal(const Key& a, const Key& b) const; 65 | 66 | Node* new_node(const Key& key, size_t height); 67 | Node* find_greater_or_equal(const Key& key, Node** prev) const; 68 | Node* find_less_than(const Key& key) const; 69 | }; 70 | 71 | template 72 | struct SkipList::Node { 73 | explicit Node(const Key& k) : key(k) { } 74 | 75 | Key key; 76 | 77 | Node* next(size_t n) { return next_[n]; } 78 | 
void set_next(size_t n, Node* node) { next_[n] = node; } 79 | void set_key(const Key& k) { key = k; } 80 | 81 | private: 82 | Node* next_[1]; 83 | }; 84 | 85 | template 86 | typename SkipList::Node* 87 | SkipList::new_node(const Key& key, size_t height) 88 | { 89 | size_t size = sizeof(Node) + sizeof(Node*) * (height - 1); 90 | char* alloc_ptr = arena_.alloc_aligned(size); 91 | 92 | return new (alloc_ptr) Node(key); 93 | } 94 | 95 | template 96 | inline SkipList::Iterator::Iterator(const SkipList* list) 97 | { 98 | list_ = list; 99 | node_ = NULL; 100 | } 101 | 102 | template 103 | inline bool SkipList::Iterator::valid() const 104 | { 105 | return node_ != NULL; 106 | } 107 | 108 | template 109 | inline const Key& SkipList::Iterator::key() const 110 | { 111 | assert(valid()); 112 | return node_->key; 113 | } 114 | 115 | template 116 | inline void SkipList::Iterator::next() 117 | { 118 | assert(valid()); 119 | node_ = node_->next(0); 120 | } 121 | 122 | template 123 | inline void SkipList::Iterator::prev() 124 | { 125 | assert(valid()); 126 | 127 | node_ = list_->find_less_than(node_->key); 128 | if (node_ == list_->head_) 129 | node_ = NULL; 130 | } 131 | 132 | template 133 | inline void SkipList::Iterator::seek(const Key& target) 134 | { 135 | node_ = list_->find_greater_or_equal(target, NULL); 136 | if (node_ == list_->head_) 137 | node_ = NULL; 138 | } 139 | 140 | template 141 | inline void SkipList::Iterator::seek_to_first() 142 | { 143 | node_ = list_->head_->next(0); 144 | } 145 | 146 | template 147 | inline void SkipList::Iterator::seek_to_middle() 148 | { 149 | int middle = list_->count_ / 2; 150 | 151 | seek_to_first(); 152 | 153 | for (int i = 0; i < middle; i++) 154 | node_ = node_->next(0); 155 | } 156 | 157 | template 158 | inline void SkipList::Iterator::seek_to_last() 159 | { 160 | Node* curr = list_->head_; 161 | size_t level = list_->max_height_ - 1; 162 | 163 | while (true) { 164 | Node* next = curr->next(level); 165 | 166 | if (next == NULL) { 
167 | if (level == 0) 168 | break; 169 | else 170 | level--; 171 | } else { 172 | curr = next; 173 | } 174 | } 175 | 176 | node_ = curr; 177 | if (node_ == list_->head_) 178 | node_ = NULL; 179 | } 180 | 181 | template 182 | bool SkipList::equal(const Key& a, const Key& b) const 183 | { 184 | return compare_(a, b) == 0; 185 | } 186 | 187 | template 188 | int SkipList::random_height() 189 | { 190 | static const size_t kBranching = 4; 191 | int height = 1; 192 | 193 | while (height < kMaxHeight && (rand() % kBranching) == 0) 194 | height++; 195 | 196 | return height; 197 | } 198 | 199 | template 200 | typename SkipList::Node* 201 | SkipList::find_greater_or_equal(const Key& key, Node** prev) const 202 | { 203 | Node* curr = head_; 204 | size_t level = max_height_ - 1; 205 | 206 | while (true) { 207 | Node* next = curr->next(level); 208 | 209 | if (next != NULL && compare_(next->key, key) < 0) { 210 | curr = next; 211 | } else { 212 | if (prev != NULL) 213 | prev[level] = curr; 214 | 215 | if (level == 0) 216 | return next; 217 | else 218 | level--; 219 | } 220 | } 221 | } 222 | 223 | template 224 | typename SkipList::Node* 225 | SkipList::find_less_than(const Key& key) const 226 | { 227 | Node* curr = head_; 228 | size_t level = max_height_ - 1; 229 | 230 | while (true) { 231 | Node* next = curr->next(level); 232 | 233 | if (next == NULL || compare_(next->key, key) >= 0) { 234 | if (level == 0) 235 | return curr; 236 | else 237 | level--; 238 | } else { 239 | curr = next; 240 | } 241 | } 242 | } 243 | 244 | template 245 | SkipList::SkipList(Comparator cmp) 246 | : arena_(), head_(new_node(Key(), kMaxHeight)), 247 | max_height_(1), count_(0), 248 | compare_(cmp), seed_(time(NULL)) 249 | { 250 | srand(seed_); 251 | 252 | for (int i = 0; i < kMaxHeight; i++) 253 | head_->set_next(i, NULL); 254 | } 255 | 256 | template 257 | void SkipList::insert(const Key& key) 258 | { 259 | Node* prev[kMaxHeight]; 260 | Node* next = find_greater_or_equal(key, prev); 261 | 262 | size_t 
height = random_height(); 263 | 264 | if (height > max_height_) { 265 | for (size_t i = max_height_; i < height; i++) 266 | prev[i] = head_; 267 | 268 | max_height_ = height; 269 | } 270 | 271 | if (next && equal(next->key, key)) { 272 | next->set_key(key); 273 | } else { 274 | Node* curr = new_node(key, height); 275 | 276 | for (size_t i = 0; i < height; i++) { 277 | curr->set_next(i, prev[i]->next(i)); 278 | prev[i]->set_next(i, curr); 279 | } 280 | 281 | count_++; 282 | } 283 | } 284 | 285 | template 286 | bool SkipList::contains(const Key& key) const 287 | { 288 | Node* x = find_greater_or_equal(key, NULL); 289 | 290 | if (x != NULL && equal(x->key, key)) 291 | return true; 292 | else 293 | return false; 294 | } 295 | 296 | template 297 | void SkipList::erase(const Key& key) 298 | { 299 | Node* prev[kMaxHeight]; 300 | Node* curr = find_greater_or_equal(key, prev); 301 | 302 | assert(curr != NULL); 303 | assert(equal(curr->key, key)); 304 | 305 | for (size_t i = 0; i < max_height_; i++) { 306 | if (prev[i]->next(i) == curr) 307 | prev[i]->set_next(i, curr->next(i)); 308 | } 309 | 310 | count_--; 311 | } 312 | 313 | template 314 | void SkipList::resize(size_t size) 315 | { 316 | assert(size <= count_); 317 | 318 | std::vector keys; 319 | keys.reserve(size); 320 | 321 | Iterator iter(this); 322 | iter.seek_to_first(); 323 | 324 | for (size_t i = 0; i < size; i++) { 325 | assert(iter.valid()); 326 | keys.push_back(iter.key()); 327 | iter.next(); 328 | } 329 | 330 | clear(); 331 | 332 | for (size_t i = 0; i < keys.size(); i++) 333 | insert(keys[i]); 334 | 335 | count_ = size; 336 | } 337 | 338 | template 339 | void SkipList::clear() 340 | { 341 | arena_.clear(); 342 | head_ = new_node(Key(), kMaxHeight); 343 | 344 | for (int i = 0; i < kMaxHeight; i++) 345 | head_->set_next(i, NULL); 346 | 347 | count_ = 0; 348 | max_height_ = 1; 349 | } 350 | 351 | } // namespace yodb 352 | 353 | #endif // _YODB_SKIPLIST_H_ 354 | 
-------------------------------------------------------------------------------- /cache/cache.cc: -------------------------------------------------------------------------------- 1 | #include "cache/cache.h" 2 | #include "tree/buffer_tree.h" 3 | 4 | #include 5 | #include 6 | 7 | using namespace yodb; 8 | 9 | Cache::Cache(const Options& opts) 10 | : options_(opts), cache_size_(0), 11 | alive_(false), worker_(NULL) 12 | { 13 | } 14 | 15 | Cache::~Cache() 16 | { 17 | alive_ = false; 18 | if (worker_) { 19 | worker_->join(); 20 | delete worker_; 21 | LOG_INFO << "Cache write back work thread finished."; 22 | } 23 | 24 | LOG_INFO << "Cache destructor finished"; 25 | } 26 | 27 | bool Cache::init() 28 | { 29 | alive_ = true; 30 | worker_ = new Thread(boost::bind(&Cache::write_back, this)); 31 | 32 | if (worker_ == NULL) { 33 | LOG_ERROR << "create thread error"; 34 | return false; 35 | } 36 | 37 | worker_->run(); 38 | return true; 39 | } 40 | 41 | void Cache::integrate(BufferTree* tree, Table* table) 42 | { 43 | assert(tree && table); 44 | 45 | tree_ = tree; 46 | table_ = table; 47 | 48 | last_checkpoint_timestamp = Timestamp::now(); 49 | } 50 | 51 | void Cache::put(nid_t nid, Node* node) 52 | { 53 | assert(node->refs() == 0); 54 | 55 | maybe_eviction(); 56 | 57 | lock_nodes_.write_lock(); 58 | 59 | assert(nodes_.find(nid) == nodes_.end()); 60 | nodes_[nid] = node; 61 | node->inc_ref(); 62 | 63 | lock_nodes_.write_unlock(); 64 | } 65 | 66 | Node* Cache::get(nid_t nid) 67 | { 68 | lock_nodes_.read_lock(); 69 | 70 | NodeMap::iterator iter = nodes_.find(nid); 71 | 72 | if (iter != nodes_.end()) { 73 | Node* node = iter->second; 74 | node->inc_ref(); 75 | lock_nodes_.read_unlock(); 76 | return node; 77 | } 78 | 79 | lock_nodes_.read_unlock(); 80 | 81 | maybe_eviction(); 82 | 83 | Block* block = table_->read(nid); 84 | if (block == NULL) return NULL; 85 | 86 | BlockReader reader(*block); 87 | Node* node = tree_->create_node(nid); 88 | 89 | assert(node->nid() == nid); 90 | 91 | 
if (!(node->constrcutor(reader))){ 92 | assert(false); 93 | } 94 | 95 | table_->self_dealloc(block->buffer()); 96 | 97 | lock_nodes_.write_lock(); 98 | 99 | if (nodes_.find(nid) != nodes_.end()) { 100 | delete node; 101 | node = nodes_[nid]; 102 | } else { 103 | nodes_[nid] = node; 104 | } 105 | node->inc_ref(); 106 | 107 | lock_nodes_.write_unlock(); 108 | 109 | return node; 110 | } 111 | 112 | void Cache::flush() 113 | { 114 | lock_nodes_.write_lock(); 115 | 116 | std::vector ready_nodes; 117 | 118 | for (NodeMap::iterator it = nodes_.begin(); it != nodes_.end(); it++) { 119 | Node* node = it->second; 120 | assert(node->refs() == 0); 121 | 122 | if (node->dirty() && !node->flushing()) { 123 | node->write_lock(); 124 | node->set_flushing(true); 125 | ready_nodes.push_back(node); 126 | } 127 | } 128 | 129 | lock_nodes_.write_unlock(); 130 | 131 | if (ready_nodes.size()) 132 | flush_ready_nodes(ready_nodes); 133 | 134 | table_->flush(); 135 | } 136 | 137 | void Cache::write_back() 138 | { 139 | while (alive_) { 140 | std::vector expired_nodes; 141 | Timestamp now = Timestamp::now(); 142 | 143 | size_t total_size = 0; 144 | size_t dirty_size = 0; 145 | size_t expired_size = 0; 146 | size_t goal = options_.cache_limited_memory / 100; 147 | 148 | lock_nodes_.read_lock(); 149 | 150 | for (NodeMap::iterator iter = nodes_.begin(); iter != nodes_.end(); iter++) { 151 | Node* node = iter->second; 152 | size_t size = node->size(); 153 | 154 | total_size += size; 155 | 156 | if (node->dirty()) { 157 | dirty_size += size; 158 | 159 | bool expire = 2.0 * options_.cache_dirty_node_expire < 160 | time_interval(now, node->get_first_write_timestamp()); 161 | 162 | if (expire && !node->flushing()) { 163 | expired_nodes.push_back(node); 164 | expired_size += size; 165 | } 166 | } 167 | } 168 | 169 | // LOG_INFO << Fmt("Memory total size: %zuK, ", total_size / 1024) 170 | // << Fmt("expire size: %zuK, ", expired_size / 1024) 171 | // << Fmt("cache size: %zuK", 
options_.cache_limited_memory / 1024); 172 | 173 | lock_nodes_.read_unlock(); 174 | { 175 | ScopedMutex lock(cache_size_mutex_); 176 | cache_size_ = total_size; 177 | } 178 | 179 | FirstWriteComparator comparator; 180 | std::sort(expired_nodes.begin(), expired_nodes.end(), comparator); 181 | 182 | std::vector flush_nodes; 183 | size_t flush_size = 0; 184 | 185 | for (size_t i = 0; i < expired_nodes.size(); i++) { 186 | Node* node = expired_nodes[i]; 187 | 188 | if (node->try_write_lock()) { 189 | node->set_flushing(true); 190 | flush_size += node->size(); 191 | flush_nodes.push_back(node); 192 | } 193 | 194 | if (flush_size > goal) break; 195 | } 196 | 197 | // LOG_INFO << Fmt("Memory dirty size: %zuK", dirty_size / 1024); 198 | 199 | size_t overage = options_.cache_limited_memory / 100 * 30; 200 | bool maybe = (dirty_size - flush_size) > overage; 201 | 202 | goal += goal * (dirty_size - flush_size) / overage; 203 | 204 | if (maybe && flush_size < goal) { 205 | lock_nodes_.read_lock(); 206 | 207 | std::vector maybe_nodes; 208 | for (NodeMap::iterator iter = nodes_.begin(); 209 | iter != nodes_.end(); iter++) { 210 | Node* node = iter->second; 211 | 212 | if (node->dirty() && !node->flushing()) 213 | maybe_nodes.push_back(node); 214 | } 215 | 216 | lock_nodes_.read_unlock(); 217 | 218 | std::sort(maybe_nodes.begin(), maybe_nodes.end(), comparator); 219 | 220 | for (size_t i = 0; i < maybe_nodes.size(); i++) { 221 | Node* node = maybe_nodes[i]; 222 | size_t size = node->size(); 223 | 224 | if (node->try_write_lock()) { 225 | node->set_flushing(true); 226 | flush_size += size; 227 | flush_nodes.push_back(node); 228 | } 229 | 230 | if (flush_size > goal) break; 231 | } 232 | } 233 | 234 | // LOG_INFO << Fmt("Memory flush size: %zuK", flush_size / 1024); 235 | 236 | if (flush_nodes.size()) 237 | flush_ready_nodes(flush_nodes); 238 | 239 | ::usleep(1000 * 100); // 100ms 240 | } 241 | } 242 | 243 | void Cache::flush_ready_nodes(std::vector& ready_nodes) 244 | { 245 | for 
(size_t i = 0; i < ready_nodes.size(); i++) { 246 | Node* node = ready_nodes[i]; 247 | size_t bytes = node->write_back_size(); 248 | 249 | Slice alloc_ptr = table_->self_alloc(bytes); 250 | assert(alloc_ptr.size()); 251 | 252 | Block block(alloc_ptr, 0, bytes); 253 | BlockWriter writer(block); 254 | 255 | node->destructor(writer); 256 | assert(writer.ok()); 257 | 258 | node->set_dirty(false); 259 | node->write_unlock(); 260 | 261 | table_->async_write(node->nid(), block, 262 | boost::bind(&Cache::write_complete_handler, this, node, alloc_ptr, _1)); 263 | } 264 | 265 | Timestamp now = Timestamp::now(); 266 | double time = 30.0; 267 | if (time_interval(now, last_checkpoint_timestamp) > time) { 268 | table_->flush_immediately(); 269 | table_->truncate(); 270 | last_checkpoint_timestamp = now; 271 | } 272 | } 273 | 274 | void Cache::write_complete_handler(Node* node, Slice alloc_ptr, Status status) 275 | { 276 | assert(node != NULL); 277 | assert(alloc_ptr.size()); 278 | 279 | node->set_flushing(false); 280 | table_->self_dealloc(alloc_ptr); 281 | 282 | if (!status.succ) { 283 | LOG_ERROR << "write back node failed, nid=" << node->nid(); 284 | } 285 | } 286 | 287 | void Cache::maybe_eviction() 288 | { 289 | { 290 | ScopedMutex lock(cache_size_mutex_); 291 | if (cache_size_ < options_.cache_limited_memory) 292 | return; 293 | } 294 | evict_from_memory(); 295 | } 296 | 297 | void Cache::evict_from_memory() 298 | { 299 | // Apply write lock, don't allow any get/put operation, 300 | // it is guaranteed no increase reference during this period. 
301 | lock_nodes_.write_lock(); 302 | 303 | size_t total_size = 0; 304 | std::vector candidates; 305 | 306 | for (NodeMap::iterator it = nodes_.begin(); it != nodes_.end(); it++) { 307 | Node* node = it->second; 308 | assert(node->nid() == it->first); 309 | 310 | size_t size = node->size(); 311 | total_size += size; 312 | 313 | if (node->refs() == 0 && !node->dirty() && !node->flushing()) 314 | candidates.push_back(node); 315 | } 316 | 317 | { 318 | ScopedMutex lock(cache_size_mutex_); 319 | cache_size_ = total_size; 320 | } 321 | 322 | size_t evict_size = 0; 323 | size_t goal = options_.cache_limited_memory / 100; 324 | std::vector evict_nodes; 325 | LRUComparator comparator; 326 | 327 | 328 | std::sort(candidates.begin(), candidates.end(), comparator); 329 | 330 | for (size_t i = 0; i < candidates.size(); i++) { 331 | Node* node = candidates[i]; 332 | size_t size = node->size(); 333 | 334 | assert(node->refs() == 0); 335 | assert(!node->dirty()); 336 | assert(!node->flushing()); 337 | 338 | evict_size += size; 339 | nodes_.erase(node->nid()); 340 | 341 | delete node; 342 | 343 | if (evict_size >= goal) break; 344 | } 345 | 346 | { 347 | ScopedMutex lock(cache_size_mutex_); 348 | cache_size_ -= evict_size; 349 | } 350 | 351 | lock_nodes_.write_unlock(); 352 | 353 | // LOG_INFO << Fmt("evict %zuK bytes from memory", evict_size / 1024); 354 | } 355 | -------------------------------------------------------------------------------- /tree/node.cc: -------------------------------------------------------------------------------- 1 | #include "tree/node.h" 2 | #include "tree/buffer_tree.h" 3 | 4 | using namespace yodb; 5 | 6 | Node::Node(BufferTree* tree, nid_t self) 7 | : tree_(tree), 8 | self_nid_(self), 9 | refcnt_(0), 10 | dirty_(false), 11 | flushing_(false) 12 | { 13 | } 14 | 15 | Node::~Node() 16 | { 17 | for (size_t i = 0; i < pivots_.size(); i++) { 18 | Pivot& pivot = pivots_[i]; 19 | 20 | if (pivot.left_most_key.size()) { 21 | pivot.left_most_key.release(); 22 | 
} 23 | 24 | delete pivot.table; 25 | } 26 | pivots_.clear(); 27 | } 28 | 29 | bool Node::get(const Slice& key, Slice& value, Node* parent) 30 | { 31 | read_lock(); 32 | 33 | if (parent) { 34 | parent->read_unlock(); 35 | } 36 | 37 | size_t index = find_pivot(key); 38 | MsgTable* table = pivots_[index].table; 39 | 40 | table->lock(); 41 | 42 | Msg lookup; 43 | if (table->find(key, lookup) && lookup.key() == key) { 44 | if (lookup.type() == Put) { 45 | value = lookup.value().clone(); 46 | table->unlock(); 47 | read_unlock(); 48 | return true; 49 | } else { 50 | table->unlock(); 51 | read_unlock(); 52 | return false; 53 | } 54 | } 55 | table->unlock(); 56 | 57 | if (pivots_[index].child_nid == NID_NIL) { 58 | read_unlock(); 59 | return false; 60 | } 61 | 62 | Node* node = tree_->get_node_by_nid(pivots_[index].child_nid); 63 | assert(node); 64 | 65 | bool exists = node->get(key, value, this); 66 | node->dec_ref(); 67 | 68 | return exists; 69 | } 70 | 71 | bool Node::put(const Slice& key, const Slice& value) 72 | { 73 | return write(Msg(Put, key.clone(), value.clone())); 74 | } 75 | 76 | bool Node::del(const Slice& key) 77 | { 78 | return write(Msg(Del, key.clone())); 79 | } 80 | 81 | bool Node::write(const Msg& msg) 82 | { 83 | assert(pivots_.size()); 84 | 85 | optional_lock(); 86 | 87 | if (tree_->root_->nid() != self_nid_) { 88 | optional_unlock(); 89 | return tree_->root_->write(msg); 90 | } 91 | 92 | insert_msg(find_pivot(msg.key()), msg); 93 | set_dirty(true); 94 | 95 | maybe_push_down_or_split(); 96 | return true; 97 | } 98 | 99 | void Node::maybe_push_down_or_split() 100 | { 101 | int index = -1; 102 | 103 | for (size_t i = 0; i < pivots_.size(); i++) { 104 | if (pivots_[i].table->count() > 105 | tree_->options_.max_node_msg_count) { 106 | index = i; break; 107 | } 108 | } 109 | 110 | if (index < 0) { 111 | optional_unlock(); 112 | return; 113 | } 114 | 115 | if (pivots_[index].child_nid != NID_NIL) { 116 | MsgTable* table = pivots_[index].table; 117 | Node* 
node = tree_->get_node_by_nid(pivots_[index].child_nid); 118 | node->push_down(table, this); 119 | node->dec_ref(); 120 | } else { 121 | split_table(pivots_[index].table); 122 | } 123 | 124 | optional_lock(); 125 | maybe_push_down_or_split(); 126 | } 127 | 128 | void Node::create_first_pivot() 129 | { 130 | write_lock(); 131 | 132 | assert(pivots_.size() == 0); 133 | add_pivot(NID_NIL, NULL, Slice()); 134 | 135 | write_unlock(); 136 | } 137 | 138 | size_t Node::find_pivot(Slice key) 139 | { 140 | if (pivots_.size() == 0) 141 | return 0; 142 | 143 | size_t pivot = 0; 144 | Comparator* cmp = tree_->options_.comparator; 145 | 146 | for (size_t i = 1; i < pivots_.size(); i++) { 147 | if (cmp->compare(key, pivots_[i].left_most_key) < 0) { 148 | return pivot; 149 | } 150 | pivot++; 151 | } 152 | return pivot; 153 | } 154 | 155 | void Node::push_down(MsgTable* table, Node* parent) 156 | { 157 | optional_lock(); 158 | 159 | push_down_locked(table, parent); 160 | parent->read_unlock(); 161 | 162 | maybe_push_down_or_split(); 163 | } 164 | 165 | void Node::split_table(MsgTable* table) 166 | { 167 | assert(is_leaf_); 168 | 169 | if (table->count() <= tree_->options_.max_node_msg_count) { 170 | write_unlock(); 171 | return; 172 | } 173 | 174 | MsgTable* table0 = table; 175 | MsgTable* table1 = new MsgTable(tree_->options_.comparator); 176 | 177 | table0->lock(); 178 | 179 | MsgTable::Iterator iter(table0->skiplist()); 180 | 181 | iter.seek_to_first(); 182 | assert(iter.valid()); 183 | Msg msg = iter.key(); 184 | 185 | iter.seek_to_middle(); 186 | assert(iter.valid()); 187 | Msg middle = iter.key(); 188 | 189 | table1->lock(); 190 | while (iter.valid()) { 191 | table1->insert(iter.key()); 192 | iter.next(); 193 | } 194 | 195 | size_t sz = table0->size(); 196 | table0->resize(table0->count() / 2); 197 | 198 | add_pivot(NID_NIL, table1, middle.key().clone()); 199 | 200 | assert(table0->size() + table1->size() >= sz); 201 | 202 | table1->unlock(); 203 | table0->unlock(); 204 | 205 
| set_dirty(true); 206 | write_unlock(); 207 | 208 | std::vector locked_path; 209 | tree_->lock_path(msg.key(), locked_path); 210 | 211 | if (!locked_path.empty()) { 212 | Node* node = locked_path.back(); 213 | node->try_split_node(locked_path); 214 | } 215 | } 216 | 217 | void Node::try_split_node(std::vector& path) 218 | { 219 | assert(path.back() == this); 220 | 221 | if (pivots_.size() <= tree_->options_.max_node_child_number) { 222 | while (!path.empty()) { 223 | Node* node = path.back(); 224 | node->write_unlock(); 225 | node->dec_ref(); 226 | path.pop_back(); 227 | } 228 | return; 229 | } 230 | 231 | size_t middle = pivots_.size() / 2; 232 | Slice middle_key = pivots_[middle].left_most_key; 233 | 234 | // LOG_INFO << "try_split_node, " << middle_key.data(); 235 | 236 | Node* node = tree_->create_node(); 237 | node->is_leaf_ = is_leaf_; 238 | 239 | Container::iterator first = pivots_.begin() + middle; 240 | Container::iterator last = pivots_.end(); 241 | 242 | node->pivots_.insert(node->pivots_.begin(), first, last); 243 | node->set_dirty(true); 244 | node->dec_ref(); 245 | 246 | pivots_.resize(middle); 247 | set_dirty(true); 248 | 249 | path.pop_back(); 250 | 251 | if (path.empty()) { 252 | Node* root = tree_->create_node(); 253 | root->is_leaf_ = false; 254 | 255 | root->add_pivot(nid(), NULL, Slice()); 256 | root->add_pivot(node->nid(), NULL, middle_key.clone()); 257 | 258 | tree_->grow_up(root); 259 | } else { 260 | Node* parent = path.back(); 261 | 262 | parent->add_pivot(node->nid(), NULL, middle_key.clone()); 263 | parent->try_split_node(path); 264 | } 265 | 266 | write_unlock(); 267 | dec_ref(); 268 | } 269 | 270 | void Node::add_pivot(nid_t child, MsgTable* table, Slice key) 271 | { 272 | ScopedMutex lock(pivots_mutex_); 273 | 274 | if (key.size() == 0) { 275 | assert(table == NULL); 276 | assert(pivots_.size() == 0); 277 | 278 | table = new MsgTable(tree_->options_.comparator); 279 | pivots_.push_back(Pivot(child, table, key)); 280 | } else { 281 | 
assert(pivots_.size()); 282 | 283 | if (table == NULL) { 284 | table = new MsgTable(tree_->options_.comparator); 285 | } 286 | 287 | size_t idx = find_pivot(key); 288 | pivots_.insert(pivots_.begin() + idx + 1, Pivot(child, table, key)); 289 | } 290 | 291 | set_dirty(true); 292 | } 293 | 294 | void Node::lock_path(const Slice& key, std::vector& path) 295 | { 296 | path.push_back(this); 297 | 298 | size_t index = find_pivot(key); 299 | 300 | if (pivots_[index].child_nid != NID_NIL) { 301 | Node* node = tree_->get_node_by_nid(pivots_[index].child_nid); 302 | 303 | node->write_lock(); 304 | node->push_down_locked(pivots_[index].table, this); 305 | node->lock_path(key, path); 306 | } 307 | } 308 | 309 | void Node::push_down_locked(MsgTable* table, Node* parent) 310 | { 311 | table->lock(); 312 | 313 | if (table->count() == 0) { 314 | table->unlock(); 315 | return; 316 | } 317 | 318 | size_t idx = 1; 319 | size_t i = 0, j = 0; 320 | MsgTable::Iterator slow(table->skiplist()); 321 | MsgTable::Iterator fast(table->skiplist()); 322 | 323 | slow.seek_to_first(); 324 | fast.seek_to_first(); 325 | 326 | Comparator* cmp = tree_->options_.comparator; 327 | 328 | while (fast.valid() && idx < pivots_.size()) { 329 | if (cmp->compare(fast.key().key(), pivots_[idx].left_most_key) < 0) { 330 | j++; 331 | fast.next(); 332 | } else { 333 | while (i != j) { 334 | insert_msg(idx - 1, slow.key()); 335 | i++; 336 | slow.next(); 337 | } 338 | idx++; 339 | } 340 | } 341 | 342 | while (slow.valid()) { 343 | insert_msg(idx - 1, slow.key()); 344 | slow.next(); 345 | } 346 | 347 | set_dirty(true); 348 | parent->set_dirty(true); 349 | table->clear(); 350 | table->unlock(); 351 | } 352 | 353 | void Node::insert_msg(size_t index, const Msg& msg) 354 | { 355 | MsgTable* table = pivots_[index].table; 356 | 357 | table->lock(); 358 | table->insert(msg); 359 | table->unlock(); 360 | } 361 | 362 | size_t Node::size() 363 | { 364 | ScopedMutex lock(pivots_mutex_); 365 | 366 | size_t usage = 
sizeof(Node); 367 | 368 | for (size_t i = 0; i < pivots_.size(); i++) 369 | usage += pivots_[i].table->memory_usage() + pivots_[i].table->size(); 370 | 371 | return usage + pivots_.size() * sizeof(Pivot); 372 | } 373 | 374 | size_t Node::write_back_size() 375 | { 376 | size_t size = 0; 377 | 378 | size += 8; // self_nid_ 379 | size += 1; // is_leaf 380 | size += 4; // number of pivots 381 | 382 | for (size_t i = 0; i < pivots_.size(); i++) { 383 | size += 8; // child 384 | size += 4 + pivots_[i].left_most_key.size(); // left_most_key 385 | size += pivots_[i].table->size(); // table size 386 | } 387 | 388 | return size; 389 | } 390 | 391 | bool Node::constrcutor(BlockReader& reader) 392 | { 393 | reader >> self_nid_ >> is_leaf_; 394 | 395 | uint32_t pivots = 0; 396 | reader >> pivots; 397 | 398 | assert(reader.ok()); 399 | assert(pivots > 0); 400 | 401 | for (size_t i = 0; i < pivots; i++) { 402 | nid_t child; 403 | MsgTable* table = new MsgTable(tree_->options_.comparator); 404 | Slice left_most_key; 405 | 406 | reader >> child >> left_most_key; 407 | table->constrcutor(reader); 408 | 409 | pivots_.push_back(Pivot(child, table, left_most_key)); 410 | } 411 | set_dirty(true); 412 | 413 | return reader.ok(); 414 | } 415 | 416 | bool Node::destructor(BlockWriter& writer) 417 | { 418 | writer << self_nid_ << is_leaf_; 419 | 420 | uint32_t pivots = pivots_.size(); 421 | assert(pivots > 0); 422 | 423 | writer << pivots; 424 | 425 | for (size_t i = 0; i < pivots; i++) { 426 | writer << pivots_[i].child_nid 427 | << pivots_[i].left_most_key; 428 | pivots_[i].table->destructor(writer); 429 | } 430 | 431 | return writer.ok(); 432 | } 433 | 434 | nid_t Node::nid() 435 | { 436 | ScopedMutex lock(mutex_); 437 | return self_nid_; 438 | } 439 | 440 | void Node::set_nid(nid_t nid) 441 | { 442 | ScopedMutex lock(mutex_); 443 | self_nid_ = nid; 444 | } 445 | 446 | void Node::set_leaf(bool leaf) 447 | { 448 | ScopedMutex lock(mutex_); 449 | is_leaf_ = leaf; 450 | } 451 | 452 | void 
Node::set_dirty(bool dirty) 453 | { 454 | ScopedMutex lock(mutex_); 455 | 456 | if (!dirty_ && dirty) 457 | first_write_timestamp_ = Timestamp::now(); 458 | 459 | dirty_ = dirty; 460 | } 461 | 462 | bool Node::dirty() 463 | { 464 | ScopedMutex lock(mutex_); 465 | return dirty_; 466 | } 467 | 468 | void Node::set_flushing(bool flushing) 469 | { 470 | ScopedMutex lock(mutex_); 471 | flushing_ = flushing; 472 | } 473 | 474 | bool Node::flushing() 475 | { 476 | ScopedMutex lock(mutex_); 477 | return flushing_; 478 | } 479 | 480 | void Node::inc_ref() 481 | { 482 | ScopedMutex lock(mutex_); 483 | refcnt_++; 484 | } 485 | 486 | void Node::dec_ref() 487 | { 488 | ScopedMutex lock(mutex_); 489 | assert(refcnt_ > 0); 490 | 491 | refcnt_--; 492 | last_used_timestamp_ = Timestamp::now(); 493 | } 494 | 495 | size_t Node::refs() 496 | { 497 | ScopedMutex lock(mutex_); 498 | return refcnt_; 499 | } 500 | 501 | Timestamp Node::get_first_write_timestamp() 502 | { 503 | ScopedMutex lock(mutex_); 504 | return first_write_timestamp_; 505 | } 506 | 507 | Timestamp Node::get_last_used_timestamp() 508 | { 509 | ScopedMutex lock(mutex_); 510 | return last_used_timestamp_; 511 | } 512 | -------------------------------------------------------------------------------- /fs/table.cc: -------------------------------------------------------------------------------- 1 | #include "fs/table.h" 2 | #include 3 | #include 4 | 5 | using namespace yodb; 6 | 7 | Table::Table(AIOFile* file, uint64_t file_size) 8 | : file_(file), file_size_(file_size), offset_(0), 9 | fly_readers_(0), fly_writers_(0) 10 | { 11 | } 12 | 13 | Table::~Table() 14 | { 15 | if (!flush()) { 16 | LOG_ERROR << "flush error"; 17 | assert(false); 18 | } 19 | 20 | BlockEntry::iterator iter; 21 | ScopedMutex lock(block_entry_mutex_); 22 | 23 | for (iter = block_entry_.begin(); iter != block_entry_.end(); iter++) { 24 | BlockHandle* handle = iter->second; 25 | delete handle; 26 | } 27 | 28 | block_entry_.clear(); 29 | 30 | LOG_INFO << 
"Table destructor finished"; 31 | } 32 | 33 | bool Table::init(bool create) 34 | { 35 | if (create) { 36 | if (!flush_bootstrap()) return false; 37 | 38 | offset_ = BOOTSTRAP_SIZE; 39 | file_size_ = BOOTSTRAP_SIZE; 40 | } else { 41 | if (file_size_ < BOOTSTRAP_SIZE) return false; 42 | if (!load_bootstrap()) return false; 43 | 44 | if (bootstrap_.header.offset) { 45 | if (!load_header()) return false; 46 | } 47 | 48 | init_holes(); 49 | LOG_INFO << block_entry_.size() << " blocks found"; 50 | } 51 | truncate(); 52 | return true; 53 | } 54 | 55 | bool Table::flush() 56 | { 57 | mutex_.lock(); 58 | while (fly_writers_) { 59 | mutex_.unlock(); 60 | usleep(1000); 61 | mutex_.lock(); 62 | } 63 | mutex_.unlock(); 64 | 65 | if (!flush_immediately()) 66 | return false; 67 | 68 | truncate(); 69 | return true; 70 | } 71 | 72 | bool Table::flush_immediately() 73 | { 74 | size_t fly_holes; 75 | { 76 | ScopedMutex lock(fly_hole_list_mutex_); 77 | fly_holes = fly_hole_list_.size(); 78 | } 79 | 80 | if (!flush_header()) return false; 81 | if (!flush_bootstrap()) return false; 82 | 83 | flush_fly_holes(fly_holes); 84 | 85 | return true; 86 | } 87 | 88 | bool Table::flush_bootstrap() 89 | { 90 | Slice alloc_ptr = self_alloc(BOOTSTRAP_SIZE); 91 | assert(alloc_ptr.size()); 92 | 93 | Block block(alloc_ptr, 0, BOOTSTRAP_SIZE); 94 | BlockWriter writer(block); 95 | 96 | bool maybe = bootstrap_.header.offset == 0 ? 
false : true; 97 | writer << maybe; 98 | if (writer.ok() && maybe) { 99 | writer << bootstrap_.header.offset 100 | << bootstrap_.header.size 101 | << bootstrap_.root_nid; 102 | } 103 | 104 | assert(writer.ok()); 105 | 106 | if (!write_file(0, alloc_ptr)) { 107 | LOG_INFO << "flush_bootstrap error"; 108 | return false; 109 | } 110 | 111 | LOG_INFO << "flush_bootstrap success, " 112 | << Fmt("offset=%zu, ", bootstrap_.header.offset) 113 | << Fmt("size=%zu, ", bootstrap_.header.size) 114 | << Fmt("root nid=%zu", bootstrap_.root_nid); 115 | 116 | self_dealloc(alloc_ptr); 117 | return true; 118 | } 119 | 120 | bool Table::load_bootstrap() 121 | { 122 | Slice alloc_ptr = self_alloc(BOOTSTRAP_SIZE); 123 | assert(alloc_ptr.size()); 124 | 125 | if (!read_file(0, alloc_ptr)) { 126 | LOG_ERROR << "load bootstrap failed"; 127 | self_dealloc(alloc_ptr); 128 | return false; 129 | } 130 | 131 | Block block(alloc_ptr, 0, BOOTSTRAP_SIZE); 132 | BlockReader reader(block); 133 | bool maybe = false; 134 | 135 | reader >> maybe; 136 | if (reader.ok() && maybe) { 137 | reader >> bootstrap_.header.offset 138 | >> bootstrap_.header.size 139 | >> bootstrap_.root_nid; 140 | } 141 | 142 | if (!reader.ok()) 143 | LOG_ERROR << "read bootstrap failed"; 144 | else 145 | LOG_INFO << "load_bootstrap success, " 146 | << Fmt("offset=%zu, ", bootstrap_.header.offset) 147 | << Fmt("size=%zu, ", bootstrap_.header.size) 148 | << Fmt("root nid=%zu", bootstrap_.root_nid); 149 | 150 | self_dealloc(alloc_ptr); 151 | return reader.ok(); 152 | } 153 | 154 | bool Table::flush_header() 155 | { 156 | uint32_t header_size = block_header_size(); 157 | Slice alloc_ptr = self_alloc(header_size); 158 | 159 | assert(alloc_ptr.size()); 160 | 161 | Block block(alloc_ptr, 0, header_size); 162 | BlockWriter writer(block); 163 | 164 | { 165 | ScopedMutex lock(block_entry_mutex_); 166 | 167 | uint32_t blocks = block_entry_.size(); 168 | BlockEntry::iterator iter; 169 | 170 | writer << blocks; 171 | for (iter = 
block_entry_.begin(); iter != block_entry_.end(); iter++) { 172 | nid_t nid = iter->first; 173 | BlockHandle* handle = iter->second; 174 | 175 | writer << nid << handle->offset << handle->size; 176 | } 177 | 178 | if (!writer.ok()) { 179 | LOG_ERROR << "flush_header error"; 180 | self_dealloc(alloc_ptr); 181 | return false; 182 | } 183 | } 184 | 185 | uint64_t offset = find_space(alloc_ptr.size()); 186 | Status stat = file_->write(offset, alloc_ptr); 187 | 188 | if (!stat.succ) { 189 | LOG_ERROR << "write file error"; 190 | add_hole(offset, alloc_ptr.size()); 191 | self_dealloc(alloc_ptr); 192 | return false; 193 | } 194 | 195 | LOG_INFO << "flush_header success, " 196 | << Fmt("offset=%zu, ", offset) 197 | << Fmt("size=%zu", header_size); 198 | 199 | if (bootstrap_.header.offset) { 200 | add_fly_hole(bootstrap_.header.offset, 201 | PAGE_ROUND_UP(bootstrap_.header.size)); 202 | } 203 | 204 | bootstrap_.header.offset = offset; 205 | bootstrap_.header.size = header_size; 206 | 207 | self_dealloc(alloc_ptr); 208 | 209 | return true; 210 | } 211 | 212 | bool Table::load_header() 213 | { 214 | assert(bootstrap_.header.offset > 0); 215 | 216 | Block* block = read_block(&bootstrap_.header); 217 | if (block == NULL) { 218 | LOG_ERROR << "read_block failed"; 219 | return false; 220 | } 221 | 222 | BlockReader reader(*block); 223 | uint32_t blocks = 0; 224 | ScopedMutex lock(block_entry_mutex_); 225 | 226 | reader >> blocks; 227 | while (reader.ok() && blocks > 0) { 228 | nid_t nid; 229 | BlockHandle* handle = new BlockHandle(); 230 | assert(handle); 231 | 232 | reader >> nid >> handle->offset >> handle->size; 233 | 234 | if (reader.ok()) 235 | block_entry_[nid] = handle; 236 | else 237 | delete handle; 238 | 239 | blocks--; 240 | } 241 | 242 | if (!reader.ok()) 243 | LOG_ERROR << "load_header error"; 244 | else 245 | LOG_INFO << "load_header success"; 246 | 247 | self_dealloc(block->buffer()); 248 | delete block; 249 | return reader.ok(); 250 | } 251 | void 
Table::flush_fly_holes(size_t fly_holes) 252 | { 253 | ScopedMutex lock(fly_hole_list_mutex_); 254 | 255 | while (fly_holes) { 256 | Hole hole = fly_hole_list_.front(); 257 | add_hole(hole.offset, hole.size); 258 | fly_hole_list_.pop_front(); 259 | fly_holes--; 260 | } 261 | } 262 | 263 | void Table::init_holes() 264 | { 265 | ScopedMutex lock(block_entry_mutex_); 266 | 267 | std::map offset_set; 268 | offset_set[bootstrap_.header.offset] = &bootstrap_.header; 269 | 270 | BlockEntry::iterator iter; 271 | for (iter = block_entry_.begin(); iter != block_entry_.end(); iter++) { 272 | offset_set[iter->second->offset] = iter->second; 273 | } 274 | 275 | std::map::iterator curr, prev; 276 | for (curr = offset_set.begin(); curr != offset_set.end(); curr++) { 277 | uint64_t left; 278 | 279 | if (curr == offset_set.begin()) 280 | left = BOOTSTRAP_SIZE; 281 | else 282 | left = prev->second->offset + PAGE_ROUND_UP(prev->second->size); 283 | 284 | if (left < curr->second->offset) 285 | add_hole(left, curr->second->offset - left); 286 | 287 | prev = curr; 288 | } 289 | 290 | if (offset_set.size() > 1) 291 | offset_ = offset_set.rbegin()->second->offset + 292 | PAGE_ROUND_UP(offset_set.rbegin()->second->size); 293 | else 294 | offset_ = BOOTSTRAP_SIZE; 295 | } 296 | 297 | void Table::add_hole(uint64_t offset, uint32_t size) 298 | { 299 | { 300 | ScopedMutex lock(mutex_); 301 | if (offset + size == offset_) { 302 | offset_ = offset; 303 | return ; 304 | } 305 | } 306 | 307 | Hole hole; 308 | hole.size = size; 309 | hole.offset = offset; 310 | 311 | ScopedMutex lock(hole_list_mutex_); 312 | 313 | if (hole_list_.empty()) { 314 | hole_list_.push_back(hole); 315 | return; 316 | } 317 | 318 | HoleList::iterator iter, prev; 319 | for (iter = hole_list_.begin(); iter != hole_list_.end(); iter++) { 320 | if (iter->offset > hole.offset) 321 | break; 322 | prev = iter; 323 | } 324 | 325 | if (iter != hole_list_.end()) { 326 | assert(hole.offset + hole.size <= iter->offset); 327 | if (iter 
!= hole_list_.begin()) { 328 | assert(prev->offset + prev->size <= hole.offset); 329 | 330 | if (prev->offset + prev->size == hole.offset) { 331 | hole.offset = prev->offset; 332 | hole.size += prev->size; 333 | hole_list_.erase(prev); 334 | } 335 | } 336 | 337 | if (hole.offset + hole.size == iter->offset) { 338 | iter->offset = hole.offset; 339 | iter->size += hole.size; 340 | } else { 341 | hole_list_.insert(iter, hole); 342 | } 343 | } else { 344 | if (prev->offset + prev->size == hole.offset) 345 | prev->size += hole.size; 346 | else 347 | hole_list_.push_back(hole); 348 | } 349 | } 350 | 351 | bool Table::get_hole(uint32_t size, uint64_t& offset) 352 | { 353 | ScopedMutex lock(hole_list_mutex_); 354 | HoleList::iterator iter; 355 | 356 | for (iter = hole_list_.begin(); iter != hole_list_.end(); iter++) { 357 | if (iter->size > size) { 358 | iter->size -= size; 359 | offset = iter->offset; 360 | iter->offset += size; 361 | return true; 362 | } else if (iter->size == size) { 363 | offset = iter->offset; 364 | hole_list_.erase(iter); 365 | return true; 366 | } 367 | } 368 | return false; 369 | } 370 | 371 | void Table::add_fly_hole(uint64_t offset, uint32_t size) 372 | { 373 | assert(PAGE_ROUNDED(size)); 374 | 375 | Hole hole; 376 | hole.offset = offset; 377 | hole.size = size; 378 | 379 | ScopedMutex lock(fly_hole_list_mutex_); 380 | fly_hole_list_.push_back(hole); 381 | } 382 | 383 | void Table::truncate() 384 | { 385 | ScopedMutex lock(mutex_); 386 | 387 | if (offset_ <= file_size_) { 388 | file_->truncate(offset_); 389 | file_size_ = offset_; 390 | LOG_INFO << Fmt("truncate, file size=%zuK", file_size_/1024); 391 | } 392 | } 393 | 394 | 395 | uint32_t Table::block_header_size() 396 | { 397 | ScopedMutex lock(block_entry_mutex_); 398 | 399 | uint32_t size_blocks = 4; 400 | uint32_t blocks = block_entry_.size(); 401 | uint32_t size_block_handle = sizeof(nid_t) + sizeof(BlockHandle); 402 | 403 | return size_blocks + blocks * size_block_handle; 404 | } 405 | 406 
| Block* Table::read(nid_t nid) 407 | { 408 | BlockHandle* handle; 409 | 410 | { 411 | ScopedMutex lock(block_entry_mutex_); 412 | 413 | BlockEntry::iterator iter = block_entry_.find(nid); 414 | if (iter == block_entry_.end()) return NULL; 415 | 416 | handle = iter->second; 417 | assert(handle); 418 | } 419 | 420 | Block* block = read_block(handle); 421 | 422 | //LOG_INFO << Fmt("read node success, nid=%zu", nid); 423 | return block; 424 | } 425 | 426 | void Table::async_write(nid_t nid, Block& block, Callback cb) 427 | { 428 | assert(block.buffer().size() == PAGE_ROUND_UP(block.size())); 429 | 430 | AsyncWriteContext* context = new AsyncWriteContext(); 431 | 432 | context->nid = nid; 433 | context->callback = cb; 434 | context->handle.size = block.size(); 435 | context->handle.offset = find_space(block.buffer().size()); 436 | { 437 | ScopedMutex lock(mutex_); 438 | fly_writers_++; 439 | } 440 | 441 | file_->async_write(context->handle.offset, block.buffer(), 442 | boost::bind(&Table::async_write_handler, this, context, _1)); 443 | } 444 | 445 | void Table::async_write_handler(AsyncWriteContext* context, Status status) 446 | { 447 | { 448 | ScopedMutex lock(mutex_); 449 | fly_writers_--; 450 | } 451 | 452 | if (status.succ) { 453 | ScopedMutex lock(block_entry_mutex_); 454 | 455 | BlockEntry::iterator iter = block_entry_.find(context->nid); 456 | 457 | if (iter == block_entry_.end()) { 458 | BlockHandle* handle = new BlockHandle(); 459 | 460 | *handle = context->handle; 461 | block_entry_[context->nid] = handle; 462 | } else { 463 | BlockHandle* handle = iter->second; 464 | 465 | add_fly_hole(handle->offset, PAGE_ROUND_UP(handle->size)); 466 | *handle = context->handle; 467 | } 468 | } else { 469 | LOG_ERROR << "async_write error, " << Fmt("nid=%zu", context->nid); 470 | add_hole(context->handle.offset, context->handle.size); 471 | } 472 | 473 | context->callback(status); 474 | delete context; 475 | } 476 | 477 | uint64_t Table::find_space(uint32_t size) 478 | { 
479 | uint64_t offset; 480 | 481 | if (get_hole(size, offset)) 482 | return offset; 483 | 484 | ScopedMutex lock(mutex_); 485 | // no hole suit for us, append the file 486 | offset = offset_; 487 | offset_ += size; 488 | 489 | if (offset_ > file_size_) 490 | file_size_ = offset_; 491 | 492 | return offset; 493 | } 494 | 495 | Block* Table::read_block(const BlockHandle* handle) 496 | { 497 | Slice alloc_ptr = self_alloc(handle->size); 498 | if (alloc_ptr.size() == 0) { 499 | LOG_INFO << alloc_ptr.size(); 500 | assert(false); 501 | } 502 | assert(alloc_ptr.size()); 503 | 504 | if (!read_file(handle->offset, alloc_ptr)) { 505 | LOG_ERROR << "read_file failed"; 506 | self_dealloc(alloc_ptr); 507 | return NULL; 508 | } 509 | 510 | return new Block(alloc_ptr, 0, handle->size); 511 | } 512 | 513 | bool Table::read_file(uint64_t offset, Slice& buffer) 514 | { 515 | { 516 | ScopedMutex lock(mutex_); 517 | fly_readers_++; 518 | } 519 | 520 | Status status = file_->read(offset, buffer); 521 | 522 | { 523 | ScopedMutex lock(mutex_); 524 | fly_readers_--; 525 | } 526 | 527 | if (!status.succ) { 528 | LOG_ERROR << "read file error, " << Fmt("offset=%zu.", offset); 529 | return false; 530 | } 531 | return true; 532 | } 533 | 534 | bool Table::write_file(uint64_t offset, const Slice& buffer) 535 | { 536 | { 537 | ScopedMutex lock(mutex_); 538 | fly_writers_++; 539 | } 540 | 541 | Status status = file_->write(offset, buffer); 542 | 543 | { 544 | ScopedMutex lock(mutex_); 545 | fly_writers_--; 546 | } 547 | 548 | if (!status.succ) { 549 | LOG_ERROR << "write file error, " << Fmt("offset=%zu.", offset); 550 | return false; 551 | } 552 | return true; 553 | } 554 | 555 | Slice Table::self_alloc(size_t size) 556 | { 557 | size_t aligned_size = PAGE_ROUND_UP(size); 558 | void* alloc_ptr = NULL; 559 | 560 | if (posix_memalign(&alloc_ptr, PAGE_SIZE, aligned_size) != 0) { 561 | LOG_ERROR << "posix_memalign error: " << strerror(errno); 562 | return Slice(); 563 | } 564 | assert(alloc_ptr); 
565 | return Slice((char*)alloc_ptr, aligned_size); 566 | } 567 | 568 | void Table::self_dealloc(Slice alloc_ptr) 569 | { 570 | if (alloc_ptr.size()) 571 | free((char*)alloc_ptr.data()); 572 | } 573 | -------------------------------------------------------------------------------- /test/db_bench.cc: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2011 The LevelDB Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style license that can be 3 | // found in the LICENSE file. See the AUTHORS file for names of contributors. 4 | 5 | // This file is copied from LevelDB and modifed a little 6 | // to add LevelDB style benchmark 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | 15 | #include "yodb/db.h" 16 | #include "util/timestamp.h" 17 | 18 | #include "random.h" 19 | #include "testutil.h" 20 | #include "histogram.h" 21 | 22 | using namespace std; 23 | using namespace yodb; 24 | 25 | // Comma-separated list of operations to run in the specified order 26 | // Actual benchmarks: 27 | // 28 | // fillseq -- write N values in sequential key order in async mode 29 | // fillrandom -- write N values in random key order in async mode 30 | // readseq -- read N times sequentially 31 | // readrandom -- read N times in random order 32 | static const char* FLAGS_benchmarks = 33 | "fillseq," 34 | "readseq," 35 | "readhot," 36 | "fillrandom," 37 | "readhot," 38 | "readrandom," 39 | ; 40 | 41 | // Number of key/values to place in database 42 | static size_t FLAGS_num = 1000000; 43 | 44 | // Number of read operations to do. If zero, do FLAGS_num reads. 45 | static size_t FLAGS_reads = 0; 46 | 47 | // Number of concurrent threads to run. 
48 | static int FLAGS_threads = 1; 49 | 50 | // Size of each value 51 | static size_t FLAGS_value_size = 100; 52 | 53 | // Arrange to generate values that shrink to this fraction of 54 | // their original size after compression 55 | static double FLAGS_compression_ratio = 0.5; 56 | 57 | // Print histogram of operation timings 58 | static bool FLAGS_histogram = false; 59 | 60 | // Number of bytes to use as a cache of uncompressed data. 61 | // Zero means use default setings. 62 | static size_t FLAGS_cache_size = 0; 63 | 64 | // If true, do not destroy the existing database. If you set this 65 | // flag and also specify a benchmark that wants a fresh database, that 66 | // benchmark will fail. 67 | static bool FLAGS_use_existing_db = false; 68 | 69 | // Use the db with the following name. 70 | static const char* FLAGS_db = NULL; 71 | 72 | 73 | // Helper for quickly generating random values. 74 | class RandomGenerator { 75 | private: 76 | std::string data_; 77 | size_t pos_; 78 | 79 | public: 80 | RandomGenerator() { 81 | // We use a limited amount of data over and over again and ensure 82 | // that it is larger than the compression window (32K), and also 83 | // large enough to serve all typical value sizes we want to write. 84 | Random rnd(301); 85 | std::string piece; 86 | while (data_.size() < 1048576) { 87 | // Add a short fragment that is as compressible as specified 88 | // by compression_ratio. 
89 | CompressibleSlice(&rnd, FLAGS_compression_ratio, 100U, &piece); 90 | data_.append(piece); 91 | } 92 | pos_ = 0; 93 | } 94 | 95 | Slice Generate(size_t len) { 96 | if (pos_ + len > data_.size()) { 97 | pos_ = 0; 98 | assert(len < data_.size()); 99 | } 100 | pos_ += len; 101 | return Slice(data_.data() + pos_ - len, len); 102 | } 103 | }; 104 | 105 | inline 106 | static void DBSynchronize(DB* db) 107 | { 108 | // Synchronize will flush writes to disk 109 | // db->flush(); 110 | } 111 | 112 | static Slice TrimSpace(Slice s) { 113 | size_t start = 0; 114 | while (start < s.size() && isspace(s[start])) { 115 | start++; 116 | } 117 | size_t limit = s.size(); 118 | while (limit > start && isspace(s[limit-1])) { 119 | limit--; 120 | } 121 | return Slice(s.data() + start, limit - start); 122 | } 123 | 124 | static void AppendWithSpace(std::string* str, Slice msg) { 125 | if (msg.empty()) return; 126 | if (!str->empty()) { 127 | str->push_back(' '); 128 | } 129 | str->append(msg.data(), msg.size()); 130 | } 131 | 132 | class Stats { 133 | private: 134 | Timestamp start_; 135 | Timestamp finish_; 136 | double seconds_; 137 | int done_; 138 | int next_report_; 139 | int64_t bytes_; 140 | Timestamp last_op_finish_; 141 | Histogram hist_; 142 | std::string message_; 143 | 144 | public: 145 | Stats() { Start(); } 146 | 147 | void Start() { 148 | next_report_ = 100; 149 | last_op_finish_ = start_; 150 | hist_.Clear(); 151 | done_ = 0; 152 | bytes_ = 0; 153 | seconds_ = 0; 154 | start_ = Timestamp::now(); 155 | finish_ = start_; 156 | message_.clear(); 157 | } 158 | 159 | void Merge(const Stats& other) { 160 | hist_.Merge(other.hist_); 161 | done_ += other.done_; 162 | bytes_ += other.bytes_; 163 | seconds_ += other.seconds_; 164 | if (other.start_ < start_) start_ = other.start_; 165 | if (other.finish_ > finish_) finish_ = other.finish_; 166 | 167 | // Just keep the messages from one thread 168 | if (message_.empty()) message_ = other.message_; 169 | } 170 | 171 | void 
Stop() { 172 | finish_ = Timestamp::now(); 173 | seconds_ = time_interval(finish_, start_); 174 | } 175 | 176 | void AddMessage(Slice msg) { 177 | AppendWithSpace(&message_, msg); 178 | } 179 | 180 | void FinishedSingleOp() { 181 | if (FLAGS_histogram) { 182 | Timestamp now = Timestamp::now(); 183 | double micros = time_interval(now, last_op_finish_) * Timestamp::kMicroPerSecond; 184 | hist_.Add(micros); 185 | if (micros > 20000) { 186 | fprintf(stderr, "long op: %.1f micros%30s\r", micros, ""); 187 | fflush(stderr); 188 | } 189 | last_op_finish_ = now; 190 | } 191 | 192 | done_++; 193 | if (done_ >= next_report_) { 194 | if (next_report_ < 1000) next_report_ += 100; 195 | else if (next_report_ < 5000) next_report_ += 500; 196 | else if (next_report_ < 10000) next_report_ += 1000; 197 | else if (next_report_ < 50000) next_report_ += 5000; 198 | else if (next_report_ < 100000) next_report_ += 10000; 199 | else if (next_report_ < 500000) next_report_ += 50000; 200 | else next_report_ += 100000; 201 | fprintf(stderr, "... finished %d ops%30s\r", done_, ""); 202 | fflush(stderr); 203 | } 204 | } 205 | 206 | void AddBytes(int64_t n) { 207 | bytes_ += n; 208 | } 209 | 210 | void Report(const Slice& name) { 211 | // Pretend at least one op was done in case we are running a benchmark 212 | // that does not call FinishedSingleOp(). 213 | if (done_ < 1) done_ = 1; 214 | 215 | double elapsed = time_interval(finish_, start_); 216 | 217 | std::string extra; 218 | if (bytes_ > 0) { 219 | // Rate is computed on actual elapsed time, not the sum of per-thread 220 | // elapsed times. 221 | char rate[100]; 222 | snprintf(rate, sizeof(rate), "%6.1f MB/s", 223 | (bytes_ / 1048576.0) / elapsed); 224 | extra = rate; 225 | } 226 | AppendWithSpace(&extra, message_); 227 | 228 | //should computed on actual elapsed time too 229 | fprintf(stdout, "%-12s : %11.3f micros/op;%s%s\n", 230 | name.to_string().c_str(), 231 | elapsed * 1e6 / done_, 232 | (extra.empty() ? 
"" : " "), 233 | extra.c_str()); 234 | if (FLAGS_histogram) { 235 | fprintf(stdout, "Microseconds per op:\n%s\n", hist_.ToString().c_str()); 236 | } 237 | fflush(stdout); 238 | } 239 | }; 240 | 241 | // State shared by all concurrent executions of the same benchmark. 242 | struct SharedState { 243 | Mutex mu; 244 | CondVar cv; 245 | int total; 246 | 247 | // Each thread goes through the following states: 248 | // (1) initializing 249 | // (2) waiting for others to be initialized 250 | // (3) running 251 | // (4) done 252 | 253 | int num_initialized; 254 | int num_done; 255 | bool start; 256 | 257 | SharedState() : cv(mu) { } 258 | }; 259 | 260 | // Per-thread state for concurrent executions of the same benchmark. 261 | struct ThreadState { 262 | int tid; // 0..n-1 when running in n threads 263 | Random rand; // Has different seeds for different threads 264 | Stats stats; 265 | SharedState* shared; 266 | 267 | ThreadState(int index) 268 | : tid(index), 269 | rand(1000 + index) { 270 | } 271 | }; 272 | 273 | class Benchmark { 274 | private: 275 | Env* env_; 276 | Comparator *comparator_; 277 | DB *db_; 278 | int db_num_; 279 | size_t num_; 280 | size_t reads_; 281 | double start_; 282 | double last_op_finish_; 283 | int64_t bytes_; 284 | std::string message_; 285 | Histogram hist_; 286 | RandomGenerator gen_; 287 | Random rand_; 288 | 289 | // State kept for progress messages 290 | int done_; 291 | int next_report_; // When to report next 292 | 293 | void PrintHeader() { 294 | const int kKeySize = 16; 295 | PrintEnvironment(); 296 | fprintf(stdout, "Keys: %d bytes each\n", kKeySize); 297 | fprintf(stdout, "Values: %ld bytes each (%ld bytes after compression)\n", 298 | FLAGS_value_size, 299 | static_cast(FLAGS_value_size * FLAGS_compression_ratio + 0.5)); 300 | fprintf(stdout, "Entries: %ld\n", num_); 301 | fprintf(stdout, "RawSize: %.1f MB (estimated)\n", 302 | ((static_cast(kKeySize + FLAGS_value_size) * num_) 303 | / 1048576.0)); 304 | #ifdef HAS_SNAPPY 305 | 
fprintf(stdout, "FileSize: %.1f MB (estimated)\n", 306 | (((kKeySize + FLAGS_value_size * FLAGS_compression_ratio) * num_) 307 | / 1048576.0)); 308 | #else 309 | fprintf(stdout, "FileSize: %.1f MB (estimated, compression disabled)\n", 310 | (((kKeySize + FLAGS_value_size) * num_) 311 | / 1048576.0)); 312 | #endif 313 | PrintWarnings(); 314 | fprintf(stdout, "------------------------------------------------\n"); 315 | } 316 | 317 | void PrintWarnings() { 318 | #if defined(__GNUC__) && !defined(__OPTIMIZE__) 319 | fprintf(stdout, 320 | "WARNING: Optimization is disabled: benchmarks unnecessarily slow\n" 321 | ); 322 | #endif 323 | #ifndef NDEBUG 324 | fprintf(stdout, 325 | "WARNING: Assertions are enabled; benchmarks unnecessarily slow\n"); 326 | #endif 327 | #ifndef HAS_SNAPPY 328 | fprintf(stdout, 329 | "WARNING: Snappy compression is disabled\n"); 330 | #endif 331 | #ifndef HAS_LIBAIO 332 | fprintf(stdout, 333 | "WARNING: Linux AIO is disabled, Posix AIO (simulate AIO with user threads) is used instead\n"); 334 | #endif 335 | } 336 | 337 | void PrintEnvironment() { 338 | fprintf(stderr, "yodb: Alpha version\n"); 339 | 340 | #if defined(__linux) 341 | time_t now = time(NULL); 342 | fprintf(stderr, "Date: %s", ctime(&now)); // ctime() adds newline 343 | 344 | FILE* cpuinfo = fopen("/proc/cpuinfo", "r"); 345 | if (cpuinfo != NULL) { 346 | char line[1000]; 347 | int num_cpus = 0; 348 | std::string cpu_type; 349 | std::string cache_size; 350 | while (fgets(line, sizeof(line), cpuinfo) != NULL) { 351 | const char* sep = strchr(line, ':'); 352 | if (sep == NULL) { 353 | continue; 354 | } 355 | Slice key = TrimSpace(Slice(line, sep - 1 - line)); 356 | Slice val = TrimSpace(Slice(sep + 1)); 357 | if (key == "model name") { 358 | ++num_cpus; 359 | cpu_type = val.to_string(); 360 | } else if (key == "cache size") { 361 | cache_size = val.to_string(); 362 | } 363 | } 364 | fclose(cpuinfo); 365 | fprintf(stderr, "CPU: %d * %s\n", num_cpus, cpu_type.c_str()); 366 | 
fprintf(stderr, "CPUCache: %s\n", cache_size.c_str()); 367 | } 368 | #endif 369 | } 370 | 371 | public: 372 | 373 | Benchmark() 374 | : env_(new Env(FLAGS_db)), 375 | comparator_(new BytewiseComparator()), 376 | db_(NULL), 377 | db_num_(0), 378 | num_(FLAGS_num), 379 | reads_(FLAGS_reads == 0 ? FLAGS_num : FLAGS_reads), 380 | bytes_(0), 381 | rand_(301) { 382 | } 383 | 384 | ~Benchmark() { 385 | delete db_; 386 | delete comparator_; 387 | delete env_; 388 | } 389 | 390 | 391 | void Run() { 392 | PrintHeader(); 393 | Open(); 394 | 395 | int cnt = 0; 396 | 397 | const char* benchmarks = FLAGS_benchmarks; 398 | while (benchmarks != NULL) { 399 | const char* sep = strchr(benchmarks, ','); 400 | Slice name; 401 | if (sep == NULL) { 402 | name = benchmarks; 403 | benchmarks = NULL; 404 | } else { 405 | name = Slice(benchmarks, sep - benchmarks); 406 | benchmarks = sep + 1; 407 | } 408 | 409 | void (Benchmark::*method)(ThreadState*) = NULL; 410 | 411 | bool fresh_db = false; 412 | int num_threads = FLAGS_threads; 413 | 414 | if (name == Slice("fillseq")) { 415 | fresh_db = true; 416 | method = &Benchmark::WriteSeq; 417 | } else if (name == Slice("fillrandom")) { 418 | fresh_db = true; 419 | method = &Benchmark::WriteRandom; 420 | } else if (name == Slice("readseq")) { 421 | method = &Benchmark::ReadSequential; 422 | } else if (name == Slice("readrandom")) { 423 | method = &Benchmark::ReadRandom; 424 | } else if (name == Slice("readhot")) { 425 | method = &Benchmark::ReadHot; 426 | } else { 427 | if (name != Slice()) { // No error message for empty name 428 | fprintf(stderr, "unknown benchmark '%s'\n", name.to_string().c_str()); 429 | } 430 | } 431 | 432 | // Create new database if recreate is true 433 | if (fresh_db) { 434 | if (FLAGS_use_existing_db) { 435 | message_ = "skipping (--use_existing_db is true)"; 436 | method = NULL; 437 | } else { 438 | if (cnt != 0) { 439 | delete db_; 440 | db_ = NULL; 441 | Open(); 442 | } 443 | } 444 | } 445 | 446 | if (method) { 447 | 
RunBenchmark(num_threads, name, method); 448 | cnt ++; 449 | } 450 | } 451 | } 452 | 453 | private: 454 | struct ThreadArg { 455 | Benchmark* bm; 456 | SharedState* shared; 457 | ThreadState* thread; 458 | void (Benchmark::*method)(ThreadState*); 459 | }; 460 | 461 | static void* ThreadBody(void* v) { 462 | ThreadArg* arg = reinterpret_cast(v); 463 | SharedState* shared = arg->shared; 464 | ThreadState* thread = arg->thread; 465 | { 466 | ScopedMutex l(shared->mu); 467 | shared->num_initialized++; 468 | if (shared->num_initialized >= shared->total) { 469 | shared->cv.notify_all(); 470 | } 471 | while (!shared->start) { 472 | shared->cv.wait(); 473 | } 474 | } 475 | 476 | thread->stats.Start(); 477 | (arg->bm->*(arg->method))(thread); 478 | thread->stats.Stop(); 479 | 480 | { 481 | ScopedMutex l(shared->mu); 482 | shared->num_done++; 483 | if (shared->num_done >= shared->total) { 484 | shared->cv.notify_all(); 485 | } 486 | } 487 | return NULL; 488 | } 489 | 490 | void RunBenchmark(int n, Slice name, 491 | void (Benchmark::*method)(ThreadState*)) { 492 | SharedState shared; 493 | shared.total = n; 494 | shared.num_initialized = 0; 495 | shared.num_done = 0; 496 | shared.start = false; 497 | 498 | ThreadArg* arg = new ThreadArg[n]; 499 | for (int i = 0; i < n; i++) { 500 | arg[i].bm = this; 501 | arg[i].method = method; 502 | arg[i].shared = &shared; 503 | arg[i].thread = new ThreadState(i); 504 | arg[i].thread->shared = &shared; 505 | Thread *thr = new Thread(boost::bind(&Benchmark::ThreadBody, &arg[i])); 506 | thr->run(); 507 | } 508 | 509 | shared.mu.lock(); 510 | while (shared.num_initialized < n) { 511 | shared.cv.wait(); 512 | } 513 | 514 | shared.start = true; 515 | shared.cv.notify_all(); 516 | while (shared.num_done < n) { 517 | shared.cv.wait(); 518 | } 519 | shared.mu.unlock(); 520 | 521 | for (int i = 1; i < n; i++) { 522 | arg[0].thread->stats.Merge(arg[i].thread->stats); 523 | } 524 | arg[0].thread->stats.Report(name); 525 | 526 | for (int i = 0; i < n; 
i++) { 527 | delete arg[i].thread; 528 | } 529 | delete[] arg; 530 | } 531 | 532 | void Open() 533 | { 534 | assert(db_ == NULL); 535 | 536 | Options opts; 537 | opts.env = env_; 538 | opts.comparator = comparator_; 539 | #ifdef HAS_SNAPPY 540 | opts.compress = kSnappyCompress; 541 | #endif 542 | if (FLAGS_cache_size) { 543 | opts.cache_limited_memory = FLAGS_cache_size; 544 | } 545 | 546 | char file_name[100]; 547 | db_num_++; 548 | snprintf(file_name, sizeof(file_name), 549 | "dbbench_yodb_%d", db_num_); 550 | 551 | db_ = DB::open(file_name, opts); 552 | if (!db_) { 553 | fprintf(stderr, "open error %s\n", file_name); 554 | exit(1); 555 | } 556 | } 557 | 558 | void WriteSeq(ThreadState* thread) 559 | { 560 | Write(thread, false); 561 | } 562 | 563 | void WriteRandom(ThreadState* thread) 564 | { 565 | Write(thread, true); 566 | } 567 | 568 | void Write(ThreadState* thread, bool random) 569 | { 570 | int64_t bytes = 0; 571 | for (size_t i = 0; i < num_; i++ ) { 572 | uint64_t k = random ? 
rand() % FLAGS_num: i; 573 | char key[100]; 574 | snprintf(key, sizeof(key), "%016ld", k); 575 | bytes += FLAGS_value_size + strlen(key); 576 | 577 | if (!db_->put(key, gen_.Generate(FLAGS_value_size))) { 578 | fprintf(stderr, "put key %ld error\n", k); 579 | } 580 | thread->stats.FinishedSingleOp(); 581 | } 582 | thread->stats.AddBytes(bytes); 583 | } 584 | 585 | void ReadSequential(ThreadState* thread) { 586 | int bytes = 0; 587 | Slice value; 588 | for (size_t i = 0; i < reads_; i++) { 589 | uint64_t k = i; 590 | char key[100]; 591 | snprintf(key, sizeof(key), "%016ld", k); 592 | if (db_->get(key, value)) { 593 | bytes += value.size() + strlen(key); 594 | value.release(); 595 | } 596 | thread->stats.FinishedSingleOp(); 597 | } 598 | thread->stats.AddBytes(bytes); 599 | } 600 | 601 | void ReadRandom(ThreadState* thread) { 602 | int bytes = 0; 603 | Slice value; 604 | for (size_t i = 0; i < reads_; i++) { 605 | uint64_t k = rand() % FLAGS_num; 606 | char key[100]; 607 | snprintf(key, sizeof(key), "%016ld", k); 608 | if (db_->get(key, value)) { 609 | bytes += value.size() + strlen(key); 610 | value.release(); 611 | } 612 | thread->stats.FinishedSingleOp(); 613 | } 614 | thread->stats.AddBytes(bytes); 615 | } 616 | 617 | void ReadHot(ThreadState* thread) { 618 | int bytes = 0; 619 | Slice value; 620 | uint64_t range = (FLAGS_num + 99) / 100; 621 | for (size_t i = 0; i < reads_; i++) { 622 | char key[100]; 623 | uint64_t k = rand() % range; 624 | snprintf(key, sizeof(key), "%016ld", k); 625 | if (db_->get(key, value)) { 626 | bytes += value.size() + strlen(key); 627 | value.release(); 628 | } 629 | thread->stats.FinishedSingleOp(); 630 | } 631 | thread->stats.AddBytes(bytes); 632 | } 633 | }; 634 | 635 | 636 | int main(int argc, char** argv) 637 | { 638 | for (int i = 1; i < argc; i++) { 639 | double d; 640 | long n; 641 | char junk; 642 | if (strncmp(argv[i], "--benchmarks=", 13) == 0) { 643 | FLAGS_benchmarks = argv[i] + strlen("--benchmarks="); 644 | } else if 
(sscanf(argv[i], "--compression_ratio=%lf%c", &d, &junk) == 1) { 645 | FLAGS_compression_ratio = d; 646 | } else if (sscanf(argv[i], "--histogram=%ld%c", &n, &junk) == 1 && 647 | (n == 0 || n == 1)) { 648 | FLAGS_histogram = n; 649 | } else if (sscanf(argv[i], "--use_existing_db=%ld%c", &n, &junk) == 1 && 650 | (n == 0 || n == 1)) { 651 | FLAGS_use_existing_db = n; 652 | } else if(sscanf(argv[i], "--cache_size=%ld%c", &n, &junk) == 1) { 653 | FLAGS_cache_size = n; 654 | } else if(sscanf(argv[i], "--num=%ld%c", &n, &junk) == 1) { 655 | FLAGS_num = n; 656 | } else if (sscanf(argv[i], "--reads=%ld%c", &n, &junk) == 1) { 657 | FLAGS_reads = n; 658 | } else if (sscanf(argv[i], "--threads=%ld%c", &n, &junk) == 1) { 659 | FLAGS_threads = n; 660 | } else if (sscanf(argv[i], "--value_size=%ld%c", &n, &junk) == 1) { 661 | FLAGS_value_size = n; 662 | } else if(strncmp(argv[i], "--db=", 5) == 0) { 663 | FLAGS_db = argv[i] + 5; 664 | } else { 665 | cerr << "Invalid flag '" << argv[i] << "'" << endl; 666 | exit(1); 667 | } 668 | } 669 | 670 | // Choose a location for the test database if none given with --db= 671 | if (FLAGS_db == NULL) { 672 | FLAGS_db = "."; 673 | } 674 | 675 | // string logpath = FLAGS_db; 676 | // logpath += "/yodb.log"; 677 | // init_logger(logpath, kInfo); 678 | 679 | Benchmark benchmark; 680 | benchmark.Run(); 681 | return 0; 682 | } 683 | --------------------------------------------------------------------------------