├── .gitignore ├── .travis.yml ├── CMakeLists.txt ├── LICENSE ├── README.md ├── cmake └── rocksdb_flags.cmake ├── codecov.yml ├── include └── titan │ ├── checkpoint.h │ ├── db.h │ ├── options.h │ └── statistics.h ├── scripts ├── format-diff.sh ├── travis-format.sh └── travis-make.sh ├── src ├── base_db_listener.cc ├── base_db_listener.h ├── blob_file_builder.cc ├── blob_file_builder.h ├── blob_file_cache.cc ├── blob_file_cache.h ├── blob_file_iterator.cc ├── blob_file_iterator.h ├── blob_file_iterator_test.cc ├── blob_file_manager.h ├── blob_file_reader.cc ├── blob_file_reader.h ├── blob_file_set.cc ├── blob_file_set.h ├── blob_file_size_collector.cc ├── blob_file_size_collector.h ├── blob_file_size_collector_test.cc ├── blob_file_test.cc ├── blob_format.cc ├── blob_format.h ├── blob_format_test.cc ├── blob_gc.cc ├── blob_gc.h ├── blob_gc_job.cc ├── blob_gc_job.h ├── blob_gc_job_test.cc ├── blob_gc_picker.cc ├── blob_gc_picker.h ├── blob_gc_picker_test.cc ├── blob_storage.cc ├── blob_storage.h ├── compaction_filter.h ├── compaction_filter_test.cc ├── db.cc ├── db_impl.cc ├── db_impl.h ├── db_impl_files.cc ├── db_impl_gc.cc ├── db_iter.h ├── edit_collector.h ├── gc_stats_test.cc ├── options.cc ├── table_builder.cc ├── table_builder.h ├── table_builder_test.cc ├── table_factory.cc ├── table_factory.h ├── testutil.h ├── thread_safety_test.cc ├── titan_checkpoint_impl.cc ├── titan_checkpoint_impl.h ├── titan_checkpoint_test.cc ├── titan_db_test.cc ├── titan_fault_injection_test_env.h ├── titan_logging.h ├── titan_options_test.cc ├── titan_stats.cc ├── titan_stats.h ├── util.cc ├── util.h ├── util_test.cc ├── version_edit.cc ├── version_edit.h └── version_test.cc ├── tools ├── benchmark.sh ├── blob_file_dump.cc ├── db_bench.cc ├── db_bench_tool.cc ├── db_bench_tool.h ├── manifest_dump.cc └── titandb_stress.cc └── util ├── titan_build_version.cc.in └── titan_build_version.h /.gitignore: -------------------------------------------------------------------------------- 1 | 
*.a 2 | *.o 3 | *.swp 4 | *_test 5 | titandb_bench 6 | titandb_stress 7 | 8 | build/ 9 | rocksdb 10 | Makefile 11 | *.cmake 12 | .idea/ 13 | .vscode/ 14 | cmake-build-debug/ 15 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | dist: focal 2 | 3 | language: cpp 4 | 5 | os: linux 6 | compiler: clang 7 | 8 | addons: 9 | apt: 10 | packages: 11 | - clang-format-10 12 | - g++-9 13 | - libgflags-dev 14 | - liblz4-dev 15 | - libsnappy-dev 16 | - libzstd-dev 17 | - lcov 18 | - zlib1g 19 | 20 | stages: 21 | - format 22 | - test 23 | 24 | env: 25 | - BUILD_TYPE="Release" 26 | - SANITIZER="ASAN" 27 | - SANITIZER="TSAN" 28 | - SANITIZER="UBSAN" 29 | 30 | # For GCC build, we also report code coverage to codecov. 31 | matrix: 32 | fast_finish: true 33 | include: 34 | - os: osx 35 | osx_image: xcode12.2 36 | env: 37 | - SANITIZER="ASAN" 38 | - os: linux 39 | compiler: 40 | env: 41 | - COMPILER=gcc9 42 | - os: linux 43 | arch: arm64 44 | env: 45 | - SANITIZER="ASAN" 46 | - stage: format 47 | os: linux 48 | compiler: 49 | env: 50 | - FORMATTER=ON 51 | 52 | install: 53 | # Don't do package update because Travis homebrew aren't cached. 54 | - if [ "${TRAVIS_OS_NAME}" == osx ]; then 55 | to_install=( "gflags" "zstd" "lz4" "snappy" "xz" ); 56 | brew list --formula > tmp; 57 | for ins in ${to_install[*]}; do 58 | grep -q "$ins" tmp; 59 | if [ $? -ne 0 ]; then 60 | brew install "$ins"; 61 | fi 62 | done 63 | fi 64 | - export CTEST_OUTPUT_ON_FAILURE=1 65 | - if [ "${COMPILER}" == gcc9 ]; then 66 | CC=gcc-9; 67 | CXX=g++-9; 68 | export COVERAGE_OPT="-DCODE_COVERAGE=ON"; 69 | fi 70 | - if [ ! -z "${BUILD_TYPE}" ]; then 71 | export BUILD_OPT="-DCMAKE_BUILD_TYPE=${BUILD_TYPE}"; 72 | else 73 | export BUILD_OPT="-DCMAKE_BUILD_TYPE=Debug"; 74 | fi 75 | - if [ ! 
-z "${SANITIZER}" ]; then 76 | export SANITIZER_OPT="-DWITH_${SANITIZER}=ON"; 77 | export TOOLS_OPT="-DWITH_TITAN_TOOLS=OFF"; 78 | fi 79 | - export COMPRESSION_OPT="-DWITH_SNAPPY=ON -DWITH_LZ4=ON -DWITH_ZLIB=ON -DWITH_ZSTD=ON" 80 | 81 | script: 82 | - if [ -z "${FORMATTER}" ]; then 83 | bash scripts/travis-make.sh; 84 | else 85 | bash scripts/travis-format.sh; 86 | fi 87 | 88 | after_success: 89 | - if [ "${COMPILER}" == gcc9 ]; then 90 | lcov --gcov-tool gcov-7 --directory . --capture --output-file coverage.info; 91 | lcov --remove coverage.info '/usr/*' --output-file coverage.info; 92 | lcov --list coverage.info; 93 | bash <(curl -s https://codecov.io/bash) -f coverage.info || echo "Codecov did not collect coverage reports"; 94 | fi 95 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.10) 2 | project(titan) 3 | enable_language(CXX) 4 | enable_language(C) 5 | find_package(Git) 6 | 7 | if (NOT ROCKSDB_GIT_REPO) 8 | set(ROCKSDB_GIT_REPO "https://github.com/tikv/rocksdb.git") 9 | endif() 10 | 11 | if (NOT ROCKSDB_GIT_BRANCH) 12 | set(ROCKSDB_GIT_BRANCH "8.10.tikv") 13 | endif() 14 | 15 | if (NOT DEFINED ROCKSDB_DIR) 16 | if (GIT_FOUND) 17 | if (WIN32) 18 | execute_process(COMMAND $ENV{COMSPEC} /C ${GIT_EXECUTABLE} clone --branch=${ROCKSDB_GIT_BRANCH} ${ROCKSDB_GIT_REPO}) 19 | else() 20 | execute_process(COMMAND ${GIT_EXECUTABLE} clone --branch=${ROCKSDB_GIT_BRANCH} ${ROCKSDB_GIT_REPO}) 21 | endif() 22 | set(ROCKSDB_DIR "${CMAKE_BINARY_DIR}/rocksdb") 23 | endif() 24 | endif() 25 | 26 | if (NOT DEFINED ROCKSDB_DIR) 27 | message(FATAL_ERROR "ROCKSDB_DIR is not defined.") 28 | endif() 29 | 30 | get_filename_component(CMAKE_MODULE_PATH "${ROCKSDB_DIR}/cmake/modules/" ABSOLUTE) 31 | include(cmake/rocksdb_flags.cmake) 32 | 33 | include_directories(${ROCKSDB_DIR}) 34 | 
include_directories(${ROCKSDB_DIR}/include) 35 | include_directories(${PROJECT_SOURCE_DIR}/include) 36 | include_directories(${PROJECT_SOURCE_DIR}/src) 37 | include_directories(${PROJECT_SOURCE_DIR}/util) 38 | 39 | file(GLOB SOURCES src/*.cc) 40 | file(GLOB TEST_SOURCES src/*test.cc) 41 | list(REMOVE_ITEM SOURCES ${TEST_SOURCES}) 42 | 43 | add_library(titan STATIC ${SOURCES} $) 44 | 45 | option(WITH_TITAN_TESTS "Build with tests." ON) 46 | option(WITH_TITAN_TOOLS "Build with tools." ON) 47 | option(TRAVIS "Building in Travis." OFF) 48 | option(CODE_COVERAGE "Generate code coverage report." OFF) 49 | 50 | if (CMAKE_COMPILER_IS_GNUCXX) 51 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-array-bounds") 52 | endif() 53 | 54 | if (NOT CMAKE_BUILD_TYPE STREQUAL "Debug") 55 | set(WITH_TITAN_TESTS OFF) 56 | endif() 57 | 58 | if(GIT_FOUND AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/.git") 59 | if(WIN32) 60 | execute_process(COMMAND $ENV{COMSPEC} /C ${GIT_EXECUTABLE} rev-parse HEAD OUTPUT_VARIABLE GIT_SHA) 61 | else() 62 | execute_process(COMMAND ${GIT_EXECUTABLE} rev-parse HEAD OUTPUT_VARIABLE GIT_SHA) 63 | endif() 64 | else() 65 | set(GIT_SHA 0) 66 | endif() 67 | string(REGEX REPLACE "[^0-9a-f]+" "" GIT_SHA "${GIT_SHA}") 68 | 69 | set(TITAN_BUILD_VERSION_CC ${CMAKE_BINARY_DIR}/titan_build_version.cc) 70 | configure_file(util/titan_build_version.cc.in ${TITAN_BUILD_VERSION_CC} @ONLY) 71 | add_library(titan_build_version OBJECT ${TITAN_BUILD_VERSION_CC}) 72 | target_include_directories(titan_build_version PRIVATE 73 | ${CMAKE_CURRENT_SOURCE_DIR}/util) 74 | 75 | if (TRAVIS) 76 | add_definitions(-DTRAVIS) 77 | endif() 78 | 79 | if (CODE_COVERAGE) 80 | target_compile_options(titan PRIVATE "--coverage") 81 | set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} --coverage") 82 | endif() 83 | 84 | if(WITH_ASAN OR WITH_TSAN) 85 | find_package(Threads) 86 | if(CMAKE_USE_PTHREADS_INIT) 87 | link_libraries(pthread) 88 | endif() 89 | endif() 90 | 91 | if (WITH_TITAN_TESTS OR 
WITH_TITAN_TOOLS) 92 | add_subdirectory(${ROCKSDB_DIR} rocksdb EXCLUDE_FROM_ALL) 93 | # Check if -latomic is required or not 94 | if (NOT MSVC) 95 | set(CMAKE_REQUIRED_FLAGS "--std=c++17") 96 | CHECK_CXX_SOURCE_COMPILES(" 97 | #include 98 | std::atomic x(0); 99 | int main() { 100 | uint64_t i = x.load(std::memory_order_relaxed); 101 | bool b = x.is_lock_free(); 102 | return 0; 103 | } 104 | " BUILTIN_ATOMIC) 105 | if (NOT BUILTIN_ATOMIC) 106 | #TODO: Check if -latomic exists 107 | list(APPEND THIRDPARTY_LIBS atomic) 108 | endif() 109 | endif() 110 | endif() 111 | 112 | # Check if -latomic is required or not 113 | if (NOT MSVC) 114 | set(CMAKE_REQUIRED_FLAGS "--std=c++17") 115 | CHECK_CXX_SOURCE_COMPILES(" 116 | #include 117 | std::atomic x(0); 118 | int main() { 119 | uint64_t i = x.load(std::memory_order_relaxed); 120 | bool b = x.is_lock_free(); 121 | return 0; 122 | } 123 | " BUILTIN_ATOMIC) 124 | if (NOT BUILTIN_ATOMIC) 125 | #TODO: Check if -latomic exists 126 | list(APPEND THIRDPARTY_LIBS atomic) 127 | endif() 128 | endif() 129 | 130 | if (WITH_TITAN_TESTS AND (CMAKE_BUILD_TYPE STREQUAL "Debug")) 131 | include(CTest) 132 | include_directories(SYSTEM ${ROCKSDB_DIR}/third-party/gtest-1.8.1/fused-src) 133 | 134 | set(TEST_LIBS 135 | titan 136 | testutillib 137 | testharness 138 | gtest) 139 | 140 | set(TESTS 141 | blob_file_iterator_test 142 | blob_file_size_collector_test 143 | blob_file_test 144 | blob_format_test 145 | blob_gc_job_test 146 | blob_gc_picker_test 147 | gc_stats_test 148 | table_builder_test 149 | thread_safety_test 150 | titan_db_test 151 | titan_checkpoint_test 152 | titan_options_test 153 | util_test 154 | compaction_filter_test 155 | version_test) 156 | 157 | foreach(test ${TESTS}) 158 | add_executable(titan_${test} src/${test}.cc) 159 | target_link_libraries(titan_${test} ${TEST_LIBS}) 160 | add_test(titan_${test} titan_${test}) 161 | endforeach(test ${TESTS}) 162 | endif() 163 | 164 | if (WITH_TITAN_TOOLS) 165 | set(TOOLS_LIBS titan 
rocksdb) 166 | 167 | if (NOT TRAVIS) 168 | find_package(gflags REQUIRED) 169 | else() 170 | # Hack: On Travis (with Ubuntu xenial or before), libgflags-dev package doesn't come with 171 | # gflags-config.cmake, so find_package will fail. Hard-code gflag path for now. 172 | if (NOT APPLE) 173 | set(gflags_INCLUDE_DIR "/usr/include/gflags") 174 | list(APPEND TOOLS_LIBS "/usr/lib/x86_64-linux-gnu/libgflags.a") 175 | endif() 176 | endif() 177 | add_definitions(-DGFLAGS) 178 | 179 | add_executable(titandb_stress tools/titandb_stress.cc) 180 | target_include_directories(titandb_stress PRIVATE ${GFLAGS_INCLUDE_DIR}) 181 | target_link_libraries(titandb_stress ${TOOLS_LIBS}) 182 | 183 | add_executable(titandb_bench tools/db_bench.cc tools/db_bench_tool.cc) 184 | target_include_directories(titandb_bench PRIVATE ${GFLAGS_INCLUDE_DIR}) 185 | target_link_libraries(titandb_bench ${TOOLS_LIBS}) 186 | 187 | add_executable(titan_manifest_dump tools/manifest_dump.cc) 188 | target_include_directories(titan_manifest_dump PRIVATE ${GFLAGS_INCLUDE_DIR}) 189 | target_link_libraries(titan_manifest_dump ${TOOLS_LIBS}) 190 | 191 | add_executable(titan_blob_file_dump tools/blob_file_dump.cc) 192 | target_include_directories(titan_blob_file_dump PRIVATE ${GFLAGS_INCLUDE_DIR}) 193 | target_link_libraries(titan_blob_file_dump ${TOOLS_LIBS}) 194 | endif() 195 | 196 | # Installation - copy lib/ and include/ 197 | 198 | include(GNUInstallDirs) 199 | install(DIRECTORY include/titan 200 | COMPONENT devel 201 | DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}" 202 | ) 203 | install(TARGETS titan 204 | COMPONENT devel 205 | ARCHIVE DESTINATION "${CMAKE_INSTALL_LIBDIR}" 206 | INCLUDES DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}" 207 | ) 208 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Titan: A RocksDB Plugin to Reduce Write Amplification 2 | 3 | [![Build 
Status](https://travis-ci.org/tikv/titan.svg?branch=master)](https://travis-ci.org/tikv/titan) 4 | [![codecov](https://codecov.io/gh/tikv/titan/branch/master/graph/badge.svg)](https://codecov.io/gh/tikv/titan) 5 | 6 | Titan is a RocksDB Plugin for key-value separation, inspired by 7 | [WiscKey](https://www.usenix.org/system/files/conference/fast16/fast16-papers-lu.pdf). 8 | For introduction and design details, see our 9 | [blog post](https://pingcap.com/blog/titan-storage-engine-design-and-implementation/). 10 | 11 | ## Build and Test 12 | Titan relies on RocksDB source code to build. You need to checkout RocksDB source code locally, 13 | and provide the path to Titan build script. 14 | ``` 15 | # To build: 16 | mkdir -p build 17 | cd build 18 | cmake .. 19 | make -j 20 | 21 | # To specify custom rocksdb 22 | cmake .. -DROCKSDB_DIR= 23 | # or 24 | cmake .. -DROCKSDB_GIT_REPO= -DROCKSDB_GIT_BRANCH= 25 | 26 | # Build static lib (i.e. libtitan.a) only: 27 | make titan -j 28 | 29 | # Release build: 30 | cmake .. -DROCKSDB_DIR= -DCMAKE_BUILD_TYPE=Release 31 | 32 | # Building with sanitizer (e.g. ASAN): 33 | cmake .. -DROCKSDB_DIR= -DWITH_ASAN=ON 34 | 35 | # Building with compression libraries (e.g. snappy): 36 | cmake .. -DROCKSDB_DIR= -DWITH_SNAPPY=ON 37 | 38 | # Run tests after build. You need to filter tests by "titan" prefix. 39 | ctest -R titan 40 | 41 | # To format code, install clang-format and run the script. 
42 | bash scripts/format-diff.sh 43 | ``` 44 | 45 | ## Compatibility with RocksDB 46 | 47 | | Titan Version | TiKV Version | RocksDB Version (TiKV fork) | 48 | | --------------| ------------- | ---------------------------- | 49 | | master | 8.5.0 + | [8.10.tikv] | 50 | | [tikv-7.5] | 7.5.0 + | [6.29.tikv] | 51 | | [tikv-7.1] | 7.1.0 + | [6.29.tikv] | 52 | | [tikv-6.5] | 6.5.0 + | [6.29.tikv] | 53 | | [tikv-6.1] | 6.1.0 + | [6.4.tikv] | 54 | 55 | [6.4.tikv]: https://github.com/tikv/rocksdb/tree/6.4.tikv 56 | [6.29.tikv]: https://github.com/tikv/rocksdb/tree/6.29.tikv 57 | [8.10.tikv]: https://github.com/tikv/rocksdb/tree/8.10.tikv 58 | [tikv-6.1]: https://github.com/tikv/titan/tree/tikv-6.1 59 | [tikv-6.5]: https://github.com/tikv/titan/tree/tikv-6.5 60 | [tikv-7.1]: https://github.com/tikv/titan/tree/tikv-7.1 61 | [tikv-7.5]: https://github.com/tikv/titan/tree/tikv-7.5 62 | -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- 1 | comment: 2 | layout: "diff" 3 | behavior: default 4 | 5 | coverage: 6 | status: 7 | project: off 8 | patch: off 9 | -------------------------------------------------------------------------------- /include/titan/checkpoint.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "db.h" 4 | 5 | namespace rocksdb { 6 | namespace titandb { 7 | 8 | class Checkpoint { 9 | public: 10 | // Creates a Checkpoint object to be used for creating openable snapshots 11 | static Status Create(TitanDB* db, Checkpoint** checkpoint_ptr); 12 | 13 | // Builds an openable snapshot of TitanDB. 14 | // base_checkpoint_dir: checkpoint directory of base DB 15 | // titan_checkpoint_dir: checkpoint directory of TitanDB, if not specified, 16 | // default value is {base_checkpoint_dir}/titandb. 
17 | // The specified directory should contain absolute path and not exist, it 18 | // will be created by the API. 19 | // When a checkpoint is created: 20 | // (1) SST and blob files are hard linked if the output directory is on the 21 | // same filesystem as the database, and copied otherwise. 22 | // (2) MANIFEST file specific to TitanDB will be regenerated based on all 23 | // existing blob files. 24 | // (3) other required files are always copied. 25 | // log_size_for_flush: if the total log file size is equal or larger than 26 | // this value, then a flush is triggered for all the column families. The 27 | // default value is 0, which means flush is always triggered. If you move 28 | // away from the default, the checkpoint may not contain up-to-date data 29 | // if WAL writing is not always enabled. 30 | // Flush will always trigger if it is 2PC. 31 | virtual Status CreateCheckpoint(const std::string& base_checkpoint_dir, 32 | const std::string& titan_checkpoint_dir = "", 33 | uint64_t log_size_for_flush = 0); 34 | 35 | virtual ~Checkpoint() {} 36 | }; 37 | 38 | } // namespace titandb 39 | } // namespace rocksdb 40 | -------------------------------------------------------------------------------- /include/titan/db.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "rocksdb/utilities/stackable_db.h" 4 | 5 | #include "titan/options.h" 6 | 7 | namespace rocksdb { 8 | namespace titandb { 9 | 10 | class VersionEdit; 11 | 12 | struct TitanCFDescriptor { 13 | std::string name; 14 | TitanCFOptions options; 15 | TitanCFDescriptor() 16 | : name(kDefaultColumnFamilyName), options(TitanCFOptions()) {} 17 | TitanCFDescriptor(const std::string& _name, const TitanCFOptions& _options) 18 | : name(_name), options(_options) {} 19 | }; 20 | 21 | class TitanDB : public StackableDB { 22 | public: 23 | static Status Open(const TitanOptions& options, const std::string& dbname, 24 | TitanDB** db); 25 | 26 | static 
Status Open(const TitanDBOptions& db_options, 27 | const std::string& dbname, 28 | const std::vector& descs, 29 | std::vector* handles, TitanDB** db); 30 | 31 | TitanDB() : StackableDB(nullptr) {} 32 | 33 | using StackableDB::CreateColumnFamily; 34 | Status CreateColumnFamily(const ColumnFamilyOptions& options, 35 | const std::string& name, 36 | ColumnFamilyHandle** handle) override { 37 | TitanCFDescriptor desc(name, TitanCFOptions(options)); 38 | return CreateColumnFamily(desc, handle); 39 | } 40 | Status CreateColumnFamily(const TitanCFDescriptor& desc, 41 | ColumnFamilyHandle** handle) { 42 | std::vector handles; 43 | Status s = CreateColumnFamilies({desc}, &handles); 44 | if (s.ok()) { 45 | *handle = handles[0]; 46 | } 47 | return s; 48 | } 49 | 50 | using StackableDB::CreateColumnFamilies; 51 | Status CreateColumnFamilies( 52 | const ColumnFamilyOptions& options, const std::vector& names, 53 | std::vector* handles) override { 54 | std::vector descs; 55 | for (auto& name : names) { 56 | descs.emplace_back(name, TitanCFOptions(options)); 57 | } 58 | return CreateColumnFamilies(descs, handles); 59 | } 60 | Status CreateColumnFamilies( 61 | const std::vector& base_descs, 62 | std::vector* handles) override { 63 | std::vector descs; 64 | for (auto& desc : base_descs) { 65 | descs.emplace_back(desc.name, TitanCFOptions(desc.options)); 66 | } 67 | return CreateColumnFamilies(descs, handles); 68 | } 69 | virtual Status CreateColumnFamilies( 70 | const std::vector& descs, 71 | std::vector* handles) = 0; 72 | 73 | Status DropColumnFamily(ColumnFamilyHandle* handle) override { 74 | return DropColumnFamilies({handle}); 75 | } 76 | 77 | Status DropColumnFamilies( 78 | const std::vector& handles) override = 0; 79 | 80 | Status DestroyColumnFamilyHandle(ColumnFamilyHandle* column_family) override = 81 | 0; 82 | 83 | using StackableDB::NewIterator; 84 | Iterator* NewIterator(const ReadOptions& opts, 85 | ColumnFamilyHandle* column_family) override { 86 | return 
NewIterator(TitanReadOptions(opts), column_family); 87 | } 88 | Iterator* NewIterator(const ReadOptions& opts) override { 89 | return NewIterator(TitanReadOptions(opts), DefaultColumnFamily()); 90 | } 91 | virtual Iterator* NewIterator(const TitanReadOptions& opts) { 92 | return NewIterator(opts, DefaultColumnFamily()); 93 | } 94 | virtual Iterator* NewIterator(const TitanReadOptions& opts, 95 | ColumnFamilyHandle* column_family) = 0; 96 | 97 | using StackableDB::NewIterators; 98 | Status NewIterators(const ReadOptions& options, 99 | const std::vector& column_families, 100 | std::vector* iterators) override { 101 | return NewIterators(TitanReadOptions(options), column_families, iterators); 102 | } 103 | virtual Status NewIterators( 104 | const TitanReadOptions& options, 105 | const std::vector& column_families, 106 | std::vector* iterators) = 0; 107 | 108 | using StackableDB::Merge; 109 | Status Merge(const WriteOptions&, ColumnFamilyHandle*, const Slice& /*key*/, 110 | const Slice& /*value*/) override { 111 | return Status::NotSupported("TitanDB doesn't support this operation"); 112 | } 113 | 114 | using StackableDB::DisableFileDeletions; 115 | Status DisableFileDeletions() override { 116 | return Status::NotSupported("TitanDB doesn't support this operation"); 117 | } 118 | 119 | using StackableDB::EnableFileDeletions; 120 | Status EnableFileDeletions(bool /*force*/) override { 121 | return Status::NotSupported("TitanDB doesn't support this operation"); 122 | } 123 | 124 | // Get all files in /titandb directory after disable file deletions 125 | // edits include all blob file records of every column family 126 | virtual Status GetAllTitanFiles(std::vector& /*files*/, 127 | std::vector* /*edits*/) { 128 | return Status::NotSupported("TitanDB doesn't support this operation"); 129 | } 130 | 131 | using rocksdb::StackableDB::SingleDelete; 132 | Status SingleDelete(const WriteOptions& /*wopts*/, 133 | ColumnFamilyHandle* /*column_family*/, 134 | const Slice& /*key*/) 
override { 135 | return Status::NotSupported("Not supported operation in titan db."); 136 | } 137 | 138 | using rocksdb::StackableDB::CompactFiles; 139 | Status CompactFiles( 140 | const CompactionOptions& compact_options, 141 | ColumnFamilyHandle* column_family, 142 | const std::vector& input_file_names, const int output_level, 143 | const int output_path_id = -1, 144 | std::vector* const output_file_names = nullptr, 145 | CompactionJobInfo* compaction_job_info = nullptr) override = 0; 146 | 147 | virtual Status DeleteFilesInRanges(ColumnFamilyHandle* column_family, 148 | const RangePtr* ranges, size_t n, 149 | bool include_end = true) = 0; 150 | 151 | virtual Status DeleteBlobFilesInRanges(ColumnFamilyHandle* column_family, 152 | const RangePtr* ranges, size_t n, 153 | bool include_end = true) = 0; 154 | 155 | using rocksdb::StackableDB::GetOptions; 156 | Options GetOptions(ColumnFamilyHandle* column_family) const override = 0; 157 | 158 | virtual TitanOptions GetTitanOptions( 159 | ColumnFamilyHandle* column_family) const = 0; 160 | virtual TitanOptions GetTitanOptions() const { 161 | return GetTitanOptions(DefaultColumnFamily()); 162 | } 163 | 164 | using rocksdb::StackableDB::SetOptions; 165 | Status SetOptions(ColumnFamilyHandle* column_family, 166 | const std::unordered_map& 167 | new_options) override = 0; 168 | 169 | virtual TitanDBOptions GetTitanDBOptions() const = 0; 170 | 171 | struct Properties { 172 | // "rocksdb.titandb.num-blob-files-at-level" - returns string containing 173 | // the number of blob files at level , where is an ASCII 174 | // representation of a level number (e.g., "0")." 175 | static const std::string kNumBlobFilesAtLevelPrefix; 176 | // "rocksdb.titandb.live-blob-size" - returns total blob value size 177 | // referenced by LSM tree. 178 | static const std::string kLiveBlobSize; 179 | // "rocksdb.titandb.num-live-blob-file" - returns total blob file count. 
180 | static const std::string kNumLiveBlobFile; 181 | // "rocksdb.titandb.num-obsolete-blob-file" - return obsolete blob file. 182 | static const std::string kNumObsoleteBlobFile; 183 | // "rocksdb.titandb.live-blob-file-size" - returns total size of live blob 184 | // files. 185 | static const std::string kLiveBlobFileSize; 186 | // "rocksdb.titandb.obsolete-blob-file-size" - returns size of obsolete 187 | // blob files. 188 | static const std::string kObsoleteBlobFileSize; 189 | // "rocksdb.titandb.discardable_ratio_le0_file_num" - returns count of 190 | // file whose discardable ratio is less or equal to 0%. 191 | static const std::string kNumDiscardableRatioLE0File; 192 | // "rocksdb.titandb.discardable_ratio_le20_file_num" - returns count of 193 | // file whose discardable ratio is less or equal to 20%. 194 | static const std::string kNumDiscardableRatioLE20File; 195 | // "rocksdb.titandb.discardable_ratio_le50_file_num" - returns count of 196 | // file whose discardable ratio is less or equal to 50%. 197 | static const std::string kNumDiscardableRatioLE50File; 198 | // "rocksdb.titandb.discardable_ratio_le80_file_num" - returns count of 199 | // file whose discardable ratio is less or equal to 80%. 200 | static const std::string kNumDiscardableRatioLE80File; 201 | // "rocksdb.titandb.discardable_ratio_le100_file_num" - returns count of 202 | // file whose discardable ratio is less or equal to 100%. 
203 | static const std::string kNumDiscardableRatioLE100File; 204 | }; 205 | 206 | bool GetProperty(ColumnFamilyHandle* column_family, const Slice& property, 207 | std::string* value) override = 0; 208 | bool GetProperty(const Slice& property, std::string* value) override { 209 | return GetProperty(DefaultColumnFamily(), property, value); 210 | } 211 | 212 | bool GetIntProperty(ColumnFamilyHandle* column_family, const Slice& property, 213 | uint64_t* value) override = 0; 214 | bool GetIntProperty(const Slice& property, uint64_t* value) override { 215 | return GetIntProperty(DefaultColumnFamily(), property, value); 216 | } 217 | }; 218 | 219 | } // namespace titandb 220 | } // namespace rocksdb 221 | -------------------------------------------------------------------------------- /include/titan/statistics.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "rocksdb/statistics.h" 4 | 5 | namespace rocksdb { 6 | namespace titandb { 7 | 8 | std::shared_ptr CreateDBStatistics(); 9 | 10 | enum TickerType : uint32_t { 11 | TITAN_NUM_GET = TICKER_ENUM_MAX, 12 | TITAN_NUM_SEEK, 13 | TITAN_NUM_NEXT, 14 | TITAN_NUM_PREV, 15 | 16 | TITAN_BLOB_FILE_NUM_KEYS_WRITTEN, 17 | TITAN_BLOB_FILE_NUM_KEYS_READ, 18 | TITAN_BLOB_FILE_BYTES_WRITTEN, 19 | TITAN_BLOB_FILE_BYTES_READ, 20 | TITAN_BLOB_FILE_SYNCED, 21 | 22 | TITAN_GC_NUM_FILES, 23 | TITAN_GC_NUM_NEW_FILES, 24 | 25 | // the number of keys overwritten by foreground within the span of GC 26 | TITAN_GC_NUM_KEYS_OVERWRITTEN, 27 | // the number of keys relocated to new blob file by GC 28 | TITAN_GC_NUM_KEYS_RELOCATED, 29 | // the number of keys rewritten to LSM tree due to fallback 30 | TITAN_GC_NUM_KEYS_FALLBACK, 31 | TITAN_GC_BYTES_OVERWRITTEN, 32 | TITAN_GC_BYTES_RELOCATED, 33 | TITAN_GC_BYTES_FALLBACK, 34 | 35 | TITAN_GC_BYTES_WRITTEN, 36 | TITAN_GC_BYTES_READ, 37 | 38 | TITAN_BLOB_CACHE_HIT, 39 | TITAN_BLOB_CACHE_MISS, 40 | 41 | // the count of blob file gced due 
to discardable ratio hit the threshold 42 | TITAN_GC_DISCARDABLE, 43 | // the count of blob file gced due to small file size 44 | TITAN_GC_SMALL_FILE, 45 | // the count of blob file marked to be merged by level merge 46 | TITAN_GC_LEVEL_MERGE_MARK, 47 | // the count of blob file deleted directly by level merge 48 | TITAN_GC_LEVEL_MERGE_DELETE, 49 | 50 | TITAN_GC_NO_NEED, 51 | // the times of still has blob files remained to be gced after one round of gc 52 | TITAN_GC_REMAIN, 53 | TITAN_GC_FAILURE, 54 | TITAN_GC_SUCCESS, 55 | // the times of triggering next round of GC actively 56 | TITAN_GC_TRIGGER_NEXT, 57 | 58 | TITAN_TICKER_ENUM_MAX, 59 | }; 60 | 61 | // The order of items listed in Tickers should be the same as 62 | // the order listed in TickersNameMap 63 | const std::vector> TitanTickersNameMap = { 64 | {TITAN_NUM_GET, "titandb.num.get"}, 65 | {TITAN_NUM_SEEK, "titandb.num.seek"}, 66 | {TITAN_NUM_NEXT, "titandb.num.next"}, 67 | {TITAN_NUM_PREV, "titandb.num.prev"}, 68 | {TITAN_BLOB_FILE_NUM_KEYS_WRITTEN, "titandb.blob.file.num.keys.written"}, 69 | {TITAN_BLOB_FILE_NUM_KEYS_READ, "titandb.blob.file.num.keys.read"}, 70 | {TITAN_BLOB_FILE_BYTES_WRITTEN, "titandb.blob.file.bytes.written"}, 71 | {TITAN_BLOB_FILE_BYTES_READ, "titandb.blob.file.bytes.read"}, 72 | {TITAN_BLOB_FILE_SYNCED, "titandb.blob.file.synced"}, 73 | {TITAN_GC_NUM_FILES, "titandb.gc.num.files"}, 74 | {TITAN_GC_NUM_NEW_FILES, "titandb.gc.num.new.files"}, 75 | {TITAN_GC_NUM_KEYS_OVERWRITTEN, "titandb.gc.num.keys.overwritten"}, 76 | {TITAN_GC_NUM_KEYS_RELOCATED, "titandb.gc.num.keys.relocated"}, 77 | {TITAN_GC_NUM_KEYS_FALLBACK, "titandb.gc.num.keys.fallback"}, 78 | {TITAN_GC_BYTES_OVERWRITTEN, "titandb.gc.bytes.overwritten"}, 79 | {TITAN_GC_BYTES_RELOCATED, "titandb.gc.bytes.relocated"}, 80 | {TITAN_GC_BYTES_FALLBACK, "titandb.gc.bytes.fallback"}, 81 | {TITAN_GC_BYTES_WRITTEN, "titandb.gc.bytes.written"}, 82 | {TITAN_GC_BYTES_READ, "titandb.gc.bytes.read"}, 83 | {TITAN_BLOB_CACHE_HIT, 
"titandb.blob.cache.hit"}, 84 | {TITAN_BLOB_CACHE_MISS, "titandb.blob.cache.miss"}, 85 | {TITAN_GC_DISCARDABLE, "titandb.gc.discardable"}, 86 | {TITAN_GC_SMALL_FILE, "titandb.gc.small.file"}, 87 | {TITAN_GC_NO_NEED, "titandb.gc.no.need"}, 88 | {TITAN_GC_REMAIN, "titandb.gc.remain"}, 89 | {TITAN_GC_FAILURE, "titandb.gc.failure"}, 90 | {TITAN_GC_SUCCESS, "titandb.gc.success"}, 91 | {TITAN_GC_TRIGGER_NEXT, "titandb.gc.trigger.next"}, 92 | }; 93 | 94 | enum HistogramType : uint32_t { 95 | TITAN_KEY_SIZE = HISTOGRAM_ENUM_MAX, 96 | TITAN_VALUE_SIZE, 97 | 98 | TITAN_GET_MICROS, 99 | TITAN_SEEK_MICROS, 100 | TITAN_NEXT_MICROS, 101 | TITAN_PREV_MICROS, 102 | 103 | TITAN_BLOB_FILE_WRITE_MICROS, 104 | TITAN_BLOB_FILE_READ_MICROS, 105 | TITAN_BLOB_FILE_SYNC_MICROS, 106 | TITAN_MANIFEST_FILE_SYNC_MICROS, 107 | 108 | TITAN_GC_MICROS, 109 | TITAN_GC_INPUT_FILE_SIZE, 110 | TITAN_GC_OUTPUT_FILE_SIZE, 111 | 112 | TITAN_ITER_TOUCH_BLOB_FILE_COUNT, 113 | 114 | TITAN_HISTOGRAM_ENUM_MAX, 115 | }; 116 | 117 | const std::vector> 118 | TitanHistogramsNameMap = { 119 | {TITAN_KEY_SIZE, "titandb.key.size"}, 120 | {TITAN_VALUE_SIZE, "titandb.value.size"}, 121 | {TITAN_GET_MICROS, "titandb.get.micros"}, 122 | {TITAN_SEEK_MICROS, "titandb.seek.micros"}, 123 | {TITAN_NEXT_MICROS, "titandb.next.micros"}, 124 | {TITAN_PREV_MICROS, "titandb.prev.micros"}, 125 | {TITAN_BLOB_FILE_WRITE_MICROS, "titandb.blob.file.write.micros"}, 126 | {TITAN_BLOB_FILE_READ_MICROS, "titandb.blob.file.read.micros"}, 127 | {TITAN_BLOB_FILE_SYNC_MICROS, "titandb.blob.file.sync.micros"}, 128 | {TITAN_MANIFEST_FILE_SYNC_MICROS, "titandb.manifest.file.sync.micros"}, 129 | 130 | {TITAN_GC_MICROS, "titandb.gc.micros"}, 131 | {TITAN_GC_INPUT_FILE_SIZE, "titandb.gc.input.file.size"}, 132 | {TITAN_GC_OUTPUT_FILE_SIZE, "titandb.gc.output.file.size"}, 133 | {TITAN_ITER_TOUCH_BLOB_FILE_COUNT, 134 | "titandb.iter.touch.blob.file.count"}, 135 | }; 136 | 137 | } // namespace titandb 138 | } // namespace rocksdb 139 | 
-------------------------------------------------------------------------------- /scripts/format-diff.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | git diff `git merge-base master HEAD` | clang-format-diff -style=google -p1 -i 4 | -------------------------------------------------------------------------------- /scripts/travis-format.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -ev 4 | git fetch --depth=1 origin master:master; 5 | git diff $(git merge-base master HEAD) HEAD > diff; 6 | cat diff | clang-format-diff-10 -style=google -p1 > formatted; 7 | if [ -s formatted ]; then 8 | cat formatted; 9 | echo "Run scripts/format-diff.sh to format your code."; 10 | exit 1; 11 | fi; 12 | -------------------------------------------------------------------------------- /scripts/travis-make.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -ev 4 | cmake . 
-L -DTRAVIS=ON ${COMPRESSION_OPT} ${BUILD_OPT} ${SANITIZER_OPT} ${TOOLS_OPT} ${COVERAGE_OPT} 5 | make -j4 6 | ctest -R titan 7 | 8 | -------------------------------------------------------------------------------- /src/base_db_listener.cc: -------------------------------------------------------------------------------- 1 | #include "base_db_listener.h" 2 | 3 | #include "db_impl.h" 4 | 5 | namespace rocksdb { 6 | namespace titandb { 7 | 8 | BaseDbListener::BaseDbListener(TitanDBImpl* db) : db_impl_(db) { 9 | assert(db_impl_ != nullptr); 10 | } 11 | 12 | BaseDbListener::~BaseDbListener() {} 13 | 14 | void BaseDbListener::OnFlushCompleted(DB* /*db*/, 15 | const FlushJobInfo& flush_job_info) { 16 | if (db_impl_->blob_file_set_->IsOpened()) { 17 | db_impl_->OnFlushCompleted(flush_job_info); 18 | } 19 | } 20 | 21 | void BaseDbListener::OnCompactionCompleted( 22 | DB* /* db */, const CompactionJobInfo& compaction_job_info) { 23 | if (db_impl_->blob_file_set_->IsOpened()) { 24 | db_impl_->OnCompactionCompleted(compaction_job_info); 25 | } 26 | } 27 | 28 | } // namespace titandb 29 | } // namespace rocksdb 30 | -------------------------------------------------------------------------------- /src/base_db_listener.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "rocksdb/listener.h" 4 | 5 | namespace rocksdb { 6 | namespace titandb { 7 | 8 | class TitanDBImpl; 9 | 10 | class BaseDbListener final : public EventListener { 11 | public: 12 | BaseDbListener(TitanDBImpl* db); 13 | ~BaseDbListener(); 14 | 15 | void OnFlushCompleted(DB* db, const FlushJobInfo& flush_job_info) override; 16 | 17 | void OnCompactionCompleted( 18 | DB* db, const CompactionJobInfo& compaction_job_info) override; 19 | 20 | private: 21 | rocksdb::titandb::TitanDBImpl* db_impl_; 22 | }; 23 | 24 | } // namespace titandb 25 | } // namespace rocksdb 26 | -------------------------------------------------------------------------------- 
/src/blob_file_builder.cc: -------------------------------------------------------------------------------- 1 | #include "blob_file_builder.h" 2 | 3 | #include "table/block_based/block_based_table_reader.h" 4 | #include "table/meta_blocks.h" 5 | #include "util/crc32c.h" 6 | 7 | namespace rocksdb { 8 | namespace titandb { 9 | 10 | BlobFileBuilder::BlobFileBuilder(const TitanDBOptions& db_options, 11 | const TitanCFOptions& cf_options, 12 | WritableFileWriter* file, 13 | uint32_t blob_file_version) 14 | : builder_state_(cf_options.blob_file_compression_options.max_dict_bytes > 0 15 | ? BuilderState::kBuffered 16 | : BuilderState::kUnbuffered), 17 | cf_options_(cf_options), 18 | file_(file), 19 | blob_file_version_(blob_file_version), 20 | encoder_(cf_options.blob_file_compression, 21 | cf_options.blob_file_compression_options) { 22 | status_ = BlobFileHeader::ValidateVersion(blob_file_version_); 23 | if (!status_.ok()) { 24 | return; 25 | } 26 | if (cf_options_.blob_file_compression_options.max_dict_bytes > 0) { 27 | if (blob_file_version_ != BlobFileHeader::kVersion2) { 28 | status_ = Status::NotSupported( 29 | "dictionary comparession is not supported by blob file version 1"); 30 | } 31 | #if ZSTD_VERSION_NUMBER < 10103 32 | status_ = Status::NotSupported("ZSTD version too old."); 33 | return; 34 | #endif 35 | } 36 | block_size_ = cf_options.enable_punch_hole_gc ? 
cf_options.block_size : 0; 37 | WriteHeader(); 38 | } 39 | 40 | void BlobFileBuilder::FillBlockWithPad() { 41 | if (block_size_ == 0 || file_->GetFileSize() % block_size_ == 0) { 42 | return; 43 | } 44 | static const std::string pad = std::string(4096, 0); 45 | 46 | size_t pad_size = block_size_ - (file_->GetFileSize() % block_size_); 47 | while (pad_size > pad.size()) { 48 | status_ = file_->Append(pad); 49 | if (!ok()) { 50 | return; 51 | } 52 | pad_size -= pad.size(); 53 | } 54 | status_ = file_->Append(Slice(pad.data(), pad_size)); 55 | } 56 | 57 | void BlobFileBuilder::WriteHeader() { 58 | BlobFileHeader header; 59 | header.version = blob_file_version_; 60 | if (cf_options_.blob_file_compression_options.max_dict_bytes > 0) { 61 | assert(blob_file_version_ >= BlobFileHeader::kVersion2); 62 | header.flags |= BlobFileHeader::kHasUncompressionDictionary; 63 | } 64 | header.block_size = block_size_; 65 | std::string buffer; 66 | header.EncodeTo(&buffer); 67 | status_ = file_->Append(buffer); 68 | if (block_size_ > 0) { 69 | FillBlockWithPad(); 70 | } 71 | } 72 | 73 | void BlobFileBuilder::Add(const BlobRecord& record, 74 | std::unique_ptr ctx, 75 | OutContexts* out_ctx) { 76 | if (!ok()) return; 77 | std::string key = record.key.ToString(); 78 | if (builder_state_ == BuilderState::kBuffered) { 79 | std::string record_str; 80 | // Encode to take ownership of underlying string. 
81 | record.EncodeTo(&record_str); 82 | sample_records_.emplace_back(record_str); 83 | sample_str_len_ += record_str.size(); 84 | cached_contexts_.emplace_back(std::move(ctx)); 85 | if (cf_options_.blob_file_compression_options.zstd_max_train_bytes > 0 && 86 | sample_str_len_ >= 87 | cf_options_.blob_file_compression_options.zstd_max_train_bytes) { 88 | EnterUnbuffered(out_ctx); 89 | } 90 | } else { 91 | encoder_.EncodeRecord(record); 92 | WriteEncoderData(&ctx->new_blob_index.blob_handle); 93 | out_ctx->emplace_back(std::move(ctx)); 94 | } 95 | 96 | // The keys added into blob files are in order. 97 | // We do key range checks for both state 98 | if (smallest_key_.empty()) { 99 | smallest_key_.assign(record.key.data(), record.key.size()); 100 | } 101 | assert(cf_options_.comparator->Compare(record.key, Slice(smallest_key_)) >= 102 | 0); 103 | assert(cf_options_.comparator->Compare(record.key, Slice(largest_key_)) >= 0); 104 | largest_key_.assign(record.key.data(), record.key.size()); 105 | } 106 | 107 | void BlobFileBuilder::AddSmall(std::unique_ptr ctx) { 108 | cached_contexts_.emplace_back(std::move(ctx)); 109 | } 110 | 111 | void BlobFileBuilder::EnterUnbuffered(OutContexts* out_ctx) { 112 | // Using collected samples to train the compression dictionary 113 | // Then replay those records in memory, encode them to blob file 114 | // When above things are done, transform builder state into unbuffered 115 | std::string samples; 116 | samples.reserve(sample_str_len_); 117 | std::vector sample_lens; 118 | 119 | for (const auto& record_str : sample_records_) { 120 | samples.append(record_str, 0, record_str.size()); 121 | sample_lens.emplace_back(record_str.size()); 122 | } 123 | std::string dict; 124 | dict = ZSTD_TrainDictionary( 125 | samples, sample_lens, 126 | cf_options_.blob_file_compression_options.max_dict_bytes); 127 | 128 | compression_dict_.reset( 129 | new CompressionDict(dict, cf_options_.blob_file_compression, 130 | 
cf_options_.blob_file_compression_options.level)); 131 | encoder_.SetCompressionDict(compression_dict_.get()); 132 | 133 | FlushSampleRecords(out_ctx); 134 | 135 | builder_state_ = BuilderState::kUnbuffered; 136 | } 137 | 138 | void BlobFileBuilder::FlushSampleRecords(OutContexts* out_ctx) { 139 | assert(cached_contexts_.size() >= sample_records_.size()); 140 | size_t sample_idx = 0, ctx_idx = 0; 141 | for (; sample_idx < sample_records_.size(); sample_idx++, ctx_idx++) { 142 | const std::string& record_str = sample_records_[sample_idx]; 143 | for (; ctx_idx < cached_contexts_.size() && 144 | cached_contexts_[ctx_idx]->has_value; 145 | ctx_idx++) { 146 | out_ctx->emplace_back(std::move(cached_contexts_[ctx_idx])); 147 | } 148 | const std::unique_ptr& ctx = cached_contexts_[ctx_idx]; 149 | encoder_.EncodeSlice(record_str); 150 | WriteEncoderData(&ctx->new_blob_index.blob_handle); 151 | out_ctx->emplace_back(std::move(cached_contexts_[ctx_idx])); 152 | } 153 | for (; ctx_idx < cached_contexts_.size(); ctx_idx++) { 154 | assert(cached_contexts_[ctx_idx]->has_value); 155 | out_ctx->emplace_back(std::move(cached_contexts_[ctx_idx])); 156 | } 157 | assert(sample_idx == sample_records_.size()); 158 | assert(ctx_idx == cached_contexts_.size()); 159 | sample_records_.clear(); 160 | sample_str_len_ = 0; 161 | cached_contexts_.clear(); 162 | } 163 | 164 | void BlobFileBuilder::WriteEncoderData(BlobHandle* handle) { 165 | handle->offset = file_->GetFileSize(); 166 | handle->size = encoder_.GetEncodedSize(); 167 | live_data_size_ += handle->size; 168 | 169 | status_ = file_->Append(encoder_.GetHeader()); 170 | if (ok()) { 171 | status_ = file_->Append(encoder_.GetRecord()); 172 | num_entries_++; 173 | if (ok()) { 174 | FillBlockWithPad(); 175 | } 176 | } 177 | } 178 | 179 | void BlobFileBuilder::WriteRawBlock(const Slice& block, BlockHandle* handle) { 180 | handle->set_offset(file_->GetFileSize()); 181 | handle->set_size(block.size()); 182 | status_ = file_->Append(block); 183 
| if (ok()) { 184 | // follow rocksdb's block based table format 185 | char trailer[BlockBasedTable::kBlockTrailerSize]; 186 | // only compression dictionary and meta index block are written 187 | // by this method, we use `kNoCompression` as placeholder 188 | trailer[0] = kNoCompression; 189 | char* trailer_without_type = trailer + 1; 190 | 191 | // crc32 checksum 192 | auto crc = crc32c::Value(block.data(), block.size()); 193 | crc = crc32c::Extend(crc, trailer, 1); // Extend to cover compression type 194 | EncodeFixed32(trailer_without_type, crc32c::Mask(crc)); 195 | status_ = file_->Append(Slice(trailer, BlockBasedTable::kBlockTrailerSize)); 196 | } 197 | } 198 | 199 | void BlobFileBuilder::WriteCompressionDictBlock( 200 | MetaIndexBuilder* meta_index_builder) { 201 | BlockHandle handle; 202 | WriteRawBlock(compression_dict_->GetRawDict(), &handle); 203 | if (ok()) { 204 | meta_index_builder->Add(kCompressionDictBlockName, handle); 205 | } 206 | } 207 | 208 | Status BlobFileBuilder::Finish(OutContexts* out_ctx) { 209 | if (!ok()) return status(); 210 | 211 | if (builder_state_ == BuilderState::kBuffered) { 212 | EnterUnbuffered(out_ctx); 213 | } 214 | 215 | BlobFileFooter footer; 216 | // if has compression dictionary, encode it into meta blocks 217 | if (cf_options_.blob_file_compression_options.max_dict_bytes > 0) { 218 | assert(blob_file_version_ >= BlobFileHeader::kVersion2); 219 | BlockHandle meta_index_handle; 220 | MetaIndexBuilder meta_index_builder; 221 | WriteCompressionDictBlock(&meta_index_builder); 222 | WriteRawBlock(meta_index_builder.Finish(), &meta_index_handle); 223 | footer.meta_index_handle = meta_index_handle; 224 | } 225 | 226 | std::string buffer; 227 | footer.EncodeTo(&buffer); 228 | 229 | status_ = file_->Append(buffer); 230 | if (ok()) { 231 | // The Sync will be done in `BatchFinishFiles` 232 | status_ = file_->Flush(); 233 | } 234 | return status(); 235 | } 236 | 237 | void BlobFileBuilder::Abandon() {} 238 | 239 | uint64_t 
BlobFileBuilder::NumEntries() { return num_entries_; } 240 | 241 | } // namespace titandb 242 | } // namespace rocksdb 243 | -------------------------------------------------------------------------------- /src/blob_file_builder.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "file/writable_file_writer.h" 4 | #include "table/meta_blocks.h" 5 | #include "util/autovector.h" 6 | #include "util/compression.h" 7 | 8 | #include "blob_format.h" 9 | #include "titan/options.h" 10 | 11 | namespace rocksdb { 12 | namespace titandb { 13 | 14 | // Blob file format: 15 | // 16 | // 17 | // [blob record 1] 18 | // [blob record 2] 19 | // ... 20 | // [blob record N] 21 | // [meta block 1] 22 | // [meta block 2] 23 | // ... 24 | // [meta block K] 25 | // [meta index block] 26 | // [footer] 27 | // 28 | // 29 | // 1. The sequence of blob records in the file are stored in sorted 30 | // order. These records come one after another at the beginning of the 31 | // file, and are compressed according to the compression options. 32 | // 33 | // 2. After the blob records we store a bunch of meta blocks, and a 34 | // meta index block with block handles pointed to the meta blocks. The 35 | // meta block and the meta index block are formatted the same as the 36 | // BlockBasedTable. 37 | class BlobFileBuilder { 38 | public: 39 | // States of the builder. 40 | // 41 | // - `kBuffered`: This is the initial state where zero or more data blocks are 42 | // accumulated uncompressed in-memory. From this state, call 43 | // `EnterUnbuffered()` to finalize the compression dictionary if enabled, 44 | // compress/write out any buffered blocks, and proceed to the `kUnbuffered` 45 | // state. 46 | // 47 | // - `kUnbuffered`: This is the state when compression dictionary is finalized 48 | // either because it wasn't enabled in the first place or it's been created 49 | // from sampling previously buffered data. 
In this state, blocks are simply 50 | // compressed/written out as they fill up. From this state, call `Finish()` 51 | // to complete the file (write meta-blocks, etc.), or `Abandon()` to delete 52 | // the partially created file. 53 | enum class BuilderState { 54 | kBuffered, 55 | kUnbuffered, 56 | }; 57 | 58 | struct BlobRecordContext { 59 | std::string key; // original internal key 60 | BlobIndex original_blob_index; 61 | BlobIndex new_blob_index; 62 | bool has_value = false; 63 | std::string value; 64 | }; 65 | typedef autovector> OutContexts; 66 | 67 | // Constructs a builder that will store the contents of the file it 68 | // is building in "*file". Does not close the file. It is up to the 69 | // caller to sync and close the file after calling Finish(). 70 | BlobFileBuilder(const TitanDBOptions& db_options, 71 | const TitanCFOptions& cf_options, WritableFileWriter* file, 72 | uint32_t blob_file_version = BlobFileHeader::kVersion3); 73 | 74 | // Tries to add the record to the file 75 | // Notice: 76 | // 1. The `out_ctx` might be empty when builder is in `kBuffered` state. 77 | // 2. Caller should set `ctx.new_blob_index.file_number` before pass it in, 78 | // the file builder will only change the `blob_handle` of it 79 | void Add(const BlobRecord& record, std::unique_ptr ctx, 80 | OutContexts* out_ctx); 81 | 82 | // AddSmall is used to prevent the disorder issue, small KV pairs and blob 83 | // index block may be passed in here 84 | void AddSmall(std::unique_ptr ctx); 85 | 86 | // Returns builder state 87 | BuilderState GetBuilderState() { return builder_state_; } 88 | 89 | // Returns non-ok iff some error has been detected. 90 | Status status() const { return status_; } 91 | 92 | // Finishes building the table, and return status. 93 | // This method will return modify output contexts when it is called in 94 | // `kBuffered` state. 95 | // REQUIRES: Finish(), Abandon() have not been called. 
96 | Status Finish(OutContexts* out_ctx); 97 | 98 | // Abandons building the table. If the caller is not going to call 99 | // Finish(), it must call Abandon() before destroying this builder. 100 | // REQUIRES: Finish(), Abandon() have not been called. 101 | void Abandon(); 102 | 103 | // Number of calls to Add() so far. 104 | uint64_t NumEntries(); 105 | // Number of sample records 106 | uint64_t NumSampleEntries() { return sample_records_.size(); } 107 | 108 | const std::string& GetSmallestKey() { return smallest_key_; } 109 | const std::string& GetLargestKey() { return largest_key_; } 110 | 111 | uint64_t live_data_size() const { return live_data_size_; } 112 | 113 | private: 114 | BuilderState builder_state_; 115 | 116 | bool ok() const { return status().ok(); } 117 | // Enter unbuffered state, only be called after collecting enough samples 118 | // for compression dictionary. It will modify `out_ctx` of the buffered 119 | // records 120 | void EnterUnbuffered(OutContexts* out_ctx); 121 | void WriteHeader(); 122 | void WriteRawBlock(const Slice& block, BlockHandle* handle); 123 | void WriteCompressionDictBlock(MetaIndexBuilder* meta_index_builder); 124 | void FlushSampleRecords(OutContexts* out_ctx); 125 | void WriteEncoderData(BlobHandle* handle); 126 | void FillBlockWithPad(); 127 | 128 | TitanCFOptions cf_options_; 129 | WritableFileWriter* file_; 130 | const uint32_t blob_file_version_; 131 | 132 | Status status_; 133 | BlobEncoder encoder_; 134 | 135 | // following 3 may be refactored in to Rep 136 | std::vector sample_records_; 137 | uint64_t sample_str_len_ = 0; 138 | std::unique_ptr compression_dict_; 139 | 140 | uint64_t block_size_ = 0; 141 | 142 | OutContexts cached_contexts_; 143 | 144 | uint64_t num_entries_ = 0; 145 | std::string smallest_key_; 146 | std::string largest_key_; 147 | uint64_t live_data_size_ = 0; 148 | }; 149 | 150 | } // namespace titandb 151 | } // namespace rocksdb 152 | 
-------------------------------------------------------------------------------- /src/blob_file_cache.cc: -------------------------------------------------------------------------------- 1 | #include "blob_file_cache.h" 2 | 3 | #include "file/filename.h" 4 | #include "rocksdb/advanced_cache.h" 5 | 6 | #include "util.h" 7 | 8 | namespace rocksdb { 9 | namespace titandb { 10 | 11 | namespace { 12 | 13 | const Cache::CacheItemHelper kBlobFileReaderCacheItemHelper( 14 | CacheEntryRole::kBlockBasedTableReader, &DeleteCacheValue); 15 | 16 | Slice EncodeFileNumber(const uint64_t* number) { 17 | return Slice(reinterpret_cast(number), sizeof(*number)); 18 | } 19 | 20 | } // namespace 21 | 22 | BlobFileCache::BlobFileCache(const TitanDBOptions& db_options, 23 | const TitanCFOptions& cf_options, 24 | std::shared_ptr cache, TitanStats* stats) 25 | : env_(db_options.env), 26 | env_options_(db_options), 27 | db_options_(db_options), 28 | cf_options_(cf_options), 29 | cache_(cache), 30 | stats_(stats) {} 31 | 32 | Status BlobFileCache::Get(const ReadOptions& options, uint64_t file_number, 33 | const BlobHandle& handle, BlobRecord* record, 34 | OwnedSlice* buffer) { 35 | Cache::Handle* cache_handle = nullptr; 36 | Status s = GetBlobFileReaderHandle(file_number, &cache_handle); 37 | if (!s.ok()) return s; 38 | 39 | auto reader = reinterpret_cast(cache_->Value(cache_handle)); 40 | s = reader->Get(options, handle, record, buffer); 41 | cache_->Release(cache_handle); 42 | return s; 43 | } 44 | 45 | Status BlobFileCache::NewPrefetcher( 46 | uint64_t file_number, std::unique_ptr* result) { 47 | Cache::Handle* cache_handle = nullptr; 48 | Status s = GetBlobFileReaderHandle(file_number, &cache_handle); 49 | if (!s.ok()) return s; 50 | 51 | auto reader = reinterpret_cast(cache_->Value(cache_handle)); 52 | auto prefetcher = new BlobFilePrefetcher(reader); 53 | prefetcher->RegisterCleanup(&UnrefCacheHandle, cache_.get(), cache_handle); 54 | result->reset(prefetcher); 55 | return s; 56 | } 57 
| 58 | void BlobFileCache::Evict(uint64_t file_number) { 59 | cache_->Erase(EncodeFileNumber(&file_number)); 60 | } 61 | 62 | Status BlobFileCache::GetBlobFileReaderHandle(uint64_t file_number, 63 | Cache::Handle** handle) { 64 | Status s; 65 | Slice cache_key = EncodeFileNumber(&file_number); 66 | *handle = cache_->Lookup(cache_key); 67 | if (*handle) { 68 | // TODO: add file reader cache hit/miss metrics 69 | return s; 70 | } 71 | std::unique_ptr file; 72 | uint64_t file_size; 73 | { 74 | std::unique_ptr f; 75 | auto file_name = BlobFileName(db_options_.dirname, file_number); 76 | auto fs = env_->GetFileSystem(); 77 | 78 | s = fs->GetFileSize(file_name, IOOptions(), &file_size, nullptr); 79 | if (!s.ok()) return s; 80 | s = fs->NewRandomAccessFile(file_name, FileOptions(env_options_), &f, 81 | nullptr /*dbg*/); 82 | if (!s.ok()) return s; 83 | if (db_options_.advise_random_on_open) { 84 | f->Hint(FSRandomAccessFile::kRandom); 85 | } 86 | file.reset(new RandomAccessFileReader(std::move(f), file_name)); 87 | } 88 | 89 | std::unique_ptr reader; 90 | s = BlobFileReader::Open(cf_options_, std::move(file), file_size, &reader, 91 | stats_); 92 | if (!s.ok()) return s; 93 | 94 | cache_->Insert(cache_key, reader.release(), &kBlobFileReaderCacheItemHelper, 95 | 1, handle); 96 | return s; 97 | } 98 | 99 | } // namespace titandb 100 | } // namespace rocksdb 101 | -------------------------------------------------------------------------------- /src/blob_file_cache.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "rocksdb/options.h" 4 | 5 | #include "blob_file_reader.h" 6 | #include "blob_format.h" 7 | #include "titan/options.h" 8 | #include "titan_stats.h" 9 | 10 | namespace rocksdb { 11 | namespace titandb { 12 | 13 | class BlobFileCache { 14 | public: 15 | // Constructs a blob file cache to cache opened files. 
16 | BlobFileCache(const TitanDBOptions& db_options, 17 | const TitanCFOptions& cf_options, std::shared_ptr cache, 18 | TitanStats* stats); 19 | 20 | // Gets the blob record pointed by the handle in the specified file 21 | // number. The corresponding file size must be exactly "file_size" 22 | // bytes. The provided buffer is used to store the record data, so 23 | // the buffer must be valid when the record is used. 24 | Status Get(const ReadOptions& options, uint64_t file_number, 25 | const BlobHandle& handle, BlobRecord* record, OwnedSlice* buffer); 26 | 27 | // Creates a prefetcher for the specified file number. 28 | Status NewPrefetcher(uint64_t file_number, 29 | std::unique_ptr* result); 30 | 31 | // Evicts the file cache for the specified file number. 32 | void Evict(uint64_t file_number); 33 | 34 | private: 35 | // Finds the blob file reader for the specified file number. Opens the file if 36 | // the file is not found in the cache and caches it. 37 | // If successful, sets "*handle" to the cached file. 
38 | Status GetBlobFileReaderHandle(uint64_t file_number, Cache::Handle** handle); 39 | 40 | Env* env_; 41 | EnvOptions env_options_; 42 | TitanDBOptions db_options_; 43 | TitanCFOptions cf_options_; 44 | std::shared_ptr cache_; 45 | TitanStats* stats_; 46 | }; 47 | 48 | } // namespace titandb 49 | } // namespace rocksdb 50 | -------------------------------------------------------------------------------- /src/blob_file_iterator.cc: -------------------------------------------------------------------------------- 1 | #include "blob_file_iterator.h" 2 | 3 | #include "table/block_based/block_based_table_reader.h" 4 | #include "util/crc32c.h" 5 | 6 | #include "blob_file_reader.h" 7 | #include "util.h" 8 | 9 | namespace rocksdb { 10 | namespace titandb { 11 | 12 | BlobFileIterator::BlobFileIterator( 13 | std::unique_ptr&& file, uint64_t file_name, 14 | uint64_t file_size, const TitanCFOptions& titan_cf_options) 15 | : file_(std::move(file)), 16 | file_number_(file_name), 17 | file_size_(file_size), 18 | titan_cf_options_(titan_cf_options) {} 19 | 20 | BlobFileIterator::~BlobFileIterator() {} 21 | 22 | bool BlobFileIterator::Init() { 23 | Slice slice; 24 | char header_buf[BlobFileHeader::kV3EncodedLength]; 25 | IOOptions io_options; 26 | io_options.rate_limiter_priority = Env::IOPriority::IO_LOW; 27 | status_ = file_->Read(io_options, 0, BlobFileHeader::kV3EncodedLength, &slice, 28 | header_buf, nullptr /*aligned_buf*/); 29 | if (!status_.ok()) { 30 | return false; 31 | } 32 | BlobFileHeader blob_file_header; 33 | status_ = DecodeInto(slice, &blob_file_header, true /*ignore_extra_bytes*/); 34 | if (!status_.ok()) { 35 | return false; 36 | } 37 | 38 | header_size_ = blob_file_header.size(); 39 | 40 | char footer_buf[BlobFileFooter::kEncodedLength]; 41 | status_ = file_->Read(io_options, file_size_ - BlobFileFooter::kEncodedLength, 42 | BlobFileFooter::kEncodedLength, &slice, footer_buf, 43 | nullptr /*aligned_buf*/); 44 | if (!status_.ok()) return false; 45 | 
BlobFileFooter blob_file_footer; 46 | status_ = blob_file_footer.DecodeFrom(&slice); 47 | end_of_blob_record_ = file_size_ - BlobFileFooter::kEncodedLength; 48 | if (!blob_file_footer.meta_index_handle.IsNull()) { 49 | end_of_blob_record_ -= (blob_file_footer.meta_index_handle.size() + 50 | BlockBasedTable::kBlockTrailerSize); 51 | } 52 | 53 | if (blob_file_header.flags & BlobFileHeader::kHasUncompressionDictionary) { 54 | status_ = InitUncompressionDict(blob_file_footer, file_.get(), 55 | &uncompression_dict_, 56 | titan_cf_options_.memory_allocator()); 57 | if (!status_.ok()) { 58 | return false; 59 | } 60 | decoder_.SetUncompressionDict(uncompression_dict_.get()); 61 | // the layout of blob file is like: 62 | // | .... | 63 | // | records | 64 | // | compression dict + kBlockTrailerSize(5) | 65 | // | metaindex block(40) + kBlockTrailerSize(5) | 66 | // | footer(kEncodedLength: 32) | 67 | end_of_blob_record_ -= (uncompression_dict_->GetRawDict().size() + 68 | BlockBasedTable::kBlockTrailerSize); 69 | } 70 | 71 | block_size_ = blob_file_header.block_size; 72 | 73 | assert(end_of_blob_record_ > BlobFileHeader::kV1EncodedLength); 74 | init_ = true; 75 | return true; 76 | } 77 | 78 | uint64_t BlobFileIterator::AdjustOffsetToNextBlockHead() { 79 | if (block_size_ == 0) return 0; 80 | uint64_t block_offset = iterate_offset_ % block_size_; 81 | if (block_offset != 0) { 82 | uint64_t shift = block_size_ - block_offset; 83 | iterate_offset_ += shift; 84 | return shift; 85 | } 86 | return 0; 87 | } 88 | 89 | void BlobFileIterator::SeekToFirst() { 90 | if (!init_ && !Init()) return; 91 | status_ = Status::OK(); 92 | iterate_offset_ = header_size_; 93 | if (block_size_ != 0) { 94 | AdjustOffsetToNextBlockHead(); 95 | } 96 | PrefetchAndGet(); 97 | } 98 | 99 | bool BlobFileIterator::Valid() const { return valid_ && status().ok(); } 100 | 101 | void BlobFileIterator::Next() { 102 | assert(init_); 103 | PrefetchAndGet(); 104 | } 105 | 106 | Slice BlobFileIterator::key() const { 
return cur_blob_record_.key; } 107 | 108 | Slice BlobFileIterator::value() const { return cur_blob_record_.value; } 109 | 110 | void BlobFileIterator::IterateForPrev(uint64_t offset) { 111 | if (!init_ && !Init()) return; 112 | 113 | status_ = Status::OK(); 114 | 115 | if (offset >= end_of_blob_record_) { 116 | iterate_offset_ = offset; 117 | status_ = Status::InvalidArgument("Out of bound"); 118 | return; 119 | } 120 | 121 | uint64_t total_length = 0; 122 | FixedSlice header_buffer; 123 | iterate_offset_ = header_size_; 124 | while (iterate_offset_ < offset) { 125 | IOOptions io_options; 126 | // Since BlobFileIterator is only used for GC, we always set IO priority to 127 | // low. 128 | io_options.rate_limiter_priority = Env::IOPriority::IO_LOW; 129 | status_ = file_->Read(io_options, iterate_offset_, kRecordHeaderSize, 130 | &header_buffer, header_buffer.get(), 131 | nullptr /*aligned_buf*/); 132 | if (!status_.ok()) return; 133 | status_ = decoder_.DecodeHeader(&header_buffer); 134 | if (!status_.ok()) return; 135 | total_length = kRecordHeaderSize + decoder_.GetRecordSize(); 136 | iterate_offset_ += total_length; 137 | if (block_size_ != 0) { 138 | total_length += AdjustOffsetToNextBlockHead(); 139 | } 140 | } 141 | 142 | if (iterate_offset_ > offset) iterate_offset_ -= total_length; 143 | valid_ = false; 144 | } 145 | 146 | void BlobFileIterator::GetBlobRecord() { 147 | FixedSlice header_buffer; 148 | // Since BlobFileIterator is only used for GC, we always set IO priority to 149 | // low. 
150 | IOOptions io_options; 151 | io_options.rate_limiter_priority = Env::IOPriority::IO_LOW; 152 | status_ = 153 | file_->Read(io_options, iterate_offset_, kRecordHeaderSize, 154 | &header_buffer, header_buffer.get(), nullptr /*aligned_buf*/); 155 | if (!status_.ok()) return; 156 | status_ = decoder_.DecodeHeader(&header_buffer); 157 | if (!status_.ok()) return; 158 | 159 | Slice record_slice; 160 | auto record_size = decoder_.GetRecordSize(); 161 | buffer_.resize(record_size); 162 | status_ = 163 | file_->Read(io_options, iterate_offset_ + kRecordHeaderSize, record_size, 164 | &record_slice, buffer_.data(), nullptr /*aligned_buf*/); 165 | if (status_.ok()) { 166 | status_ = 167 | decoder_.DecodeRecord(&record_slice, &cur_blob_record_, &uncompressed_, 168 | titan_cf_options_.memory_allocator()); 169 | } 170 | if (!status_.ok()) return; 171 | 172 | cur_record_offset_ = iterate_offset_; 173 | cur_record_size_ = kRecordHeaderSize + record_size; 174 | iterate_offset_ += cur_record_size_; 175 | AdjustOffsetToNextBlockHead(); 176 | valid_ = true; 177 | } 178 | 179 | void BlobFileIterator::PrefetchAndGet() { 180 | if (iterate_offset_ >= end_of_blob_record_) { 181 | valid_ = false; 182 | return; 183 | } 184 | 185 | if (readahead_begin_offset_ > iterate_offset_ || 186 | readahead_end_offset_ < iterate_offset_) { 187 | // alignment 188 | readahead_begin_offset_ = 189 | iterate_offset_ - (iterate_offset_ & (kDefaultPageSize - 1)); 190 | readahead_end_offset_ = readahead_begin_offset_; 191 | readahead_size_ = kMinReadaheadSize; 192 | } 193 | auto min_blob_size = 194 | iterate_offset_ + kRecordHeaderSize + titan_cf_options_.min_blob_size; 195 | if (readahead_end_offset_ <= min_blob_size) { 196 | while (readahead_end_offset_ + readahead_size_ <= min_blob_size && 197 | readahead_size_ < kMaxReadaheadSize) 198 | readahead_size_ <<= 1; 199 | IOOptions io_options; 200 | io_options.rate_limiter_priority = Env::IOPriority::IO_LOW; 201 | file_->Prefetch(io_options, 
readahead_end_offset_, readahead_size_); 202 | readahead_end_offset_ += readahead_size_; 203 | readahead_size_ = std::min(kMaxReadaheadSize, readahead_size_ << 1); 204 | } 205 | 206 | GetBlobRecord(); 207 | 208 | if (readahead_end_offset_ < iterate_offset_) { 209 | readahead_end_offset_ = iterate_offset_; 210 | } 211 | } 212 | 213 | BlobFileMergeIterator::BlobFileMergeIterator( 214 | std::vector>&& blob_file_iterators, 215 | const Comparator* comparator) 216 | : blob_file_iterators_(std::move(blob_file_iterators)), 217 | min_heap_(BlobFileIterComparator(comparator)) {} 218 | 219 | bool BlobFileMergeIterator::Valid() const { 220 | if (current_ == nullptr) return false; 221 | if (!status().ok()) return false; 222 | return current_->Valid() && current_->status().ok(); 223 | } 224 | 225 | void BlobFileMergeIterator::SeekToFirst() { 226 | for (auto& iter : blob_file_iterators_) { 227 | iter->SeekToFirst(); 228 | if (iter->status().ok() && iter->Valid()) min_heap_.push(iter.get()); 229 | } 230 | if (!min_heap_.empty()) { 231 | current_ = min_heap_.top(); 232 | min_heap_.pop(); 233 | } else { 234 | status_ = Status::Aborted("No iterator is valid"); 235 | } 236 | } 237 | 238 | void BlobFileMergeIterator::Next() { 239 | assert(Valid()); 240 | current_->Next(); 241 | if (current_->status().ok() && current_->Valid()) min_heap_.push(current_); 242 | if (!min_heap_.empty()) { 243 | current_ = min_heap_.top(); 244 | min_heap_.pop(); 245 | } else { 246 | current_ = nullptr; 247 | } 248 | } 249 | 250 | Slice BlobFileMergeIterator::key() const { 251 | assert(current_ != nullptr); 252 | return current_->key(); 253 | } 254 | 255 | Slice BlobFileMergeIterator::value() const { 256 | assert(current_ != nullptr); 257 | return current_->value(); 258 | } 259 | 260 | } // namespace titandb 261 | } // namespace rocksdb 262 | -------------------------------------------------------------------------------- /src/blob_file_iterator.h: 
-------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include 6 | 7 | #include "file/random_access_file_reader.h" 8 | #include "rocksdb/slice.h" 9 | #include "rocksdb/status.h" 10 | #include "table/internal_iterator.h" 11 | 12 | #include "blob_format.h" 13 | #include "titan/options.h" 14 | #include "util.h" 15 | 16 | namespace rocksdb { 17 | namespace titandb { 18 | 19 | // Used by GC job for iterate through blob file. 20 | class BlobFileIterator { 21 | public: 22 | const uint64_t kMinReadaheadSize = 4 << 10; 23 | const uint64_t kMaxReadaheadSize = 256 << 10; 24 | 25 | BlobFileIterator(std::unique_ptr&& file, 26 | uint64_t file_name, uint64_t file_size, 27 | const TitanCFOptions& titan_cf_options); 28 | ~BlobFileIterator(); 29 | 30 | bool Init(); 31 | bool Valid() const; 32 | void SeekToFirst(); 33 | void Next(); 34 | Slice key() const; 35 | Slice value() const; 36 | Status status() const { return status_; } 37 | uint64_t header_size() const { return header_size_; } 38 | 39 | void IterateForPrev(uint64_t); 40 | 41 | BlobIndex GetBlobIndex() { 42 | BlobIndex blob_index; 43 | blob_index.file_number = file_number_; 44 | blob_index.blob_handle.offset = cur_record_offset_; 45 | blob_index.blob_handle.size = cur_record_size_; 46 | return blob_index; 47 | } 48 | 49 | private: 50 | // Blob file info 51 | const std::unique_ptr file_; 52 | const uint64_t file_number_; 53 | const uint64_t file_size_; 54 | TitanCFOptions titan_cf_options_; 55 | 56 | bool init_{false}; 57 | uint64_t end_of_blob_record_{0}; 58 | 59 | // Iterator status 60 | Status status_; 61 | bool valid_{false}; 62 | 63 | std::unique_ptr uncompression_dict_; 64 | BlobDecoder decoder_; 65 | 66 | uint64_t iterate_offset_{0}; 67 | std::vector buffer_; 68 | OwnedSlice uncompressed_; 69 | BlobRecord cur_blob_record_; 70 | uint64_t cur_record_offset_; 71 | uint64_t cur_record_size_; 72 | uint64_t header_size_; 73 | uint64_t block_size_; 74 | 75 | 
uint64_t readahead_begin_offset_{0}; 76 | uint64_t readahead_end_offset_{0}; 77 | uint64_t readahead_size_{kMinReadaheadSize}; 78 | 79 | void PrefetchAndGet(); 80 | void GetBlobRecord(); 81 | uint64_t AdjustOffsetToNextBlockHead(); 82 | }; 83 | 84 | class BlobFileMergeIterator { 85 | public: 86 | explicit BlobFileMergeIterator( 87 | std::vector>&&, const Comparator*); 88 | 89 | ~BlobFileMergeIterator() = default; 90 | 91 | bool Valid() const; 92 | void SeekToFirst(); 93 | void Next(); 94 | Slice key() const; 95 | Slice value() const; 96 | Status status() const { 97 | if (current_ != nullptr && !current_->status().ok()) 98 | return current_->status(); 99 | return status_; 100 | } 101 | 102 | BlobIndex GetBlobIndex() { return current_->GetBlobIndex(); } 103 | 104 | private: 105 | class BlobFileIterComparator { 106 | public: 107 | // The default constructor is not supposed to be used. 108 | // It is only to make std::priority_queue can compile. 109 | BlobFileIterComparator() : comparator_(nullptr){}; 110 | explicit BlobFileIterComparator(const Comparator* comparator) 111 | : comparator_(comparator){}; 112 | // Smaller value get Higher priority 113 | bool operator()(const BlobFileIterator* iter1, 114 | const BlobFileIterator* iter2) { 115 | assert(comparator_ != nullptr); 116 | return comparator_->Compare(iter1->key(), iter2->key()) > 0; 117 | } 118 | 119 | private: 120 | const Comparator* comparator_; 121 | }; 122 | 123 | Status status_; 124 | std::vector> blob_file_iterators_; 125 | std::priority_queue, 126 | BlobFileIterComparator> 127 | min_heap_; 128 | BlobFileIterator* current_ = nullptr; 129 | }; 130 | 131 | } // namespace titandb 132 | } // namespace rocksdb 133 | -------------------------------------------------------------------------------- /src/blob_file_manager.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "file/writable_file_writer.h" 4 | 5 | #include "blob_format.h" 6 | 7 | namespace 
rocksdb {
namespace titandb {

// Contains information to complete a blob file creation.
// Implementations hold the open file handle between NewFile() and
// FinishFile()/DeleteFile().
class BlobFileHandle {
 public:
  virtual ~BlobFileHandle() {}

  // Returns the blob file number allocated for this file.
  virtual uint64_t GetNumber() const = 0;

  // Returns the full path of the file being built.
  virtual const std::string& GetName() const = 0;

  // Returns the writer for appending blob records to the file.
  virtual WritableFileWriter* GetFile() const = 0;
};

// Manages the process of blob files creation.
class BlobFileManager {
 public:
  virtual ~BlobFileManager() {}

  // Creates a new file. The new file should not be accessed until
  // FinishFile() has been called.
  // If successful, sets "*handle" to the new file handle with given
  // IOPriority.
  //
  // The reason why we set the io priority for WritableFile in Flush,
  // Compaction and GC is that otherwise the ratelimiter will use the default
  // priority IO_TOTAL which won't be limited by the ratelimiter.
  virtual Status NewFile(std::unique_ptr<BlobFileHandle>* handle,
                         Env::IOPriority pri = Env::IOPriority::IO_TOTAL) = 0;

  // Finishes the file with the provided metadata. Stops writing to
  // the file anymore.
  // REQUIRES: FinishFile(), DeleteFile() have not been called.
  virtual Status FinishFile(uint32_t cf_id, std::shared_ptr<BlobFileMeta> file,
                            std::unique_ptr<BlobFileHandle>&& handle) {
    // Delegate to the batch API with a single-element batch so subclasses
    // only need to override BatchFinishFiles().
    std::vector<std::pair<std::shared_ptr<BlobFileMeta>,
                          std::unique_ptr<BlobFileHandle>>>
        tmp;
    tmp.emplace_back(std::make_pair(file, std::move(handle)));
    return BatchFinishFiles(cf_id, tmp);
  }

  // Batch version of FinishFile.
  // The default implementation is a no-op used by tests/mocks.
  virtual Status BatchFinishFiles(
      uint32_t cf_id,
      const std::vector<std::pair<std::shared_ptr<BlobFileMeta>,
                                  std::unique_ptr<BlobFileHandle>>>& files) {
    (void)cf_id;
    (void)files;
    return Status::OK();
  };

  // Deletes the file. If the caller is not going to call
  // FinishFile(), it must call DeleteFile() to release the handle.
  // REQUIRES: FinishFile(), DeleteFile() have not been called.
virtual Status DeleteFile(std::unique_ptr<BlobFileHandle>&& handle) {
    // Delegate to the batch API with a single-element batch so subclasses
    // only need to override BatchDeleteFiles().
    std::vector<std::unique_ptr<BlobFileHandle>> tmp;
    tmp.emplace_back(std::move(handle));
    return BatchDeleteFiles(tmp);
  }

  // Batch version of DeleteFile.
  // The default implementation is a no-op used by tests/mocks.
  virtual Status BatchDeleteFiles(
      const std::vector<std::unique_ptr<BlobFileHandle>>& handles) {
    (void)handles;
    return Status::OK();
  }
};

}  // namespace titandb
}  // namespace rocksdb

#include "blob_file_reader.h"

#ifndef __STDC_FORMAT_MACROS
#define __STDC_FORMAT_MACROS
#endif

#include <inttypes.h>

#include "file/filename.h"
#include "file/readahead_raf.h"
#include "rocksdb/cache.h"
#include "table/block_based/block.h"
#include "table/internal_iterator.h"
#include "table/meta_blocks.h"
#include "test_util/sync_point.h"
#include "util/crc32c.h"
#include "util/string_util.h"

#include "titan_stats.h"

namespace rocksdb {
namespace titandb {

// Opens the blob file identified by `file_number` under
// `db_options.dirname` and wraps it in a RandomAccessFileReader.
// When `readahead_size` > 0 the underlying file is additionally wrapped
// with a fixed-size readahead layer.
Status NewBlobFileReader(uint64_t file_number, uint64_t readahead_size,
                         const TitanDBOptions& db_options,
                         const EnvOptions& env_options, Env* env,
                         std::unique_ptr<RandomAccessFileReader>* result) {
  std::unique_ptr<FSRandomAccessFile> file;
  auto file_name = BlobFileName(db_options.dirname, file_number);
  Status s = env->GetFileSystem()->NewRandomAccessFile(
      file_name, FileOptions(env_options), &file, nullptr /*dbg*/);
  if (!s.ok()) return s;

  if (readahead_size > 0) {
    file = NewReadaheadRandomAccessFile(std::move(file), readahead_size);
  }
  result->reset(new RandomAccessFileReader(
      std::move(file), file_name, nullptr /*clock*/, nullptr /*io_tracer*/,
      nullptr /*stats*/, 0 /*hist_type*/, nullptr /*file_read_hist*/,
      env_options.rate_limiter));
  return s;
}

// Upper bound for BlobFilePrefetcher's exponentially-growing readahead.
const uint64_t
kMaxReadaheadSize = 256 << 10; 45 | 46 | namespace { 47 | 48 | // Seek to the specified meta block. 49 | // Return true if it successfully seeks to that block. 50 | Status SeekToMetaBlock(InternalIterator* meta_iter, 51 | const std::string& block_name, bool* is_found, 52 | BlockHandle* block_handle = nullptr) { 53 | if (block_handle != nullptr) { 54 | *block_handle = BlockHandle::NullBlockHandle(); 55 | } 56 | *is_found = false; 57 | meta_iter->Seek(block_name); 58 | if (meta_iter->status().ok() && meta_iter->Valid() && 59 | meta_iter->key() == block_name) { 60 | *is_found = true; 61 | if (block_handle) { 62 | Slice v = meta_iter->value(); 63 | return block_handle->DecodeFrom(&v); 64 | } 65 | } 66 | return meta_iter->status(); 67 | } 68 | 69 | } // namespace 70 | 71 | Status BlobFileReader::Open(const TitanCFOptions& options, 72 | std::unique_ptr file, 73 | uint64_t file_size, 74 | std::unique_ptr* result, 75 | TitanStats* stats) { 76 | if (file_size < BlobFileFooter::kEncodedLength) { 77 | return Status::Corruption("file is too short to be a blob file"); 78 | } 79 | 80 | BlobFileHeader header; 81 | Status s = ReadHeader(file, &header); 82 | if (!s.ok()) { 83 | return s; 84 | } 85 | 86 | FixedSlice buffer; 87 | s = file->Read(IOOptions(), file_size - BlobFileFooter::kEncodedLength, 88 | BlobFileFooter::kEncodedLength, &buffer, buffer.get(), 89 | nullptr /*aligned_buf*/); 90 | if (!s.ok()) { 91 | return s; 92 | } 93 | 94 | BlobFileFooter footer; 95 | s = DecodeInto(buffer, &footer); 96 | if (!s.ok()) { 97 | return s; 98 | } 99 | 100 | auto reader = new BlobFileReader(options, std::move(file), stats); 101 | reader->footer_ = footer; 102 | if (header.flags & BlobFileHeader::kHasUncompressionDictionary) { 103 | s = InitUncompressionDict(footer, reader->file_.get(), 104 | &reader->uncompression_dict_, 105 | options.memory_allocator()); 106 | if (!s.ok()) { 107 | return s; 108 | } 109 | } 110 | result->reset(reader); 111 | return Status::OK(); 112 | } 113 | 114 | Status 
BlobFileReader::ReadHeader(std::unique_ptr& file, 115 | BlobFileHeader* header) { 116 | FixedSlice buffer; 117 | Status s = file->Read(IOOptions(), 0, BlobFileHeader::kV3EncodedLength, 118 | &buffer, buffer.get(), nullptr /*aligned_buf*/); 119 | if (!s.ok()) return s; 120 | 121 | s = DecodeInto(buffer, header, true /*ignore_extra_bytes*/); 122 | 123 | return s; 124 | } 125 | 126 | BlobFileReader::BlobFileReader(const TitanCFOptions& options, 127 | std::unique_ptr file, 128 | TitanStats* _stats) 129 | : options_(options), file_(std::move(file)) {} 130 | 131 | Status BlobFileReader::Get(const ReadOptions& _options, 132 | const BlobHandle& handle, BlobRecord* record, 133 | OwnedSlice* buffer) { 134 | TEST_SYNC_POINT("BlobFileReader::Get"); 135 | Slice blob; 136 | CacheAllocationPtr ubuf = 137 | AllocateBlock(handle.size, options_.memory_allocator()); 138 | Status s = file_->Read(IOOptions(), handle.offset, handle.size, &blob, 139 | ubuf.get(), nullptr /*aligned_buf*/); 140 | if (!s.ok()) { 141 | return s; 142 | } 143 | if (handle.size != static_cast(blob.size())) { 144 | return Status::Corruption( 145 | "ReadRecord actual size: " + std::to_string(blob.size()) + 146 | " not equal to blob size " + std::to_string(handle.size)); 147 | } 148 | 149 | BlobDecoder decoder(uncompression_dict_ == nullptr 150 | ? 
&UncompressionDict::GetEmptyDict() 151 | : uncompression_dict_.get()); 152 | s = decoder.DecodeHeader(&blob); 153 | if (!s.ok()) { 154 | return s; 155 | } 156 | buffer->reset(std::move(ubuf), blob); 157 | s = decoder.DecodeRecord(&blob, record, buffer, options_.memory_allocator()); 158 | return s; 159 | } 160 | 161 | Status BlobFilePrefetcher::Get(const ReadOptions& options, 162 | const BlobHandle& handle, BlobRecord* record, 163 | OwnedSlice* buffer) { 164 | if (handle.offset == last_offset_) { 165 | last_offset_ = handle.offset + handle.size; 166 | if (handle.offset + handle.size > readahead_limit_) { 167 | readahead_size_ = std::max(handle.size, readahead_size_); 168 | IOOptions io_options; 169 | io_options.rate_limiter_priority = Env::IOPriority::IO_HIGH; 170 | reader_->file_->Prefetch(io_options, handle.offset, readahead_size_); 171 | readahead_limit_ = handle.offset + readahead_size_; 172 | readahead_size_ = std::min(kMaxReadaheadSize, readahead_size_ * 2); 173 | } 174 | } else { 175 | last_offset_ = handle.offset + handle.size; 176 | readahead_size_ = 0; 177 | readahead_limit_ = 0; 178 | } 179 | 180 | return reader_->Get(options, handle, record, buffer); 181 | } 182 | 183 | Status InitUncompressionDict( 184 | const BlobFileFooter& footer, RandomAccessFileReader* file, 185 | std::unique_ptr* uncompression_dict, 186 | MemoryAllocator* memory_allocator) { 187 | // TODO: Cache the compression dictionary in either block cache or blob cache. 188 | #if ZSTD_VERSION_NUMBER < 10103 189 | return Status::NotSupported("the version of libztsd is too low"); 190 | #endif 191 | // 1. read meta index block 192 | // 2. read dictionary 193 | // 3. 
reset the dictionary 194 | assert(footer.meta_index_handle.size() > 0); 195 | BlockHandle meta_index_handle = footer.meta_index_handle; 196 | Slice blob; 197 | 198 | CacheAllocationPtr ubuf = 199 | AllocateBlock(meta_index_handle.size(), memory_allocator); 200 | Status s = file->Read(IOOptions(), meta_index_handle.offset(), 201 | meta_index_handle.size(), &blob, ubuf.get(), 202 | nullptr /*aligned_buf*/); 203 | if (!s.ok()) { 204 | return s; 205 | } 206 | BlockContents meta_block_content(std::move(ubuf), meta_index_handle.size()); 207 | 208 | std::unique_ptr meta( 209 | new Block(std::move(meta_block_content), kDisableGlobalSequenceNumber)); 210 | 211 | std::unique_ptr meta_iter(meta->NewDataIterator( 212 | BytewiseComparator(), kDisableGlobalSequenceNumber)); 213 | 214 | bool dict_is_found = false; 215 | BlockHandle dict_block; 216 | s = SeekToMetaBlock(meta_iter.get(), kCompressionDictBlockName, 217 | &dict_is_found, &dict_block); 218 | if (!s.ok()) { 219 | return s; 220 | } 221 | 222 | if (!dict_is_found) { 223 | return Status::NotFound("uncompression dict"); 224 | } 225 | 226 | Slice dict_slice; 227 | CacheAllocationPtr dict_buf = 228 | AllocateBlock(dict_block.size(), memory_allocator); 229 | s = file->Read(IOOptions(), dict_block.offset(), dict_block.size(), 230 | &dict_slice, dict_buf.get(), nullptr /*aligned_buf*/); 231 | if (!s.ok()) { 232 | return s; 233 | } 234 | 235 | std::string dict_str(dict_buf.get(), dict_buf.get() + dict_block.size()); 236 | uncompression_dict->reset(new UncompressionDict(dict_str, true)); 237 | 238 | return s; 239 | } 240 | 241 | } // namespace titandb 242 | } // namespace rocksdb 243 | -------------------------------------------------------------------------------- /src/blob_file_reader.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "file/random_access_file_reader.h" 4 | 5 | #include "blob_format.h" 6 | #include "titan/options.h" 7 | #include "titan_stats.h" 8 
| 9 | namespace rocksdb { 10 | namespace titandb { 11 | 12 | Status NewBlobFileReader(uint64_t file_number, uint64_t readahead_size, 13 | const TitanDBOptions& db_options, 14 | const EnvOptions& env_options, Env* env, 15 | std::unique_ptr* result); 16 | 17 | class BlobFileReader { 18 | public: 19 | // Opens a blob file and read the necessary metadata from it. 20 | // If successful, sets "*result" to the newly opened file reader. 21 | static Status Open(const TitanCFOptions& options, 22 | std::unique_ptr file, 23 | uint64_t file_size, 24 | std::unique_ptr* result, 25 | TitanStats* stats); 26 | 27 | // Gets the blob record pointed by the handle in this file. The data 28 | // of the record is stored in the value slice underlying, so the value slice 29 | // must be valid when the record is used. 30 | Status Get(const ReadOptions& options, const BlobHandle& handle, 31 | BlobRecord* record, OwnedSlice* buffer); 32 | 33 | private: 34 | friend class BlobFilePrefetcher; 35 | 36 | BlobFileReader(const TitanCFOptions& options, 37 | std::unique_ptr file, 38 | TitanStats* stats); 39 | 40 | static Status ReadHeader(std::unique_ptr& file, 41 | BlobFileHeader* header); 42 | 43 | TitanCFOptions options_; 44 | std::unique_ptr file_; 45 | 46 | // Information read from the file. 47 | BlobFileFooter footer_; 48 | 49 | std::unique_ptr uncompression_dict_ = nullptr; 50 | 51 | // TitanStats* stats_; 52 | }; 53 | 54 | // Performs readahead on continuous reads. 55 | class BlobFilePrefetcher : public Cleanable { 56 | public: 57 | // Constructs a prefetcher with the blob file reader. 58 | // "*reader" must be valid when the prefetcher is used. 
59 | BlobFilePrefetcher(BlobFileReader* reader) : reader_(reader) {} 60 | 61 | Status Get(const ReadOptions& options, const BlobHandle& handle, 62 | BlobRecord* record, OwnedSlice* buffer); 63 | 64 | private: 65 | BlobFileReader* reader_; 66 | uint64_t last_offset_{0}; 67 | uint64_t readahead_size_{0}; 68 | uint64_t readahead_limit_{0}; 69 | }; 70 | 71 | // Init uncompression dictionary 72 | // called by BlobFileReader and BlobFileIterator when blob file has 73 | // uncompression dictionary 74 | Status InitUncompressionDict( 75 | const BlobFileFooter& footer, RandomAccessFileReader* file, 76 | std::unique_ptr* uncompression_dict, 77 | MemoryAllocator* allocator); 78 | 79 | } // namespace titandb 80 | } // namespace rocksdb 81 | -------------------------------------------------------------------------------- /src/blob_file_set.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | 9 | #include "db/log_reader.h" 10 | #include "db/log_writer.h" 11 | #include "port/port.h" 12 | #include "rocksdb/options.h" 13 | #include "rocksdb/status.h" 14 | #include "util/mutexlock.h" 15 | 16 | #include "blob_file_cache.h" 17 | #include "blob_storage.h" 18 | #include "titan/options.h" 19 | #include "titan_stats.h" 20 | #include "version_edit.h" 21 | 22 | namespace rocksdb { 23 | namespace titandb { 24 | 25 | struct LogReporter : public log::Reader::Reporter { 26 | Status* status; 27 | void Corruption(size_t, const Status& s) override { 28 | if (status->ok()) *status = s; 29 | } 30 | }; 31 | 32 | // BlobFileSet is the set of all the blobs file generated by Titan. 33 | // It records blob file meta in terms of column family. 34 | class BlobFileSet { 35 | public: 36 | explicit BlobFileSet(const TitanDBOptions& options, TitanStats* stats, 37 | std::atomic* initialized, port::Mutex* mutex); 38 | 39 | // Sets up the storage specified in "options.dirname". 
// If the manifest doesn't exist, it will create one.
  // If the manifest exists, it will recover from the latest one.
  // It is a corruption if the persistent storage contains data
  // outside of the provided column families.
  Status Open(const std::map<uint32_t, TitanCFOptions>& column_families,
              const std::string& cache_prefix);

  // Applies *edit and saves it to the manifest.
  // REQUIRES: mutex is held
  Status LogAndApply(VersionEdit& edit);

  // Adds some column families with the specified options.
  // REQUIRES: mutex is held
  void AddColumnFamilies(
      const std::map<uint32_t, TitanCFOptions>& column_families,
      const std::string& cache_prefix);

  // Drops some column families. The obsolete files will be deleted in
  // background when they will not be accessed anymore.
  // REQUIRES: mutex is held
  Status DropColumnFamilies(const std::vector<uint32_t>& handles,
                            SequenceNumber obsolete_sequence);

  // Destroy the column family. Only after this is called, the obsolete files
  // of the dropped column family can be physically deleted.
  // REQUIRES: mutex is held
  Status MaybeDestroyColumnFamily(uint32_t cf_id);

  // Logically deletes all the blobs within the ranges.
  // REQUIRES: mutex is held
  Status DeleteBlobFilesInRanges(uint32_t cf_id, const RangePtr* ranges,
                                 size_t n, bool include_end,
                                 SequenceNumber obsolete_sequence);

  // Allocates a new file number (atomic, so callers don't need the mutex
  // just for numbering).
  uint64_t NewFileNumber() { return next_file_number_.fetch_add(1); }

  // Returns the blob storage of the column family, or an empty weak_ptr if
  // the column family is unknown.
  // REQUIRES: mutex is held
  std::weak_ptr<BlobStorage> GetBlobStorage(uint32_t cf_id) {
    auto it = column_families_.find(cf_id);
    if (it != column_families_.end()) {
      return it->second;
    }
    return std::weak_ptr<BlobStorage>();
  }

  // REQUIRES: mutex is held
  void GetObsoleteFiles(std::vector<std::string>* obsolete_files,
                        SequenceNumber oldest_sequence);

  // REQUIRES: mutex is held
  void GetAllFiles(std::vector<std::string>* files,
                   std::vector<VersionEdit>* edits);

  // Returns whether the column family has been dropped but not yet destroyed.
  // REQUIRES: mutex is held
  bool IsColumnFamilyObsolete(uint32_t cf_id) {
    return obsolete_columns_.count(cf_id) > 0;
  }

  // Returns whether Open() has completed (acquire pairs with the release
  // store made when opening finishes).
  bool IsOpened() { return opened_.load(std::memory_order_acquire); }

 private:
  struct ManifestWriter;

  friend class BlobFileSizeCollectorTest;
  friend class VersionTest;

  // Recovers state from the existing manifest.
  Status Recover();

  // Creates and switches to a new manifest file numbered `number`.
  Status OpenManifest(uint64_t number);

  // Writes the full current state as a snapshot record into `log`.
  Status WriteSnapshot(log::Writer* log);

  std::string dirname_;
  Env* env_;
  EnvOptions env_options_;
  TitanDBOptions db_options_;
  std::shared_ptr<BlobFileCache> file_cache_;

  TitanStats* stats_;
  port::Mutex* mutex_;

  // Indicate whether the gc initialization is finished.
  std::atomic<bool>* initialized_;
  // Indicate whether the blob file set Open is called.
  std::atomic<bool> opened_{false};

  // Manifests superseded by a newer one; kept until safe to delete.
  std::vector<std::string> obsolete_manifests_;

  // As rocksdb described, `DropColumnFamilies()` only records the drop of the
  // column family specified by ColumnFamilyHandle. The actual data is not
  // deleted until the client calls `delete column_family`, namely
  // `DestroyColumnFamilyHandle()`. We can still continue using the column
  // family if we have outstanding ColumnFamilyHandle pointer. So here record
  // the dropped column family but the handler is not destroyed.
std::unordered_set<uint32_t> obsolete_columns_;

  std::unordered_map<uint32_t, std::shared_ptr<BlobStorage>> column_families_;
  std::unique_ptr<log::Writer> manifest_;
  std::atomic<uint64_t> next_file_number_{1};
  uint64_t manifest_file_number_;

  std::deque<ManifestWriter*> manifest_writers_;
};

}  // namespace titandb
}  // namespace rocksdb

#include "blob_file_size_collector.h"

#include "base_db_listener.h"

namespace rocksdb {
namespace titandb {

// Factory hook: creates a fresh collector per table build.
TablePropertiesCollector*
BlobFileSizeCollectorFactory::CreateTablePropertiesCollector(
    rocksdb::TablePropertiesCollectorFactory::Context /* context */) {
  return new BlobFileSizeCollector();
}

const std::string BlobFileSizeCollector::kPropertiesName =
    "TitanDB.blob_discardable_size";

// Serializes the (file number -> accumulated blob size) map as a varint32
// count followed by varint64 key/value pairs. Always returns true.
bool BlobFileSizeCollector::Encode(
    const std::map<uint64_t, uint64_t>& blob_files_size, std::string* result) {
  PutVarint32(result, static_cast<uint32_t>(blob_files_size.size()));
  for (const auto& bfs : blob_files_size) {
    PutVarint64(result, bfs.first);
    PutVarint64(result, bfs.second);
  }
  return true;
}

// Inverse of Encode(); returns false on a truncated input.
// NOTE(review): on failure, entries decoded before the truncation point
// remain in *blob_files_size — callers should discard the map on false.
bool BlobFileSizeCollector::Decode(
    Slice* slice, std::map<uint64_t, uint64_t>* blob_files_size) {
  uint32_t num = 0;
  if (!GetVarint32(slice, &num)) {
    return false;
  }
  uint64_t file_number;
  uint64_t size;
  for (uint32_t i = 0; i < num; ++i) {
    if (!GetVarint64(slice, &file_number)) {
      return false;
    }
    if (!GetVarint64(slice, &size)) {
      return false;
    }
    (*blob_files_size)[file_number] = size;
  }
  return true;
}

// Accumulates, per blob file, the total size referenced by the blob indexes
// written into the table being built. Non-blob-index entries are ignored.
Status BlobFileSizeCollector::AddUserKey(const Slice& /* key */,
                                         const Slice& value, EntryType type,
                                         SequenceNumber /* seq */,
                                         uint64_t /* file_size */) {
  if (type != kEntryBlobIndex) {
51 | return Status::OK(); 52 | } 53 | 54 | BlobIndex index; 55 | auto s = index.DecodeFrom(const_cast(&value)); 56 | if (!s.ok()) { 57 | return s; 58 | } 59 | 60 | auto iter = blob_files_size_.find(index.file_number); 61 | if (iter == blob_files_size_.end()) { 62 | blob_files_size_[index.file_number] = index.blob_handle.size; 63 | } else { 64 | iter->second += index.blob_handle.size; 65 | } 66 | 67 | return Status::OK(); 68 | } 69 | 70 | Status BlobFileSizeCollector::Finish(UserCollectedProperties* properties) { 71 | if (blob_files_size_.empty()) { 72 | return Status::OK(); 73 | } 74 | 75 | std::string res; 76 | bool ok __attribute__((__unused__)) = Encode(blob_files_size_, &res); 77 | assert(ok); 78 | assert(!res.empty()); 79 | properties->emplace(std::make_pair(kPropertiesName, res)); 80 | return Status::OK(); 81 | } 82 | 83 | } // namespace titandb 84 | } // namespace rocksdb 85 | -------------------------------------------------------------------------------- /src/blob_file_size_collector.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "rocksdb/listener.h" 4 | #include "rocksdb/table_properties.h" 5 | #include "util/coding.h" 6 | 7 | #include "blob_file_set.h" 8 | #include "db_impl.h" 9 | 10 | namespace rocksdb { 11 | namespace titandb { 12 | 13 | class BlobFileSizeCollectorFactory final 14 | : public TablePropertiesCollectorFactory { 15 | public: 16 | TablePropertiesCollector* CreateTablePropertiesCollector( 17 | TablePropertiesCollectorFactory::Context context) override; 18 | 19 | const char* Name() const override { return "BlobFileSizeCollector"; } 20 | }; 21 | 22 | class BlobFileSizeCollector final : public TablePropertiesCollector { 23 | public: 24 | const static std::string kPropertiesName; 25 | 26 | static bool Encode(const std::map& blob_files_size, 27 | std::string* result); 28 | static bool Decode(Slice* slice, 29 | std::map* blob_files_size); 30 | 31 | Status AddUserKey(const 
Slice& key, const Slice& value, EntryType type, 32 | SequenceNumber seq, uint64_t file_size) override; 33 | Status Finish(UserCollectedProperties* properties) override; 34 | UserCollectedProperties GetReadableProperties() const override { 35 | return UserCollectedProperties(); 36 | } 37 | const char* Name() const override { return "BlobFileSizeCollector"; } 38 | 39 | private: 40 | std::map blob_files_size_; 41 | }; 42 | 43 | } // namespace titandb 44 | } // namespace rocksdb 45 | -------------------------------------------------------------------------------- /src/blob_file_size_collector_test.cc: -------------------------------------------------------------------------------- 1 | #include "blob_file_size_collector.h" 2 | 3 | #include "test_util/testharness.h" 4 | 5 | namespace rocksdb { 6 | namespace titandb { 7 | 8 | class BlobFileSizeCollectorTest : public testing::Test { 9 | public: 10 | Env* env_{Env::Default()}; 11 | EnvOptions env_options_; 12 | TitanDBOptions db_options_; 13 | TitanCFOptions cf_options_; 14 | // Derived options. 15 | ImmutableDBOptions db_ioptions_; 16 | MutableCFOptions cf_moptions_; 17 | ImmutableCFOptions cf_ioptions_; 18 | ImmutableOptions ioptions_; 19 | std::shared_ptr prefix_extractor_ = nullptr; 20 | 21 | std::unique_ptr table_factory_; 22 | std::vector> collectors_; 23 | 24 | std::string tmpdir_; 25 | std::string file_name_; 26 | 27 | BlobFileSizeCollectorTest() 28 | : table_factory_(NewBlockBasedTableFactory()), 29 | tmpdir_(test::TmpDir(env_)), 30 | file_name_(tmpdir_ + "/TEST") { 31 | db_options_.dirname = tmpdir_; 32 | auto blob_file_size_collector_factory = 33 | std::make_shared(); 34 | collectors_.emplace_back(new UserKeyTablePropertiesCollectorFactory( 35 | blob_file_size_collector_factory)); 36 | // Refresh options. 
37 | db_ioptions_ = ImmutableDBOptions(db_options_); 38 | cf_moptions_ = MutableCFOptions(cf_options_); 39 | cf_ioptions_ = ImmutableCFOptions(cf_options_); 40 | ioptions_ = ImmutableOptions(db_ioptions_, cf_ioptions_); 41 | } 42 | 43 | ~BlobFileSizeCollectorTest() { 44 | env_->DeleteFile(file_name_); 45 | env_->DeleteDir(tmpdir_); 46 | } 47 | 48 | void NewFileWriter(std::unique_ptr* result) { 49 | std::unique_ptr writable_file; 50 | ASSERT_OK(env_->GetFileSystem()->NewWritableFile( 51 | file_name_, FileOptions(env_options_), &writable_file, 52 | nullptr /*dbg*/)); 53 | result->reset(new WritableFileWriter(std::move(writable_file), file_name_, 54 | FileOptions(env_options_))); 55 | ASSERT_TRUE(*result); 56 | } 57 | 58 | void NewTableBuilder(WritableFileWriter* file, 59 | std::unique_ptr* result) { 60 | CompressionOptions compression_opts; 61 | TableBuilderOptions options( 62 | ioptions_, cf_moptions_, cf_ioptions_.internal_comparator, &collectors_, 63 | kNoCompression, compression_opts, 0 /*column_family_id*/, 64 | kDefaultColumnFamilyName, 0 /*level*/); 65 | result->reset(table_factory_->NewTableBuilder(options, file)); 66 | ASSERT_TRUE(*result); 67 | } 68 | 69 | void NewFileReader(std::unique_ptr* result) { 70 | std::unique_ptr file; 71 | ASSERT_OK(env_->GetFileSystem()->NewRandomAccessFile( 72 | file_name_, FileOptions(env_options_), &file, nullptr /*dbg*/)); 73 | result->reset(new RandomAccessFileReader(std::move(file), file_name_, 74 | env_->GetSystemClock().get())); 75 | ASSERT_TRUE(*result); 76 | } 77 | 78 | void NewTableReader(std::unique_ptr&& file, 79 | std::unique_ptr* result) { 80 | TableReaderOptions options(ioptions_, prefix_extractor_, env_options_, 81 | cf_ioptions_.internal_comparator, 0); 82 | uint64_t file_size = 0; 83 | ASSERT_OK(env_->GetFileSize(file->file_name(), &file_size)); 84 | ASSERT_TRUE(file_size > 0); 85 | ASSERT_OK(table_factory_->NewTableReader(options, std::move(file), 86 | file_size, result)); 87 | ASSERT_TRUE(*result); 88 | } 89 
| }; 90 | 91 | TEST_F(BlobFileSizeCollectorTest, Basic) { 92 | std::unique_ptr wfile; 93 | NewFileWriter(&wfile); 94 | std::unique_ptr table_builder; 95 | NewTableBuilder(wfile.get(), &table_builder); 96 | 97 | constexpr uint64_t kFirstFileNumber = 1ULL; 98 | constexpr uint64_t kSecondFileNumber = 2ULL; 99 | const int kNumEntries = 100; 100 | char buf[16]; 101 | for (int i = 0; i < kNumEntries; i++) { 102 | ParsedInternalKey ikey; 103 | snprintf(buf, sizeof(buf), "%15d", i); 104 | ikey.user_key = buf; 105 | ikey.type = kTypeTitanBlobIndex; 106 | std::string key; 107 | AppendInternalKey(&key, ikey); 108 | 109 | BlobIndex index; 110 | if (i % 2 == 0) { 111 | index.file_number = kFirstFileNumber; 112 | } else { 113 | index.file_number = kSecondFileNumber; 114 | } 115 | index.blob_handle.size = 10; 116 | std::string value; 117 | index.EncodeTo(&value); 118 | 119 | table_builder->Add(key, value); 120 | } 121 | ASSERT_OK(table_builder->status()); 122 | ASSERT_EQ(kNumEntries, table_builder->NumEntries()); 123 | ASSERT_OK(table_builder->Finish()); 124 | ASSERT_OK(wfile->Flush()); 125 | ASSERT_OK(wfile->Sync(true)); 126 | 127 | std::unique_ptr rfile; 128 | NewFileReader(&rfile); 129 | std::unique_ptr table_reader; 130 | NewTableReader(std::move(rfile), &table_reader); 131 | 132 | auto table_properties = table_reader->GetTableProperties(); 133 | ASSERT_TRUE(table_properties); 134 | auto iter = table_properties->user_collected_properties.find( 135 | BlobFileSizeCollector::kPropertiesName); 136 | ASSERT_TRUE(iter != table_properties->user_collected_properties.end()); 137 | 138 | Slice raw_blob_file_size_prop(iter->second); 139 | std::map result; 140 | BlobFileSizeCollector::Decode(&raw_blob_file_size_prop, &result); 141 | 142 | ASSERT_EQ(2, result.size()); 143 | 144 | ASSERT_EQ(kNumEntries / 2 * 10, result[kFirstFileNumber]); 145 | ASSERT_EQ(kNumEntries / 2 * 10, result[kSecondFileNumber]); 146 | } 147 | 148 | } // namespace titandb 149 | } // namespace rocksdb 150 | 151 | int 
main(int argc, char** argv) { 152 | ::testing::InitGoogleTest(&argc, argv); 153 | return RUN_ALL_TESTS(); 154 | } 155 | -------------------------------------------------------------------------------- /src/blob_file_test.cc: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "file/filename.h" 4 | #include "test_util/testharness.h" 5 | 6 | #include "blob_file_builder.h" 7 | #include "blob_file_cache.h" 8 | #include "blob_file_reader.h" 9 | 10 | namespace rocksdb { 11 | namespace titandb { 12 | 13 | class BlobFileTest : public testing::Test { 14 | public: 15 | BlobFileTest() : dirname_(test::TmpDir(env_)) { 16 | file_name_ = BlobFileName(dirname_, file_number_); 17 | } 18 | 19 | ~BlobFileTest() { 20 | env_->DeleteFile(file_name_); 21 | env_->DeleteDir(dirname_); 22 | } 23 | 24 | std::string GenKey(uint64_t i) { 25 | char buf[64]; 26 | snprintf(buf, sizeof(buf), "k-%08" PRIu64, i); 27 | return buf; 28 | } 29 | 30 | std::string GenValue(uint64_t i) { return std::string(1024, i); } 31 | 32 | void AddRecord(BlobFileBuilder* builder, BlobRecord& record, 33 | BlobFileBuilder::OutContexts& contexts) { 34 | std::unique_ptr ctx( 35 | new BlobFileBuilder::BlobRecordContext); 36 | ctx->key = record.key.ToString(); 37 | BlobFileBuilder::OutContexts cur_contexts; 38 | builder->Add(record, std::move(ctx), &cur_contexts); 39 | for (size_t i = 0; i < cur_contexts.size(); i++) { 40 | contexts.emplace_back(std::move(cur_contexts[i])); 41 | } 42 | } 43 | 44 | Status Finish(BlobFileBuilder* builder, 45 | BlobFileBuilder::OutContexts& contexts) { 46 | BlobFileBuilder::OutContexts cur_contexts; 47 | Status s = builder->Finish(&cur_contexts); 48 | for (size_t i = 0; i < cur_contexts.size(); i++) { 49 | contexts.emplace_back(std::move(cur_contexts[i])); 50 | } 51 | return s; 52 | } 53 | 54 | void TestBlobFilePrefetcher(TitanOptions options, 55 | uint32_t blob_file_version = 0) { 56 | options.dirname = dirname_; 57 | TitanDBOptions 
db_options(options); 58 | TitanCFOptions cf_options(options); 59 | BlobFileCache cache(db_options, cf_options, {NewLRUCache(128)}, nullptr); 60 | 61 | const int n = 100; 62 | BlobFileBuilder::OutContexts contexts; 63 | 64 | std::unique_ptr file; 65 | { 66 | std::unique_ptr f; 67 | ASSERT_OK(env_->GetFileSystem()->NewWritableFile( 68 | file_name_, FileOptions(env_options_), &f, nullptr /*dbg*/)); 69 | file.reset(new WritableFileWriter(std::move(f), file_name_, 70 | FileOptions(env_options_))); 71 | } 72 | std::unique_ptr builder; 73 | if (blob_file_version == 0) { 74 | // Default blob file version 75 | builder.reset(new BlobFileBuilder(db_options, cf_options, file.get())); 76 | } else { 77 | // Test with specific blob file version 78 | builder.reset(new BlobFileBuilder(db_options, cf_options, file.get(), 79 | blob_file_version)); 80 | } 81 | 82 | for (int i = 0; i < n; i++) { 83 | auto key = GenKey(i); 84 | auto value = GenValue(i); 85 | BlobRecord record; 86 | record.key = key; 87 | record.value = value; 88 | 89 | AddRecord(builder.get(), record, contexts); 90 | 91 | ASSERT_OK(builder->status()); 92 | } 93 | ASSERT_OK(Finish(builder.get(), contexts)); 94 | ASSERT_OK(builder->status()); 95 | 96 | uint64_t file_size = 0; 97 | ASSERT_OK(env_->GetFileSize(file_name_, &file_size)); 98 | 99 | ReadOptions ro; 100 | std::unique_ptr prefetcher; 101 | ASSERT_OK(cache.NewPrefetcher(file_number_, &prefetcher)); 102 | ASSERT_EQ(contexts.size(), n); 103 | for (int i = 0; i < n; i++) { 104 | auto key = GenKey(i); 105 | auto value = GenValue(i); 106 | BlobRecord expect; 107 | expect.key = key; 108 | expect.value = value; 109 | BlobRecord record; 110 | OwnedSlice buffer; 111 | BlobHandle blob_handle = contexts[i]->new_blob_index.blob_handle; 112 | ASSERT_OK(cache.Get(ro, file_number_, blob_handle, &record, &buffer)); 113 | ASSERT_EQ(record, expect); 114 | ASSERT_OK(cache.Get(ro, file_number_, blob_handle, &record, &buffer)); 115 | ASSERT_EQ(record, expect); 116 | 
ASSERT_OK(prefetcher->Get(ro, blob_handle, &record, &buffer)); 117 | ASSERT_EQ(record, expect); 118 | ASSERT_OK(prefetcher->Get(ro, blob_handle, &record, &buffer)); 119 | ASSERT_EQ(record, expect); 120 | } 121 | } 122 | 123 | void TestBlobFileReader(TitanOptions options, 124 | uint32_t blob_file_version = 0) { 125 | options.dirname = dirname_; 126 | TitanDBOptions db_options(options); 127 | TitanCFOptions cf_options(options); 128 | BlobFileCache cache(db_options, cf_options, {NewLRUCache(128)}, nullptr); 129 | 130 | const int n = 100; 131 | BlobFileBuilder::OutContexts contexts; 132 | 133 | std::unique_ptr file; 134 | { 135 | std::unique_ptr f; 136 | ASSERT_OK(env_->GetFileSystem()->NewWritableFile( 137 | file_name_, FileOptions(env_options_), &f, nullptr /*dbg*/)); 138 | file.reset(new WritableFileWriter(std::move(f), file_name_, 139 | FileOptions(env_options_))); 140 | } 141 | 142 | std::unique_ptr builder; 143 | if (blob_file_version == 0) { 144 | // Default blob file version 145 | builder.reset(new BlobFileBuilder(db_options, cf_options, file.get())); 146 | } else { 147 | // Test with specific blob file version 148 | builder.reset(new BlobFileBuilder(db_options, cf_options, file.get(), 149 | blob_file_version)); 150 | } 151 | 152 | for (int i = 0; i < n; i++) { 153 | auto key = GenKey(i); 154 | auto value = GenValue(i); 155 | BlobRecord record; 156 | record.key = key; 157 | record.value = value; 158 | 159 | AddRecord(builder.get(), record, contexts); 160 | 161 | ASSERT_OK(builder->status()); 162 | } 163 | 164 | ASSERT_OK(Finish(builder.get(), contexts)); 165 | ASSERT_OK(builder->status()); 166 | 167 | uint64_t file_size = 0; 168 | ASSERT_OK(env_->GetFileSize(file_name_, &file_size)); 169 | 170 | ReadOptions ro; 171 | std::unique_ptr random_access_file_reader; 172 | ASSERT_OK(NewBlobFileReader(file_number_, 0, db_options, env_options_, env_, 173 | &random_access_file_reader)); 174 | std::unique_ptr blob_file_reader; 175 | 
ASSERT_OK(BlobFileReader::Open(cf_options, 176 | std::move(random_access_file_reader), 177 | file_size, &blob_file_reader, nullptr)); 178 | ASSERT_EQ(contexts.size(), n); 179 | 180 | for (int i = 0; i < n; i++) { 181 | auto key = GenKey(i); 182 | auto value = GenValue(i); 183 | BlobRecord expect; 184 | expect.key = key; 185 | expect.value = value; 186 | BlobRecord record; 187 | OwnedSlice buffer; 188 | BlobHandle blob_handle = contexts[i]->new_blob_index.blob_handle; 189 | ASSERT_OK(cache.Get(ro, file_number_, blob_handle, &record, &buffer)); 190 | ASSERT_EQ(record, expect); 191 | ASSERT_OK(cache.Get(ro, file_number_, blob_handle, &record, &buffer)); 192 | ASSERT_EQ(record, expect); 193 | ASSERT_OK(blob_file_reader->Get(ro, blob_handle, &record, &buffer)); 194 | ASSERT_EQ(record, expect); 195 | ASSERT_OK(blob_file_reader->Get(ro, blob_handle, &record, &buffer)); 196 | ASSERT_EQ(record, expect); 197 | } 198 | } 199 | 200 | Env* env_{Env::Default()}; 201 | EnvOptions env_options_; 202 | std::string dirname_; 203 | std::string file_name_; 204 | uint64_t file_number_{1}; 205 | }; 206 | 207 | TEST_F(BlobFileTest, BlobFileReader) { 208 | TitanOptions options; 209 | TestBlobFileReader(options); 210 | TestBlobFileReader(options, BlobFileHeader::kVersion1); 211 | options.blob_file_compression = kLZ4Compression; 212 | TestBlobFileReader(options); 213 | } 214 | 215 | TEST_F(BlobFileTest, BlobFilePrefetcher) { 216 | TitanOptions options; 217 | TestBlobFilePrefetcher(options); 218 | TestBlobFilePrefetcher(options, BlobFileHeader::kVersion1); 219 | options.blob_cache = NewLRUCache(1 << 20); 220 | TestBlobFilePrefetcher(options); 221 | options.blob_file_compression = kLZ4Compression; 222 | TestBlobFilePrefetcher(options); 223 | } 224 | 225 | } // namespace titandb 226 | } // namespace rocksdb 227 | 228 | int main(int argc, char** argv) { 229 | ::testing::InitGoogleTest(&argc, argv); 230 | return RUN_ALL_TESTS(); 231 | } 232 | 
#include "blob_format.h"

#include "test_util/testharness.h"

#include "testutil.h"
#include "util.h"

namespace rocksdb {
namespace titandb {

// Empty fixture; the codec tests below only use free-standing helpers.
class BlobFormatTest : public testing::Test {};

// Each test round-trips a struct through CheckCodec (presumably
// encode-then-decode-and-compare — see testutil.h), first with
// default-constructed fields, then with non-trivial values.

TEST(BlobFormatTest, BlobRecord) {
  BlobRecord input;
  CheckCodec(input);
  input.key = "hello";
  input.value = "world";
  CheckCodec(input);
}

TEST(BlobFormatTest, BlobHandle) {
  BlobHandle input;
  CheckCodec(input);
  input.offset = 2;
  input.size = 3;
  CheckCodec(input);
}

TEST(BlobFormatTest, BlobIndex) {
  BlobIndex input;
  CheckCodec(input);
  input.file_number = 1;
  input.blob_handle.offset = 2;
  input.blob_handle.size = 3;
  CheckCodec(input);
}

TEST(BlobFormatTest, BlobFileMeta) {
  // Args: file number 2, file size 3, then two zeros and the "0"/"9" key
  // bounds — NOTE(review): confirm middle-argument meaning against the
  // BlobFileMeta constructor.
  BlobFileMeta input(2, 3, 0, 0, "0", "9");
  CheckCodec(input);
}

TEST(BlobFormatTest, BlobFileFooter) {
  BlobFileFooter input;
  CheckCodec(input);
  input.meta_index_handle.set_offset(123);
  input.meta_index_handle.set_size(321);
  CheckCodec(input);
}
BlobFileMeta compaction_output; 63 | ASSERT_EQ(compaction_output.file_state(), BlobFileMeta::FileState::kNone); 64 | compaction_output.FileStateTransit( 65 | BlobFileMeta::FileEvent::kFlushOrCompactionOutput); 66 | ASSERT_EQ(compaction_output.file_state(), 67 | BlobFileMeta::FileState::kPendingLSM); 68 | compaction_output.FileStateTransit( 69 | BlobFileMeta::FileEvent::kCompactionCompleted); 70 | ASSERT_EQ(compaction_output.file_state(), BlobFileMeta::FileState::kNormal); 71 | } 72 | 73 | TEST(BlobFormatTest, BlobCompressionLZ4) { 74 | BlobEncoder encoder(kLZ4Compression); 75 | BlobDecoder decoder; 76 | 77 | BlobRecord record; 78 | record.key = "key1"; 79 | record.value = "value1"; 80 | 81 | encoder.EncodeRecord(record); 82 | Slice encoded_record = encoder.GetRecord(); 83 | Slice encoded_header = encoder.GetHeader(); 84 | 85 | decoder.DecodeHeader(&encoded_header); 86 | 87 | BlobRecord decoded_record; 88 | OwnedSlice blob; 89 | decoder.DecodeRecord(&encoded_record, &decoded_record, &blob); 90 | 91 | ASSERT_EQ(record, decoded_record); 92 | } 93 | 94 | #if defined(ZSTD) 95 | 96 | std::string CreateDict() { 97 | const int sample_count = 1000; 98 | std::string samples = ""; 99 | std::vector sample_lens; 100 | 101 | BlobRecord record; 102 | BlobEncoder encoder(kZSTD); 103 | 104 | for (int i = 0; i < sample_count; ++i) { 105 | std::string key = "key" + std::to_string(i); 106 | std::string value = "value" + std::to_string(i); 107 | record.key = Slice(key); 108 | record.value = Slice(value); 109 | encoder.EncodeRecord(record); 110 | 111 | std::string encoded_record = encoder.GetRecord().ToString(); 112 | sample_lens.push_back(encoded_record.size()); 113 | samples += encoded_record; 114 | } 115 | 116 | return ZSTD_TrainDictionary(samples, sample_lens, 4000); 117 | } 118 | 119 | TEST(BlobFormatTest, BlobCompressionZSTD) { 120 | auto dict = CreateDict(); 121 | CompressionDict compression_dict(dict, kZSTD, 10); 122 | UncompressionDict uncompression_dict(dict, true); 123 | 124 
| BlobEncoder encoder(kZSTD, &compression_dict); 125 | BlobDecoder decoder(&uncompression_dict, kZSTD); 126 | 127 | BlobRecord record; 128 | record.key = "key1"; 129 | record.value = "value1"; 130 | 131 | encoder.EncodeRecord(record); 132 | Slice encoded_record = encoder.GetRecord(); 133 | Slice encoded_header = encoder.GetHeader(); 134 | 135 | decoder.DecodeHeader(&encoded_header); 136 | 137 | BlobRecord decoded_record; 138 | OwnedSlice blob; 139 | decoder.DecodeRecord(&encoded_record, &decoded_record, &blob); 140 | 141 | ASSERT_EQ(record, decoded_record); 142 | } 143 | 144 | #endif // ZSTD 145 | 146 | } // namespace titandb 147 | } // namespace rocksdb 148 | 149 | int main(int argc, char** argv) { 150 | ::testing::InitGoogleTest(&argc, argv); 151 | return RUN_ALL_TESTS(); 152 | } 153 | -------------------------------------------------------------------------------- /src/blob_gc.cc: -------------------------------------------------------------------------------- 1 | #include "blob_gc.h" 2 | 3 | namespace rocksdb { 4 | namespace titandb { 5 | 6 | BlobGC::BlobGC(std::vector>&& blob_files, 7 | TitanCFOptions&& _titan_cf_options, bool need_trigger_next) 8 | : inputs_(blob_files), 9 | titan_cf_options_(std::move(_titan_cf_options)), 10 | trigger_next_(need_trigger_next) { 11 | MarkFilesBeingGC(); 12 | } 13 | 14 | BlobGC::~BlobGC() {} 15 | 16 | void BlobGC::SetColumnFamily(ColumnFamilyHandle* cfh) { cfh_ = cfh; } 17 | 18 | ColumnFamilyData* BlobGC::GetColumnFamilyData() { 19 | auto* cfhi = reinterpret_cast(cfh_); 20 | return cfhi->cfd(); 21 | } 22 | 23 | void BlobGC::AddOutputFile(BlobFileMeta* blob_file) { 24 | outputs_.push_back(blob_file); 25 | } 26 | 27 | void BlobGC::MarkFilesBeingGC() { 28 | for (auto& f : inputs_) { 29 | f->FileStateTransit(BlobFileMeta::FileEvent::kGCBegin); 30 | } 31 | } 32 | 33 | void BlobGC::ReleaseGcFiles() { 34 | for (auto& f : inputs_) { 35 | f->FileStateTransit(BlobFileMeta::FileEvent::kGCCompleted); 36 | } 37 | 38 | for (auto& f : 
outputs_) { 39 | f->FileStateTransit(BlobFileMeta::FileEvent::kGCCompleted); 40 | } 41 | } 42 | 43 | } // namespace titandb 44 | } // namespace rocksdb 45 | -------------------------------------------------------------------------------- /src/blob_gc.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include "db/column_family.h" 6 | 7 | #include "blob_format.h" 8 | #include "titan/options.h" 9 | 10 | namespace rocksdb { 11 | namespace titandb { 12 | 13 | // A BlobGC encapsulates information about a blob gc. 14 | class BlobGC { 15 | public: 16 | BlobGC(std::vector>&& blob_files, 17 | TitanCFOptions&& _titan_cf_options, bool need_trigger_next); 18 | 19 | // No copying allowed 20 | BlobGC(const BlobGC&) = delete; 21 | void operator=(const BlobGC&) = delete; 22 | 23 | ~BlobGC(); 24 | 25 | const std::vector>& inputs() { return inputs_; } 26 | 27 | const TitanCFOptions& titan_cf_options() { return titan_cf_options_; } 28 | 29 | void SetColumnFamily(ColumnFamilyHandle* cfh); 30 | 31 | ColumnFamilyHandle* column_family_handle() { return cfh_; } 32 | 33 | ColumnFamilyData* GetColumnFamilyData(); 34 | 35 | void MarkFilesBeingGC(); 36 | 37 | void AddOutputFile(BlobFileMeta*); 38 | 39 | void ReleaseGcFiles(); 40 | 41 | bool trigger_next() { return trigger_next_; } 42 | 43 | private: 44 | std::vector> inputs_; 45 | std::vector outputs_; 46 | TitanCFOptions titan_cf_options_; 47 | ColumnFamilyHandle* cfh_{nullptr}; 48 | // Whether need to trigger gc after this gc or not 49 | const bool trigger_next_; 50 | }; 51 | 52 | struct GCScore { 53 | uint64_t file_number; 54 | double score; 55 | }; 56 | 57 | } // namespace titandb 58 | } // namespace rocksdb 59 | -------------------------------------------------------------------------------- /src/blob_gc_job.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "db/db_impl/db_impl.h" 4 | #include 
"rocksdb/statistics.h" 5 | #include "rocksdb/status.h" 6 | 7 | #include "blob_file_builder.h" 8 | #include "blob_file_iterator.h" 9 | #include "blob_file_manager.h" 10 | #include "blob_file_set.h" 11 | #include "blob_gc.h" 12 | #include "titan/options.h" 13 | #include "titan_stats.h" 14 | #include "version_edit.h" 15 | 16 | namespace rocksdb { 17 | namespace titandb { 18 | 19 | class BlobGCJob { 20 | public: 21 | BlobGCJob(BlobGC *blob_gc, DB *db, port::Mutex *mutex, 22 | const TitanDBOptions &titan_db_options, Env *env, 23 | const EnvOptions &env_options, BlobFileManager *blob_file_manager, 24 | BlobFileSet *blob_file_set, LogBuffer *log_buffer, 25 | std::atomic_bool *shuting_down, TitanStats *stats); 26 | 27 | // No copying allowed 28 | BlobGCJob(const BlobGCJob &) = delete; 29 | void operator=(const BlobGCJob &) = delete; 30 | 31 | ~BlobGCJob(); 32 | 33 | // REQUIRE: mutex held 34 | Status Prepare(); 35 | // REQUIRE: mutex not held 36 | Status Run(); 37 | // REQUIRE: mutex held 38 | Status Finish(); 39 | 40 | private: 41 | class GarbageCollectionWriteCallback; 42 | friend class BlobGCJobTest; 43 | 44 | void UpdateInternalOpStats(); 45 | 46 | BlobGC *blob_gc_; 47 | DB *base_db_; 48 | DBImpl *base_db_impl_; 49 | port::Mutex *mutex_; 50 | TitanDBOptions db_options_; 51 | Env *env_; 52 | EnvOptions env_options_; 53 | BlobFileManager *blob_file_manager_; 54 | BlobFileSet *blob_file_set_; 55 | LogBuffer *log_buffer_{nullptr}; 56 | 57 | std::vector, 58 | std::unique_ptr>> 59 | blob_file_builders_; 60 | std::vector> 61 | rewrite_batches_; 62 | 63 | std::atomic_bool *shuting_down_{nullptr}; 64 | 65 | TitanStats *stats_; 66 | 67 | struct { 68 | uint64_t gc_bytes_read = 0; 69 | uint64_t gc_bytes_written = 0; 70 | uint64_t gc_num_keys_overwritten = 0; 71 | uint64_t gc_bytes_overwritten = 0; 72 | uint64_t gc_num_keys_relocated = 0; 73 | uint64_t gc_bytes_relocated = 0; 74 | uint64_t gc_num_keys_fallback = 0; 75 | uint64_t gc_bytes_fallback = 0; 76 | uint64_t gc_num_new_files 
= 0; 77 | uint64_t gc_num_files = 0; 78 | uint64_t gc_read_lsm_micros = 0; 79 | uint64_t gc_update_lsm_micros = 0; 80 | } metrics_; 81 | 82 | uint64_t prev_bytes_read_ = 0; 83 | uint64_t prev_bytes_written_ = 0; 84 | uint64_t io_bytes_read_ = 0; 85 | uint64_t io_bytes_written_ = 0; 86 | 87 | Status DoRunGC(); 88 | void BatchWriteNewIndices(BlobFileBuilder::OutContexts &contexts, Status *s); 89 | Status BuildIterator(std::unique_ptr *result); 90 | Status DiscardEntry(const Slice &key, const BlobIndex &blob_index, 91 | bool *discardable); 92 | Status InstallOutputBlobFiles(); 93 | Status RewriteValidKeyToLSM(); 94 | Status DeleteInputBlobFiles(); 95 | 96 | bool IsShutingDown(); 97 | }; 98 | 99 | } // namespace titandb 100 | } // namespace rocksdb 101 | -------------------------------------------------------------------------------- /src/blob_gc_picker.cc: -------------------------------------------------------------------------------- 1 | #include "blob_gc_picker.h" 2 | 3 | #ifndef __STDC_FORMAT_MACROS 4 | #define __STDC_FORMAT_MACROS 5 | #endif 6 | 7 | #include 8 | 9 | #include "titan_logging.h" 10 | 11 | namespace rocksdb { 12 | namespace titandb { 13 | 14 | BasicBlobGCPicker::BasicBlobGCPicker(TitanDBOptions db_options, 15 | TitanCFOptions cf_options, 16 | TitanStats* stats) 17 | : db_options_(db_options), cf_options_(cf_options), stats_(stats) {} 18 | 19 | BasicBlobGCPicker::~BasicBlobGCPicker() {} 20 | 21 | std::unique_ptr BasicBlobGCPicker::PickBlobGC( 22 | BlobStorage* blob_storage) { 23 | Status s; 24 | std::vector> blob_files; 25 | 26 | uint64_t batch_size = 0; 27 | uint64_t estimate_output_size = 0; 28 | bool stop_picking = false; 29 | bool maybe_continue_next_time = false; 30 | uint64_t next_gc_size = 0; 31 | bool in_fallback = cf_options_.blob_run_mode == TitanBlobRunMode::kFallback; 32 | 33 | for (auto& gc_score : blob_storage->gc_score()) { 34 | if (gc_score.score < cf_options_.blob_file_discardable_ratio) { 35 | break; 36 | } 37 | // in fallback mode, 
only gc files that all blobs are discarded 38 | if (in_fallback && std::abs(1.0 - gc_score.score) > 39 | std::numeric_limits::epsilon()) { 40 | break; 41 | } 42 | 43 | auto blob_file = blob_storage->FindFile(gc_score.file_number).lock(); 44 | if (!CheckBlobFile(blob_file.get())) { 45 | // Skip this file id this file is being GCed 46 | // or this file had been GCed 47 | TITAN_LOG_INFO(db_options_.info_log, "Blob file %" PRIu64 " no need gc", 48 | blob_file->file_number()); 49 | continue; 50 | } 51 | if (!stop_picking) { 52 | blob_files.emplace_back(blob_file); 53 | if (blob_file->file_size() <= cf_options_.merge_small_file_threshold) { 54 | RecordTick(statistics(stats_), TITAN_GC_SMALL_FILE, 1); 55 | } else { 56 | RecordTick(statistics(stats_), TITAN_GC_DISCARDABLE, 1); 57 | } 58 | batch_size += blob_file->file_size(); 59 | estimate_output_size += blob_file->live_data_size(); 60 | if (batch_size >= cf_options_.max_gc_batch_size || 61 | estimate_output_size >= cf_options_.blob_file_target_size) { 62 | // Stop pick file for this gc, but still check file for whether need 63 | // trigger gc after this 64 | stop_picking = true; 65 | } 66 | } else { 67 | next_gc_size += blob_file->file_size(); 68 | if (next_gc_size > cf_options_.min_gc_batch_size || in_fallback) { 69 | maybe_continue_next_time = true; 70 | RecordTick(statistics(stats_), TITAN_GC_REMAIN, 1); 71 | TITAN_LOG_INFO(db_options_.info_log, 72 | "remain more than %" PRIu64 73 | " bytes to be gc and trigger after this gc", 74 | next_gc_size); 75 | break; 76 | } 77 | } 78 | } 79 | TITAN_LOG_DEBUG(db_options_.info_log, 80 | "got batch size %" PRIu64 ", estimate output %" PRIu64 81 | " bytes", 82 | batch_size, estimate_output_size); 83 | 84 | if (blob_files.empty()) return nullptr; 85 | 86 | // Skip these checks if in fallback mode, we need to gc all files in fallback 87 | // mode 88 | if (!in_fallback) { 89 | if (batch_size < cf_options_.min_gc_batch_size && 90 | estimate_output_size < 
cf_options_.blob_file_target_size) { 91 | return nullptr; 92 | } 93 | // if there is only one small file to merge, no need to perform 94 | if (blob_files.size() == 1 && 95 | blob_files[0]->file_size() <= cf_options_.merge_small_file_threshold && 96 | blob_files[0]->GetDiscardableRatio() < 97 | cf_options_.blob_file_discardable_ratio) { 98 | return nullptr; 99 | } 100 | } 101 | 102 | return std::unique_ptr(new BlobGC( 103 | std::move(blob_files), std::move(cf_options_), maybe_continue_next_time)); 104 | } 105 | 106 | bool BasicBlobGCPicker::CheckBlobFile(BlobFileMeta* blob_file) const { 107 | assert(blob_file == nullptr || 108 | blob_file->file_state() != BlobFileMeta::FileState::kNone); 109 | if (blob_file == nullptr || 110 | blob_file->file_state() != BlobFileMeta::FileState::kNormal) 111 | return false; 112 | 113 | return true; 114 | } 115 | 116 | } // namespace titandb 117 | } // namespace rocksdb 118 | -------------------------------------------------------------------------------- /src/blob_gc_picker.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include "db/column_family.h" 6 | #include "db/write_callback.h" 7 | #include "file/filename.h" 8 | #include "rocksdb/status.h" 9 | 10 | #include "blob_file_manager.h" 11 | #include "blob_format.h" 12 | #include "blob_gc.h" 13 | #include "blob_storage.h" 14 | 15 | namespace rocksdb { 16 | namespace titandb { 17 | 18 | class BlobGCPicker { 19 | public: 20 | BlobGCPicker(){}; 21 | virtual ~BlobGCPicker(){}; 22 | 23 | // Pick candidate blob files for a new gc. 24 | // Returns nullptr if there is no gc to be done. 25 | // Otherwise returns a pointer to a heap-allocated object that 26 | // describes the gc. Caller should delete the result. 
27 | virtual std::unique_ptr PickBlobGC(BlobStorage* blob_storage) = 0; 28 | }; 29 | 30 | class BasicBlobGCPicker final : public BlobGCPicker { 31 | public: 32 | BasicBlobGCPicker(TitanDBOptions, TitanCFOptions, TitanStats*); 33 | ~BasicBlobGCPicker(); 34 | 35 | std::unique_ptr PickBlobGC(BlobStorage* blob_storage) override; 36 | 37 | private: 38 | TitanDBOptions db_options_; 39 | TitanCFOptions cf_options_; 40 | TitanStats* stats_; 41 | 42 | // Check if blob_file needs to gc, return true means we need pick this 43 | // file for gc 44 | bool CheckBlobFile(BlobFileMeta* blob_file) const; 45 | }; 46 | 47 | } // namespace titandb 48 | } // namespace rocksdb 49 | -------------------------------------------------------------------------------- /src/blob_gc_picker_test.cc: -------------------------------------------------------------------------------- 1 | #include "blob_gc_picker.h" 2 | 3 | #include "file/filename.h" 4 | #include "test_util/testharness.h" 5 | 6 | #include "blob_file_builder.h" 7 | #include "blob_file_cache.h" 8 | #include "blob_file_iterator.h" 9 | #include "blob_file_reader.h" 10 | 11 | namespace rocksdb { 12 | namespace titandb { 13 | 14 | class BlobGCPickerTest : public testing::Test { 15 | public: 16 | std::unique_ptr blob_storage_; 17 | std::unique_ptr basic_blob_gc_picker_; 18 | 19 | BlobGCPickerTest() {} 20 | ~BlobGCPickerTest() {} 21 | 22 | void NewBlobStorageAndPicker(const TitanDBOptions& titan_db_options, 23 | const TitanCFOptions& titan_cf_options) { 24 | auto blob_file_cache = std::make_shared( 25 | titan_db_options, titan_cf_options, NewLRUCache(128), nullptr); 26 | blob_storage_.reset(new BlobStorage(titan_db_options, titan_cf_options, 0, 27 | "", blob_file_cache, nullptr, nullptr)); 28 | basic_blob_gc_picker_.reset( 29 | new BasicBlobGCPicker(titan_db_options, titan_cf_options, nullptr)); 30 | } 31 | 32 | void AddBlobFile(uint64_t file_number, uint64_t data_size, 33 | uint64_t discardable_size, bool being_gc = false) { 34 | auto f = 
// With min_gc_batch_size forced to 0, a lone tiny file is still not picked,
// but two candidates together form a valid batch.
TEST_F(BlobGCPickerTest, Basic) {
  TitanDBOptions db_opts;
  TitanCFOptions cf_opts;
  cf_opts.min_gc_batch_size = 0;
  NewBlobStorageAndPicker(db_opts, cf_opts);

  // One 1-byte file with no discardable data: nothing worth collecting.
  AddBlobFile(1U, 1U, 0U);
  UpdateBlobStorage();
  auto picked = basic_blob_gc_picker_->PickBlobGC(blob_storage_.get());
  ASSERT_TRUE(picked == nullptr);

  // A second file makes a two-file batch that the picker accepts.
  AddBlobFile(2U, 1U, 0U);
  UpdateBlobStorage();
  picked = basic_blob_gc_picker_->PickBlobGC(blob_storage_.get());
  ASSERT_TRUE(picked != nullptr);
  ASSERT_EQ(picked->inputs().size(), 2);
}
TEST_F(BlobGCPickerTest, PickFileAndTriggerNext) {
  TitanDBOptions titan_db_options;
  TitanCFOptions titan_cf_options;
  titan_cf_options.max_gc_batch_size = 1 << 30;
  titan_cf_options.blob_file_target_size = 256 << 20;
  NewBlobStorageAndPicker(titan_db_options, titan_cf_options);
  for (size_t i = 1; i < 41; i++) {
    // add 40 files with 10MB valid data each file
    // (file size 256MB, discardable 246MB; the original comment said 70)
    AddBlobFile(i, titan_cf_options.blob_file_target_size, 246 << 20);
  }
  UpdateBlobStorage();
  int gc_times = 0;
  auto blob_gc = basic_blob_gc_picker_->PickBlobGC(blob_storage_.get());
  ASSERT_TRUE(blob_gc != nullptr);
  // Each round batches 4 files (1GB max batch / ~256MB per file) and asks
  // for a follow-up round while enough candidates remain.
  while (blob_gc != nullptr && blob_gc->trigger_next()) {
    gc_times++;
    ASSERT_EQ(blob_gc->trigger_next(), true);
    ASSERT_EQ(blob_gc->inputs().size(), 4);
    for (auto file : blob_gc->inputs()) {
      RemoveBlobFile(file->file_number());
    }
    UpdateBlobStorage();
    blob_gc = basic_blob_gc_picker_->PickBlobGC(blob_storage_.get());
  }
  // 9 chained rounds plus one final round without trigger_next = 40 files.
  ASSERT_EQ(gc_times, 9);
  ASSERT_TRUE(blob_gc != nullptr);
  ASSERT_EQ(blob_gc->inputs().size(), 4);
}
// Two back-to-back picks on the same storage must not hand out the same
// files: the first pick marks its inputs as being GC-ed.
TEST_F(BlobGCPickerTest, ParallelPickGC) {
  TitanDBOptions titan_db_options;
  TitanCFOptions titan_cf_options;
  titan_cf_options.max_gc_batch_size = 1 << 30;
  titan_cf_options.blob_file_target_size = 256 << 20;
  NewBlobStorageAndPicker(titan_db_options, titan_cf_options);
  for (size_t i = 1; i < 9; i++) {
    // add 8 files with 10MB valid data each file
    // (the original comment said 70; the loop adds 8)
    AddBlobFile(i, titan_cf_options.blob_file_target_size, 246 << 20);
  }
  UpdateBlobStorage();
  // First pick batches 4 files and, with 4 candidates left, requests a
  // follow-up round.
  auto blob_gc1 = basic_blob_gc_picker_->PickBlobGC(blob_storage_.get());
  ASSERT_TRUE(blob_gc1 != nullptr);
  ASSERT_EQ(blob_gc1->trigger_next(), true);
  ASSERT_EQ(blob_gc1->inputs().size(), 4);
  // Second pick, while the first is outstanding, takes the remaining 4;
  // nothing is left afterwards, so no further round is requested.
  auto blob_gc2 = basic_blob_gc_picker_->PickBlobGC(blob_storage_.get());
  ASSERT_TRUE(blob_gc2 != nullptr);
  ASSERT_EQ(blob_gc2->trigger_next(), false);
  ASSERT_EQ(blob_gc2->inputs().size(), 4);
  for (auto file : blob_gc1->inputs()) {
    RemoveBlobFile(file->file_number());
  }
  for (auto file : blob_gc2->inputs()) {
    RemoveBlobFile(file->file_number());
  }
  UpdateBlobStorage();
}
| for (auto file : blob_gc->inputs()) { 180 | RemoveBlobFile(file->file_number()); 181 | } 182 | UpdateBlobStorage(); 183 | if (!blob_gc->trigger_next()) break; 184 | blob_gc = basic_blob_gc_picker_->PickBlobGC(blob_storage_.get()); 185 | } 186 | ASSERT_EQ(gc_times, 2); 187 | ASSERT_EQ(blob_storage_->NumBlobFiles(), 2); 188 | } 189 | 190 | } // namespace titandb 191 | } // namespace rocksdb 192 | 193 | int main(int argc, char** argv) { 194 | ::testing::InitGoogleTest(&argc, argv); 195 | return RUN_ALL_TESTS(); 196 | } 197 | -------------------------------------------------------------------------------- /src/blob_storage.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #ifndef __STDC_FORMAT_MACROS 3 | #define __STDC_FORMAT_MACROS 4 | #endif 5 | #include 6 | 7 | #include "rocksdb/options.h" 8 | 9 | #include "blob_file_cache.h" 10 | #include "blob_format.h" 11 | #include "blob_gc.h" 12 | #include "titan_stats.h" 13 | 14 | namespace rocksdb { 15 | namespace titandb { 16 | 17 | // Provides methods to access the blob storage for a specific 18 | // column family. 
19 | class BlobStorage { 20 | public: 21 | BlobStorage(const TitanDBOptions& _db_options, 22 | const TitanCFOptions& _cf_options, uint32_t cf_id, 23 | const std::string& cache_prefix, 24 | std::shared_ptr _file_cache, TitanStats* stats, 25 | std::atomic* initialized) 26 | : db_options_(_db_options), 27 | cf_options_(_cf_options), 28 | mutable_cf_options_(_cf_options), 29 | cf_id_(cf_id), 30 | levels_file_count_(_cf_options.num_levels, 0), 31 | blob_ranges_(InternalComparator(_cf_options.comparator)), 32 | cache_prefix_(cache_prefix), 33 | blob_cache_(_cf_options.blob_cache), 34 | file_cache_(_file_cache), 35 | destroyed_(false), 36 | stats_(stats), 37 | initialized_(initialized) {} 38 | 39 | ~BlobStorage() { 40 | for (auto& file : files_) { 41 | file_cache_->Evict(file.second->file_number()); 42 | } 43 | } 44 | 45 | const TitanDBOptions& db_options() { return db_options_; } 46 | 47 | TitanCFOptions cf_options() { 48 | auto _cf_options = cf_options_; 49 | _cf_options.UpdateMutableOptions(mutable_cf_options_); 50 | return _cf_options; 51 | } 52 | 53 | const std::vector gc_score() { 54 | MutexLock l(&mutex_); 55 | return gc_score_; 56 | } 57 | 58 | Cache* blob_cache() { return blob_cache_.get(); } 59 | 60 | // Gets the blob record pointed by the blob index. The provided 61 | // buffer is used to store the record data, so the buffer must be 62 | // valid when the record is used. 63 | Status Get(const ReadOptions& options, const BlobIndex& index, 64 | BlobRecord* record, PinnableSlice* value, 65 | bool for_compaction = false); 66 | 67 | // Gets the blob record pointed by the blob index by blob cache. 68 | // The provided buffer is used to store the record data, so the buffer must be 69 | // valid when the record is used. 70 | // If cache hit, set cache_hit to true, otherwise false. 
71 | Status TryGetBlobCache(const std::string& cache_key, BlobRecord* record, 72 | PinnableSlice* value, bool* cache_hit); 73 | 74 | std::string EncodeBlobCache(const BlobIndex& index); 75 | 76 | // Creates a prefetcher for the specified file number. 77 | Status NewPrefetcher(uint64_t file_number, 78 | std::unique_ptr* result); 79 | 80 | // Get all the blob files within the ranges. 81 | Status GetBlobFilesInRanges( 82 | const RangePtr* ranges, size_t n, bool include_end, 83 | std::vector>* files); 84 | 85 | // Finds the blob file meta for the specified file number. It is a 86 | // corruption if the file doesn't exist. 87 | std::weak_ptr FindFile(uint64_t file_number) const; 88 | 89 | void StartInitializeAllFiles() { 90 | MutexLock l(&mutex_); 91 | for (auto& file : files_) { 92 | file.second->FileStateTransit(BlobFileMeta::FileEvent::kDbStart); 93 | } 94 | } 95 | 96 | // Must call before TitanDBImpl initialized. 97 | void InitializeAllFiles() { 98 | MutexLock l(&mutex_); 99 | for (auto& file : files_) { 100 | file.second->FileStateTransit(BlobFileMeta::FileEvent::kDbInit); 101 | } 102 | } 103 | 104 | // The corresponding column family is dropped, so mark destroyed and we can 105 | // remove this blob storage later. 106 | void MarkDestroyed() { 107 | MutexLock l(&mutex_); 108 | destroyed_ = true; 109 | } 110 | 111 | // Returns whether this blob storage can be deleted now. 112 | bool MaybeRemove() const { 113 | MutexLock l(&mutex_); 114 | return destroyed_ && obsolete_files_.empty(); 115 | } 116 | 117 | // Computes GC score. 118 | void ComputeGCScore(); 119 | 120 | // Collects and updates statistics. 121 | void UpdateStats(); 122 | 123 | // Add a new blob file to this blob storage. 124 | void AddBlobFile(std::shared_ptr& file); 125 | 126 | // Gets all obsolete blob files whose obsolete_sequence is smaller than the 127 | // oldest_sequence. 
Note that the files returned would be erased from internal 128 | // structure, so for the next call, the files returned before wouldn't be 129 | // returned again. 130 | void GetObsoleteFiles(std::vector* obsolete_files, 131 | SequenceNumber oldest_sequence); 132 | 133 | // Gets all files (start with '/titandb' prefix), including obsolete files. 134 | void GetAllFiles(std::vector* files); 135 | 136 | // Mark the file as obsolete, and retrun value indicates whether the file is 137 | // founded. 138 | bool MarkFileObsolete(uint64_t file_number, SequenceNumber obsolete_sequence); 139 | 140 | // Returns the number of blob files, including obsolete files. 141 | std::size_t NumBlobFiles() const { 142 | MutexLock l(&mutex_); 143 | return files_.size(); 144 | } 145 | 146 | uint64_t NumBlobFilesAtLevel(int level) const { 147 | MutexLock l(&mutex_); 148 | if (level >= static_cast(levels_file_count_.size())) { 149 | return 0; 150 | } 151 | return levels_file_count_[level]; 152 | } 153 | 154 | // Returns the number of obsolete blob files. 155 | // TODO: use this method to calculate `kNumObsoleteBlobFile` DB property. 156 | std::size_t NumObsoleteBlobFiles() const { 157 | MutexLock l(&mutex_); 158 | return obsolete_files_.size(); 159 | } 160 | 161 | // Exports all blob files' meta. Only for tests. 
162 | void ExportBlobFiles( 163 | std::map>& ret) const; 164 | 165 | void SetMutableCFOptions(const MutableTitanCFOptions& options) { 166 | MutexLock l(&mutex_); 167 | mutable_cf_options_ = options; 168 | } 169 | 170 | private: 171 | friend class BlobFileSet; 172 | friend class VersionTest; 173 | friend class BlobGCPickerTest; 174 | friend class BlobGCJobTest; 175 | friend class BlobFileSizeCollectorTest; 176 | 177 | void MarkFileObsoleteLocked(std::shared_ptr file, 178 | SequenceNumber obsolete_sequence); 179 | bool RemoveFile(uint64_t file_number); 180 | 181 | TitanDBOptions db_options_; 182 | TitanCFOptions cf_options_; 183 | MutableTitanCFOptions mutable_cf_options_; 184 | const uint32_t cf_id_; 185 | 186 | mutable port::Mutex mutex_; 187 | 188 | // Only BlobStorage OWNS BlobFileMeta 189 | // file_number -> file_meta 190 | std::unordered_map> files_; 191 | std::vector levels_file_count_; 192 | 193 | class InternalComparator { 194 | public: 195 | // The default constructor is not supposed to be used. 196 | // It is only to make std::multimap can compile. 197 | InternalComparator() : comparator_(nullptr){}; 198 | explicit InternalComparator(const Comparator* comparator) 199 | : comparator_(comparator){}; 200 | bool operator()(const Slice& key1, const Slice& key2) const { 201 | assert(comparator_ != nullptr); 202 | return comparator_->Compare(key1, key2) < 0; 203 | } 204 | 205 | private: 206 | const Comparator* comparator_; 207 | }; 208 | // smallest_key -> file_meta 209 | std::multimap, InternalComparator> 210 | blob_ranges_; 211 | 212 | const std::string cache_prefix_; 213 | std::shared_ptr blob_cache_; 214 | std::shared_ptr file_cache_; 215 | 216 | std::vector gc_score_; 217 | 218 | std::list> obsolete_files_; 219 | // It is marked when the column family handle is destroyed, indicating the 220 | // in-memory data structure can be destroyed. Physical files may still be 221 | // kept. 
222 | bool destroyed_; 223 | 224 | TitanStats* stats_; 225 | 226 | // Indicates whether the files' live data size is initialized. 227 | std::atomic* initialized_; 228 | }; 229 | 230 | } // namespace titandb 231 | } // namespace rocksdb 232 | -------------------------------------------------------------------------------- /src/compaction_filter.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | #include "db_impl.h" 7 | #include "rocksdb/compaction_filter.h" 8 | #include "titan_logging.h" 9 | #include "util/mutexlock.h" 10 | 11 | namespace rocksdb { 12 | namespace titandb { 13 | 14 | class TitanCompactionFilter final : public CompactionFilter { 15 | public: 16 | TitanCompactionFilter(TitanDBImpl *db, const std::string &cf_name, 17 | const CompactionFilter *original, 18 | std::unique_ptr &&owned_filter, 19 | std::shared_ptr blob_storage, 20 | bool skip_value) 21 | : db_(db), 22 | cf_name_(cf_name), 23 | blob_storage_(std::move(blob_storage)), 24 | original_filter_(original), 25 | owned_filter_(std::move(owned_filter)), 26 | skip_value_(skip_value), 27 | filter_name_(std::string("TitanCompactionfilter.") 28 | .append(original_filter_->Name())) { 29 | assert(blob_storage_ != nullptr); 30 | assert(original_filter_ != nullptr); 31 | } 32 | 33 | const char *Name() const override { return filter_name_.c_str(); } 34 | 35 | bool IsStackedBlobDbInternalCompactionFilter() const override { return true; } 36 | 37 | Decision UnsafeFilter(int level, const Slice &key, ValueType value_type, 38 | const Slice &value, std::string *new_value, 39 | std::string *skip_until) const override { 40 | Status s; 41 | Slice user_key = key; 42 | 43 | // Since IsStackedBlobDbInternalCompactionFilter was implemented as true, 44 | // the key is an internal key when value_type is kBlobIndex, which is caused 45 | // by a hack in RocksDB. 
46 | if (value_type == kBlobIndex) { 47 | ParsedInternalKey ikey; 48 | s = ParseInternalKey(key, &ikey, false /*log_err_key*/); 49 | if (s.ok()) { 50 | user_key = ikey.user_key; 51 | } else { 52 | TITAN_LOG_ERROR(db_->db_options_.info_log, 53 | "[%s] Unable to parse internal key", cf_name_.c_str()); 54 | { 55 | MutexLock l(&db_->mutex_); 56 | db_->SetBGError(s); 57 | } 58 | return Decision::kKeep; 59 | } 60 | } 61 | 62 | if (skip_value_) { 63 | return original_filter_->UnsafeFilter(level, user_key, value_type, 64 | Slice(), new_value, skip_until); 65 | } 66 | if (value_type != kBlobIndex) { 67 | return original_filter_->UnsafeFilter(level, user_key, value_type, value, 68 | new_value, skip_until); 69 | } 70 | 71 | BlobIndex blob_index; 72 | Slice original_value(value.data()); 73 | s = blob_index.DecodeFrom(&original_value); 74 | if (!s.ok()) { 75 | TITAN_LOG_ERROR(db_->db_options_.info_log, 76 | "[%s] Unable to decode blob index", cf_name_.c_str()); 77 | // TODO(yiwu): Better to fail the compaction as well, but current 78 | // compaction filter API doesn't support it. 79 | { 80 | MutexLock l(&db_->mutex_); 81 | db_->SetBGError(s); 82 | } 83 | // Unable to decode blob index. Keeping the value. 84 | return Decision::kKeep; 85 | } 86 | 87 | BlobRecord record; 88 | PinnableSlice buffer; 89 | ReadOptions read_options; 90 | s = blob_storage_->Get(read_options, blob_index, &record, &buffer, true); 91 | 92 | if (s.IsCorruption()) { 93 | // Could be cause by blob file beinged GC-ed, or real corruption. 94 | // TODO(yiwu): Tell the two cases apart. 95 | return Decision::kKeep; 96 | } else if (s.ok()) { 97 | auto decision = original_filter_->UnsafeFilter( 98 | level, user_key, kValue, record.value, new_value, skip_until); 99 | 100 | // It would be a problem if it change the value whereas the value_type 101 | // is still kBlobIndex. For now, just returns kKeep. 
102 | // TODO: we should make rocksdb Filter API support changing value_type 103 | // assert(decision != CompactionFilter::Decision::kChangeValue); 104 | if (decision == Decision::kChangeValue) { 105 | { 106 | MutexLock l(&db_->mutex_); 107 | db_->SetBGError(Status::NotSupported( 108 | "It would be a problem if it change the value whereas the " 109 | "value_type is still kBlobIndex.")); 110 | } 111 | decision = Decision::kKeep; 112 | } 113 | return decision; 114 | } else { 115 | { 116 | MutexLock l(&db_->mutex_); 117 | db_->SetBGError(s); 118 | } 119 | // GetBlobRecord failed, keep the value. 120 | return Decision::kKeep; 121 | } 122 | } 123 | 124 | private: 125 | TitanDBImpl *db_; 126 | const std::string cf_name_; 127 | std::shared_ptr blob_storage_; 128 | const CompactionFilter *original_filter_; 129 | const std::unique_ptr owned_filter_; 130 | bool skip_value_; 131 | std::string filter_name_; 132 | }; 133 | 134 | class TitanCompactionFilterFactory final : public CompactionFilterFactory { 135 | public: 136 | TitanCompactionFilterFactory( 137 | const CompactionFilter *original_filter, 138 | std::shared_ptr original_filter_factory, 139 | TitanDBImpl *db, bool skip_value, const std::string &cf_name) 140 | : original_filter_(original_filter), 141 | original_filter_factory_(original_filter_factory), 142 | titan_db_impl_(db), 143 | skip_value_(skip_value), 144 | cf_name_(cf_name) { 145 | assert(original_filter != nullptr || original_filter_factory != nullptr); 146 | if (original_filter_ != nullptr) { 147 | factory_name_ = std::string("TitanCompactionFilterFactory.") 148 | .append(original_filter_->Name()); 149 | } else { 150 | factory_name_ = std::string("TitanCompactionFilterFactory.") 151 | .append(original_filter_factory_->Name()); 152 | } 153 | } 154 | 155 | const char *Name() const override { return factory_name_.c_str(); } 156 | 157 | std::unique_ptr CreateCompactionFilter( 158 | const CompactionFilter::Context &context) override { 159 | assert(original_filter_ 
!= nullptr || original_filter_factory_ != nullptr); 160 | 161 | std::shared_ptr blob_storage; 162 | { 163 | MutexLock l(&titan_db_impl_->mutex_); 164 | blob_storage = titan_db_impl_->blob_file_set_ 165 | ->GetBlobStorage(context.column_family_id) 166 | .lock(); 167 | } 168 | if (blob_storage == nullptr) { 169 | assert(false); 170 | // Shouldn't be here, but ignore compaction filter when we hit error. 171 | return nullptr; 172 | } 173 | 174 | const CompactionFilter *original_filter = original_filter_; 175 | std::unique_ptr original_filter_from_factory; 176 | if (original_filter == nullptr) { 177 | original_filter_from_factory = 178 | original_filter_factory_->CreateCompactionFilter(context); 179 | original_filter = original_filter_from_factory.get(); 180 | } 181 | 182 | if (original_filter == nullptr) { 183 | return nullptr; 184 | } 185 | 186 | return std::unique_ptr(new TitanCompactionFilter( 187 | titan_db_impl_, cf_name_, original_filter, 188 | std::move(original_filter_from_factory), blob_storage, skip_value_)); 189 | } 190 | 191 | private: 192 | const CompactionFilter *original_filter_; 193 | std::shared_ptr original_filter_factory_; 194 | TitanDBImpl *titan_db_impl_; 195 | bool skip_value_; 196 | const std::string cf_name_; 197 | std::string factory_name_; 198 | }; 199 | 200 | } // namespace titandb 201 | } // namespace rocksdb 202 | -------------------------------------------------------------------------------- /src/compaction_filter_test.cc: -------------------------------------------------------------------------------- 1 | #include "db_impl.h" 2 | #include "test_util/testharness.h" 3 | 4 | namespace rocksdb { 5 | namespace titandb { 6 | 7 | class TestCompactionFilter : public CompactionFilter { 8 | public: 9 | explicit TestCompactionFilter(uint64_t min_blob_size) 10 | : min_blob_size_(min_blob_size) {} 11 | 12 | const char *Name() const override { return "DeleteCompactionFilter"; } 13 | 14 | bool Filter(int level, const Slice &key, const Slice &value, 15 
| std::string * /*&new_value*/, 16 | bool * /*value_changed*/) const override { 17 | AssertValue(key, value); 18 | return !value.starts_with("remain"); 19 | } 20 | 21 | private: 22 | void AssertValue(const Slice &key, const Slice &value) const { 23 | if (key.ToString() == "mykey") { 24 | ASSERT_EQ(value.ToString(), "myvalue"); 25 | } 26 | if (key.ToString() == "bigkey") { 27 | ASSERT_EQ(value.ToString(), std::string(min_blob_size_ + 1, 'v')); 28 | } 29 | if (key.starts_with("bigkey100")) { 30 | ASSERT_EQ(key.size(), 100); 31 | } 32 | if (key.starts_with("skip")) { 33 | ASSERT_EQ(value, Slice()); 34 | } 35 | } 36 | 37 | uint64_t min_blob_size_; 38 | }; 39 | 40 | class TitanCompactionFilterTest : public testing::Test { 41 | public: 42 | TitanCompactionFilterTest() : dbname_(test::TmpDir()) { 43 | options_.dirname = dbname_ + "/titandb"; 44 | options_.create_if_missing = true; 45 | options_.disable_background_gc = true; 46 | options_.disable_auto_compactions = true; 47 | options_.compaction_filter = 48 | new TestCompactionFilter(options_.min_blob_size); 49 | 50 | DeleteDir(options_.dirname); 51 | DeleteDir(dbname_); 52 | } 53 | 54 | ~TitanCompactionFilterTest() override { 55 | Close(); 56 | delete options_.compaction_filter; 57 | DeleteDir(options_.dirname); 58 | DeleteDir(dbname_); 59 | } 60 | 61 | static void DeleteDir(const std::string &dirname) { 62 | Env *env = Env::Default(); 63 | std::vector filenames; 64 | env->GetChildren(dirname, &filenames); 65 | 66 | for (auto &fname : filenames) { 67 | env->DeleteFile(dirname + "/" + fname); 68 | } 69 | env->DeleteDir(dirname); 70 | } 71 | 72 | void Open() { 73 | ASSERT_OK(TitanDB::Open(options_, dbname_, &db_)); 74 | db_impl_ = reinterpret_cast(db_); 75 | } 76 | 77 | void Close() { 78 | if (!db_) return; 79 | 80 | ASSERT_OK(db_->Close()); 81 | delete db_; 82 | db_ = nullptr; 83 | } 84 | 85 | Status Get(const std::string &key, std::string *value) { 86 | ReadOptions ropts; 87 | return db_->Get(ropts, key, value); 88 | } 89 
| 90 | Status Put(const std::string &key, const std::string &value) { 91 | WriteOptions wopts; 92 | return db_->Put(wopts, key, value); 93 | } 94 | 95 | std::string GetBigValue() { 96 | return std::string(options_.min_blob_size + 1, 'v'); 97 | } 98 | 99 | void CompactAll() { 100 | CompactRangeOptions copts; 101 | ASSERT_OK(db_->CompactRange(copts, nullptr, nullptr)); 102 | } 103 | 104 | protected: 105 | std::string dbname_; 106 | TitanOptions options_; 107 | TitanDB *db_{nullptr}; 108 | TitanDBImpl *db_impl_{nullptr}; 109 | }; 110 | 111 | TEST_F(TitanCompactionFilterTest, CompactNormalValue) { 112 | Open(); 113 | 114 | Status s = Put("mykey", "myvalue"); 115 | ASSERT_OK(s); 116 | 117 | std::string value; 118 | s = Get("mykey", &value); 119 | ASSERT_OK(s); 120 | ASSERT_EQ(value, "myvalue"); 121 | 122 | CompactAll(); 123 | 124 | s = Get("mykey", &value); 125 | ASSERT_TRUE(s.IsNotFound()); 126 | } 127 | 128 | TEST_F(TitanCompactionFilterTest, CompactBlobValue) { 129 | Open(); 130 | 131 | std::string value = GetBigValue(); 132 | ASSERT_GT(value.length(), options_.min_blob_size); 133 | Status s = Put("bigkey", value); 134 | ASSERT_OK(s); 135 | 136 | std::string value1; 137 | s = Get("bigkey", &value1); 138 | ASSERT_OK(s); 139 | ASSERT_EQ(value1, value); 140 | 141 | char keybuf[1024] = {'\0'}; 142 | for (int i = 0; i < 1000; i++) { 143 | memset(keybuf, 0, 1024); 144 | snprintf(keybuf, 1024, "bigkey100_%090d", i); 145 | s = Put(keybuf, value); 146 | ASSERT_OK(s); 147 | } 148 | 149 | CompactAll(); 150 | 151 | s = Get("bigkey", &value1); 152 | ASSERT_TRUE(s.IsNotFound()); 153 | } 154 | 155 | TEST_F(TitanCompactionFilterTest, CompactUpdateValue) { 156 | options_.blob_file_discardable_ratio = 0.01; 157 | options_.min_blob_size = 1; 158 | options_.target_file_size_base = 1; 159 | Open(); 160 | 161 | ASSERT_OK(db_->Put(WriteOptions(), "update-key", "remain1")); 162 | ASSERT_OK(db_->Put(WriteOptions(), "update-another-key", "remain2")); 163 | 
ASSERT_OK(db_->Flush(FlushOptions())); 164 | ASSERT_OK(db_->Put(WriteOptions(), "update-key", "value")); 165 | ASSERT_OK(db_->Flush(FlushOptions())); 166 | ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); 167 | 168 | uint32_t cf_id = db_->DefaultColumnFamily()->GetID(); 169 | ASSERT_OK(db_impl_->TEST_StartGC(cf_id)); 170 | ASSERT_OK(db_impl_->TEST_PurgeObsoleteFiles()); 171 | ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); 172 | 173 | std::string value; 174 | ASSERT_TRUE(db_->Get(ReadOptions(), "update-key", &value).IsNotFound()); 175 | ASSERT_OK(db_->Get(ReadOptions(), "update-another-key", &value)); 176 | ASSERT_EQ(value, "remain2"); 177 | } 178 | 179 | TEST_F(TitanCompactionFilterTest, CompactSkipValue) { 180 | options_.skip_value_in_compaction_filter = true; 181 | Open(); 182 | 183 | ASSERT_OK(db_->Put(WriteOptions(), "skip-key", "skip-value")); 184 | ASSERT_OK(db_->Flush(FlushOptions())); 185 | ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); 186 | 187 | std::string value; 188 | ASSERT_TRUE(db_->Get(ReadOptions(), "skip-key", &value).IsNotFound()); 189 | } 190 | 191 | TEST_F(TitanCompactionFilterTest, FilterNewColumnFamily) { 192 | options_.skip_value_in_compaction_filter = true; 193 | Open(); 194 | TitanCFDescriptor desc("last_summer", options_); 195 | ColumnFamilyHandle *handle = nullptr; 196 | ASSERT_OK(db_->CreateColumnFamily(desc, &handle)); 197 | 198 | ASSERT_OK(db_->Put(WriteOptions(), handle, "skip-key", "skip-value")); 199 | ASSERT_OK(db_->Flush(FlushOptions(), handle)); 200 | ASSERT_OK(db_->CompactRange(CompactRangeOptions(), handle, nullptr, nullptr)); 201 | 202 | std::string value; 203 | ASSERT_TRUE(db_->Get(ReadOptions(), handle, "skip-key", &value).IsNotFound()); 204 | ASSERT_OK(db_->DestroyColumnFamilyHandle(handle)); 205 | } 206 | 207 | } // namespace titandb 208 | } // namespace rocksdb 209 | 210 | int main(int argc, char **argv) { 211 | ::testing::InitGoogleTest(&argc, argv); 
212 | return RUN_ALL_TESTS(); 213 | } -------------------------------------------------------------------------------- /src/db.cc: -------------------------------------------------------------------------------- 1 | #include "titan/db.h" 2 | 3 | #include "db_impl.h" 4 | 5 | namespace rocksdb { 6 | namespace titandb { 7 | 8 | Status TitanDB::Open(const TitanOptions& options, const std::string& dbname, 9 | TitanDB** db) { 10 | TitanDBOptions db_options(options); 11 | TitanCFOptions cf_options(options); 12 | std::vector descs; 13 | descs.emplace_back(kDefaultColumnFamilyName, cf_options); 14 | std::vector handles; 15 | Status s = TitanDB::Open(db_options, dbname, descs, &handles, db); 16 | if (s.ok()) { 17 | assert(handles.size() == 1); 18 | // DBImpl is always holding the default handle. 19 | delete handles[0]; 20 | } 21 | return s; 22 | } 23 | 24 | Status TitanDB::Open(const TitanDBOptions& db_options, 25 | const std::string& dbname, 26 | const std::vector& descs, 27 | std::vector* handles, TitanDB** db) { 28 | auto impl = new TitanDBImpl(db_options, dbname); 29 | auto s = impl->Open(descs, handles); 30 | if (s.ok()) { 31 | *db = impl; 32 | } else { 33 | *db = nullptr; 34 | delete impl; 35 | } 36 | return s; 37 | } 38 | 39 | } // namespace titandb 40 | } // namespace rocksdb 41 | -------------------------------------------------------------------------------- /src/db_impl_files.cc: -------------------------------------------------------------------------------- 1 | #include "db_impl.h" 2 | #include "titan_logging.h" 3 | 4 | namespace rocksdb { 5 | namespace titandb { 6 | 7 | Status TitanDBImpl::PurgeObsoleteFilesImpl() { 8 | Status s; 9 | 10 | MutexLock delete_file_lock(&delete_titandb_file_mutex_); 11 | if (disable_titandb_file_deletions_ > 0) { 12 | return s; 13 | } 14 | 15 | std::vector candidate_files; 16 | auto oldest_sequence = GetOldestSnapshotSequence(); 17 | { 18 | MutexLock l(&mutex_); 19 | blob_file_set_->GetObsoleteFiles(&candidate_files, 
oldest_sequence); 20 | } 21 | 22 | // dedup state.inputs so we don't try to delete the same 23 | // file twice 24 | std::sort(candidate_files.begin(), candidate_files.end()); 25 | candidate_files.erase( 26 | std::unique(candidate_files.begin(), candidate_files.end()), 27 | candidate_files.end()); 28 | 29 | for (const auto& candidate_file : candidate_files) { 30 | TITAN_LOG_INFO(db_options_.info_log, "Titan deleting obsolete file [%s]", 31 | candidate_file.c_str()); 32 | Status delete_status = env_->DeleteFile(candidate_file); 33 | if (!s.ok()) { 34 | // Move on despite error deleting the file. 35 | TITAN_LOG_ERROR(db_options_.info_log, 36 | "Titan deleting file [%s] failed, status:%s", 37 | candidate_file.c_str(), s.ToString().c_str()); 38 | s = delete_status; 39 | } 40 | } 41 | return s; 42 | } 43 | 44 | void TitanDBImpl::PurgeObsoleteFiles() { 45 | Status s __attribute__((__unused__)) = PurgeObsoleteFilesImpl(); 46 | assert(s.ok()); 47 | } 48 | 49 | Status TitanDBImpl::TEST_PurgeObsoleteFiles() { 50 | return PurgeObsoleteFilesImpl(); 51 | } 52 | 53 | } // namespace titandb 54 | } // namespace rocksdb 55 | -------------------------------------------------------------------------------- /src/db_iter.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #ifndef __STDC_FORMAT_MACROS 4 | #define __STDC_FORMAT_MACROS 5 | #endif 6 | 7 | #include 8 | 9 | #include 10 | #include 11 | 12 | #include "db/arena_wrapped_db_iter.h" 13 | #include "db/db_iter.h" 14 | #include "rocksdb/env.h" 15 | 16 | #include "blob_file_reader.h" 17 | #include "blob_format.h" 18 | #include "blob_storage.h" 19 | #include "titan_logging.h" 20 | #include "titan_stats.h" 21 | 22 | namespace rocksdb { 23 | namespace titandb { 24 | 25 | class TitanDBIterator : public Iterator { 26 | public: 27 | TitanDBIterator(const TitanReadOptions &options, BlobStorage *storage, 28 | std::shared_ptr snap, 29 | std::unique_ptr iter, SystemClock *clock, 30 | 
TitanStats *stats, Logger *info_log) 31 | : options_(options), 32 | storage_(storage), 33 | snap_(snap), 34 | iter_(std::move(iter)), 35 | clock_(clock), 36 | stats_(stats), 37 | info_log_(info_log) {} 38 | 39 | ~TitanDBIterator() { 40 | RecordInHistogram(statistics(stats_), TITAN_ITER_TOUCH_BLOB_FILE_COUNT, 41 | files_.size()); 42 | } 43 | 44 | bool Valid() const override { return iter_->Valid() && status_.ok(); } 45 | 46 | Status status() const override { 47 | // assume volatile inner iter 48 | if (status_.ok()) { 49 | return iter_->status(); 50 | } else { 51 | return status_; 52 | } 53 | } 54 | 55 | void SeekToFirst() override { 56 | iter_->SeekToFirst(); 57 | type_ = TITAN_NUM_SEEK; 58 | } 59 | 60 | void SeekToLast() override { 61 | iter_->SeekToLast(); 62 | type_ = TITAN_NUM_SEEK; 63 | } 64 | 65 | void Seek(const Slice &target) override { 66 | iter_->Seek(target); 67 | type_ = TITAN_NUM_SEEK; 68 | } 69 | 70 | void SeekForPrev(const Slice &target) override { 71 | iter_->SeekForPrev(target); 72 | type_ = TITAN_NUM_SEEK; 73 | } 74 | 75 | void Next() override { 76 | assert(Valid()); 77 | iter_->Next(); 78 | type_ = TITAN_NUM_NEXT; 79 | } 80 | 81 | void Prev() override { 82 | assert(Valid()); 83 | iter_->Prev(); 84 | type_ = TITAN_NUM_PREV; 85 | } 86 | 87 | Slice key() const override { 88 | assert(Valid()); 89 | return iter_->key(); 90 | } 91 | 92 | Slice value() const override { 93 | assert(Valid() && !options_.key_only); 94 | if (options_.key_only) return Slice(); 95 | if (!iter_->IsBlob()) return iter_->value(); 96 | 97 | HistogramType hist_type; 98 | switch (type_) { 99 | case TITAN_NUM_SEEK: 100 | hist_type = TITAN_SEEK_MICROS; 101 | break; 102 | case TITAN_NUM_NEXT: 103 | hist_type = TITAN_NEXT_MICROS; 104 | break; 105 | case TITAN_NUM_PREV: 106 | hist_type = TITAN_PREV_MICROS; 107 | break; 108 | default: 109 | hist_type = TITAN_SEEK_MICROS; 110 | assert(false); 111 | }; 112 | StopWatch sw(clock_, statistics(stats_), hist_type); 113 | 
RecordTick(statistics(stats_), type_); 114 | status_ = GetBlobValue(); 115 | if (!status_.ok()) { 116 | return Slice(); 117 | } 118 | return record_.value; 119 | } 120 | 121 | private: 122 | Status GetBlobValue() const { 123 | assert(iter_->status().ok()); 124 | 125 | Status s; 126 | BlobIndex index; 127 | s = DecodeInto(iter_->value(), &index); 128 | if (!s.ok()) { 129 | TITAN_LOG_ERROR( 130 | info_log_, "Titan iterator: failed to decode blob index %s: %s", 131 | iter_->value().ToString(true /*hex*/).c_str(), s.ToString().c_str()); 132 | if (options_.abort_on_failure) std::abort(); 133 | return s; 134 | } 135 | 136 | std::string cache_key; 137 | auto blob_cache = storage_->blob_cache(); 138 | if (blob_cache) { 139 | cache_key = storage_->EncodeBlobCache(index); 140 | bool cache_hit; 141 | s = storage_->TryGetBlobCache(cache_key, &record_, &buffer_, &cache_hit); 142 | if (!s.ok()) return s; 143 | if (cache_hit) return s; 144 | } 145 | 146 | auto it = files_.find(index.file_number); 147 | if (it == files_.end()) { 148 | std::unique_ptr prefetcher; 149 | s = storage_->NewPrefetcher(index.file_number, &prefetcher); 150 | if (!s.ok()) { 151 | TITAN_LOG_ERROR( 152 | info_log_, 153 | "Titan iterator: failed to create prefetcher for blob file %" PRIu64 154 | ": %s", 155 | index.file_number, s.ToString().c_str()); 156 | if (options_.abort_on_failure) std::abort(); 157 | return s; 158 | } 159 | it = files_.emplace(index.file_number, std::move(prefetcher)).first; 160 | } 161 | 162 | buffer_.Reset(); 163 | OwnedSlice blob; 164 | s = it->second->Get(options_, index.blob_handle, &record_, &blob); 165 | if (!s.ok()) { 166 | TITAN_LOG_ERROR( 167 | info_log_, 168 | "Titan iterator: failed to read blob value from file %" PRIu64 169 | ", offset %" PRIu64 ", size %" PRIu64 ": %s\n", 170 | index.file_number, index.blob_handle.offset, index.blob_handle.size, 171 | s.ToString().c_str()); 172 | if (options_.abort_on_failure) std::abort(); 173 | } 174 | 175 | if (blob_cache && 
options_.fill_cache) { 176 | Cache::Handle *cache_handle = nullptr; 177 | auto cache_value = new OwnedSlice(std::move(blob)); 178 | blob_cache->Insert(cache_key, cache_value, &kBlobValueCacheItemHelper, 179 | cache_value->size() + sizeof(*cache_value), 180 | &cache_handle, Cache::Priority::BOTTOM); 181 | buffer_.PinSlice(*cache_value, UnrefCacheHandle, blob_cache, 182 | cache_handle); 183 | } else { 184 | buffer_.PinSlice(blob, OwnedSlice::CleanupFunc, blob.release(), nullptr); 185 | } 186 | return s; 187 | } 188 | 189 | mutable Status status_; 190 | mutable BlobRecord record_; 191 | mutable PinnableSlice buffer_; 192 | TickerType type_; 193 | 194 | TitanReadOptions options_; 195 | BlobStorage *storage_; 196 | std::shared_ptr snap_; 197 | std::unique_ptr iter_; 198 | mutable std::unordered_map> 199 | files_; 200 | 201 | SystemClock *clock_; 202 | TitanStats *stats_; 203 | Logger *info_log_; 204 | }; 205 | 206 | } // namespace titandb 207 | } // namespace rocksdb 208 | -------------------------------------------------------------------------------- /src/options.cc: -------------------------------------------------------------------------------- 1 | #include "titan/options.h" 2 | 3 | #ifndef __STDC_FORMAT_MACROS 4 | #define __STDC_FORMAT_MACROS 5 | #endif 6 | 7 | #include 8 | 9 | #include "options/options_helper.h" 10 | #include "rocksdb/convenience.h" 11 | #include "titan_logging.h" 12 | 13 | namespace rocksdb { 14 | namespace titandb { 15 | 16 | void TitanDBOptions::Dump(Logger* logger) const { 17 | TITAN_LOG_HEADER(logger, "TitanDBOptions.dirname : %s", 18 | dirname.c_str()); 19 | TITAN_LOG_HEADER(logger, "TitanDBOptions.disable_background_gc : %d", 20 | static_cast(disable_background_gc)); 21 | TITAN_LOG_HEADER(logger, 22 | "TitanDBOptions.max_background_gc : %" PRIi32, 23 | max_background_gc); 24 | TITAN_LOG_HEADER(logger, 25 | "TitanDBOptions.purge_obsolete_files_period_sec: %" PRIu32, 26 | purge_obsolete_files_period_sec); 27 | TITAN_LOG_HEADER(logger, 28 | 
"TitanDBOptions.titan_stats_dump_period_sec: %" PRIu32, 29 | titan_stats_dump_period_sec); 30 | } 31 | 32 | TitanCFOptions::TitanCFOptions(const ColumnFamilyOptions& cf_opts, 33 | const ImmutableTitanCFOptions& immutable_opts, 34 | const MutableTitanCFOptions& mutable_opts) 35 | : ColumnFamilyOptions(cf_opts), 36 | min_blob_size(mutable_opts.min_blob_size), 37 | blob_file_compression(mutable_opts.blob_file_compression), 38 | blob_file_target_size(immutable_opts.blob_file_target_size), 39 | blob_cache(immutable_opts.blob_cache), 40 | max_gc_batch_size(immutable_opts.max_gc_batch_size), 41 | min_gc_batch_size(immutable_opts.min_gc_batch_size), 42 | blob_file_discardable_ratio(mutable_opts.blob_file_discardable_ratio), 43 | merge_small_file_threshold(immutable_opts.merge_small_file_threshold), 44 | blob_run_mode(mutable_opts.blob_run_mode), 45 | skip_value_in_compaction_filter( 46 | immutable_opts.skip_value_in_compaction_filter), 47 | block_size(immutable_opts.block_size), 48 | enable_punch_hole_gc(immutable_opts.enable_punch_hole_gc) {} 49 | 50 | void TitanCFOptions::Dump(Logger* logger) const { 51 | TITAN_LOG_HEADER(logger, 52 | "TitanCFOptions.min_blob_size : %" PRIu64, 53 | min_blob_size); 54 | std::string compression_str = "unknown"; 55 | for (auto& compression_type : compression_type_string_map) { 56 | if (compression_type.second == blob_file_compression) { 57 | compression_str = compression_type.first; 58 | break; 59 | } 60 | } 61 | TITAN_LOG_HEADER(logger, "TitanCFOptions.blob_file_compression : %s", 62 | compression_str.c_str()); 63 | TITAN_LOG_HEADER(logger, "TItanCFOptions.blob_file_compression_options: "); 64 | TITAN_LOG_HEADER(logger, " window_bits : %d", 65 | blob_file_compression_options.window_bits); 66 | TITAN_LOG_HEADER(logger, " level : %d", 67 | blob_file_compression_options.level); 68 | TITAN_LOG_HEADER(logger, " strategy : %d", 69 | blob_file_compression_options.strategy); 70 | TITAN_LOG_HEADER(logger, " max_dict_bytes : %" PRIu32, 71 | 
blob_file_compression_options.max_dict_bytes); 72 | TITAN_LOG_HEADER(logger, " zstd_max_train_bytes : %" PRIu32, 73 | blob_file_compression_options.zstd_max_train_bytes); 74 | TITAN_LOG_HEADER(logger, 75 | "TitanCFOptions.blob_file_target_size : %" PRIu64, 76 | blob_file_target_size); 77 | TITAN_LOG_HEADER(logger, "TitanCFOptions.blob_cache : %p", 78 | blob_cache.get()); 79 | if (blob_cache != nullptr) { 80 | TITAN_LOG_HEADER(logger, "%s", blob_cache->GetPrintableOptions().c_str()); 81 | } 82 | TITAN_LOG_HEADER(logger, 83 | "TitanCFOptions.max_gc_batch_size : %" PRIu64, 84 | max_gc_batch_size); 85 | TITAN_LOG_HEADER(logger, 86 | "TitanCFOptions.min_gc_batch_size : %" PRIu64, 87 | min_gc_batch_size); 88 | TITAN_LOG_HEADER(logger, "TitanCFOptions.blob_file_discardable_ratio : %lf", 89 | blob_file_discardable_ratio); 90 | TITAN_LOG_HEADER(logger, 91 | "TitanCFOptions.merge_small_file_threshold : %" PRIu64, 92 | merge_small_file_threshold); 93 | std::string blob_run_mode_str = "unknown"; 94 | if (blob_run_mode_to_string.count(blob_run_mode) > 0) { 95 | blob_run_mode_str = blob_run_mode_to_string.at(blob_run_mode); 96 | } 97 | TITAN_LOG_HEADER(logger, "TitanCFOptions.blob_run_mode : %s", 98 | blob_run_mode_str.c_str()); 99 | } 100 | 101 | void TitanCFOptions::UpdateMutableOptions( 102 | const MutableTitanCFOptions& new_options) { 103 | blob_run_mode = new_options.blob_run_mode; 104 | min_blob_size = new_options.min_blob_size; 105 | blob_file_compression = new_options.blob_file_compression; 106 | blob_file_discardable_ratio = new_options.blob_file_discardable_ratio; 107 | } 108 | 109 | std::map 110 | TitanOptionsHelper::blob_run_mode_to_string = { 111 | {TitanBlobRunMode::kNormal, "kNormal"}, 112 | {TitanBlobRunMode::kReadOnly, "kReadOnly"}, 113 | {TitanBlobRunMode::kFallback, "kFallback"}}; 114 | 115 | std::unordered_map 116 | TitanOptionsHelper::blob_run_mode_string_map = { 117 | {"kNormal", TitanBlobRunMode::kNormal}, 118 | {"kReadOnly", TitanBlobRunMode::kReadOnly}, 
119 | {"kFallback", TitanBlobRunMode::kFallback}}; 120 | 121 | } // namespace titandb 122 | } // namespace rocksdb 123 | -------------------------------------------------------------------------------- /src/table_builder.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "rocksdb/types.h" 4 | #include "table/table_builder.h" 5 | 6 | #include "blob_file_builder.h" 7 | #include "blob_file_manager.h" 8 | #include "blob_file_set.h" 9 | #include "titan/options.h" 10 | #include "titan_stats.h" 11 | 12 | namespace rocksdb { 13 | namespace titandb { 14 | 15 | class TitanTableBuilder : public TableBuilder { 16 | public: 17 | TitanTableBuilder(uint32_t cf_id, const TitanDBOptions& db_options, 18 | const TitanCFOptions& cf_options, 19 | std::unique_ptr base_builder, 20 | std::shared_ptr blob_manager, 21 | std::weak_ptr blob_storage, TitanStats* stats, 22 | int merge_level, int target_level) 23 | : cf_id_(cf_id), 24 | db_options_(db_options), 25 | cf_options_(cf_options), 26 | base_builder_(std::move(base_builder)), 27 | blob_manager_(blob_manager), 28 | blob_storage_(blob_storage), 29 | stats_(stats), 30 | target_level_(target_level), 31 | merge_level_(merge_level) {} 32 | 33 | void Add(const Slice& key, const Slice& value) override; 34 | 35 | Status status() const override; 36 | 37 | Status Finish() override; 38 | 39 | void Abandon() override; 40 | 41 | uint64_t NumEntries() const override; 42 | 43 | uint64_t FileSize() const override; 44 | 45 | bool NeedCompact() const override; 46 | 47 | TableProperties GetTableProperties() const override; 48 | 49 | IOStatus io_status() const override; 50 | 51 | std::string GetFileChecksum() const override; 52 | 53 | const char* GetFileChecksumFuncName() const override; 54 | 55 | private: 56 | friend class TableBuilderTest; 57 | 58 | bool ok() const { return status().ok(); } 59 | 60 | bool builder_unbuffered() const { 61 | return !blob_builder_ || 
blob_builder_->GetBuilderState() == 62 | BlobFileBuilder::BuilderState::kUnbuffered; 63 | } 64 | 65 | std::unique_ptr NewCachedRecordContext( 66 | const ParsedInternalKey& ikey, const Slice& value); 67 | 68 | void AddBlob(const ParsedInternalKey& ikey, const Slice& value); 69 | 70 | void AddBlobResultsToBase(const BlobFileBuilder::OutContexts& contexts); 71 | 72 | bool ShouldMerge(const std::shared_ptr& file); 73 | 74 | void FinishBlobFile(); 75 | 76 | void UpdateInternalOpStats(); 77 | 78 | Status GetBlobRecord(const BlobIndex& index, BlobRecord* record, 79 | OwnedSlice* buffer); 80 | 81 | void AddBase(const Slice& key, const ParsedInternalKey& parsedKey, 82 | const Slice& value); 83 | 84 | Status status_; 85 | uint32_t cf_id_; 86 | TitanDBOptions db_options_; 87 | TitanCFOptions cf_options_; 88 | std::unique_ptr base_builder_; 89 | std::unique_ptr blob_handle_; 90 | std::shared_ptr blob_manager_; 91 | std::unique_ptr blob_builder_; 92 | std::weak_ptr blob_storage_; 93 | std::vector< 94 | std::pair, std::unique_ptr>> 95 | finished_blobs_; 96 | std::unordered_map> 97 | input_file_prefetchers_; 98 | TitanStats* stats_; 99 | 100 | // target level in LSM-Tree for generated SSTs and blob files 101 | int target_level_; 102 | // with cf_options_.level_merge == true, if target_level_ is higher than or 103 | // equals to merge_level_, values belong to blob files which have lower level 104 | // than target_level_ will be merged to new blob file 105 | int merge_level_; 106 | 107 | // counters 108 | uint64_t bytes_read_ = 0; 109 | uint64_t bytes_written_ = 0; 110 | uint64_t io_bytes_read_ = 0; 111 | uint64_t io_bytes_written_ = 0; 112 | uint64_t gc_num_keys_relocated_ = 0; 113 | uint64_t gc_bytes_relocated_ = 0; 114 | uint64_t error_read_cnt_ = 0; 115 | }; 116 | 117 | } // namespace titandb 118 | } // namespace rocksdb 119 | -------------------------------------------------------------------------------- /src/table_factory.cc: 
-------------------------------------------------------------------------------- 1 | #include "table_factory.h" 2 | 3 | #include "db_impl.h" 4 | #include "table_builder.h" 5 | 6 | namespace rocksdb { 7 | namespace titandb { 8 | 9 | Status TitanTableFactory::NewTableReader( 10 | const ReadOptions &ro, const TableReaderOptions &options, 11 | std::unique_ptr &&file, uint64_t file_size, 12 | std::unique_ptr *result, 13 | bool prefetch_index_and_filter_in_cache) const { 14 | return base_factory_->NewTableReader(ro, options, std::move(file), file_size, 15 | result, 16 | prefetch_index_and_filter_in_cache); 17 | } 18 | 19 | TableBuilder *TitanTableFactory::NewTableBuilder( 20 | const TableBuilderOptions &options, WritableFileWriter *file) const { 21 | std::unique_ptr base_builder( 22 | base_factory_->NewTableBuilder(options, file)); 23 | // When opening base DB, it may trigger flush L0. But blob_file_set_ is not 24 | // opened yet, then it would write to the uninitialized manifest writer. So do 25 | // not enable titan table builder now. 26 | if (!blob_file_set_->IsOpened()) { 27 | return base_builder.release(); 28 | } 29 | TitanCFOptions cf_options = cf_options_; 30 | cf_options.UpdateMutableOptions(mutable_cf_options_); 31 | 32 | std::weak_ptr blob_storage; 33 | 34 | // since we force use dynamic_level_bytes=true when level_merge=true, the last 35 | // level of a cf is always cf_options.num_levels - 1. 
36 | int num_levels = cf_options.num_levels; 37 | 38 | { 39 | MutexLock l(db_mutex_); 40 | blob_storage = blob_file_set_->GetBlobStorage(options.column_family_id); 41 | } 42 | 43 | return new TitanTableBuilder( 44 | options.column_family_id, db_options_, cf_options, 45 | std::move(base_builder), blob_manager_, blob_storage, stats_, 46 | std::max(1, num_levels - 2) /* merge level */, options.level_at_creation); 47 | } 48 | 49 | } // namespace titandb 50 | } // namespace rocksdb 51 | -------------------------------------------------------------------------------- /src/table_factory.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include "rocksdb/table.h" 6 | 7 | #include "blob_file_manager.h" 8 | #include "blob_file_set.h" 9 | #include "titan/options.h" 10 | #include "titan_stats.h" 11 | 12 | namespace rocksdb { 13 | namespace titandb { 14 | 15 | class TitanDBImpl; 16 | 17 | class TitanTableFactory : public TableFactory { 18 | public: 19 | TitanTableFactory(const TitanDBOptions& db_options, 20 | const TitanCFOptions& cf_options, 21 | std::shared_ptr blob_manager, 22 | port::Mutex* db_mutex, BlobFileSet* blob_file_set, 23 | TitanStats* stats) 24 | : db_options_(db_options), 25 | cf_options_(cf_options), 26 | mutable_cf_options_(cf_options), 27 | base_factory_(cf_options.table_factory), 28 | blob_manager_(blob_manager), 29 | db_mutex_(db_mutex), 30 | blob_file_set_(blob_file_set), 31 | stats_(stats) {} 32 | 33 | const char* Name() const override { return "TitanTable"; } 34 | 35 | using TableFactory::NewTableReader; 36 | 37 | Status NewTableReader( 38 | const ReadOptions& ro, const TableReaderOptions& options, 39 | std::unique_ptr&& file, uint64_t file_size, 40 | std::unique_ptr* result, 41 | bool prefetch_index_and_filter_in_cache = true) const override; 42 | 43 | TableBuilder* NewTableBuilder(const TableBuilderOptions& options, 44 | WritableFileWriter* file) const override; 45 | 46 | void 
SetMutableCFOptions(const MutableTitanCFOptions& mutable_cf_options) { 47 | db_mutex_->AssertHeld(); 48 | mutable_cf_options_ = mutable_cf_options; 49 | } 50 | 51 | bool IsDeleteRangeSupported() const override { 52 | return base_factory_->IsDeleteRangeSupported(); 53 | } 54 | 55 | private: 56 | const TitanDBOptions db_options_; 57 | const TitanCFOptions cf_options_; 58 | MutableTitanCFOptions mutable_cf_options_; 59 | 60 | std::shared_ptr base_factory_; 61 | std::shared_ptr blob_manager_; 62 | port::Mutex* db_mutex_; 63 | BlobFileSet* blob_file_set_; 64 | TitanStats* stats_; 65 | }; 66 | 67 | } // namespace titandb 68 | } // namespace rocksdb 69 | -------------------------------------------------------------------------------- /src/testutil.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "rocksdb/cache.h" 4 | #include "test_util/testharness.h" 5 | #include "util/compression.h" 6 | 7 | namespace rocksdb { 8 | namespace titandb { 9 | 10 | template 11 | void CheckCodec(const T& input) { 12 | std::string buffer; 13 | input.EncodeTo(&buffer); 14 | T output; 15 | ASSERT_OK(DecodeInto(buffer, &output)); 16 | ASSERT_EQ(output, input); 17 | } 18 | 19 | } // namespace titandb 20 | } // namespace rocksdb 21 | -------------------------------------------------------------------------------- /src/titan_checkpoint_impl.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "file/filename.h" 4 | 5 | #include "titan/checkpoint.h" 6 | 7 | namespace rocksdb { 8 | namespace titandb { 9 | 10 | class VersionEdit; 11 | 12 | class TitanCheckpointImpl : public Checkpoint { 13 | public: 14 | explicit TitanCheckpointImpl(TitanDB* db) : db_(db) {} 15 | 16 | // Follow these steps to build an openable snapshot of TitanDB: 17 | // (1) Create base db checkpoint. 
18 | // (2) Hard linked all existing blob files(live + obsolete) if the output 19 | // directory is on the same filesystem, and copied otherwise. 20 | // (3) Create MANIFEST file include all records about existing blob files. 21 | // (4) Craft CURRENT file manually based on MANIFEST file number. 22 | // This will include redundant blob files, but hopefully not a lot of them, 23 | // and on restart Titan will recalculate GC stats and GC out those redundant 24 | // blob files. 25 | using Checkpoint::CreateCheckpoint; 26 | virtual Status CreateCheckpoint(const std::string& base_checkpoint_dir, 27 | const std::string& titan_checkpoint_dir = "", 28 | uint64_t log_size_for_flush = 0) override; 29 | 30 | // Checkpoint logic can be customized by providing callbacks for link, copy, 31 | // or create. 32 | Status CreateCustomCheckpoint( 33 | const TitanDBOptions& titandb_options, 34 | std::function 36 | link_file_cb, 37 | std::function 40 | copy_file_cb, 41 | std::function 43 | create_file_cb, 44 | uint64_t log_size_for_flush, const std::string full_private_path); 45 | 46 | private: 47 | void CleanStagingDirectory(const std::string& path, Logger* info_log); 48 | 49 | // Create titan manifest file based on the content of VersionEdit 50 | Status CreateTitanManifest(const std::string& file_name, 51 | std::vector* edits); 52 | 53 | private: 54 | TitanDB* db_; 55 | }; 56 | 57 | } // namespace titandb 58 | } // namespace rocksdb 59 | -------------------------------------------------------------------------------- /src/titan_fault_injection_test_env.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include "rocksdb/env.h" 6 | #include "utilities/fault_injection_env.h" 7 | 8 | namespace rocksdb { 9 | namespace titandb { 10 | 11 | class TitanFaultInjectionTestEnv; 12 | 13 | class TitanTestRandomAccessFile : public RandomAccessFile { 14 | public: 15 | explicit TitanTestRandomAccessFile(std::unique_ptr&& f, 16 
| TitanFaultInjectionTestEnv* env) 17 | : target_(std::move(f)), env_(env) { 18 | assert(target_ != nullptr); 19 | } 20 | virtual ~TitanTestRandomAccessFile() {} 21 | Status Read(uint64_t offset, size_t n, Slice* result, 22 | char* scratch) const override; 23 | Status Prefetch(uint64_t offset, size_t n) override; 24 | size_t GetUniqueId(char* id, size_t max_size) const override { 25 | return target_->GetUniqueId(id, max_size); 26 | } 27 | void Hint(AccessPattern pattern) override { return target_->Hint(pattern); } 28 | bool use_direct_io() const override { return target_->use_direct_io(); } 29 | size_t GetRequiredBufferAlignment() const override { 30 | return target_->GetRequiredBufferAlignment(); 31 | } 32 | Status InvalidateCache(size_t offset, size_t length) override; 33 | 34 | private: 35 | std::unique_ptr target_; 36 | TitanFaultInjectionTestEnv* env_; 37 | }; 38 | 39 | class TitanFaultInjectionTestEnv : public FaultInjectionTestEnv { 40 | public: 41 | TitanFaultInjectionTestEnv(Env* t) : FaultInjectionTestEnv(t) {} 42 | virtual ~TitanFaultInjectionTestEnv() {} 43 | Status NewRandomAccessFile(const std::string& fname, 44 | std::unique_ptr* result, 45 | const EnvOptions& soptions) { 46 | if (!IsFilesystemActive()) { 47 | return GetError(); 48 | } 49 | Status s = target()->NewRandomAccessFile(fname, result, soptions); 50 | if (s.ok()) { 51 | result->reset(new TitanTestRandomAccessFile(std::move(*result), this)); 52 | } 53 | return s; 54 | } 55 | }; 56 | 57 | Status TitanTestRandomAccessFile::Read(uint64_t offset, size_t n, Slice* result, 58 | char* scratch) const { 59 | if (!env_->IsFilesystemActive()) { 60 | return env_->GetError(); 61 | } 62 | return target_->Read(offset, n, result, scratch); 63 | } 64 | 65 | Status TitanTestRandomAccessFile::Prefetch(uint64_t offset, size_t n) { 66 | if (!env_->IsFilesystemActive()) { 67 | return env_->GetError(); 68 | } 69 | return target_->Prefetch(offset, n); 70 | } 71 | 72 | Status 
TitanTestRandomAccessFile::InvalidateCache(size_t offset, 73 | size_t length) { 74 | if (!env_->IsFilesystemActive()) { 75 | return env_->GetError(); 76 | } 77 | return target_->InvalidateCache(offset, length); 78 | } 79 | 80 | } // namespace titandb 81 | } // namespace rocksdb 82 | -------------------------------------------------------------------------------- /src/titan_logging.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 2 | // This source code is licensed under both the GPLv2 (found in the 3 | // COPYING file in the root directory) and Apache 2.0 License 4 | // (found in the LICENSE.Apache file in the root directory). 5 | // 6 | // Copyright (c) 2011 The LevelDB Authors. All rights reserved. 7 | // Use of this source code is governed by a BSD-style license that can be 8 | // found in the LICENSE file. See the AUTHORS file for names of contributors. 9 | // 10 | // Must not be included from any .h files to avoid polluting the namespace 11 | // with macros. 12 | 13 | #pragma once 14 | 15 | // This file is adapted from rocksdb/logging/logging.h. 16 | // We make a copy of this file because we want to have source files under 17 | // Titan directory being shorten. 18 | 19 | // Helper macros that include information about file name and line number 20 | #define TITAN_LOG_STRINGIFY(x) #x 21 | #define TITAN_LOG_TOSTRING(x) TITAN_LOG_STRINGIFY(x) 22 | #define TITAN_LOG_PREPEND_FILE_LINE(FMT) \ 23 | ("[titan/%s:" TITAN_LOG_TOSTRING(__LINE__) "] " FMT) 24 | 25 | inline const char* TitanLogShorterFileName(const char* file) { 26 | // 16 is the length of "titan_logging.h". 27 | // If the name of this file changed, please change this number, too. 28 | return file + (sizeof(__FILE__) > 16 ? sizeof(__FILE__) - 16 : 0); 29 | } 30 | 31 | // Don't inclide file/line info in HEADER level 32 | #define TITAN_LOG_HEADER(LGR, FMT, ...) 
\ 33 | rocksdb::Log(InfoLogLevel::HEADER_LEVEL, LGR, FMT, ##__VA_ARGS__) 34 | 35 | #define TITAN_LOG_DEBUG(LGR, FMT, ...) \ 36 | rocksdb::Log(InfoLogLevel::DEBUG_LEVEL, LGR, \ 37 | TITAN_LOG_PREPEND_FILE_LINE(FMT), \ 38 | TitanLogShorterFileName(__FILE__), ##__VA_ARGS__) 39 | 40 | #define TITAN_LOG_INFO(LGR, FMT, ...) \ 41 | rocksdb::Log(InfoLogLevel::INFO_LEVEL, LGR, \ 42 | TITAN_LOG_PREPEND_FILE_LINE(FMT), \ 43 | TitanLogShorterFileName(__FILE__), ##__VA_ARGS__) 44 | 45 | #define TITAN_LOG_WARN(LGR, FMT, ...) \ 46 | rocksdb::Log(InfoLogLevel::WARN_LEVEL, LGR, \ 47 | TITAN_LOG_PREPEND_FILE_LINE(FMT), \ 48 | TitanLogShorterFileName(__FILE__), ##__VA_ARGS__) 49 | 50 | #define TITAN_LOG_ERROR(LGR, FMT, ...) \ 51 | rocksdb::Log(InfoLogLevel::ERROR_LEVEL, LGR, \ 52 | TITAN_LOG_PREPEND_FILE_LINE(FMT), \ 53 | TitanLogShorterFileName(__FILE__), ##__VA_ARGS__) 54 | 55 | #define TITAN_LOG_FATAL(LGR, FMT, ...) \ 56 | rocksdb::Log(InfoLogLevel::FATAL_LEVEL, LGR, \ 57 | TITAN_LOG_PREPEND_FILE_LINE(FMT), \ 58 | TitanLogShorterFileName(__FILE__), ##__VA_ARGS__) 59 | 60 | #define TITAN_LOG_BUFFER(LOG_BUF, FMT, ...) \ 61 | rocksdb::LogToBuffer(LOG_BUF, TITAN_LOG_PREPEND_FILE_LINE(FMT), \ 62 | TitanLogShorterFileName(__FILE__), ##__VA_ARGS__) 63 | 64 | #define TITAN_LOG_BUFFER_MAX_SZ(LOG_BUF, MAX_LOG_SIZE, FMT, ...) \ 65 | rocksdb::LogToBuffer(LOG_BUF, MAX_LOG_SIZE, \ 66 | TITAN_LOG_PREPEND_FILE_LINE(FMT), \ 67 | TitanLogShorterFileName(__FILE__), ##__VA_ARGS__) 68 | 69 | #define TITAN_LOG_DETAILS(LGR, FMT, ...) 
\ 70 | ; // due to overhead by default skip such lines 71 | // TITAN_LOG_DEBUG(LGR, FMT, ##__VA_ARGS__) 72 | -------------------------------------------------------------------------------- /src/titan_options_test.cc: -------------------------------------------------------------------------------- 1 | #include "test_util/testharness.h" 2 | 3 | #include "titan/db.h" 4 | 5 | namespace rocksdb { 6 | namespace titandb { 7 | 8 | class TitanOptionsTest : public testing::Test { 9 | public: 10 | TitanOptionsTest() : db_name_(test::TmpDir()) { 11 | titan_options_.create_if_missing = true; 12 | titan_options_.dirname = db_name_ + "/titandb"; 13 | } 14 | 15 | ~TitanOptionsTest() { 16 | Status s = Close(); 17 | assert(s.ok()); 18 | } 19 | 20 | Status Open() { return TitanDB::Open(titan_options_, db_name_, &titan_db); } 21 | 22 | Status DeleteDir(const std::string& dirname) { 23 | Status s; 24 | Env* env = Env::Default(); 25 | std::vector filenames; 26 | s = env->GetChildren(dirname, &filenames); 27 | if (!s.ok()) { 28 | return s; 29 | } 30 | for (auto& fname : filenames) { 31 | s = env->DeleteFile(dirname + "/" + fname); 32 | if (!s.ok()) { 33 | return s; 34 | } 35 | } 36 | s = env->DeleteDir(dirname); 37 | return s; 38 | } 39 | 40 | Status Close() { 41 | Status s; 42 | if (titan_db != nullptr) { 43 | s = titan_db->Close(); 44 | if (!s.ok()) { 45 | return s; 46 | } 47 | titan_db = nullptr; 48 | s = DeleteDir(titan_options_.dirname); 49 | if (!s.ok()) { 50 | return s; 51 | } 52 | rocksdb::Options opts; 53 | s = rocksdb::DestroyDB(db_name_, opts); 54 | } 55 | return s; 56 | } 57 | 58 | protected: 59 | std::string db_name_; 60 | TitanOptions titan_options_; 61 | TitanDB* titan_db = nullptr; 62 | }; 63 | 64 | TEST_F(TitanOptionsTest, LevelMerge) { 65 | titan_options_.level_merge = true; 66 | titan_options_.level_compaction_dynamic_level_bytes = false; 67 | Status s = Open(); 68 | ASSERT_TRUE(s.IsInvalidArgument()); 69 | } 70 | 71 | } // namespace titandb 72 | } // namespace 
rocksdb 73 | 74 | int main(int argc, char** argv) { 75 | ::testing::InitGoogleTest(&argc, argv); 76 | return RUN_ALL_TESTS(); 77 | } 78 | -------------------------------------------------------------------------------- /src/titan_stats.cc: -------------------------------------------------------------------------------- 1 | #include "titan_stats.h" 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | #include "monitoring/statistics_impl.h" 8 | 9 | #include "blob_file_set.h" 10 | #include "blob_storage.h" 11 | #include "titan/db.h" 12 | 13 | namespace rocksdb { 14 | namespace titandb { 15 | 16 | std::shared_ptr CreateDBStatistics() { 17 | return rocksdb::CreateDBStatistics(); 19 | } 20 | 21 | static const std::string titandb_prefix = "rocksdb.titandb."; 22 | 23 | static const std::string num_blob_files_at_level_prefix = 24 | "num-blob-files-at-level"; 25 | static const std::string live_blob_size = "live-blob-size"; 26 | static const std::string num_live_blob_file = "num-live-blob-file"; 27 | static const std::string num_obsolete_blob_file = "num-obsolete-blob-file"; 28 | static const std::string live_blob_file_size = "live-blob-file-size"; 29 | static const std::string obsolete_blob_file_size = "obsolete-blob-file-size"; 30 | static const std::string num_discardable_ratio_le0_file = 31 | "num-discardable-ratio-le0-file"; 32 | static const std::string num_discardable_ratio_le20_file = 33 | "num-discardable-ratio-le20-file"; 34 | static const std::string num_discardable_ratio_le50_file = 35 | "num-discardable-ratio-le50-file"; 36 | static const std::string num_discardable_ratio_le80_file = 37 | "num-discardable-ratio-le80-file"; 38 | static const std::string num_discardable_ratio_le100_file = 39 | "num-discardable-ratio-le100-file"; 40 | 41 | const std::string TitanDB::Properties::kNumBlobFilesAtLevelPrefix = 42 | titandb_prefix + num_blob_files_at_level_prefix; 43 | const std::string TitanDB::Properties::kLiveBlobSize = 44 | titandb_prefix + live_blob_size; 45 | const 
std::string TitanDB::Properties::kNumLiveBlobFile = 46 | titandb_prefix + num_live_blob_file; 47 | const std::string TitanDB::Properties::kNumObsoleteBlobFile = 48 | titandb_prefix + num_obsolete_blob_file; 49 | const std::string TitanDB::Properties::kLiveBlobFileSize = 50 | titandb_prefix + live_blob_file_size; 51 | const std::string TitanDB::Properties::kObsoleteBlobFileSize = 52 | titandb_prefix + obsolete_blob_file_size; 53 | const std::string TitanDB::Properties::kNumDiscardableRatioLE0File = 54 | titandb_prefix + num_discardable_ratio_le0_file; 55 | const std::string TitanDB::Properties::kNumDiscardableRatioLE20File = 56 | titandb_prefix + num_discardable_ratio_le20_file; 57 | const std::string TitanDB::Properties::kNumDiscardableRatioLE50File = 58 | titandb_prefix + num_discardable_ratio_le50_file; 59 | const std::string TitanDB::Properties::kNumDiscardableRatioLE80File = 60 | titandb_prefix + num_discardable_ratio_le80_file; 61 | const std::string TitanDB::Properties::kNumDiscardableRatioLE100File = 62 | titandb_prefix + num_discardable_ratio_le100_file; 63 | 64 | const std::unordered_map< 65 | std::string, std::function> 66 | TitanInternalStats::stats_type_string_map = { 67 | {TitanDB::Properties::kNumBlobFilesAtLevelPrefix, 68 | &TitanInternalStats::HandleNumBlobFilesAtLevel}, 69 | {TitanDB::Properties::kLiveBlobSize, 70 | std::bind(&TitanInternalStats::HandleStatsValue, std::placeholders::_1, 71 | TitanInternalStats::LIVE_BLOB_SIZE, std::placeholders::_2)}, 72 | {TitanDB::Properties::kNumLiveBlobFile, 73 | std::bind(&TitanInternalStats::HandleStatsValue, std::placeholders::_1, 74 | TitanInternalStats::NUM_LIVE_BLOB_FILE, 75 | std::placeholders::_2)}, 76 | {TitanDB::Properties::kNumObsoleteBlobFile, 77 | std::bind(&TitanInternalStats::HandleStatsValue, std::placeholders::_1, 78 | TitanInternalStats::NUM_OBSOLETE_BLOB_FILE, 79 | std::placeholders::_2)}, 80 | {TitanDB::Properties::kLiveBlobFileSize, 81 | std::bind(&TitanInternalStats::HandleStatsValue, 
std::placeholders::_1, 82 | TitanInternalStats::LIVE_BLOB_FILE_SIZE, 83 | std::placeholders::_2)}, 84 | {TitanDB::Properties::kObsoleteBlobFileSize, 85 | std::bind(&TitanInternalStats::HandleStatsValue, std::placeholders::_1, 86 | TitanInternalStats::OBSOLETE_BLOB_FILE_SIZE, 87 | std::placeholders::_2)}, 88 | {TitanDB::Properties::kNumDiscardableRatioLE0File, 89 | std::bind(&TitanInternalStats::HandleStatsValue, std::placeholders::_1, 90 | TitanInternalStats::NUM_DISCARDABLE_RATIO_LE0, 91 | std::placeholders::_2)}, 92 | {TitanDB::Properties::kNumDiscardableRatioLE20File, 93 | std::bind(&TitanInternalStats::HandleStatsValue, std::placeholders::_1, 94 | TitanInternalStats::NUM_DISCARDABLE_RATIO_LE20, 95 | std::placeholders::_2)}, 96 | {TitanDB::Properties::kNumDiscardableRatioLE50File, 97 | std::bind(&TitanInternalStats::HandleStatsValue, std::placeholders::_1, 98 | TitanInternalStats::NUM_DISCARDABLE_RATIO_LE50, 99 | std::placeholders::_2)}, 100 | {TitanDB::Properties::kNumDiscardableRatioLE80File, 101 | std::bind(&TitanInternalStats::HandleStatsValue, std::placeholders::_1, 102 | TitanInternalStats::NUM_DISCARDABLE_RATIO_LE80, 103 | std::placeholders::_2)}, 104 | {TitanDB::Properties::kNumDiscardableRatioLE100File, 105 | std::bind(&TitanInternalStats::HandleStatsValue, std::placeholders::_1, 106 | TitanInternalStats::NUM_DISCARDABLE_RATIO_LE100, 107 | std::placeholders::_2)}, 108 | }; 109 | 110 | const std::array(InternalOpType::INTERNAL_OP_ENUM_MAX)> 112 | TitanInternalStats::internal_op_names = {{ 113 | "Flush ", 114 | "Compaction", 115 | "GC ", 116 | }}; 117 | 118 | // Assumes that trailing numbers represent an optional argument. This requires 119 | // property names to not end with numbers. 
120 | std::pair GetPropertyNameAndArg(const Slice& property) { 121 | Slice name = property, arg = property; 122 | size_t sfx_len = 0; 123 | while (sfx_len < property.size() && 124 | isdigit(property[property.size() - sfx_len - 1])) { 125 | ++sfx_len; 126 | } 127 | name.remove_suffix(sfx_len); 128 | arg.remove_prefix(property.size() - sfx_len); 129 | return {name, arg}; 130 | } 131 | 132 | bool TitanInternalStats::GetIntProperty(const Slice& property, 133 | uint64_t* value) const { 134 | auto ppt = GetPropertyNameAndArg(property); 135 | auto p = stats_type_string_map.find(ppt.first.ToString()); 136 | if (p != stats_type_string_map.end()) { 137 | *value = (p->second)(this, ppt.second); 138 | return true; 139 | } 140 | return false; 141 | } 142 | 143 | bool TitanInternalStats::GetStringProperty(const Slice& property, 144 | std::string* value) const { 145 | uint64_t int_value; 146 | if (GetIntProperty(property, &int_value)) { 147 | *value = std::to_string(int_value); 148 | return true; 149 | } 150 | return false; 151 | } 152 | 153 | uint64_t TitanInternalStats::HandleStatsValue( 154 | TitanInternalStats::StatsType type, Slice _arg) const { 155 | return stats_[type].load(std::memory_order_relaxed); 156 | } 157 | 158 | uint64_t TitanInternalStats::HandleNumBlobFilesAtLevel(Slice arg) const { 159 | auto s = arg.ToString(); 160 | int level = ParseInt(s); 161 | return blob_storage_->NumBlobFilesAtLevel(level); 162 | } 163 | 164 | void TitanInternalStats::DumpAndResetInternalOpStats(LogBuffer* log_buffer) { 165 | constexpr double GB = 1.0 * 1024 * 1024 * 1024; 166 | constexpr double SECOND = 1.0 * 1000000; 167 | LogToBuffer(log_buffer, 168 | "OP COUNT READ(GB) WRITE(GB) IO_READ(GB) IO_WRITE(GB) " 169 | " FILE_IN FILE_OUT GC_READ(MICROS) GC_UPDATE(MICROS)"); 170 | LogToBuffer(log_buffer, 171 | "----------------------------------------------------------------" 172 | "-----------------"); 173 | for (int op = 0; op < static_cast(InternalOpType::INTERNAL_OP_ENUM_MAX); 174 | op++) 
{ 175 | LogToBuffer( 176 | log_buffer, 177 | "%s %5d %10.1f %10.1f %10.1f %10.1f %8d %8d %10.1f %10.1f %10.1f", 178 | internal_op_names[op].c_str(), 179 | GetAndResetStats(&internal_op_stats_[op], InternalOpStatsType::COUNT), 180 | GetAndResetStats(&internal_op_stats_[op], 181 | InternalOpStatsType::BYTES_READ) / 182 | GB, 183 | GetAndResetStats(&internal_op_stats_[op], 184 | InternalOpStatsType::BYTES_WRITTEN) / 185 | GB, 186 | GetAndResetStats(&internal_op_stats_[op], 187 | InternalOpStatsType::IO_BYTES_READ) / 188 | GB, 189 | GetAndResetStats(&internal_op_stats_[op], 190 | InternalOpStatsType::IO_BYTES_WRITTEN) / 191 | GB, 192 | GetAndResetStats(&internal_op_stats_[op], 193 | InternalOpStatsType::INPUT_FILE_NUM), 194 | GetAndResetStats(&internal_op_stats_[op], 195 | InternalOpStatsType::OUTPUT_FILE_NUM), 196 | GetAndResetStats(&internal_op_stats_[op], 197 | InternalOpStatsType::GC_READ_LSM_MICROS) / 198 | SECOND, 199 | GetAndResetStats(&internal_op_stats_[op], 200 | InternalOpStatsType::GC_UPDATE_LSM_MICROS) / 201 | SECOND); 202 | } 203 | } 204 | 205 | void TitanStats::InitializeCF(uint32_t cf_id, 206 | std::shared_ptr blob_storage) { 207 | internal_stats_[cf_id] = std::make_shared(blob_storage); 208 | } 209 | 210 | } // namespace titandb 211 | template class StatisticsImpl; 213 | } // namespace rocksdb 214 | -------------------------------------------------------------------------------- /src/titan_stats.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include "logging/log_buffer.h" 10 | #include "monitoring/histogram.h" 11 | #include "monitoring/statistics_impl.h" 12 | #include "rocksdb/iostats_context.h" 13 | #include "rocksdb/statistics.h" 14 | #include "util/string_util.h" 15 | 16 | #include "titan/options.h" 17 | #include "titan/statistics.h" 18 | 19 | namespace rocksdb { 20 | 21 | extern template class StatisticsImpl; 23 | 24 | 
namespace titandb { 25 | 26 | std::shared_ptr CreateDBStatistics(); 27 | 28 | enum class InternalOpStatsType : int { 29 | COUNT = 0, 30 | BYTES_READ, 31 | BYTES_WRITTEN, 32 | IO_BYTES_READ, 33 | IO_BYTES_WRITTEN, 34 | INPUT_FILE_NUM, 35 | OUTPUT_FILE_NUM, 36 | GC_READ_LSM_MICROS, 37 | // Update lsm and write callback 38 | GC_UPDATE_LSM_MICROS, 39 | INTERNAL_OP_STATS_ENUM_MAX, 40 | }; 41 | 42 | enum class InternalOpType : int { 43 | FLUSH = 0, 44 | COMPACTION, 45 | GC, 46 | INTERNAL_OP_ENUM_MAX, 47 | }; 48 | 49 | using InternalOpStats = 50 | std::array, 51 | static_cast( 52 | InternalOpStatsType::INTERNAL_OP_STATS_ENUM_MAX)>; 53 | 54 | class BlobStorage; 55 | 56 | // Titan internal stats does NOT optimize race 57 | // condition by making thread local copies of 58 | // data. 59 | class TitanInternalStats { 60 | public: 61 | enum StatsType { 62 | LIVE_BLOB_SIZE = 63 | 0, // deprecated, it isn't accurate enough and hard to make it accurate 64 | NUM_LIVE_BLOB_FILE, 65 | NUM_OBSOLETE_BLOB_FILE, 66 | LIVE_BLOB_FILE_SIZE, 67 | OBSOLETE_BLOB_FILE_SIZE, 68 | 69 | NUM_DISCARDABLE_RATIO_LE0, 70 | NUM_DISCARDABLE_RATIO_LE20, 71 | NUM_DISCARDABLE_RATIO_LE50, 72 | NUM_DISCARDABLE_RATIO_LE80, 73 | NUM_DISCARDABLE_RATIO_LE100, 74 | 75 | INTERNAL_STATS_ENUM_MAX, 76 | }; 77 | 78 | TitanInternalStats(std::shared_ptr blob_storage) 79 | : blob_storage_(blob_storage) { 80 | Clear(); 81 | } 82 | 83 | void Clear() { 84 | for (int stat = 0; stat < INTERNAL_STATS_ENUM_MAX; stat++) { 85 | stats_[stat].store(0, std::memory_order_relaxed); 86 | } 87 | for (int op = 0; 88 | op < static_cast(InternalOpType::INTERNAL_OP_ENUM_MAX); op++) { 89 | assert( 90 | internal_op_stats_[op].size() == 91 | static_cast(InternalOpStatsType::INTERNAL_OP_STATS_ENUM_MAX)); 92 | for (int stat = 0; 93 | stat < 94 | static_cast(InternalOpStatsType::INTERNAL_OP_STATS_ENUM_MAX); 95 | stat++) { 96 | internal_op_stats_[op][stat].store(0, std::memory_order_relaxed); 97 | } 98 | } 99 | } 100 | 101 | void 
ResetStats(StatsType type) { 102 | stats_[type].store(0, std::memory_order_relaxed); 103 | } 104 | 105 | void SetStats(StatsType type, uint64_t value) { 106 | stats_[type].store(value, std::memory_order_relaxed); 107 | } 108 | 109 | void AddStats(StatsType type, uint64_t value) { 110 | auto& v = stats_[type]; 111 | v.fetch_add(value, std::memory_order_relaxed); 112 | } 113 | 114 | void SubStats(StatsType type, uint64_t value) { 115 | auto& v = stats_[type]; 116 | v.fetch_sub(value, std::memory_order_relaxed); 117 | } 118 | 119 | InternalOpStats* GetInternalOpStatsForType(InternalOpType type) { 120 | return &internal_op_stats_[static_cast(type)]; 121 | } 122 | 123 | void DumpAndResetInternalOpStats(LogBuffer* log_buffer); 124 | 125 | bool GetIntProperty(const Slice& property, uint64_t* value) const; 126 | bool GetStringProperty(const Slice& property, std::string* value) const; 127 | uint64_t HandleStatsValue(TitanInternalStats::StatsType type, 128 | Slice _arg) const; 129 | uint64_t HandleNumBlobFilesAtLevel(Slice arg) const; 130 | 131 | private: 132 | static const std::unordered_map< 133 | std::string, std::function> 134 | stats_type_string_map; 135 | static const std::array< 136 | std::string, static_cast(InternalOpType::INTERNAL_OP_ENUM_MAX)> 137 | internal_op_names; 138 | std::array, INTERNAL_STATS_ENUM_MAX> stats_; 139 | std::array(InternalOpType::INTERNAL_OP_ENUM_MAX)> 141 | internal_op_stats_; 142 | std::shared_ptr blob_storage_; 143 | }; 144 | 145 | class TitanStats { 146 | public: 147 | TitanStats(Statistics* stats) : stats_(stats) {} 148 | 149 | void InitializeCF(uint32_t cf_id, std::shared_ptr blob_storage); 150 | 151 | Statistics* statistics() { return stats_; } 152 | 153 | TitanInternalStats* internal_stats(uint32_t cf_id) { 154 | auto p = internal_stats_.find(cf_id); 155 | if (p == internal_stats_.end()) { 156 | return nullptr; 157 | } else { 158 | return p->second.get(); 159 | } 160 | } 161 | 162 | void DumpInternalOpStats(uint32_t cf_id, const 
std::string& cf_name); 163 | 164 | // Resets all ticker and histogram stats 165 | Status Reset() { 166 | for (auto& p : internal_stats_) { 167 | p.second->Clear(); 168 | } 169 | return stats_->Reset(); 170 | } 171 | 172 | private: 173 | // RocksDB statistics 174 | Statistics* stats_ = nullptr; 175 | std::unordered_map> 176 | internal_stats_; 177 | }; 178 | 179 | // Utility functions for RocksDB stats types 180 | inline Statistics* statistics(TitanStats* stats) { 181 | return (stats) ? stats->statistics() : nullptr; 182 | } 183 | 184 | // Utility functions for Titan ticker and histogram stats types 185 | inline void ResetStats(TitanStats* stats, uint32_t cf_id, 186 | TitanInternalStats::StatsType type) { 187 | if (stats) { 188 | auto p = stats->internal_stats(cf_id); 189 | if (p) { 190 | p->ResetStats(type); 191 | } 192 | } 193 | } 194 | 195 | inline void SetStats(TitanStats* stats, uint32_t cf_id, 196 | TitanInternalStats::StatsType type, uint64_t value) { 197 | if (stats) { 198 | auto p = stats->internal_stats(cf_id); 199 | if (p) { 200 | p->SetStats(type, value); 201 | } 202 | } 203 | } 204 | 205 | inline void AddStats(TitanStats* stats, uint32_t cf_id, 206 | TitanInternalStats::StatsType type, uint64_t value) { 207 | if (stats) { 208 | auto p = stats->internal_stats(cf_id); 209 | if (p) { 210 | p->AddStats(type, value); 211 | } 212 | } 213 | } 214 | 215 | inline void SubStats(TitanStats* stats, uint32_t cf_id, 216 | TitanInternalStats::StatsType type, uint64_t value) { 217 | if (stats) { 218 | auto p = stats->internal_stats(cf_id); 219 | if (p) { 220 | p->SubStats(type, value); 221 | } 222 | } 223 | } 224 | 225 | // Utility functions for Titan internal operation stats type 226 | inline uint64_t GetAndResetStats(InternalOpStats* stats, 227 | InternalOpStatsType type) { 228 | if (stats != nullptr) { 229 | return (*stats)[static_cast(type)].exchange(0, 230 | std::memory_order_relaxed); 231 | } 232 | return 0; 233 | } 234 | 235 | inline void 
AddStats(InternalOpStats* stats, InternalOpStatsType type, 236 | uint64_t value = 1) { 237 | if (stats != nullptr) { 238 | (*stats)[static_cast(type)].fetch_add(value, 239 | std::memory_order_relaxed); 240 | } 241 | } 242 | 243 | // IOStatsContext helper 244 | 245 | inline void SavePrevIOBytes(uint64_t* prev_bytes_read, 246 | uint64_t* prev_bytes_written) { 247 | IOStatsContext* io_stats = get_iostats_context(); 248 | if (io_stats != nullptr) { 249 | *prev_bytes_read = io_stats->bytes_read; 250 | *prev_bytes_written = io_stats->bytes_written; 251 | } 252 | } 253 | 254 | inline void UpdateIOBytes(uint64_t prev_bytes_read, uint64_t prev_bytes_written, 255 | uint64_t* bytes_read, uint64_t* bytes_written) { 256 | IOStatsContext* io_stats = get_iostats_context(); 257 | if (io_stats != nullptr) { 258 | *bytes_read += io_stats->bytes_read - prev_bytes_read; 259 | *bytes_written += io_stats->bytes_written - prev_bytes_written; 260 | } 261 | } 262 | 263 | class TitanStopWatch { 264 | public: 265 | TitanStopWatch(Env* env, uint64_t& stats) 266 | : env_(env), stats_(stats), start_(env_->NowMicros()) {} 267 | 268 | ~TitanStopWatch() { stats_ += env_->NowMicros() - start_; } 269 | 270 | private: 271 | Env* env_; 272 | uint64_t& stats_; 273 | uint64_t start_; 274 | }; 275 | 276 | } // namespace titandb 277 | } // namespace rocksdb 278 | -------------------------------------------------------------------------------- /src/util.cc: -------------------------------------------------------------------------------- 1 | #include "util.h" 2 | 3 | #include "util/compression.h" 4 | #include "util/stop_watch.h" 5 | 6 | namespace rocksdb { 7 | namespace titandb { 8 | 9 | // See util/compression.h. 
10 | const uint32_t kCompressionFormat = 2; 11 | 12 | bool GoodCompressionRatio(size_t compressed_size, size_t raw_size) { 13 | // Check to see if compressed less than 12.5% 14 | return compressed_size < raw_size - (raw_size / 8u); 15 | } 16 | 17 | Slice Compress(const CompressionInfo& info, const Slice& input, 18 | std::string* output, CompressionType* type) { 19 | *type = info.type(); 20 | if (info.type() == kNoCompression) { 21 | return input; 22 | } 23 | if (!CompressData(input, info, kCompressionFormat, output)) { 24 | // Compression method is not supported, or not good compression 25 | // ratio, so just fall back to uncompressed form. 26 | *type = kNoCompression; 27 | return input; 28 | } 29 | return *output; 30 | } 31 | 32 | Status Uncompress(const UncompressionInfo& info, const Slice& input, 33 | OwnedSlice* output, MemoryAllocator* allocator) { 34 | assert(info.type() != kNoCompression); 35 | size_t usize = 0; 36 | CacheAllocationPtr ubuf = UncompressData( 37 | info, input.data(), input.size(), &usize, kCompressionFormat, allocator); 38 | if (!ubuf.get()) { 39 | return Status::Corruption("Corrupted compressed blob"); 40 | } 41 | output->reset(std::move(ubuf), usize); 42 | return Status::OK(); 43 | } 44 | 45 | void UnrefCacheHandle(void* arg1, void* arg2) { 46 | Cache* cache = reinterpret_cast(arg1); 47 | Cache::Handle* h = reinterpret_cast(arg2); 48 | cache->Release(h); 49 | } 50 | 51 | Status SyncTitanManifest(TitanStats* stats, 52 | const ImmutableDBOptions* db_options, 53 | WritableFileWriter* file) { 54 | StopWatch sw(db_options->clock, statistics(stats), 55 | TITAN_MANIFEST_FILE_SYNC_MICROS); 56 | return file->Sync(db_options->use_fsync); 57 | } 58 | 59 | } // namespace titandb 60 | } // namespace rocksdb 61 | -------------------------------------------------------------------------------- /src/util.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "file/writable_file_writer.h" 4 | 
#include "options/db_options.h"
#include "rocksdb/cache.h"
#include "util/compression.h"

#include "titan_stats.h"

namespace rocksdb {
namespace titandb {

// A slice pointed to an owned buffer. The buffer's lifetime is tied to this
// object (released in ~OwnedSlice via the CacheAllocationPtr deleter) unless
// release() is called.
class OwnedSlice : public Slice {
 public:
  // Takes ownership of `_data` and points the slice at its first `_size`
  // bytes.
  void reset(CacheAllocationPtr _data, size_t _size) {
    data_ = _data.get();
    size_ = _size;
    buffer_ = std::move(_data);
  }

  // Takes ownership of `buffer` but points the slice at `s`, which is
  // expected to reference a sub-range of `buffer`.
  void reset(CacheAllocationPtr buffer, const Slice& s) {
    data_ = s.data();
    size_ = s.size();
    buffer_ = std::move(buffer);
  }

  // Relinquishes ownership; the caller becomes responsible for freeing the
  // returned buffer (e.g. via CleanupFunc). The slice becomes empty.
  char* release() {
    data_ = nullptr;
    size_ = 0;
    return buffer_.release();
  }

  // Cleanable-compatible cleanup function for a buffer obtained from
  // release(). NOTE(review): assumes the buffer was new[]-allocated —
  // confirm this holds for custom MemoryAllocators. (The <char*> template
  // argument was stripped by extraction and is restored here.)
  static void CleanupFunc(void* buffer, void*) {
    delete[] reinterpret_cast<char*>(buffer);
  }

 private:
  CacheAllocationPtr buffer_;
};

// A slice pointed to a fixed size buffer.
// (The <std::size_t T> template header was stripped by extraction and is
// restored here; T is the buffer size in bytes.)
template <std::size_t T>
class FixedSlice : public Slice {
 public:
  FixedSlice() : Slice(buffer_, T) {}

  // Mutable access to the underlying buffer.
  char* get() { return buffer_; }

 private:
  char buffer_[T];
};

// Compresses the input data according to the compression context.
// Returns a slice with the output data and sets "*type" to the output
// compression type.
//
// If compression is actually performed, fills "*output" with the
// compressed data. However, if the compression ratio is not good, it
// returns the input slice directly and sets "*type" to
// kNoCompression.
Slice Compress(const CompressionInfo& info, const Slice& input,
               std::string* output, CompressionType* type);

// Uncompresses the input data according to the uncompression type.
// If successful, fills "*buffer" with the uncompressed data and
// points "*output" to it.
68 | Status Uncompress(const UncompressionInfo& info, const Slice& input, 69 | OwnedSlice* output, MemoryAllocator* allocator = nullptr); 70 | 71 | void UnrefCacheHandle(void* cache, void* handle); 72 | 73 | template 74 | void DeleteCacheValue(void* value, MemoryAllocator*) { 75 | delete reinterpret_cast(value); 76 | } 77 | 78 | const Cache::CacheItemHelper kBlobValueCacheItemHelper( 79 | CacheEntryRole::kBlobValue, &DeleteCacheValue); 80 | 81 | Status SyncTitanManifest(TitanStats* stats, 82 | const ImmutableDBOptions* db_options, 83 | WritableFileWriter* file); 84 | 85 | } // namespace titandb 86 | } // namespace rocksdb 87 | -------------------------------------------------------------------------------- /src/util_test.cc: -------------------------------------------------------------------------------- 1 | #include "util.h" 2 | 3 | #include "test_util/testharness.h" 4 | 5 | namespace rocksdb { 6 | namespace titandb { 7 | 8 | class UtilTest : public testing::Test {}; 9 | 10 | TEST(UtilTest, Compression) { 11 | std::string input(1024, 'a'); 12 | for (auto compression : 13 | {kSnappyCompression, kZlibCompression, kLZ4Compression, kZSTD}) { 14 | CompressionOptions compression_opt; 15 | CompressionContext compression_ctx(compression, compression_opt); 16 | CompressionInfo compression_info( 17 | compression_opt, compression_ctx, CompressionDict::GetEmptyDict(), 18 | compression, 0 /* sample_for_compression */); 19 | std::string buffer; 20 | auto compressed = Compress(compression_info, input, &buffer, &compression); 21 | if (compression != kNoCompression) { 22 | ASSERT_TRUE(compressed.size() <= input.size()); 23 | UncompressionContext uncompression_ctx(compression); 24 | UncompressionInfo uncompression_info( 25 | uncompression_ctx, UncompressionDict::GetEmptyDict(), compression); 26 | OwnedSlice output; 27 | ASSERT_OK(Uncompress(uncompression_info, compressed, &output)); 28 | ASSERT_EQ(output, input); 29 | } 30 | } 31 | } 32 | 33 | } // namespace titandb 34 | } // 
namespace rocksdb 35 | 36 | int main(int argc, char** argv) { 37 | ::testing::InitGoogleTest(&argc, argv); 38 | return RUN_ALL_TESTS(); 39 | } 40 | -------------------------------------------------------------------------------- /src/version_edit.cc: -------------------------------------------------------------------------------- 1 | #include "version_edit.h" 2 | 3 | #include "util/coding.h" 4 | 5 | namespace rocksdb { 6 | namespace titandb { 7 | 8 | void VersionEdit::EncodeTo(std::string* dst) const { 9 | if (has_next_file_number_) { 10 | PutVarint32Varint64(dst, kNextFileNumber, next_file_number_); 11 | } 12 | 13 | PutVarint32Varint32(dst, kColumnFamilyID, column_family_id_); 14 | 15 | for (auto& file : added_files_) { 16 | PutVarint32(dst, kAddedBlobFileV3); 17 | file->EncodeTo(dst); 18 | } 19 | for (auto& file : deleted_files_) { 20 | // obsolete sequence is a inpersistent field, so no need to encode it. 21 | PutVarint32Varint64(dst, kDeletedBlobFile, file.first); 22 | } 23 | } 24 | 25 | Status VersionEdit::DecodeFrom(Slice* src) { 26 | uint32_t tag; 27 | uint64_t file_number; 28 | std::shared_ptr blob_file; 29 | Status s; 30 | 31 | const char* error = nullptr; 32 | while (!error && !src->empty()) { 33 | if (!GetVarint32(src, &tag)) { 34 | error = "invalid tag"; 35 | break; 36 | } 37 | switch (tag) { 38 | case kNextFileNumber: 39 | if (GetVarint64(src, &next_file_number_)) { 40 | has_next_file_number_ = true; 41 | } else { 42 | error = "next file number"; 43 | } 44 | break; 45 | case kColumnFamilyID: 46 | if (GetVarint32(src, &column_family_id_)) { 47 | } else { 48 | error = "column family id"; 49 | } 50 | break; 51 | // for compatibility issue 52 | case kAddedBlobFile: 53 | blob_file = std::make_shared(); 54 | s = blob_file->DecodeFromV1(src); 55 | if (s.ok()) { 56 | AddBlobFile(blob_file); 57 | } else { 58 | error = s.ToString().c_str(); 59 | } 60 | break; 61 | case kAddedBlobFileV2: 62 | blob_file = std::make_shared(); 63 | s = blob_file->DecodeFromV2(src); 
64 | if (s.ok()) { 65 | AddBlobFile(blob_file); 66 | } else { 67 | error = s.ToString().c_str(); 68 | } 69 | break; 70 | case kAddedBlobFileV3: 71 | blob_file = std::make_shared(); 72 | s = blob_file->DecodeFrom(src); 73 | if (s.ok()) { 74 | AddBlobFile(blob_file); 75 | } else { 76 | error = s.ToString().c_str(); 77 | } 78 | break; 79 | case kDeletedBlobFile: 80 | if (GetVarint64(src, &file_number)) { 81 | DeleteBlobFile(file_number, 0); 82 | } else { 83 | error = "deleted blob file"; 84 | } 85 | break; 86 | default: 87 | error = "unknown tag"; 88 | break; 89 | } 90 | } 91 | 92 | if (error) { 93 | return Status::Corruption("VersionEdit", error); 94 | } 95 | return Status::OK(); 96 | } 97 | 98 | bool operator==(const VersionEdit& lhs, const VersionEdit& rhs) { 99 | if (lhs.added_files_.size() != rhs.added_files_.size()) { 100 | return false; 101 | } 102 | std::map> blob_files; 103 | for (std::size_t idx = 0; idx < lhs.added_files_.size(); idx++) { 104 | blob_files.insert( 105 | {lhs.added_files_[idx]->file_number(), lhs.added_files_[idx]}); 106 | } 107 | for (std::size_t idx = 0; idx < rhs.added_files_.size(); idx++) { 108 | auto iter = blob_files.find(rhs.added_files_[idx]->file_number()); 109 | if (iter == blob_files.end() || !(*iter->second == *rhs.added_files_[idx])) 110 | return false; 111 | } 112 | 113 | return (lhs.has_next_file_number_ == rhs.has_next_file_number_ && 114 | lhs.next_file_number_ == rhs.next_file_number_ && 115 | lhs.column_family_id_ == rhs.column_family_id_ && 116 | lhs.deleted_files_ == rhs.deleted_files_); 117 | } 118 | 119 | void VersionEdit::Dump(bool with_keys) const { 120 | fprintf(stdout, "column_family_id: %" PRIu32 "\n", column_family_id_); 121 | if (has_next_file_number_) { 122 | fprintf(stdout, "next_file_number: %" PRIu64 "\n", next_file_number_); 123 | } 124 | if (!added_files_.empty()) { 125 | fprintf(stdout, "add files:\n"); 126 | for (auto& file : added_files_) { 127 | file->Dump(with_keys); 128 | } 129 | } 130 | if 
(!deleted_files_.empty()) { 131 | fprintf(stdout, "delete files:\n"); 132 | for (auto& file : deleted_files_) { 133 | fprintf(stdout, "file %" PRIu64 ", seq %" PRIu64 "\n", file.first, 134 | file.second); 135 | } 136 | } 137 | } 138 | 139 | } // namespace titandb 140 | } // namespace rocksdb 141 | -------------------------------------------------------------------------------- /src/version_edit.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include 6 | 7 | #include "rocksdb/slice.h" 8 | 9 | #include "blob_format.h" 10 | 11 | namespace rocksdb { 12 | namespace titandb { 13 | 14 | enum Tag { 15 | kNextFileNumber = 1, 16 | kColumnFamilyID = 10, 17 | kAddedBlobFile = 11, 18 | kDeletedBlobFile = 12, // Deprecated, leave here for backward compatibility 19 | kAddedBlobFileV2 = 13, // Comparing to kAddedBlobFile, it newly includes 20 | // smallest_key and largest_key of blob file 21 | kAddedBlobFileV3 = 14, // Comparing to kAddedBlobFileV2, it newly includes 22 | // block_size of blob file 23 | }; 24 | 25 | class VersionEdit { 26 | public: 27 | void SetNextFileNumber(uint64_t v) { 28 | has_next_file_number_ = true; 29 | next_file_number_ = v; 30 | } 31 | 32 | void SetColumnFamilyID(uint32_t v) { column_family_id_ = v; } 33 | 34 | void AddBlobFile(std::shared_ptr file) { 35 | added_files_.push_back(file); 36 | } 37 | 38 | void DeleteBlobFile(uint64_t file_number, SequenceNumber obsolete_sequence) { 39 | deleted_files_.emplace_back(std::make_pair(file_number, obsolete_sequence)); 40 | } 41 | 42 | void EncodeTo(std::string* dst) const; 43 | Status DecodeFrom(Slice* src); 44 | 45 | friend bool operator==(const VersionEdit& lhs, const VersionEdit& rhs); 46 | 47 | void Dump(bool with_keys) const; 48 | 49 | private: 50 | friend class BlobFileSet; 51 | friend class VersionTest; 52 | friend class EditCollector; 53 | 54 | bool has_next_file_number_{false}; 55 | uint64_t next_file_number_{0}; 56 | 
uint32_t column_family_id_{0}; 57 | 58 | std::vector> added_files_; 59 | std::vector> deleted_files_; 60 | }; 61 | 62 | } // namespace titandb 63 | } // namespace rocksdb 64 | -------------------------------------------------------------------------------- /tools/blob_file_dump.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2021-present TiKV Project Authors. Licensed under Apache-2.0. 2 | 3 | #include "file/filename.h" 4 | #include "util/gflags_compat.h" 5 | 6 | #include "blob_file_iterator.h" 7 | 8 | using GFLAGS_NAMESPACE::ParseCommandLineFlags; 9 | using GFLAGS_NAMESPACE::SetUsageMessage; 10 | 11 | DEFINE_string(path, "", "Path of blob file."); 12 | DEFINE_bool(dump, false, ""); 13 | 14 | #define handle_error(s, location) \ 15 | if (!s.ok()) { \ 16 | fprintf(stderr, "error when %s: %s\n", location, s.ToString().c_str()); \ 17 | return 1; \ 18 | } 19 | 20 | namespace rocksdb { 21 | namespace titandb { 22 | 23 | int blob_file_dump() { 24 | Env* env = Env::Default(); 25 | Status s; 26 | 27 | std::string file_name = FLAGS_path; 28 | uint64_t file_size = 0; 29 | s = env->GetFileSize(file_name, &file_size); 30 | handle_error(s, "getting file size"); 31 | 32 | std::unique_ptr file; 33 | std::unique_ptr f; 34 | s = env->GetFileSystem()->NewRandomAccessFile(file_name, FileOptions(), &f, 35 | nullptr /*dbg*/); 36 | handle_error(s, "open file"); 37 | file.reset(new RandomAccessFileReader(std::move(f), file_name)); 38 | 39 | std::unique_ptr iter(new BlobFileIterator( 40 | std::move(file), 1 /*fake file number*/, file_size, TitanCFOptions())); 41 | 42 | iter->SeekToFirst(); 43 | while (iter->Valid()) { 44 | handle_error(iter->status(), "status"); 45 | if (FLAGS_dump) { 46 | std::string key = iter->key().ToString(true); 47 | std::string value = iter->value().ToString(true); 48 | fprintf(stdout, "%s: %s\n", key.c_str(), value.c_str()); 49 | } 50 | iter->Next(); 51 | } 52 | handle_error(iter->status(), "reading blob file"); 
53 | return 0; 54 | } 55 | 56 | } // namespace titandb 57 | } // namespace rocksdb 58 | 59 | int main(int argc, char** argv) { 60 | SetUsageMessage(std::string("\nUSAGE\n") + std::string(argv[0]) + 61 | " [OPTIONS]..."); 62 | ParseCommandLineFlags(&argc, &argv, true); 63 | return rocksdb::titandb::blob_file_dump(); 64 | } 65 | -------------------------------------------------------------------------------- /tools/db_bench.cc: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2013-present, Facebook, Inc. All rights reserved. 2 | // This source code is licensed under both the GPLv2 (found in the 3 | // COPYING file in the root directory) and Apache 2.0 License 4 | // (found in the LICENSE.Apache file in the root directory). 5 | // 6 | // Copyright (c) 2011 The LevelDB Authors. All rights reserved. 7 | // Use of this source code is governed by a BSD-style license that can be 8 | // found in the LICENSE file. See the AUTHORS file for names of contributors. 9 | 10 | #ifndef __STDC_FORMAT_MACROS 11 | #define __STDC_FORMAT_MACROS 12 | #endif 13 | 14 | #ifndef GFLAGS 15 | #include 16 | int main() { 17 | fprintf(stderr, "Please install gflags to run rocksdb tools\n"); 18 | return 1; 19 | } 20 | #else 21 | #include "db_bench_tool.h" 22 | int main(int argc, char** argv) { return rocksdb::db_bench_tool(argc, argv); } 23 | #endif // GFLAGS 24 | -------------------------------------------------------------------------------- /tools/db_bench_tool.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2013-present, Facebook, Inc. All rights reserved. 2 | // This source code is licensed under both the GPLv2 (found in the 3 | // COPYING file in the root directory) and Apache 2.0 License 4 | // (found in the LICENSE.Apache file in the root directory). 
5 | #pragma once 6 | 7 | namespace rocksdb { 8 | int db_bench_tool(int argc, char** argv); 9 | } // namespace rocksdb 10 | -------------------------------------------------------------------------------- /tools/manifest_dump.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2021-present TiKV Project Authors. Licensed under Apache-2.0. 2 | 3 | #ifndef GFLAGS 4 | #include 5 | int main() { 6 | fprintf(stderr, "Please install gflags to run Titan tools.\n"); 7 | return 1; 8 | } 9 | #else 10 | 11 | #include 12 | 13 | #include "file/sequence_file_reader.h" 14 | #include "rocksdb/env.h" 15 | #include "util/gflags_compat.h" 16 | 17 | #include "edit_collector.h" 18 | #include "version_edit.h" 19 | 20 | using GFLAGS_NAMESPACE::ParseCommandLineFlags; 21 | using GFLAGS_NAMESPACE::SetUsageMessage; 22 | 23 | DEFINE_string(path, "", "Path for Titan manifest file."); 24 | DEFINE_bool(ignore_tail_err, true, 25 | "Ignore error encounter towards the tail of manifest."); 26 | DEFINE_bool(verbose, false, "Output each manifest record."); 27 | DEFINE_bool(with_keys, false, "Output blob file boundary keys"); 28 | 29 | #define handle_error(s, location) \ 30 | if (!s.ok()) { \ 31 | fprintf(stderr, "error when %s: %s\n", location, s.ToString().c_str()); \ 32 | return 1; \ 33 | } 34 | 35 | namespace rocksdb { 36 | namespace titandb { 37 | 38 | int manifest_dump() { 39 | if (FLAGS_path.empty()) { 40 | fprintf(stderr, "Manifest file path not given.\n"); 41 | return 1; 42 | } 43 | Env* env = Env::Default(); 44 | Status s; 45 | 46 | // Open manifest file. 47 | std::unique_ptr file_reader; 48 | std::unique_ptr file; 49 | s = env->GetFileSystem()->NewSequentialFile(FLAGS_path, FileOptions(), &file, 50 | nullptr /*dbg*/); 51 | handle_error(s, "open manifest file"); 52 | file_reader.reset(new SequentialFileReader(std::move(file), FLAGS_path)); 53 | 54 | // Open log reader. 
55 | LogReporter reporter; 56 | reporter.status = &s; 57 | log::Reader log_reader(nullptr, std::move(file_reader), &reporter, 58 | true /*checksum*/, 0 /*log_num*/); 59 | Slice record; 60 | std::string scratch; 61 | 62 | // Loop through log records. 63 | EditCollector edit_collector(nullptr, true); 64 | while (log_reader.ReadRecord(&record, &scratch) && s.ok()) { 65 | VersionEdit edit; 66 | s = DecodeInto(record, &edit); 67 | handle_error(s, "parse version edit"); 68 | if (FLAGS_verbose) { 69 | edit.Dump(FLAGS_with_keys); 70 | fprintf(stdout, "\n"); 71 | } 72 | s = edit_collector.AddEdit(edit); 73 | handle_error(s, "collect version edit"); 74 | } 75 | if (!FLAGS_ignore_tail_err) { 76 | handle_error(s, "parse manifest record"); 77 | } 78 | edit_collector.Dump(FLAGS_with_keys); 79 | return 0; 80 | } 81 | 82 | } // namespace titandb 83 | } // namespace rocksdb 84 | 85 | int main(int argc, char** argv) { 86 | SetUsageMessage(std::string("\nUSAGE\n") + std::string(argv[0]) + 87 | " [OPTIONS]..."); 88 | ParseCommandLineFlags(&argc, &argv, true); 89 | return rocksdb::titandb::manifest_dump(); 90 | } 91 | 92 | #endif // GFLAGS 93 | -------------------------------------------------------------------------------- /util/titan_build_version.cc.in: -------------------------------------------------------------------------------- 1 | #include "titan_build_version.h" 2 | const char* titan_build_git_sha = "titan_build_git_sha:@@GIT_SHA@@"; 3 | -------------------------------------------------------------------------------- /util/titan_build_version.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | extern const char* titan_build_git_sha; --------------------------------------------------------------------------------