├── .github
│   └── workflows
│       └── rust.yml
├── .gitignore
├── Cargo.lock
├── Cargo.toml
├── LICENSE
├── README.md
├── examples
│   └── simple_example.rs
├── src
│   ├── common
│   │   ├── error.rs
│   │   ├── file.rs
│   │   ├── file_system
│   │   │   ├── async_file_system.rs
│   │   │   ├── mod.rs
│   │   │   ├── posix_file_system.rs
│   │   │   ├── reader.rs
│   │   │   └── writer.rs
│   │   ├── format.rs
│   │   ├── mod.rs
│   │   ├── options.rs
│   │   ├── slice_transform.rs
│   │   └── snapshot.rs
│   ├── compaction
│   │   ├── compaction_iter.rs
│   │   ├── compaction_job.rs
│   │   ├── flush_job.rs
│   │   ├── mod.rs
│   │   └── picker.rs
│   ├── db.rs
│   ├── iterator
│   │   ├── async_merge_iterator.rs
│   │   ├── db_iterator.rs
│   │   ├── merge_iterator.rs
│   │   ├── mod.rs
│   │   ├── table_accessor.rs
│   │   └── two_level_iterator.rs
│   ├── lib.rs
│   ├── log
│   │   ├── mod.rs
│   │   ├── reader.rs
│   │   └── writer.rs
│   ├── manifest.rs
│   ├── memtable
│   │   ├── arena.rs
│   │   ├── concurrent_arena.rs
│   │   ├── context.rs
│   │   ├── inline_skiplist.rs
│   │   ├── memtable.rs
│   │   ├── mod.rs
│   │   ├── skiplist.rs
│   │   └── skiplist_rep.rs
│   ├── options.rs
│   ├── pipeline.rs
│   ├── table
│   │   ├── block_based
│   │   │   ├── block.rs
│   │   │   ├── block_builder.rs
│   │   │   ├── compression.rs
│   │   │   ├── data_block_hash_index_builder.rs
│   │   │   ├── filter_block_builder.rs
│   │   │   ├── filter_reader.rs
│   │   │   ├── full_filter_block_builder.rs
│   │   │   ├── index_builder.rs
│   │   │   ├── index_reader.rs
│   │   │   ├── lz4.rs
│   │   │   ├── meta_block.rs
│   │   │   ├── mod.rs
│   │   │   ├── options.rs
│   │   │   ├── table_builder.rs
│   │   │   ├── table_builder_factory.rs
│   │   │   ├── table_iterator.rs
│   │   │   └── table_reader.rs
│   │   ├── format.rs
│   │   ├── mod.rs
│   │   └── table_properties.rs
│   ├── util
│   │   ├── btree.rs
│   │   ├── hash.rs
│   │   ├── mod.rs
│   │   └── test_sync_point.rs
│   ├── version
│   │   ├── column_family.rs
│   │   ├── edit.rs
│   │   ├── mod.rs
│   │   ├── snapshot.rs
│   │   ├── super_version.rs
│   │   ├── table.rs
│   │   ├── version.rs
│   │   ├── version_set.rs
│   │   └── version_storage_info.rs
│   ├── wal.rs
│   └── write_batch.rs
└── tests
    └── benches
        ├── bench_memtable.rs
        └── mod.rs

/.github/workflows/rust.yml:
--------------------------------------------------------------------------------
name: Rust

on:
  push:
    branches: [main]
  pull_request:
    branches: [main]

env:
  CARGO_TERM_COLOR: always

jobs:
  build:
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
        os: [ ubuntu-latest, macos-latest ]
    steps:
    - uses: actions/checkout@v2
      with:
        ref: ${{ github.event.pull_request.head.sha }}
    - name: Install nightly toolchain
      uses: actions-rs/toolchain@v1
      with:
        profile: minimal
        toolchain: nightly-2022-01-17
        override: true
        components: rustfmt, rust-src
    - uses: Swatinem/rust-cache@v1
      with:
        sharedKey: ${{ matrix.os }}
    - name: Cache dependencies
      if: ${{ matrix.os == 'ubuntu-latest' }}
      run: if [[ ! -e ~/.cargo/bin/grcov ]]; then cargo install grcov; fi
    - name: Format
      run: cargo fmt --all -- --check
    - name: Run tests
      run: |
        cargo test --all --verbose -- --nocapture
      env:
        RUST_BACKTRACE: 1
    - name: Run asan tests
      if: ${{ matrix.os == 'ubuntu-latest' }}
      run: |
        cargo test -Zbuild-std --target x86_64-unknown-linux-gnu --all --verbose -- --nocapture
      env:
        RUST_BACKTRACE: 1
        RUSTFLAGS: '-Zsanitizer=address'
        RUSTDOCFLAGS: '-Zsanitizer=address'
  coverage:
    runs-on: ubuntu-latest
    needs: build
    steps:
    - uses: actions/checkout@v2
      with:
        ref: ${{ github.event.pull_request.head.sha }}
    - name: Install nightly toolchain
      uses: actions-rs/toolchain@v1
      with:
        profile: minimal
        toolchain: nightly-2022-01-17
        override: true
        components: llvm-tools-preview
    - uses: Swatinem/rust-cache@v1
      with:
        sharedKey: ubuntu-latest
    - name: Install grcov
      run: if [[ ! -e ~/.cargo/bin/grcov ]]; then cargo install --locked grcov; fi
    - name: Run tests
      run: |
        cargo test --all
      env:
        RUSTFLAGS: '-Zinstrument-coverage'
        LLVM_PROFILE_FILE: '%p-%m.profraw'
    - name: Run grcov
      run: grcov `find . \( -name "*.profraw" \) -print` --binary-path target/debug/deps/ -s . -t lcov --branch --ignore-not-existing --ignore '../**' --ignore '/*' -o coverage.lcov
    - name: Upload
      uses: codecov/codecov-action@v2
      with:
        file: coverage.lcov

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Generated by Cargo
# will have compiled files and executables
/target/

# Remove Cargo.lock from gitignore if creating an executable, leave it for libraries
# More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html
Cargo.lock

# These are backup files generated by rustfmt
**/*.rs.bk

--------------------------------------------------------------------------------
/Cargo.toml:
--------------------------------------------------------------------------------
[package]
name = "rocksdb-rs"
version = "0.1.0"
edition = "2018"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
async-trait = "0.1"
bytes = "1.0"
crc32c = "0.6"
crossbeam = "0.8"
futures = { version = "0.3", features = ["thread-pool", "compat"] }
futures-executor = "0.3.1"
futures-timer = "3.0"
thiserror = "1.0"
tokio = { version = "1", features = ["full"] }
nix = "0.22"
rand = "0.7"
yatp = { git = "https://github.com/tikv/yatp.git", branch = "master" }
time = "0.1"
spin = "0.9.2"
lz4 = "1.18.131"
libc = "0.2.11"

[dependencies.lazy_static]
version = "1.2"

[dev-dependencies]
tempfile = "3.1"
criterion = "0.3"

[dev-dependencies.crossbeam-skiplist]
git = "https://github.com/tikv/crossbeam.git"
branch = "tikv-5.0"
package = "crossbeam-skiplist"

[[bench]]
name = "benches"
path = "tests/benches/mod.rs"
harness = false

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# rocksdb-rs
A Rust version of RocksDB.
## Why we need to build a Rust version of RocksDB

### Clean and simple architecture

RocksDB is a general-purpose storage engine used by many kinds of databases.
One of its most important applications is MyRocks, which replaces InnoDB as the
storage engine inside MySQL. But most users in the RocksDB community do not
need a transaction engine for MySQL; they just want a simple, well-performing
KV engine. RocksDB has merged many features that most of us will never enable,
and they have made the project hard to maintain. I want to build a simple
engine that is easy to maintain and serves plain KV applications well.

### Better support for asynchronous IO

RocksDB does not support asynchronous IO. And it is not only IO: other methods
such as `Ingest` and `CreateColumnFamily` are synchronous as well, which means
any of them may block the calling thread for a long time. In a cloud
environment this problem is even worse, because the latency of cloud disks is
much higher than that of a local NVMe SSD.

## Development Guide

### Model And Architecture

Our engine has five main modules: `WAL`, `MANIFEST`, `Version`, `Compaction`,
and `Table`.

* The `WAL` module assigns a sequence number to every write and then appends
the writes to a write-ahead-log file. It runs as an independent future task,
and some other jobs, such as ingest, may also be processed in this module. You
can think of it as a combination of `write_thread` and `WriteToWAL` in
RocksDB. The file format is compatible with RocksDB, so this engine can start
from an existing RocksDB directory. (The internal key layout produced by
assigning sequence numbers is sketched after this list.)
* The `MANIFEST` module persists changes to the set of SST files, including
the results of compaction and flush jobs.
* The most important structures of the `Version` module are `VersionSet` and
`KernelNumberContext`; I split them out of RocksDB's `VersionSet`. If an
operation can be expressed as an atomic operation, it lives in
`KernelNumberContext`; otherwise it must acquire the lock guard of the
mutex-protected `VersionSet`. `VersionSet` manages the metadata of every
`ColumnFamily`, and each `ColumnFamily` owns a `SuperVersion`, which holds the
collection of `Memtable`s and the collection of `SSTable`s. A `SuperVersion`
consists of a `MemtableList` and a `Version`. Every time we switch the
memtable of a `ColumnFamily`, we create a new `SuperVersion` from the new
`Memtable` and the old `Version`; every time a compaction or flush job
finishes, we create a new `SuperVersion` from the old `Memtable` and the new
`Version`.
* The `Compaction` module contains all the code for `Compaction` and `Flush`.
* The `Table` module contains the SSTable format and the read/write
operations on top of it.
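
For reference, every entry stored by the engine uses an *internal key*: the
user key followed by an 8-byte little-endian footer that packs the sequence
number and the value type as `(sequence << 8) | type` (see
`pack_sequence_and_type` in `src/common/format.rs`). A minimal sketch of the
encoding:

```rust
// Internal key for user key "key1", written at sequence 7 as a plain
// value (ValueType::TypeValue = 0x1).
let seq: u64 = 7;
let tp: u64 = 0x1; // ValueType::TypeValue
let footer = (seq << 8) | tp; // 0x0701
let mut internal_key = b"key1".to_vec();
internal_key.extend_from_slice(&footer.to_le_bytes());
assert_eq!(internal_key.len(), b"key1".len() + 8);
```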

## TODO List

### Compaction

* Refactor the compaction pick-up strategy and account for the effect of
deleted keys.

### Table

* Support the LZ4 and ZSTD compression algorithms.
* Support a hash index for small data blocks.
* Support a block cache.

### IO

* Support AIO for asynchronous IO. (I currently use user threads as
independent IO threads, but I am not sure whether this is a better solution
than AIO.)

--------------------------------------------------------------------------------
/examples/simple_example.rs:
--------------------------------------------------------------------------------
use rocksdb_rs::AsyncFileSystem;
use rocksdb_rs::DBOptions;
use rocksdb_rs::Engine;
use rocksdb_rs::ReadOptions;
use rocksdb_rs::WriteBatch;
use std::sync::Arc;

type Result<T> = std::result::Result<T, Box<dyn std::error::Error>>;

#[tokio::main(flavor = "current_thread")]
async fn main() -> Result<()> {
    // Open engine
    let dir = tempfile::Builder::new()
        .prefix("db_simple_example")
        .tempdir()?;
    let mut db_options = DBOptions::default();
    db_options.fs = Arc::new(AsyncFileSystem::new(2));
    db_options.db_path = dir.path().to_str().unwrap().to_string();

    let mut engine = Engine::open(db_options.clone(), vec![], None).await?;

    // Put key value
    let mut wb = WriteBatch::new();
    wb.put(b"key1", b"value1");
    engine.write(&mut wb).await?;

    // Get value
    let opts = ReadOptions::default();
    let v = engine.get(&opts, 0, b"key1").await?;
    assert!(v.is_some());
    assert_eq!(v.unwrap(), b"value1".to_vec());

    // Atomically apply a set of updates
    let mut wb = WriteBatch::new();
    wb.put(b"key2", b"value");
    wb.delete(b"key1");
    engine.write(&mut wb).await?;

    let v = engine.get(&opts, 0, b"key1").await?;
    assert!(v.is_none(), "value is {:?}", v);

    let v = engine.get(&opts, 0, b"key2").await?;
    assert!(v.is_some());
    assert_eq!(v.unwrap(), b"value".to_vec());

    Ok(())
}

--------------------------------------------------------------------------------
/src/common/error.rs:
--------------------------------------------------------------------------------
use std::io;
use std::result;

use thiserror::Error;

#[derive(Debug, Error)]
pub enum Error {
    #[error("Invalid Configuration: {0}")]
    Config(String),
    #[error("IO error: {0}")]
    Io(#[source] Box<io::Error>),
    #[error("Empty key")]
    EmptyKey,
    #[error("Too long: {0}")]
    TooLong(String),
    #[error("Unsupported feature: {0}")]
    Unsupported(String),
    #[error("Invalid checksum")]
    InvalidChecksum(String),
    #[error("Invalid filename")]
    InvalidFile(String),
    #[error("Invalid data: {0}")]
    VarDecode(&'static str),
    #[error("Error when reading table: {0}")]
    TableRead(String),
    #[error("Database Closed")]
    DBClosed,
    #[error("Task Cancel because of: {0}")]
    Cancel(&'static str),
    #[error("Error when reading from log: {0}")]
    LogRead(String),
    #[error("Invalid column family id: {0}")]
    InvalidColumnFamily(u32),
    #[error("Error when compaction: {0}")]
    CompactionError(String),
    #[error("Other Error: {0}")]
    Other(String),
}

impl From<io::Error> for Error {
    #[inline]
    fn from(e: io::Error) -> Error {
        Error::Io(Box::new(e))
    }
}
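
// `io::Error` is not `Clone`, so cloning an `Error::Io` downgrades it to an
// `Error::Other` carrying the formatted message.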
impl Clone for Error {
    fn clone(&self) -> Self {
        match self {
            Error::Config(e) => Error::Config(e.clone()),
            Error::Io(e) => Error::Other(format!("IO Error: {:?}", e)),
            Error::EmptyKey => Error::EmptyKey,
            Error::TooLong(e) => Error::TooLong(e.clone()),
            Error::InvalidChecksum(e) => Error::InvalidChecksum(e.clone()),
            Error::InvalidFile(e) => Error::InvalidFile(e.clone()),
            Error::VarDecode(e) => Error::VarDecode(*e),
            Error::TableRead(e) => Error::TableRead(e.clone()),
            Error::DBClosed => Error::DBClosed,
            Error::Cancel(e) => Error::Cancel(*e),
            Error::LogRead(e) => Error::LogRead(e.clone()),
            Error::InvalidColumnFamily(e) => Error::InvalidColumnFamily(*e),
            Error::Other(e) => Error::Other(e.clone()),
            Error::CompactionError(e) => Error::CompactionError(e.clone()),
            Error::Unsupported(e) => Error::Unsupported(e.clone()),
        }
    }
}

pub type Result<T> = result::Result<T, Error>;

--------------------------------------------------------------------------------
/src/common/file.rs:
--------------------------------------------------------------------------------
use super::{Error, Result};
use std::path::PathBuf;

const ROCKS_DB_TFILE_EXT: &str = "sst";
const UNENCRYPTED_TEMP_FILE_NAME_SUFFIX: &str = "dbtmp.plain";

#[derive(Clone, Eq, PartialEq, Debug)]
pub enum DBFileType {
    Current,
    DescriptorFile,
    LogFile,
}

pub fn make_current_file(path: &str) -> PathBuf {
    let p = format!("{}/CURRENT", path);
    PathBuf::from(p)
}

pub fn make_descriptor_file_name(path: &str, number: u64) -> PathBuf {
    let p = format!("{}/MANIFEST-{:06}", path, number);
    PathBuf::from(p)
}

pub fn make_file_name(path: &str, number: u64, suffix: &str) -> PathBuf {
    let p = format!("{}/{:06}.{}", path, number, suffix);
    PathBuf::from(p)
}

pub fn make_log_file(path: &str, number: u64) -> PathBuf {
    make_file_name(path, number, "log")
}

// TODO: support multi path
pub fn make_table_file_name(path: &str, number: u64) -> PathBuf {
    make_file_name(path, number, ROCKS_DB_TFILE_EXT)
}

pub fn make_temp_plain_file_name(path: &str, number: u64) -> PathBuf {
    // Temporary plain files get their own suffix rather than the SST
    // extension, so they can never be mistaken for live table files.
    make_file_name(path, number, UNENCRYPTED_TEMP_FILE_NAME_SUFFIX)
}
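
// Examples of the generated names:
//   make_descriptor_file_name("/db", 123) -> "/db/MANIFEST-000123"
//   make_table_file_name("/db", 7)        -> "/db/000007.sst"
//   make_log_file("/db", 456)             -> "/db/000456.log"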
pub fn parse_file_name(fname: &str) -> Result<(DBFileType, u64)> {
    if fname == "CURRENT" {
        return Ok((DBFileType::Current, 0));
    } else if fname.starts_with("MANIFEST-") {
        let prefix = fname.trim_start_matches("MANIFEST-");
        let y = prefix.parse::<u64>().map_err(|e| {
            Error::InvalidFile(format!(
                "cannot parse file {} as a manifest, Error: {:?}",
                fname, e
            ))
        })?;
        return Ok((DBFileType::DescriptorFile, y));
    } else if fname.ends_with(".log") {
        let prefix = fname.trim_end_matches(".log");
        let y = prefix.parse::<u64>().map_err(|e| {
            Error::InvalidFile(format!(
                "cannot parse file {} as a log file, Error: {:?}",
                fname, e
            ))
        })?;
        return Ok((DBFileType::LogFile, y));
    }
    Err(Error::InvalidFile(format!("cannot parse file {}", fname)))
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_write_file() {
        let p = make_descriptor_file_name("/home/rocksdb", 123);
        let m = PathBuf::from("/home/rocksdb/MANIFEST-000123");
        assert_eq!(p, m);
        let (tp, n) = parse_file_name(m.file_name().unwrap().to_str().unwrap()).unwrap();
        assert_eq!(tp, DBFileType::DescriptorFile);
        assert_eq!(n, 123);
        let p = make_log_file("/home/rocksdb", 456);
        let l = PathBuf::from("/home/rocksdb/000456.log");
        assert_eq!(p, l);
        let (tp, n) = parse_file_name(l.file_name().unwrap().to_str().unwrap()).unwrap();
        assert_eq!(tp, DBFileType::LogFile);
        assert_eq!(n, 456);
    }
}

--------------------------------------------------------------------------------
/src/common/file_system/reader.rs:
--------------------------------------------------------------------------------
use crate::common::file_system::SequentialFile;
use crate::common::RandomAccessFile;
use crate::common::Result;

pub struct RandomAccessFileReader {
    file: Box<dyn RandomAccessFile>,
    filename: String,
}

impl RandomAccessFileReader {
    pub fn new(file: Box<dyn RandomAccessFile>, filename: String) -> Self {
        Self { file, filename }
    }

    pub async fn read_exact(&self, offset: usize, n: usize, buf: &mut [u8]) -> Result<usize> {
        self.file.read_exact(offset, n, buf).await
    }

    pub async fn read(&self, offset: usize, buf: &mut [u8]) -> Result<usize> {
        self.file.read(offset, buf).await
    }

    pub fn name(&self) -> &str {
        self.filename.as_str()
    }

    pub fn use_direct_io(&self) -> bool {
        self.file.use_direct_io()
    }

    pub fn file_size(&self) -> usize {
        self.file.file_size()
    }
}

pub struct SequentialFileReader {
    file: Box<dyn SequentialFile>,
    filename: String,
}

impl SequentialFileReader {
    pub fn new(file: Box<dyn SequentialFile>, filename: String) -> Self {
        Self { file, filename }
    }

    pub async fn read(&mut self, buf: &mut [u8]) -> Result<usize> {
        self.file.read_sequential(buf).await
    }

    pub fn name(&self) -> &str {
        self.filename.as_str()
    }

    pub fn use_direct_io(&self) -> bool {
        false
    }

    pub fn file_size(&self) -> usize {
        self.file.get_file_size()
    }
}

--------------------------------------------------------------------------------
/src/common/file_system/writer.rs:
--------------------------------------------------------------------------------
use crate::common::Result;
use crate::common::WritableFile;

pub struct WritableFileWriter {
    file_name: String,
    writable_file: Box<dyn WritableFile>,
    buf: Vec<u8>,
    file_size: usize,
    last_sync_size: u64,
    max_buffer_size: usize,
}

impl WritableFileWriter {
    pub fn new(
        writable_file: Box<dyn WritableFile>,
        file_name: String,
        max_buffer_size: usize,
    ) -> Self {
        let file_size = writable_file.get_file_size();
        WritableFileWriter {
            file_name,
            writable_file,
            buf: Vec::with_capacity(std::cmp::min(65536, max_buffer_size)),
            last_sync_size: 0,
            file_size,
            max_buffer_size,
        }
    }
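
    // Buffering policy: small appends accumulate in `buf` and are written out
    // once `buf` reaches `max_buffer_size`; an append that is itself at least
    // `max_buffer_size` (or any append when buffering is disabled with
    // `max_buffer_size == 0`) bypasses the buffer, provided the buffer is
    // empty.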
    pub async fn append(&mut self, data: &[u8]) -> Result<()> {
        self.file_size += data.len();
        if self.max_buffer_size == 0 {
            self.writable_file.append(data).await?;
        } else if self.buf.is_empty() && data.len() >= self.max_buffer_size {
            self.writable_file.append(data).await?;
        } else {
            self.buf.extend_from_slice(data);
            if self.buf.len() >= self.max_buffer_size {
                self.writable_file.append(&self.buf).await?;
                self.buf.clear();
            }
        }
        Ok(())
    }

    pub async fn flush(&mut self) -> Result<()> {
        if !self.buf.is_empty() {
            self.writable_file.append(&self.buf).await?;
            self.buf.clear();
        }
        Ok(())
    }

    pub async fn pad(&mut self, pad_bytes: usize) -> Result<()> {
        if self.buf.is_empty() {
            // Write the zero padding directly and leave the buffer empty
            // afterwards; going through `append` here would double-count
            // `file_size`, and leaving the zeros in `buf` would re-write
            // them on the next flush.
            self.file_size += pad_bytes;
            self.buf.resize(pad_bytes, 0);
            self.writable_file.append(&self.buf).await?;
            self.buf.clear();
        } else if pad_bytes < 100 {
            let pad: [u8; 100] = [0u8; 100];
            self.append(&pad[..pad_bytes]).await?;
        } else {
            let pad = vec![0u8; pad_bytes];
            self.append(&pad).await?;
        }
        Ok(())
    }

    pub async fn sync(&mut self) -> Result<()> {
        if !self.buf.is_empty() {
            self.flush().await?;
        }
        self.writable_file.sync().await?;
        Ok(())
    }

    pub fn file_size(&self) -> usize {
        self.file_size
    }
}

--------------------------------------------------------------------------------
/src/common/format.rs:
--------------------------------------------------------------------------------
use crate::common::DISABLE_GLOBAL_SEQUENCE_NUMBER;
use crate::util::decode_fixed_uint64;

#[repr(u8)]
#[derive(Eq, PartialEq, Clone, Copy, Debug)]
pub enum CompressionType {
    NoCompression = 0x0,
    SnappyCompression = 0x1,
    ZlibCompression = 0x2,
    BZip2Compression = 0x3,
    LZ4Compression = 0x4,
    LZ4HCCompression = 0x5,
    XpressCompression = 0x6,
    ZSTD = 0x7,

    // Only use ZSTDNotFinalCompression if you have to use ZSTD lib older than
    // 0.8.0 or consider a possibility of downgrading the service or copying
    // the database files to another service running with an older version of
    // RocksDB that doesn't have ZSTD. Otherwise, you should use ZSTD. We will
    // eventually remove the option from the public API.
    ZSTDNotFinalCompression = 0x40,

    // DisableCompressionOption is used to disable some compression options.
    DisableCompressionOption = 0xff,
}

impl From<u8> for CompressionType {
    fn from(x: u8) -> Self {
        unsafe { std::mem::transmute(x) }
    }
}

#[repr(u8)]
#[derive(Eq, PartialEq, Clone, Copy, Debug)]
pub enum ValueType {
    TypeDeletion = 0x0,
    TypeValue = 0x1,
    TypeMerge = 0x2,
    TypeLogData = 0x3,                   // WAL only.
    TypeColumnFamilyDeletion = 0x4,      // WAL only.
    TypeColumnFamilyValue = 0x5,         // WAL only.
    TypeColumnFamilyMerge = 0x6,         // WAL only.

    TypeColumnFamilyRangeDeletion = 0xE, // WAL only.
    TypeRangeDeletion = 0xF,             // meta block
    TypeColumnFamilyBlobIndex = 0x10,    // Blob DB only
    TypeBlobIndex = 0x11,                // Blob DB only
    MaxValue = 0x7F,                     // Not used for storing records.
}

impl From<u8> for ValueType {
    fn from(x: u8) -> Self {
        unsafe { std::mem::transmute(x) }
    }
}

#[derive(Default, Clone)]
pub struct Slice {
    pub offset: usize,
    pub limit: usize,
}

impl Slice {
    pub fn len(&self) -> usize {
        if self.offset > self.limit {
            0
        } else {
            self.limit - self.offset
        }
    }
}

pub const VALUE_TYPE_FOR_SEEK: u8 = ValueType::TypeBlobIndex as u8;
pub const VALUE_TYPE_FOR_SEEK_FOR_PREV: u8 = ValueType::TypeDeletion as u8;
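
// An internal key is a user key plus an 8-byte little-endian footer holding
// (sequence << 8) | value_type; e.g. pack_sequence_and_type(7, 1) == 0x0701.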
#[inline(always)]
pub fn pack_sequence_and_type(seq: u64, t: u8) -> u64 {
    (seq << 8) | t as u64
}

pub fn pack_sequence_and_type_to_key(key: &[u8], seq: u64, t: ValueType) -> Vec<u8> {
    let mut ret = Vec::with_capacity(key.len() + 8);
    ret.extend_from_slice(key);
    ret.extend_from_slice(&pack_sequence_and_type(seq, t as u8).to_le_bytes());
    ret
}

#[inline(always)]
pub fn extract_user_key(key: &[u8]) -> &[u8] {
    let l = key.len();
    &key[..(l - 8)]
}

#[inline(always)]
pub fn extract_internal_key_footer(key: &[u8]) -> u64 {
    // Caller must guarantee that `key` holds at least 8 bytes.
    unsafe { u64::from_le_bytes(*(key as *const _ as *const [u8; 8])) }
}

#[inline(always)]
pub fn extract_value_type(key: &[u8]) -> u8 {
    let l = key.len();
    assert!(l >= 8);
    let num = extract_internal_key_footer(&key[(l - 8)..]);
    (num & 0xffu64) as u8
}

#[inline(always)]
pub fn is_value_type(t: u8) -> bool {
    t <= ValueType::TypeMerge as u8 || t == ValueType::TypeBlobIndex as u8
}

pub fn is_extended_value_type(t: u8) -> bool {
    t <= ValueType::TypeMerge as u8
        || t == ValueType::TypeBlobIndex as u8
        || t == ValueType::TypeRangeDeletion as u8
}
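
// Overrides the 8-byte footer of every key it is given with a fixed "global"
// sequence number, keeping the original value type. This mirrors RocksDB,
// where all keys of an ingested file share one sequence number. A global
// seqno of DISABLE_GLOBAL_SEQUENCE_NUMBER passes keys through unchanged.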
pub struct GlobalSeqnoAppliedKey {
    internal_key: Vec<u8>,
    buf: Vec<u8>,
    global_seqno: u64,
    is_user_key: bool,
}

impl GlobalSeqnoAppliedKey {
    pub fn new(global_seqno: u64, is_user_key: bool) -> Self {
        Self {
            internal_key: vec![],
            buf: vec![],
            global_seqno,
            is_user_key,
        }
    }

    pub fn get_key(&self) -> &[u8] {
        &self.internal_key
    }

    pub fn get_user_key(&self) -> &[u8] {
        if self.is_user_key {
            &self.internal_key
        } else {
            extract_user_key(&self.internal_key)
        }
    }

    pub fn set_user_key(&mut self, key: &[u8]) {
        self.is_user_key = true;
        self.set_key(key);
    }

    pub fn set_key(&mut self, key: &[u8]) {
        if self.global_seqno == DISABLE_GLOBAL_SEQUENCE_NUMBER {
            self.internal_key.clear();
            self.internal_key.extend_from_slice(key);
            return;
        }
        let tail_offset = key.len() - 8;
        let num = decode_fixed_uint64(&key[tail_offset..]);
        self.buf.clear();
        self.buf.extend_from_slice(key);
        self.internal_key.clear();
        self.internal_key.extend_from_slice(&key[..tail_offset]);
        let num = pack_sequence_and_type(self.global_seqno, (num & 0xff) as u8);
        self.internal_key.extend_from_slice(&num.to_le_bytes());
    }

    pub fn update_internal_key(&mut self, seq: u64, tp: ValueType) {
        let newval = (seq << 8) | ((tp as u8) as u64);
        let l = self.internal_key.len() - 8;
        self.internal_key[l..].copy_from_slice(&newval.to_le_bytes());
    }

    pub fn trim_append(&mut self, key: &[u8], shared: usize) {
        if self.global_seqno == DISABLE_GLOBAL_SEQUENCE_NUMBER {
            self.internal_key.resize(shared, 0);
            self.internal_key.extend_from_slice(key);
            return;
        }
        self.buf.resize(shared, 0);
        self.buf.extend_from_slice(key);
        let tail_offset = self.buf.len() - 8;
        let num = decode_fixed_uint64(&self.buf[tail_offset..]);
        let num = pack_sequence_and_type(self.global_seqno, (num & 0xff) as u8);
        if self.internal_key.len() > self.buf.len() {
            let limit = shared + key.len();
            self.internal_key[shared..limit].copy_from_slice(key);
            self.internal_key.resize(limit, 0);
            self.internal_key[tail_offset..limit].copy_from_slice(&num.to_le_bytes());
            assert_eq!(limit, self.buf.len());
        } else {
            self.internal_key.clear();
            self.internal_key
                .extend_from_slice(&self.buf[..tail_offset]);
            self.internal_key.extend_from_slice(&num.to_le_bytes());
        }
    }
}

pub struct ParsedInternalKey<'a> {
    key: &'a [u8],
    pub tp: ValueType,
    pub sequence: u64,
    user_key: Slice,
}

impl<'a> ParsedInternalKey<'a> {
    pub fn new(key: &'a [u8]) -> Self {
        let l = key.len();
        if l < 8 {
            Self {
                key,
                tp: ValueType::MaxValue,
                sequence: 0,
                user_key: Slice::default(),
            }
        } else {
            let offset = l - 8;
            let x = decode_fixed_uint64(&key[offset..]);
            let c = (x & 0xff) as u8;
            let sequence = x >> 8;
            Self {
                key,
                tp: c.into(),
                sequence,
                user_key: Slice {
                    offset: 0,
                    limit: offset,
                },
            }
        }
    }

    pub fn valid(&self) -> bool {
        self.user_key.limit > 0
    }

    pub fn user_key(&self) -> &[u8] {
        &self.key[..self.user_key.limit]
    }
}

--------------------------------------------------------------------------------
/src/common/mod.rs:
--------------------------------------------------------------------------------
mod error;
mod file;
mod file_system;
pub mod format;
mod slice_transform;
mod snapshot;
pub use file::*;

use crate::util::decode_fixed_uint64;
pub use slice_transform::{InternalKeySliceTransform, SliceTransform};

pub use error::Error;
pub use file_system::{
    AsyncFileSystem, FileSystem, IOOption, InMemFileSystem, RandomAccessFile,
    RandomAccessFileReader, SequentialFile, SequentialFileReader, SyncPosixFileSystem,
    WritableFile, WritableFileWriter,
};
pub type Result<T> = std::result::Result<T, Error>;

pub use format::{extract_user_key, CompressionType, ValueType, VALUE_TYPE_FOR_SEEK};
use std::cmp::Ordering;
use std::sync::Arc;

pub const MAX_SEQUENCE_NUMBER: u64 = (1u64 << 56) - 1;
pub const DISABLE_GLOBAL_SEQUENCE_NUMBER: u64 = u64::MAX;

pub trait KeyComparator: Send + Sync {
    fn name(&self) -> &str;
    fn compare_key(&self, lhs: &[u8], rhs: &[u8]) -> Ordering;
    fn less_than(&self, lhs: &[u8], rhs: &[u8]) -> bool {
        self.compare_key(lhs, rhs) == Ordering::Less
    }
    fn same_key(&self, lhs: &[u8], rhs: &[u8]) -> bool {
        self.compare_key(lhs, rhs) == Ordering::Equal
    }
    fn find_shortest_separator(&self, start: &mut Vec<u8>, limit: &[u8]);
    fn find_short_successor(&self, key: &mut Vec<u8>) {
        // Find first character that can be incremented
        let n = key.len();
        for i in 0..n {
            let byte = key[i];
            if byte != 0xff {
                key[i] = byte + 1;
                key.resize(i + 1, 0);
                return;
            }
        }
        // *key is a run of 0xffs. Leave it alone.
    }
}

#[derive(Default, Clone)]
pub struct DefaultUserComparator {}

impl KeyComparator for DefaultUserComparator {
    fn name(&self) -> &str {
        "leveldb.BytewiseComparator"
    }

    fn compare_key(&self, lhs: &[u8], rhs: &[u8]) -> Ordering {
        lhs.cmp(rhs)
    }

    fn same_key(&self, lhs: &[u8], rhs: &[u8]) -> bool {
        lhs.eq(rhs)
    }
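
    // Shortens `start` in place to a separator S with start <= S < limit,
    // used to cut shorter index-block boundary keys. For example,
    // start = "abcde" and limit = "abxyz" shortens `start` to "abd".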
    fn find_shortest_separator(&self, start: &mut Vec<u8>, limit: &[u8]) {
        let l = std::cmp::min(start.len(), limit.len());
        let mut diff_index = 0;
        while diff_index < l && start[diff_index] == limit[diff_index] {
            diff_index += 1;
        }
        if diff_index < l {
            let start_byte = start[diff_index];
            let limit_byte = limit[diff_index];
            if start_byte >= limit_byte {
                return;
            }
            if diff_index + 1 < limit.len() || start_byte + 1 < limit_byte {
                start[diff_index] += 1;
                start.resize(diff_index + 1, 0);
            } else {
                diff_index += 1;
                while diff_index < start.len() {
                    if start[diff_index] < 0xffu8 {
                        start[diff_index] += 1;
                        start.resize(diff_index + 1, 0);
                        break;
                    }
                    diff_index += 1;
                }
            }
        }
    }
}
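
// Orders internal keys by user key ascending and then by sequence number
// descending, so that the newest version of a user key sorts first.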
#[derive(Clone)]
pub struct InternalKeyComparator {
    user_comparator: Arc<dyn KeyComparator>,
    name: String,
}

impl Default for InternalKeyComparator {
    fn default() -> Self {
        InternalKeyComparator::new(Arc::new(DefaultUserComparator::default()))
    }
}

impl InternalKeyComparator {
    pub fn new(user_comparator: Arc<dyn KeyComparator>) -> InternalKeyComparator {
        let mut name = "rocksdb.InternalKeyComparator:".to_string();
        name.push_str(user_comparator.name());
        InternalKeyComparator {
            user_comparator,
            name,
        }
    }

    pub fn get_user_comparator(&self) -> &Arc<dyn KeyComparator> {
        &self.user_comparator
    }
}

impl KeyComparator for InternalKeyComparator {
    fn name(&self) -> &str {
        &self.name
    }

    #[inline]
    fn compare_key(&self, lhs: &[u8], rhs: &[u8]) -> Ordering {
        let mut ret = self
            .user_comparator
            .compare_key(extract_user_key(lhs), extract_user_key(rhs));
        if ret == Ordering::Equal {
            let l = lhs.len() - 8;
            let r = rhs.len() - 8;
            let anum = decode_fixed_uint64(&lhs[l..]);
            let bnum = decode_fixed_uint64(&rhs[r..]);
            // Identical footers must compare equal, otherwise `same_key`
            // would never hold for two identical internal keys.
            ret = if anum > bnum {
                Ordering::Less
            } else if anum < bnum {
                Ordering::Greater
            } else {
                Ordering::Equal
            };
        }
        ret
    }

    fn find_shortest_separator(&self, start: &mut Vec<u8>, limit: &[u8]) {
        let user_start = extract_user_key(start);
        let user_limit = extract_user_key(limit);
        let mut tmp = user_start.to_vec();
        self.user_comparator
            .find_shortest_separator(&mut tmp, user_limit);
        if tmp.len() <= user_start.len()
            && self.user_comparator.compare_key(user_start, &tmp) == Ordering::Less
        {
            tmp.extend_from_slice(
                &format::pack_sequence_and_type(MAX_SEQUENCE_NUMBER, VALUE_TYPE_FOR_SEEK)
                    .to_le_bytes(),
            );
            std::mem::swap(start, &mut tmp);
        }
    }

    fn find_short_successor(&self, key: &mut Vec<u8>) {
        let user_key = extract_user_key(key);
        let mut tmp = user_key.to_vec();
        self.user_comparator.find_short_successor(&mut tmp);
        if tmp.len() <= user_key.len()
            && self.user_comparator.compare_key(user_key, &tmp) == Ordering::Less
        {
            tmp.extend_from_slice(
                &format::pack_sequence_and_type(MAX_SEQUENCE_NUMBER, VALUE_TYPE_FOR_SEEK)
                    .to_le_bytes(),
            );
            std::mem::swap(key, &mut tmp);
        }
    }
}

--------------------------------------------------------------------------------
/src/common/options.rs:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rust-lib-project/calibur/0085de0cb16e4e785698f62900ad0ae452aab194/src/common/options.rs

--------------------------------------------------------------------------------
/src/common/slice_transform.rs:
--------------------------------------------------------------------------------
use crate::common::format::extract_user_key;
use std::sync::Arc;

pub trait SliceTransform: Send + Sync {
    fn name(&self) -> &'static str;
    fn transform<'a>(&self, key: &'a [u8]) -> &'a [u8];
    fn in_domain(&self, key: &[u8]) -> bool;
    fn in_range(&self, _key: &[u8]) -> bool {
        false
    }
}

pub struct InternalKeySliceTransform {
    transform: Arc<dyn SliceTransform>,
}

impl InternalKeySliceTransform {
    pub fn new(transform: Arc<dyn SliceTransform>) -> Self {
        Self { transform }
    }

    pub fn user_prefix_extractor(&self) -> Arc<dyn SliceTransform> {
        self.transform.clone()
    }
}

impl Default for InternalKeySliceTransform {
    fn default() -> Self {
        InternalKeySliceTransform::new(Arc::new(NoopTransform::default()))
    }
}

impl SliceTransform for InternalKeySliceTransform {
    fn name(&self) -> &'static str {
        self.transform.name()
    }

    fn transform<'a>(&self, key: &'a [u8]) -> &'a [u8] {
        self.transform.transform(extract_user_key(key))
    }

    fn in_domain(&self, key: &[u8]) -> bool {
        self.transform.in_domain(extract_user_key(key))
    }

    fn in_range(&self, key: &[u8]) -> bool {
        self.transform.in_range(extract_user_key(key))
    }
}

#[derive(Clone, Default)]
pub struct NoopTransform {}

impl SliceTransform for NoopTransform {
    fn name(&self) -> &'static str {
        "NoopTransform"
    }

    fn transform<'a>(&self, key: &'a [u8]) -> &'a [u8] {
        key
    }

    fn in_domain(&self, _key: &[u8]) -> bool {
        true
    }
}

--------------------------------------------------------------------------------
/src/common/snapshot.rs:
--------------------------------------------------------------------------------
pub struct Snapshot {}

pub struct SnapshotList {}

--------------------------------------------------------------------------------
/src/compaction/compaction_job.rs:
--------------------------------------------------------------------------------
use std::collections::HashMap;
use std::sync::Arc;

use crate::common::format::extract_user_key;
use crate::common::CompressionType;
use crate::common::{make_table_file_name, Result};
use crate::compaction::compaction_iter::CompactionIter;
use crate::compaction::{CompactionEngine, CompactionRequest};
use crate::iterator::{AsyncIterator, AsyncMergingIterator, TwoLevelIterator, VecTableAccessor};
use crate::table::TableBuilderOptions;
use crate::util::BtreeComparable;
use crate::version::{FileMetaData, KernelNumberContext, TableFile, VersionEdit};
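
// Merges the compaction inputs and writes the surviving entries into new SST
// files at `request.output_level`. Level-0 files may overlap each other, so
// each one gets its own iterator; files from higher levels are
// non-overlapping, so all tables of one level can share a single two-level
// iterator.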
pub async fn run_compaction_job<Engine: CompactionEngine>(
    mut engine: Engine,
    request: CompactionRequest,
    kernel: Arc<KernelNumberContext>,
) -> Result<()> {
    let mut iters: Vec<Box<dyn AsyncIterator>> = vec![];
    let mut level_tables: HashMap<u32, Vec<Arc<TableFile>>> = HashMap::default();
    for (level, f) in request.input.iter() {
        if *level > 0 {
            if let Some(files) = level_tables.get_mut(level) {
                files.push(f.clone());
            } else {
                level_tables.insert(*level, vec![f.clone()]);
            }
        } else {
            iters.push(f.reader.new_iterator());
        }
    }
    for (_level, mut tables) in level_tables {
        if tables.len() > 1 {
            let accessor = VecTableAccessor::new(tables);
            let two_level_iter = TwoLevelIterator::new(accessor);
            iters.push(Box::new(two_level_iter));
        } else {
            let table = tables.pop().unwrap();
            iters.push(table.reader.new_iterator());
        }
    }
    let iter = AsyncMergingIterator::new(iters, request.cf_options.comparator.clone());
    let user_comparator = request.cf_options.comparator.get_user_comparator().clone();
    let mut compact_iter =
        CompactionIter::new_with_async(Box::new(iter), user_comparator.clone(), vec![], false);
    compact_iter.seek_to_first().await;

    let mut meta = FileMetaData::new(
        kernel.new_file_number(),
        request.output_level,
        vec![],
        vec![],
    );
    let fname = make_table_file_name(&request.options.db_path, meta.id());
    let file = request.options.fs.open_writable_file_writer(&fname)?;
    let mut build_opts = TableBuilderOptions::default();
    build_opts.skip_filter = false;
    build_opts.column_family_id = request.cf;
    build_opts.compression_type = CompressionType::NoCompression;
    build_opts.target_file_size = 0;
    build_opts.internal_comparator = request.cf_options.comparator.clone();

    let mut builder = request.cf_options.factory.new_builder(&build_opts, file)?;
    let mut metas = vec![];
    while compact_iter.valid() {
        let key = compact_iter.key();
        let value = compact_iter.value();
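        // Cut a new output file once the current one exceeds the target
        // size, but never between two entries with the same user key: all
        // versions of a user key must end up in the same output file.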
        if builder.file_size() > request.target_file_size_base as u64
            && !user_comparator
                .same_key(extract_user_key(builder.last_key()), extract_user_key(key))
        {
            builder.finish().await?;
            meta.fd.file_size = builder.file_size();
            meta.num_entries = builder.num_entries();
            metas.push(meta);
            meta = FileMetaData::new(
                kernel.new_file_number(),
                request.output_level,
                vec![],
                vec![],
            );
            let fname = make_table_file_name(&request.options.db_path, meta.id());
            let file = request.options.fs.open_writable_file_writer(&fname)?;
            builder = request.cf_options.factory.new_builder(&build_opts, file)?;
        } else if builder.should_flush() {
            builder.flush().await?;
        }
        builder.add(key, value)?;
        meta.update_boundary(key, compact_iter.current_sequence());
        compact_iter.next().await;
    }
    drop(compact_iter);
    builder.finish().await?;
    meta.fd.file_size = builder.file_size();
    meta.num_entries = builder.num_entries();
    metas.push(meta);
    let mut edit = VersionEdit::default();
    // edit.prev_log_number = 0;
    // edit.set_log_number(mems[i].last().unwrap().get_next_log_number());
    for m in metas {
        edit.add_file(
            request.output_level,
            m.id(),
            m.fd.file_size,
            m.smallest.as_ref(),
            m.largest.as_ref(),
            m.fd.smallest_seqno,
            m.fd.largest_seqno,
        );
    }
    for (level, table) in request.input.iter() {
        edit.delete_file(*level, table.id());
    }
    edit.column_family = request.cf;
    engine.apply(vec![edit]).await
}

--------------------------------------------------------------------------------
/src/compaction/flush_job.rs:
--------------------------------------------------------------------------------
use crate::common::CompressionType;
use crate::common::{make_table_file_name, InternalKeyComparator, Result};
use crate::compaction::compaction_iter::CompactionIter;
use crate::compaction::{CompactionEngine, FlushRequest};
use crate::iterator::{InternalIterator, MergingIterator};
use crate::memtable::Memtable;
use crate::options::{ColumnFamilyOptions, ImmutableDBOptions};
use crate::sync_point;
use crate::table::TableBuilderOptions;
use crate::version::{FileMetaData, KernelNumberContext, VersionEdit};
use std::collections::HashMap;
use std::sync::Arc;

pub struct FlushJob<E: CompactionEngine> {
    engine: E,
    options: Arc<ImmutableDBOptions>,
    cf_options: Arc<ColumnFamilyOptions>,
    version_edit: VersionEdit,
    mems: Vec<Arc<Memtable>>,
    meta: FileMetaData,
    comparator: InternalKeyComparator,
    snapshots: Vec<u64>,
    cf_id: u32,
}

impl<E: CompactionEngine> FlushJob<E> {
    pub fn new(
        engine: E,
        options: Arc<ImmutableDBOptions>,
        cf_options: Arc<ColumnFamilyOptions>,
        mems: Vec<Arc<Memtable>>,
        comparator: InternalKeyComparator,
        cf_id: u32,
        file_number: u64,
        snapshots: Vec<u64>,
    ) -> Self {
        let mut version_edit = VersionEdit::default();
        version_edit.column_family = cf_id;
        version_edit.prev_log_number = 0;
        version_edit.set_log_number(mems.last().unwrap().get_next_log_number());
        let meta = FileMetaData::new(file_number, 0, vec![], vec![]);
        Self {
            engine,
            options,
            version_edit,
            mems,
            cf_id,
            meta,
            comparator,
            cf_options,
            snapshots,
        }
    }

    fn new_merging_iterator(&self, mems: &[Arc<Memtable>]) -> Box<dyn InternalIterator> {
        if mems.len() == 1 {
            mems[0].new_iterator()
        } else {
            let iters = mems.iter().map(|mem| mem.new_iterator()).collect();
            let iter = MergingIterator::new(iters, mems[0].get_comparator());
            Box::new(iter)
        }
    }
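
    // Merges all immutable memtables of this column family into one Level-0
    // SST. `CompactionIter` is expected to drop key versions that are not
    // visible to any snapshot in `snapshots`.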
    pub async fn run(&mut self) -> Result<FileMetaData> {
        let fname = make_table_file_name(&self.options.db_path, self.meta.id());
        let file = self.options.fs.open_writable_file_writer(&fname)?;
        let mut build_opts = TableBuilderOptions::default();
        build_opts.skip_filter = false;
        build_opts.column_family_id = self.cf_id;
        build_opts.compression_type = CompressionType::NoCompression;
        build_opts.target_file_size = 0;
        build_opts.internal_comparator = self.comparator.clone();
        let mut builder = self.cf_options.factory.new_builder(&build_opts, file)?;
        let iter = self.new_merging_iterator(&self.mems);
        let mut compact_iter = CompactionIter::new(
            iter,
            self.comparator.get_user_comparator().clone(),
            self.snapshots.clone(),
            false,
        );
        compact_iter.seek_to_first().await;
        while compact_iter.valid() {
            let key = compact_iter.key();
            let value = compact_iter.value();
            if builder.should_flush() {
                builder.flush().await?;
            }
            builder.add(key, value)?;
            self.meta
                .update_boundary(key, compact_iter.current_sequence());
            compact_iter.next().await;
        }
        drop(compact_iter);
        builder.finish().await?;
        self.meta.num_entries = builder.num_entries();
        self.meta.fd.file_size = builder.file_size();
        Ok(self.meta.clone())
    }
}

pub async fn run_flush_memtable_job<Engine: CompactionEngine>(
    mut engine: Engine,
    reqs: Vec<FlushRequest>,
    kernel: Arc<KernelNumberContext>,
    options: Arc<ImmutableDBOptions>,
    cf_options: HashMap<u32, Arc<ColumnFamilyOptions>>,
    snapshots: Vec<u64>,
) -> Result<()> {
    let mut mems = vec![];
    for req in &reqs {
        for (cf, mem) in &req.mems {
            while *cf >= mems.len() as u32 {
                mems.push(vec![]);
            }
            mems[(*cf) as usize].push(mem.clone());
        }
    }
    let mut edits = vec![];
    for (i, memtables) in mems.iter().enumerate() {
        if !memtables.is_empty() {
            let file_number = kernel.new_file_number();
            let idx = i as u32;
            let cf_opt = cf_options
                .get(&idx)
                .cloned()
                .unwrap_or_else(|| Arc::new(ColumnFamilyOptions::default()));
            let comparator = cf_opt.comparator.clone();
            let mut job = FlushJob::new(
                engine.clone(),
                options.clone(),
                cf_opt,
                memtables.clone(),
                comparator,
                i as u32,
                file_number,
                snapshots.clone(),
            );
            let meta = job.run().await?;
            let mut edit = VersionEdit::default();
            edit.prev_log_number = 0;
            edit.set_log_number(memtables.last().unwrap().get_next_log_number());
            sync_point!(
                "run_flush_memtable_job",
                edit.get_log_number() * 1000 + i as u64
            );
            edit.add_file(
                0,
                file_number,
                meta.fd.file_size,
                meta.smallest.as_ref(),
                meta.largest.as_ref(),
                meta.fd.smallest_seqno,
                meta.fd.largest_seqno,
            );
            edit.column_family = i as u32;
            edits.push(edit);
        }
    }
    engine.apply(edits).await
}

--------------------------------------------------------------------------------
/src/compaction/mod.rs:
--------------------------------------------------------------------------------
mod compaction_iter;
mod compaction_job;
mod flush_job;
mod picker;

use crate::common::Result;
use crate::memtable::Memtable;
use crate::options::{ColumnFamilyOptions, ImmutableDBOptions};
use crate::version::{TableFile, Version, VersionEdit};
use std::sync::Arc;

pub use compaction_job::run_compaction_job;
pub use flush_job::run_flush_memtable_job;
pub use picker::LevelCompactionPicker;

#[async_trait::async_trait]
pub trait CompactionEngine: Clone + Sync + Send {
    async fn apply(&mut self, edits: Vec<VersionEdit>) -> Result<()>;
}

pub struct CompactionRequest {
    input: Vec<(u32, Arc<TableFile>)>,
    input_version: Arc<Version>,
    cf: u32,
    output_level: u32,
    cf_options: Arc<ColumnFamilyOptions>,
    options: Arc<ImmutableDBOptions>,
    target_file_size_base: usize,
}

pub struct FlushRequest {
    pub mems: Vec<(u32, Arc<Memtable>)>,
    pub wait_commit_request: u64,
}

impl FlushRequest {
    pub fn new(mems: Vec<(u32, Arc<Memtable>)>, wait_commit_request: u64) -> Self {
        Self {
            mems,
            wait_commit_request,
        }
    }
}

--------------------------------------------------------------------------------
/src/iterator/async_merge_iterator.rs:
--------------------------------------------------------------------------------
use crate::common::{InternalKeyComparator, KeyComparator};
use crate::iterator::AsyncIterator;
use std::cmp::Ordering;
use std::collections::BinaryHeap;

struct IteratorWrapper {
    inner: Box<dyn AsyncIterator>,
    comparator: InternalKeyComparator,
}

impl PartialEq for IteratorWrapper {
    fn eq(&self, other: &Self) -> bool {
        if self.inner.valid() && other.inner.valid() {
            return self
                .comparator
                .same_key(self.inner.key(), other.inner.key());
        }
        if !self.inner.valid() && !other.inner.valid() {
            return true;
        }
        false
    }
}

impl Eq for IteratorWrapper {}

impl PartialOrd for IteratorWrapper {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(self.cmp(other))
    }
}
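
// Note the reversed argument order in `compare_key`: `BinaryHeap` is a
// max-heap, so inverting the comparison makes the heap yield the smallest
// key first. Exhausted iterators sort above everything, which lets
// `current_forward` pop them off the top after a `next`/`prev`.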
impl Ord for IteratorWrapper {
    fn cmp(&self, other: &Self) -> Ordering {
        if self.inner.valid() && other.inner.valid() {
            self.comparator
                .compare_key(other.inner.key(), self.inner.key())
        } else if self.inner.valid() {
            Ordering::Less
        } else if other.inner.valid() {
            Ordering::Greater
        } else {
            Ordering::Equal
        }
    }
}

pub struct MergingIterator {
    children: BinaryHeap<IteratorWrapper>,
    other: Vec<IteratorWrapper>,
}

impl MergingIterator {
    pub fn new(iters: Vec<Box<dyn AsyncIterator>>, cmp: InternalKeyComparator) -> Self {
        let other: Vec<IteratorWrapper> = iters
            .into_iter()
            .map(|iter| IteratorWrapper {
                inner: iter,
                comparator: cmp.clone(),
            })
            .collect();
        Self {
            children: BinaryHeap::with_capacity(other.len()),
            other,
        }
    }

    fn current_forward(&mut self) {
        while let Some(x) = self.children.peek() {
            if !x.inner.valid() {
                let iter = self.children.pop().unwrap();
                self.other.push(iter);
            } else {
                break;
            }
        }
    }

    fn collect_iterators(&mut self) -> Vec<IteratorWrapper> {
        let mut iters = Vec::with_capacity(self.other.len() + self.children.len());
        std::mem::swap(&mut iters, &mut self.other);
        while let Some(iter) = self.children.pop() {
            iters.push(iter);
        }
        iters
    }
}

#[async_trait::async_trait]
impl AsyncIterator for MergingIterator {
    fn valid(&self) -> bool {
        self.children
            .peek()
            .map_or(false, |iter| iter.inner.valid())
    }

    async fn seek(&mut self, key: &[u8]) {
        let iters = self.collect_iterators();
        for mut iter in iters {
            iter.inner.seek(key).await;
            if iter.inner.valid() {
                self.children.push(iter);
            } else {
                self.other.push(iter);
            }
        }
    }

    async fn seek_to_first(&mut self) {
        let iters = self.collect_iterators();
        for mut iter in iters {
            iter.inner.seek_to_first().await;
            if iter.inner.valid() {
                self.children.push(iter);
            } else {
                self.other.push(iter);
            }
        }
    }

    async fn seek_to_last(&mut self) {
        let iters = self.collect_iterators();
        for mut iter in iters {
            iter.inner.seek_to_last().await;
            if iter.inner.valid() {
                self.children.push(iter);
            } else {
                self.other.push(iter);
            }
        }
    }

    async fn seek_for_prev(&mut self, key: &[u8]) {
        let iters = self.collect_iterators();
        for mut iter in iters {
            iter.inner.seek_for_prev(key).await;
            if iter.inner.valid() {
                self.children.push(iter);
            } else {
                self.other.push(iter);
            }
        }
    }

    async fn next(&mut self) {
        {
            let mut x = self.children.peek_mut().unwrap();
            x.inner.next().await;
        }
        self.current_forward();
    }

    async fn prev(&mut self) {
        {
            let mut x = self.children.peek_mut().unwrap();
            x.inner.prev().await;
        }
        self.current_forward();
    }

    fn key(&self) -> &[u8] {
        self.children.peek().unwrap().inner.key()
    }

    fn value(&self) -> &[u8] {
        self.children.peek().unwrap().inner.value()
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::common::extract_user_key;
    use crate::table::InMemTableIterator;
    use tokio::runtime::Runtime;

    #[test]
    fn test_merge_iterator() {
        let mut tables = vec![];
        let mut data = vec![];
        let v = b"v00000000000001";
        let mut ikey = vec![];
        let comparator = InternalKeyComparator::default();
        let mut keys = vec![];
        for i in 0..100 {
            for j in 0..100 {
                let k = (i * 100 + j).to_string();
                ikey.clear();
                ikey.extend_from_slice(k.as_bytes());
                ikey.extend_from_slice(&(i as u64).to_le_bytes());
                data.push((ikey.clone(), v.to_vec()));
                if data.len() > 1600 {
                    let table: Box<dyn AsyncIterator> =
                        Box::new(InMemTableIterator::new(data.clone(), &comparator));
                    tables.push(table);
                    data.clear();
                }
                keys.push(k);
            }
        }
        if !data.is_empty() {
            let table: Box<dyn AsyncIterator> =
                Box::new(InMemTableIterator::new(data, &comparator));
            tables.push(table);
        }
        let mut iter = MergingIterator::new(tables, comparator);
        let r = Runtime::new().unwrap();
        r.block_on(iter.seek_to_first());
        let mut i = 0;
        keys.sort();
        while iter.valid() {
            let k = iter.key();
            if let Ok(user_key) = String::from_utf8(extract_user_key(k).to_vec()) {
                assert_eq!(user_key, keys[i]);
            }
            r.block_on(iter.next());
            i += 1;
        }
    }
}

--------------------------------------------------------------------------------
/src/iterator/db_iterator.rs:
--------------------------------------------------------------------------------
use crate::common::format::{
    pack_sequence_and_type, ParsedInternalKey, ValueType, VALUE_TYPE_FOR_SEEK,
    VALUE_TYPE_FOR_SEEK_FOR_PREV,
};
use crate::iterator::AsyncIterator;
use crate::KeyComparator;
use std::sync::Arc;

pub struct DBIterator {
    user_comparator: Arc<dyn KeyComparator>,
    inner: Box<dyn AsyncIterator>,
    sequence: u64,
    buf: Vec<u8>,
    current_user_key: Vec<u8>,
    last_key_type: ValueType,
    is_backward: bool,
    pinned_value: Vec<u8>,
}

impl DBIterator {
    pub fn new(
        inner: Box<dyn AsyncIterator>,
        user_comparator: Arc<dyn KeyComparator>,
        sequence: u64,
    ) -> DBIterator {
        DBIterator {
            inner,
            user_comparator,
            sequence,
            last_key_type: ValueType::TypeValue,
            buf: vec![],
            current_user_key: vec![],
            is_backward: false,
            pinned_value: vec![],
        }
    }
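
    // `sequence` acts as the read snapshot: key versions newer than it are
    // skipped. `is_backward` is set by forward positioning (`seek`,
    // `seek_to_first`) and cleared by reverse positioning (`seek_for_prev`,
    // `seek_to_last`); in the forward mode `value` reads the inner iterator
    // directly, while in the reverse mode it reads the pinned copy.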
    pub fn valid(&self) -> bool {
        if self.is_backward {
            self.inner.valid()
        } else {
            !self.current_user_key.is_empty()
        }
    }

    pub async fn seek(&mut self, key: &[u8]) {
        self.buf.clear();
        self.buf.extend_from_slice(key);
        let num = pack_sequence_and_type(self.sequence, VALUE_TYPE_FOR_SEEK);
        self.buf.extend_from_slice(&num.to_le_bytes());
        self.inner.seek(&self.buf).await;
        self.backward_for_user_key().await;
        self.is_backward = true;
    }

    pub async fn seek_for_prev(&mut self, key: &[u8]) {
        self.buf.clear();
        self.buf.extend_from_slice(key);
        let num = pack_sequence_and_type(self.sequence, VALUE_TYPE_FOR_SEEK_FOR_PREV);
        self.buf.extend_from_slice(&num.to_le_bytes());
        self.inner.seek_for_prev(&self.buf).await;
        self.is_backward = false;
        self.current_user_key.clear();
        self.pinned_value.clear();
        if !self.inner.valid() {
            return;
        }
        let ikey = ParsedInternalKey::new(self.inner.key());
        if ikey.tp == ValueType::TypeValue && ikey.sequence <= self.sequence {
            self.current_user_key.extend_from_slice(ikey.user_key());
            self.pinned_value.extend_from_slice(self.inner.value());
        }
        self.forward_for_user_key().await;
    }

    pub async fn seek_to_first(&mut self) {
        self.is_backward = true;
        self.inner.seek_to_first().await;
        self.backward_for_user_key().await;
    }

    pub async fn seek_to_last(&mut self) {
        self.inner.seek_to_last().await;
        self.current_user_key.clear();
        self.pinned_value.clear();
        self.is_backward = false;
        if !self.inner.valid() {
            return;
        }
        let ikey = ParsedInternalKey::new(self.inner.key());
        if ikey.tp == ValueType::TypeValue && ikey.sequence <= self.sequence {
            self.current_user_key.extend_from_slice(ikey.user_key());
            self.pinned_value.extend_from_slice(self.inner.value());
        }
        self.forward_for_user_key().await;
    }

    pub async fn next(&mut self) {
        self.inner.next().await;
        self.backward_for_user_key().await;
    }

    pub async fn prev(&mut self) {
        self.current_user_key.clear();
        self.pinned_value.clear();
        let ikey = ParsedInternalKey::new(self.inner.key());
        if ikey.tp == ValueType::TypeValue {
            self.current_user_key.extend_from_slice(ikey.user_key());
            self.pinned_value.extend_from_slice(self.inner.value());
        }
        self.inner.prev().await;
        self.forward_for_user_key().await;
    }

    pub fn key(&self) -> &[u8] {
        &self.current_user_key
    }

    pub fn value(&self) -> &[u8] {
        if self.is_backward {
            self.inner.value()
        } else {
            &self.pinned_value
        }
    }
}

impl DBIterator {
    async fn backward_for_user_key(&mut self) {
        self.current_user_key.clear();
        while self.inner.valid() {
            let key = self.inner.key();
            let ikey = ParsedInternalKey::new(key);
            if ikey.sequence > self.sequence {
                self.current_user_key.clear();
            } else if ikey.tp == ValueType::TypeValue {
                if self.current_user_key.is_empty()
                    || !self
                        .user_comparator
                        .same_key(&self.current_user_key, ikey.user_key())
                {
                    // The buffer may still hold a deleted user key from the
                    // branch below; reset it before pinning the new key.
                    self.current_user_key.clear();
                    self.current_user_key.extend_from_slice(ikey.user_key());
                    return;
                }
            } else if ikey.tp == ValueType::TypeDeletion
                && (self.current_user_key.is_empty()
                    || !self
                        .user_comparator
                        .same_key(&self.current_user_key, ikey.user_key()))
            {
                self.current_user_key.clear();
                self.current_user_key.extend_from_slice(ikey.user_key());
            }
            self.inner.next().await;
        }
    }
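
    // Used after reverse positioning: steps `inner.prev()` through the
    // versions of the current user key, re-pinning each visible `TypeValue`
    // so the newest visible version wins, and clearing the pinned state when
    // a visible deletion covers the key.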
ikey.user_key()) 185 | { 186 | break; 187 | } else { 188 | self.current_user_key.clear(); 189 | self.pinned_value.clear(); 190 | } 191 | } 192 | self.inner.prev().await; 193 | } 194 | } 195 | } 196 | -------------------------------------------------------------------------------- /src/iterator/merge_iterator.rs: -------------------------------------------------------------------------------- 1 | use crate::common::{InternalKeyComparator, KeyComparator}; 2 | use crate::iterator::InternalIterator; 3 | use std::cmp::Ordering; 4 | use std::collections::BinaryHeap; 5 | 6 | struct IteratorWrapper { 7 | inner: Box, 8 | comparator: InternalKeyComparator, 9 | } 10 | 11 | impl PartialEq for IteratorWrapper { 12 | fn eq(&self, other: &Self) -> bool { 13 | if self.inner.valid() && other.inner.valid() { 14 | return self 15 | .comparator 16 | .same_key(self.inner.key(), other.inner.key()); 17 | } 18 | if !self.inner.valid() && !other.inner.valid() { 19 | return true; 20 | } 21 | false 22 | } 23 | } 24 | 25 | impl Eq for IteratorWrapper {} 26 | 27 | impl PartialOrd for IteratorWrapper { 28 | fn partial_cmp(&self, other: &Self) -> Option { 29 | Some(self.cmp(other)) 30 | } 31 | } 32 | 33 | impl Ord for IteratorWrapper { 34 | fn cmp(&self, other: &Self) -> Ordering { 35 | if self.inner.valid() && other.inner.valid() { 36 | self.comparator 37 | .compare_key(other.inner.key(), self.inner.key()) 38 | } else if self.inner.valid() { 39 | Ordering::Less 40 | } else if other.inner.valid() { 41 | Ordering::Greater 42 | } else { 43 | Ordering::Equal 44 | } 45 | } 46 | } 47 | 48 | pub struct MergingIterator { 49 | children: BinaryHeap, 50 | other: Vec, 51 | } 52 | 53 | impl MergingIterator { 54 | pub fn new(iters: Vec>, cmp: InternalKeyComparator) -> Self { 55 | let other: Vec = iters 56 | .into_iter() 57 | .map(|iter| IteratorWrapper { 58 | inner: iter, 59 | comparator: cmp.clone(), 60 | }) 61 | .collect(); 62 | Self { 63 | children: BinaryHeap::with_capacity(other.len()), 64 | other, 65 | } 66 | } 67 | } 68 | 69 | impl MergingIterator { 70 | fn current_forward(&mut self) { 71 | while let Some(x) = self.children.peek() { 72 | if !x.inner.valid() { 73 | let iter = self.children.pop().unwrap(); 74 | self.other.push(iter); 75 | } else { 76 | break; 77 | } 78 | } 79 | } 80 | 81 | fn collect_iterators(&mut self) -> Vec { 82 | let mut iters = Vec::with_capacity(self.other.len() + self.children.len()); 83 | std::mem::swap(&mut iters, &mut self.other); 84 | while let Some(iter) = self.children.pop() { 85 | iters.push(iter); 86 | } 87 | iters 88 | } 89 | } 90 | 91 | impl InternalIterator for MergingIterator { 92 | fn valid(&self) -> bool { 93 | self.children 94 | .peek() 95 | .map_or(false, |iter| iter.inner.valid()) 96 | } 97 | 98 | fn seek(&mut self, key: &[u8]) { 99 | let iters = self.collect_iterators(); 100 | for mut iter in iters { 101 | iter.inner.seek(key); 102 | if iter.inner.valid() { 103 | self.children.push(iter); 104 | } else { 105 | self.other.push(iter); 106 | } 107 | } 108 | } 109 | 110 | fn seek_to_first(&mut self) { 111 | let iters = self.collect_iterators(); 112 | for mut iter in iters { 113 | iter.inner.seek_to_first(); 114 | if iter.inner.valid() { 115 | self.children.push(iter); 116 | } else { 117 | self.other.push(iter); 118 | } 119 | } 120 | } 121 | 122 | fn seek_to_last(&mut self) { 123 | let iters = self.collect_iterators(); 124 | for mut iter in iters { 125 | iter.inner.seek_to_last(); 126 | if iter.inner.valid() { 127 | self.children.push(iter); 128 | } else { 129 | self.other.push(iter); 130 | } 
131 | } 132 | } 133 | 134 | fn seek_for_prev(&mut self, key: &[u8]) { 135 | let iters = self.collect_iterators(); 136 | for mut iter in iters { 137 | iter.inner.seek_for_prev(key); 138 | if iter.inner.valid() { 139 | self.children.push(iter); 140 | } else { 141 | self.other.push(iter); 142 | } 143 | } 144 | } 145 | 146 | fn next(&mut self) { 147 | { 148 | let mut x = self.children.peek_mut().unwrap(); 149 | x.inner.next(); 150 | } 151 | self.current_forward(); 152 | } 153 | 154 | fn prev(&mut self) { 155 | { 156 | let mut x = self.children.peek_mut().unwrap(); 157 | x.inner.prev(); 158 | } 159 | self.current_forward(); 160 | } 161 | 162 | fn key(&self) -> &[u8] { 163 | self.children.peek().unwrap().inner.key() 164 | } 165 | 166 | fn value(&self) -> &[u8] { 167 | self.children.peek().unwrap().inner.value() 168 | } 169 | } 170 | -------------------------------------------------------------------------------- /src/iterator/mod.rs: -------------------------------------------------------------------------------- 1 | mod async_merge_iterator; 2 | mod db_iterator; 3 | mod merge_iterator; 4 | mod table_accessor; 5 | mod two_level_iterator; 6 | 7 | pub use async_merge_iterator::MergingIterator as AsyncMergingIterator; 8 | pub use db_iterator::DBIterator; 9 | pub use merge_iterator::MergingIterator; 10 | pub use table_accessor::*; 11 | pub use two_level_iterator::TwoLevelIterator; 12 | 13 | use async_trait::async_trait; 14 | 15 | pub trait InternalIterator: Send { 16 | fn valid(&self) -> bool; 17 | fn seek(&mut self, key: &[u8]); 18 | fn seek_to_first(&mut self); 19 | fn seek_to_last(&mut self); 20 | fn seek_for_prev(&mut self, key: &[u8]); 21 | fn next(&mut self); 22 | fn prev(&mut self); 23 | fn key(&self) -> &[u8]; 24 | fn value(&self) -> &[u8]; 25 | } 26 | 27 | #[async_trait] 28 | pub trait AsyncIterator: Send { 29 | fn valid(&self) -> bool; 30 | async fn seek(&mut self, key: &[u8]); 31 | async fn seek_for_prev(&mut self, key: &[u8]); 32 | async fn seek_to_first(&mut self); 33 | async fn seek_to_last(&mut self); 34 | async fn next(&mut self); 35 | async fn prev(&mut self); 36 | fn key(&self) -> &[u8]; 37 | fn value(&self) -> &[u8]; 38 | } 39 | -------------------------------------------------------------------------------- /src/iterator/table_accessor.rs: -------------------------------------------------------------------------------- 1 | use crate::util::{BTreeIter, BtreeComparable, PageIterator}; 2 | use crate::version::TableFile; 3 | use std::sync::Arc; 4 | 5 | pub trait TableAccessor: Send { 6 | fn seek(&mut self, key: &[u8]); 7 | fn seek_for_previous(&mut self, key: &[u8]); 8 | fn seek_to_first(&mut self); 9 | fn seek_to_last(&mut self); 10 | fn next(&mut self); 11 | fn prev(&mut self); 12 | fn valid(&self) -> bool; 13 | fn size(&self) -> usize; 14 | fn table(&self) -> Arc; 15 | } 16 | 17 | pub struct VecTableAccessor { 18 | tables: Vec>, 19 | cursor: usize, 20 | } 21 | 22 | impl VecTableAccessor { 23 | pub fn new(tables: Vec>) -> Self { 24 | Self { tables, cursor: 0 } 25 | } 26 | } 27 | 28 | impl TableAccessor for VecTableAccessor { 29 | fn seek(&mut self, key: &[u8]) { 30 | self.cursor = match self.tables.binary_search_by(|node| node.largest().cmp(key)) { 31 | Ok(idx) => idx, 32 | Err(upper) => upper, 33 | }; 34 | } 35 | 36 | fn seek_for_previous(&mut self, key: &[u8]) { 37 | self.cursor = match self 38 | .tables 39 | .binary_search_by(|node| node.smallest().cmp(key)) 40 | { 41 | Ok(idx) => idx, 42 | Err(upper) => { 43 | if upper == 0 { 44 | self.tables.len() 45 | } else { 46 | upper - 1 
47 |                 }
48 |             }
49 |         };
50 |     }
51 | 
52 |     fn seek_to_first(&mut self) {
53 |         self.cursor = 0;
54 |     }
55 | 
56 |     fn seek_to_last(&mut self) {
57 |         if self.tables.is_empty() {
58 |             self.cursor = 0;
59 |         } else {
60 |             self.cursor = self.tables.len() - 1;
61 |         }
62 |     }
63 | 
64 |     fn next(&mut self) {
65 |         self.cursor += 1;
66 |     }
67 | 
68 |     fn prev(&mut self) {
69 |         if self.cursor == 0 {
70 |             self.cursor = self.tables.len()
71 |         } else {
72 |             self.cursor -= 1;
73 |         }
74 |     }
75 | 
76 |     fn valid(&self) -> bool {
77 |         self.cursor < self.tables.len()
78 |     }
79 | 
80 |     fn size(&self) -> usize {
81 |         self.tables.len()
82 |     }
83 | 
84 |     fn table(&self) -> Arc<TableFile> {
85 |         self.tables[self.cursor].clone()
86 |     }
87 | }
88 | 
89 | pub struct BTreeTableAccessor {
90 |     iter: BTreeIter<Arc<TableFile>>,
91 | }
92 | 
93 | impl BTreeTableAccessor {
94 |     pub fn new(iter: BTreeIter<Arc<TableFile>>) -> Self {
95 |         Self { iter }
96 |     }
97 | }
98 | 
99 | impl TableAccessor for BTreeTableAccessor {
100 |     fn seek(&mut self, key: &[u8]) {
101 |         self.iter.seek(key)
102 |     }
103 | 
104 |     fn seek_for_previous(&mut self, key: &[u8]) {
105 |         self.iter.seek_for_previous(key)
106 |     }
107 | 
108 |     fn seek_to_first(&mut self) {
109 |         self.iter.seek_to_first()
110 |     }
111 | 
112 |     fn seek_to_last(&mut self) {
113 |         self.iter.seek_to_last()
114 |     }
115 | 
116 |     fn next(&mut self) {
117 |         self.iter.next()
118 |     }
119 | 
120 |     fn prev(&mut self) {
121 |         self.iter.prev()
122 |     }
123 | 
124 |     fn valid(&self) -> bool {
125 |         self.iter.valid()
126 |     }
127 | 
128 |     fn size(&self) -> usize {
129 |         self.iter.size()
130 |     }
131 | 
132 |     fn table(&self) -> Arc<TableFile> {
133 |         self.iter.record().unwrap()
134 |     }
135 | }
--------------------------------------------------------------------------------
/src/iterator/two_level_iterator.rs:
--------------------------------------------------------------------------------
1 | use crate::common::format::extract_user_key;
2 | use crate::iterator::table_accessor::TableAccessor;
3 | use crate::iterator::AsyncIterator;
4 | 
5 | pub struct TwoLevelIterator<Accessor: TableAccessor> {
6 |     table_accessor: Accessor,
7 |     current: Option<Box<dyn AsyncIterator>>,
8 | }
9 | 
10 | impl<Accessor: TableAccessor> TwoLevelIterator<Accessor> {
11 |     pub fn new(table_accessor: Accessor) -> Self {
12 |         Self {
13 |             table_accessor,
14 |             current: None,
15 |         }
16 |     }
17 | 
18 |     async fn forward_iterator(&mut self) {
19 |         while self.table_accessor.valid() {
20 |             let mut iter = self.table_accessor.table().reader.new_iterator();
21 |             iter.seek_to_last().await;
22 |             if iter.valid() {
23 |                 self.current = Some(iter);
24 |                 return;
25 |             }
   |             self.table_accessor.prev(); // fix: step to the previous table, or this loop never terminates when a table yields an empty iterator
26 |         }
27 |         self.current = None;
28 |     }
29 | 
30 |     async fn backward_iterator(&mut self) {
31 |         while self.table_accessor.valid() {
32 |             let mut iter = self.table_accessor.table().reader.new_iterator();
33 |             iter.seek_to_first().await;
34 |             if iter.valid() {
35 |                 self.current = Some(iter);
36 |                 return;
37 |             }
   |             self.table_accessor.next(); // fix: advance past tables whose iterator comes up empty
38 |         }
39 |         self.current = None;
40 |     }
41 | }
42 | 
43 | #[async_trait::async_trait]
44 | impl<Accessor: TableAccessor> AsyncIterator for TwoLevelIterator<Accessor> {
45 |     fn valid(&self) -> bool {
46 |         self.current.as_ref().map_or(false, |iter| iter.valid())
47 |     }
48 | 
49 |     async fn seek(&mut self, key: &[u8]) {
50 |         self.table_accessor.seek(extract_user_key(key));
51 |         if self.table_accessor.valid() {
52 |             let mut iter = self.table_accessor.table().reader.new_iterator();
53 |             iter.seek(key).await;
54 |             if iter.valid() {
55 |                 self.current = Some(iter);
56 |                 return;
57 |             }
58 |             self.table_accessor.next();
59 |         }
60 |         self.backward_iterator().await;
61 |     }
62 | 
63 |     async fn seek_for_prev(&mut self, key: &[u8]) {
64 | 
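// Control-flow sketch: position the table-level accessor at the last
// table whose smallest key is <= `key`, try an in-table seek_for_prev,
// and if that table holds nothing at or before `key`, step back and let
// `forward_iterator` take the last entry of an earlier table.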
self.table_accessor.seek_for_previous(extract_user_key(key)); 65 | if self.table_accessor.valid() { 66 | let mut iter = self.table_accessor.table().reader.new_iterator(); 67 | iter.seek_for_prev(key).await; 68 | if iter.valid() { 69 | self.current = Some(iter); 70 | return; 71 | } 72 | self.table_accessor.prev(); 73 | } 74 | self.forward_iterator().await; 75 | } 76 | async fn seek_to_first(&mut self) { 77 | self.table_accessor.seek_to_first(); 78 | self.backward_iterator().await; 79 | } 80 | 81 | async fn seek_to_last(&mut self) { 82 | self.table_accessor.seek_to_last(); 83 | self.forward_iterator().await; 84 | } 85 | 86 | async fn next(&mut self) { 87 | self.current.as_mut().unwrap().next().await; 88 | if self.current.as_ref().unwrap().valid() { 89 | return; 90 | } 91 | self.table_accessor.next(); 92 | self.backward_iterator().await; 93 | } 94 | 95 | async fn prev(&mut self) { 96 | self.current.as_mut().unwrap().prev().await; 97 | if self.current.as_ref().unwrap().valid() { 98 | return; 99 | } 100 | self.table_accessor.prev(); 101 | self.forward_iterator().await; 102 | } 103 | 104 | fn key(&self) -> &[u8] { 105 | assert!(self.valid()); 106 | self.current.as_ref().unwrap().key() 107 | } 108 | 109 | fn value(&self) -> &[u8] { 110 | self.current.as_ref().unwrap().value() 111 | } 112 | } 113 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | #![allow(dead_code)] 2 | 3 | mod common; 4 | mod compaction; 5 | mod db; 6 | mod iterator; 7 | mod log; 8 | mod manifest; 9 | mod memtable; 10 | mod options; 11 | mod pipeline; 12 | mod table; 13 | mod util; 14 | mod version; 15 | mod wal; 16 | mod write_batch; 17 | pub use memtable::{InlineSkipListMemtableRep, MemTableContext, MemtableRep, SkipListMemtableRep}; 18 | 19 | pub use common::{ 20 | AsyncFileSystem, Error, FileSystem, InternalKeyComparator, KeyComparator, Result, 21 | SliceTransform, SyncPosixFileSystem, 22 | }; 23 | pub use db::*; 24 | pub use iterator::{DBIterator, InternalIterator}; 25 | pub use options::*; 26 | pub use table::{ 27 | BlockBasedTableFactory, BlockBasedTableOptions, FilterBlockFactory, FullFilterBlockFactory, 28 | }; 29 | pub use write_batch::*; 30 | -------------------------------------------------------------------------------- /src/log/mod.rs: -------------------------------------------------------------------------------- 1 | mod reader; 2 | mod writer; 3 | 4 | pub const HEADER_SIZE: usize = 4 + 2 + 1; 5 | pub const RECYCLABLE_HEADER_SIZE: usize = 4 + 2 + 1 + 4; 6 | 7 | #[cfg(test)] 8 | pub const BLOCK_SIZE: usize = 4096; 9 | #[cfg(not(test))] 10 | pub const BLOCK_SIZE: usize = 32768; 11 | pub const LOG_PADDING: &[u8] = b"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"; 12 | 13 | #[repr(u8)] 14 | #[derive(Eq, PartialEq, Clone, Copy, Debug)] 15 | pub enum RecordType { 16 | // Zero is reserved for preallocated files 17 | ZeroType = 0, 18 | FullType = 1, 19 | 20 | // For fragments 21 | FirstType = 2, 22 | MiddleType = 3, 23 | LastType = 4, 24 | // For recycled log files 25 | RecyclableFullType = 5, 26 | // RecyclableFirstType = 6, 27 | // RecyclableMiddleType = 7, 28 | RecyclableLastType = 8, 29 | Unknown = 127, 30 | } 31 | 32 | impl From for RecordType { 33 | fn from(x: u8) -> Self { 34 | if x > 8 { 35 | RecordType::Unknown 36 | } else { 37 | unsafe { std::mem::transmute(x) } 38 | } 39 | } 40 | } 41 | 42 | const MAX_RECORD_TYPE: u8 = RecordType::RecyclableLastType as u8; 43 | 44 | #[repr(u8)] 45 | 
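// RecordError discriminants deliberately continue where RecordType stops
// (RecordType occupies 0..=8), so a raw type byte read from a log header
// decodes unambiguously into one space or the other, e.g.
// RecordError::from(10u8) == RecordError::BadRecord.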
#[derive(Eq, PartialEq, Clone, Copy, Debug)]
46 | pub enum RecordError {
47 |     Eof = 9,
48 |     // Returned whenever we find an invalid physical record.
49 |     // Currently there are two situations in which this happens:
50 |     // * The record has an invalid CRC (ReadPhysicalRecord reports a drop)
51 |     // * The record is a 0-length record (No drop is reported)
52 |     BadRecord = 10,
53 |     // Returned when we fail to read a valid header.
54 |     BadHeader = 11,
55 |     // Returned when we read an old record from a previous user of the log.
56 |     OldRecord = 12,
57 |     // Returned when we get a bad record length
58 |     BadRecordLen = 13,
59 |     // Returned when we get a bad record checksum
60 |     BadRecordChecksum = 14,
61 |     Unknown = 127,
62 | }
63 | 
64 | impl From<u8> for RecordError {
65 |     fn from(x: u8) -> Self {
66 |         if !(9..=14).contains(&x) {
67 |             RecordError::Unknown
68 |         } else {
69 |             unsafe { std::mem::transmute(x) }
70 |         }
71 |     }
72 | }
73 | 
74 | pub use reader::LogReader;
75 | pub use writer::LogWriter;
--------------------------------------------------------------------------------
/src/log/writer.rs:
--------------------------------------------------------------------------------
1 | use super::{RecordType, BLOCK_SIZE, HEADER_SIZE, LOG_PADDING, RECYCLABLE_HEADER_SIZE};
2 | use crate::common::{Result, WritableFileWriter};
3 | use crate::util;
4 | use crc32c::crc32c;
5 | use crc32c::crc32c_append;
6 | 
7 | pub struct LogWriter {
8 |     writer: Box<WritableFileWriter>,
9 |     block_offset: usize,
10 |     log_number: u64,
11 |     type_crc: Vec<u32>,
12 | }
13 | 
14 | impl LogWriter {
15 |     pub fn new(writer: Box<WritableFileWriter>, log_number: u64) -> Self {
16 |         let type_crc = vec![
17 |             crc32c(&[0]),
18 |             crc32c(&[1]),
19 |             crc32c(&[2]),
20 |             crc32c(&[3]),
21 |             crc32c(&[4]),
22 |         ];
23 |         LogWriter {
24 |             writer,
25 |             log_number,
26 |             block_offset: 0,
27 |             type_crc,
28 |         }
29 |     }
30 | 
31 |     pub fn get_log_number(&self) -> u64 {
32 |         self.log_number
33 |     }
34 | 
35 |     pub fn get_file_mut(&mut self) -> &mut WritableFileWriter {
36 |         self.writer.as_mut()
37 |     }
38 | 
39 |     pub fn get_file_size(&self) -> usize {
40 |         self.writer.file_size()
41 |     }
42 | 
43 |     pub async fn fsync(&mut self) -> Result<()> {
44 |         self.writer.sync().await
45 |     }
46 | 
47 |     pub async fn add_record(&mut self, data: &[u8]) -> Result<()> {
48 |         let mut left = data.len();
49 |         let mut begin = true;
50 |         let mut offset = 0;
51 |         while left > 0 {
52 |             let leftover = BLOCK_SIZE - self.block_offset;
53 |             if leftover < HEADER_SIZE {
54 |                 if leftover > 0 {
55 |                     self.writer.append(&LOG_PADDING[..leftover]).await?;
56 |                 }
57 |                 self.block_offset = 0;
58 |             }
59 |             let avail = BLOCK_SIZE - self.block_offset - HEADER_SIZE;
60 |             let fragment_length = std::cmp::min(left, avail);
61 |             let record_type = if begin && left == fragment_length {
62 |                 RecordType::FullType
63 |             } else if begin {
64 |                 RecordType::FirstType
65 |             } else if left == fragment_length {
66 |                 RecordType::LastType
67 |             } else {
68 |                 RecordType::MiddleType
69 |             };
70 |             self.emit_physical_record(record_type, &data[offset..(offset + fragment_length)])
71 |                 .await?;
72 |             offset += fragment_length;
73 |             left -= fragment_length;
74 |             begin = false;
75 |         }
76 |         self.writer.flush().await?;
77 |         Ok(())
78 |     }
79 | 
80 |     async fn emit_physical_record(&mut self, record_type: RecordType, data: &[u8]) -> Result<()> {
81 |         let mut buf: [u8; RECYCLABLE_HEADER_SIZE] = [0u8; RECYCLABLE_HEADER_SIZE];
82 |         // TODO: We do not support recycle wal log.
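// Legacy (non-recyclable) header layout, HEADER_SIZE = 4 + 2 + 1 bytes:
//   buf[0..4]  masked CRC32C of (type byte || payload), little-endian
//   buf[4..6]  payload length as a little-endian u16
//   buf[6]     record type
// e.g. a 300-byte fragment stores its length as buf[4] = 0x2C, buf[5] = 0x01.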
83 | buf[4] = (data.len() & 0xff) as u8; 84 | buf[5] = (data.len() >> 8) as u8; 85 | buf[6] = record_type as u8; 86 | let mut crc = self.type_crc[buf[6] as usize]; 87 | crc = crc32c_append(crc, data); 88 | crc = util::crc_mask(crc); 89 | buf[..4].copy_from_slice(&crc.to_le_bytes()); 90 | self.writer.append(&buf[..HEADER_SIZE]).await?; 91 | self.writer.append(data).await?; 92 | self.block_offset += HEADER_SIZE + data.len(); 93 | Ok(()) 94 | } 95 | } 96 | -------------------------------------------------------------------------------- /src/memtable/arena.rs: -------------------------------------------------------------------------------- 1 | use std::sync::atomic::{AtomicU32, Ordering}; 2 | use std::sync::Arc; 3 | use std::{mem, ptr}; 4 | 5 | struct ArenaCore { 6 | len: AtomicU32, 7 | cap: usize, 8 | ptr: *mut u8, 9 | } 10 | 11 | impl Drop for ArenaCore { 12 | fn drop(&mut self) { 13 | unsafe { 14 | let ptr = self.ptr as *mut u64; 15 | let cap = self.cap / 8; 16 | Vec::from_raw_parts(ptr, 0, cap); 17 | } 18 | } 19 | } 20 | 21 | pub struct Arena { 22 | core: Arc, 23 | } 24 | 25 | impl Arena { 26 | pub fn with_capacity(cap: u32) -> Arena { 27 | let mut buf: Vec = Vec::with_capacity(cap as usize / 8); 28 | let ptr = buf.as_mut_ptr() as *mut u8; 29 | let cap = buf.capacity() * 8; 30 | mem::forget(buf); 31 | Arena { 32 | core: Arc::new(ArenaCore { 33 | len: AtomicU32::new(1), 34 | cap, 35 | ptr, 36 | }), 37 | } 38 | } 39 | 40 | pub fn len(&self) -> u32 { 41 | self.core.len.load(Ordering::SeqCst) 42 | } 43 | 44 | pub fn alloc(&self, align: usize, size: usize) -> u32 { 45 | let align_mask = align - 1; 46 | // Leave enough padding for align. 47 | let size = size + align_mask; 48 | let offset = self.core.len.fetch_add(size as u32, Ordering::SeqCst); 49 | // Calculate the correct align point, it equals to 50 | // (offset + align_mask) / align * align. 
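// Worked example: align = 8 gives align_mask = 7; if the claimed offset
// is 13, then ptr_offset = (13 + 7) & !7 = 16. The extra `align_mask`
// bytes added to `size` above guarantee the aligned allocation still
// ends inside the region claimed by `fetch_add`.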
51 | let ptr_offset = (offset as usize + align_mask) & !align_mask; 52 | assert!(offset as usize + size <= self.core.cap); 53 | ptr_offset as u32 54 | } 55 | 56 | pub unsafe fn get_mut(&self, offset: u32) -> *mut N { 57 | if offset == 0 { 58 | return ptr::null_mut(); 59 | } 60 | self.core.ptr.add(offset as usize) as _ 61 | } 62 | 63 | pub fn offset(&self, ptr: *const N) -> u32 { 64 | let ptr_addr = ptr as usize; 65 | let self_addr = self.core.ptr as usize; 66 | if ptr_addr > self_addr && ptr_addr < self_addr + self.core.cap { 67 | (ptr_addr - self_addr) as u32 68 | } else { 69 | 0 70 | } 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /src/memtable/concurrent_arena.rs: -------------------------------------------------------------------------------- 1 | use spin::Mutex; 2 | use std::ptr::null_mut; 3 | use std::sync::atomic::{AtomicPtr, AtomicUsize, Ordering}; 4 | 5 | const BLOCK_DATA_SIZE: usize = 4 * 1024 * 1024; 6 | const PAGE_DATA_SIZE: usize = 8 * 1024; 7 | 8 | struct Block { 9 | data: Vec, 10 | offset: AtomicUsize, 11 | } 12 | 13 | pub struct ArenaContent { 14 | blocks: Vec>, 15 | current: Box, 16 | } 17 | 18 | impl ArenaContent { 19 | pub fn new() -> Self { 20 | Self { 21 | blocks: vec![], 22 | current: Box::new(Block { 23 | data: Vec::with_capacity(BLOCK_DATA_SIZE), 24 | offset: AtomicUsize::new(0), 25 | }), 26 | } 27 | } 28 | } 29 | 30 | pub trait Arena { 31 | fn mem_size(&self) -> usize; 32 | unsafe fn allocate(&self, alloc_size: usize) -> *mut u8; 33 | unsafe fn allocate_in_thread(&self, _: usize, alloc_size: usize) -> *mut u8 { 34 | self.allocate(alloc_size) 35 | } 36 | } 37 | 38 | pub struct ArenaShard { 39 | arena: Mutex, 40 | current: AtomicPtr, 41 | } 42 | 43 | impl Default for ArenaShard { 44 | fn default() -> Self { 45 | let mut arena = ArenaContent::new(); 46 | ArenaShard { 47 | current: AtomicPtr::new(arena.current.as_mut()), 48 | arena: Mutex::new(arena), 49 | } 50 | } 51 | } 52 | 53 | impl ArenaShard { 54 | unsafe fn allocate_from_current_block(&self, data_size: usize) -> *mut u8 { 55 | let current = self.current.load(Ordering::Acquire); 56 | let offset = (*current).offset.fetch_add(data_size, Ordering::SeqCst); 57 | if offset + data_size < BLOCK_DATA_SIZE { 58 | return (*current).data.as_mut_ptr().add(offset) as _; 59 | } 60 | return null_mut(); 61 | } 62 | 63 | unsafe fn allocate_heap(&self, data_size: usize, mem_size: &AtomicUsize) -> *mut u8 { 64 | let mut arena = self.arena.lock(); 65 | let offset = arena.current.offset.fetch_add(data_size, Ordering::SeqCst); 66 | if offset + data_size < BLOCK_DATA_SIZE { 67 | return arena.current.data.as_mut_ptr().add(offset) as _; 68 | } 69 | 70 | let mut block_size = BLOCK_DATA_SIZE; 71 | while block_size < data_size { 72 | if block_size + PAGE_DATA_SIZE < data_size { 73 | block_size += (data_size - block_size) / PAGE_DATA_SIZE * PAGE_DATA_SIZE; 74 | } else { 75 | block_size += PAGE_DATA_SIZE; 76 | } 77 | } 78 | let mut block = Box::new(Block { 79 | data: Vec::with_capacity(block_size), 80 | offset: AtomicUsize::new(data_size), 81 | }); 82 | self.current.store(block.as_mut(), Ordering::Release); 83 | let old = std::mem::replace(&mut arena.current, block); 84 | mem_size.fetch_add(old.data.capacity(), Ordering::Relaxed); 85 | arena.blocks.push(old); 86 | arena.current.data.as_mut_ptr() 87 | } 88 | } 89 | 90 | pub struct ConcurrentArena { 91 | arena: ArenaShard, 92 | mem_size: AtomicUsize, 93 | } 94 | 95 | impl ConcurrentArena { 96 | pub fn new() -> Self { 97 | ConcurrentArena { 
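// Note: `mem_size` only accounts for blocks already retired into
// `ArenaContent::blocks`; the live block's capacity is added when
// `allocate_heap` swaps it out.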
98 | arena: ArenaShard::default(), 99 | mem_size: AtomicUsize::new(0), 100 | } 101 | } 102 | } 103 | 104 | impl Arena for ConcurrentArena { 105 | fn mem_size(&self) -> usize { 106 | self.mem_size.load(Ordering::Relaxed) 107 | } 108 | 109 | unsafe fn allocate(&self, alloc_size: usize) -> *mut u8 { 110 | let data_size = ((alloc_size - 1) | (std::mem::size_of::<*mut u8>() - 1)) + 1; 111 | let addr = self.arena.allocate_from_current_block(data_size); 112 | if !addr.is_null() { 113 | return addr; 114 | } 115 | self.arena.allocate_heap(data_size, &self.mem_size) 116 | } 117 | } 118 | 119 | const ARENA_COUNT: usize = 4; 120 | 121 | pub struct SharedArena { 122 | arenas: Vec, 123 | mem_size: AtomicUsize, 124 | id: AtomicUsize, 125 | } 126 | 127 | impl SharedArena { 128 | pub fn new() -> Self { 129 | let mut arenas = vec![]; 130 | for _ in 0..ARENA_COUNT { 131 | arenas.push(ArenaShard::default()); 132 | } 133 | SharedArena { 134 | arenas, 135 | mem_size: AtomicUsize::new(0), 136 | id: AtomicUsize::new(1), 137 | } 138 | } 139 | } 140 | 141 | impl Arena for SharedArena { 142 | fn mem_size(&self) -> usize { 143 | self.mem_size.load(Ordering::Acquire) 144 | } 145 | 146 | unsafe fn allocate(&self, alloc_size: usize) -> *mut u8 { 147 | let data_size = ((alloc_size - 1) | (std::mem::size_of::<*mut u8>() - 1)) + 1; 148 | let arena = &self.arenas[0]; 149 | let addr = arena.allocate_from_current_block(data_size); 150 | if !addr.is_null() { 151 | return addr; 152 | } 153 | arena.allocate_heap(data_size, &self.mem_size) 154 | } 155 | 156 | unsafe fn allocate_in_thread(&self, idx: usize, alloc_size: usize) -> *mut u8 { 157 | let data_size = ((alloc_size - 1) | (std::mem::size_of::<*mut u8>() - 1)) + 1; 158 | let arena = &self.arenas[idx % ARENA_COUNT]; 159 | let addr = arena.allocate_from_current_block(data_size); 160 | if !addr.is_null() { 161 | return addr; 162 | } 163 | arena.allocate_heap(data_size, &self.mem_size) 164 | } 165 | } 166 | -------------------------------------------------------------------------------- /src/memtable/context.rs: -------------------------------------------------------------------------------- 1 | use crate::memtable::Splice; 2 | use std::cell::RefCell; 3 | use std::sync::atomic::{AtomicUsize, Ordering}; 4 | 5 | thread_local! { 6 | pub static CACHE_ID: RefCell = RefCell::new(0); 7 | } 8 | 9 | lazy_static::lazy_static! { 10 | static ref GLOBAL_CACHE_ID: AtomicUsize = AtomicUsize::new(1); 11 | } 12 | 13 | #[derive(Default, Clone)] 14 | pub struct MemTableContext { 15 | pub(crate) splice: Splice, 16 | thread_id: usize, 17 | // TODO: Support allocate data from local thread arena. 
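// A hypothetical shape for that TODO (a sketch, not implemented here):
// cache a handle to the shard picked via `get_thread_id`, e.g.
//   local_arena: Option<Arc<ArenaShard>>,
// so repeated writes from one thread allocate without recomputing the
// shard index.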
18 | } 19 | 20 | impl MemTableContext { 21 | pub fn get_thread_id(&mut self) -> usize { 22 | if self.thread_id == 0 { 23 | let idx = CACHE_ID.with(|x| { 24 | if *x.borrow() != 0 { 25 | return *x.borrow(); 26 | } 27 | *x.borrow_mut() = GLOBAL_CACHE_ID.fetch_add(1, Ordering::SeqCst); 28 | *x.borrow() 29 | }); 30 | self.thread_id = idx; 31 | } 32 | self.thread_id 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /src/memtable/memtable.rs: -------------------------------------------------------------------------------- 1 | use crate::common::format::extract_user_key; 2 | use crate::common::InternalKeyComparator; 3 | use crate::iterator::{AsyncIterator, InternalIterator}; 4 | use crate::memtable::context::MemTableContext; 5 | use crate::memtable::MemtableRep; 6 | use crate::InlineSkipListMemtableRep; 7 | use std::sync::atomic::{AtomicBool, AtomicU64, Ordering}; 8 | 9 | pub struct Memtable { 10 | rep: Box, 11 | mem_next_logfile_number: AtomicU64, 12 | cf_id: u32, 13 | comparator: InternalKeyComparator, 14 | pending_schedule: AtomicBool, 15 | first_seqno: AtomicU64, 16 | earliest_seqno: AtomicU64, 17 | max_write_buffer_size: usize, 18 | } 19 | 20 | impl Memtable { 21 | pub fn new( 22 | cf_id: u32, 23 | max_write_buffer_size: usize, 24 | comparator: InternalKeyComparator, 25 | earliest_seq: u64, 26 | ) -> Self { 27 | Self { 28 | rep: Box::new(InlineSkipListMemtableRep::new(comparator.clone())), 29 | comparator, 30 | mem_next_logfile_number: AtomicU64::new(0), 31 | cf_id, 32 | pending_schedule: AtomicBool::new(false), 33 | max_write_buffer_size, 34 | first_seqno: AtomicU64::new(0), 35 | earliest_seqno: AtomicU64::new(earliest_seq), 36 | } 37 | } 38 | 39 | pub fn new_iterator(&self) -> Box { 40 | self.rep.new_iterator() 41 | } 42 | 43 | pub fn new_async_iterator(&self) -> Box { 44 | let iter = self.rep.new_iterator(); 45 | Box::new(MemIteratorWrapper { inner: iter }) 46 | } 47 | 48 | pub fn get_comparator(&self) -> InternalKeyComparator { 49 | self.comparator.clone() 50 | } 51 | 52 | pub fn add(&self, ctx: &mut MemTableContext, key: &[u8], value: &[u8], sequence: u64) { 53 | self.update_first_sequence(sequence); 54 | self.rep.add(ctx, key, value, sequence); 55 | } 56 | 57 | pub fn delete(&self, ctx: &mut MemTableContext, key: &[u8], sequence: u64) { 58 | self.update_first_sequence(sequence); 59 | self.rep.delete(ctx, key, sequence); 60 | } 61 | 62 | pub fn get_column_family_id(&self) -> u32 { 63 | self.cf_id 64 | } 65 | 66 | pub fn get(&self, key: &[u8]) -> Option> { 67 | let mut iter = self.rep.new_iterator(); 68 | iter.seek(key); 69 | if iter.valid() 70 | && self 71 | .comparator 72 | .get_user_comparator() 73 | .same_key(extract_user_key(key), extract_user_key(iter.key())) 74 | { 75 | return Some(iter.value().to_vec()); 76 | } 77 | None 78 | } 79 | 80 | pub fn set_next_log_number(&self, num: u64) { 81 | self.mem_next_logfile_number.store(num, Ordering::Release); 82 | } 83 | 84 | pub fn get_next_log_number(&self) -> u64 { 85 | self.mem_next_logfile_number.load(Ordering::Acquire) 86 | } 87 | 88 | // TODO: support write buffer manager 89 | pub fn should_flush(&self) -> bool { 90 | self.rep.mem_size() as usize > self.max_write_buffer_size 91 | } 92 | 93 | pub fn get_mem_size(&self) -> usize { 94 | self.rep.mem_size() as usize 95 | } 96 | 97 | pub fn is_empty(&self) -> bool { 98 | self.first_seqno.load(Ordering::Acquire) == 0 99 | } 100 | 101 | pub fn mark_schedule_flush(&self) { 102 | self.pending_schedule.store(true, Ordering::Release); 103 | } 104 | 
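// Typical write-path usage (a minimal sketch; the column-family id,
// buffer budget, and sequence number below are arbitrary choices):
//
//   let mem = Memtable::new(0, 4 << 20, InternalKeyComparator::default(), 0);
//   let mut ctx = MemTableContext::default();
//   mem.add(&mut ctx, b"k1", b"v1", 1);
//   if mem.should_flush() {
//       mem.mark_schedule_flush();
//   }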
105 | pub fn is_pending_schedule(&self) -> bool { 106 | self.pending_schedule.load(Ordering::Acquire) 107 | } 108 | 109 | fn update_first_sequence(&self, sequence: u64) { 110 | let mut cur_seq_num = self.first_seqno.load(Ordering::Relaxed); 111 | while cur_seq_num == 0 || sequence < cur_seq_num { 112 | match self.first_seqno.compare_exchange_weak( 113 | cur_seq_num, 114 | sequence, 115 | Ordering::SeqCst, 116 | Ordering::SeqCst, 117 | ) { 118 | Ok(_) => break, 119 | Err(v) => cur_seq_num = v, 120 | } 121 | } 122 | let mut cur_earliest_seqno = self.earliest_seqno.load(Ordering::Relaxed); 123 | while sequence < cur_earliest_seqno { 124 | match self.earliest_seqno.compare_exchange_weak( 125 | cur_earliest_seqno, 126 | sequence, 127 | Ordering::SeqCst, 128 | Ordering::SeqCst, 129 | ) { 130 | Ok(_) => break, 131 | Err(v) => cur_earliest_seqno = v, 132 | } 133 | } 134 | } 135 | } 136 | 137 | pub struct MemIteratorWrapper { 138 | inner: Box, 139 | } 140 | 141 | #[async_trait::async_trait] 142 | impl AsyncIterator for MemIteratorWrapper { 143 | fn valid(&self) -> bool { 144 | self.inner.valid() 145 | } 146 | 147 | async fn seek(&mut self, key: &[u8]) { 148 | self.inner.seek(key) 149 | } 150 | 151 | async fn seek_to_first(&mut self) { 152 | self.inner.seek_to_first() 153 | } 154 | 155 | async fn seek_to_last(&mut self) { 156 | self.inner.seek_to_last() 157 | } 158 | 159 | async fn seek_for_prev(&mut self, key: &[u8]) { 160 | self.inner.seek_for_prev(key) 161 | } 162 | 163 | async fn next(&mut self) { 164 | self.inner.next() 165 | } 166 | 167 | async fn prev(&mut self) { 168 | self.inner.prev() 169 | } 170 | 171 | fn key(&self) -> &[u8] { 172 | self.inner.key() 173 | } 174 | 175 | fn value(&self) -> &[u8] { 176 | self.inner.value() 177 | } 178 | } 179 | -------------------------------------------------------------------------------- /src/memtable/mod.rs: -------------------------------------------------------------------------------- 1 | mod arena; 2 | mod concurrent_arena; 3 | mod context; 4 | mod inline_skiplist; 5 | mod memtable; 6 | mod skiplist; 7 | mod skiplist_rep; 8 | 9 | use crate::iterator::InternalIterator; 10 | pub use context::MemTableContext; 11 | pub use inline_skiplist::Splice; 12 | pub use memtable::Memtable; 13 | pub use skiplist_rep::{InlineSkipListMemtableRep, SkipListMemtableRep}; 14 | 15 | use std::cmp::Ordering; 16 | 17 | const MAX_HEIGHT: usize = 20; 18 | 19 | pub trait MemtableRep: Send + Sync { 20 | fn new_iterator(&self) -> Box; 21 | fn add(&self, splice: &mut MemTableContext, key: &[u8], value: &[u8], sequence: u64); 22 | fn delete(&self, splice: &mut MemTableContext, key: &[u8], sequence: u64); 23 | fn mem_size(&self) -> usize; 24 | fn name(&self) -> &str; 25 | fn cmp(&self, start: &[u8], end: &[u8]) -> Ordering { 26 | start.cmp(end) 27 | } 28 | 29 | fn scan(&self, start: &[u8], end: &[u8], mut f: F) { 30 | let mut iter = self.new_iterator(); 31 | iter.seek(start); 32 | while iter.valid() && self.cmp(iter.key(), end) == Ordering::Less { 33 | f(iter.key(), iter.value()); 34 | iter.next(); 35 | } 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /src/memtable/skiplist_rep.rs: -------------------------------------------------------------------------------- 1 | use super::inline_skiplist::Comparator; 2 | use crate::common::format::pack_sequence_and_type; 3 | use crate::common::ValueType; 4 | use crate::iterator::InternalIterator; 5 | use crate::memtable::concurrent_arena::SharedArena; 6 | use 
crate::memtable::inline_skiplist::{InlineSkipList, SkipListIterator}; 7 | use crate::memtable::skiplist::{IterRef, Skiplist}; 8 | use crate::memtable::{MemTableContext, MemtableRep}; 9 | use crate::util::get_var_uint32; 10 | use crate::{InternalKeyComparator, KeyComparator}; 11 | use std::cmp::Ordering; 12 | 13 | pub struct DefaultComparator { 14 | comparator: InternalKeyComparator, 15 | } 16 | 17 | impl DefaultComparator { 18 | pub fn new(comparator: InternalKeyComparator) -> Self { 19 | Self { comparator } 20 | } 21 | } 22 | 23 | impl Comparator for DefaultComparator { 24 | fn compare(&self, k1: &[u8], k2: &[u8]) -> std::cmp::Ordering { 25 | self.comparator.compare_key(k1, k2) 26 | } 27 | 28 | unsafe fn compare_raw_key(&self, k1: *const u8, k2: *const u8) -> Ordering { 29 | let key1 = if *k1 < 128 { 30 | std::slice::from_raw_parts(k1.add(1), (*k1) as usize) 31 | } else { 32 | let data = std::slice::from_raw_parts(k1, 5); 33 | let mut offset = 0; 34 | let l = get_var_uint32(data, &mut offset).unwrap(); 35 | std::slice::from_raw_parts(k1.add(offset), l as usize) 36 | }; 37 | let key2 = if ((*k2) & 128) == 0 { 38 | std::slice::from_raw_parts(k2.add(1), (*k2) as usize) 39 | } else { 40 | let data = std::slice::from_raw_parts(k2, 5); 41 | let mut offset = 0; 42 | let l = get_var_uint32(data, &mut offset).unwrap(); 43 | std::slice::from_raw_parts(k2.add(offset), l as usize) 44 | }; 45 | self.comparator.compare_key(key1, key2) 46 | } 47 | 48 | unsafe fn compare_key(&self, k1: *const u8, k2: &[u8]) -> Ordering { 49 | let key1 = if ((*k1) & 128) == 0 { 50 | std::slice::from_raw_parts(k1.add(1), (*k1) as usize) 51 | } else { 52 | let data = std::slice::from_raw_parts(k1, 5); 53 | let mut offset = 0; 54 | let l = get_var_uint32(data, &mut offset).unwrap(); 55 | std::slice::from_raw_parts(k1.add(offset), l as usize) 56 | }; 57 | self.comparator.compare_key(key1, k2) 58 | } 59 | } 60 | 61 | // TODO: support in memory bloom filter 62 | pub struct InlineSkipListMemtableRep { 63 | list: InlineSkipList, 64 | comparator: InternalKeyComparator, 65 | } 66 | 67 | pub struct InlineSkipListMemtableIter { 68 | iter: SkipListIterator, 69 | current_offset: usize, 70 | current_key_size: usize, 71 | buf: Vec, 72 | } 73 | 74 | unsafe impl Send for InlineSkipListMemtableRep {} 75 | unsafe impl Sync for InlineSkipListMemtableRep {} 76 | unsafe impl Send for InlineSkipListMemtableIter {} 77 | unsafe impl Sync for InlineSkipListMemtableIter {} 78 | 79 | impl InternalIterator for InlineSkipListMemtableIter { 80 | fn valid(&self) -> bool { 81 | self.iter.valid() 82 | } 83 | 84 | fn seek(&mut self, key: &[u8]) { 85 | self.iter.seek(&mut self.buf, key) 86 | } 87 | 88 | fn seek_to_first(&mut self) { 89 | self.iter.seek_to_first(); 90 | } 91 | 92 | fn seek_to_last(&mut self) { 93 | self.iter.seek_to_last(); 94 | } 95 | 96 | fn seek_for_prev(&mut self, key: &[u8]) { 97 | self.iter.seek_for_prev(&mut self.buf, key) 98 | } 99 | 100 | fn next(&mut self) { 101 | self.iter.next(); 102 | } 103 | 104 | fn prev(&mut self) { 105 | self.iter.prev() 106 | } 107 | 108 | fn key(&self) -> &[u8] { 109 | self.iter.key() 110 | } 111 | 112 | fn value(&self) -> &[u8] { 113 | self.iter.value() 114 | } 115 | } 116 | 117 | impl InlineSkipListMemtableRep { 118 | pub fn new(comparator: InternalKeyComparator) -> Self { 119 | Self { 120 | list: InlineSkipList::new( 121 | SharedArena::new(), 122 | DefaultComparator { 123 | comparator: comparator.clone(), 124 | }, 125 | ), 126 | comparator, 127 | } 128 | } 129 | } 130 | 131 | impl MemtableRep for 
InlineSkipListMemtableRep { 132 | fn new_iterator(&self) -> Box { 133 | Box::new(InlineSkipListMemtableIter { 134 | iter: SkipListIterator::new(&self.list), 135 | current_offset: 0, 136 | current_key_size: 0, 137 | buf: vec![], 138 | }) 139 | } 140 | 141 | fn add(&self, ctx: &mut MemTableContext, key: &[u8], value: &[u8], sequence: u64) { 142 | self.list.add(ctx, key, value, sequence) 143 | } 144 | 145 | fn delete(&self, ctx: &mut MemTableContext, key: &[u8], sequence: u64) { 146 | self.list.delete(ctx, key, sequence) 147 | } 148 | 149 | fn mem_size(&self) -> usize { 150 | self.list.mem_size() 151 | } 152 | 153 | fn name(&self) -> &str { 154 | "InlineSkipListMemtable" 155 | } 156 | 157 | fn cmp(&self, start: &[u8], end: &[u8]) -> Ordering { 158 | self.comparator.compare_key(start, end) 159 | } 160 | } 161 | 162 | pub struct SkipListMemtableRep { 163 | list: Skiplist, 164 | comp: InternalKeyComparator, 165 | } 166 | 167 | impl SkipListMemtableRep { 168 | pub fn new(comparator: InternalKeyComparator, write_buffer_size: usize) -> Self { 169 | Self { 170 | list: Skiplist::with_capacity(comparator.clone(), write_buffer_size as u32), 171 | comp: comparator, 172 | } 173 | } 174 | } 175 | 176 | pub struct MemIterator { 177 | inner: IterRef, 178 | } 179 | 180 | impl InternalIterator for MemIterator { 181 | fn valid(&self) -> bool { 182 | self.inner.valid() 183 | } 184 | 185 | fn seek(&mut self, key: &[u8]) { 186 | self.inner.seek(key) 187 | } 188 | 189 | fn seek_to_first(&mut self) { 190 | self.inner.seek_to_first(); 191 | } 192 | 193 | fn seek_to_last(&mut self) { 194 | self.inner.seek_to_last() 195 | } 196 | 197 | fn seek_for_prev(&mut self, key: &[u8]) { 198 | self.inner.seek_for_prev(key) 199 | } 200 | 201 | fn next(&mut self) { 202 | self.inner.next() 203 | } 204 | 205 | fn prev(&mut self) { 206 | self.inner.prev() 207 | } 208 | 209 | fn key(&self) -> &[u8] { 210 | self.inner.key().as_ref() 211 | } 212 | 213 | fn value(&self) -> &[u8] { 214 | self.inner.value().as_ref() 215 | } 216 | } 217 | 218 | impl MemtableRep for SkipListMemtableRep { 219 | fn new_iterator(&self) -> Box { 220 | Box::new(MemIterator { 221 | inner: self.list.iter(), 222 | }) 223 | } 224 | 225 | fn add(&self, _: &mut MemTableContext, key: &[u8], value: &[u8], sequence: u64) { 226 | let mut ukey = Vec::with_capacity(key.len() + 8); 227 | ukey.extend_from_slice(key); 228 | ukey.extend_from_slice( 229 | &pack_sequence_and_type(sequence, ValueType::TypeValue as u8).to_le_bytes(), 230 | ); 231 | self.list.put(ukey, value.to_vec()); 232 | } 233 | fn delete(&self, _: &mut MemTableContext, key: &[u8], sequence: u64) { 234 | let mut ukey = Vec::with_capacity(key.len() + 8); 235 | ukey.extend_from_slice(key); 236 | ukey.extend_from_slice( 237 | &pack_sequence_and_type(sequence, ValueType::TypeDeletion as u8).to_le_bytes(), 238 | ); 239 | self.list.put(ukey, vec![]); 240 | } 241 | 242 | fn mem_size(&self) -> usize { 243 | self.list.mem_size() as usize 244 | } 245 | 246 | fn name(&self) -> &str { 247 | "SkipListMemtable" 248 | } 249 | 250 | fn cmp(&self, start: &[u8], end: &[u8]) -> Ordering { 251 | self.comp.compare_key(start, end) 252 | } 253 | } 254 | -------------------------------------------------------------------------------- /src/options.rs: -------------------------------------------------------------------------------- 1 | use crate::common::{FileSystem, InternalKeyComparator, InternalKeySliceTransform}; 2 | use crate::memtable::MemTableContext; 3 | use crate::table::{BlockBasedTableFactory, TableFactory}; 4 | use 
crate::{KeyComparator, SliceTransform, SyncPosixFileSystem}; 5 | use std::sync::Arc; 6 | 7 | pub struct ImmutableDBOptions { 8 | pub max_manifest_file_size: usize, 9 | pub max_total_wal_size: usize, 10 | pub db_path: String, 11 | pub fs: Arc, 12 | pub max_background_jobs: usize, 13 | } 14 | 15 | #[derive(Clone)] 16 | pub struct ColumnFamilyOptions { 17 | pub write_buffer_size: usize, 18 | pub max_write_buffer_number: usize, 19 | pub factory: Arc, 20 | pub comparator: InternalKeyComparator, 21 | pub prefix_extractor: Arc, 22 | pub max_level: u32, 23 | pub max_bytes_for_level_base: u64, 24 | pub max_bytes_for_level_multiplier: f64, 25 | pub target_file_size_base: usize, 26 | pub level0_file_num_compaction_trigger: usize, 27 | pub max_compaction_bytes: usize, 28 | } 29 | 30 | impl PartialEq for ColumnFamilyOptions { 31 | fn eq(&self, other: &Self) -> bool { 32 | self.write_buffer_size == other.write_buffer_size 33 | && self.max_write_buffer_number == other.max_write_buffer_number 34 | && self.factory.name().eq(other.factory.name()) 35 | && self.comparator.name().eq(other.comparator.name()) 36 | && self.max_level == other.max_level 37 | && self.max_bytes_for_level_base == other.max_bytes_for_level_base 38 | && self.max_bytes_for_level_multiplier == other.max_bytes_for_level_multiplier 39 | && self.target_file_size_base == other.target_file_size_base 40 | } 41 | } 42 | 43 | impl Eq for ColumnFamilyOptions {} 44 | 45 | impl Default for ColumnFamilyOptions { 46 | fn default() -> Self { 47 | ColumnFamilyOptions { 48 | write_buffer_size: 4 << 20, 49 | max_write_buffer_number: 1, 50 | factory: Arc::new(BlockBasedTableFactory::default()), 51 | comparator: InternalKeyComparator::default(), 52 | prefix_extractor: Arc::new(InternalKeySliceTransform::default()), 53 | max_level: 7, 54 | max_bytes_for_level_base: 256 * 1024 * 1024, 55 | max_bytes_for_level_multiplier: 10.0, 56 | target_file_size_base: 64 * 1024 * 1024, 57 | level0_file_num_compaction_trigger: 4, 58 | max_compaction_bytes: 64 * 1024usize * 1024usize * 25, 59 | } 60 | } 61 | } 62 | 63 | #[derive(Clone)] 64 | pub struct DBOptions { 65 | pub max_manifest_file_size: usize, 66 | pub max_total_wal_size: usize, 67 | pub create_if_missing: bool, 68 | pub create_missing_column_families: bool, 69 | pub fs: Arc, 70 | pub db_path: String, 71 | pub db_name: String, 72 | pub max_background_jobs: usize, 73 | } 74 | 75 | impl Default for DBOptions { 76 | fn default() -> Self { 77 | Self { 78 | max_manifest_file_size: 128 * 1024 * 1024, // 100MB 79 | max_total_wal_size: 128 * 1024 * 1024, // 100MB 80 | create_if_missing: false, 81 | create_missing_column_families: false, 82 | fs: Arc::new(SyncPosixFileSystem {}), 83 | db_path: "db".to_string(), 84 | db_name: "db".to_string(), 85 | max_background_jobs: 2, 86 | } 87 | } 88 | } 89 | 90 | #[derive(Clone)] 91 | pub struct ColumnFamilyDescriptor { 92 | pub name: String, 93 | pub options: ColumnFamilyOptions, 94 | } 95 | 96 | impl From for ImmutableDBOptions { 97 | fn from(opt: DBOptions) -> Self { 98 | Self { 99 | max_manifest_file_size: opt.max_manifest_file_size, 100 | max_total_wal_size: opt.max_total_wal_size, 101 | db_path: opt.db_path.clone(), 102 | fs: opt.fs.clone(), 103 | max_background_jobs: opt.max_background_jobs, 104 | } 105 | } 106 | } 107 | 108 | #[derive(Default, Clone)] 109 | pub struct ReadOptions { 110 | pub snapshot: Option, 111 | pub fill_cache: bool, 112 | pub total_order_seek: bool, 113 | pub prefix_same_as_start: bool, 114 | pub skip_filter: bool, 115 | } 116 | 117 | #[derive(Default, 
Clone)]
118 | pub struct WriteOptions {
119 |     pub ctx: MemTableContext,
120 |     pub disable_wal: bool,
121 |     pub sync: bool,
122 | }
123 | 
124 | const DEFAULT_COMPRESSION_LEVEL: i32 = 32767;
125 | 
126 | #[derive(Clone)]
127 | pub struct CompressionOptions {
128 |     pub window_bits: i32,
129 |     pub level: i32,
130 |     pub strategy: i32,
131 | }
132 | 
133 | impl Default for CompressionOptions {
134 |     fn default() -> Self {
135 |         CompressionOptions {
136 |             window_bits: -14,
137 |             level: DEFAULT_COMPRESSION_LEVEL,
138 |             strategy: 0,
139 |         }
140 |     }
141 | }
142 | 
--------------------------------------------------------------------------------
/src/pipeline.rs:
--------------------------------------------------------------------------------
1 | use crate::version::KernelNumberContext;
2 | use std::sync::{Arc, Condvar, Mutex};
3 | 
4 | /// PipelineCommitQueue ensures write batches are committed in their assigned sequence order.
5 | 
6 | struct QueueState {
7 |     waiter_count: usize,
8 |     stop: bool,
9 | }
10 | 
11 | pub struct PipelineCommitQueue {
12 |     cond: Condvar,
13 |     waiters: Mutex<QueueState>,
14 |     kernel: Arc<KernelNumberContext>,
15 | }
16 | 
17 | impl PipelineCommitQueue {
18 |     pub fn new(kernel: Arc<KernelNumberContext>) -> Self {
19 |         Self {
20 |             cond: Condvar::new(),
21 |             waiters: Mutex::new(QueueState {
22 |                 waiter_count: 0,
23 |                 stop: false,
24 |             }),
25 |             kernel,
26 |         }
27 |     }
28 | 
29 |     pub fn commit(&self, last_commit_sequence: u64, commit_sequence: u64) {
30 |         // We do not need to use compare_exchange because every request will have its own unique
31 |         // `commit_sequence` and `last_commit_sequence`.
32 |         for _ in 0..100 {
33 |             if self.kernel.last_sequence() == last_commit_sequence {
34 |                 self.kernel.set_last_sequence(commit_sequence);
35 |                 self.cond.notify_all();
36 |                 return;
37 |             }
38 |         }
39 |         let mut state = self.waiters.lock().unwrap();
40 |         if self.kernel.last_sequence() == last_commit_sequence {
41 |             self.kernel.set_last_sequence(commit_sequence);
42 |             self.cond.notify_all();
43 |             return;
44 |         }
45 |         while self.kernel.last_sequence() != last_commit_sequence && !state.stop {
46 |             state = self.cond.wait(state).unwrap();
47 |         }
48 |         if !state.stop {
49 |             self.kernel.set_last_sequence(commit_sequence);
50 |             self.cond.notify_all();
51 |         }
52 |     }
53 | 
54 |     // Returns true if this commit queue has been stopped and the DB will be closed.
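// Example of the ordering contract: writer A holds
// (last_commit_sequence, commit_sequence) = (10, 15) and writer B holds
// (15, 18). Even if B's WAL write finishes first, commit(15, 18) blocks
// until A publishes 15, so sequence 18 never becomes visible before
// 10..=15 are.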
55 |     pub fn wait_pending_writers(&self, commit_sequence: u64) -> bool {
56 |         for _ in 0..100 {
57 |             if self.kernel.last_sequence() >= commit_sequence {
58 |                 return false;
59 |             }
60 |         }
61 |         let mut state = self.waiters.lock().unwrap();
62 |         while self.kernel.last_sequence() < commit_sequence && !state.stop {
63 |             state = self.cond.wait(state).unwrap();
64 |         }
65 |         state.stop
66 |     }
67 | 
68 |     pub fn stop(&mut self) {
69 |         let mut state = self.waiters.lock().unwrap();
70 |         state.stop = true;
71 |         self.cond.notify_all();
72 |     }
73 | }
74 | 
--------------------------------------------------------------------------------
/src/table/block_based/compression.rs:
--------------------------------------------------------------------------------
1 | use crate::common::Result;
2 | 
3 | #[derive(Default)]
4 | pub struct CompressionInfo {}
5 | 
6 | pub struct UncompressionInfo {}
7 | 
8 | pub trait CompressionAlgorithm {
9 |     fn compress(&self, info: &CompressionInfo, format_version: u32, data: &[u8])
10 |         -> Result<Vec<u8>>;
11 |     fn name(&self) -> &'static str;
12 |     fn uncompress(
13 |         &self,
14 |         info: &UncompressionInfo,
15 |         format_version: u32,
16 |         origin_data: &[u8],
17 |     ) -> Result<Vec<u8>>;
18 | }
19 | 
--------------------------------------------------------------------------------
/src/table/block_based/data_block_hash_index_builder.rs:
--------------------------------------------------------------------------------
1 | use crate::util::decode_fixed_uint16;
2 | use crate::util::hash::key_hash;
3 | 
4 | const MAX_RESTART_SUPPORTED_BY_HASH_INDEX: usize = 253;
5 | const NO_ENTRY: u8 = 255;
6 | const COLLISION: u8 = 254;
7 | 
8 | pub struct DataBlockHashIndexBuilder {
9 |     valid: bool,
10 |     bucket_per_key: f64,
11 |     estimated_num_buckets: f64,
12 |     hash_and_restart_pairs: Vec<(u32, u8)>,
13 | }
14 | 
15 | impl Default for DataBlockHashIndexBuilder {
16 |     fn default() -> Self {
17 |         DataBlockHashIndexBuilder {
18 |             bucket_per_key: -1.0,
19 |             estimated_num_buckets: 0.0,
20 |             valid: false,
21 |             hash_and_restart_pairs: vec![],
22 |         }
23 |     }
24 | }
25 | 
26 | impl DataBlockHashIndexBuilder {
27 |     pub fn init(&mut self, mut ratio: f64) {
28 |         if ratio <= 0.0 {
29 |             ratio = 0.75;
30 |         }
31 |         self.bucket_per_key = 1.0 / ratio;
32 |         self.valid = true;
33 |     }
34 | 
35 |     pub fn clear(&mut self) {
36 |         self.estimated_num_buckets = 0.0;
37 |         self.valid = true;
38 |         self.hash_and_restart_pairs.clear();
39 |     }
40 | 
41 |     pub fn valid(&self) -> bool {
42 |         self.valid && self.bucket_per_key > 0.0
43 |     }
44 | 
45 |     pub fn add(&mut self, user_key: &[u8], restart_index: usize) {
46 |         if restart_index > MAX_RESTART_SUPPORTED_BY_HASH_INDEX {
47 |             self.valid = false;
48 |             return;
49 |         }
50 |         let h = key_hash(user_key);
51 |         self.hash_and_restart_pairs.push((h, restart_index as u8));
52 |         self.estimated_num_buckets += self.bucket_per_key;
53 |     }
54 | 
55 |     pub fn finish(&mut self, data: &mut Vec<u8>) {
56 |         let mut num_buckets = self.estimated_num_buckets.round() as u16;
57 |         if num_buckets == 0 {
58 |             num_buckets = 1;
59 |         }
60 |         num_buckets |= 1;
61 |         let mut buckets = vec![NO_ENTRY; num_buckets as usize];
62 |         for (hash_value, restart_index) in &self.hash_and_restart_pairs {
63 |             let buck_idx = (*hash_value) as usize % num_buckets as usize;
64 |             if buckets[buck_idx] == NO_ENTRY {
65 |                 buckets[buck_idx] = *restart_index;
66 |             } else if buckets[buck_idx] != *restart_index {
67 |                 buckets[buck_idx] = COLLISION;
68 |             }
69 |         }
70 |         data.extend_from_slice(&buckets);
71 |         data.extend_from_slice(&num_buckets.to_le_bytes());
72 |     }
73 | 
74 |     pub fn
estimate_size(&self) -> usize { 75 | let mut estimated_num_buckets = self.estimated_num_buckets.round() as u16; 76 | 77 | // Maching the num_buckets number in DataBlockHashIndexBuilder::Finish. 78 | estimated_num_buckets |= 1; 79 | 80 | std::mem::size_of::() + estimated_num_buckets as usize * std::mem::size_of::() 81 | } 82 | } 83 | 84 | #[derive(Default, Clone)] 85 | pub struct DataBlockHashIndex { 86 | num_buckets: u16, 87 | } 88 | 89 | impl DataBlockHashIndex { 90 | pub fn initialize(&mut self, data: &[u8]) -> u16 { 91 | let offset = data.len() - std::mem::size_of::(); 92 | self.num_buckets = decode_fixed_uint16(&data[offset..]); 93 | (offset - self.num_buckets as usize * std::mem::size_of::()) as u16 94 | } 95 | 96 | pub fn lookup(&self, data: &[u8], map_offset: usize, key: &[u8]) -> u8 { 97 | let h = key_hash(key); 98 | let idx = (h % self.num_buckets as u32) as usize; 99 | data[idx + map_offset] 100 | } 101 | } 102 | -------------------------------------------------------------------------------- /src/table/block_based/filter_block_builder.rs: -------------------------------------------------------------------------------- 1 | use crate::common::Result; 2 | use crate::table::block_based::filter_reader::FilterBlockReader; 3 | use crate::table::block_based::options::BlockBasedTableOptions; 4 | 5 | pub trait FilterBlockBuilder: Send { 6 | fn is_block_based(&self) -> bool { 7 | false 8 | } 9 | fn add(&mut self, key: &[u8]); 10 | fn start_block(&mut self, offset: u64); 11 | fn finish(&mut self) -> Result<&[u8]>; 12 | fn num_added(&self) -> usize; 13 | } 14 | 15 | pub trait FilterBlockFactory: Send + Sync { 16 | fn create_builder(&self, opts: &BlockBasedTableOptions) -> Box; 17 | fn create_filter_reader(&self, filter_block: Vec) -> Box; 18 | fn name(&self) -> &'static str; 19 | fn is_block_based(&self) -> bool { 20 | false 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /src/table/block_based/filter_reader.rs: -------------------------------------------------------------------------------- 1 | use crate::util::decode_fixed_uint32; 2 | use crate::util::hash::bloom_hash; 3 | 4 | pub trait FilterBlockReader: Sync + Send { 5 | fn is_block_based(&self) -> bool { 6 | false 7 | } 8 | fn key_may_match(&self, key: &[u8]) -> bool; 9 | } 10 | 11 | pub struct FullFilterBlockReader { 12 | data: Vec, 13 | num_probes: usize, 14 | num_lines: u32, 15 | log2_cache_line_size: u32, 16 | } 17 | 18 | impl FullFilterBlockReader { 19 | pub fn new(data: Vec) -> Self { 20 | let mut reader = FullFilterBlockReader { 21 | data, 22 | num_probes: 0, 23 | num_lines: 0, 24 | log2_cache_line_size: 0, 25 | }; 26 | let l = reader.data.len(); 27 | if l > 5 { 28 | reader.num_probes = reader.data[l - 5] as usize; 29 | reader.num_lines = decode_fixed_uint32(&reader.data[(l - 4)..]); 30 | } 31 | if reader.num_lines != 0 && (reader.data.len() - 5) % reader.num_lines as usize != 0 { 32 | reader.num_lines = 0; 33 | reader.num_probes = 0; 34 | } else if reader.num_lines != 0 { 35 | let mut num_lines_at_curr_cache_size = 36 | (reader.data.len() as u32 - 5) >> reader.log2_cache_line_size; 37 | while num_lines_at_curr_cache_size != reader.num_lines { 38 | reader.log2_cache_line_size += 1; 39 | num_lines_at_curr_cache_size = 40 | (reader.data.len() as u32 - 5) >> reader.log2_cache_line_size; 41 | } 42 | } 43 | reader 44 | } 45 | 46 | fn hash_may_match(&self, mut h: u32, bit_offset: u32) -> bool { 47 | let delta = (h >> 17) | (h << 15); 48 | for _ in 0..self.num_probes { 49 | let bitpos 
= bit_offset + (h & ((1 << (self.log2_cache_line_size + 3)) - 1)); 50 | if (self.data[bitpos as usize / 8] & (1u8 << (bitpos % 8) as u8)) == 0 { 51 | return false; 52 | } 53 | h = h.wrapping_add(delta); 54 | } 55 | true 56 | } 57 | } 58 | 59 | impl FilterBlockReader for FullFilterBlockReader { 60 | fn key_may_match(&self, key: &[u8]) -> bool { 61 | let h = bloom_hash(key); 62 | // Left shift by an extra 3 to convert bytes to bits 63 | let bit_offset = (h % self.num_lines) << (self.log2_cache_line_size + 3); 64 | self.hash_may_match(h, bit_offset) 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /src/table/block_based/full_filter_block_builder.rs: -------------------------------------------------------------------------------- 1 | use crate::common::SliceTransform; 2 | use crate::table::block_based::filter_block_builder::*; 3 | use crate::table::block_based::filter_reader::{FilterBlockReader, FullFilterBlockReader}; 4 | use crate::table::block_based::options::BlockBasedTableOptions; 5 | use crate::util::hash::bloom_hash; 6 | use std::sync::Arc; 7 | 8 | const CACHE_LINE_SIZE: u32 = 128; 9 | 10 | pub struct FullFilterBitsBuilder { 11 | hash_entries: Vec, 12 | bits_per_key: usize, 13 | num_probes: usize, 14 | } 15 | 16 | impl FullFilterBitsBuilder { 17 | fn add_key(&mut self, key: &[u8]) { 18 | let h = bloom_hash(key); 19 | if self.hash_entries.last().map_or(true, |e| *e != h) { 20 | self.hash_entries.push(h); 21 | } 22 | } 23 | 24 | fn calculate_space(&self, num_entry: usize) -> (Vec, u32, u32) { 25 | if num_entry > 0 { 26 | let total_bits_tmp = num_entry as u32 * self.bits_per_key as u32; 27 | let mut num_lines = (total_bits_tmp + CACHE_LINE_SIZE * 8 - 1) / (CACHE_LINE_SIZE * 8); 28 | if num_lines % 2 == 0 { 29 | num_lines += 1; 30 | } 31 | let total_bits = num_lines * (CACHE_LINE_SIZE * 8); 32 | let data = vec![0u8; total_bits as usize / 8 + 5]; 33 | (data, total_bits, num_lines) 34 | } else { 35 | (vec![0u8; 5], 0, 0) 36 | } 37 | } 38 | 39 | fn add_hash(&self, mut h: u32, data: &mut Vec, num_lines: u32) { 40 | let delta = (h >> 17) | (h << 15); 41 | let b = (h % num_lines) * (CACHE_LINE_SIZE * 8); 42 | for _ in 0..self.num_probes { 43 | let bitpos = b + (h % (CACHE_LINE_SIZE * 8)); 44 | data[bitpos as usize / 8] |= (1 << (bitpos % 8)) as u8; 45 | h = h.wrapping_add(delta); 46 | } 47 | } 48 | 49 | fn finish(&mut self) -> Vec { 50 | let (mut data, total_bits, num_lines) = self.calculate_space(self.hash_entries.len()); 51 | let hash_entries = std::mem::take(&mut self.hash_entries); 52 | if total_bits != 0 && num_lines != 0 { 53 | for h in hash_entries { 54 | self.add_hash(h, &mut data, num_lines); 55 | } 56 | } 57 | let pos = total_bits as usize / 8; 58 | data[pos] = self.num_probes as u8; 59 | data[(pos + 1)..(pos + 5)].copy_from_slice(&num_lines.to_le_bytes()); 60 | data 61 | } 62 | } 63 | 64 | pub struct FullFilterBlockBuilder { 65 | prefix_extractor: Option>, 66 | filter_bits_builder: FullFilterBitsBuilder, 67 | whole_key_filtering: bool, 68 | last_whole_key_recorded: bool, 69 | last_prefix_recorded: bool, 70 | last_whole_key_str: Vec, 71 | last_prefix_str: Vec, 72 | filter_data: Vec, 73 | num_added: u32, 74 | } 75 | 76 | impl FullFilterBlockBuilder { 77 | pub fn new(opts: &BlockBasedTableOptions, filter_bits_builder: FullFilterBitsBuilder) -> Self { 78 | FullFilterBlockBuilder { 79 | prefix_extractor: opts.prefix_extractor.clone(), 80 | filter_bits_builder, 81 | whole_key_filtering: opts.whole_key_filtering, 82 | last_whole_key_recorded: 
false, 83 | last_prefix_recorded: false, 84 | last_whole_key_str: vec![], 85 | last_prefix_str: vec![], 86 | filter_data: vec![], 87 | num_added: 0, 88 | } 89 | } 90 | 91 | fn add_prefix(&mut self, key: &[u8]) { 92 | let prefix_extractor = self.prefix_extractor.take().unwrap(); 93 | let prefix = prefix_extractor.transform(key); 94 | if self.whole_key_filtering { 95 | let last_prefix = self.last_prefix_str.as_slice(); 96 | if !self.last_prefix_recorded || last_prefix.eq(prefix) { 97 | self.add_key(prefix); 98 | self.last_prefix_recorded = true; 99 | self.last_prefix_str = prefix.to_vec(); 100 | } 101 | } else { 102 | self.add_key(prefix); 103 | } 104 | self.prefix_extractor = Some(prefix_extractor); 105 | } 106 | 107 | fn add_key(&mut self, key: &[u8]) { 108 | self.filter_bits_builder.add_key(key); 109 | self.num_added += 1; 110 | } 111 | 112 | fn reset(&mut self) { 113 | self.last_whole_key_recorded = false; 114 | self.last_prefix_recorded = false; 115 | } 116 | } 117 | 118 | impl FilterBlockBuilder for FullFilterBlockBuilder { 119 | fn add(&mut self, key: &[u8]) { 120 | let add_prefix = self 121 | .prefix_extractor 122 | .as_ref() 123 | .map_or(false, |extractor| extractor.in_domain(key)); 124 | if self.whole_key_filtering { 125 | if !add_prefix { 126 | self.add_key(key); 127 | } else { 128 | let last_whole_key = self.last_whole_key_str.as_slice(); 129 | if !self.last_whole_key_recorded || !last_whole_key.eq(key) { 130 | // drop(last_whole_key); 131 | self.add_key(key); 132 | self.last_whole_key_recorded = true; 133 | self.last_whole_key_str = key.to_vec(); 134 | } 135 | } 136 | } 137 | if add_prefix { 138 | self.add_prefix(key); 139 | } 140 | } 141 | 142 | fn start_block(&mut self, _: u64) {} 143 | 144 | fn finish(&mut self) -> crate::common::Result<&[u8]> { 145 | if self.num_added != 0 { 146 | self.filter_data = self.filter_bits_builder.finish(); 147 | return Ok(&self.filter_data); 148 | } 149 | self.filter_data.clear(); 150 | Ok(&self.filter_data) 151 | } 152 | 153 | fn num_added(&self) -> usize { 154 | self.num_added as usize 155 | } 156 | } 157 | 158 | pub struct FullFilterBlockFactory { 159 | bits_per_key: usize, 160 | num_probes: usize, 161 | } 162 | 163 | impl FullFilterBlockFactory { 164 | pub fn new(bits_per_key: usize) -> Self { 165 | let mut num_probes = (bits_per_key as f64 * 0.69).round() as usize; // 0.69 =~ ln(2) 166 | if num_probes < 1 { 167 | num_probes = 1; 168 | } 169 | if num_probes > 30 { 170 | num_probes = 30; 171 | } 172 | Self { 173 | bits_per_key, 174 | num_probes, 175 | } 176 | } 177 | } 178 | 179 | impl FilterBlockFactory for FullFilterBlockFactory { 180 | fn create_builder(&self, opts: &BlockBasedTableOptions) -> Box { 181 | let bits = FullFilterBitsBuilder { 182 | hash_entries: vec![], 183 | bits_per_key: self.bits_per_key, 184 | num_probes: self.num_probes, 185 | }; 186 | let builder = FullFilterBlockBuilder::new(opts, bits); 187 | Box::new(builder) 188 | } 189 | 190 | fn create_filter_reader(&self, filter_block: Vec) -> Box { 191 | Box::new(FullFilterBlockReader::new(filter_block)) 192 | } 193 | 194 | fn name(&self) -> &'static str { 195 | "rocksdb.BuiltinBloomFilter" 196 | } 197 | } 198 | #[cfg(test)] 199 | mod tests { 200 | use super::*; 201 | 202 | #[test] 203 | fn test_index_builder() { 204 | let options = BlockBasedTableOptions { 205 | whole_key_filtering: true, 206 | ..Default::default() 207 | }; 208 | let factory = FullFilterBlockFactory::new(10); 209 | let mut builder = factory.create_builder(&options); 210 | builder.add(b"abcdeeeeee"); 211 | 
builder.add(b"abcdefffff"); 212 | builder.add(b"abcdeggggg"); 213 | builder.add(b"abcdehhhhh"); 214 | builder.add(b"abcdeiiiii"); 215 | builder.add(b"abcdejjjjj"); 216 | let data = builder.finish().unwrap().to_vec(); 217 | let reader = factory.create_filter_reader(data); 218 | assert!(reader.key_may_match(b"abcdeeeeee")); 219 | assert!(reader.key_may_match(b"abcdefffff")); 220 | assert!(reader.key_may_match(b"abcdeggggg")); 221 | assert!(reader.key_may_match(b"abcdehhhhh")); 222 | assert!(reader.key_may_match(b"abcdeiiiii")); 223 | assert!(reader.key_may_match(b"abcdejjjjj")); 224 | assert!(!reader.key_may_match(b"abcdejjjjjk")); 225 | assert!(!reader.key_may_match(b"abcdejjjj")); 226 | assert!(!reader.key_may_match(b"abcdejjjjjj")); 227 | } 228 | } 229 | -------------------------------------------------------------------------------- /src/table/block_based/index_builder.rs: -------------------------------------------------------------------------------- 1 | use super::options::IndexShorteningMode; 2 | use crate::common::format::extract_user_key; 3 | use crate::common::{InternalKeyComparator, KeyComparator, Result}; 4 | use crate::table::block_based::block_builder::{BlockBuilder, DEFAULT_HASH_TABLE_UTIL_RATIO}; 5 | use crate::table::block_based::options::{BlockBasedTableOptions, DataBlockIndexType, IndexType}; 6 | use crate::table::format::*; 7 | use std::cmp::Ordering; 8 | 9 | pub struct IndexBlocks { 10 | pub index_block_contents: Vec, 11 | // pub meta_blocks: std::collections::HashMap, Vec>, 12 | } 13 | 14 | pub trait IndexBuilder: Send { 15 | fn add_index_entry( 16 | &mut self, 17 | last_key_in_current_block: &mut Vec, 18 | first_key_in_next_block: &[u8], 19 | block_handle: &BlockHandle, 20 | ); 21 | fn on_key_added(&mut self, key: &[u8]); 22 | fn finish(&mut self) -> Result<&[u8]>; 23 | fn index_size(&self) -> usize; 24 | fn seperator_is_key_plus_seq(&self) -> bool { 25 | true 26 | } 27 | } 28 | 29 | pub struct ShortenedIndexBuilder { 30 | index_block_builder: BlockBuilder, 31 | index_block_builder_without_seq: BlockBuilder, 32 | include_first_key: bool, 33 | shortening_mode: IndexShorteningMode, 34 | current_block_first_internal_key: Vec, 35 | comparator: InternalKeyComparator, 36 | seperator_is_key_plus_seq: bool, 37 | index_size: usize, 38 | } 39 | 40 | impl ShortenedIndexBuilder { 41 | fn new( 42 | comparator: InternalKeyComparator, 43 | index_block_restart_interval: usize, 44 | format_version: u32, 45 | include_first_key: bool, 46 | shortening_mode: IndexShorteningMode, 47 | ) -> Self { 48 | let index_block_builder = BlockBuilder::new( 49 | index_block_restart_interval, 50 | true, 51 | DataBlockIndexType::DataBlockBinarySearch, 52 | DEFAULT_HASH_TABLE_UTIL_RATIO, 53 | ); 54 | let index_block_builder_without_seq = BlockBuilder::new( 55 | index_block_restart_interval, 56 | true, 57 | DataBlockIndexType::DataBlockBinarySearch, 58 | DEFAULT_HASH_TABLE_UTIL_RATIO, 59 | ); 60 | ShortenedIndexBuilder { 61 | index_block_builder, 62 | index_block_builder_without_seq, 63 | include_first_key, 64 | shortening_mode, 65 | current_block_first_internal_key: vec![], 66 | comparator, 67 | seperator_is_key_plus_seq: format_version <= 2, 68 | index_size: 0, 69 | } 70 | } 71 | } 72 | 73 | impl IndexBuilder for ShortenedIndexBuilder { 74 | fn add_index_entry( 75 | &mut self, 76 | last_key_in_current_block: &mut Vec, 77 | first_key_in_next_block: &[u8], 78 | block_handle: &BlockHandle, 79 | ) { 80 | if !first_key_in_next_block.is_empty() { 81 | if self.shortening_mode != 
pub trait IndexBuilder: Send {
    fn add_index_entry(
        &mut self,
        last_key_in_current_block: &mut Vec<u8>,
        first_key_in_next_block: &[u8],
        block_handle: &BlockHandle,
    );
    fn on_key_added(&mut self, key: &[u8]);
    fn finish(&mut self) -> Result<&[u8]>;
    fn index_size(&self) -> usize;
    fn seperator_is_key_plus_seq(&self) -> bool {
        true
    }
}

pub struct ShortenedIndexBuilder {
    index_block_builder: BlockBuilder,
    index_block_builder_without_seq: BlockBuilder,
    include_first_key: bool,
    shortening_mode: IndexShorteningMode,
    current_block_first_internal_key: Vec<u8>,
    comparator: InternalKeyComparator,
    seperator_is_key_plus_seq: bool,
    index_size: usize,
}

impl ShortenedIndexBuilder {
    fn new(
        comparator: InternalKeyComparator,
        index_block_restart_interval: usize,
        format_version: u32,
        include_first_key: bool,
        shortening_mode: IndexShorteningMode,
    ) -> Self {
        let index_block_builder = BlockBuilder::new(
            index_block_restart_interval,
            true,
            DataBlockIndexType::DataBlockBinarySearch,
            DEFAULT_HASH_TABLE_UTIL_RATIO,
        );
        let index_block_builder_without_seq = BlockBuilder::new(
            index_block_restart_interval,
            true,
            DataBlockIndexType::DataBlockBinarySearch,
            DEFAULT_HASH_TABLE_UTIL_RATIO,
        );
        ShortenedIndexBuilder {
            index_block_builder,
            index_block_builder_without_seq,
            include_first_key,
            shortening_mode,
            current_block_first_internal_key: vec![],
            comparator,
            seperator_is_key_plus_seq: format_version <= 2,
            index_size: 0,
        }
    }
}

impl IndexBuilder for ShortenedIndexBuilder {
    fn add_index_entry(
        &mut self,
        last_key_in_current_block: &mut Vec<u8>,
        first_key_in_next_block: &[u8],
        block_handle: &BlockHandle,
    ) {
        if !first_key_in_next_block.is_empty() {
            if self.shortening_mode != IndexShorteningMode::NoShortening {
                self.comparator
                    .find_shortest_separator(last_key_in_current_block, first_key_in_next_block);
            }
            if !self.seperator_is_key_plus_seq
                && self.comparator.get_user_comparator().compare_key(
                    extract_user_key(last_key_in_current_block.as_slice()),
                    extract_user_key(first_key_in_next_block),
                ) == Ordering::Equal
            {
                // Two adjacent blocks share a user key, so user keys alone
                // cannot separate them; fall back to full internal keys.
                self.seperator_is_key_plus_seq = true;
            }
        } else if self.shortening_mode == IndexShorteningMode::ShortenSeparatorsAndSuccessor {
            self.comparator
                .find_short_successor(last_key_in_current_block);
        }
        let sep = last_key_in_current_block.as_slice();
        let entry = IndexValueRef::new(block_handle);
        let mut encoded_entry = vec![];
        entry.encode_to(&mut encoded_entry);
        self.index_block_builder.add(sep, &encoded_entry);
        if !self.seperator_is_key_plus_seq {
            self.index_block_builder_without_seq
                .add(extract_user_key(sep), &encoded_entry);
        }
        self.current_block_first_internal_key.clear();
    }

    fn on_key_added(&mut self, key: &[u8]) {
        if self.include_first_key && self.current_block_first_internal_key.is_empty() {
            self.current_block_first_internal_key = key.to_vec();
        }
    }

    fn finish(&mut self) -> Result<&[u8]> {
        let buf = if self.seperator_is_key_plus_seq {
            self.index_block_builder.finish()
        } else {
            self.index_block_builder_without_seq.finish()
        };
        self.index_size = buf.len();
        Ok(buf)
    }

    fn index_size(&self) -> usize {
        self.index_size
    }

    fn seperator_is_key_plus_seq(&self) -> bool {
        self.seperator_is_key_plus_seq
    }
}

pub fn create_index_builder(
    _: IndexType,
    comparator: InternalKeyComparator,
    opts: &BlockBasedTableOptions,
) -> Box<dyn IndexBuilder> {
    let builder = ShortenedIndexBuilder::new(
        comparator,
        opts.index_block_restart_interval,
        opts.format_version,
        false,
        opts.index_shortening,
    );
    Box::new(builder)
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::common::{
        FileSystem, InMemFileSystem, InternalKeyComparator, DISABLE_GLOBAL_SEQUENCE_NUMBER,
    };
    use crate::table::block_based::index_reader::IndexReader;
    use crate::table::InternalIterator;
    use std::path::Path;
    use std::sync::Arc;
    use tokio::runtime::Runtime;

    #[test]
    fn test_index_builder() {
        let mut builder = ShortenedIndexBuilder::new(
            InternalKeyComparator::default(),
            1,
            2,
            false,
            IndexShorteningMode::ShortenSeparators,
        );
        let mut kvs = vec![
            (b"abcdeeeeee".to_vec(), BlockHandle::new(100, 50)),
            (b"abcdefffff".to_vec(), BlockHandle::new(150, 50)),
            (b"abcdeggggg".to_vec(), BlockHandle::new(200, 50)),
            (b"abcdehhhhh".to_vec(), BlockHandle::new(250, 50)),
            (b"abcdeiiiii".to_vec(), BlockHandle::new(300, 50)),
            (b"abcdejjjjj".to_vec(), BlockHandle::new(350, 50)),
        ];

        for (k, _) in kvs.iter_mut() {
            k.extend_from_slice(&0u64.to_le_bytes());
        }
        for (k, v) in kvs.iter() {
            builder.add_index_entry(&mut k.clone(), &[], v);
        }
        let data = builder.finish().unwrap().to_vec();
        let seperate = builder.seperator_is_key_plus_seq();
        let fs = InMemFileSystem::default();
        let mut f = fs
            .open_writable_file_writer(Path::new("index_block"))
            .unwrap();
        let r = Runtime::new().unwrap();
        r.block_on(f.append(&data)).unwrap();
        let trailer: [u8; 5] = [0; 5];
        r.block_on(f.append(&trailer)).unwrap();
        r.block_on(f.sync()).unwrap();
        let readfile = fs
            .open_random_access_file(Path::new("index_block"))
            .unwrap();
        let handle = BlockHandle::new(0, data.len() as u64);

        let f = IndexReader::open(
            readfile.as_ref(),
            &handle,
            DISABLE_GLOBAL_SEQUENCE_NUMBER,
            seperate,
        );
        let reader = r.block_on(f).unwrap();
        let mut iter = reader.new_iterator(Arc::new(InternalKeyComparator::default()));
        iter.seek_to_first();
        for (k, v) in kvs {
            assert!(iter.valid());
            assert_eq!(iter.key(), k.as_slice());
            assert_eq!(iter.index_value().handle, v);
            iter.next();
        }
        let mut w = b"abcde".to_vec();
        w.extend_from_slice(&0u64.to_le_bytes());
        iter.seek(&w);
        assert!(iter.valid());
        assert_eq!(iter.index_value().handle.offset, 100);
    }
}
-------------------------------------------------------------------------------- /src/table/block_based/index_reader.rs: --------------------------------------------------------------------------------
use crate::common::{InternalKeyComparator, RandomAccessFileReader, Result};
use crate::table::block_based::block::{read_block_from_file, Block, IndexBlockIter};
use crate::table::format::BlockHandle;
use std::sync::Arc;

pub struct IndexReader {
    index_block: Arc<Block>,
    index_key_includes_seq: bool,
}

impl IndexReader {
    pub async fn open(
        file: &RandomAccessFileReader,
        handle: &BlockHandle,
        global_seqno: u64,
        index_key_includes_seq: bool,
    ) -> Result<Self> {
        let index_block = read_block_from_file(file, handle, global_seqno).await?;
        let reader = IndexReader {
            index_block,
            index_key_includes_seq,
        };
        Ok(reader)
    }

    pub fn new_iterator(&self, comparator: Arc<InternalKeyComparator>) -> Box<IndexBlockIter> {
        // When the separators were written without sequence numbers, compare
        // with the user comparator instead of the internal one.
        let iter = if self.index_key_includes_seq {
            self.index_block
                .new_index_iterator(comparator, self.index_key_includes_seq)
        } else {
            self.index_block.new_index_iterator(
                comparator.get_user_comparator().clone(),
                self.index_key_includes_seq,
            )
        };
        Box::new(iter)
    }
}
-------------------------------------------------------------------------------- /src/table/block_based/lz4.rs: --------------------------------------------------------------------------------
use crate::common::{Error, Result};
use crate::table::block_based::compression::{
    CompressionAlgorithm, CompressionInfo, UncompressionInfo,
};
use crate::util::{encode_var_uint32, get_var_uint32};
use libc::c_int;
use lz4::liblz4::{
    LZ4_compressBound, LZ4_compress_continue, LZ4_createStream, LZ4_createStreamDecode,
    LZ4_decompress_safe_continue, LZ4_freeStream, LZ4_freeStreamDecode,
};

#[derive(Default)]
pub struct LZ4CompressionAlgorithm {}

impl CompressionAlgorithm for LZ4CompressionAlgorithm {
    fn compress(
        &self,
        _info: &CompressionInfo,
        format_version: u32,
        data: &[u8],
    ) -> crate::Result<Vec<u8>> {
        if data.len() > u32::MAX as usize {
            return Err(Error::CompactionError(
                "can not compress block larger than 4GB".to_string(),
            ));
        }
        if format_version != 2 {
            unimplemented!();
        }
        unsafe {
            // Prepend the uncompressed length as a varint32 header.
            let mut tmp: [u8; 5] = [0u8; 5];
            let output_header_len = encode_var_uint32(&mut tmp, data.len() as u32);

            let compress_bound = LZ4_compressBound(data.len() as c_int);
            let mut output = vec![0u8; output_header_len + compress_bound as usize];
            output[..output_header_len].copy_from_slice(&tmp[..output_header_len]);
            let stream = LZ4_createStream();
            let outlen = LZ4_compress_continue(
                stream,
                data.as_ptr(),
                output.as_mut_ptr().add(output_header_len),
                data.len() as c_int,
            );
            LZ4_freeStream(stream);
            output.resize(outlen as usize + output_header_len, 0);
            Ok(output)
        }
    }

    fn name(&self) -> &'static str {
        "lz4"
    }

    fn uncompress(
        &self,
        _info: &UncompressionInfo,
        format_version: u32,
        origin_data: &[u8],
    ) -> Result<Vec<u8>> {
        if format_version != 2 {
            unimplemented!();
        }
        let mut offset = 0;
        let l = get_var_uint32(origin_data, &mut offset)
            .ok_or(Error::VarDecode("uncompress failed"))?;
        let compressed_size = origin_data.len() - offset as usize;
        let mut output = vec![0u8; l as usize];
        unsafe {
            let stream = LZ4_createStreamDecode();
            let decompress_size = LZ4_decompress_safe_continue(
                stream,
                origin_data.as_ptr().add(offset),
                output.as_mut_ptr(),
                compressed_size as i32,
                l as i32,
            );
            LZ4_freeStreamDecode(stream);
            if decompress_size < 0 {
                return Err(Error::VarDecode("decompress data failed"));
            }
            assert_eq!(decompress_size, l as i32);
            Ok(output)
        }
    }
}
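// Added sketch: the compressed block produced above is laid out as
// `[varint32: uncompressed length | raw LZ4 stream]`, which is what
// `uncompress` relies on when sizing its output buffer.
#[cfg(test)]
mod compressed_block_header {
    use super::*;

    #[test]
    fn compress_prepends_the_uncompressed_length() {
        let lz4 = LZ4CompressionAlgorithm::default();
        let data = vec![42u8; 1000];
        let output = lz4.compress(&CompressionInfo {}, 2, &data).unwrap();
        let mut offset = 0;
        let decoded = get_var_uint32(&output, &mut offset).unwrap();
        assert_eq!(decoded as usize, data.len());
    }
}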
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_lz4_algorithm() {
        let lz4 = LZ4CompressionAlgorithm::default();
        let mut data = Vec::with_capacity(4096);
        for i in 0..100u64 {
            data.extend_from_slice(&i.to_le_bytes());
        }
        let output = lz4.compress(&CompressionInfo {}, 2, &data).unwrap();
        let origin = lz4.uncompress(&UncompressionInfo {}, 2, &output).unwrap();
        assert_eq!(data, origin);
    }
}
-------------------------------------------------------------------------------- /src/table/block_based/mod.rs: --------------------------------------------------------------------------------
mod block;
mod block_builder;
mod compression;
mod data_block_hash_index_builder;
mod filter_block_builder;
mod filter_reader;
mod full_filter_block_builder;
mod index_builder;
mod index_reader;
mod lz4;
mod meta_block;
mod options;
mod table_builder;
mod table_builder_factory;
mod table_iterator;
mod table_reader;

pub use filter_block_builder::FilterBlockFactory;
pub use full_filter_block_builder::FullFilterBlockFactory;
pub use options::BlockBasedTableOptions;
pub use table_builder_factory::BlockBasedTableFactory;

const BLOCK_TRAILER_SIZE: usize = 5;
const FILTER_BLOCK_PREFIX: &str = "filter.";
const FULL_FILTER_BLOCK_PREFIX: &str = "fullfilter.";
-------------------------------------------------------------------------------- /src/table/block_based/options.rs: --------------------------------------------------------------------------------
use crate::common::SliceTransform;
use crate::table::block_based::filter_block_builder::FilterBlockFactory;
use crate::table::block_based::FullFilterBlockFactory;
use crate::table::format::ChecksumType;
use std::sync::Arc;

#[derive(Eq, PartialEq, Clone)]
pub enum DataBlockIndexType {
    DataBlockBinarySearch,
    DataBlockBinaryAndHash,
}

#[derive(Eq, PartialEq, Clone, Copy)]
pub enum IndexType {
    BinarySearch,
    HashSearch,
    TwoLevelIndexSearch,
    // Not supported yet.
    BinarySearchWithFirstKey,
}
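// Illustration (typical bytewise-comparator behavior, not verified against
// this crate's comparator): with ShortenSeparators, the separator stored
// between a block ending at "abcdeeeeee" and one starting at "abcdefffff"
// can be shortened to "abcdef", which still sorts between the two keys.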
#[derive(Eq, PartialEq, Clone, Copy)]
pub enum IndexShorteningMode {
    // Use full keys.
    NoShortening,
    // Shorten index keys between blocks, but use full key for the last index
    // key, which is the upper bound of the whole file.
    ShortenSeparators,
    // Shorten both keys between blocks and key after last block.
    ShortenSeparatorsAndSuccessor,
}

#[derive(Clone)]
pub struct BlockBasedTableOptions {
    pub block_align: bool,
    pub block_restart_interval: usize,
    pub block_size: usize,
    pub checksum: ChecksumType,
    pub data_block_hash_table_util_ratio: f64,
    pub data_block_index_type: DataBlockIndexType,
    pub filter_factory: Arc<dyn FilterBlockFactory>,
    pub format_version: u32,
    pub prefix_extractor: Option<Arc<dyn SliceTransform>>,
    pub index_block_restart_interval: usize,
    pub index_shortening: IndexShorteningMode,
    pub index_type: IndexType,
    pub use_delta_encoding: bool,
    pub whole_key_filtering: bool,
}

impl Default for BlockBasedTableOptions {
    fn default() -> Self {
        Self {
            block_align: false,
            block_restart_interval: 16,
            block_size: 4096,
            checksum: ChecksumType::NoChecksum,
            data_block_hash_table_util_ratio: 0.75,
            data_block_index_type: DataBlockIndexType::DataBlockBinarySearch,
            filter_factory: Arc::new(FullFilterBlockFactory::new(10)),
            format_version: 2,
            prefix_extractor: None,
            index_block_restart_interval: 1,
            index_shortening: IndexShorteningMode::NoShortening,
            index_type: IndexType::BinarySearch,
            use_delta_encoding: true,
            whole_key_filtering: false,
        }
    }
}
-------------------------------------------------------------------------------- /src/table/block_based/table_builder_factory.rs: --------------------------------------------------------------------------------
use crate::common::{RandomAccessFileReader, WritableFileWriter};
use crate::table::block_based::options::BlockBasedTableOptions;
use crate::table::block_based::table_builder::BlockBasedTableBuilder;
use crate::table::block_based::table_reader::BlockBasedTable;
use crate::table::{
    TableBuilder, TableBuilderOptions, TableFactory, TableReader, TableReaderOptions,
};
use async_trait::async_trait;

#[derive(Default)]
pub struct BlockBasedTableFactory {
    opts: BlockBasedTableOptions,
}

impl BlockBasedTableFactory {
    pub fn new(opts: BlockBasedTableOptions) -> Self {
        Self { opts }
    }
}

#[async_trait]
impl TableFactory for BlockBasedTableFactory {
    fn name(&self) -> &'static str {
        "BlockBasedTableFactory"
    }

    async fn open_reader(
        &self,
        opts: &TableReaderOptions,
        reader: Box<RandomAccessFileReader>,
    ) -> crate::common::Result<Box<dyn TableReader>> {
        let reader = BlockBasedTable::open(opts, self.opts.clone(), reader).await?;
        Ok(Box::new(reader))
    }

    fn new_builder(
        &self,
        opts: &TableBuilderOptions,
        file: Box<WritableFileWriter>,
    ) -> crate::common::Result<Box<dyn TableBuilder>> {
        let builder = BlockBasedTableBuilder::new(opts, self.opts.clone(), file);
        Ok(Box::new(builder))
    }
}
-------------------------------------------------------------------------------- /src/table/block_based/table_iterator.rs: --------------------------------------------------------------------------------
use crate::common::InternalKeyComparator;
use crate::table::block_based::block::{DataBlockIter, IndexBlockIter};
use crate::table::block_based::table_reader::BlockBasedTableRep;
use crate::table::{AsyncIterator, InternalIterator};
use async_trait::async_trait;
use std::sync::Arc;

pub struct BlockBasedTableIterator {
    table: Arc<BlockBasedTableRep>,
    comparator: Arc<InternalKeyComparator>,
    index_iter: Box<IndexBlockIter>,
    data_iter: Option<DataBlockIter>,
    // TODO: check whether the data block is in upper bound.
    is_out_of_bound: bool,
}

impl BlockBasedTableIterator {
    pub fn new(
        table: Arc<BlockBasedTableRep>,
        comparator: Arc<InternalKeyComparator>,
        index_iter: Box<IndexBlockIter>,
    ) -> Self {
        Self {
            table,
            comparator,
            index_iter,
            data_iter: None,
            is_out_of_bound: false,
        }
    }
}

impl BlockBasedTableIterator {
    async fn init_data_block(&mut self) -> bool {
        let v = self.index_iter.index_value();
        match self.table.new_data_block_iterator(&v.handle).await {
            Ok(iter) => {
                self.data_iter = Some(iter);
                true
            }
            Err(_) => {
                // TODO: record the IO Error
                self.data_iter.take();
                false
            }
        }
    }

    async fn find_block_forward(&mut self) {
        // Skip exhausted data blocks until one yields a valid position.
        while self.data_iter.as_ref().map_or(false, |iter| !iter.valid()) {
            self.data_iter.take();
            self.index_iter.next();
            if !self.index_iter.valid() {
                return;
            }
            if !self.init_data_block().await {
                return;
            }
            self.data_iter.as_mut().unwrap().seek_to_first();
        }
    }

    async fn find_key_backward(&mut self) {
        while self.data_iter.as_ref().map_or(false, |iter| !iter.valid()) {
            self.data_iter.take();
            self.index_iter.prev();
            if !self.index_iter.valid() {
                return;
            }
            if !self.init_data_block().await {
                return;
            }
            self.data_iter.as_mut().unwrap().seek_to_last();
        }
    }
}
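// Two-level iteration invariant (summarizing the helpers above and the trait
// impl below): the index iterator is positioned at the entry whose separator
// is >= the target key, the data iterator then seeks within that block, and
// the find_* helpers advance across index entries whenever a data block is
// exhausted in either direction.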
#[async_trait]
impl AsyncIterator for BlockBasedTableIterator {
    fn valid(&self) -> bool {
        self.data_iter.as_ref().map_or(false, |iter| iter.valid())
    }

    async fn seek(&mut self, key: &[u8]) {
        // TODO: check prefix seek
        // TODO: near seek, do not seek index iterator.
        if key.is_empty() {
            self.index_iter.seek_to_first();
        } else {
            self.index_iter.seek(key);
        }
        if !self.index_iter.valid() {
            self.data_iter.take();
            return;
        }
        if !self.init_data_block().await {
            return;
        }
        // TODO: check whether the data block is in upper bound.
        assert!(self.data_iter.is_some());
        if key.is_empty() {
            self.data_iter.as_mut().unwrap().seek_to_first();
        } else {
            self.data_iter.as_mut().unwrap().seek(key);
        }
        if !self.data_iter.as_ref().unwrap().valid() {
            self.find_block_forward().await;
        }
    }

    async fn seek_to_first(&mut self) {
        self.seek(&[]).await;
    }

    async fn seek_to_last(&mut self) {
        self.index_iter.seek_to_last();
        if !self.index_iter.valid() {
            self.data_iter.take();
            return;
        }
        if !self.init_data_block().await {
            return;
        }
        self.data_iter.as_mut().unwrap().seek_to_last();
        self.find_key_backward().await;
    }

    async fn seek_for_prev(&mut self, key: &[u8]) {
        self.index_iter.seek(key);
        if !self.index_iter.valid() {
            self.index_iter.seek_to_last();
        }
        if !self.init_data_block().await {
            return;
        }
        self.data_iter.as_mut().unwrap().seek_for_prev(key);
    }

    async fn next(&mut self) {
        if self.data_iter.is_none() {
            return;
        }
        self.data_iter.as_mut().unwrap().next();
        self.find_block_forward().await;
    }

    async fn prev(&mut self) {
        if self.data_iter.is_none() {
            return;
        }
        self.data_iter.as_mut().unwrap().prev();
        self.find_key_backward().await;
    }

    fn key(&self) -> &[u8] {
        self.data_iter.as_ref().unwrap().key()
    }

    fn value(&self) -> &[u8] {
        self.data_iter.as_ref().unwrap().value()
    }
}
-------------------------------------------------------------------------------- /src/table/block_based/table_reader.rs: --------------------------------------------------------------------------------
use crate::common::format::extract_user_key;
use crate::common::{
    DefaultUserComparator, InternalKeyComparator, RandomAccessFileReader, Result,
    DISABLE_GLOBAL_SEQUENCE_NUMBER,
};
use crate::options::ReadOptions;
use crate::table::block_based::block::{
    read_block_content_from_file, read_block_from_file, DataBlockIter,
};
use crate::table::block_based::filter_reader::FilterBlockReader;
use crate::table::block_based::index_reader::IndexReader;
use crate::table::block_based::meta_block::read_properties;
use crate::table::block_based::table_iterator::BlockBasedTableIterator;
use crate::table::block_based::BlockBasedTableOptions;
use crate::table::block_based::{FILTER_BLOCK_PREFIX, FULL_FILTER_BLOCK_PREFIX};
use crate::table::format::{
    BlockHandle, Footer, BLOCK_BASED_TABLE_MAGIC_NUMBER, NEW_VERSIONS_ENCODED_LENGTH,
};
use crate::table::table_properties::{
    seek_to_metablock, seek_to_properties_block, TableProperties,
};
use crate::table::{AsyncIterator, InternalIterator, TableReader, TableReaderOptions};
use async_trait::async_trait;
use std::cmp::Ordering;
use std::sync::Arc;

pub struct BlockBasedTableRep {
    footer: Footer,
    file: Box<RandomAccessFileReader>,
    index_reader: IndexReader,
    properties: Option<Arc<TableProperties>>,
    internal_comparator: Arc<InternalKeyComparator>,
    filter_reader: Option<Box<dyn FilterBlockReader>>,
    whole_key_filtering: bool,
    global_seqno: u64,
}

impl BlockBasedTableRep {
    pub async fn new_data_block_iterator(&self, handle: &BlockHandle) -> Result<DataBlockIter> {
        // TODO: support block cache.
        let block = read_block_from_file(self.file.as_ref(), handle, self.global_seqno).await?;
        let iter = block.new_data_iterator(self.internal_comparator.clone());
        Ok(iter)
    }
}
pub struct BlockBasedTable {
    rep: Arc<BlockBasedTableRep>,
}

impl BlockBasedTable {
    pub async fn open(
        opts: &TableReaderOptions,
        table_opts: BlockBasedTableOptions,
        file: Box<RandomAccessFileReader>,
    ) -> Result<Self> {
        // Read in the following order:
        //    1. Footer
        //    2. [metaindex block]
        //    3. [meta block: properties]
        //    4. [meta block: range deletion tombstone]
        //    5. [meta block: compression dictionary]
        //    6. [meta block: index]
        //    7. [meta block: filter]

        // TODO: prefetch file for meta block and index block.
        let footer = read_footer_from_file(file.as_ref(), opts.file_size).await?;
        let meta_block = read_block_from_file(
            file.as_ref(),
            &footer.metaindex_handle,
            DISABLE_GLOBAL_SEQUENCE_NUMBER,
        )
        .await?;
        let mut meta_iter =
            meta_block.new_data_iterator(Arc::new(DefaultUserComparator::default()));
        let mut global_seqno = DISABLE_GLOBAL_SEQUENCE_NUMBER;
        let mut index_key_includes_seq = true;
        let properties = if seek_to_properties_block(&mut meta_iter)? {
            let (properties, _handle) = read_properties(meta_iter.value(), file.as_ref()).await?;
            // TODO: checksum
            global_seqno = get_global_seqno(properties.as_ref(), opts.largest_seqno)?;
            index_key_includes_seq = properties.index_key_is_user_key == 0;
            Some(properties)
        } else {
            None
        };
        let index_reader = IndexReader::open(
            file.as_ref(),
            &footer.index_handle,
            global_seqno,
            index_key_includes_seq,
        )
        .await?;
        let mut key = if table_opts.filter_factory.is_block_based() {
            FILTER_BLOCK_PREFIX.to_string()
        } else {
            FULL_FILTER_BLOCK_PREFIX.to_string()
        };
        key.push_str(table_opts.filter_factory.name());
        let mut handle = BlockHandle::default();
        let filter_reader = if seek_to_metablock(&mut meta_iter, &key, Some(&mut handle))? {
            let block = read_block_content_from_file(file.as_ref(), &handle).await?;
            let filter_reader = table_opts.filter_factory.create_filter_reader(block);
            Some(filter_reader)
        } else {
            None
        };
        let table = BlockBasedTable {
            rep: Arc::new(BlockBasedTableRep {
                footer,
                file,
                properties,
                index_reader,
                global_seqno,
                internal_comparator: Arc::new(opts.internal_comparator.clone()),
                whole_key_filtering: table_opts.whole_key_filtering,
                filter_reader,
            }),
        };
        Ok(table)
    }

    fn full_filter_key_may_match(&self, key: &[u8]) -> bool {
        if let Some(filter) = self.rep.filter_reader.as_ref() {
            if self.rep.whole_key_filtering {
                let user_key = extract_user_key(key);
                return filter.key_may_match(user_key);
            } else {
                // TODO: finish prefix key match
            }
        }
        true
    }
}

async fn read_footer_from_file(file: &RandomAccessFileReader, file_size: usize) -> Result