├── .github └── workflows │ └── smoke_test.yml ├── .gitignore ├── Cargo.toml ├── LICENSE.md ├── README.md ├── bindings └── c │ ├── Cargo.toml │ ├── build.rs │ ├── cbindgen.toml │ ├── include │ └── mvcc.h │ └── src │ ├── errors.rs │ ├── lib.rs │ └── types.rs ├── docs ├── DESIGN.md └── figures │ ├── transactions.excalidraw │ └── transactions.png └── mvcc-rs ├── Cargo.toml ├── benches └── my_benchmark.rs ├── src ├── clock.rs ├── cursor.rs ├── database │ ├── mod.rs │ └── tests.rs ├── errors.rs ├── lib.rs └── persistent_storage │ ├── mod.rs │ └── s3.rs └── tests └── concurrency_test.rs /.github/workflows/smoke_test.yml: -------------------------------------------------------------------------------- 1 | name: Rust 2 | 3 | on: 4 | push: 5 | branches: [ "main" ] 6 | pull_request: 7 | branches: [ "main" ] 8 | 9 | env: 10 | CARGO_TERM_COLOR: always 11 | RUST_LOG: info,mvcc_rs=trace 12 | 13 | jobs: 14 | build: 15 | 16 | runs-on: ubuntu-latest 17 | 18 | steps: 19 | - uses: actions/checkout@v3 20 | - name: Check 21 | run: cargo check --all-targets --all-features 22 | - name: Clippy 23 | run: cargo clippy --all-targets --all-features -- -D warnings 24 | - name: Run tests 25 | run: cargo test --verbose 26 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | Cargo.lock 2 | target/ 3 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [workspace] 2 | resolver = "2" 3 | members = [ 4 | "mvcc-rs", 5 | "bindings/c", 6 | ] 7 | 8 | [profile.release] 9 | codegen-units = 1 10 | panic = "abort" 11 | strip = true 12 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright 2023 Pekka 
Enberg 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of 6 | this software and associated documentation files (the "Software"), to deal in 7 | the Software without restriction, including without limitation the rights to 8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 9 | the Software, and to permit persons to whom the Software is furnished to do so, 10 | subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 17 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 18 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 19 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 20 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 21 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Tihku 2 | 3 | Tihku is a _work-in-progress_, open-source implementation of the Hekaton multi-version concurrency control (MVCC) written in Rust. 4 | The project aims to provide a foundational building block for implementing database management systems. 5 | 6 | One of the projects using Tihku is an experimental [libSQL branch with MVCC](https://github.com/penberg/libsql/tree/mvcc) that aims to implement `BEGIN CONCURRENT` with Tihku to improve SQLite write concurrency. 
7 | 8 | ## Features 9 | 10 | * Main memory architecture, rows are accessed via an index 11 | * Optimistic multi-version concurrency control 12 | * Rust and C APIs 13 | 14 | ## Experimental Evaluation 15 | 16 | **Single-threaded micro-benchmarks** 17 | 18 | Operations | Throughput 19 | -----------------------------------|------------ 20 | `begin_tx`, `read`, and `commit` | 2.2M ops/second 21 | `begin_tx`, `update`, and `commit` | 2.2M ops/second 22 | `read` | 12.9M ops/second 23 | `update` | 6.2M ops/second 24 | 25 | (The `cargo bench` was run on an AMD Ryzen 9 3900XT 2.2 GHz CPU.) 26 | 27 | ## Development 28 | 29 | Run tests: 30 | 31 | ```console 32 | cargo test 33 | ``` 34 | 35 | Test coverage report: 36 | 37 | ```console 38 | cargo tarpaulin -o html 39 | ``` 40 | 41 | Run benchmarks: 42 | 43 | ```console 44 | cargo bench 45 | ``` 46 | 47 | Run benchmarks and generate flamegraphs: 48 | 49 | ```console 50 | echo -1 | sudo tee /proc/sys/kernel/perf_event_paranoid 51 | cargo bench --bench my_benchmark -- --profile-time=5 52 | ``` 53 | 54 | ## References 55 | 56 | Larson et al. [High-Performance Concurrency Control Mechanisms for Main-Memory Databases](https://vldb.org/pvldb/vol5/p298_per-akelarson_vldb2012.pdf). VLDB '11 57 | 58 | Paper errata: The visibility check in Table 2 is wrong and causes uncommitted delete to become visible to transactions (fixed in [commit 6ca3773]( https://github.com/penberg/mvcc-rs/commit/6ca377320bb59b52ecc0430b9e5e422e8d61658d)). 
59 | -------------------------------------------------------------------------------- /bindings/c/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "mvcc-c" 3 | version = "0.0.0" 4 | edition = "2021" 5 | 6 | [lib] 7 | crate-type = ["cdylib", "staticlib"] 8 | doc = false 9 | 10 | [build-dependencies] 11 | cbindgen = "0.24.0" 12 | 13 | [dependencies] 14 | base64 = "0.21.0" 15 | mvcc-rs = { path = "../../mvcc-rs" } 16 | tracing = "0.1.37" 17 | tracing-subscriber = { version = "0" } 18 | 19 | [features] 20 | default = [] 21 | json_on_disk_storage = [] 22 | s3_storage = [] 23 | -------------------------------------------------------------------------------- /bindings/c/build.rs: -------------------------------------------------------------------------------- 1 | use std::path::Path; 2 | 3 | fn main() { 4 | let header_file = Path::new("include").join("mvcc.h"); 5 | cbindgen::generate(".") 6 | .expect("Failed to generate C bindings") 7 | .write_to_file(header_file); 8 | } 9 | -------------------------------------------------------------------------------- /bindings/c/cbindgen.toml: -------------------------------------------------------------------------------- 1 | language = "C" 2 | cpp_compat = true 3 | include_guard = "MVCC_H" 4 | line_length = 120 5 | no_includes = true 6 | style = "type" 7 | sys_includes = ["stdint.h"] 8 | -------------------------------------------------------------------------------- /bindings/c/include/mvcc.h: -------------------------------------------------------------------------------- 1 | #ifndef MVCC_H 2 | #define MVCC_H 3 | 4 | #include 5 | 6 | typedef enum { 7 | MVCC_OK = 0, 8 | MVCC_IO_ERROR_READ = 266, 9 | MVCC_IO_ERROR_WRITE = 778, 10 | } MVCCError; 11 | 12 | typedef struct DbContext DbContext; 13 | 14 | typedef struct ScanCursorContext ScanCursorContext; 15 | 16 | typedef const DbContext *MVCCDatabaseRef; 17 | 18 | typedef ScanCursorContext *MVCCScanCursorRef; 19 | 20 
| #ifdef __cplusplus 21 | extern "C" { 22 | #endif // __cplusplus 23 | 24 | MVCCDatabaseRef MVCCDatabaseOpen(const char *path); 25 | 26 | void MVCCDatabaseClose(MVCCDatabaseRef db); 27 | 28 | uint64_t MVCCTransactionBegin(MVCCDatabaseRef db); 29 | 30 | MVCCError MVCCTransactionCommit(MVCCDatabaseRef db, uint64_t tx_id); 31 | 32 | MVCCError MVCCTransactionRollback(MVCCDatabaseRef db, uint64_t tx_id); 33 | 34 | MVCCError MVCCDatabaseInsert(MVCCDatabaseRef db, 35 | uint64_t tx_id, 36 | uint64_t table_id, 37 | uint64_t row_id, 38 | const void *value_ptr, 39 | uintptr_t value_len); 40 | 41 | MVCCError MVCCDatabaseRead(MVCCDatabaseRef db, 42 | uint64_t tx_id, 43 | uint64_t table_id, 44 | uint64_t row_id, 45 | uint8_t **value_ptr, 46 | int64_t *value_len); 47 | 48 | void MVCCFreeStr(void *ptr); 49 | 50 | MVCCScanCursorRef MVCCScanCursorOpen(MVCCDatabaseRef db, uint64_t tx_id, uint64_t table_id); 51 | 52 | void MVCCScanCursorClose(MVCCScanCursorRef cursor); 53 | 54 | MVCCError MVCCScanCursorRead(MVCCScanCursorRef cursor, uint8_t **value_ptr, int64_t *value_len); 55 | 56 | int MVCCScanCursorNext(MVCCScanCursorRef cursor); 57 | 58 | uint64_t MVCCScanCursorPosition(MVCCScanCursorRef cursor); 59 | 60 | #ifdef __cplusplus 61 | } // extern "C" 62 | #endif // __cplusplus 63 | 64 | #endif /* MVCC_H */ 65 | -------------------------------------------------------------------------------- /bindings/c/src/errors.rs: -------------------------------------------------------------------------------- 1 | #[repr(C)] 2 | pub enum MVCCError { 3 | MVCC_OK = 0, 4 | MVCC_IO_ERROR_READ = 266, 5 | MVCC_IO_ERROR_WRITE = 778, 6 | } 7 | -------------------------------------------------------------------------------- /bindings/c/src/lib.rs: -------------------------------------------------------------------------------- 1 | #![allow(non_camel_case_types)] 2 | #![allow(clippy::missing_safety_doc)] 3 | 4 | mod errors; 5 | mod types; 6 | 7 | use errors::MVCCError; 8 | use 
mvcc_rs::persistent_storage::{s3, Storage}; 9 | use mvcc_rs::*; 10 | use types::{DbContext, MVCCDatabaseRef, MVCCScanCursorRef, ScanCursorContext}; 11 | 12 | /// cbindgen:ignore 13 | type Clock = clock::LocalClock; 14 | 15 | /// cbindgen:ignore 16 | /// Note - We use String type in C bindings as Row type. Type is generic. 17 | type Db = database::Database; 18 | 19 | /// cbindgen:ignore 20 | /// Note - We use String type in C bindings as Row type. Type is generic. 21 | type ScanCursor = cursor::ScanCursor<'static, Clock, String>; 22 | 23 | static INIT_RUST_LOG: std::sync::Once = std::sync::Once::new(); 24 | 25 | fn storage_for(main_db_path: &str) -> database::Result { 26 | // TODO: let's accept an URL instead of main_db_path here, so we can 27 | // pass custom S3 endpoints, options, etc. 28 | if cfg!(feature = "json_on_disk_storage") { 29 | tracing::info!("JSONonDisk storage stored in {main_db_path}-mvcc"); 30 | return Ok(Storage::new_json_on_disk(format!("{main_db_path}-mvcc"))); 31 | } 32 | if cfg!(feature = "s3_storage") { 33 | tracing::info!("S3 storage for {main_db_path}"); 34 | let options = s3::Options::with_create_bucket_if_not_exists(true); 35 | return Storage::new_s3(options); 36 | } 37 | tracing::info!("No persistent storage for {main_db_path}"); 38 | Ok(Storage::new_noop()) 39 | } 40 | 41 | #[no_mangle] 42 | pub unsafe extern "C" fn MVCCDatabaseOpen(path: *const std::ffi::c_char) -> MVCCDatabaseRef { 43 | INIT_RUST_LOG.call_once(|| { 44 | tracing_subscriber::fmt::init(); 45 | }); 46 | 47 | tracing::debug!("MVCCDatabaseOpen"); 48 | 49 | let clock = clock::LocalClock::new(); 50 | let main_db_path = unsafe { std::ffi::CStr::from_ptr(path) }; 51 | let main_db_path = match main_db_path.to_str() { 52 | Ok(path) => path, 53 | Err(_) => { 54 | tracing::error!("Invalid UTF-8 path"); 55 | return MVCCDatabaseRef::null(); 56 | } 57 | }; 58 | 59 | tracing::debug!("mvccrs: opening persistent storage for {main_db_path}"); 60 | let storage = match 
storage_for(main_db_path) { 61 | Ok(storage) => storage, 62 | Err(e) => { 63 | tracing::error!("Failed to open persistent storage: {e}"); 64 | return MVCCDatabaseRef::null(); 65 | } 66 | }; 67 | let db = Db::new(clock, storage); 68 | 69 | db.recover().ok(); 70 | 71 | let db = Box::leak(Box::new(DbContext { db })); 72 | MVCCDatabaseRef::from(db) 73 | } 74 | 75 | #[no_mangle] 76 | pub unsafe extern "C" fn MVCCDatabaseClose(db: MVCCDatabaseRef) { 77 | tracing::debug!("MVCCDatabaseClose"); 78 | if db.is_null() { 79 | tracing::debug!("warning: `db` is null in MVCCDatabaseClose()"); 80 | return; 81 | } 82 | let _ = unsafe { Box::from_raw(db.get_ref_mut()) }; 83 | } 84 | 85 | #[no_mangle] 86 | pub unsafe extern "C" fn MVCCTransactionBegin(db: MVCCDatabaseRef) -> u64 { 87 | let db = db.get_ref(); 88 | let tx_id = db.begin_tx(); 89 | tracing::debug!("MVCCTransactionBegin: {tx_id}"); 90 | tx_id 91 | } 92 | 93 | #[no_mangle] 94 | pub unsafe extern "C" fn MVCCTransactionCommit(db: MVCCDatabaseRef, tx_id: u64) -> MVCCError { 95 | let db = db.get_ref(); 96 | tracing::debug!("MVCCTransactionCommit: {tx_id}"); 97 | match db.commit_tx(tx_id) { 98 | Ok(()) => MVCCError::MVCC_OK, 99 | Err(e) => { 100 | tracing::error!("MVCCTransactionCommit: {e}"); 101 | MVCCError::MVCC_IO_ERROR_WRITE 102 | } 103 | } 104 | } 105 | 106 | #[no_mangle] 107 | pub unsafe extern "C" fn MVCCTransactionRollback(db: MVCCDatabaseRef, tx_id: u64) -> MVCCError { 108 | let db = db.get_ref(); 109 | tracing::debug!("MVCCTransactionRollback: {tx_id}"); 110 | db.rollback_tx(tx_id); 111 | MVCCError::MVCC_OK 112 | } 113 | 114 | #[no_mangle] 115 | pub unsafe extern "C" fn MVCCDatabaseInsert( 116 | db: MVCCDatabaseRef, 117 | tx_id: u64, 118 | table_id: u64, 119 | row_id: u64, 120 | value_ptr: *const std::ffi::c_void, 121 | value_len: usize, 122 | ) -> MVCCError { 123 | let db = db.get_ref(); 124 | let value = std::slice::from_raw_parts(value_ptr as *const u8, value_len); 125 | let data = match std::str::from_utf8(value) 
{ 126 | Ok(value) => value.to_string(), 127 | Err(_) => { 128 | tracing::info!("Invalid UTF-8, let's base64 this fellow"); 129 | use base64::{engine::general_purpose, Engine as _}; 130 | general_purpose::STANDARD.encode(value) 131 | } 132 | }; 133 | let id = database::RowID { table_id, row_id }; 134 | let row = database::Row { id, data }; 135 | tracing::debug!("MVCCDatabaseInsert: {row:?}"); 136 | match db.insert(tx_id, row) { 137 | Ok(_) => { 138 | tracing::debug!("MVCCDatabaseInsert: success"); 139 | MVCCError::MVCC_OK 140 | } 141 | Err(e) => { 142 | tracing::error!("MVCCDatabaseInsert: {e}"); 143 | MVCCError::MVCC_IO_ERROR_WRITE 144 | } 145 | } 146 | } 147 | 148 | #[no_mangle] 149 | pub unsafe extern "C" fn MVCCDatabaseRead( 150 | db: MVCCDatabaseRef, 151 | tx_id: u64, 152 | table_id: u64, 153 | row_id: u64, 154 | value_ptr: *mut *mut u8, 155 | value_len: *mut i64, 156 | ) -> MVCCError { 157 | let db = db.get_ref(); 158 | 159 | match { 160 | let id = database::RowID { table_id, row_id }; 161 | let maybe_row = db.read(tx_id, id); 162 | match maybe_row { 163 | Ok(Some(row)) => { 164 | tracing::debug!("Found row {row:?}"); 165 | let str_len = row.data.len() + 1; 166 | let value = std::ffi::CString::new(row.data.as_bytes()).unwrap_or_default(); 167 | unsafe { 168 | *value_ptr = value.into_raw() as *mut u8; 169 | *value_len = str_len as i64; 170 | } 171 | } 172 | _ => unsafe { *value_len = -1 }, 173 | }; 174 | Ok::<(), mvcc_rs::errors::DatabaseError>(()) 175 | } { 176 | Ok(_) => { 177 | tracing::debug!("MVCCDatabaseRead: success"); 178 | MVCCError::MVCC_OK 179 | } 180 | Err(e) => { 181 | tracing::error!("MVCCDatabaseRead: {e}"); 182 | MVCCError::MVCC_IO_ERROR_READ 183 | } 184 | } 185 | } 186 | 187 | #[no_mangle] 188 | pub unsafe extern "C" fn MVCCFreeStr(ptr: *mut std::ffi::c_void) { 189 | if ptr.is_null() { 190 | return; 191 | } 192 | let _ = std::ffi::CString::from_raw(ptr as *mut std::ffi::c_char); 193 | } 194 | 195 | #[no_mangle] 196 | pub unsafe extern "C" fn 
MVCCScanCursorOpen( 197 | db: MVCCDatabaseRef, 198 | tx_id: u64, 199 | table_id: u64, 200 | ) -> MVCCScanCursorRef { 201 | tracing::debug!("MVCCScanCursorOpen()"); 202 | // Reference is transmuted to &'static in order to be able to pass the cursor back to C. 203 | // The contract with C is to never use a cursor after MVCCDatabaseClose() has been called. 204 | let db = unsafe { std::mem::transmute::<&Db, &'static Db>(db.get_ref()) }; 205 | match mvcc_rs::cursor::ScanCursor::new(db, tx_id, table_id) { 206 | Ok(cursor) => { 207 | if cursor.is_empty() { 208 | tracing::debug!("Cursor is empty"); 209 | return MVCCScanCursorRef { 210 | ptr: std::ptr::null_mut(), 211 | }; 212 | } 213 | tracing::debug!("Cursor open: {cursor:?}"); 214 | MVCCScanCursorRef { 215 | ptr: Box::into_raw(Box::new(ScanCursorContext { cursor })), 216 | } 217 | } 218 | Err(e) => { 219 | tracing::error!("MVCCScanCursorOpen: {e}"); 220 | MVCCScanCursorRef { 221 | ptr: std::ptr::null_mut(), 222 | } 223 | } 224 | } 225 | } 226 | 227 | #[no_mangle] 228 | pub unsafe extern "C" fn MVCCScanCursorClose(cursor: MVCCScanCursorRef) { 229 | tracing::debug!("MVCCScanCursorClose()"); 230 | if cursor.ptr.is_null() { 231 | tracing::debug!("warning: `cursor` is null in MVCCScanCursorClose()"); 232 | return; 233 | } 234 | let cursor = unsafe { Box::from_raw(cursor.ptr) }.cursor; 235 | cursor.close().ok(); 236 | } 237 | 238 | #[no_mangle] 239 | pub unsafe extern "C" fn MVCCScanCursorRead( 240 | cursor: MVCCScanCursorRef, 241 | value_ptr: *mut *mut u8, 242 | value_len: *mut i64, 243 | ) -> MVCCError { 244 | tracing::debug!("MVCCScanCursorRead()"); 245 | if cursor.ptr.is_null() { 246 | tracing::debug!("warning: `cursor` is null in MVCCScanCursorRead()"); 247 | return MVCCError::MVCC_IO_ERROR_READ; 248 | } 249 | let cursor = cursor.get_ref(); 250 | 251 | match { 252 | let maybe_row = cursor.current_row(); 253 | match maybe_row { 254 | Ok(Some(row)) => { 255 | tracing::debug!("Found row {row:?}"); 256 | let str_len = 
row.data.len() + 1; 257 | let value = std::ffi::CString::new(row.data.as_bytes()).unwrap_or_default(); 258 | unsafe { 259 | *value_ptr = value.into_raw() as *mut u8; 260 | *value_len = str_len as i64; 261 | } 262 | } 263 | _ => unsafe { *value_len = -1 }, 264 | }; 265 | Ok::<(), mvcc_rs::errors::DatabaseError>(()) 266 | } { 267 | Ok(_) => { 268 | tracing::debug!("MVCCDatabaseRead: success"); 269 | MVCCError::MVCC_OK 270 | } 271 | Err(e) => { 272 | tracing::error!("MVCCDatabaseRead: {e}"); 273 | MVCCError::MVCC_IO_ERROR_READ 274 | } 275 | } 276 | } 277 | 278 | #[no_mangle] 279 | pub unsafe extern "C" fn MVCCScanCursorNext(cursor: MVCCScanCursorRef) -> std::ffi::c_int { 280 | let cursor = cursor.get_ref_mut(); 281 | tracing::debug!("MVCCScanCursorNext(): {}", cursor.index); 282 | if cursor.forward() { 283 | tracing::debug!("Forwarded to {}", cursor.index); 284 | 1 285 | } else { 286 | tracing::debug!("Forwarded to end"); 287 | 0 288 | } 289 | } 290 | 291 | #[no_mangle] 292 | pub unsafe extern "C" fn MVCCScanCursorPosition(cursor: MVCCScanCursorRef) -> u64 { 293 | let cursor = cursor.get_ref(); 294 | cursor 295 | .current_row_id() 296 | .map(|row_id| row_id.row_id) 297 | .unwrap_or(0) 298 | } 299 | -------------------------------------------------------------------------------- /bindings/c/src/types.rs: -------------------------------------------------------------------------------- 1 | use crate::Db; 2 | 3 | #[derive(Clone, Debug)] 4 | #[repr(transparent)] 5 | pub struct MVCCDatabaseRef { 6 | ptr: *const DbContext, 7 | } 8 | 9 | impl MVCCDatabaseRef { 10 | pub fn null() -> MVCCDatabaseRef { 11 | MVCCDatabaseRef { 12 | ptr: std::ptr::null(), 13 | } 14 | } 15 | 16 | pub fn is_null(&self) -> bool { 17 | self.ptr.is_null() 18 | } 19 | 20 | pub fn get_ref(&self) -> &Db { 21 | &unsafe { &*(self.ptr) }.db 22 | } 23 | 24 | #[allow(clippy::mut_from_ref)] 25 | pub fn get_ref_mut(&self) -> &mut Db { 26 | let ptr_mut = self.ptr as *mut DbContext; 27 | &mut unsafe { &mut 
(*ptr_mut) }.db 28 | } 29 | } 30 | 31 | #[allow(clippy::from_over_into)] 32 | impl From<&DbContext> for MVCCDatabaseRef { 33 | fn from(value: &DbContext) -> Self { 34 | Self { ptr: value } 35 | } 36 | } 37 | 38 | #[allow(clippy::from_over_into)] 39 | impl From<&mut DbContext> for MVCCDatabaseRef { 40 | fn from(value: &mut DbContext) -> Self { 41 | Self { ptr: value } 42 | } 43 | } 44 | 45 | pub struct DbContext { 46 | pub(crate) db: Db, 47 | } 48 | 49 | pub struct ScanCursorContext { 50 | pub(crate) cursor: crate::ScanCursor, 51 | } 52 | 53 | #[derive(Clone, Debug)] 54 | #[repr(transparent)] 55 | pub struct MVCCScanCursorRef { 56 | pub ptr: *mut ScanCursorContext, 57 | } 58 | 59 | impl MVCCScanCursorRef { 60 | pub fn null() -> MVCCScanCursorRef { 61 | MVCCScanCursorRef { 62 | ptr: std::ptr::null_mut(), 63 | } 64 | } 65 | 66 | pub fn is_null(&self) -> bool { 67 | self.ptr.is_null() 68 | } 69 | 70 | pub fn get_ref(&self) -> &crate::ScanCursor { 71 | &unsafe { &*(self.ptr) }.cursor 72 | } 73 | 74 | #[allow(clippy::mut_from_ref)] 75 | pub fn get_ref_mut(&self) -> &mut crate::ScanCursor { 76 | let ptr_mut = self.ptr as *mut ScanCursorContext; 77 | &mut unsafe { &mut (*ptr_mut) }.cursor 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /docs/DESIGN.md: -------------------------------------------------------------------------------- 1 | # Design 2 | 3 | ## Persistent storage 4 | 5 | Persistent storage must implement the `Storage` trait that the MVCC module uses for transaction logging. 6 | 7 | Figure 1 shows an example of write-ahead log across three transactions. 8 | The first transaction T0 executes a `INSERT (id) VALUES (1)` statement, which results in a log record with `id` set to `1`, begin timestamp to 0 (which is the transaction ID) and end timestamp as infinity (meaning the row version is still visible). 
9 | The second transaction T1 executes another `INSERT` statement, which adds another log record to the transaction log with `id` set to `2`, begin timestamp to 1 and end timestamp as infinity, similar to what T0 did. 10 | Finally, a third transaction T2 executes two statements: `DELETE WHERE id = 1` and `INSERT (id) VALUES (3)`. The first one results in a log record with `id` set to `1` and begin timestamp set to 0 (which is the transaction that created the entry). However, the end timestamp is now set to 2 (the current transaction), which means the entry is now deleted. 11 | The second statement results in an entry in the transaction log similar to the `INSERT` statements in T0 and T1. 12 | 13 | ![Transactions](figures/transactions.png) 14 |

15 | Figure 1. Transaction log of three transactions. 16 |

17 | 18 | When MVCC bootstraps or recovers, it simply redos the transaction log. 19 | If the transaction log grows big, we can checkpoint it it by dropping all entries that are no longer visible after the the latest transaction and create a snapshot. 20 | -------------------------------------------------------------------------------- /docs/figures/transactions.excalidraw: -------------------------------------------------------------------------------- 1 | { 2 | "type": "excalidraw", 3 | "version": 2, 4 | "source": "https://excalidraw.com", 5 | "elements": [ 6 | { 7 | "id": "tFvpBUMWe3qPFUTQVV14X", 8 | "type": "text", 9 | "x": 233.14035848761839, 10 | "y": 205.73272444200816, 11 | "width": 278.57781982421875, 12 | "height": 25, 13 | "angle": 0, 14 | "strokeColor": "#087f5b", 15 | "backgroundColor": "#82c91e", 16 | "fillStyle": "hachure", 17 | "strokeWidth": 1, 18 | "strokeStyle": "solid", 19 | "roughness": 1, 20 | "opacity": 100, 21 | "groupIds": [], 22 | "roundness": null, 23 | "seed": 94988319, 24 | "version": 510, 25 | "versionNonce": 1210831775, 26 | "isDeleted": false, 27 | "boundElements": null, 28 | "updated": 1683370319070, 29 | "link": null, 30 | "locked": false, 31 | "text": "", 32 | "fontSize": 20, 33 | "fontFamily": 1, 34 | "textAlign": "left", 35 | "verticalAlign": "top", 36 | "baseline": 18, 37 | "containerId": null, 38 | "originalText": "", 39 | "lineHeight": 1.25 40 | }, 41 | { 42 | "type": "text", 43 | "version": 515, 44 | "versionNonce": 1881893969, 45 | "isDeleted": false, 46 | "id": "7i88n1PIb89NxUbVQmTTi", 47 | "fillStyle": "hachure", 48 | "strokeWidth": 1, 49 | "strokeStyle": "solid", 50 | "roughness": 1, 51 | "opacity": 100, 52 | "angle": 0, 53 | "x": 938.4614491858606, 54 | "y": 311.23272444200813, 55 | "strokeColor": "#0b7285", 56 | "backgroundColor": "#82c91e", 57 | "width": 279.0400085449219, 58 | "height": 25, 59 | "seed": 1123646321, 60 | "groupIds": [], 61 | "roundness": null, 62 | "boundElements": [], 63 | "updated": 1683370316909, 64 
| "link": null, 65 | "locked": false, 66 | "fontSize": 20, 67 | "fontFamily": 1, 68 | "text": "", 69 | "textAlign": "left", 70 | "verticalAlign": "top", 71 | "containerId": null, 72 | "originalText": "", 73 | "lineHeight": 1.25, 74 | "baseline": 18 75 | }, 76 | { 77 | "type": "text", 78 | "version": 556, 79 | "versionNonce": 153125934, 80 | "isDeleted": false, 81 | "id": "Yh8XLtKqXUUYmcmG4SEXn", 82 | "fillStyle": "hachure", 83 | "strokeWidth": 1, 84 | "strokeStyle": "solid", 85 | "roughness": 1, 86 | "opacity": 100, 87 | "angle": 0, 88 | "x": 581.1603475012903, 89 | "y": 256.23272444200813, 90 | "strokeColor": "#e67700", 91 | "backgroundColor": "#82c91e", 92 | "width": 270.71783447265625, 93 | "height": 25, 94 | "seed": 1685524017, 95 | "groupIds": [], 96 | "roundness": null, 97 | "boundElements": [], 98 | "updated": 1683371076075, 99 | "link": null, 100 | "locked": false, 101 | "fontSize": 20, 102 | "fontFamily": 1, 103 | "text": "", 104 | "textAlign": "left", 105 | "verticalAlign": "top", 106 | "containerId": null, 107 | "originalText": "", 108 | "lineHeight": 1.25, 109 | "baseline": 18 110 | }, 111 | { 112 | "id": "8l0CCJzCAtOLt_2GRcNpa", 113 | "type": "text", 114 | "x": 256.1403584876185, 115 | "y": 409.73272444200813, 116 | "width": 234.41998291015625, 117 | "height": 75, 118 | "angle": 0, 119 | "strokeColor": "#087f5b", 120 | "backgroundColor": "#82c91e", 121 | "fillStyle": "hachure", 122 | "strokeWidth": 1, 123 | "strokeStyle": "solid", 124 | "roughness": 1, 125 | "opacity": 100, 126 | "groupIds": [], 127 | "roundness": null, 128 | "seed": 583129809, 129 | "version": 570, 130 | "versionNonce": 561756721, 131 | "isDeleted": false, 132 | "boundElements": null, 133 | "updated": 1683370316909, 134 | "link": null, 135 | "locked": false, 136 | "text": "BEGIN\nINSERT (id) VALUEs (1)\nCOMMIT", 137 | "fontSize": 20, 138 | "fontFamily": 1, 139 | "textAlign": "left", 140 | "verticalAlign": "top", 141 | "baseline": 68, 142 | "containerId": null, 143 | "originalText": 
"BEGIN\nINSERT (id) VALUEs (1)\nCOMMIT", 144 | "lineHeight": 1.25 145 | }, 146 | { 147 | "type": "text", 148 | "version": 628, 149 | "versionNonce": 282656095, 150 | "isDeleted": false, 151 | "id": "3m7VluAP5tair6-60b_sp", 152 | "fillStyle": "hachure", 153 | "strokeWidth": 1, 154 | "strokeStyle": "solid", 155 | "roughness": 1, 156 | "opacity": 100, 157 | "angle": 0, 158 | "x": 962.0903554358606, 159 | "y": 416.23272444200813, 160 | "strokeColor": "#0b7285", 161 | "backgroundColor": "#82c91e", 162 | "width": 243.91998291015625, 163 | "height": 100, 164 | "seed": 479705617, 165 | "groupIds": [], 166 | "roundness": null, 167 | "boundElements": [], 168 | "updated": 1683370316909, 169 | "link": null, 170 | "locked": false, 171 | "fontSize": 20, 172 | "fontFamily": 1, 173 | "text": "BEGIN\nDELETE WHERE id =1\nINSERT (id) VALUES (3)\nCOMMIT", 174 | "textAlign": "left", 175 | "verticalAlign": "top", 176 | "containerId": null, 177 | "originalText": "BEGIN\nDELETE WHERE id =1\nINSERT (id) VALUES (3)\nCOMMIT", 178 | "lineHeight": 1.25, 179 | "baseline": 93 180 | }, 181 | { 182 | "type": "text", 183 | "version": 574, 184 | "versionNonce": 1128746001, 185 | "isDeleted": false, 186 | "id": "Z-Mh1kti2oC6sIMnuGluo", 187 | "fillStyle": "hachure", 188 | "strokeWidth": 1, 189 | "strokeStyle": "solid", 190 | "roughness": 1, 191 | "opacity": 100, 192 | "angle": 0, 193 | "x": 613.0903554358607, 194 | "y": 417.23272444200813, 195 | "strokeColor": "#e67700", 196 | "backgroundColor": "#82c91e", 197 | "width": 243.239990234375, 198 | "height": 75, 199 | "seed": 580440625, 200 | "groupIds": [], 201 | "roundness": null, 202 | "boundElements": [], 203 | "updated": 1683370316909, 204 | "link": null, 205 | "locked": false, 206 | "fontSize": 20, 207 | "fontFamily": 1, 208 | "text": "BEGIN\nINSERT (id) VALUEs (2)\nCOMMIT", 209 | "textAlign": "left", 210 | "verticalAlign": "top", 211 | "containerId": null, 212 | "originalText": "BEGIN\nINSERT (id) VALUEs (2)\nCOMMIT", 213 | "lineHeight": 1.25, 214 
| "baseline": 68 215 | }, 216 | { 217 | "type": "line", 218 | "version": 1502, 219 | "versionNonce": 1835608607, 220 | "isDeleted": false, 221 | "id": "VuJNZCgz1Y0WEWwug7pGk", 222 | "fillStyle": "hachure", 223 | "strokeWidth": 1, 224 | "strokeStyle": "solid", 225 | "roughness": 0, 226 | "opacity": 100, 227 | "angle": 0, 228 | "x": 226.3083636621349, 229 | "y": 173.11701218356845, 230 | "strokeColor": "#000000", 231 | "backgroundColor": "transparent", 232 | "width": 20.336010349032712, 233 | "height": 203.23377930246647, 234 | "seed": 1879839231, 235 | "groupIds": [], 236 | "roundness": null, 237 | "boundElements": [], 238 | "updated": 1683370316909, 239 | "link": null, 240 | "locked": false, 241 | "startBinding": null, 242 | "endBinding": null, 243 | "lastCommittedPoint": null, 244 | "startArrowhead": null, 245 | "endArrowhead": null, 246 | "points": [ 247 | [ 248 | 0, 249 | 0 250 | ], 251 | [ 252 | -20.264781987976257, 253 | -0.0011773927935071482 254 | ], 255 | [ 256 | -20.336010349032712, 257 | 203.23260190967298 258 | ], 259 | [ 260 | -0.07239358683375485, 261 | 203.135377672515 262 | ] 263 | ] 264 | }, 265 | { 266 | "type": "line", 267 | "version": 1755, 268 | "versionNonce": 1487752017, 269 | "isDeleted": false, 270 | "id": "GpZg3Rw4Hszxzxf38Q4Hn", 271 | "fillStyle": "hachure", 272 | "strokeWidth": 1, 273 | "strokeStyle": "solid", 274 | "roughness": 0, 275 | "opacity": 100, 276 | "angle": 3.141592653589793, 277 | "x": 539.3083636621348, 278 | "y": 178.11701218356845, 279 | "strokeColor": "#000000", 280 | "backgroundColor": "transparent", 281 | "width": 20.336010349032712, 282 | "height": 203.23377930246647, 283 | "seed": 470135121, 284 | "groupIds": [], 285 | "roundness": null, 286 | "boundElements": [], 287 | "updated": 1683370316909, 288 | "link": null, 289 | "locked": false, 290 | "startBinding": null, 291 | "endBinding": null, 292 | "lastCommittedPoint": null, 293 | "startArrowhead": null, 294 | "endArrowhead": null, 295 | "points": [ 296 | [ 297 | 0, 298 
| 0 299 | ], 300 | [ 301 | -20.264781987976257, 302 | -0.0011773927935071482 303 | ], 304 | [ 305 | -20.336010349032712, 306 | 203.23260190967298 307 | ], 308 | [ 309 | -0.07239358683375485, 310 | 203.135377672515 311 | ] 312 | ] 313 | }, 314 | { 315 | "type": "text", 316 | "version": 528, 317 | "versionNonce": 1276939839, 318 | "isDeleted": false, 319 | "id": "AGEyNvBxBm2cwm1WRW8n8", 320 | "fillStyle": "hachure", 321 | "strokeWidth": 1, 322 | "strokeStyle": "solid", 323 | "roughness": 1, 324 | "opacity": 100, 325 | "angle": 0, 326 | "x": 576.6403584876185, 327 | "y": 210.23272444200816, 328 | "strokeColor": "#087f5b", 329 | "backgroundColor": "#82c91e", 330 | "width": 278.57781982421875, 331 | "height": 25, 332 | "seed": 877528401, 333 | "groupIds": [], 334 | "roundness": null, 335 | "boundElements": [], 336 | "updated": 1683370316909, 337 | "link": null, 338 | "locked": false, 339 | "fontSize": 20, 340 | "fontFamily": 1, 341 | "text": "", 342 | "textAlign": "left", 343 | "verticalAlign": "top", 344 | "containerId": null, 345 | "originalText": "", 346 | "lineHeight": 1.25, 347 | "baseline": 18 348 | }, 349 | { 350 | "type": "line", 351 | "version": 1557, 352 | "versionNonce": 773679889, 353 | "isDeleted": false, 354 | "id": "Q8E0gAcLvq6VXqMDZhLdA", 355 | "fillStyle": "hachure", 356 | "strokeWidth": 1, 357 | "strokeStyle": "solid", 358 | "roughness": 0, 359 | "opacity": 100, 360 | "angle": 0, 361 | "x": 581.8083636621351, 362 | "y": 177.61701218356845, 363 | "strokeColor": "#000000", 364 | "backgroundColor": "transparent", 365 | "width": 20.336010349032712, 366 | "height": 203.23377930246647, 367 | "seed": 153279217, 368 | "groupIds": [], 369 | "roundness": null, 370 | "boundElements": [], 371 | "updated": 1683370316909, 372 | "link": null, 373 | "locked": false, 374 | "startBinding": null, 375 | "endBinding": null, 376 | "lastCommittedPoint": null, 377 | "startArrowhead": null, 378 | "endArrowhead": null, 379 | "points": [ 380 | [ 381 | 0, 382 | 0 383 | ], 384 | [ 
385 | -20.264781987976257, 386 | -0.0011773927935071482 387 | ], 388 | [ 389 | -20.336010349032712, 390 | 203.23260190967298 391 | ], 392 | [ 393 | -0.07239358683375485, 394 | 203.135377672515 395 | ] 396 | ] 397 | }, 398 | { 399 | "type": "line", 400 | "version": 1810, 401 | "versionNonce": 1561283199, 402 | "isDeleted": false, 403 | "id": "uhh3ZkPO6bwwf0-AI8syI", 404 | "fillStyle": "hachure", 405 | "strokeWidth": 1, 406 | "strokeStyle": "solid", 407 | "roughness": 0, 408 | "opacity": 100, 409 | "angle": 3.141592653589793, 410 | "x": 894.8083636621349, 411 | "y": 182.61701218356845, 412 | "strokeColor": "#000000", 413 | "backgroundColor": "transparent", 414 | "width": 20.336010349032712, 415 | "height": 203.23377930246647, 416 | "seed": 315380945, 417 | "groupIds": [], 418 | "roundness": null, 419 | "boundElements": [], 420 | "updated": 1683370316909, 421 | "link": null, 422 | "locked": false, 423 | "startBinding": null, 424 | "endBinding": null, 425 | "lastCommittedPoint": null, 426 | "startArrowhead": null, 427 | "endArrowhead": null, 428 | "points": [ 429 | [ 430 | 0, 431 | 0 432 | ], 433 | [ 434 | -20.264781987976257, 435 | -0.0011773927935071482 436 | ], 437 | [ 438 | -20.336010349032712, 439 | 203.23260190967298 440 | ], 441 | [ 442 | -0.07239358683375485, 443 | 203.135377672515 444 | ] 445 | ] 446 | }, 447 | { 448 | "type": "text", 449 | "version": 575, 450 | "versionNonce": 910156017, 451 | "isDeleted": false, 452 | "id": "jI5YKyaOdGYYKiBWZmCMs", 453 | "fillStyle": "hachure", 454 | "strokeWidth": 1, 455 | "strokeStyle": "solid", 456 | "roughness": 1, 457 | "opacity": 100, 458 | "angle": 0, 459 | "x": 929.6403584876182, 460 | "y": 215.23272444200813, 461 | "strokeColor": "#087f5b", 462 | "backgroundColor": "#82c91e", 463 | "width": 278.57781982421875, 464 | "height": 25, 465 | "seed": 121503167, 466 | "groupIds": [], 467 | "roundness": null, 468 | "boundElements": [], 469 | "updated": 1683370316909, 470 | "link": null, 471 | "locked": false, 472 | 
"fontSize": 20, 473 | "fontFamily": 1, 474 | "text": "", 475 | "textAlign": "left", 476 | "verticalAlign": "top", 477 | "containerId": null, 478 | "originalText": "", 479 | "lineHeight": 1.25, 480 | "baseline": 18 481 | }, 482 | { 483 | "type": "line", 484 | "version": 1604, 485 | "versionNonce": 19920575, 486 | "isDeleted": false, 487 | "id": "QqIk7VTnRWYq499wkttvv", 488 | "fillStyle": "hachure", 489 | "strokeWidth": 1, 490 | "strokeStyle": "solid", 491 | "roughness": 0, 492 | "opacity": 100, 493 | "angle": 0, 494 | "x": 934.8083636621348, 495 | "y": 182.61701218356842, 496 | "strokeColor": "#000000", 497 | "backgroundColor": "transparent", 498 | "width": 20.336010349032712, 499 | "height": 203.23377930246647, 500 | "seed": 2012037663, 501 | "groupIds": [], 502 | "roundness": null, 503 | "boundElements": [], 504 | "updated": 1683370316909, 505 | "link": null, 506 | "locked": false, 507 | "startBinding": null, 508 | "endBinding": null, 509 | "lastCommittedPoint": null, 510 | "startArrowhead": null, 511 | "endArrowhead": null, 512 | "points": [ 513 | [ 514 | 0, 515 | 0 516 | ], 517 | [ 518 | -20.264781987976257, 519 | -0.0011773927935071482 520 | ], 521 | [ 522 | -20.336010349032712, 523 | 203.23260190967298 524 | ], 525 | [ 526 | -0.07239358683375485, 527 | 203.135377672515 528 | ] 529 | ] 530 | }, 531 | { 532 | "type": "line", 533 | "version": 1857, 534 | "versionNonce": 1660885169, 535 | "isDeleted": false, 536 | "id": "gk89VsYpnf9Jby9KEUBd3", 537 | "fillStyle": "hachure", 538 | "strokeWidth": 1, 539 | "strokeStyle": "solid", 540 | "roughness": 0, 541 | "opacity": 100, 542 | "angle": 3.141592653589793, 543 | "x": 1247.808363662135, 544 | "y": 187.61701218356842, 545 | "strokeColor": "#000000", 546 | "backgroundColor": "transparent", 547 | "width": 20.336010349032712, 548 | "height": 203.23377930246647, 549 | "seed": 509453887, 550 | "groupIds": [], 551 | "roundness": null, 552 | "boundElements": [], 553 | "updated": 1683370316909, 554 | "link": null, 555 | 
"locked": false, 556 | "startBinding": null, 557 | "endBinding": null, 558 | "lastCommittedPoint": null, 559 | "startArrowhead": null, 560 | "endArrowhead": null, 561 | "points": [ 562 | [ 563 | 0, 564 | 0 565 | ], 566 | [ 567 | -20.264781987976257, 568 | -0.0011773927935071482 569 | ], 570 | [ 571 | -20.336010349032712, 572 | 203.23260190967298 573 | ], 574 | [ 575 | -0.07239358683375485, 576 | 203.135377672515 577 | ] 578 | ] 579 | }, 580 | { 581 | "type": "text", 582 | "version": 620, 583 | "versionNonce": 1588681010, 584 | "isDeleted": false, 585 | "id": "a1c-iZI0SafCiy0u4xieZ", 586 | "fillStyle": "hachure", 587 | "strokeWidth": 1, 588 | "strokeStyle": "solid", 589 | "roughness": 1, 590 | "opacity": 100, 591 | "angle": 0, 592 | "x": 934.3714375891809, 593 | "y": 261.23272444200813, 594 | "strokeColor": "#e67700", 595 | "backgroundColor": "#82c91e", 596 | "width": 270.71783447265625, 597 | "height": 25, 598 | "seed": 1742829553, 599 | "groupIds": [], 600 | "roundness": null, 601 | "boundElements": [], 602 | "updated": 1683371080181, 603 | "link": null, 604 | "locked": false, 605 | "fontSize": 20, 606 | "fontFamily": 1, 607 | "text": "", 608 | "textAlign": "left", 609 | "verticalAlign": "top", 610 | "containerId": null, 611 | "originalText": "", 612 | "lineHeight": 1.25, 613 | "baseline": 18 614 | }, 615 | { 616 | "type": "text", 617 | "version": 564, 618 | "versionNonce": 1968863633, 619 | "isDeleted": false, 620 | "id": "hdhhgp5nA06o5EcSgHQE8", 621 | "fillStyle": "hachure", 622 | "strokeWidth": 1, 623 | "strokeStyle": "solid", 624 | "roughness": 1, 625 | "opacity": 100, 626 | "angle": 0, 627 | "x": 937.6203542151575, 628 | "y": 354.23272444200813, 629 | "strokeColor": "#0b7285", 630 | "backgroundColor": "#82c91e", 631 | "width": 287.73785400390625, 632 | "height": 25, 633 | "seed": 309558367, 634 | "groupIds": [], 635 | "roundness": null, 636 | "boundElements": [], 637 | "updated": 1683370363648, 638 | "link": null, 639 | "locked": false, 640 | "fontSize": 20, 
641 | "fontFamily": 1, 642 | "text": "", 643 | "textAlign": "left", 644 | "verticalAlign": "top", 645 | "containerId": null, 646 | "originalText": "", 647 | "lineHeight": 1.25, 648 | "baseline": 18 649 | } 650 | ], 651 | "appState": { 652 | "gridSize": null, 653 | "viewBackgroundColor": "#ffffff" 654 | }, 655 | "files": {} 656 | } -------------------------------------------------------------------------------- /docs/figures/transactions.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/penberg/tihku/b73ad11e352de703c4446e6ecabf0c2d755b4008/docs/figures/transactions.png -------------------------------------------------------------------------------- /mvcc-rs/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "mvcc-rs" 3 | version = "0.0.0" 4 | edition = "2021" 5 | 6 | [dependencies] 7 | anyhow = "1.0.70" 8 | thiserror = "1.0.40" 9 | tracing = "0.1.37" 10 | serde = { version = "1.0.160", features = ["derive"] } 11 | serde_json = "1.0.96" 12 | tracing-subscriber = { version = "0", optional = true } 13 | base64 = "0.21.0" 14 | aws-sdk-s3 = "0.27.0" 15 | aws-config = "0.55.2" 16 | parking_lot = "0.12.1" 17 | futures = "0.3.28" 18 | crossbeam-skiplist = "0.1.1" 19 | tracing-test = "0" 20 | 21 | [dev-dependencies] 22 | criterion = { version = "0.4", features = ["html_reports", "async", "async_futures"] } 23 | pprof = { version = "0.11.1", features = ["criterion", "flamegraph"] } 24 | tracing-subscriber = "0" 25 | mvcc-rs = { path = "." 
} 26 | 27 | [[bench]] 28 | name = "my_benchmark" 29 | harness = false 30 | 31 | [features] 32 | default = [] 33 | c_bindings = ["dep:tracing-subscriber"] 34 | -------------------------------------------------------------------------------- /mvcc-rs/benches/my_benchmark.rs: -------------------------------------------------------------------------------- 1 | use criterion::async_executor::FuturesExecutor; 2 | use criterion::{criterion_group, criterion_main, Criterion, Throughput}; 3 | use mvcc_rs::clock::LocalClock; 4 | use mvcc_rs::database::{Database, Row, RowID}; 5 | use pprof::criterion::{Output, PProfProfiler}; 6 | 7 | fn bench_db() -> Database { 8 | let clock = LocalClock::default(); 9 | let storage = mvcc_rs::persistent_storage::Storage::new_noop(); 10 | Database::new(clock, storage) 11 | } 12 | 13 | fn bench(c: &mut Criterion) { 14 | let mut group = c.benchmark_group("mvcc-ops-throughput"); 15 | group.throughput(Throughput::Elements(1)); 16 | 17 | let db = bench_db(); 18 | group.bench_function("begin_tx", |b| { 19 | b.to_async(FuturesExecutor).iter(|| async { 20 | db.begin_tx(); 21 | }) 22 | }); 23 | 24 | let db = bench_db(); 25 | group.bench_function("begin_tx + rollback_tx", |b| { 26 | b.to_async(FuturesExecutor).iter(|| async { 27 | let tx_id = db.begin_tx(); 28 | db.rollback_tx(tx_id) 29 | }) 30 | }); 31 | 32 | let db = bench_db(); 33 | group.bench_function("begin_tx + commit_tx", |b| { 34 | b.to_async(FuturesExecutor).iter(|| async { 35 | let tx_id = db.begin_tx(); 36 | db.commit_tx(tx_id) 37 | }) 38 | }); 39 | 40 | let db = bench_db(); 41 | group.bench_function("begin_tx-read-commit_tx", |b| { 42 | b.to_async(FuturesExecutor).iter(|| async { 43 | let tx_id = db.begin_tx(); 44 | db.read( 45 | tx_id, 46 | RowID { 47 | table_id: 1, 48 | row_id: 1, 49 | }, 50 | ) 51 | .unwrap(); 52 | db.commit_tx(tx_id) 53 | }) 54 | }); 55 | 56 | let db = bench_db(); 57 | group.bench_function("begin_tx-update-commit_tx", |b| { 58 | b.to_async(FuturesExecutor).iter(|| async 
{ 59 | let tx_id = db.begin_tx(); 60 | db.update( 61 | tx_id, 62 | Row { 63 | id: RowID { 64 | table_id: 1, 65 | row_id: 1, 66 | }, 67 | data: "World".to_string(), 68 | }, 69 | ) 70 | .unwrap(); 71 | db.commit_tx(tx_id) 72 | }) 73 | }); 74 | 75 | let db = bench_db(); 76 | let tx = db.begin_tx(); 77 | db.insert( 78 | tx, 79 | Row { 80 | id: RowID { 81 | table_id: 1, 82 | row_id: 1, 83 | }, 84 | data: "Hello".to_string(), 85 | }, 86 | ) 87 | .unwrap(); 88 | group.bench_function("read", |b| { 89 | b.to_async(FuturesExecutor).iter(|| async { 90 | db.read( 91 | tx, 92 | RowID { 93 | table_id: 1, 94 | row_id: 1, 95 | }, 96 | ) 97 | .unwrap(); 98 | }) 99 | }); 100 | 101 | let db = bench_db(); 102 | let tx = db.begin_tx(); 103 | db.insert( 104 | tx, 105 | Row { 106 | id: RowID { 107 | table_id: 1, 108 | row_id: 1, 109 | }, 110 | data: "Hello".to_string(), 111 | }, 112 | ) 113 | .unwrap(); 114 | group.bench_function("update", |b| { 115 | b.to_async(FuturesExecutor).iter(|| async { 116 | db.update( 117 | tx, 118 | Row { 119 | id: RowID { 120 | table_id: 1, 121 | row_id: 1, 122 | }, 123 | data: "World".to_string(), 124 | }, 125 | ) 126 | .unwrap(); 127 | }) 128 | }); 129 | } 130 | 131 | criterion_group! { 132 | name = benches; 133 | config = Criterion::default().with_profiler(PProfProfiler::new(100, Output::Flamegraph(None))); 134 | targets = bench 135 | } 136 | criterion_main!(benches); 137 | -------------------------------------------------------------------------------- /mvcc-rs/src/clock.rs: -------------------------------------------------------------------------------- 1 | use std::sync::atomic::{AtomicU64, Ordering}; 2 | 3 | /// Logical clock. 4 | pub trait LogicalClock { 5 | fn get_timestamp(&self) -> u64; 6 | fn reset(&self, ts: u64); 7 | } 8 | 9 | /// A node-local clock backed by an atomic counter. 
10 | #[derive(Debug, Default)] 11 | pub struct LocalClock { 12 | ts_sequence: AtomicU64, 13 | } 14 | 15 | impl LocalClock { 16 | pub fn new() -> Self { 17 | Self { 18 | ts_sequence: AtomicU64::new(0), 19 | } 20 | } 21 | } 22 | 23 | impl LogicalClock for LocalClock { 24 | fn get_timestamp(&self) -> u64 { 25 | self.ts_sequence.fetch_add(1, Ordering::SeqCst) 26 | } 27 | 28 | fn reset(&self, ts: u64) { 29 | self.ts_sequence.store(ts, Ordering::SeqCst); 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /mvcc-rs/src/cursor.rs: -------------------------------------------------------------------------------- 1 | use serde::de::DeserializeOwned; 2 | use serde::Serialize; 3 | 4 | use crate::clock::LogicalClock; 5 | use crate::database::{Database, Result, Row, RowID}; 6 | use std::fmt::Debug; 7 | 8 | #[derive(Debug)] 9 | pub struct ScanCursor<'a, Clock: LogicalClock, T: Sync + Send + Clone + Serialize + DeserializeOwned + Debug> { 10 | pub db: &'a Database, 11 | pub row_ids: Vec, 12 | pub index: usize, 13 | tx_id: u64, 14 | } 15 | 16 | impl<'a, Clock: LogicalClock, T: Sync + Send + Clone + Serialize + DeserializeOwned + Debug + 'static> ScanCursor<'a, Clock, T> { 17 | pub fn new( 18 | db: &'a Database, 19 | tx_id: u64, 20 | table_id: u64, 21 | ) -> Result> { 22 | let row_ids = db.scan_row_ids_for_table(table_id)?; 23 | Ok(Self { 24 | db, 25 | tx_id, 26 | row_ids, 27 | index: 0, 28 | }) 29 | } 30 | 31 | pub fn current_row_id(&self) -> Option { 32 | if self.index >= self.row_ids.len() { 33 | return None; 34 | } 35 | Some(self.row_ids[self.index]) 36 | } 37 | 38 | pub fn current_row(&self) -> Result>> { 39 | if self.index >= self.row_ids.len() { 40 | return Ok(None); 41 | } 42 | let id = self.row_ids[self.index]; 43 | self.db.read(self.tx_id, id) 44 | } 45 | 46 | pub fn close(self) -> Result<()> { 47 | Ok(()) 48 | } 49 | 50 | pub fn forward(&mut self) -> bool { 51 | self.index += 1; 52 | self.index < self.row_ids.len() 53 | } 54 
| 55 | pub fn is_empty(&self) -> bool { 56 | self.index >= self.row_ids.len() 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /mvcc-rs/src/database/mod.rs: -------------------------------------------------------------------------------- 1 | use crate::clock::LogicalClock; 2 | use crate::errors::DatabaseError; 3 | use crate::persistent_storage::Storage; 4 | use crossbeam_skiplist::{SkipMap, SkipSet}; 5 | use serde::de::DeserializeOwned; 6 | use serde::{Deserialize, Serialize}; 7 | use std::fmt::Debug; 8 | use std::sync::atomic::{AtomicU64, Ordering}; 9 | use std::sync::RwLock; 10 | 11 | pub type Result = std::result::Result; 12 | 13 | #[cfg(test)] 14 | mod tests; 15 | 16 | #[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize, Hash)] 17 | pub struct RowID { 18 | pub table_id: u64, 19 | pub row_id: u64, 20 | } 21 | 22 | #[derive(Clone, Debug, PartialEq, PartialOrd, Serialize, Deserialize)] 23 | 24 | pub struct Row { 25 | pub id: RowID, 26 | pub data: T, 27 | } 28 | 29 | /// A row version. 30 | #[derive(Clone, Debug, Serialize, Deserialize, PartialEq)] 31 | pub struct RowVersion { 32 | begin: TxTimestampOrID, 33 | end: Option, 34 | row: Row, 35 | } 36 | 37 | pub type TxID = u64; 38 | 39 | /// A log record contains all the versions inserted and deleted by a transaction. 40 | #[derive(Clone, Debug, Serialize, Deserialize)] 41 | pub struct LogRecord { 42 | pub(crate) tx_timestamp: TxID, 43 | row_versions: Vec>, 44 | } 45 | 46 | impl LogRecord { 47 | fn new(tx_timestamp: TxID) -> Self { 48 | Self { 49 | tx_timestamp, 50 | row_versions: Vec::new(), 51 | } 52 | } 53 | } 54 | 55 | /// A transaction timestamp or ID. 56 | /// 57 | /// Versions either track a timestamp or a transaction ID, depending on the 58 | /// phase of the transaction. During the active phase, new versions track the 59 | /// transaction ID in the `begin` and `end` fields. 
After a transaction commits, 60 | /// versions switch to tracking timestamps. 61 | #[derive(Clone, Debug, PartialEq, PartialOrd, Serialize, Deserialize)] 62 | enum TxTimestampOrID { 63 | Timestamp(u64), 64 | TxID(TxID), 65 | } 66 | 67 | /// Transaction 68 | #[derive(Debug, Serialize, Deserialize)] 69 | pub struct Transaction { 70 | /// The state of the transaction. 71 | state: AtomicTransactionState, 72 | /// The transaction ID. 73 | tx_id: u64, 74 | /// The transaction begin timestamp. 75 | begin_ts: u64, 76 | /// The transaction write set. 77 | #[serde(with = "skipset_rowid")] 78 | write_set: SkipSet, 79 | /// The transaction read set. 80 | #[serde(with = "skipset_rowid")] 81 | read_set: SkipSet, 82 | } 83 | 84 | mod skipset_rowid { 85 | use super::*; 86 | use serde::{de, ser, ser::SerializeSeq}; 87 | 88 | struct SkipSetDeserializer; 89 | 90 | impl<'de> serde::de::Visitor<'de> for SkipSetDeserializer { 91 | type Value = SkipSet; 92 | 93 | fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result { 94 | formatter.write_str("SkipSet key value sequence.") 95 | } 96 | 97 | fn visit_seq(self, mut seq: A) -> std::result::Result 98 | where 99 | A: serde::de::SeqAccess<'de>, 100 | { 101 | let new_skipset = SkipSet::new(); 102 | while let Some(elem) = seq.next_element()? 
{ 103 | new_skipset.insert(elem); 104 | } 105 | 106 | Ok(new_skipset) 107 | } 108 | } 109 | 110 | pub fn serialize( 111 | value: &SkipSet, 112 | ser: S, 113 | ) -> std::result::Result { 114 | let mut set = ser.serialize_seq(Some(value.len()))?; 115 | for v in value { 116 | set.serialize_element(v.value())?; 117 | } 118 | set.end() 119 | } 120 | 121 | pub fn deserialize<'de, D: de::Deserializer<'de>>( 122 | de: D, 123 | ) -> std::result::Result, D::Error> { 124 | de.deserialize_seq(SkipSetDeserializer) 125 | } 126 | } 127 | 128 | impl Transaction { 129 | fn new(tx_id: u64, begin_ts: u64) -> Transaction { 130 | Transaction { 131 | state: TransactionState::Active.into(), 132 | tx_id, 133 | begin_ts, 134 | write_set: SkipSet::new(), 135 | read_set: SkipSet::new(), 136 | } 137 | } 138 | 139 | fn insert_to_read_set(&self, id: RowID) { 140 | self.read_set.insert(id); 141 | } 142 | 143 | fn insert_to_write_set(&mut self, id: RowID) { 144 | self.write_set.insert(id); 145 | } 146 | } 147 | 148 | impl std::fmt::Display for Transaction { 149 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::result::Result<(), std::fmt::Error> { 150 | write!( 151 | f, 152 | "{{ state: {}, id: {}, begin_ts: {}, write_set: {:?}, read_set: {:?}", 153 | self.state.load(), 154 | self.tx_id, 155 | self.begin_ts, 156 | // FIXME: I'm sorry, we obviously shouldn't be cloning here. 157 | self.write_set 158 | .iter() 159 | .map(|v| *v.value()) 160 | .collect::>(), 161 | self.read_set 162 | .iter() 163 | .map(|v| *v.value()) 164 | .collect::>() 165 | ) 166 | } 167 | } 168 | 169 | /// Transaction state. 
170 | #[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] 171 | enum TransactionState { 172 | Active, 173 | Preparing, 174 | Aborted, 175 | Terminated, 176 | Committed(u64), 177 | } 178 | 179 | impl TransactionState { 180 | pub fn encode(&self) -> u64 { 181 | match self { 182 | TransactionState::Active => 0, 183 | TransactionState::Preparing => 1, 184 | TransactionState::Aborted => 2, 185 | TransactionState::Terminated => 3, 186 | TransactionState::Committed(ts) => { 187 | // We only support 2^63 - 1 timestamps, because the extra bit 188 | // is used to encode the type. 189 | assert!(ts & 0x8000_0000_0000_0000 == 0); 190 | 0x8000_0000_0000_0000 | ts 191 | } 192 | } 193 | } 194 | 195 | pub fn decode(v: u64) -> Self { 196 | match v { 197 | 0 => TransactionState::Active, 198 | 1 => TransactionState::Preparing, 199 | 2 => TransactionState::Aborted, 200 | 3 => TransactionState::Terminated, 201 | v if v & 0x8000_0000_0000_0000 != 0 => { 202 | TransactionState::Committed(v & 0x7fff_ffff_ffff_ffff) 203 | } 204 | _ => panic!("Invalid transaction state"), 205 | } 206 | } 207 | } 208 | 209 | // Transaction state encoded into a single 64-bit atomic. 
210 | #[derive(Debug, Serialize, Deserialize)] 211 | pub(crate) struct AtomicTransactionState { 212 | pub(crate) state: AtomicU64, 213 | } 214 | 215 | impl From for AtomicTransactionState { 216 | fn from(state: TransactionState) -> Self { 217 | Self { 218 | state: AtomicU64::new(state.encode()), 219 | } 220 | } 221 | } 222 | 223 | impl From for TransactionState { 224 | fn from(state: AtomicTransactionState) -> Self { 225 | let encoded = state.state.load(Ordering::Acquire); 226 | TransactionState::decode(encoded) 227 | } 228 | } 229 | 230 | impl std::cmp::PartialEq for AtomicTransactionState { 231 | fn eq(&self, other: &TransactionState) -> bool { 232 | &self.load() == other 233 | } 234 | } 235 | 236 | impl std::fmt::Display for TransactionState { 237 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::result::Result<(), std::fmt::Error> { 238 | match self { 239 | TransactionState::Active => write!(f, "Active"), 240 | TransactionState::Preparing => write!(f, "Preparing"), 241 | TransactionState::Committed(ts) => write!(f, "Committed({ts})"), 242 | TransactionState::Aborted => write!(f, "Aborted"), 243 | TransactionState::Terminated => write!(f, "Terminated"), 244 | } 245 | } 246 | } 247 | 248 | impl AtomicTransactionState { 249 | fn store(&self, state: TransactionState) { 250 | self.state.store(state.encode(), Ordering::Release); 251 | } 252 | 253 | fn load(&self) -> TransactionState { 254 | TransactionState::decode(self.state.load(Ordering::Acquire)) 255 | } 256 | } 257 | 258 | #[derive(Debug)] 259 | pub struct Database< 260 | Clock: LogicalClock, 261 | T: Sync + Send + Clone + Serialize + Debug + DeserializeOwned, 262 | > { 263 | rows: SkipMap>>>, 264 | txs: SkipMap>, 265 | tx_ids: AtomicU64, 266 | clock: Clock, 267 | storage: Storage, 268 | } 269 | 270 | impl 271 | Database 272 | { 273 | /// Creates a new database. 
274 | pub fn new(clock: Clock, storage: Storage) -> Self { 275 | Self { 276 | rows: SkipMap::new(), 277 | txs: SkipMap::new(), 278 | tx_ids: AtomicU64::new(1), // let's reserve transaction 0 for special purposes 279 | clock, 280 | storage, 281 | } 282 | } 283 | 284 | // Extracts the begin timestamp from a transaction 285 | fn get_begin_timestamp(&self, ts_or_id: &TxTimestampOrID) -> u64 { 286 | match ts_or_id { 287 | TxTimestampOrID::Timestamp(ts) => *ts, 288 | TxTimestampOrID::TxID(tx_id) => { 289 | self.txs 290 | .get(tx_id) 291 | .unwrap() 292 | .value() 293 | .read() 294 | .unwrap() 295 | .begin_ts 296 | } 297 | } 298 | } 299 | 300 | /// Inserts a new row version into the database, while making sure that 301 | /// the row version is inserted in the correct order. 302 | fn insert_version(&self, id: RowID, row_version: RowVersion) { 303 | let versions = self.rows.get_or_insert_with(id, || RwLock::new(Vec::new())); 304 | let mut versions = versions.value().write().unwrap(); 305 | self.insert_version_raw(&mut versions, row_version) 306 | } 307 | 308 | /// Inserts a new row version into the internal data structure for versions, 309 | /// while making sure that the row version is inserted in the correct order. 310 | fn insert_version_raw(&self, versions: &mut Vec>, row_version: RowVersion) { 311 | // NOTICE: this is an insert a'la insertion sort, with pessimistic linear complexity. 312 | // However, we expect the number of versions to be nearly sorted, so we deem it worthy 313 | // to search linearly for the insertion point instead of paying the price of using 314 | // another data structure, e.g. a BTreeSet. If it proves to be too quadratic empirically, 315 | // we can either switch to a tree-like structure, or at least use partition_point() 316 | // which performs a binary search for the insertion point. 
317 | let position = versions 318 | .iter() 319 | .rposition(|v| { 320 | self.get_begin_timestamp(&v.begin) < self.get_begin_timestamp(&row_version.begin) 321 | }) 322 | .map(|p| p + 1) 323 | .unwrap_or(0); 324 | if versions.len() - position > 3 { 325 | tracing::debug!( 326 | "Inserting a row version {} positions from the end", 327 | versions.len() - position 328 | ); 329 | } 330 | versions.insert(position, row_version); 331 | } 332 | 333 | /// Inserts a new row into the database. 334 | /// 335 | /// This function inserts a new `row` into the database within the context 336 | /// of the transaction `tx_id`. 337 | /// 338 | /// # Arguments 339 | /// 340 | /// * `tx_id` - the ID of the transaction in which to insert the new row. 341 | /// * `row` - the row object containing the values to be inserted. 342 | /// 343 | pub fn insert(&self, tx_id: TxID, row: Row) -> Result<()> { 344 | let tx = self 345 | .txs 346 | .get(&tx_id) 347 | .ok_or(DatabaseError::NoSuchTransactionID(tx_id))?; 348 | let mut tx = tx.value().write().unwrap(); 349 | assert_eq!(tx.state, TransactionState::Active); 350 | let id = row.id; 351 | let row_version = RowVersion { 352 | begin: TxTimestampOrID::TxID(tx.tx_id), 353 | end: None, 354 | row, 355 | }; 356 | tx.insert_to_write_set(id); 357 | drop(tx); 358 | self.insert_version(id, row_version); 359 | Ok(()) 360 | } 361 | 362 | /// Updates a row in the database with new values. 363 | /// 364 | /// This function updates an existing row in the database within the 365 | /// context of the transaction `tx_id`. The `row` argument identifies the 366 | /// row to be updated as `id` and contains the new values to be inserted. 367 | /// 368 | /// If the row identified by the `id` does not exist, this function does 369 | /// nothing and returns `false`. Otherwise, the function updates the row 370 | /// with the new values and returns `true`. 
371 | /// 372 | /// # Arguments 373 | /// 374 | /// * `tx_id` - the ID of the transaction in which to update the new row. 375 | /// * `row` - the row object containing the values to be updated. 376 | /// 377 | /// # Returns 378 | /// 379 | /// Returns `true` if the row was successfully updated, and `false` otherwise. 380 | pub fn update(&self, tx_id: TxID, row: Row) -> Result { 381 | if !self.delete(tx_id, row.id)? { 382 | return Ok(false); 383 | } 384 | self.insert(tx_id, row)?; 385 | Ok(true) 386 | } 387 | 388 | /// Inserts a row in the database with new values, previously deleting 389 | /// any old data if it existed. Bails on a delete error, e.g. write-write conflict. 390 | pub fn upsert(&self, tx_id: TxID, row: Row) -> Result<()> { 391 | self.delete(tx_id, row.id)?; 392 | self.insert(tx_id, row) 393 | } 394 | 395 | /// Deletes a row from the table with the given `id`. 396 | /// 397 | /// This function deletes an existing row `id` in the database within the 398 | /// context of the transaction `tx_id`. 399 | /// 400 | /// # Arguments 401 | /// 402 | /// * `tx_id` - the ID of the transaction in which to delete the new row. 403 | /// * `id` - the ID of the row to delete. 404 | /// 405 | /// # Returns 406 | /// 407 | /// Returns `true` if the row was successfully deleted, and `false` otherwise. 
408 | /// 409 | pub fn delete(&self, tx_id: TxID, id: RowID) -> Result { 410 | let row_versions_opt = self.rows.get(&id); 411 | if let Some(ref row_versions) = row_versions_opt { 412 | let mut row_versions = row_versions.value().write().unwrap(); 413 | for rv in row_versions.iter_mut().rev() { 414 | let tx = self 415 | .txs 416 | .get(&tx_id) 417 | .ok_or(DatabaseError::NoSuchTransactionID(tx_id))?; 418 | let tx = tx.value().read().unwrap(); 419 | assert_eq!(tx.state, TransactionState::Active); 420 | if is_write_write_conflict(&self.txs, &tx, rv) { 421 | drop(row_versions); 422 | drop(row_versions_opt); 423 | drop(tx); // locks must be released before rollback_tx re-acquires the tx entry 424 | self.rollback_tx(tx_id); 425 | return Err(DatabaseError::WriteWriteConflict); 426 | } 427 | if is_version_visible(&self.txs, &tx, rv) { 428 | rv.end = Some(TxTimestampOrID::TxID(tx.tx_id)); // NOTE(review): rv.end is written under the version lock, but the write-set insert below happens after all locks here are dropped — confirm no observer relies on the two being atomic 429 | drop(row_versions); 430 | drop(row_versions_opt); 431 | drop(tx); 432 | let tx = self 433 | .txs 434 | .get(&tx_id) 435 | .ok_or(DatabaseError::NoSuchTransactionID(tx_id))?; 436 | let mut tx = tx.value().write().unwrap(); // NOTE(review): state is not re-asserted Active after re-acquiring this lock — TODO confirm it cannot change in the unlocked window 437 | tx.insert_to_write_set(id); 438 | return Ok(true); 439 | } 440 | } 441 | } 442 | Ok(false) 443 | } 444 | 445 | /// Retrieves a row from the table with the given `id`. 446 | /// 447 | /// This operation is performed within the scope of the transaction identified 448 | /// by `tx_id`. 449 | /// 450 | /// # Arguments 451 | /// 452 | /// * `tx_id` - The ID of the transaction to perform the read operation in. 453 | /// * `id` - The ID of the row to retrieve. 454 | /// 455 | /// # Returns 456 | /// 457 | /// Returns `Some(row)` with the row data if the row with the given `id` exists, 458 | /// and `None` otherwise. 
459 | pub fn read(&self, tx_id: TxID, id: RowID) -> Result>> { 460 | let tx = self.txs.get(&tx_id).unwrap(); 461 | let tx = tx.value().read().unwrap(); 462 | assert_eq!(tx.state, TransactionState::Active); 463 | if let Some(row_versions) = self.rows.get(&id) { 464 | let row_versions = row_versions.value().read().unwrap(); 465 | for rv in row_versions.iter().rev() { 466 | if is_version_visible(&self.txs, &tx, rv) { 467 | tx.insert_to_read_set(id); 468 | return Ok(Some(rv.row.clone())); 469 | } 470 | } 471 | } 472 | Ok(None) 473 | } 474 | 475 | /// Gets all row ids in the database. 476 | pub fn scan_row_ids(&self) -> Result> { 477 | let keys = self.rows.iter().map(|entry| *entry.key()); 478 | Ok(keys.collect()) 479 | } 480 | 481 | /// Gets all row ids in the database for a given table. 482 | pub fn scan_row_ids_for_table(&self, table_id: u64) -> Result> { 483 | Ok(self 484 | .rows 485 | .range( 486 | RowID { 487 | table_id, 488 | row_id: 0, 489 | }..RowID { 490 | table_id, 491 | row_id: u64::MAX, 492 | }, 493 | ) 494 | .map(|entry| *entry.key()) 495 | .collect()) 496 | } 497 | 498 | /// Begins a new transaction in the database. 499 | /// 500 | /// This function starts a new transaction in the database and returns a `TxID` value 501 | /// that you can use to perform operations within the transaction. All changes made within the 502 | /// transaction are isolated from other transactions until you commit the transaction. 503 | pub fn begin_tx(&self) -> TxID { 504 | let tx_id = self.get_tx_id(); 505 | let begin_ts = self.get_timestamp(); 506 | let tx = Transaction::new(tx_id, begin_ts); 507 | tracing::trace!("BEGIN {tx}"); 508 | self.txs.insert(tx_id, RwLock::new(tx)); 509 | tx_id 510 | } 511 | 512 | /// Commits a transaction with the specified transaction ID. 513 | /// 514 | /// This function commits the changes made within the specified transaction and finalizes the 515 | /// transaction. 
Once a transaction has been committed, all changes made within the transaction 516 | /// are visible to other transactions that access the same data. 517 | /// 518 | /// # Arguments 519 | /// 520 | /// * `tx_id` - The ID of the transaction to commit. 521 | pub fn commit_tx(&self, tx_id: TxID) -> Result<()> { 522 | let end_ts = self.get_timestamp(); 523 | // NOTICE: the first shadowed tx keeps the entry alive in the map 524 | // for the duration of this whole function, which is important for correctness! 525 | let tx = self.txs.get(&tx_id).ok_or(DatabaseError::TxTerminated)?; 526 | let tx = tx.value().write().unwrap(); 527 | match tx.state.load() { 528 | TransactionState::Terminated => return Err(DatabaseError::TxTerminated), 529 | _ => { 530 | assert_eq!(tx.state, TransactionState::Active); 531 | } 532 | } 533 | tx.state.store(TransactionState::Preparing); 534 | tracing::trace!("PREPARE {tx}"); 535 | 536 | /* TODO: The code we have here is sufficient for snapshot isolation. 537 | ** In order to implement serializability, we need the following steps: 538 | ** 539 | ** 1. Validate if all read versions are still visible by inspecting the read_set 540 | ** 2. Validate if there are no phantoms by walking the scans from scan_set (which we don't even have yet) 541 | ** - a phantom is a version that became visible in the middle of our transaction, 542 | ** but wasn't taken into account during one of the scans from the scan_set 543 | ** 3. Wait for commit dependencies, which we don't even track yet... 544 | ** Excerpt from what's a commit dependency and how it's tracked in the original paper: 545 | ** """ 546 | A transaction T1 has a commit dependency on another transaction 547 | T2, if T1 is allowed to commit only if T2 commits. If T2 aborts, 548 | T1 must also abort, so cascading aborts are possible. T1 acquires a 549 | commit dependency either by speculatively reading or speculatively ignoring a version, 550 | instead of waiting for T2 to commit. 
551 | We implement commit dependencies by a register-and-report 552 | approach: T1 registers its dependency with T2 and T2 informs T1 553 | when it has committed or aborted. Each transaction T contains a 554 | counter, CommitDepCounter, that counts how many unresolved 555 | commit dependencies it still has. A transaction cannot commit 556 | until this counter is zero. In addition, T has a Boolean variable 557 | AbortNow that other transactions can set to tell T to abort. Each 558 | transaction T also has a set, CommitDepSet, that stores transaction IDs 559 | of the transactions that depend on T. 560 | To take a commit dependency on a transaction T2, T1 increments 561 | its CommitDepCounter and adds its transaction ID to T2’s CommitDepSet. 562 | When T2 has committed, it locates each transaction in 563 | its CommitDepSet and decrements their CommitDepCounter. If 564 | T2 aborted, it tells the dependent transactions to also abort by 565 | setting their AbortNow flags. If a dependent transaction is not 566 | found, this means that it has already aborted. 567 | Note that a transaction with commit dependencies may not have to 568 | wait at all - the dependencies may have been resolved before it is 569 | ready to commit. Commit dependencies consolidate all waits into 570 | a single wait and postpone the wait to just before commit. 571 | Some transactions may have to wait before commit. 572 | Waiting raises a concern of deadlocks. 573 | However, deadlocks cannot occur because an older transaction never 574 | waits on a younger transaction. In 575 | a wait-for graph the direction of edges would always be from a 576 | younger transaction (higher end timestamp) to an older transaction 577 | (lower end timestamp) so cycles are impossible. 
578 | """ 579 | ** If you're wondering when a speculative read happens, here you go: 580 | ** Case 1: speculative read of TB: 581 | """ 582 | If transaction TB is in the Preparing state, it has acquired an end 583 | timestamp TS which will be V’s begin timestamp if TB commits. 584 | A safe approach in this situation would be to have transaction T 585 | wait until transaction TB commits. However, we want to avoid all 586 | blocking during normal processing so instead we continue with 587 | the visibility test and, if the test returns true, allow T to 588 | speculatively read V. Transaction T acquires a commit dependency on 589 | TB, restricting the serialization order of the two transactions. That 590 | is, T is allowed to commit only if TB commits. 591 | """ 592 | ** Case 2: speculative ignore of TE: 593 | """ 594 | If TE’s state is Preparing, it has an end timestamp TS that will become 595 | the end timestamp of V if TE does commit. If TS is greater than the read 596 | time RT, it is obvious that V will be visible if TE commits. If TE 597 | aborts, V will still be visible, because any transaction that updates 598 | V after TE has aborted will obtain an end timestamp greater than 599 | TS. If TS is less than RT, we have a more complicated situation: 600 | if TE commits, V will not be visible to T but if TE aborts, it will 601 | be visible. We could handle this by forcing T to wait until TE 602 | commits or aborts but we want to avoid all blocking during normal processing. 603 | Instead we allow T to speculatively ignore V and 604 | proceed with its processing. Transaction T acquires a commit 605 | dependency (see Section 2.7) on TE, that is, T is allowed to commit 606 | only if TE commits. 
607 | """ 608 | */ 609 | tx.state.store(TransactionState::Committed(end_ts)); 610 | tracing::trace!("COMMIT {tx}"); 611 | let tx_begin_ts = tx.begin_ts; 612 | let write_set: Vec = tx.write_set.iter().map(|v| *v.value()).collect(); 613 | drop(tx); 614 | // Postprocessing: inserting row versions and logging the transaction to persistent storage. 615 | // TODO: we should probably save to persistent storage first, and only then update the in-memory structures. 616 | let mut log_record: LogRecord = LogRecord::new(end_ts); 617 | for ref id in write_set { 618 | if let Some(row_versions) = self.rows.get(id) { 619 | let mut row_versions = row_versions.value().write().unwrap(); 620 | for row_version in row_versions.iter_mut() { 621 | if let TxTimestampOrID::TxID(id) = row_version.begin { 622 | if id == tx_id { 623 | row_version.begin = TxTimestampOrID::Timestamp(tx_begin_ts); 624 | self.insert_version_raw( 625 | &mut log_record.row_versions, 626 | row_version.clone(), 627 | ); // FIXME: optimize cloning out 628 | } 629 | } 630 | if let Some(TxTimestampOrID::TxID(id)) = row_version.end { 631 | if id == tx_id { 632 | row_version.end = Some(TxTimestampOrID::Timestamp(end_ts)); 633 | self.insert_version_raw( 634 | &mut log_record.row_versions, 635 | row_version.clone(), 636 | ); // FIXME: optimize cloning out 637 | } 638 | } 639 | } 640 | } 641 | } 642 | tracing::trace!("UPDATED TX{tx_id}"); 643 | // We have now updated all the versions with a reference to the 644 | // transaction ID to a timestamp and can, therefore, remove the 645 | // transaction. Please note that when we move to lockless, the 646 | // invariant doesn't necessarily hold anymore because another thread 647 | // might have speculatively read a version that we want to remove. 648 | // But that's a problem for another day. 649 | // FIXME: it actually just became a problem for today!!!
650 | // TODO: test that reproduces this failure, and then a fix 651 | self.txs.remove(&tx_id); 652 | if !log_record.row_versions.is_empty() { 653 | self.storage.log_tx(log_record)?; 654 | } 655 | tracing::trace!("LOGGED {tx_id}"); 656 | Ok(()) 657 | } 658 | 659 | /// Rolls back a transaction with the specified ID. 660 | /// 661 | /// This function rolls back a transaction with the specified `tx_id` by 662 | /// discarding any changes made by the transaction. 663 | /// 664 | /// # Arguments 665 | /// 666 | /// * `tx_id` - The ID of the transaction to abort. 667 | pub fn rollback_tx(&self, tx_id: TxID) { 668 | let tx_unlocked = self.txs.get(&tx_id).unwrap(); 669 | let tx = tx_unlocked.value().write().unwrap(); 670 | assert_eq!(tx.state, TransactionState::Active); 671 | tx.state.store(TransactionState::Aborted); 672 | tracing::trace!("ABORT {tx}"); 673 | let write_set: Vec = tx.write_set.iter().map(|v| *v.value()).collect(); 674 | drop(tx); 675 | 676 | for ref id in write_set { 677 | if let Some(row_versions) = self.rows.get(id) { 678 | let mut row_versions = row_versions.value().write().unwrap(); 679 | row_versions.retain(|rv| rv.begin != TxTimestampOrID::TxID(tx_id)); 680 | if row_versions.is_empty() { 681 | self.rows.remove(id); 682 | } 683 | } 684 | } 685 | 686 | let tx = tx_unlocked.value().write().unwrap(); 687 | tx.state.store(TransactionState::Terminated); 688 | tracing::trace!("TERMINATE {tx}"); 689 | // FIXME: verify that we can already remove the transaction here! 690 | // Maybe it's fine for snapshot isolation, but too early for serializable? 
691 | self.txs.remove(&tx_id); 692 | } 693 | 694 | /// Generates next unique transaction id 695 | pub fn get_tx_id(&self) -> u64 { 696 | self.tx_ids.fetch_add(1, Ordering::SeqCst) 697 | } 698 | 699 | /// Gets current timestamp 700 | pub fn get_timestamp(&self) -> u64 { 701 | self.clock.get_timestamp() 702 | } 703 | 704 | /// Removes unused row versions with very loose heuristics, 705 | /// which sometimes leaves versions intact for too long. 706 | /// Returns the number of removed versions. 707 | pub fn drop_unused_row_versions(&self) -> usize { 708 | tracing::trace!( 709 | "Dropping unused row versions. Database stats: transactions: {}; rows: {}", 710 | self.txs.len(), 711 | self.rows.len() 712 | ); 713 | let mut dropped = 0; 714 | let mut to_remove = Vec::new(); 715 | for entry in self.rows.iter() { 716 | let mut row_versions = entry.value().write().unwrap(); 717 | row_versions.retain(|rv| { 718 | // FIXME: should take rv.begin into account as well 719 | let should_stay = match rv.end { 720 | Some(TxTimestampOrID::Timestamp(version_end_ts)) => { 721 | // a transaction started before this row version ended, ergo row version is needed 722 | // NOTICE: O(row_versions x transactions), but also lock-free, so sounds acceptable 723 | self.txs.iter().any(|tx| { 724 | let tx = tx.value().read().unwrap(); 725 | // FIXME: verify! 726 | match tx.state.load() { 727 | TransactionState::Active | TransactionState::Preparing => { 728 | version_end_ts > tx.begin_ts 729 | } 730 | _ => false, 731 | } 732 | }) 733 | } 734 | // Let's skip potentially complex logic if the transaction is still 735 | // active/tracked. We will drop the row version when the transaction 736 | // gets garbage-collected itself, it will always happen eventually.
737 | Some(TxTimestampOrID::TxID(tx_id)) => !self.txs.contains_key(&tx_id), 738 | // this row version is current, ergo visible 739 | None => true, 740 | }; 741 | if !should_stay { 742 | dropped += 1; 743 | tracing::trace!( 744 | "Dropping row version {:?} {:?}-{:?}", 745 | entry.key(), 746 | rv.begin, 747 | rv.end 748 | ); 749 | } 750 | should_stay 751 | }); 752 | if row_versions.is_empty() { 753 | to_remove.push(*entry.key()); 754 | } 755 | } 756 | for id in to_remove { 757 | self.rows.remove(&id); 758 | } 759 | dropped 760 | } 761 | 762 | pub fn recover(&self) -> Result<()> { 763 | let tx_log = self.storage.read_tx_log()?; 764 | for record in tx_log { 765 | tracing::debug!("RECOVERING {:?}", record); 766 | for version in record.row_versions { 767 | self.insert_version(version.row.id, version); 768 | } 769 | self.clock.reset(record.tx_timestamp); 770 | } 771 | Ok(()) 772 | } 773 | } 774 | 775 | /// A write-write conflict happens when transaction T_m attempts to update a 776 | /// row version that is currently being updated by an active transaction T_n. 
777 | pub(crate) fn is_write_write_conflict( 778 | txs: &SkipMap>, 779 | tx: &Transaction, 780 | rv: &RowVersion, 781 | ) -> bool { 782 | match rv.end { 783 | Some(TxTimestampOrID::TxID(rv_end)) => { 784 | let te = txs.get(&rv_end).unwrap(); 785 | let te = te.value().read().unwrap(); 786 | match te.state.load() { 787 | TransactionState::Active | TransactionState::Preparing => tx.tx_id != te.tx_id, 788 | _ => false, 789 | } 790 | } 791 | Some(TxTimestampOrID::Timestamp(_)) => false, 792 | None => false, 793 | } 794 | } 795 | 796 | pub(crate) fn is_version_visible( 797 | txs: &SkipMap>, 798 | tx: &Transaction, 799 | rv: &RowVersion, 800 | ) -> bool { 801 | is_begin_visible(txs, tx, rv) && is_end_visible(txs, tx, rv) 802 | } 803 | 804 | fn is_begin_visible( 805 | txs: &SkipMap>, 806 | tx: &Transaction, 807 | rv: &RowVersion, 808 | ) -> bool { 809 | match rv.begin { 810 | TxTimestampOrID::Timestamp(rv_begin_ts) => tx.begin_ts >= rv_begin_ts, 811 | TxTimestampOrID::TxID(rv_begin) => { 812 | let tb = txs.get(&rv_begin).unwrap(); 813 | let tb = tb.value().read().unwrap(); 814 | let visible = match tb.state.load() { 815 | TransactionState::Active => tx.tx_id == tb.tx_id && rv.end.is_none(), 816 | TransactionState::Preparing => false, // NOTICE: makes sense for snapshot isolation, not so much for serializable! 
817 | TransactionState::Committed(committed_ts) => tx.begin_ts >= committed_ts, 818 | TransactionState::Aborted => false, 819 | TransactionState::Terminated => { 820 | tracing::debug!("TODO: should reread rv's end field - it should have updated the timestamp in the row version by now"); 821 | false 822 | } 823 | }; 824 | tracing::trace!( 825 | "is_begin_visible: tx={tx}, tb={tb} rv = {:?}-{:?} visible = {visible}", 826 | rv.begin, 827 | rv.end 828 | ); 829 | visible 830 | } 831 | } 832 | } 833 | 834 | fn is_end_visible( 835 | txs: &SkipMap>, 836 | tx: &Transaction, 837 | rv: &RowVersion, 838 | ) -> bool { 839 | match rv.end { 840 | Some(TxTimestampOrID::Timestamp(rv_end_ts)) => tx.begin_ts < rv_end_ts, 841 | Some(TxTimestampOrID::TxID(rv_end)) => { 842 | let te = txs.get(&rv_end).unwrap(); 843 | let te = te.value().read().unwrap(); 844 | let visible = match te.state.load() { 845 | TransactionState::Active => tx.tx_id != te.tx_id, 846 | TransactionState::Preparing => false, // NOTICE: makes sense for snapshot isolation, not so much for serializable! 
847 | TransactionState::Committed(committed_ts) => tx.begin_ts < committed_ts, 848 | TransactionState::Aborted => false, 849 | TransactionState::Terminated => { 850 | tracing::debug!("TODO: should reread rv's end field - it should have updated the timestamp in the row version by now"); 851 | false 852 | } 853 | }; 854 | tracing::trace!( 855 | "is_end_visible: tx={tx}, te={te} rv = {:?}-{:?} visible = {visible}", 856 | rv.begin, 857 | rv.end 858 | ); 859 | visible 860 | } 861 | None => true, 862 | } 863 | } 864 | -------------------------------------------------------------------------------- /mvcc-rs/src/database/tests.rs: -------------------------------------------------------------------------------- 1 | use super::*; 2 | use crate::clock::LocalClock; 3 | use tracing_test::traced_test; 4 | 5 | fn test_db() -> Database { 6 | let clock = LocalClock::new(); 7 | let storage = crate::persistent_storage::Storage::new_noop(); 8 | Database::new(clock, storage) 9 | } 10 | 11 | #[traced_test] 12 | #[test] 13 | fn test_insert_read() { 14 | let db = test_db(); 15 | 16 | let tx1 = db.begin_tx(); 17 | let tx1_row = Row { 18 | id: RowID { 19 | table_id: 1, 20 | row_id: 1, 21 | }, 22 | data: "Hello".to_string(), 23 | }; 24 | db.insert(tx1, tx1_row.clone()).unwrap(); 25 | let row = db 26 | .read( 27 | tx1, 28 | RowID { 29 | table_id: 1, 30 | row_id: 1, 31 | }, 32 | ) 33 | .unwrap() 34 | .unwrap(); 35 | assert_eq!(tx1_row, row); 36 | db.commit_tx(tx1).unwrap(); 37 | 38 | let tx2 = db.begin_tx(); 39 | let row = db 40 | .read( 41 | tx2, 42 | RowID { 43 | table_id: 1, 44 | row_id: 1, 45 | }, 46 | ) 47 | .unwrap() 48 | .unwrap(); 49 | assert_eq!(tx1_row, row); 50 | } 51 | 52 | #[traced_test] 53 | #[test] 54 | fn test_read_nonexistent() { 55 | let db = test_db(); 56 | let tx = db.begin_tx(); 57 | let row = db.read( 58 | tx, 59 | RowID { 60 | table_id: 1, 61 | row_id: 1, 62 | }, 63 | ); 64 | assert!(row.unwrap().is_none()); 65 | } 66 | 67 | #[traced_test] 68 | #[test] 69 | fn 
test_delete() { 70 | let db = test_db(); 71 | 72 | let tx1 = db.begin_tx(); 73 | let tx1_row = Row { 74 | id: RowID { 75 | table_id: 1, 76 | row_id: 1, 77 | }, 78 | data: "Hello".to_string(), 79 | }; 80 | db.insert(tx1, tx1_row.clone()).unwrap(); 81 | let row = db 82 | .read( 83 | tx1, 84 | RowID { 85 | table_id: 1, 86 | row_id: 1, 87 | }, 88 | ) 89 | .unwrap() 90 | .unwrap(); 91 | assert_eq!(tx1_row, row); 92 | db.delete( 93 | tx1, 94 | RowID { 95 | table_id: 1, 96 | row_id: 1, 97 | }, 98 | ) 99 | .unwrap(); 100 | let row = db 101 | .read( 102 | tx1, 103 | RowID { 104 | table_id: 1, 105 | row_id: 1, 106 | }, 107 | ) 108 | .unwrap(); 109 | assert!(row.is_none()); 110 | db.commit_tx(tx1).unwrap(); 111 | 112 | let tx2 = db.begin_tx(); 113 | let row = db 114 | .read( 115 | tx2, 116 | RowID { 117 | table_id: 1, 118 | row_id: 1, 119 | }, 120 | ) 121 | .unwrap(); 122 | assert!(row.is_none()); 123 | } 124 | 125 | #[traced_test] 126 | #[test] 127 | fn test_delete_nonexistent() { 128 | let db = test_db(); 129 | let tx = db.begin_tx(); 130 | assert!(!db 131 | .delete( 132 | tx, 133 | RowID { 134 | table_id: 1, 135 | row_id: 1 136 | } 137 | ) 138 | .unwrap()); 139 | } 140 | 141 | #[traced_test] 142 | #[test] 143 | fn test_commit() { 144 | let db = test_db(); 145 | let tx1 = db.begin_tx(); 146 | let tx1_row = Row { 147 | id: RowID { 148 | table_id: 1, 149 | row_id: 1, 150 | }, 151 | data: "Hello".to_string(), 152 | }; 153 | db.insert(tx1, tx1_row.clone()).unwrap(); 154 | let row = db 155 | .read( 156 | tx1, 157 | RowID { 158 | table_id: 1, 159 | row_id: 1, 160 | }, 161 | ) 162 | .unwrap() 163 | .unwrap(); 164 | assert_eq!(tx1_row, row); 165 | let tx1_updated_row = Row { 166 | id: RowID { 167 | table_id: 1, 168 | row_id: 1, 169 | }, 170 | data: "World".to_string(), 171 | }; 172 | db.update(tx1, tx1_updated_row.clone()).unwrap(); 173 | let row = db 174 | .read( 175 | tx1, 176 | RowID { 177 | table_id: 1, 178 | row_id: 1, 179 | }, 180 | ) 181 | .unwrap() 182 | .unwrap(); 183 | 
assert_eq!(tx1_updated_row, row); 184 | db.commit_tx(tx1).unwrap(); 185 | 186 | let tx2 = db.begin_tx(); 187 | let row = db 188 | .read( 189 | tx2, 190 | RowID { 191 | table_id: 1, 192 | row_id: 1, 193 | }, 194 | ) 195 | .unwrap() 196 | .unwrap(); 197 | db.commit_tx(tx2).unwrap(); 198 | assert_eq!(tx1_updated_row, row); 199 | db.drop_unused_row_versions(); 200 | } 201 | 202 | #[traced_test] 203 | #[test] 204 | fn test_rollback() { 205 | let db = test_db(); 206 | let tx1 = db.begin_tx(); 207 | let row1 = Row { 208 | id: RowID { 209 | table_id: 1, 210 | row_id: 1, 211 | }, 212 | data: "Hello".to_string(), 213 | }; 214 | db.insert(tx1, row1.clone()).unwrap(); 215 | let row2 = db 216 | .read( 217 | tx1, 218 | RowID { 219 | table_id: 1, 220 | row_id: 1, 221 | }, 222 | ) 223 | .unwrap() 224 | .unwrap(); 225 | assert_eq!(row1, row2); 226 | let row3 = Row { 227 | id: RowID { 228 | table_id: 1, 229 | row_id: 1, 230 | }, 231 | data: "World".to_string(), 232 | }; 233 | db.update(tx1, row3.clone()).unwrap(); 234 | let row4 = db 235 | .read( 236 | tx1, 237 | RowID { 238 | table_id: 1, 239 | row_id: 1, 240 | }, 241 | ) 242 | .unwrap() 243 | .unwrap(); 244 | assert_eq!(row3, row4); 245 | db.rollback_tx(tx1); 246 | let tx2 = db.begin_tx(); 247 | let row5 = db 248 | .read( 249 | tx2, 250 | RowID { 251 | table_id: 1, 252 | row_id: 1, 253 | }, 254 | ) 255 | .unwrap(); 256 | assert_eq!(row5, None); 257 | } 258 | 259 | #[traced_test] 260 | #[test] 261 | fn test_dirty_write() { 262 | let db = test_db(); 263 | 264 | // T1 inserts a row with ID 1, but does not commit. 
265 | let tx1 = db.begin_tx(); 266 | let tx1_row = Row { 267 | id: RowID { 268 | table_id: 1, 269 | row_id: 1, 270 | }, 271 | data: "Hello".to_string(), 272 | }; 273 | db.insert(tx1, tx1_row.clone()).unwrap(); 274 | let row = db 275 | .read( 276 | tx1, 277 | RowID { 278 | table_id: 1, 279 | row_id: 1, 280 | }, 281 | ) 282 | .unwrap() 283 | .unwrap(); 284 | assert_eq!(tx1_row, row); 285 | 286 | // T2 attempts to delete row with ID 1, but fails because T1 has not committed. 287 | let tx2 = db.begin_tx(); 288 | let tx2_row = Row { 289 | id: RowID { 290 | table_id: 1, 291 | row_id: 1, 292 | }, 293 | data: "World".to_string(), 294 | }; 295 | assert!(!db.update(tx2, tx2_row).unwrap()); 296 | 297 | let row = db 298 | .read( 299 | tx1, 300 | RowID { 301 | table_id: 1, 302 | row_id: 1, 303 | }, 304 | ) 305 | .unwrap() 306 | .unwrap(); 307 | assert_eq!(tx1_row, row); 308 | } 309 | 310 | #[traced_test] 311 | #[test] 312 | fn test_dirty_read() { 313 | let db = test_db(); 314 | 315 | // T1 inserts a row with ID 1, but does not commit. 316 | let tx1 = db.begin_tx(); 317 | let row1 = Row { 318 | id: RowID { 319 | table_id: 1, 320 | row_id: 1, 321 | }, 322 | data: "Hello".to_string(), 323 | }; 324 | db.insert(tx1, row1).unwrap(); 325 | 326 | // T2 attempts to read row with ID 1, but doesn't see one because T1 has not committed. 327 | let tx2 = db.begin_tx(); 328 | let row2 = db 329 | .read( 330 | tx2, 331 | RowID { 332 | table_id: 1, 333 | row_id: 1, 334 | }, 335 | ) 336 | .unwrap(); 337 | assert_eq!(row2, None); 338 | } 339 | 340 | #[traced_test] 341 | #[test] 342 | fn test_dirty_read_deleted() { 343 | let db = test_db(); 344 | 345 | // T1 inserts a row with ID 1 and commits. 346 | let tx1 = db.begin_tx(); 347 | let tx1_row = Row { 348 | id: RowID { 349 | table_id: 1, 350 | row_id: 1, 351 | }, 352 | data: "Hello".to_string(), 353 | }; 354 | db.insert(tx1, tx1_row.clone()).unwrap(); 355 | db.commit_tx(tx1).unwrap(); 356 | 357 | // T2 deletes row with ID 1, but does not commit. 
358 | let tx2 = db.begin_tx(); 359 | assert!(db 360 | .delete( 361 | tx2, 362 | RowID { 363 | table_id: 1, 364 | row_id: 1 365 | } 366 | ) 367 | .unwrap()); 368 | 369 | // T3 reads row with ID 1, but doesn't see the delete because T2 hasn't committed. 370 | let tx3 = db.begin_tx(); 371 | let row = db 372 | .read( 373 | tx3, 374 | RowID { 375 | table_id: 1, 376 | row_id: 1, 377 | }, 378 | ) 379 | .unwrap() 380 | .unwrap(); 381 | assert_eq!(tx1_row, row); 382 | } 383 | 384 | #[traced_test] 385 | #[test] 386 | fn test_fuzzy_read() { 387 | let db = test_db(); 388 | 389 | // T1 inserts a row with ID 1 and commits. 390 | let tx1 = db.begin_tx(); 391 | let tx1_row = Row { 392 | id: RowID { 393 | table_id: 1, 394 | row_id: 1, 395 | }, 396 | data: "Hello".to_string(), 397 | }; 398 | db.insert(tx1, tx1_row.clone()).unwrap(); 399 | let row = db 400 | .read( 401 | tx1, 402 | RowID { 403 | table_id: 1, 404 | row_id: 1, 405 | }, 406 | ) 407 | .unwrap() 408 | .unwrap(); 409 | assert_eq!(tx1_row, row); 410 | db.commit_tx(tx1).unwrap(); 411 | 412 | // T2 reads the row with ID 1 within an active transaction. 413 | let tx2 = db.begin_tx(); 414 | let row = db 415 | .read( 416 | tx2, 417 | RowID { 418 | table_id: 1, 419 | row_id: 1, 420 | }, 421 | ) 422 | .unwrap() 423 | .unwrap(); 424 | assert_eq!(tx1_row, row); 425 | 426 | // T3 updates the row and commits. 427 | let tx3 = db.begin_tx(); 428 | let tx3_row = Row { 429 | id: RowID { 430 | table_id: 1, 431 | row_id: 1, 432 | }, 433 | data: "World".to_string(), 434 | }; 435 | db.update(tx3, tx3_row).unwrap(); 436 | db.commit_tx(tx3).unwrap(); 437 | 438 | // T2 still reads the same version of the row as before. 
439 | let row = db 440 | .read( 441 | tx2, 442 | RowID { 443 | table_id: 1, 444 | row_id: 1, 445 | }, 446 | ) 447 | .unwrap() 448 | .unwrap(); 449 | assert_eq!(tx1_row, row); 450 | } 451 | 452 | #[traced_test] 453 | #[test] 454 | fn test_lost_update() { 455 | let db = test_db(); 456 | 457 | // T1 inserts a row with ID 1 and commits. 458 | let tx1 = db.begin_tx(); 459 | let tx1_row = Row { 460 | id: RowID { 461 | table_id: 1, 462 | row_id: 1, 463 | }, 464 | data: "Hello".to_string(), 465 | }; 466 | db.insert(tx1, tx1_row.clone()).unwrap(); 467 | let row = db 468 | .read( 469 | tx1, 470 | RowID { 471 | table_id: 1, 472 | row_id: 1, 473 | }, 474 | ) 475 | .unwrap() 476 | .unwrap(); 477 | assert_eq!(tx1_row, row); 478 | db.commit_tx(tx1).unwrap(); 479 | 480 | // T2 attempts to update row ID 1 within an active transaction. 481 | let tx2 = db.begin_tx(); 482 | let tx2_row = Row { 483 | id: RowID { 484 | table_id: 1, 485 | row_id: 1, 486 | }, 487 | data: "World".to_string(), 488 | }; 489 | assert!(db.update(tx2, tx2_row.clone()).unwrap()); 490 | 491 | // T3 also attempts to update row ID 1 within an active transaction. 492 | let tx3 = db.begin_tx(); 493 | let tx3_row = Row { 494 | id: RowID { 495 | table_id: 1, 496 | row_id: 1, 497 | }, 498 | data: "Hello, world!".to_string(), 499 | }; 500 | assert_eq!( 501 | Err(DatabaseError::WriteWriteConflict), 502 | db.update(tx3, tx3_row) 503 | ); 504 | 505 | db.commit_tx(tx2).unwrap(); 506 | assert_eq!(Err(DatabaseError::TxTerminated), db.commit_tx(tx3)); 507 | 508 | let tx4 = db.begin_tx(); 509 | let row = db 510 | .read( 511 | tx4, 512 | RowID { 513 | table_id: 1, 514 | row_id: 1, 515 | }, 516 | ) 517 | .unwrap() 518 | .unwrap(); 519 | assert_eq!(tx2_row, row); 520 | } 521 | 522 | // Test for the visibility to check if a new transaction can see old committed values. 
523 | // This test checks for the typo present in the paper, explained in https://github.com/penberg/mvcc-rs/issues/15 524 | #[traced_test] 525 | #[test] 526 | fn test_committed_visibility() { 527 | let db = test_db(); 528 | 529 | // let's add $10 to my account since I like money 530 | let tx1 = db.begin_tx(); 531 | let tx1_row = Row { 532 | id: RowID { 533 | table_id: 1, 534 | row_id: 1, 535 | }, 536 | data: "10".to_string(), 537 | }; 538 | db.insert(tx1, tx1_row.clone()).unwrap(); 539 | db.commit_tx(tx1).unwrap(); 540 | 541 | // but I like more money, so let me try adding $10 more 542 | let tx2 = db.begin_tx(); 543 | let tx2_row = Row { 544 | id: RowID { 545 | table_id: 1, 546 | row_id: 1, 547 | }, 548 | data: "20".to_string(), 549 | }; 550 | assert!(db.update(tx2, tx2_row.clone()).unwrap()); 551 | let row = db 552 | .read( 553 | tx2, 554 | RowID { 555 | table_id: 1, 556 | row_id: 1, 557 | }, 558 | ) 559 | .unwrap() 560 | .unwrap(); 561 | assert_eq!(row, tx2_row); 562 | 563 | // can I check how much money I have? 
564 | let tx3 = db.begin_tx(); 565 | let row = db 566 | .read( 567 | tx3, 568 | RowID { 569 | table_id: 1, 570 | row_id: 1, 571 | }, 572 | ) 573 | .unwrap() 574 | .unwrap(); 575 | assert_eq!(tx1_row, row); 576 | } 577 | 578 | // Test to check if a older transaction can see (un)committed future rows 579 | #[traced_test] 580 | #[test] 581 | fn test_future_row() { 582 | let db = test_db(); 583 | 584 | let tx1 = db.begin_tx(); 585 | 586 | let tx2 = db.begin_tx(); 587 | let tx2_row = Row { 588 | id: RowID { 589 | table_id: 1, 590 | row_id: 1, 591 | }, 592 | data: "10".to_string(), 593 | }; 594 | db.insert(tx2, tx2_row).unwrap(); 595 | 596 | // transaction in progress, so tx1 shouldn't be able to see the value 597 | let row = db 598 | .read( 599 | tx1, 600 | RowID { 601 | table_id: 1, 602 | row_id: 1, 603 | }, 604 | ) 605 | .unwrap(); 606 | assert_eq!(row, None); 607 | 608 | // lets commit the transaction and check if tx1 can see it 609 | db.commit_tx(tx2).unwrap(); 610 | let row = db 611 | .read( 612 | tx1, 613 | RowID { 614 | table_id: 1, 615 | row_id: 1, 616 | }, 617 | ) 618 | .unwrap(); 619 | assert_eq!(row, None); 620 | } 621 | 622 | #[traced_test] 623 | #[test] 624 | fn test_storage1() { 625 | let clock = LocalClock::new(); 626 | let mut path = std::env::temp_dir(); 627 | path.push(format!( 628 | "mvcc-rs-storage-test-{}", 629 | std::time::SystemTime::now() 630 | .duration_since(std::time::UNIX_EPOCH) 631 | .unwrap() 632 | .as_nanos(), 633 | )); 634 | let storage = crate::persistent_storage::Storage::new_json_on_disk(path.clone()); 635 | let db = Database::new(clock, storage); 636 | 637 | let tx1 = db.begin_tx(); 638 | let tx2 = db.begin_tx(); 639 | let tx3 = db.begin_tx(); 640 | 641 | db.insert( 642 | tx3, 643 | Row { 644 | id: RowID { 645 | table_id: 1, 646 | row_id: 1, 647 | }, 648 | data: "testme".to_string(), 649 | }, 650 | ) 651 | .unwrap(); 652 | 653 | db.commit_tx(tx1).unwrap(); 654 | db.rollback_tx(tx2); 655 | db.commit_tx(tx3).unwrap(); 656 | 657 | let 
tx4 = db.begin_tx(); 658 | db.insert( 659 | tx4, 660 | Row { 661 | id: RowID { 662 | table_id: 1, 663 | row_id: 2, 664 | }, 665 | data: "testme2".to_string(), 666 | }, 667 | ) 668 | .unwrap(); 669 | db.insert( 670 | tx4, 671 | Row { 672 | id: RowID { 673 | table_id: 1, 674 | row_id: 3, 675 | }, 676 | data: "testme3".to_string(), 677 | }, 678 | ) 679 | .unwrap(); 680 | 681 | assert_eq!( 682 | db.read( 683 | tx4, 684 | RowID { 685 | table_id: 1, 686 | row_id: 1 687 | } 688 | ) 689 | .unwrap() 690 | .unwrap() 691 | .data, 692 | "testme" 693 | ); 694 | assert_eq!( 695 | db.read( 696 | tx4, 697 | RowID { 698 | table_id: 1, 699 | row_id: 2 700 | } 701 | ) 702 | .unwrap() 703 | .unwrap() 704 | .data, 705 | "testme2" 706 | ); 707 | assert_eq!( 708 | db.read( 709 | tx4, 710 | RowID { 711 | table_id: 1, 712 | row_id: 3 713 | } 714 | ) 715 | .unwrap() 716 | .unwrap() 717 | .data, 718 | "testme3" 719 | ); 720 | db.commit_tx(tx4).unwrap(); 721 | 722 | let clock = LocalClock::new(); 723 | let storage = crate::persistent_storage::Storage::new_json_on_disk(path); 724 | let db: Database = Database::new(clock, storage); 725 | db.recover().unwrap(); 726 | println!("{:#?}", db); 727 | 728 | let tx5 = db.begin_tx(); 729 | println!( 730 | "{:#?}", 731 | db.read( 732 | tx5, 733 | RowID { 734 | table_id: 1, 735 | row_id: 1 736 | } 737 | ) 738 | ); 739 | assert_eq!( 740 | db.read( 741 | tx5, 742 | RowID { 743 | table_id: 1, 744 | row_id: 1 745 | } 746 | ) 747 | .unwrap() 748 | .unwrap() 749 | .data, 750 | "testme" 751 | ); 752 | assert_eq!( 753 | db.read( 754 | tx5, 755 | RowID { 756 | table_id: 1, 757 | row_id: 2 758 | } 759 | ) 760 | .unwrap() 761 | .unwrap() 762 | .data, 763 | "testme2" 764 | ); 765 | assert_eq!( 766 | db.read( 767 | tx5, 768 | RowID { 769 | table_id: 1, 770 | row_id: 3 771 | } 772 | ) 773 | .unwrap() 774 | .unwrap() 775 | .data, 776 | "testme3" 777 | ); 778 | } 779 | 780 | /* States described in the Hekaton paper *for serializability*: 781 | 782 | Table 1: Case 
analysis of action to take when version V’s 783 | Begin field contains the ID of transaction TB 784 | ------------------------------------------------------------------------------------------------------ 785 | TB’s state | TB’s end timestamp | Action to take when transaction T checks visibility of version V. 786 | ------------------------------------------------------------------------------------------------------ 787 | Active | Not set | V is visible only if TB=T and V’s end timestamp equals infinity. 788 | ------------------------------------------------------------------------------------------------------ 789 | Preparing | TS | V’s begin timestamp will be TS but V is not yet committed. Use TS 790 | | as V’s begin time when testing visibility. If the test is true, 791 | | allow T to speculatively read V. 792 | | 793 | | 794 | ------------------------------------------------------------------------------------------------------ 795 | Committed | TS | V’s begin timestamp will be TS and V is committed. Use TS as V’s 796 | | begin time to test visibility. 797 | ------------------------------------------------------------------------------------------------------ 798 | Aborted | Irrelevant | Ignore V; it’s a garbage version. 799 | ------------------------------------------------------------------------------------------------------ 800 | Terminated | Irrelevant | Reread V’s Begin field. TB has terminated so it must have finalized 801 | or not found | | the timestamp. 802 | ------------------------------------------------------------------------------------------------------ 803 | 804 | Table 2: Case analysis of action to take when V's End field 805 | contains a transaction ID TE. 
806 | ------------------------------------------------------------------------------------------------------ 807 | TE’s state | TE’s end timestamp | Action to take when transaction T checks visibility of a version V 808 | | | as of read time RT. 809 | ------------------------------------------------------------------------------------------------------ 810 | Active | Not set | V is visible only if TE is not T. 811 | ------------------------------------------------------------------------------------------------------ 812 | Preparing | TS | V’s end timestamp will be TS provided that TE commits. If TS > RT, 813 | | V is visible to T. If TS < RT, T speculatively ignores V. 814 | ------------------------------------------------------------------------------------------------------ 815 | Committed | TS | V’s end timestamp will be TS and V is committed. Use TS as V’s end 816 | | timestamp when testing visibility. 817 | ------------------------------------------------------------------------------------------------------ 818 | Aborted | Irrelevant | V is visible. 819 | ------------------------------------------------------------------------------------------------------ 820 | Terminated | Irrelevant | Reread V’s End field. TE has terminated so it must have finalized 821 | or not found | | the timestamp. 
822 | */ 823 | 824 | fn new_tx(tx_id: TxID, begin_ts: u64, state: TransactionState) -> RwLock { 825 | let state = state.into(); 826 | RwLock::new(Transaction { 827 | state, 828 | tx_id, 829 | begin_ts, 830 | write_set: SkipSet::new(), 831 | read_set: SkipSet::new(), 832 | }) 833 | } 834 | 835 | #[traced_test] 836 | #[test] 837 | fn test_snapshot_isolation_tx_visible1() { 838 | let txs: SkipMap> = SkipMap::from_iter([ 839 | (1, new_tx(1, 1, TransactionState::Committed(2))), 840 | (2, new_tx(2, 2, TransactionState::Committed(5))), 841 | (3, new_tx(3, 3, TransactionState::Aborted)), 842 | (5, new_tx(5, 5, TransactionState::Preparing)), 843 | (6, new_tx(6, 6, TransactionState::Committed(10))), 844 | (7, new_tx(7, 7, TransactionState::Active)), 845 | ]); 846 | 847 | let current_tx = new_tx(4, 4, TransactionState::Preparing); 848 | let current_tx = current_tx.read().unwrap(); 849 | 850 | let rv_visible = |begin: TxTimestampOrID, end: Option| { 851 | let row_version = RowVersion { 852 | begin, 853 | end, 854 | row: Row { 855 | id: RowID { 856 | table_id: 1, 857 | row_id: 1, 858 | }, 859 | data: "testme".to_string(), 860 | }, 861 | }; 862 | tracing::debug!("Testing visibility of {row_version:?}"); 863 | is_version_visible(&txs, ¤t_tx, &row_version) 864 | }; 865 | 866 | // begin visible: transaction committed with ts < current_tx.begin_ts 867 | // end visible: inf 868 | assert!(rv_visible(TxTimestampOrID::TxID(1), None)); 869 | 870 | // begin invisible: transaction committed with ts > current_tx.begin_ts 871 | assert!(!rv_visible(TxTimestampOrID::TxID(2), None)); 872 | 873 | // begin invisible: transaction aborted 874 | assert!(!rv_visible(TxTimestampOrID::TxID(3), None)); 875 | 876 | // begin visible: timestamp < current_tx.begin_ts 877 | // end invisible: transaction committed with ts > current_tx.begin_ts 878 | assert!(!rv_visible( 879 | TxTimestampOrID::Timestamp(0), 880 | Some(TxTimestampOrID::TxID(1)) 881 | )); 882 | 883 | // begin visible: timestamp < 
current_tx.begin_ts 884 | // end visible: transaction committed with ts < current_tx.begin_ts 885 | assert!(rv_visible( 886 | TxTimestampOrID::Timestamp(0), 887 | Some(TxTimestampOrID::TxID(2)) 888 | )); 889 | 890 | // begin visible: timestamp < current_tx.begin_ts 891 | // end invisible: transaction aborted 892 | assert!(!rv_visible( 893 | TxTimestampOrID::Timestamp(0), 894 | Some(TxTimestampOrID::TxID(3)) 895 | )); 896 | 897 | // begin invisible: transaction preparing 898 | assert!(!rv_visible(TxTimestampOrID::TxID(5), None)); 899 | 900 | // begin invisible: transaction committed with ts > current_tx.begin_ts 901 | assert!(!rv_visible(TxTimestampOrID::TxID(6), None)); 902 | 903 | // begin invisible: transaction active 904 | assert!(!rv_visible(TxTimestampOrID::TxID(7), None)); 905 | 906 | // begin invisible: transaction committed with ts > current_tx.begin_ts 907 | assert!(!rv_visible(TxTimestampOrID::TxID(6), None)); 908 | 909 | // begin invisible: transaction active 910 | assert!(!rv_visible(TxTimestampOrID::TxID(7), None)); 911 | 912 | // begin visible: timestamp < current_tx.begin_ts 913 | // end invisible: transaction preparing 914 | assert!(!rv_visible( 915 | TxTimestampOrID::Timestamp(0), 916 | Some(TxTimestampOrID::TxID(5)) 917 | )); 918 | 919 | // begin invisible: timestamp > current_tx.begin_ts 920 | assert!(!rv_visible( 921 | TxTimestampOrID::Timestamp(6), 922 | Some(TxTimestampOrID::TxID(6)) 923 | )); 924 | 925 | // begin visible: timestamp < current_tx.begin_ts 926 | // end visible: some active transaction will eventually overwrite this version, 927 | // but that hasn't happened 928 | // (this is the https://avi.im/blag/2023/hekaton-paper-typo/ case, I believe!) 
929 | assert!(rv_visible( 930 | TxTimestampOrID::Timestamp(0), 931 | Some(TxTimestampOrID::TxID(7)) 932 | )); 933 | } 934 | -------------------------------------------------------------------------------- /mvcc-rs/src/errors.rs: -------------------------------------------------------------------------------- 1 | use thiserror::Error; 2 | 3 | #[derive(Error, Debug, PartialEq)] 4 | pub enum DatabaseError { 5 | #[error("no such transaction ID: `{0}`")] 6 | NoSuchTransactionID(u64), 7 | #[error("transaction aborted because of a write-write conflict")] 8 | WriteWriteConflict, 9 | #[error("transaction is terminated")] 10 | TxTerminated, 11 | #[error("I/O error: {0}")] 12 | Io(String), 13 | } 14 | -------------------------------------------------------------------------------- /mvcc-rs/src/lib.rs: -------------------------------------------------------------------------------- 1 | //! Multiversion concurrency control (MVCC) for Rust. 2 | //! 3 | //! This module implements the main memory MVCC method outlined in the paper 4 | //! "High-Performance Concurrency Control Mechanisms for Main-Memory Databases" 5 | //! by Per-Åke Larson et al (VLDB, 2011). 6 | //! 7 | //! ## Data anomalies 8 | //! 9 | //! * A *dirty write* occurs when transaction T_m updates a value that is written by 10 | //! transaction T_n but not yet committed. The MVCC algorithm prevents dirty 11 | //! writes by validating that a row version is visible to transaction T_m before 12 | //! allowing update to it. 13 | //! 14 | //! * A *dirty read* occurs when transaction T_m reads a value that was written by 15 | //! transaction T_n but not yet committed. The MVCC algorithm prevents dirty 16 | //! reads by validating that a row version is visible to transaction T_m. 17 | //! 18 | //! * A *fuzzy read* (non-repeatable read) occurs when transaction T_m reads a 19 | //! different value in the course of the transaction because another 20 | //! transaction T_n has updated the value. 21 | //! 22 | //! 
* A *lost update* occurs when transactions T_m and T_n both attempt to update 23 | //! the same value, resulting in one of the updates being lost. The MVCC algorithm 24 | //! prevents lost updates by detecting the write-write conflict and letting the 25 | //! first-writer win by aborting the later transaction. 26 | //! 27 | //! TODO: phantom reads, cursor lost updates, read skew, write skew. 28 | //! 29 | //! ## TODO 30 | //! 31 | //! * Optimistic reads and writes 32 | //! * Garbage collection 33 | 34 | pub mod clock; 35 | pub mod cursor; 36 | pub mod database; 37 | pub mod errors; 38 | pub mod persistent_storage; 39 | -------------------------------------------------------------------------------- /mvcc-rs/src/persistent_storage/mod.rs: -------------------------------------------------------------------------------- 1 | use serde::Serialize; 2 | use serde::de::DeserializeOwned; 3 | use std::fmt::Debug; 4 | 5 | use crate::database::{LogRecord, Result}; 6 | use crate::errors::DatabaseError; 7 | 8 | pub mod s3; 9 | 10 | #[derive(Debug)] 11 | pub enum Storage { 12 | Noop, 13 | JsonOnDisk(std::path::PathBuf), 14 | S3(s3::Replicator), 15 | } 16 | 17 | impl Storage { 18 | pub fn new_noop() -> Self { 19 | Self::Noop 20 | } 21 | 22 | pub fn new_json_on_disk(path: impl Into) -> Self { 23 | let path = path.into(); 24 | Self::JsonOnDisk(path) 25 | } 26 | 27 | pub fn new_s3(options: s3::Options) -> Result { 28 | let replicator = futures::executor::block_on(s3::Replicator::new(options))?; 29 | Ok(Self::S3(replicator)) 30 | } 31 | } 32 | 33 | impl Storage { 34 | pub fn log_tx(&self, m: LogRecord) -> Result<()> { 35 | match self { 36 | Self::JsonOnDisk(path) => { 37 | use std::io::Write; 38 | let t = serde_json::to_vec(&m).map_err(|e| DatabaseError::Io(e.to_string()))?; 39 | let mut file = std::fs::OpenOptions::new() 40 | .create(true) 41 | .append(true) 42 | .open(path) 43 | .map_err(|e| DatabaseError::Io(e.to_string()))?; 44 | file.write_all(&t) 45 | .map_err(|e| 
DatabaseError::Io(e.to_string()))?; 46 | file.write_all(b"\n") 47 | .map_err(|e| DatabaseError::Io(e.to_string()))?; 48 | } 49 | Self::S3(replicator) => { 50 | futures::executor::block_on(replicator.replicate_tx(m))?; 51 | } 52 | Self::Noop => (), 53 | } 54 | Ok(()) 55 | } 56 | 57 | pub fn read_tx_log(&self) -> Result>> { 58 | match self { 59 | Self::JsonOnDisk(path) => { 60 | use std::io::BufRead; 61 | let file = std::fs::OpenOptions::new() 62 | .read(true) 63 | .open(path) 64 | .map_err(|e| DatabaseError::Io(e.to_string()))?; 65 | 66 | let mut records: Vec> = Vec::new(); 67 | let mut lines = std::io::BufReader::new(file).lines(); 68 | while let Some(Ok(line)) = lines.next() { 69 | records.push( 70 | serde_json::from_str(&line) 71 | .map_err(|e| DatabaseError::Io(e.to_string()))?, 72 | ) 73 | } 74 | Ok(records) 75 | } 76 | Self::S3(replicator) => futures::executor::block_on(replicator.read_tx_log()), 77 | Self::Noop => Err(crate::errors::DatabaseError::Io( 78 | "cannot read from Noop storage".to_string(), 79 | )), 80 | } 81 | } 82 | } 83 | -------------------------------------------------------------------------------- /mvcc-rs/src/persistent_storage/s3.rs: -------------------------------------------------------------------------------- 1 | use crate::database::{LogRecord, Result}; 2 | use crate::errors::DatabaseError; 3 | use aws_sdk_s3::Client; 4 | use serde::Serialize; 5 | use serde::de::DeserializeOwned; 6 | use std::fmt::Debug; 7 | 8 | #[derive(Clone, Copy, Debug)] 9 | #[non_exhaustive] 10 | pub struct Options { 11 | pub create_bucket_if_not_exists: bool, 12 | } 13 | 14 | impl Options { 15 | pub fn with_create_bucket_if_not_exists(create_bucket_if_not_exists: bool) -> Self { 16 | Self { 17 | create_bucket_if_not_exists, 18 | } 19 | } 20 | } 21 | 22 | #[derive(Debug)] 23 | pub struct Replicator { 24 | pub client: Client, 25 | pub bucket: String, 26 | pub prefix: String, 27 | } 28 | 29 | impl Replicator { 30 | pub async fn new(options: Options) -> Result { 31 | 
let mut loader = aws_config::from_env(); 32 | if let Ok(endpoint) = std::env::var("MVCCRS_ENDPOINT") { 33 | loader = loader.endpoint_url(endpoint); 34 | } 35 | let sdk_config = loader.load().await; 36 | let config = aws_sdk_s3::config::Builder::from(&sdk_config) 37 | .force_path_style(true) 38 | .build(); 39 | let bucket = std::env::var("MVCCRS_BUCKET").unwrap_or_else(|_| "mvccrs".to_string()); 40 | let prefix = std::env::var("MVCCRS_PREFIX").unwrap_or_else(|_| "tx".to_string()); 41 | let client = Client::from_conf(config); 42 | 43 | match client.head_bucket().bucket(&bucket).send().await { 44 | Ok(_) => tracing::info!("Bucket {bucket} exists and is accessible"), 45 | Err(aws_sdk_s3::error::SdkError::ServiceError(err)) if err.err().is_not_found() => { 46 | if options.create_bucket_if_not_exists { 47 | tracing::info!("Bucket {bucket} not found, recreating"); 48 | client 49 | .create_bucket() 50 | .bucket(&bucket) 51 | .send() 52 | .await 53 | .map_err(|e| DatabaseError::Io(e.to_string()))?; 54 | } else { 55 | tracing::error!("Bucket {bucket} does not exist"); 56 | return Err(DatabaseError::Io(err.err().to_string())); 57 | } 58 | } 59 | Err(e) => { 60 | tracing::error!("Bucket checking error: {e}"); 61 | return Err(DatabaseError::Io(e.to_string())); 62 | } 63 | } 64 | 65 | Ok(Self { 66 | client, 67 | bucket, 68 | prefix, 69 | }) 70 | } 71 | 72 | pub async fn replicate_tx(&self, record: LogRecord) -> Result<()> { 73 | let key = format!("{}-{:020}", self.prefix, record.tx_timestamp); 74 | tracing::trace!("Replicating {key}"); 75 | let body = serde_json::to_vec(&record).map_err(|e| DatabaseError::Io(e.to_string()))?; 76 | let resp = self 77 | .client 78 | .put_object() 79 | .bucket(&self.bucket) 80 | .key(&key) 81 | .body(body.into()) 82 | .send() 83 | .await 84 | .map_err(|e| DatabaseError::Io(e.to_string()))?; 85 | tracing::trace!("Replicator response: {:?}", resp); 86 | Ok(()) 87 | } 88 | 89 | pub async fn read_tx_log(&self) -> Result>> { 90 | let mut records: Vec> = 
Vec::new(); 91 | // Read all objects from the bucket, one log record is stored in one object 92 | let mut next_token = None; 93 | loop { 94 | let mut req = self 95 | .client 96 | .list_objects_v2() 97 | .bucket(&self.bucket) 98 | .prefix(&self.prefix); 99 | if let Some(next_token) = next_token { 100 | req = req.continuation_token(next_token); 101 | } 102 | let resp = req 103 | .send() 104 | .await 105 | .map_err(|e| DatabaseError::Io(e.to_string()))?; 106 | tracing::trace!("List objects response: {:?}", resp); 107 | if let Some(contents) = resp.contents { 108 | // read the record from s3 based on the object metadata (`contents`) 109 | // and store it in the `records` vector 110 | for object in contents { 111 | let key = object.key.unwrap(); 112 | let resp = self 113 | .client 114 | .get_object() 115 | .bucket(&self.bucket) 116 | .key(&key) 117 | .send() 118 | .await 119 | .map_err(|e| DatabaseError::Io(e.to_string()))?; 120 | tracing::trace!("Get object response: {:?}", resp); 121 | let body = resp 122 | .body 123 | .collect() 124 | .await 125 | .map_err(|e| DatabaseError::Io(e.to_string()))?; 126 | let record: LogRecord = serde_json::from_slice(&body.into_bytes()) 127 | .map_err(|e| DatabaseError::Io(e.to_string()))?; 128 | records.push(record); 129 | } 130 | } 131 | if resp.next_continuation_token.is_none() { 132 | break; 133 | } 134 | next_token = resp.next_continuation_token; 135 | } 136 | tracing::trace!("Records: {records:?}"); 137 | Ok(records) 138 | } 139 | } 140 | -------------------------------------------------------------------------------- /mvcc-rs/tests/concurrency_test.rs: -------------------------------------------------------------------------------- 1 | use mvcc_rs::clock::LocalClock; 2 | use mvcc_rs::database::{Database, Row, RowID}; 3 | use std::sync::atomic::AtomicU64; 4 | use std::sync::atomic::Ordering; 5 | use std::sync::{Arc, Once}; 6 | 7 | static IDS: AtomicU64 = AtomicU64::new(1); 8 | 9 | static START: Once = Once::new(); 10 | 11 | 
#[test]
fn test_non_overlapping_concurrent_inserts() {
    START.call_once(|| {
        tracing_subscriber::fmt::init();
    });
    // Two writer threads hit the database at the same time, but each draws
    // its row IDs from the shared atomic counter, so their key ranges never
    // collide and every insert should commit cleanly.
    let clock = LocalClock::default();
    let storage = mvcc_rs::persistent_storage::Storage::new_noop();
    let db = Arc::new(Database::new(clock, storage));
    let iterations = 100000;

    // Spawns one writer: insert a fresh row, commit, then re-read it in a
    // second transaction and assert it round-tripped unchanged.
    let spawn_writer = |db, payload: &'static str| {
        let db: Arc<_> = db;
        std::thread::spawn(move || {
            for _ in 0..iterations {
                let tx = db.begin_tx();
                let row_id = RowID {
                    table_id: 1,
                    row_id: IDS.fetch_add(1, Ordering::SeqCst),
                };
                let row = Row {
                    id: row_id,
                    data: payload.to_string(),
                };
                db.insert(tx, row.clone()).unwrap();
                db.commit_tx(tx).unwrap();
                // Read back under a separate transaction so we observe only
                // committed state.
                let read_tx = db.begin_tx();
                let stored = db.read(read_tx, row_id).unwrap();
                db.commit_tx(read_tx).unwrap();
                assert_eq!(stored, Some(row));
            }
        })
    };

    let writers = vec![spawn_writer(db.clone(), "Hello"), spawn_writer(db, "World")];
    for writer in writers {
        writer.join().unwrap();
    }
}

#[test]
fn test_overlapping_concurrent_inserts_read_your_writes() {
    START.call_once(|| {
        tracing_subscriber::fmt::init();
    });
    // Four threads upsert into the same small set of row IDs (i % 16), so
    // write-write conflicts are expected; a losing upsert aborts that
    // iteration. A winning upsert must still observe its own uncommitted
    // write when reading inside the same transaction ("read your writes").
    let clock = LocalClock::default();
    let storage = mvcc_rs::persistent_storage::Storage::new_noop();
    let db = Arc::new(Database::new(clock, storage));
    let iterations = 100000;

    let work = |prefix: &'static str| {
        let db = db.clone();
        std::thread::spawn(move || {
            let mut failed_upserts = 0;
            for i in 0..iterations {
                // Periodic progress logging and garbage collection.
                if i % 1000 == 0 {
                    tracing::debug!("{prefix}: {i}");
                }
                if i % 10000 == 0 {
                    let dropped = db.drop_unused_row_versions();
                    tracing::debug!("garbage collected {dropped} versions");
                }
                let tx = db.begin_tx();
                let id = RowID {
                    table_id: 1,
                    row_id: i % 16,
                };
                let row = Row {
                    id,
                    data: format!("{prefix} @{tx}"),
                };
                match db.upsert(tx, row.clone()) {
                    Ok(_) => {
                        // Read-your-writes: the same transaction must see the
                        // row it just wrote, before and after commit.
                        let seen = db.read(tx, id).unwrap();
                        db.commit_tx(tx).unwrap();
                        assert_eq!(seen, Some(row));
                    }
                    Err(e) => {
                        // Lost the write-write conflict; count it and move on.
                        tracing::trace!("upsert failed: {e}");
                        failed_upserts += 1;
                    }
                }
            }
            tracing::info!(
                "{prefix}'s failed upserts: {failed_upserts}/{iterations} {:.2}%",
                (failed_upserts * 100) as f64 / iterations as f64
            );
        })
    };

    let threads = vec![work("A"), work("B"), work("C"), work("D")];
    for th in threads {
        th.join().unwrap();
    }
}