├── .gitignore ├── src ├── tran_mgr │ ├── mod.rs │ └── tran_mgr.rs ├── buf_mgr │ ├── mod.rs │ ├── lru.rs │ ├── buf_writer.rs │ └── buf_mgr.rs ├── log_mgr │ ├── mod.rs │ ├── log_mgr.rs │ ├── fs.rs │ ├── io.rs │ └── buf.rs ├── system │ ├── mod.rs │ ├── config.rs │ └── checkpointer.rs ├── block_mgr │ ├── mod.rs │ └── allocator.rs ├── common │ ├── mod.rs │ ├── str.rs │ ├── misc.rs │ ├── crc32.rs │ ├── defs.rs │ ├── intercom.rs │ └── errors.rs ├── storage │ ├── mod.rs │ ├── fs_ops.rs │ ├── driver.rs │ └── checkpoint_store.rs └── lib.rs ├── doc └── architecture.png ├── .github ├── workflows │ └── rust.yml └── ISSUE_TEMPLATE │ ├── feature_request.md │ └── bug_report.md ├── Cargo.toml ├── CONTRIBUTING.md ├── LICENSE ├── Cargo.lock └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | **/*.rs.bk 3 | -------------------------------------------------------------------------------- /src/tran_mgr/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod tran_mgr; 2 | -------------------------------------------------------------------------------- /src/buf_mgr/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod buf_mgr; 2 | pub mod buf_writer; 3 | pub mod lru; 4 | -------------------------------------------------------------------------------- /src/log_mgr/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod log_mgr; 2 | mod buf; 3 | mod fs; 4 | mod io; 5 | 6 | -------------------------------------------------------------------------------- /src/system/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod instance; 2 | pub mod config; 3 | pub mod checkpointer; 4 | -------------------------------------------------------------------------------- /doc/architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stencillogic/db-core/HEAD/doc/architecture.png -------------------------------------------------------------------------------- /src/block_mgr/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod block_mgr; 2 | pub mod free_info; 3 | pub mod block; 4 | pub mod allocator; 5 | -------------------------------------------------------------------------------- /src/common/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod errors; 2 | pub mod str; 3 | pub mod misc; 4 | pub mod crc32; 5 | pub mod defs; 6 | pub mod intercom; 7 | -------------------------------------------------------------------------------- /src/storage/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod driver; 2 | mod fs_ops; 3 | pub mod datastore; 4 | mod block_driver; 5 | mod checkpoint_store; 6 | mod version_store; 7 | -------------------------------------------------------------------------------- /.github/workflows/rust.yml: -------------------------------------------------------------------------------- 1 | name: Rust 2 | 3 | on: 4 | push: 5 | branches: [ master, develop ] 6 | pull_request: 7 | branches: [ master, develop ] 8 | 9 | env: 10 | CARGO_TERM_COLOR: always 11 | 12 | jobs: 13 | build: 14 | 15 | runs-on: ubuntu-latest 16 | 17 | steps: 18 | - uses: actions/checkout@v2 19 | - name: Build 20 | run: cargo build --verbose 21 | - name: Run tests 22 | run: cargo 
test --verbose 23 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "db-core" 3 | version = "0.2.1" 4 | authors = ["stencillogic"] 5 | edition = "2018" 6 | description = "dbms engine" 7 | license = "MIT" 8 | categories = ["database-implementations"] 9 | keywords = ["dbms"] 10 | repository = "https://github.com/stencillogic/db-core" 11 | readme = "README.md" 12 | 13 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 14 | 15 | [dependencies] 16 | fs2 = "0.4.3" 17 | log = "0.4.8" 18 | 19 | [dev-dependencies] 20 | rand = "0.8.4" 21 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 21 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | 4 | If you would like to contribute, please get in touch using [email](mailto:stencillogic@gmail.com). 5 | When contributing to this repository, please first discuss the change you wish to make via an issue, 6 | or any other method, with the owners of this repository. 7 | 8 | 9 | ### Code of conduct 10 | 11 | We expect participants to be open, welcoming, friendly, and patient. 12 | Unacceptable behavior includes: 13 | 14 | - The use of sexualized language or imagery and unwelcome sexual attention or advances 15 | - Trolling, insulting/derogatory comments, and personal or political attacks 16 | - Public or private harassment 17 | - Publishing others' private information, such as a physical or electronic address, without explicit permission 18 | - Other conduct which could reasonably be considered inappropriate in a professional setting 19 | 20 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 1. Go to '...' 16 | 2. Click on '....' 17 | 3. Scroll down to '....' 18 | 4. See error 19 | 20 | **Expected behavior** 21 | A clear and concise description of what you expected to happen. 22 | 23 | **Screenshots** 24 | If applicable, add screenshots to help explain your problem. 25 | 26 | **Desktop (please complete the following information):** 27 | - OS: [e.g. 
iOS] 28 | - Browser [e.g. chrome, safari] 29 | - Version [e.g. 22] 30 | 31 | **Smartphone (please complete the following information):** 32 | - Device: [e.g. iPhone6] 33 | - OS: [e.g. iOS8.1] 34 | - Browser [e.g. stock browser, safari] 35 | - Version [e.g. 22] 36 | 37 | **Additional context** 38 | Add any other context about the problem here. 39 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 stencillogic 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /src/common/str.rs: -------------------------------------------------------------------------------- 1 | //! 
Build &str using existing [u8] array 2 | /* 3 | use std; 4 | 5 | pub struct StrBuilder<'a> { 6 | buf: &'a mut [u8], 7 | written: usize, 8 | } 9 | 10 | impl<'a> StrBuilder<'a> { 11 | pub fn new(buf: &'a mut [u8]) -> Self { 12 | StrBuilder { 13 | buf, 14 | written: 0 15 | } 16 | } 17 | 18 | pub fn add(mut self, chunk: &str) -> Self { 19 | let src = chunk.as_bytes(); 20 | let dst = &mut self.buf[self.written..self.written + src.len()]; 21 | dst.copy_from_slice(src); 22 | self.written += src.len(); 23 | self 24 | } 25 | 26 | pub fn get_str(self) -> Result<&'a str, std::str::Utf8Error> { 27 | std::str::from_utf8(self.buf) 28 | } 29 | } 30 | 31 | #[cfg(test)] 32 | mod tests { 33 | 34 | use super::*; 35 | 36 | #[test] 37 | fn test_build() { 38 | 39 | let expected = "Hello, World!"; 40 | 41 | let mut buf = [0; 13]; 42 | 43 | let s = StrBuilder::new(&mut buf) 44 | .add("Hello") 45 | .add(", ") 46 | .add("World") 47 | .add("!") 48 | .add("") 49 | .get_str(); 50 | 51 | assert_eq!(s.unwrap(), expected); 52 | } 53 | 54 | #[test] 55 | #[should_panic] 56 | fn test_overflow() { 57 | 58 | let expected = "Hello, World!"; 59 | 60 | let mut buf = [0; 13]; 61 | 62 | let s = StrBuilder::new(&mut buf) 63 | .add("Hello") 64 | .add(", ") 65 | .add("World") 66 | .add("!") 67 | .add("a") 68 | .get_str(); 69 | 70 | assert_eq!(s.unwrap(), expected); 71 | } 72 | 73 | #[test] 74 | fn test_empty() { 75 | 76 | let expected = ""; 77 | 78 | let mut buf = [0; 0]; 79 | 80 | let s = StrBuilder::new(&mut buf) 81 | .add("") 82 | .add("") 83 | .add("") 84 | .get_str(); 85 | 86 | assert_eq!(s.unwrap(), expected); 87 | 88 | let s = StrBuilder::new(&mut buf) 89 | .get_str(); 90 | 91 | assert_eq!(s.unwrap(), expected); 92 | } 93 | 94 | #[test] 95 | fn test_incomplete() { 96 | 97 | let expected = "Hello"; 98 | 99 | let mut buf = [0; 13]; 100 | 101 | let s = StrBuilder::new(&mut buf) 102 | .add("Hello") 103 | .get_str(); 104 | 105 | assert_eq!(&s.unwrap()[..5], expected); 106 | } 107 | } 108 | */ 109 | -------------------------------------------------------------------------------- /src/buf_mgr/lru.rs: -------------------------------------------------------------------------------- 1 | /// LRU eviction mechanism implementation. 2 | 3 | use crate::buf_mgr::buf_mgr::EvictionMech; 4 | use crate::buf_mgr::buf_mgr::CacheItem; 5 | use crate::buf_mgr::buf_mgr::CacheItemIterator; 6 | 7 | 8 | /// Lru linked list node. 9 | struct LruNode { 10 | value: T, 11 | prev: Option<*mut LruNode>, 12 | next: Option<*mut LruNode>, 13 | } 14 | 15 | pub struct LruNodeRef { 16 | ptr: *mut LruNode 17 | } 18 | 19 | impl LruNodeRef { 20 | 21 | fn new(node: *mut LruNode) -> Self { 22 | LruNodeRef { 23 | ptr: node 24 | } 25 | } 26 | 27 | fn as_node_ptr(&self) -> *mut LruNode { 28 | self.ptr 29 | } 30 | } 31 | 32 | 33 | impl CacheItem for LruNodeRef { 34 | 35 | fn get_value(&self) -> &T { 36 | unsafe { &(*self.ptr).value } 37 | } 38 | 39 | fn get_value_mut(&mut self) -> &mut T { 40 | unsafe { &mut (*self.ptr).value } 41 | } 42 | 43 | fn clone(&self) -> Self { 44 | LruNodeRef { 45 | ptr: self.ptr, 46 | } 47 | } 48 | } 49 | 50 | 51 | 52 | /// Lru eviction mechanism implementation with linked list. 
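///
/// A rough usage sketch in terms of the `EvictionMech`/`CacheItem` traits defined in
/// `buf_mgr` (illustrative only; the item values below are arbitrary):
///
/// ```ignore
/// let mut lru = LruList::new(1u32);   // the list always starts with one initial item
/// lru.add_item(2);
/// lru.add_item(3);
///
/// let mut it = lru.iter();            // iterates from the coldest item (tail) towards the head
/// let coldest = it.next().unwrap();
/// assert_eq!(*coldest.get_value(), 1);
///
/// lru.on_access(coldest);             // accessing an item moves it back to the head
/// ```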
53 | pub struct LruList { 54 | head: *mut LruNode, 55 | tail: *mut LruNode, 56 | } 57 | 58 | 59 | impl EvictionMech, LruNodeIter> for LruList { 60 | 61 | fn new(value: T) -> Self { 62 | let node = Box::into_raw(Box::new(LruNode { 63 | value, 64 | prev: None, 65 | next: None, 66 | })); 67 | 68 | LruList { 69 | head: node, 70 | tail: node, 71 | } 72 | } 73 | 74 | // register an item in cache. 75 | fn add_item(&mut self, value: T) { 76 | 77 | let node = Box::into_raw(Box::new(LruNode { 78 | value, 79 | prev: Some(self.head), 80 | next: None, 81 | })); 82 | 83 | unsafe { (*self.head).next = Some(node); } 84 | 85 | self.head = node; 86 | } 87 | 88 | // updates eviction priority of the item. 89 | fn on_access(&mut self, item: LruNodeRef) { 90 | 91 | let node = item.as_node_ptr(); 92 | 93 | // move node to head position 94 | unsafe { 95 | if let Some(next) = (*node).next { 96 | (*next).prev = (*node).prev; 97 | 98 | if let Some(prev) = (*node).prev { 99 | (*prev).next = (*node).next; 100 | } else { 101 | self.tail = next; 102 | } 103 | 104 | (*node).next = None; 105 | (*node).prev = Some(self.head); 106 | (*self.head).next = Some(node); 107 | self.head = node; 108 | } 109 | } 110 | } 111 | 112 | // iterator of items for potential eviction. 113 | fn iter(&self) -> LruNodeIter { 114 | LruNodeIter { 115 | node: Some(self.tail), 116 | } 117 | } 118 | } 119 | 120 | 121 | /// Lru linked list iterator. 122 | pub struct LruNodeIter { 123 | node: Option<*mut LruNode>, 124 | } 125 | 126 | impl CacheItemIterator> for LruNodeIter { 127 | 128 | fn next(&mut self) -> Option> { 129 | if let Some(ret) = self.node { 130 | self.node = unsafe { (*ret).next }; 131 | Some(LruNodeRef::new(ret)) 132 | } else { 133 | None 134 | } 135 | } 136 | } 137 | -------------------------------------------------------------------------------- /src/storage/fs_ops.rs: -------------------------------------------------------------------------------- 1 | //! 2 | //! File system utility functions 3 | //! 4 | 5 | 6 | use crate::common::errors::Error; 7 | use std::fs::DirEntry; 8 | use std::path::Path; 9 | 10 | 11 | /// traverse directory and call callback on each entry (dirs, files, links, etc.) 12 | pub fn traverse_dir(path: &Path, include_subdirs: bool, mut callback: T) -> Result<(), Error> 13 | where T: FnMut(&DirEntry) -> Result<(), Error> { 14 | 15 | let mut stack = Vec::new(); 16 | let mut dir = path.to_path_buf(); 17 | 18 | loop { 19 | for entry in std::fs::read_dir(dir)? 
{ 20 | 21 | let entry = entry?; 22 | let entry_path = entry.path(); 23 | 24 | if entry_path.is_dir() && include_subdirs { 25 | stack.push(entry_path); 26 | } 27 | 28 | callback(&entry)?; 29 | } 30 | 31 | if let Some(entry_path) = stack.pop() { 32 | dir = entry_path; 33 | } else { 34 | break; 35 | } 36 | } 37 | 38 | Ok(()) 39 | } 40 | 41 | 42 | #[cfg(test)] 43 | mod tests { 44 | use super::*; 45 | 46 | fn prepare_dir_tree(dirs: &[&str], files: &[&str]) { 47 | for item in dirs { 48 | std::fs::create_dir_all(item).unwrap(); 49 | } 50 | for item in files { 51 | std::fs::write(item, "test").unwrap(); 52 | } 53 | } 54 | 55 | fn cleanup(dirs: &[&str], files: &[&str]) { 56 | for item in files { 57 | std::fs::remove_file(item).unwrap(); 58 | } 59 | for item in dirs { 60 | std::fs::remove_dir_all(item).unwrap(); 61 | } 62 | } 63 | 64 | #[test] 65 | fn traverse_test() { 66 | 67 | let dirs = ["dir_890232676743234/subdir1/subsub", 68 | "dir_890232676743234/subdir2"]; 69 | let files = ["dir_890232676743234/test1.file", 70 | "dir_890232676743234/test2.file", 71 | "dir_890232676743234/subdir1/test1.file", 72 | "dir_890232676743234/subdir2/test1.file"]; 73 | 74 | prepare_dir_tree(&dirs, &files); 75 | 76 | 77 | // traverse with subdirs 78 | let mut result = std::collections::HashSet::::new(); 79 | 80 | traverse_dir(std::path::Path::new("dir_890232676743234"), true, |entry: &DirEntry| -> Result<(), Error> { 81 | result.insert(String::from(entry.path().to_str().unwrap())); 82 | Ok(()) 83 | }).unwrap(); 84 | 85 | let mut expected = std::collections::HashSet::::new(); 86 | expected.insert(String::from("dir_890232676743234/subdir1")); 87 | for item in &dirs { 88 | expected.insert(String::from(*item)); 89 | } 90 | for item in &files { 91 | expected.insert(String::from(*item)); 92 | } 93 | 94 | assert_eq!(expected, result); 95 | 96 | 97 | // traverse excluding subdirs 98 | let mut result = std::collections::HashSet::::new(); 99 | 100 | traverse_dir(std::path::Path::new("dir_890232676743234"), false, |entry: &DirEntry| -> Result<(), Error> { 101 | result.insert(String::from(entry.path().to_str().unwrap())); 102 | Ok(()) 103 | }).unwrap(); 104 | 105 | let mut expected = std::collections::HashSet::::new(); 106 | expected.insert(String::from("dir_890232676743234/subdir1")); 107 | expected.insert(String::from("dir_890232676743234/subdir2")); 108 | expected.insert(String::from("dir_890232676743234/test1.file")); 109 | expected.insert(String::from("dir_890232676743234/test2.file")); 110 | 111 | assert_eq!(expected, result); 112 | 113 | 114 | cleanup(&["dir_890232676743234"], &files); 115 | } 116 | 117 | } 118 | -------------------------------------------------------------------------------- /Cargo.lock: -------------------------------------------------------------------------------- 1 | # This file is automatically @generated by Cargo. 2 | # It is not intended for manual editing. 
3 | [[package]] 4 | name = "cfg-if" 5 | version = "0.1.10" 6 | source = "registry+https://github.com/rust-lang/crates.io-index" 7 | checksum = "4785bdd1c96b2a846b2bd7cc02e86b6b3dbf14e7e53446c4f54c92a361040822" 8 | 9 | [[package]] 10 | name = "cfg-if" 11 | version = "1.0.0" 12 | source = "registry+https://github.com/rust-lang/crates.io-index" 13 | checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" 14 | 15 | [[package]] 16 | name = "db-core" 17 | version = "0.2.1" 18 | dependencies = [ 19 | "fs2", 20 | "log", 21 | "rand", 22 | ] 23 | 24 | [[package]] 25 | name = "fs2" 26 | version = "0.4.3" 27 | source = "registry+https://github.com/rust-lang/crates.io-index" 28 | checksum = "9564fc758e15025b46aa6643b1b77d047d1a56a1aea6e01002ac0c7026876213" 29 | dependencies = [ 30 | "libc", 31 | "winapi", 32 | ] 33 | 34 | [[package]] 35 | name = "getrandom" 36 | version = "0.2.3" 37 | source = "registry+https://github.com/rust-lang/crates.io-index" 38 | checksum = "7fcd999463524c52659517fe2cea98493cfe485d10565e7b0fb07dbba7ad2753" 39 | dependencies = [ 40 | "cfg-if 1.0.0", 41 | "libc", 42 | "wasi", 43 | ] 44 | 45 | [[package]] 46 | name = "libc" 47 | version = "0.2.66" 48 | source = "registry+https://github.com/rust-lang/crates.io-index" 49 | checksum = "d515b1f41455adea1313a4a2ac8a8a477634fbae63cc6100e3aebb207ce61558" 50 | 51 | [[package]] 52 | name = "log" 53 | version = "0.4.8" 54 | source = "registry+https://github.com/rust-lang/crates.io-index" 55 | checksum = "14b6052be84e6b71ab17edffc2eeabf5c2c3ae1fdb464aae35ac50c67a44e1f7" 56 | dependencies = [ 57 | "cfg-if 0.1.10", 58 | ] 59 | 60 | [[package]] 61 | name = "ppv-lite86" 62 | version = "0.2.10" 63 | source = "registry+https://github.com/rust-lang/crates.io-index" 64 | checksum = "ac74c624d6b2d21f425f752262f42188365d7b8ff1aff74c82e45136510a4857" 65 | 66 | [[package]] 67 | name = "rand" 68 | version = "0.8.4" 69 | source = "registry+https://github.com/rust-lang/crates.io-index" 70 | checksum = "2e7573632e6454cf6b99d7aac4ccca54be06da05aca2ef7423d22d27d4d4bcd8" 71 | dependencies = [ 72 | "libc", 73 | "rand_chacha", 74 | "rand_core", 75 | "rand_hc", 76 | ] 77 | 78 | [[package]] 79 | name = "rand_chacha" 80 | version = "0.3.1" 81 | source = "registry+https://github.com/rust-lang/crates.io-index" 82 | checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" 83 | dependencies = [ 84 | "ppv-lite86", 85 | "rand_core", 86 | ] 87 | 88 | [[package]] 89 | name = "rand_core" 90 | version = "0.6.3" 91 | source = "registry+https://github.com/rust-lang/crates.io-index" 92 | checksum = "d34f1408f55294453790c48b2f1ebbb1c5b4b7563eb1f418bcfcfdbb06ebb4e7" 93 | dependencies = [ 94 | "getrandom", 95 | ] 96 | 97 | [[package]] 98 | name = "rand_hc" 99 | version = "0.3.1" 100 | source = "registry+https://github.com/rust-lang/crates.io-index" 101 | checksum = "d51e9f596de227fda2ea6c84607f5558e196eeaf43c986b724ba4fb8fdf497e7" 102 | dependencies = [ 103 | "rand_core", 104 | ] 105 | 106 | [[package]] 107 | name = "wasi" 108 | version = "0.10.2+wasi-snapshot-preview1" 109 | source = "registry+https://github.com/rust-lang/crates.io-index" 110 | checksum = "fd6fbd9a79829dd1ad0cc20627bf1ed606756a7f77edff7b66b7064f9cb327c6" 111 | 112 | [[package]] 113 | name = "winapi" 114 | version = "0.3.8" 115 | source = "registry+https://github.com/rust-lang/crates.io-index" 116 | checksum = "8093091eeb260906a183e6ae1abdba2ef5ef2257a21801128899c3fc699229c6" 117 | dependencies = [ 118 | "winapi-i686-pc-windows-gnu", 119 | "winapi-x86_64-pc-windows-gnu", 
120 | ] 121 | 122 | [[package]] 123 | name = "winapi-i686-pc-windows-gnu" 124 | version = "0.4.0" 125 | source = "registry+https://github.com/rust-lang/crates.io-index" 126 | checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" 127 | 128 | [[package]] 129 | name = "winapi-x86_64-pc-windows-gnu" 130 | version = "0.4.0" 131 | source = "registry+https://github.com/rust-lang/crates.io-index" 132 | checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" 133 | -------------------------------------------------------------------------------- /src/common/misc.rs: -------------------------------------------------------------------------------- 1 | //! Different utility functions and other stuff 2 | 3 | use std::convert::TryInto; 4 | use std::time::SystemTime; 5 | use crate::common::errors::Error; 6 | 7 | 8 | pub const BYTE_BITS: [u8; 8] = [0b00000001,0b00000010,0b00000100,0b00001000,0b00010000,0b00100000,0b01000000,0b10000000,]; 9 | 10 | 11 | pub trait SliceToIntConverter { 12 | fn slice_to_int(slice: &[u8]) -> Result; 13 | } 14 | 15 | macro_rules! impl_for { 16 | ( $t:ty ) => { 17 | impl SliceToIntConverter<$t> for $t { 18 | 19 | #[inline] 20 | fn slice_to_int(slice: &[u8]) -> Result<$t, Error> { 21 | let val = slice.try_into()?; 22 | Ok(<$t>::from_ne_bytes(val)) 23 | } 24 | } 25 | }; 26 | } 27 | 28 | impl_for!(u8); 29 | impl_for!(u16); 30 | impl_for!(u32); 31 | impl_for!(u64); 32 | impl_for!(u128); 33 | 34 | impl_for!(i8); 35 | impl_for!(i16); 36 | impl_for!(i32); 37 | impl_for!(i64); 38 | impl_for!(i128); 39 | 40 | 41 | /// Return Unix epoch time in seconds. 42 | pub fn epoch_as_secs() -> u64 { 43 | SystemTime::now() 44 | .duration_since(SystemTime::UNIX_EPOCH) 45 | .expect("Current time is earlier than Unix epoch. Check time settings.") 46 | .as_secs() 47 | } 48 | 49 | 50 | /// Allocate byte buffer on heap 51 | pub fn alloc_buf(size: usize) -> Result<*mut u8, Error> { 52 | 53 | if size == 0 || size > (std::isize::MAX as usize) { 54 | return Err(Error::incorrect_allocation_size()); 55 | } 56 | 57 | let ptr: *mut u8; 58 | 59 | unsafe { 60 | let align = std::mem::align_of::(); 61 | ptr = std::alloc::alloc( 62 | std::alloc::Layout::from_size_align(size, align) 63 | .map_err(|e| { Error::incorrect_layout(e) })? 
64 | ); 65 | } 66 | 67 | if ptr.is_null() { 68 | Err(Error::allocation_failure()) 69 | } else { 70 | Ok(ptr) 71 | } 72 | } 73 | 74 | 75 | /// Deallocate byte buffer 76 | pub fn dealloc_buf(ptr: *mut u8, size: usize) { 77 | let align = std::mem::align_of::(); 78 | unsafe { 79 | std::alloc::dealloc(ptr as *mut u8, std::alloc::Layout::from_size_align(size, align).unwrap()); 80 | } 81 | } 82 | 83 | 84 | #[cfg(test)] 85 | mod tests { 86 | 87 | use super::*; 88 | 89 | #[test] 90 | fn run_conversions() { 91 | let arr: [u8; 16] = if cfg!(target_endian = "big") { 92 | [0x10, 0xF, 0xE, 0xD, 0xC, 0xB, 0xA, 9, 8, 7, 6, 5, 4, 3, 2, 1] 93 | } else { 94 | [1, 2, 3, 4, 5, 6, 7, 8, 9, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF, 0x10] 95 | }; 96 | 97 | let res = u8::slice_to_int(&arr[0..1]).unwrap(); 98 | assert_eq!(res, 1, "expect 1, got {:x?}", res); 99 | 100 | let res = u16::slice_to_int(&arr[0..2]).unwrap(); 101 | assert_eq!(res, 0x201, "expect 0x201, got {:x?}", res); 102 | 103 | let res = u32::slice_to_int(&arr[0..4]).unwrap(); 104 | assert_eq!(res, 0x4030201, "expect 0x4030201, got {:x?}", res); 105 | 106 | let res = u64::slice_to_int(&arr[0..8]).unwrap(); 107 | assert_eq!(res, 0x807060504030201, "expect 0x807060504030201, got {:x?}", res); 108 | 109 | let res = u128::slice_to_int(&arr[..]).unwrap(); 110 | assert_eq!(res, 0x100F0E0D0C0B0A090807060504030201, "expect 0x100F0E0D0C0B0A0807060504030201, got {:x?}", res); 111 | 112 | let res = i8::slice_to_int(&arr[0..1]).unwrap(); 113 | assert_eq!(res, 1, "expect 1, got {:x?}", res); 114 | 115 | let res = i16::slice_to_int(&arr[0..2]).unwrap(); 116 | assert_eq!(res, 0x201, "expect 0x201, got {:x?}", res); 117 | 118 | let res = i32::slice_to_int(&arr[0..4]).unwrap(); 119 | assert_eq!(res, 0x4030201, "expect 0x4030201, got {:x?}", res); 120 | 121 | let res = i64::slice_to_int(&arr[0..8]).unwrap(); 122 | assert_eq!(res, 0x807060504030201, "expect 0x807060504030201, got {:x?}", res); 123 | 124 | let res = i128::slice_to_int(&arr[..]).unwrap(); 125 | assert_eq!(res, 0x100F0E0D0C0B0A090807060504030201, "expect 0x100F0E0D0C0B0A0807060504030201, got {:x?}", res); 126 | } 127 | } 128 | -------------------------------------------------------------------------------- /src/common/crc32.rs: -------------------------------------------------------------------------------- 1 | //! functions for crc32 sum calculation 2 | //! 
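//!
//! A typical call sequence (illustrative sketch; the payload and number below are arbitrary):
//!
//! ```ignore
//! let mut acc = crc32_begin();
//! acc = crc32_arr(acc, b"some payload");
//! acc = crc32_num(acc, 42u32);
//! let checksum = crc32_finalize(acc);
//! ```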
3 | 4 | const CRC_TABLE: [u32; 256] = [ 5 | 0x00000000, 0x77073096, 0xEE0E612C, 0x990951BA, 6 | 0x076DC419, 0x706AF48F, 0xE963A535, 0x9E6495A3, 7 | 0x0EDB8832, 0x79DCB8A4, 0xE0D5E91E, 0x97D2D988, 8 | 0x09B64C2B, 0x7EB17CBD, 0xE7B82D07, 0x90BF1D91, 9 | 0x1DB71064, 0x6AB020F2, 0xF3B97148, 0x84BE41DE, 10 | 0x1ADAD47D, 0x6DDDE4EB, 0xF4D4B551, 0x83D385C7, 11 | 0x136C9856, 0x646BA8C0, 0xFD62F97A, 0x8A65C9EC, 12 | 0x14015C4F, 0x63066CD9, 0xFA0F3D63, 0x8D080DF5, 13 | 0x3B6E20C8, 0x4C69105E, 0xD56041E4, 0xA2677172, 14 | 0x3C03E4D1, 0x4B04D447, 0xD20D85FD, 0xA50AB56B, 15 | 0x35B5A8FA, 0x42B2986C, 0xDBBBC9D6, 0xACBCF940, 16 | 0x32D86CE3, 0x45DF5C75, 0xDCD60DCF, 0xABD13D59, 17 | 0x26D930AC, 0x51DE003A, 0xC8D75180, 0xBFD06116, 18 | 0x21B4F4B5, 0x56B3C423, 0xCFBA9599, 0xB8BDA50F, 19 | 0x2802B89E, 0x5F058808, 0xC60CD9B2, 0xB10BE924, 20 | 0x2F6F7C87, 0x58684C11, 0xC1611DAB, 0xB6662D3D, 21 | 0x76DC4190, 0x01DB7106, 0x98D220BC, 0xEFD5102A, 22 | 0x71B18589, 0x06B6B51F, 0x9FBFE4A5, 0xE8B8D433, 23 | 0x7807C9A2, 0x0F00F934, 0x9609A88E, 0xE10E9818, 24 | 0x7F6A0DBB, 0x086D3D2D, 0x91646C97, 0xE6635C01, 25 | 0x6B6B51F4, 0x1C6C6162, 0x856530D8, 0xF262004E, 26 | 0x6C0695ED, 0x1B01A57B, 0x8208F4C1, 0xF50FC457, 27 | 0x65B0D9C6, 0x12B7E950, 0x8BBEB8EA, 0xFCB9887C, 28 | 0x62DD1DDF, 0x15DA2D49, 0x8CD37CF3, 0xFBD44C65, 29 | 0x4DB26158, 0x3AB551CE, 0xA3BC0074, 0xD4BB30E2, 30 | 0x4ADFA541, 0x3DD895D7, 0xA4D1C46D, 0xD3D6F4FB, 31 | 0x4369E96A, 0x346ED9FC, 0xAD678846, 0xDA60B8D0, 32 | 0x44042D73, 0x33031DE5, 0xAA0A4C5F, 0xDD0D7CC9, 33 | 0x5005713C, 0x270241AA, 0xBE0B1010, 0xC90C2086, 34 | 0x5768B525, 0x206F85B3, 0xB966D409, 0xCE61E49F, 35 | 0x5EDEF90E, 0x29D9C998, 0xB0D09822, 0xC7D7A8B4, 36 | 0x59B33D17, 0x2EB40D81, 0xB7BD5C3B, 0xC0BA6CAD, 37 | 0xEDB88320, 0x9ABFB3B6, 0x03B6E20C, 0x74B1D29A, 38 | 0xEAD54739, 0x9DD277AF, 0x04DB2615, 0x73DC1683, 39 | 0xE3630B12, 0x94643B84, 0x0D6D6A3E, 0x7A6A5AA8, 40 | 0xE40ECF0B, 0x9309FF9D, 0x0A00AE27, 0x7D079EB1, 41 | 0xF00F9344, 0x8708A3D2, 0x1E01F268, 0x6906C2FE, 42 | 0xF762575D, 0x806567CB, 0x196C3671, 0x6E6B06E7, 43 | 0xFED41B76, 0x89D32BE0, 0x10DA7A5A, 0x67DD4ACC, 44 | 0xF9B9DF6F, 0x8EBEEFF9, 0x17B7BE43, 0x60B08ED5, 45 | 0xD6D6A3E8, 0xA1D1937E, 0x38D8C2C4, 0x4FDFF252, 46 | 0xD1BB67F1, 0xA6BC5767, 0x3FB506DD, 0x48B2364B, 47 | 0xD80D2BDA, 0xAF0A1B4C, 0x36034AF6, 0x41047A60, 48 | 0xDF60EFC3, 0xA867DF55, 0x316E8EEF, 0x4669BE79, 49 | 0xCB61B38C, 0xBC66831A, 0x256FD2A0, 0x5268E236, 50 | 0xCC0C7795, 0xBB0B4703, 0x220216B9, 0x5505262F, 51 | 0xC5BA3BBE, 0xB2BD0B28, 0x2BB45A92, 0x5CB36A04, 52 | 0xC2D7FFA7, 0xB5D0CF31, 0x2CD99E8B, 0x5BDEAE1D, 53 | 0x9B64C2B0, 0xEC63F226, 0x756AA39C, 0x026D930A, 54 | 0x9C0906A9, 0xEB0E363F, 0x72076785, 0x05005713, 55 | 0x95BF4A82, 0xE2B87A14, 0x7BB12BAE, 0x0CB61B38, 56 | 0x92D28E9B, 0xE5D5BE0D, 0x7CDCEFB7, 0x0BDBDF21, 57 | 0x86D3D2D4, 0xF1D4E242, 0x68DDB3F8, 0x1FDA836E, 58 | 0x81BE16CD, 0xF6B9265B, 0x6FB077E1, 0x18B74777, 59 | 0x88085AE6, 0xFF0F6A70, 0x66063BCA, 0x11010B5C, 60 | 0x8F659EFF, 0xF862AE69, 0x616BFFD3, 0x166CCF45, 61 | 0xA00AE278, 0xD70DD2EE, 0x4E048354, 0x3903B3C2, 62 | 0xA7672661, 0xD06016F7, 0x4969474D, 0x3E6E77DB, 63 | 0xAED16A4A, 0xD9D65ADC, 0x40DF0B66, 0x37D83BF0, 64 | 0xA9BCAE53, 0xDEBB9EC5, 0x47B2CF7F, 0x30B5FFE9, 65 | 0xBDBDF21C, 0xCABAC28A, 0x53B39330, 0x24B4A3A6, 66 | 0xBAD03605, 0xCDD70693, 0x54DE5729, 0x23D967BF, 67 | 0xB3667A2E, 0xC4614AB8, 0x5D681B02, 0x2A6F2B94, 68 | 0xB40BBE37, 0xC30C8EA1, 0x5A05DF1B, 0x2D02EF8D, 69 | ]; 70 | 71 | /// add slice of u8 to accumulated crc32 sum 72 | #[inline] 73 | pub fn crc32_arr(mut crc32: u32, data: 
&[u8]) -> u32 { 74 | for byte in data { 75 | let n_lookup_index = ((crc32 ^ (*byte as u32)) & 0xFF) as usize; 76 | crc32 = (crc32 >> 8) ^ CRC_TABLE[n_lookup_index]; 77 | } 78 | 79 | crc32 80 | } 81 | 82 | /// add value of type T to accumulated crc32 sum 83 | #[inline] 84 | pub fn crc32_num<T: Into<u64>>(mut crc32: u32, value: T) -> u32 { 85 | let mut tmp = value.into(); 86 | 87 | for _ in 0..std::mem::size_of::<T>() { 88 | let byte = tmp as u8; 89 | let n_lookup_index = ((crc32 ^ (byte as u32)) & 0xFF) as usize; 90 | crc32 = (crc32 >> 8) ^ CRC_TABLE[n_lookup_index]; 91 | tmp >>= 8; 92 | } 93 | 94 | crc32 95 | } 96 | 97 | /// returns initial value 98 | pub fn crc32_begin() -> u32 { 99 | 0xFFFFFFFFu32 100 | } 101 | 102 | /// finalize calculation 103 | pub fn crc32_finalize(crc32: u32) -> u32 { 104 | crc32 ^ 0xFFFFFFFFu32 105 | } 106 | 107 | #[cfg(test)] 108 | mod tests { 109 | 110 | use super::*; 111 | 112 | #[test] 113 | fn test_crc32() { 114 | let mut acc = crc32_begin(); 115 | acc = crc32_arr(acc, &[1, 2, 3]); 116 | acc = crc32_num(acc, 0x01u8); 117 | acc = crc32_num(acc, 0x0201u16); 118 | acc = crc32_num(acc, 0x04030201u32); 119 | acc = crc32_num(acc, 0x0807060504030201u64); 120 | acc = crc32_finalize(acc); 121 | assert_eq!(acc, 0x1EEF9A41u32); 122 | } 123 | } 124 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DB-Core 2 | 3 | ![Rust](https://github.com/stencillogic/db-core/workflows/Rust/badge.svg) 4 | 5 | Minimal schema-less database management system with ACID guarantees. 6 | 7 | Current status: WIP (see the Roadmap section for details) 8 | 9 | 10 | ### Usage 11 | 12 | The client application creates a so-called instance of db-core. 13 | When an instance is created, it opens the data store, initializes its internal components, and restores state from the last checkpoint. 14 | Using the created instance, the client application can initiate transactions, create or open existing objects, read and write data, delete objects, and then commit or roll back changes. 15 | An instance can be cloned and moved to another thread in a multithreaded application. 16 | In the end, the application can shut down all instances. 17 | 18 | 19 | 20 | Database initialization (this is a one-time action): 21 | 22 | 23 | ``` rust 24 | use db_core::instance::Instance; 25 | use db_core::FileState; 26 | use db_core::FileType; 27 | use db_core::FileDesc; 28 | use db_core::instance::Read; 29 | use db_core::instance::Write; 30 | use db_core::config::ConfigMt; 31 | use std::path::Path; 32 | 33 | 34 | 35 | // Block size for the database. 36 | let block_size = 8192; 37 | 38 | 39 | // Create an empty directory for the database files. 40 | let dspath = "/tmp/db-core-test-db"; 41 | if Path::new(dspath).exists() { 42 | std::fs::remove_dir_all(dspath).expect("Failed to delete test dir on cleanup"); 43 | } 44 | std::fs::create_dir(dspath).expect("Failed to create test dir"); 45 | 46 | // Transaction log directory. 47 | let log_dir = "/tmp/db-core-test-tranlog"; 48 | if Path::new(log_dir).exists() { 49 | std::fs::remove_dir_all(log_dir).expect("Failed to delete test dir on cleanup"); 50 | } 51 | std::fs::create_dir(log_dir).expect("Failed to create test dir"); 52 | 53 | 54 | // Define initial database files. There should be at least one file of each type (data store, 55 | // versioning store, checkpoint store). 
56 | let mut fdset = vec![]; 57 | let desc1 = FileDesc { 58 | state: FileState::InUse, 59 | file_id: 3, 60 | extent_size: 16, 61 | extent_num: 3, 62 | max_extent_num: 65500, 63 | file_type: FileType::DataStoreFile, 64 | }; 65 | let desc2 = FileDesc { 66 | state: FileState::InUse, 67 | file_id: 4, 68 | extent_size: 10, 69 | extent_num: 3, 70 | max_extent_num: 65500, 71 | file_type: FileType::VersioningStoreFile, 72 | }; 73 | let desc3 = FileDesc { 74 | state: FileState::InUse, 75 | file_id: 5, 76 | extent_size: 10, 77 | extent_num: 3, 78 | max_extent_num: 65500, 79 | file_type: FileType::CheckpointStoreFile, 80 | }; 81 | 82 | fdset.push(desc1); 83 | fdset.push(desc2); 84 | fdset.push(desc3); 85 | 86 | // Create a database. 87 | Instance::initialize_datastore(dspath, block_size, &fdset).expect("Failed to init datastore"); 88 | ``` 89 | 90 | Instance startup and termination, transaction management, read and write data: 91 | 92 | 93 | ``` rust 94 | // Some random data. 95 | let data = b"Hello, world!"; 96 | 97 | 98 | // Prepare configuration. 99 | let conf = ConfigMt::new(); 100 | let mut c = conf.get_conf(); 101 | c.set_log_dir(log_dir.to_owned()); 102 | c.set_datastore_path(dspath.to_owned()); 103 | drop(c); 104 | 105 | // Start instance and open existing database. 106 | let instance = Instance::new(conf.clone()).expect("Failed to create instance"); 107 | 108 | // Begin transaction. 109 | let mut trn = instance.begin_transaction().expect("Failed to begin transaction"); 110 | 111 | // Create a new object. 112 | let file_id = 3; 113 | let mut obj = instance.open_create(file_id, &mut trn, data.len()).expect("Failed to create object"); 114 | let obj_id = obj.get_id(); 115 | 116 | // Write some data. 117 | obj.write_next(data).expect("Failed to write"); 118 | drop(obj); 119 | 120 | // Commit transaction. 121 | instance.commit(trn).expect("Failed to commit"); 122 | 123 | // Begin transaction. 124 | let mut trn = instance.begin_transaction().expect("Failed to begin transaction"); 125 | 126 | // Open object for reading and read some data. 127 | let mut obj = instance.open_read(&obj_id, &trn).expect("Failed to open for reading"); 128 | let mut read_buf = vec![0u8;data.len()]; 129 | let mut read = 0; 130 | let len = read_buf.len(); 131 | while read < len { 132 | let r = obj.read_next(&mut read_buf[read..len]).expect("Failed to read"); 133 | if r == 0 {break;} 134 | read += r; 135 | } 136 | assert_eq!(read_buf, data); 137 | drop(obj); 138 | 139 | // Delete object (if object is in use wait for other transaction to finish for 1 second). 140 | let wait_lock_ms = 1000; 141 | instance.delete(&obj_id, &mut trn, wait_lock_ms).expect("Failed to delete object"); 142 | 143 | // Rollback transaction. 144 | instance.rollback(trn).expect("Failed to rollback"); 145 | 146 | // Spawn another instance in a different thread. 147 | let ss = instance.get_shared_state().expect("Failed to get shared state"); 148 | 149 | let th = std::thread::spawn(move || { 150 | let instance2 = Instance::from_shared_state(ss).expect("Failed to create instance"); 151 | // ... 152 | instance2.terminate(); 153 | }); 154 | 155 | // Add a database file. 156 | let new_file_id = instance.add_datafile(FileType::DataStoreFile, 1000, 10, 1000).expect("Failed to add data file"); 157 | 158 | // Terminate instance. 159 | instance.terminate(); 160 | ``` 161 | 162 | ### Roadmap 163 | 164 | 1. Allow storing data directly on block device. 165 | 2. Implement vacuuming. 166 | 3. Add more tests. 
167 | 168 | See [issues](https://github.com/stencillogic/db-core/issues) for details. 169 | 170 | 171 | ### Contribution 172 | 173 | Contributions are welcome! Please see [CONTRIBUTING.md](https://github.com/stencillogic/db-core/blob/master/CONTRIBUTING.md) for details. 174 | 175 | -------------------------------------------------------------------------------- /src/common/defs.rs: -------------------------------------------------------------------------------- 1 | //! Common definitions. 2 | 3 | 4 | use std::io::Write; 5 | use std::sync::Arc; 6 | use std::sync::atomic::{AtomicU64, Ordering}; 7 | use crate::common::misc::SliceToIntConverter; 8 | 9 | 10 | /// Object identifier. 11 | #[derive(Clone, Copy, Eq, PartialEq, Hash, Debug)] 12 | pub struct ObjectId 13 | { 14 | pub file_id: u16, 15 | pub extent_id: u16, 16 | pub block_id: u16, 17 | pub entry_id: u16, 18 | } 19 | 20 | impl ObjectId { 21 | 22 | pub fn new() -> Self { 23 | ObjectId { 24 | file_id: 0, 25 | extent_id: 0, 26 | block_id: 0, 27 | entry_id: 0, 28 | } 29 | } 30 | 31 | pub fn init(file_id: u16, 32 | extent_id: u16, 33 | block_id: u16, 34 | entry_id: u16) -> Self 35 | { 36 | ObjectId { 37 | file_id, 38 | extent_id, 39 | block_id, 40 | entry_id, 41 | } 42 | } 43 | 44 | /// Calculate the bucket for the given number of buckets. 45 | pub fn obj_bkt(&self, nbkt: usize) -> usize { 46 | (self.block_id as usize + 47 | self.extent_id as usize + 48 | self.entry_id as usize + 49 | self.file_id as usize) % nbkt 50 | } 51 | } 52 | 53 | /// Block identifier. 54 | #[derive(Clone, Copy, Eq, Hash, PartialEq, Debug)] 55 | pub struct BlockId 56 | { 57 | pub file_id: u16, 58 | pub extent_id: u16, 59 | pub block_id: u16, 60 | } 61 | 62 | 63 | impl BlockId { 64 | 65 | pub fn new() -> Self { 66 | BlockId { 67 | file_id: 0, 68 | extent_id: 0, 69 | block_id: 0, 70 | } 71 | } 72 | 73 | pub fn init(file_id: u16, extent_id: u16, block_id: u16) -> Self { 74 | BlockId { 75 | file_id, 76 | extent_id, 77 | block_id, 78 | } 79 | } 80 | 81 | pub fn from_obj(obj: &ObjectId) -> Self { 82 | BlockId { 83 | file_id: obj.file_id, 84 | extent_id: obj.extent_id, 85 | block_id: obj.block_id, 86 | } 87 | } 88 | 89 | pub fn hash(&self, n: usize) -> usize { 90 | (self.block_id as usize + 91 | self.extent_id as usize + 92 | self.file_id as usize) % n 93 | } 94 | } 95 | 96 | /// Numeric sequence that can be shared by several threads. 97 | #[derive(Clone)] 98 | pub struct Sequence { 99 | sn: Arc<AtomicU64>, 100 | } 101 | 102 | impl Sequence { 103 | pub fn new(sn: u64) -> Self { 104 | Sequence { 105 | sn: Arc::new(AtomicU64::new(sn)), 106 | } 107 | } 108 | 109 | pub fn set(&self, sn: u64) { 110 | self.sn.store(sn, Ordering::Relaxed) 111 | } 112 | 113 | pub fn get_next(&self) -> u64 { 114 | self.sn.fetch_add(1, Ordering::Relaxed) + 1 115 | } 116 | 117 | pub fn get_cur(&self) -> u64 { 118 | self.sn.load(Ordering::Relaxed) 119 | } 120 | /* 121 | pub fn swap(&self, sn: u64) { 122 | let mut current = self.sn.load(Ordering::Relaxed); 123 | while current < sn { 124 | current = self.sn.compare_and_swap(current, sn, Ordering::Relaxed); 125 | } 126 | } 127 | */ 128 | } 129 | 130 | /// Change sequence numbers that are often shared, gathered into one struct. 131 | #[derive(Clone)] 132 | pub struct SharedSequences { 133 | pub csn: Sequence, 134 | pub latest_commit_csn: Arc<AtomicU64>, 135 | pub checkpoint_csn: Sequence, 136 | } 137 | 138 | 139 | 140 | pub const VECTOR_DATA_LENGTH: usize = 10; 141 | 142 | 143 | /// Vector (data pointer).
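///
/// A rough serialization round-trip sketch (illustrative only; the ids and positions below are arbitrary):
///
/// ```ignore
/// let block = BlockId::init(3, 1, 7);
/// let mut v = Vector::init(block, 2, 128);
/// let bytes = v.to_data().to_vec();      // serialize the fields into the internal buffer
///
/// let mut v2 = Vector::new();
/// v2.buf_mut().copy_from_slice(&bytes);  // load the raw bytes
/// v2.update_from_buf();                  // parse the fields back out
/// assert_eq!(v2.entry_pos(), 128);
/// ```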
144 | #[derive(Clone, Copy, Debug)] 145 | pub struct Vector { 146 | obj: ObjectId, 147 | entry_pos: u16, 148 | data: [u8;VECTOR_DATA_LENGTH], 149 | } 150 | 151 | 152 | impl Vector { 153 | pub fn new() -> Self { 154 | Vector { 155 | obj: ObjectId::new(), 156 | entry_pos: 0, 157 | data: [0;VECTOR_DATA_LENGTH], 158 | } 159 | } 160 | 161 | pub fn init(block_id: BlockId, entry_id: u16, entry_pos: u16) -> Self { 162 | let obj = ObjectId::init(block_id.file_id, block_id.extent_id, block_id.block_id, entry_id); 163 | Vector { 164 | obj, 165 | entry_pos, 166 | data: [0;VECTOR_DATA_LENGTH], 167 | } 168 | } 169 | 170 | pub fn update_from_buf(&mut self) { 171 | self.obj.file_id = u16::slice_to_int(&self.data[0..2]).unwrap(); 172 | self.obj.extent_id = u16::slice_to_int(&self.data[2..4]).unwrap(); 173 | self.obj.block_id = u16::slice_to_int(&self.data[4..6]).unwrap(); 174 | self.obj.entry_id = u16::slice_to_int(&self.data[6..8]).unwrap(); 175 | self.entry_pos = u16::slice_to_int(&self.data[8..10]).unwrap(); 176 | } 177 | 178 | pub fn to_data(&mut self) -> &[u8] { 179 | let mut slice = &mut self.data[0..VECTOR_DATA_LENGTH]; 180 | slice.write_all(&self.obj.file_id.to_ne_bytes()).unwrap(); 181 | slice.write_all(&self.obj.extent_id.to_ne_bytes()).unwrap(); 182 | slice.write_all(&self.obj.block_id.to_ne_bytes()).unwrap(); 183 | slice.write_all(&self.obj.entry_id.to_ne_bytes()).unwrap(); 184 | slice.write_all(&self.entry_pos.to_ne_bytes()).unwrap(); 185 | slice.flush().unwrap(); 186 | &self.data 187 | } 188 | 189 | pub fn buf_mut(&mut self) -> &mut [u8] { 190 | &mut self.data 191 | } 192 | 193 | pub fn buf(&self) -> &[u8] { 194 | &self.data 195 | } 196 | 197 | pub fn obj_id(&self) -> ObjectId { 198 | self.obj 199 | } 200 | 201 | pub fn entry_pos(&self) -> u16 { 202 | self.entry_pos 203 | } 204 | } 205 | 206 | /// Seek position. 207 | pub enum SeekFrom { 208 | Start, 209 | Current, 210 | } 211 | 212 | -------------------------------------------------------------------------------- /src/tran_mgr/tran_mgr.rs: -------------------------------------------------------------------------------- 1 | /// Transaction state management 2 | 3 | 4 | use crate::common::errors::Error; 5 | use crate::system::config::ConfigMt; 6 | use crate::common::defs::ObjectId; 7 | use crate::common::defs::Sequence; 8 | use std::sync::Arc; 9 | use std::sync::Mutex; 10 | use std::sync::Condvar; 11 | use std::collections::HashSet; 12 | use std::collections::HashMap; 13 | use std::time::Duration; 14 | 15 | 16 | /// TranMgr keeps track of transactions. 
17 | #[derive(Clone)] 18 | pub struct TranMgr { 19 | tsn: Sequence, 20 | nbkt: usize, 21 | nobj_bkt: usize, 22 | trn_set: Arc<Vec<(Mutex<HashSet<u64>>, Condvar)>>, 23 | obj_locks: Arc<Vec<(Mutex<HashMap<ObjectId, u64>>, Condvar)>>, 24 | } 25 | 26 | impl TranMgr { 27 | 28 | pub fn new(conf: ConfigMt) -> Result<TranMgr, Error> { 29 | 30 | let conf = conf.get_conf(); 31 | let nbkt = *conf.get_tran_mgr_n_buckets() as usize; 32 | let ntran = *conf.get_tran_mgr_n_tran() as usize; 33 | let nobj_bkt = *conf.get_tran_mgr_n_obj_buckets() as usize; 34 | let nobj_lock = *conf.get_tran_mgr_n_obj_lock() as usize; 35 | 36 | let mut trn_set = Vec::with_capacity(nbkt); 37 | for _ in 0..nbkt { 38 | trn_set.push((Mutex::new(HashSet::with_capacity(ntran)), Condvar::new())); 39 | } 40 | 41 | let mut obj_locks = Vec::with_capacity(nobj_bkt); 42 | for _ in 0..nobj_bkt { 43 | obj_locks.push((Mutex::new(HashMap::with_capacity(nobj_lock)), Condvar::new())); 44 | } 45 | 46 | Ok(TranMgr { 47 | tsn: Sequence::new(1), 48 | nbkt, 49 | nobj_bkt, 50 | trn_set: Arc::new(trn_set), 51 | obj_locks: Arc::new(obj_locks), 52 | }) 53 | } 54 | 55 | /// Set initial tsn. 56 | pub fn set_tsn(&self, tsn: u64) { 57 | self.tsn.set(tsn); 58 | } 59 | 60 | /// Get current tsn. 61 | pub fn get_tsn(&self) -> u64 { 62 | self.tsn.get_cur() 63 | } 64 | 65 | /// Register a new transaction and return its tsn. 66 | pub fn start_tran(&self) -> u64 { 67 | let tsn = self.tsn.get_next(); 68 | let b = (tsn % self.nbkt as u64) as usize; 69 | let (lock, _) = &self.trn_set[b]; 70 | let mut hm = lock.lock().unwrap(); 71 | hm.insert(tsn); 72 | tsn 73 | } 74 | 75 | /// Unregister transaction with specified tsn. 76 | pub fn delete_tran(&self, tsn: u64) { 77 | let b = (tsn % self.nbkt as u64) as usize; 78 | let (lock, cvar) = &self.trn_set[b]; 79 | let mut hm = lock.lock().unwrap(); 80 | hm.remove(&tsn); 81 | cvar.notify_all(); 82 | } 83 | 84 | /// Lock an object with a certain object id in the transaction with the specified tsn. 85 | pub fn lock_object<'a>(&'a self, tsn: u64, obj_id: &'a ObjectId) -> ObjectLockGuard<'a> { 86 | let b = obj_id.obj_bkt(self.nobj_bkt); 87 | let (lock, cvar) = &self.obj_locks[b]; 88 | let mut hm = lock.lock().unwrap(); 89 | 90 | if let Some(t) = hm.get(obj_id) { 91 | if *t != tsn { 92 | while hm.contains_key(obj_id) { 93 | hm = cvar.wait(hm).unwrap(); 94 | } 95 | } 96 | } 97 | 98 | hm.insert(*obj_id, tsn); 99 | 100 | ObjectLockGuard { 101 | obj_id, 102 | trman: &self, 103 | } 104 | } 105 | 106 | /// Wait for transaction with specified tsn and timeout in milliseconds to finish. 107 | /// In case of timeout returns false, otherwise true. 
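///
/// A minimal usage sketch (illustrative only; `tran_mgr` and the timeout value are arbitrary):
///
/// ```ignore
/// let tsn = tran_mgr.start_tran();     // a writer registers a transaction
/// // ... the writer does its work, possibly on another thread ...
/// tran_mgr.delete_tran(tsn);           // the writer finishes
///
/// // a concurrent reader can block until that transaction is gone, or give up after 500 ms:
/// let finished = tran_mgr.wait_for(tsn, 500);
/// ```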
108 | pub fn wait_for(&self, tsn: u64, timeout: i64) -> bool { 109 | let b = (tsn % self.nbkt as u64) as usize; 110 | let (lock, cvar) = &self.trn_set[b]; 111 | let mut hm = lock.lock().unwrap(); 112 | if timeout >= 0 { 113 | while hm.contains(&tsn) { 114 | let (h, w) = cvar.wait_timeout(hm, Duration::from_millis(timeout as u64)).unwrap(); 115 | if w.timed_out() { 116 | return false; 117 | } 118 | hm = h; 119 | } 120 | } else { 121 | while hm.contains(&tsn) { 122 | let (h, _w) = cvar.wait_timeout(hm, Duration::from_millis(1000u64)).unwrap(); 123 | hm = h; 124 | } 125 | } 126 | return true; 127 | } 128 | 129 | fn unlock_object(&self, obj_id: &ObjectId) { 130 | let b = obj_id.obj_bkt(self.nobj_bkt); 131 | let (lock, cvar) = &self.obj_locks[b]; 132 | let mut hm = lock.lock().unwrap(); 133 | 134 | hm.remove(obj_id); 135 | 136 | cvar.notify_one(); 137 | } 138 | } 139 | 140 | 141 | pub struct ObjectLockGuard<'a> { 142 | obj_id: &'a ObjectId, 143 | trman: &'a TranMgr, 144 | } 145 | 146 | impl<'a> Drop for ObjectLockGuard<'a> { 147 | 148 | fn drop(&mut self) { 149 | self.trman.unlock_object(self.obj_id); 150 | } 151 | } 152 | 153 | 154 | #[cfg(test)] 155 | mod tests { 156 | 157 | use super::*; 158 | 159 | #[test] 160 | fn test_tran_mgr() { 161 | 162 | let conf = ConfigMt::new(); 163 | let c = conf.get_conf(); 164 | drop(c); 165 | 166 | let tm = TranMgr::new(conf).expect("Failed to create transaction manager"); 167 | 168 | let tsn = 1; 169 | let obj = ObjectId::init(1,1,1,1); 170 | 171 | tm.set_tsn(tsn); 172 | 173 | let tsn = tm.start_tran(); 174 | let lock = tm.lock_object(tsn, &obj); 175 | assert!(!tm.wait_for(tsn, 100)); 176 | drop(lock); 177 | tm.delete_tran(tsn); 178 | 179 | let tsn = tm.start_tran(); 180 | let _lock = tm.lock_object(tsn, &obj); 181 | tm.delete_tran(tsn); 182 | 183 | assert!(tm.wait_for(tsn, 100)); 184 | } 185 | } 186 | -------------------------------------------------------------------------------- /src/storage/driver.rs: -------------------------------------------------------------------------------- 1 | /// Interface for storage 2 | 3 | use crate::common::errors::Error; 4 | use crate::common::defs::ObjectId; 5 | use crate::common::defs::Vector; 6 | use crate::common::defs::SeekFrom; 7 | use crate::common::defs::SharedSequences; 8 | use crate::storage::block_driver::BlockStorageDriver; 9 | use crate::storage::block_driver::BlockStorageSharedState; 10 | use crate::storage::block_driver::Cursor; 11 | use crate::storage::block_driver::ReplayState; 12 | use crate::system::config::ConfigMt; 13 | use crate::storage::datastore::FileType; 14 | 15 | 16 | /// Shared state represents the parts that can be sent between threads safely, because StorageDriver 17 | /// itself may not be sendable directly. 18 | pub struct StorageDriverSharedState { 19 | ss: BlockStorageSharedState, 20 | } 21 | 22 | 23 | /// Handle for accessing objects being written or read. 24 | pub struct Handle { 25 | cursor: Cursor, 26 | } 27 | 28 | /// Handle for replaying object changes. 29 | pub struct ReplayHandle { 30 | rs: ReplayState, 31 | } 32 | 33 | impl ReplayHandle { 34 | pub fn update(&mut self, v: &Vector, tsn: u64, csn: u64) { 35 | self.rs.update(v, tsn, csn); 36 | } 37 | } 38 | 39 | 40 | /// Storage driver is an abstraction providing access to data storage. 41 | /// It serves as a wrapper for the actual implementation, e.g. a block store. 
42 | pub struct StorageDriver { 43 | driver: BlockStorageDriver, 44 | } 45 | 46 | 47 | impl<'b> StorageDriver { 48 | 49 | pub fn new(conf: ConfigMt, csns: SharedSequences) -> Result { 50 | 51 | let driver = BlockStorageDriver::new(conf.clone(), csns.clone())?; 52 | Ok(StorageDriver{ 53 | driver, 54 | }) 55 | } 56 | 57 | /// Build instance of storage driver using shared state. 58 | pub fn from_shared_state(ss: StorageDriverSharedState) -> Result { 59 | Ok(StorageDriver { 60 | driver: BlockStorageDriver::from_shared_state(ss.ss)?, 61 | }) 62 | } 63 | 64 | pub fn get_shared_state(&self) -> Result { 65 | Ok(StorageDriverSharedState { 66 | ss: self.driver.get_shared_state()?, 67 | }) 68 | } 69 | 70 | /// Return tsn if object is currently being locked / written by other transaction. 71 | pub fn is_locked(&self, obj_id: &ObjectId) -> Result, Error> { 72 | self.driver.is_locked(obj_id) 73 | } 74 | 75 | /// Create a new object. 76 | pub fn create(&self, file_id: u16, tsn: u64, csn: u64, initial_size: usize) -> Result<(ObjectId, Handle), Error> { 77 | let (o, c) = self.driver.create(file_id, tsn, csn, initial_size)?; 78 | Ok((o, Handle {cursor: c})) 79 | } 80 | 81 | /// Delete existing object. 82 | pub fn delete(&self, obj_id: &ObjectId, tsn: u64, csn: u64) -> Result { 83 | self.driver.delete(obj_id, tsn, csn) 84 | } 85 | 86 | /// Begin writing to an existing object. 87 | pub fn begin_write(&self, obj_id: &ObjectId, tsn: u64, csn: u64) -> Result { 88 | let cursor = self.driver.begin_write(obj_id, tsn, csn)?; 89 | Ok(Handle { 90 | cursor, 91 | }) 92 | } 93 | 94 | /// Begin reading an existing object. 95 | pub fn begin_read(&self, obj_id: &ObjectId, tsn: u64, csn: u64) -> Result { 96 | let cursor = self.driver.begin_read(obj_id, tsn, csn)?; 97 | Ok(Handle { 98 | cursor, 99 | }) 100 | } 101 | 102 | /// Read data from an object opened for read. 103 | pub fn read(&self, h: &mut Handle, buf: &mut [u8]) -> Result { 104 | self.driver.read(&mut h.cursor, buf) 105 | } 106 | 107 | /// Write data to object opened for write. 108 | pub fn write(&'b self, h: &'b mut Handle, data: &[u8]) -> Result<(Vector, usize, u64), Error> { 109 | self.driver.write(&mut h.cursor, data) 110 | } 111 | 112 | /// seek to a certain position in an opened object. 113 | pub fn seek(&self, h: &mut Handle, from: SeekFrom, pos: u64, obj_id: &ObjectId) -> Result { 114 | self.driver.seek(&mut h.cursor, from, pos, obj_id) 115 | } 116 | 117 | /// Rollback changes made by a transaction. 118 | pub fn rollback_transaction(&self, tsn: u64) -> Result<(), Error> { 119 | self.driver.rollback_transaction(tsn) 120 | } 121 | 122 | /// Perform checkpoint. See concrete implementation for details. 123 | pub fn checkpoint(&self, checkpoint_csn: u64) -> Result<(), Error> { 124 | self.driver.checkpoint(checkpoint_csn) 125 | } 126 | 127 | /// Restore state of storage to as of last checkpoint. 128 | pub fn restore_checkpoint(&self, checkpoint_csn: u64) -> Result<(), Error> { 129 | self.driver.restore_checkpoint(checkpoint_csn) 130 | } 131 | 132 | /// Perform transaction finalization in the storage regardless of commit or rollback. 133 | pub fn finish_tran(&self, tsn: u64) { 134 | self.driver.finish_tran(tsn) 135 | } 136 | 137 | pub fn terminate(self) { 138 | self.driver.terminate(); 139 | } 140 | 141 | /// Add a new file to datastore. 
142 | pub fn add_datafile(&self, file_type: FileType, extent_size: u16, extent_num: u16, max_extent_num: u16) -> Result { 143 | self.driver.add_datafile(file_type, extent_size, extent_num, max_extent_num) 144 | } 145 | 146 | /// Begin replay of an object. 147 | pub fn begin_replay(&self, obj: &ObjectId, entry_pos: u16, tsn: u64, csn: u64) -> ReplayHandle { 148 | ReplayHandle { 149 | rs: ReplayState::new(obj, entry_pos, tsn, csn) 150 | } 151 | } 152 | 153 | /// Replay changes from the log. 154 | pub fn replay(&self, rh: &mut ReplayHandle, data: &[u8]) -> Result<(), Error> { 155 | self.driver.replay(&mut rh.rs, data) 156 | } 157 | } 158 | 159 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | //! Minimal schema-less database management system with ACID guarantees. 2 | //! 3 | //! # Examples 4 | //! 5 | //! ``` 6 | //! use db_core::instance::Instance; 7 | //! use db_core::FileState; 8 | //! use db_core::FileType; 9 | //! use db_core::FileDesc; 10 | //! use db_core::instance::Read; 11 | //! use db_core::instance::Write; 12 | //! use db_core::config::ConfigMt; 13 | //! use std::path::Path; 14 | //! 15 | //! 16 | //! // 17 | //! // Database initialization (this is a one-time action). 18 | //! // 19 | //! 20 | //! 21 | //! // Block size for the database. 22 | //! let block_size = 8192; 23 | //! 24 | //! 25 | //! // Create an empty directory for the database files. 26 | //! let dspath = "/tmp/db-core-test-db"; 27 | //! if Path::new(dspath).exists() { 28 | //! std::fs::remove_dir_all(dspath).expect("Failed to delete test dir on cleanup"); 29 | //! } 30 | //! std::fs::create_dir(dspath).expect("Failed to create test dir"); 31 | //! 32 | //! // Transaction log directory. 33 | //! let log_dir = "/tmp/db-core-test-tranlog"; 34 | //! if Path::new(log_dir).exists() { 35 | //! std::fs::remove_dir_all(log_dir).expect("Failed to delete test dir on cleanup"); 36 | //! } 37 | //! std::fs::create_dir(log_dir).expect("Failed to create test dir"); 38 | //! 39 | //! 40 | //! // Define initial database files. There should be at least one file of each type (data store, 41 | //! // versioning store, checkpoint store). 42 | //! let mut fdset = vec![]; 43 | //! let desc1 = FileDesc { 44 | //! state: FileState::InUse, 45 | //! file_id: 3, 46 | //! extent_size: 16, 47 | //! extent_num: 3, 48 | //! max_extent_num: 65500, 49 | //! file_type: FileType::DataStoreFile, 50 | //! }; 51 | //! let desc2 = FileDesc { 52 | //! state: FileState::InUse, 53 | //! file_id: 4, 54 | //! extent_size: 10, 55 | //! extent_num: 3, 56 | //! max_extent_num: 65500, 57 | //! file_type: FileType::VersioningStoreFile, 58 | //! }; 59 | //! let desc3 = FileDesc { 60 | //! state: FileState::InUse, 61 | //! file_id: 5, 62 | //! extent_size: 10, 63 | //! extent_num: 3, 64 | //! max_extent_num: 65500, 65 | //! file_type: FileType::CheckpointStoreFile, 66 | //! }; 67 | //! 68 | //! fdset.push(desc1); 69 | //! fdset.push(desc2); 70 | //! fdset.push(desc3); 71 | //! 72 | //! // Create a database. 73 | //! Instance::initialize_datastore(dspath, block_size, &fdset).expect("Failed to init datastore"); 74 | //! 75 | //! 76 | //! // 77 | //! // Startup and termination, transaction management, read and write data. 78 | //! // 79 | //! 80 | //! 81 | //! // Some random data. 82 | //! let data = b"Hello, world!"; 83 | //! 84 | //! 85 | //! // Prepare configuration. 86 | //! let conf = ConfigMt::new(); 87 | //! 
let mut c = conf.get_conf(); 88 | //! c.set_log_dir(log_dir.to_owned()); 89 | //! c.set_datastore_path(dspath.to_owned()); 90 | //! drop(c); 91 | //! 92 | //! // Start instance and open existing database. 93 | //! let instance = Instance::new(conf.clone()).expect("Failed to create instance"); 94 | //! 95 | //! // Begin transaction. 96 | //! let mut trn = instance.begin_transaction().expect("Failed to begin transaction"); 97 | //! 98 | //! // Create a new object. 99 | //! let file_id = 3; 100 | //! let mut obj = instance.open_create(file_id, &mut trn, data.len()).expect("Failed to create object"); 101 | //! let obj_id = obj.get_id(); 102 | //! 103 | //! // Write some data. 104 | //! obj.write_next(data).expect("Failed to write"); 105 | //! drop(obj); 106 | //! 107 | //! // Commit transaction. 108 | //! instance.commit(trn).expect("Failed to commit"); 109 | //! 110 | //! // Begin transaction. 111 | //! let mut trn = instance.begin_transaction().expect("Failed to begin transaction"); 112 | //! 113 | //! // Open object for reading and read some data. 114 | //! let mut obj = instance.open_read(&obj_id, &trn).expect("Failed to open for reading"); 115 | //! let mut read_buf = vec![0u8;data.len()]; 116 | //! let mut read = 0; 117 | //! let len = read_buf.len(); 118 | //! while read < len { 119 | //! let r = obj.read_next(&mut read_buf[read..len]).expect("Failed to read"); 120 | //! if r == 0 {break;} 121 | //! read += r; 122 | //! } 123 | //! assert_eq!(read_buf, data); 124 | //! drop(obj); 125 | //! 126 | //! // Delete object (if object is in use wait for other transaction to finish for 1 second). 127 | //! let wait_lock_ms = 1000; 128 | //! instance.delete(&obj_id, &mut trn, wait_lock_ms).expect("Failed to delete object"); 129 | //! 130 | //! // Rollback transaction. 131 | //! instance.rollback(trn).expect("Failed to rollback"); 132 | //! 133 | //! // Spawn another instance in a different thread. 134 | //! let ss = instance.get_shared_state().expect("Failed to get shared state"); 135 | //! 136 | //! let th = std::thread::spawn(move || { 137 | //! let instance2 = Instance::from_shared_state(ss).expect("Failed to create instance"); 138 | //! // ... 139 | //! instance2.terminate(); 140 | //! }); 141 | //! 142 | //! // Add a database file. 143 | //! let new_file_id = instance.add_datafile(FileType::DataStoreFile, 1000, 10, 1000).expect("Failed to add data file"); 144 | //! 145 | //! // Terminate instance. 146 | //! instance.terminate(); 147 | //! 148 | //! ``` 149 | //! 150 | //! ### Notes. 151 | //! 152 | //! 1. Block size is defined at the moment of database creation and can't be changed later. 153 | //! 2. Each `file_id` must be unique. `file_id` 0, 1, and 2 are reserved and can't be used. 154 | //! 
155 | 156 | mod buf_mgr; 157 | mod common; 158 | mod log_mgr; 159 | mod storage; 160 | mod system; 161 | mod tran_mgr; 162 | mod block_mgr; 163 | 164 | pub use system::config; 165 | pub use system::instance; 166 | pub use common::errors; 167 | 168 | pub use common::defs::ObjectId; 169 | pub use common::defs::SeekFrom; 170 | pub use storage::datastore::FileState; 171 | pub use storage::datastore::FileType; 172 | pub use storage::datastore::FileDesc; 173 | 174 | -------------------------------------------------------------------------------- /src/log_mgr/log_mgr.rs: -------------------------------------------------------------------------------- 1 | /// Transaction log management 2 | 3 | use crate::common::errors::Error; 4 | use crate::common::defs::ObjectId; 5 | use crate::common::defs::Vector; 6 | use crate::common::defs::Sequence; 7 | use crate::system::config::ConfigMt; 8 | use crate::log_mgr::fs::BufferedFileStream; 9 | use crate::log_mgr::fs::FileStream; 10 | use crate::log_mgr::fs::FileOps; 11 | use crate::log_mgr::io::LogWriter; 12 | 13 | pub use crate::log_mgr::io::LogReader; 14 | pub use crate::log_mgr::io::LogRecordHeader; 15 | pub use crate::log_mgr::io::RecType; 16 | 17 | 18 | #[derive(Clone)] 19 | pub struct LogMgr { 20 | writer: LogWriter, 21 | log_dir: String, 22 | starting_csn: u64, 23 | latest_commit_csn: u64, 24 | } 25 | 26 | impl LogMgr { 27 | 28 | pub fn new(conf: ConfigMt) -> Result { 29 | let conf = conf.get_conf(); 30 | let log_dir = conf.get_log_dir().to_owned(); 31 | let max_log_file_size = *conf.get_max_log_file_size(); 32 | let buf_sz = *conf.get_log_writer_buf_size(); 33 | 34 | let file_id = FileOps::init_file_logging(&log_dir, max_log_file_size)?; 35 | 36 | let fs = FileStream::new(log_dir.clone(), max_log_file_size, file_id, 4, false, true)?; 37 | let mut lr = LogReader::new(fs)?; 38 | let (start_pos, lsn, starting_csn, latest_commit_csn) = lr.find_write_position()?; 39 | drop(lr); 40 | 41 | let bfs = BufferedFileStream::new(log_dir.clone(), max_log_file_size, buf_sz as usize, file_id, start_pos)?; 42 | 43 | let lsn = Sequence::new(lsn); 44 | 45 | let writer = LogWriter::new(bfs, lsn)?; 46 | 47 | Ok(LogMgr { 48 | writer, 49 | log_dir, 50 | starting_csn, 51 | latest_commit_csn, 52 | }) 53 | } 54 | 55 | pub fn write_data(&self, csn: u64, checkpoint_csn: u64, tsn: u64, obj_id: &ObjectId, vector: &mut Vector, data: &[u8]) -> Result<(), Error> { 56 | self.writer.write_data(csn, checkpoint_csn, tsn, obj_id, vector, data) 57 | } 58 | 59 | pub fn write_commit(&self, csn: u64, tsn: u64) -> Result<(), Error> { 60 | self.writer.write_commit(csn, tsn) 61 | } 62 | 63 | pub fn write_rollback(&self, csn: u64, tsn: u64) -> Result<(), Error> { 64 | self.writer.write_rollback(csn, tsn) 65 | } 66 | 67 | pub fn write_checkpoint_begin(&self, checkpoint_csn: u64, latest_commit_csn: u64) -> Result<(), Error> { 68 | self.writer.write_checkpoint_begin(checkpoint_csn, latest_commit_csn) 69 | } 70 | 71 | pub fn write_checkpoint_completed(&self, checkpoint_csn: u64, latest_commit_csn: u64, current_tsn: u64) -> Result<(), Error> { 72 | self.writer.write_checkpoint_completed(checkpoint_csn, latest_commit_csn, current_tsn) 73 | } 74 | 75 | pub fn write_delete(&self, csn: u64, checkpoint_csn: u64, tsn: u64, obj_id: &ObjectId) -> Result<(), Error> { 76 | self.writer.write_delete(csn, checkpoint_csn, tsn, obj_id) 77 | } 78 | 79 | pub fn get_reader(&self) -> Result { 80 | let file_id = FileOps::find_latest_log_file(&self.log_dir)?; 81 | let fs = FileStream::new(self.log_dir.clone(), 0, file_id, 
4, false, true)?; 82 | Ok(LogReader::new(fs)?) 83 | } 84 | 85 | pub fn starting_csn(&self) -> u64 { 86 | self.starting_csn 87 | } 88 | 89 | pub fn latest_commit_csn(&self) -> u64 { 90 | self.latest_commit_csn 91 | } 92 | 93 | pub fn terminate(self) { 94 | self.writer.terminate(); 95 | } 96 | } 97 | 98 | #[cfg(test)] 99 | mod tests { 100 | 101 | use super::*; 102 | use crate::common::defs::BlockId; 103 | use std::path::Path; 104 | 105 | #[test] 106 | fn test_log_mgr() { 107 | let log_dir = "/tmp/test_log_mgr_34566576"; 108 | 109 | if Path::new(log_dir).exists() { 110 | std::fs::remove_dir_all(log_dir).expect("Failed to delete test dir on cleanup"); 111 | } 112 | std::fs::create_dir(log_dir).expect("Failed to create test dir"); 113 | 114 | let conf = ConfigMt::new(); 115 | let mut c = conf.get_conf(); 116 | c.set_log_dir(log_dir.to_owned()); 117 | drop(c); 118 | 119 | let lm = LogMgr::new(conf.clone()).expect("Failed to create log mgr"); 120 | 121 | let csn = 123; 122 | let checkpoint_csn = 124; 123 | let tsn = 125; 124 | let current_tsn = 234; 125 | let obj_id = ObjectId::init(100,101,102,103); 126 | let mut vec = Vector::init(BlockId::init(1,1,1),1,1); 127 | let data = [0,1,2,3,4,5,6,7,8,9]; 128 | let latest_commit_csn = 126; 129 | 130 | lm.write_data(csn, checkpoint_csn-1, tsn, &obj_id, &mut vec, &data).expect("Failed to write data"); 131 | lm.write_commit(csn+1, tsn+1).expect("Failed to write commit"); 132 | lm.write_rollback(csn+2, tsn+2).expect("Failed to write rollback"); 133 | lm.write_checkpoint_begin(checkpoint_csn, latest_commit_csn).expect("Failed to write checkpoint csn"); 134 | lm.write_checkpoint_completed(checkpoint_csn, latest_commit_csn+1, current_tsn).expect("Failed to write checkpoint completed"); 135 | lm.write_delete(csn+3, checkpoint_csn, tsn+3, &obj_id).expect("Failed to delete"); 136 | lm.write_data(csn+4, checkpoint_csn, tsn+4, &obj_id, &mut vec, &data).expect("Failed to write data"); 137 | lm.write_commit(csn+5, tsn+5).expect("Failed to write commit"); 138 | lm.write_data(csn+6, checkpoint_csn, tsn+6, &obj_id, &mut vec, &data).expect("Failed to write data"); 139 | 140 | lm.terminate(); 141 | 142 | let lm = LogMgr::new(conf.clone()).expect("Failed to create log mgr"); 143 | 144 | let starting_csn = lm.starting_csn(); 145 | assert_eq!(starting_csn, csn+6); 146 | 147 | let latest_commit_csn = lm.latest_commit_csn(); 148 | assert_eq!(latest_commit_csn, csn+5); 149 | 150 | 151 | let mut lr = lm.get_reader().expect("Failed to get log reader"); 152 | let (start_pos, lsn, starting_csn, latest_commit_csn) = lr.find_write_position().expect("Failed to find write position"); 153 | assert_eq!(start_pos, 457); 154 | assert_eq!(lsn, 10); 155 | assert_eq!(starting_csn, csn+6); 156 | assert_eq!(latest_commit_csn, csn+5); 157 | 158 | let mut lr = lm.get_reader().expect("Failed to get log reader"); 159 | let (ccsn, ctsn) = lr.seek_to_latest_checkpoint().expect("Failed to get latest checkpoint").unwrap(); 160 | assert_eq!(ccsn, checkpoint_csn); 161 | assert_eq!(ctsn, current_tsn); 162 | let _lrh = lr.read_next().expect("Failed to get latest checkpoint"); 163 | let _lrh = lr.read_next().expect("Failed to get latest checkpoint"); 164 | let obj = lr.get_object_id(); 165 | assert_eq!(obj, obj_id); 166 | let vec2 = lr.get_vector(); 167 | assert_eq!(vec2.obj_id(), vec.obj_id()); 168 | assert_eq!(vec2.entry_pos(), vec.entry_pos()); 169 | let data1 = lr.get_data(); 170 | assert_eq!(data1, data); 171 | 172 | lm.terminate(); 173 | } 174 | } 175 | 
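The unit test above exercises the whole log manager API at once; the core write path for a single transaction is narrower. A crate-internal sketch (imports mirror the ones at the top of this file; the sequence numbers are hard-coded for illustration only, in the engine they come from shared sequences, and the csn passed to the commit record is an assumption):

```rust
use crate::common::defs::{BlockId, ObjectId, Vector};
use crate::common::errors::Error;
use crate::log_mgr::log_mgr::LogMgr;

fn log_one_change(lm: &LogMgr) -> Result<(), Error> {
    let obj_id = ObjectId::init(100, 101, 102, 103);
    let mut vec = Vector::init(BlockId::init(3, 1, 1), 1, 1);

    let tsn = 1;            // transaction sequence number
    let csn = 1;            // change sequence number of this record
    let checkpoint_csn = 1; // checkpoint the change belongs to

    // One data record followed by the commit record of the same transaction.
    lm.write_data(csn, checkpoint_csn, tsn, &obj_id, &mut vec, b"payload")?;
    lm.write_commit(csn + 1, tsn)?;
    Ok(())
}
```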
-------------------------------------------------------------------------------- /src/system/config.rs: -------------------------------------------------------------------------------- 1 | //! Configuration management. 2 | 3 | 4 | use crate::common::errors::Error; 5 | use std::fs::OpenOptions; 6 | use std::io::{BufRead, BufReader}; 7 | use std::sync::Arc; 8 | use std::sync::Mutex; 9 | use std::sync::MutexGuard; 10 | use log::warn; 11 | 12 | 13 | #[derive(Clone)] 14 | /// Global configuration with multithreading support. 15 | pub struct ConfigMt { 16 | conf: Arc>, 17 | } 18 | 19 | impl ConfigMt { 20 | /// Create a new configuration instance. 21 | pub fn new() -> ConfigMt { 22 | ConfigMt { 23 | conf: Arc::new(Mutex::new(Config::new())) 24 | } 25 | } 26 | 27 | /// Lock and return configration for reading or modification. 28 | pub fn get_conf(&self) -> MutexGuard { 29 | self.conf.lock().unwrap() 30 | } 31 | } 32 | 33 | macro_rules! gen_config { 34 | ( $( $name:ident, $data_type:ty, $default_val:expr, $get_fn:ident, $set_fn:ident, $str_name:literal, $conv_fn:path ), *) => { 35 | /// Struct to access configuration values. 36 | pub struct Config { 37 | $( 38 | $name: $data_type, 39 | )* 40 | } 41 | 42 | impl Config { 43 | 44 | /// Create a new configuration instance. 45 | pub fn new() -> Config { 46 | Config { 47 | $( 48 | $name: $default_val, 49 | )* 50 | } 51 | } 52 | 53 | $( 54 | /// Get value of a parameter. 55 | pub fn $get_fn(&self) -> &$data_type { 56 | &self.$name 57 | } 58 | 59 | /// Set value of a parameter. 60 | pub fn $set_fn(&mut self, $name: $data_type) { 61 | self.$name = $name; 62 | } 63 | )* 64 | 65 | fn process_config_file_entry(&mut self, name: &str, val: &str) -> Result<(), Error> { 66 | 67 | match name { 68 | $( 69 | $str_name => { self.$name = $conv_fn(val)?; }, 70 | )* 71 | _ => warn!("Skipping unexpected config entry: {}", name) 72 | }; 73 | 74 | Ok(()) 75 | } 76 | } 77 | } 78 | } 79 | 80 | 81 | impl Config { 82 | 83 | /// Load configuration from a file. 
84 | pub fn load(&mut self, file_path: &str) -> Result<(), Error> { 85 | let f = BufReader::new(OpenOptions::new() 86 | .create(false) 87 | .write(false) 88 | .read(true) 89 | .truncate(false) 90 | .open(file_path)?); 91 | 92 | for line in f.lines() { 93 | if let Ok(line) = line { 94 | if let Ok((name, val)) = Self::process_config_file_line(&line) { 95 | self.process_config_file_entry(name, val)?; 96 | } 97 | } 98 | } 99 | 100 | Ok(()) 101 | } 102 | 103 | fn process_config_file_line<'a>(s: &'a str) -> Result<(&'a str, &'a str), ()> { 104 | let line = s.as_bytes(); 105 | let mut p = 0; 106 | 107 | // skip space 108 | while line[p] == b' ' || line[p] == b'\t' { p += 1; }; 109 | 110 | // check for comment line 111 | if line[p] == b'#' { return Err(()) } 112 | 113 | // read 'name' part 114 | let p1 = p; 115 | while (line[p] >= b'a' && line[p] <= b'z') 116 | || (line[p] >= b'A' && line[p] <= b'Z') 117 | || (line[p] >= b'0' && line[p] <= b'9') 118 | || line[p] == b'_' || line[p] == b'-' 119 | { 120 | p+= 1; 121 | } 122 | if p == p1 { return Err(()) } 123 | let p2 = p; 124 | 125 | // read '=' 126 | while line[p] == b' ' || line[p] == b'\t' { p += 1; }; 127 | if line[p] != b'=' { return Err(()) } 128 | while line[p] == b' ' || line[p] == b'\t' { p += 1; }; 129 | 130 | // return 'name' and 'val' 131 | Ok((&s[p1..p2], &s[p..])) 132 | } 133 | 134 | fn load_string(value: &str) -> Result { 135 | Ok(String::from(value)) 136 | } 137 | 138 | fn load_u32_val(value: &str) -> Result { 139 | let ret = str::parse::(value)?; 140 | Ok(ret) 141 | } 142 | 143 | fn load_u64_val(value: &str) -> Result { 144 | let ret = str::parse::(value)?; 145 | Ok(ret) 146 | } 147 | } 148 | 149 | gen_config![log_dir, String, "trnlog".to_owned(), get_log_dir, set_log_dir, "log_dir", Config::load_string, 150 | datastore_path, String, ".".to_owned(), get_datastore_path, set_datastore_path, "datastore_path", Config::load_string, 151 | max_log_file_size, u32, 10*1024*1024, get_max_log_file_size, set_max_log_file_size, "max_log_file_size", Config::load_u32_val, 152 | log_writer_buf_size, u32, 1048576, get_log_writer_buf_size, set_log_writer_buf_size, "log_writer_buf_size", Config::load_u32_val, 153 | tran_mgr_n_buckets, u32, 128, get_tran_mgr_n_buckets, set_tran_mgr_n_buckets, "tran_mgr_n_buckets", Config::load_u32_val, 154 | tran_mgr_n_tran, u32, 1024, get_tran_mgr_n_tran, set_tran_mgr_n_tran, "tran_mgr_n_tran", Config::load_u32_val, 155 | tran_mgr_n_obj_buckets, u32, 128, get_tran_mgr_n_obj_buckets, set_tran_mgr_n_obj_buckets, "tran_mgr_n_obj_buckets", Config::load_u32_val, 156 | tran_mgr_n_obj_lock, u32, 1024, get_tran_mgr_n_obj_lock, set_tran_mgr_n_obj_lock, "tran_mgr_n_obj_lock", Config::load_u32_val, 157 | block_mgr_n_lock, u32, 1024, get_block_mgr_n_lock, set_block_mgr_n_lock, "block_mgr_n_lock", Config::load_u32_val, 158 | free_info_n_file_lock, u32, 16, get_free_info_n_file_lock, set_free_info_n_file_lock, "free_info_n_file_lock", Config::load_u32_val, 159 | free_info_n_extent_lock, u32, 128, get_free_info_n_extent_lock, set_free_info_n_extent_lock, "free_info_n_extent_lock", Config::load_u32_val, 160 | block_buf_size, u64, 32*1024*1024, get_block_buf_size, set_block_buf_size, "block_buf_size", Config::load_u64_val, 161 | checkpoint_data_threshold, u64, 10*1024*1024, get_checkpoint_data_threshold, set_checkpoint_data_threshold, "checkpoint_data_threshold", Config::load_u64_val, 162 | version_retain_time, u32, 3600, get_version_retain_time, set_version_retain_time, "version_retain_time", Config::load_u32_val, 163 | 
block_fill_ratio, u32, 80, get_block_fill_ratio, set_block_fill_ratio, "block_fill_ratio", Config::load_u32_val, 164 | writer_num, u32, 2, get_writer_num, set_writer_num, "writer_num", Config::load_u32_val]; 165 | 166 | 167 | 168 | 169 | -------------------------------------------------------------------------------- /src/common/intercom.rs: -------------------------------------------------------------------------------- 1 | /// Interthread communication 2 | 3 | use std::sync::{Arc, Mutex, Condvar, MutexGuard}; 4 | use std::sync::atomic::AtomicBool; 5 | use std::sync::atomic::AtomicU64; 6 | use std::sync::atomic::Ordering; 7 | use std::time::Duration; 8 | use std::ops::Deref; 9 | use std::ops::DerefMut; 10 | use std::cell::UnsafeCell; 11 | 12 | 13 | 14 | /// Event notification. 15 | #[derive(Clone)] 16 | pub struct SyncNotification { 17 | pair: Arc<(Mutex, Condvar)>, 18 | } 19 | 20 | 21 | impl SyncNotification { 22 | pub fn new(initial: T) -> SyncNotification { 23 | let pair = Arc::new((Mutex::new(initial), Condvar::new())); 24 | SyncNotification { 25 | pair 26 | } 27 | } 28 | 29 | /// Wait while check_cond returns true. 30 | /// Return associated value when check_cond returned false. 31 | pub fn wait_for(&self, check_cond: &mut (dyn FnMut(&T) -> bool)) -> MutexGuard 32 | where T: PartialEq 33 | { 34 | let (lock, cvar) = &(*self.pair); 35 | let mut lock_val = lock.lock().unwrap(); 36 | while check_cond(&(*lock_val)) { 37 | lock_val = cvar.wait(lock_val).unwrap(); 38 | } 39 | lock_val 40 | } 41 | 42 | /// Set associated value to val and notify all or just one thread depending on notify_all 43 | /// value. 44 | pub fn send(&self, val: T, notify_all: bool) { 45 | let (lock, cvar) = &(*self.pair); 46 | let mut lock_val = lock.lock().unwrap(); 47 | *lock_val = val; 48 | if notify_all { 49 | cvar.notify_all(); 50 | } else { 51 | cvar.notify_one(); 52 | } 53 | } 54 | 55 | /// Wait while check_cond returns true and check for interrupt_cond periodically with interval 56 | /// specified by timeout. 57 | /// Return associated value or None if interrupt_cond returned true. 
58 | pub fn wait_for_interruptable(&self, 59 | check_cond: &mut (dyn FnMut(&T) -> bool), 60 | interrupt_cond: &mut (dyn FnMut() -> bool), 61 | timeout: Duration 62 | ) -> Option> where T: PartialEq 63 | { 64 | let (lock, cvar) = &(*self.pair); 65 | let mut lock_val = lock.lock().unwrap(); 66 | while check_cond(&(*lock_val)) { 67 | lock_val = cvar.wait_timeout(lock_val, timeout).unwrap().0; 68 | 69 | if interrupt_cond() { 70 | return None; 71 | } 72 | } 73 | Some(lock_val) 74 | } 75 | /* 76 | pub fn notify_all(&self) { 77 | let (_, cvar) = &(*self.pair); 78 | cvar.notify_all(); 79 | } 80 | 81 | pub fn notify_one(&self) { 82 | let (_, cvar) = &(*self.pair); 83 | cvar.notify_one(); 84 | } 85 | */ 86 | 87 | } 88 | 89 | /* 90 | /// Lightweight lock 91 | pub struct LockLw { 92 | lock: AtomicBool, 93 | val: UnsafeCell, 94 | } 95 | 96 | impl LockLw { 97 | 98 | pub fn new(val: T) -> Self { 99 | LockLw { 100 | lock: AtomicBool::new(false), 101 | val: UnsafeCell::new(val), 102 | } 103 | } 104 | 105 | pub fn lock(&self) -> LockLwGuard { 106 | 107 | let mut i = 0; 108 | let cur = self.lock.load(Ordering::Relaxed); 109 | 110 | loop { 111 | 112 | let cur = self.lock.compare_and_swap(cur, true, Ordering::Relaxed); 113 | 114 | if !cur { 115 | break; 116 | } 117 | 118 | i += 1; 119 | if i > 10000 { 120 | std::thread::yield_now(); 121 | i = 0; 122 | } 123 | } 124 | 125 | std::sync::atomic::fence(Ordering::Acquire); 126 | 127 | LockLwGuard { 128 | parent: self, 129 | } 130 | } 131 | 132 | fn unlock(&self) { 133 | self.lock.store(false, Ordering::Release); 134 | } 135 | } 136 | 137 | pub struct LockLwGuard<'a, T> { 138 | parent: &'a LockLw, 139 | } 140 | 141 | impl Drop for LockLwGuard<'_, T> { 142 | fn drop(&mut self) { 143 | self.parent.unlock(); 144 | } 145 | } 146 | 147 | impl Deref for LockLwGuard<'_, T> { 148 | type Target = T; 149 | 150 | fn deref(&self) -> &T { 151 | unsafe { & *self.parent.val.get() } 152 | } 153 | } 154 | 155 | impl DerefMut for LockLwGuard<'_, T> { 156 | 157 | fn deref_mut(&mut self) -> &mut T { 158 | unsafe { &mut *self.parent.val.get() } 159 | } 160 | } 161 | */ 162 | 163 | 164 | /// Read/write lightweight lock (multiple readers and single writer) 165 | pub struct RwLockLw { 166 | wr_lock: AtomicBool, 167 | rd_lock: AtomicU64, 168 | val: UnsafeCell, 169 | } 170 | 171 | impl RwLockLw { 172 | 173 | pub fn new(val: T) -> Self { 174 | RwLockLw { 175 | wr_lock: AtomicBool::new(false), 176 | rd_lock: AtomicU64::new(0), 177 | val: UnsafeCell::new(val), 178 | } 179 | } 180 | 181 | pub fn read_lock(&self) -> ReadLockLwGuard { 182 | 183 | self.wr_lock(); 184 | 185 | self.rd_lock(); 186 | 187 | self.wr_unlock(); 188 | 189 | std::sync::atomic::fence(Ordering::Acquire); 190 | 191 | ReadLockLwGuard { 192 | parent: self, 193 | } 194 | } 195 | 196 | pub fn write_lock(&self) -> WriteLockLwGuard { 197 | 198 | self.wr_lock(); 199 | 200 | let mut i = 0; 201 | while self.rd_lock.load(Ordering::Relaxed) > 0 { 202 | core::sync::atomic::spin_loop_hint(); 203 | i += 1; 204 | if i > 10000 { 205 | std::thread::yield_now(); 206 | i = 0; 207 | } 208 | } 209 | 210 | std::sync::atomic::fence(Ordering::Acquire); 211 | 212 | WriteLockLwGuard { 213 | parent: self, 214 | } 215 | } 216 | 217 | fn wr_unlock(&self) { 218 | self.wr_lock.store(false, Ordering::Relaxed); 219 | } 220 | 221 | fn wr_lock(&self) { 222 | let mut i = 0; 223 | while let Err(_) = self.wr_lock.compare_exchange(false, true, Ordering::Relaxed, Ordering::Relaxed) { 224 | i += 1; 225 | if i > 10000 { 226 | std::thread::yield_now(); 227 | } 228 | } 
229 | } 230 | 231 | fn rd_unlock(&self) { 232 | self.rd_lock.fetch_sub(1, Ordering::Relaxed); 233 | } 234 | 235 | fn rd_lock(&self) { 236 | self.rd_lock.fetch_add(1, Ordering::Relaxed); 237 | } 238 | } 239 | 240 | 241 | /// Read-lock guard 242 | pub struct ReadLockLwGuard<'a, T> { 243 | parent: &'a RwLockLw, 244 | } 245 | 246 | impl Drop for ReadLockLwGuard<'_, T> { 247 | 248 | fn drop(&mut self) { 249 | self.parent.rd_unlock(); 250 | } 251 | } 252 | 253 | impl Deref for ReadLockLwGuard<'_, T> { 254 | 255 | type Target = T; 256 | 257 | fn deref(&self) -> &T { 258 | unsafe { & *self.parent.val.get() } 259 | } 260 | } 261 | 262 | 263 | /// Write-lock guard 264 | pub struct WriteLockLwGuard<'a, T> { 265 | parent: &'a RwLockLw, 266 | } 267 | 268 | impl Drop for WriteLockLwGuard<'_, T> { 269 | 270 | fn drop(&mut self) { 271 | self.parent.wr_unlock(); 272 | std::sync::atomic::fence(Ordering::Release); 273 | } 274 | } 275 | 276 | impl Deref for WriteLockLwGuard<'_, T> { 277 | 278 | type Target = T; 279 | 280 | fn deref(&self) -> &T { 281 | unsafe { & *self.parent.val.get() } 282 | } 283 | } 284 | 285 | impl DerefMut for WriteLockLwGuard<'_, T> { 286 | 287 | fn deref_mut(&mut self) -> &mut T { 288 | unsafe { &mut *self.parent.val.get() } 289 | } 290 | } 291 | -------------------------------------------------------------------------------- /src/system/checkpointer.rs: -------------------------------------------------------------------------------- 1 | /// Checkpoint manager initiates and controls checkpoints. 2 | /// Checkpoint procedure is the following: 3 | /// 4 | /// 1. Checkpoint sequence number is incremented. 5 | /// 2. Checkpointer thread writes record in the log about start of new checkpoint. 6 | /// 3. Checkpointer thread calls storage driver to process checkpoint with the new sequence number. 7 | /// 4. After storage driver completes checkpoint procedure checkpointer thread writes record in the 8 | /// log about completion. 9 | /// 10 | /// System will do changes replay on its start only starting form the last completed checkpoint. 
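Checkpoints are triggered by write volume rather than by a timer: write paths report how much data they have processed, and once the configured threshold is crossed a checkpoint request is queued for the background thread. A minimal usage sketch against the public methods defined below (`cp` and `bytes_written` are illustrative placeholders):

```rust
// `cp` is a Checkpointer built with Checkpointer::new(log_mgr, csns, conf, tran_mgr).
// Each call accumulates the reported size; crossing `checkpoint_data_threshold`
// queues a checkpoint request for the checkpointer thread.
cp.register_processed_data_size(bytes_written as u64);

// On shutdown: signal the background thread to stop and join it.
cp.terminate();
```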
11 | 12 | 13 | use crate::common::errors::Error; 14 | use crate::common::intercom::SyncNotification; 15 | use crate::common::defs::SharedSequences; 16 | use crate::system::config::ConfigMt; 17 | use crate::log_mgr::log_mgr::LogMgr; 18 | use crate::storage::driver::StorageDriver; 19 | use crate::tran_mgr::tran_mgr::TranMgr; 20 | use std::sync::Arc; 21 | use std::sync::atomic::AtomicBool; 22 | use std::sync::atomic::AtomicU64; 23 | use std::sync::atomic::AtomicU32; 24 | use std::sync::atomic::Ordering; 25 | use std::thread::JoinHandle; 26 | use std::time::Duration; 27 | use log::error; 28 | use log::warn; 29 | use log::info; 30 | 31 | 32 | const CONDVAR_WAIT_INTERVAL_MS: u64 = 1000; 33 | 34 | 35 | pub struct Checkpointer { 36 | checkpointer_thread: JoinHandle<()>, 37 | terminate: Arc, 38 | checkpoint_ready: SyncNotification, 39 | processed_data_threashold: u64, 40 | processed_data_size: AtomicU64, 41 | checkpoint_req_count: Arc, 42 | } 43 | 44 | impl Checkpointer { 45 | 46 | pub fn new(log_mgr: LogMgr, 47 | csns: SharedSequences, 48 | conf: ConfigMt, 49 | tran_mgr: TranMgr 50 | ) -> Result 51 | { 52 | let processed_data_threashold = *conf.get_conf().get_checkpoint_data_threshold(); 53 | let terminate = Arc::new(AtomicBool::new(false)); 54 | 55 | let checkpoint_ready = SyncNotification::new(false); 56 | let checkpoint_req_count = Arc::new(AtomicU32::new(0)); 57 | 58 | let terminate2 = terminate.clone(); 59 | let checkpoint_ready2 = checkpoint_ready.clone(); 60 | let checkpoint_req_count2 = checkpoint_req_count.clone(); 61 | 62 | let checkpointer_thread = std::thread::spawn(move || { 63 | Self::checkpointer_thread(conf, 64 | terminate2, 65 | checkpoint_ready2, 66 | log_mgr.clone(), 67 | csns.clone(), 68 | checkpoint_req_count2, 69 | tran_mgr); 70 | }); 71 | 72 | assert!(processed_data_threashold > 0); 73 | 74 | let processed_data_size = AtomicU64::new(0); 75 | 76 | Ok(Checkpointer { 77 | checkpointer_thread, 78 | terminate, 79 | checkpoint_ready, 80 | processed_data_threashold, 81 | processed_data_size, 82 | checkpoint_req_count, 83 | }) 84 | } 85 | 86 | pub fn register_processed_data_size(&self, size: u64) { 87 | let prev_size = self.processed_data_size.fetch_add(size, Ordering::Relaxed); 88 | if prev_size < self.processed_data_threashold && prev_size + size >= self.processed_data_threashold { 89 | if let Err(e) = self.initiate_checkpoint() { 90 | warn!("Failed to initiate checkpoint, error: {}", e); 91 | } 92 | self.processed_data_size.store(0, Ordering::Relaxed); 93 | } 94 | } 95 | 96 | pub fn terminate(self) { 97 | self.terminate.store(true, Ordering::Relaxed); 98 | self.checkpointer_thread.join().unwrap(); 99 | } 100 | 101 | 102 | fn initiate_checkpoint(&self) -> Result<(), Error> { 103 | 104 | let mut req_count = self.checkpoint_req_count.load(Ordering::Relaxed); 105 | 106 | while req_count < 2 { 107 | if let Err(new_req_count) = self.checkpoint_req_count.compare_exchange(req_count, req_count + 1, Ordering::Relaxed, Ordering::Relaxed) { 108 | req_count = new_req_count; 109 | } else { 110 | if req_count == 0 { 111 | self.checkpoint_ready.send(true, false); 112 | } 113 | break; 114 | } 115 | } 116 | 117 | Ok(()) 118 | } 119 | 120 | 121 | fn checkpointer_thread(conf: ConfigMt, 122 | terminate: Arc, 123 | checkpoint_ready: SyncNotification, 124 | log_mgr: LogMgr, 125 | csns: SharedSequences, 126 | checkpoint_req_count: Arc, 127 | tran_mgr: TranMgr) 128 | { 129 | match StorageDriver::new(conf, csns.clone()) { 130 | Ok(sd) => { 131 | let mut checkpoint_csn = csns.checkpoint_csn.get_cur(); 132 
| loop { 133 | if let Some(mut lock) = checkpoint_ready.wait_for_interruptable( 134 | &mut (|state| -> bool { ! *state }), 135 | &mut (|| -> bool { terminate.load(Ordering::Relaxed) }), 136 | Duration::from_millis(CONDVAR_WAIT_INTERVAL_MS) 137 | ) { 138 | *lock = false; 139 | drop(lock); 140 | 141 | let mut req_count = checkpoint_req_count.load(Ordering::Relaxed); 142 | while req_count > 0 { 143 | req_count = checkpoint_req_count.fetch_sub(1, Ordering::Relaxed); 144 | 145 | checkpoint_csn += 1; 146 | 147 | // Write to log to be sure all subsequent writes related to this checkpoint go after this record. 148 | if let Err(e) = log_mgr.write_checkpoint_begin(checkpoint_csn, csns.latest_commit_csn.load(Ordering::Relaxed)) { 149 | error!("Failed to write to log about checkpoint initiation, error: {}", e); 150 | } else { 151 | info!("Checkpoint initiated"); 152 | csns.checkpoint_csn.set(checkpoint_csn); 153 | 154 | if let Err(e) = sd.checkpoint(checkpoint_csn) { 155 | error!("Failed to perform checkpoint: {}", e); 156 | } else { 157 | if let Err(e) = log_mgr.write_checkpoint_completed(checkpoint_csn-1, 158 | csns.latest_commit_csn.load(Ordering::Relaxed), 159 | tran_mgr.get_tsn()) { 160 | error!("Failed to write to log about checkpoint completion, error: {}", e); 161 | } else { 162 | info!("Checkpoint completed successfully"); 163 | } 164 | } 165 | } 166 | } 167 | } else { 168 | break; 169 | } 170 | } 171 | }, 172 | Err(e) => { 173 | error!("Failed to initialize checkpointer thread, storage driver failure, error: {}", e); 174 | }, 175 | } 176 | } 177 | } 178 | 179 | -------------------------------------------------------------------------------- /src/common/errors.rs: -------------------------------------------------------------------------------- 1 | //! Error codes and related functions 2 | //! 3 | 4 | 5 | use std; 6 | 7 | /// Error representation. 8 | #[derive(Debug)] 9 | pub struct Error { 10 | kind: ErrorKind, 11 | repr: ErrorRepr 12 | } 13 | 14 | 15 | impl Error { 16 | 17 | fn new(kind: ErrorKind, repr: ErrorRepr) -> Error { 18 | Error { 19 | kind, 20 | repr 21 | } 22 | } 23 | 24 | /// Returns kind of error. 25 | pub fn kind(&self) -> ErrorKind { 26 | self.kind 27 | } 28 | } 29 | 30 | 31 | impl std::error::Error for Error { } 32 | 33 | 34 | macro_rules! gen_error_kinds { 35 | ( $($kind:ident, $msg:literal), *) => { 36 | #[derive(Debug, PartialEq, Copy, Clone)] 37 | /// Types of errors. 38 | pub enum ErrorKind { 39 | $( 40 | $kind, 41 | )* 42 | } 43 | 44 | impl Error { 45 | /// Returns string description of a error. 46 | pub fn str_desc(&self) -> &str { 47 | match self.kind { 48 | $( 49 | ErrorKind::$kind => $msg, 50 | )* 51 | } 52 | } 53 | } 54 | }; 55 | } 56 | 57 | 58 | macro_rules! gen_create_fun { 59 | ( $($kind:ident, $create_fun:ident), *) => { 60 | impl Error { 61 | $( 62 | /// Create an error of a given type. 63 | #[inline] 64 | pub fn $create_fun() -> Self { 65 | Self::new(ErrorKind::$kind, ErrorRepr::Simple) 66 | } 67 | )* 68 | } 69 | }; 70 | 71 | ( $($kind:ident, $create_fun:ident, $fun_arg:path), *) => { 72 | impl Error { 73 | $( 74 | /// Create an error of a given type. 75 | #[inline] 76 | pub fn $create_fun(e: $fun_arg) -> Self { 77 | Self::new(ErrorKind::$kind, ErrorRepr::$kind(e)) 78 | } 79 | )* 80 | } 81 | } 82 | } 83 | 84 | 85 | macro_rules! 
gen_error_repr { 86 | ($( $from_type:ty, $error_kind:ident, $fun_name:ident ), *) => { 87 | 88 | #[derive(Debug)] 89 | enum ErrorRepr { 90 | Simple, 91 | $( 92 | $error_kind($from_type), 93 | )* 94 | } 95 | 96 | $( 97 | impl From<$from_type> for Error { 98 | fn from(error: $from_type) -> Self { 99 | Error::new(ErrorKind::$error_kind, ErrorRepr::$error_kind(error)) 100 | } 101 | } 102 | )* 103 | 104 | impl std::fmt::Display for Error { 105 | fn fmt(&self, f: &mut std::fmt::Formatter) -> Result<(), std::fmt::Error> { 106 | write!(f, "{}", self.str_desc())?; 107 | match &self.repr { 108 | $( 109 | ErrorRepr::$error_kind(e) => write!(f, ": {}", e), 110 | )* 111 | _ => Ok(()) 112 | }?; 113 | 114 | Ok(()) 115 | } 116 | } 117 | 118 | impl Error { 119 | 120 | $( 121 | pub fn $fun_name(self) -> Option<$from_type> { 122 | match self.repr { 123 | ErrorRepr::$error_kind(e) => Some(e), 124 | _ => None 125 | } 126 | } 127 | )* 128 | } 129 | 130 | }; 131 | } 132 | 133 | 134 | gen_error_kinds![ 135 | NoError , "no error", 136 | NotImplemented , "not implemented", 137 | IoError , "io error", 138 | IncorrectAllocationSize , "incorrect allocation size", 139 | PathIsTooLong , "path is too long", 140 | Utf8ValidationError , "utf-8 validation error", 141 | AllocationFailure , "allocation failure", 142 | IncorrectLayout , "incorrect layout", 143 | ArrayIsFull , "array is full", 144 | MagicMismatch , "magic mismatch", 145 | DataFileNotInitialized , "data file was not properly initialized", 146 | SliceConversionError , "unexpected conversion failure", 147 | LockError , "lock failure", 148 | FailedToBuildPath , "failed to build path", 149 | FileIdOverflow , "overflow of file_id for data files", 150 | ExtentLimitReached , "data file extent limit reached", 151 | IncorrectExtentSize , "incorrect extent size (less than minimum size, or greater than maximum size)", 152 | IncorrectBlockSize , "incorrect block size (less than minimum size, or greater than maximum size, or not is power of two)", 153 | IncorrectExtentSettings , "initial extent number is too small or greater than the maximum allowed number of extents", 154 | LoggerIsTerminated , "logger is terminated", 155 | StringParseError , "string parse error", 156 | FileNotOpened , "file is not opened", 157 | ExtentDoesNotExist , "extent with specified id does not exist", 158 | FileDoesNotExist , "file with specified id does not exist", 159 | BlockDoesNotExist , "block with specified id does not exist", 160 | ObjectDoesNotExist , "object with specified id does not exist", 161 | ObjectIsDeleted , "object with specified id was deleted", 162 | UnexpectedCheckpoint , "unexpected checkpoint record appeared while reading transaction log", 163 | BlockChecksumMismatch , "block checksum mismatch", 164 | TimeOperationError , "internal error while performing operation on time value", 165 | DbSizeLimitReached , "configured database size limits have been reached, can't add more data", 166 | TryLockError , "lock operation was not successful", 167 | BlockCrcMismatch , "crc cehcksum failed for block", 168 | Timeout , "operations timed out", 169 | CheckpointStoreSizeLimitReached , "checkpoint store size limit reached" 170 | ]; 171 | 172 | 173 | gen_create_fun![ 174 | NoError , no_error , 175 | NotImplemented , not_implemented , 176 | IncorrectAllocationSize , incorrect_allocation_size , 177 | PathIsTooLong , path_is_too_long , 178 | AllocationFailure , allocation_failure , 179 | ArrayIsFull , array_is_full , 180 | MagicMismatch , magic_mismatch , 181 | DataFileNotInitialized , 
data_file_not_initialized , 182 | LockError , lock_error , 183 | FailedToBuildPath , failed_to_build_path , 184 | FileIdOverflow , file_id_overflow , 185 | ExtentLimitReached , extent_limit_reached , 186 | IncorrectExtentSize , incorrect_extent_size , 187 | IncorrectBlockSize , incorrect_block_size , 188 | IncorrectExtentSettings , incorrect_extent_settings , 189 | LoggerIsTerminated , logger_is_terminated , 190 | FileNotOpened , file_not_opened , 191 | ExtentDoesNotExist , extent_does_not_exist , 192 | FileDoesNotExist , file_does_not_exist , 193 | BlockDoesNotExist , block_does_not_exist , 194 | ObjectDoesNotExist , object_does_not_exist , 195 | ObjectIsDeleted , object_is_deleted , 196 | UnexpectedCheckpoint , unexpected_checkpoint , 197 | BlockChecksumMismatch , block_checksum_mismatch , 198 | DbSizeLimitReached , db_size_limit_reached , 199 | TryLockError , try_lock_error , 200 | BlockCrcMismatch , block_crc_mismatch , 201 | Timeout , timeout, 202 | CheckpointStoreSizeLimitReached , checkpoint_store_size_limit_reached 203 | ]; 204 | 205 | 206 | gen_create_fun![ 207 | IoError , io_error , std::io::Error, 208 | Utf8ValidationError , utf8_validation_error , std::str::Utf8Error, 209 | IncorrectLayout , incorrect_layout , std::alloc::LayoutErr, 210 | SliceConversionError , slice_conversion_error , std::array::TryFromSliceError, 211 | StringParseError , string_parse_error , std::num::ParseIntError, 212 | TimeOperationError , time_operation_error , std::time::SystemTimeError 213 | ]; 214 | 215 | 216 | gen_error_repr![ 217 | std::io::Error, IoError, io_err, 218 | std::str::Utf8Error, Utf8ValidationError, utf8_err, 219 | std::alloc::LayoutErr, IncorrectLayout, layout_err, 220 | std::array::TryFromSliceError, SliceConversionError, slice_err, 221 | std::num::ParseIntError, StringParseError, string_parse_err, 222 | std::time::SystemTimeError, TimeOperationError, time_operation_err 223 | ]; 224 | 225 | -------------------------------------------------------------------------------- /src/storage/checkpoint_store.rs: -------------------------------------------------------------------------------- 1 | /// Block storage for checkpoint restore. 2 | /// CheckpointStore represents part of BlockStorageDriver functionality related to checkpointing. 3 | /// Before any data block can be modifed its original state must be saved in the special block 4 | /// checkpoint store. 5 | /// When system starts it searches through the transaction log for the last checkpoint 6 | /// mark and then uses checkpoint store to restore all blocks from it. After all blocks are 7 | /// restored database represents state as of checkpoint start, and system can read the changes from 8 | /// the transaction log and apply them to database. 9 | /// Each checkpoint is identified by unique checkpoint sequence number. Checkpoint can be in 10 | /// not completed state, because when new checkpoint begins, a sequence of actions like writing all the 11 | /// previous checkpoint blocks to disk is required. As soon as all them done, the checkpoint is marked as 12 | /// completed, and corresponding record is made in the transaction log. 13 | /// System can use only completed checkpoint for restore. That means the checkpoint 14 | /// store keeps previous checkpoint's blocks until the new checkpoint is marked as completed. As soon 15 | /// as the new checkpoint is completed all blocks of the previous checkpoint are discarded and space 16 | /// can be reused by the current and future checkpoints. 
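On restart, restore amounts to walking every block image saved under the last completed checkpoint and writing it back over the live block. A simplified crate-internal sketch using the iterator defined below (imports follow the top of this file; `restore_block` is a hypothetical write-back callback, not part of this module):

```rust
fn restore_last_checkpoint(cs: &CheckpointStore, checkpoint_csn: u64) -> Result<(), Error> {
    let mut iter = cs.get_iter(checkpoint_csn)?;
    while let Some((original_id, block)) = iter.get_next()? {
        // `original_id` is the BlockId the image was taken from,
        // `block` holds the contents saved before modification.
        restore_block(&original_id, &block)?; // hypothetical write-back step
    }
    Ok(())
}
```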
17 | 18 | 19 | use crate::common::errors::Error; 20 | use crate::common::defs::BlockId; 21 | use crate::storage::datastore::FileDesc; 22 | use crate::block_mgr::block_mgr::BlockMgr; 23 | use crate::block_mgr::allocator::BlockAllocator; 24 | use crate::block_mgr::block::BlockLockedMut; 25 | use crate::block_mgr::block::DataBlock; 26 | use crate::block_mgr::block::BasicBlock; 27 | use std::cell::RefCell; 28 | use std::cell::Ref; 29 | use std::rc::Rc; 30 | 31 | 32 | #[derive(Clone)] 33 | pub struct CheckpointStore { 34 | block_mgr: Rc, 35 | block_allocator: Rc, 36 | file_info: RefCell>, 37 | } 38 | 39 | impl CheckpointStore { 40 | 41 | pub fn new(block_mgr: Rc, block_allocator: Rc) -> Result { 42 | let file_info = RefCell::new(vec![]); 43 | Ok(CheckpointStore { 44 | block_mgr, 45 | block_allocator, 46 | file_info, 47 | }) 48 | } 49 | 50 | /// Add block to checkpoint store. 51 | pub fn add_block(&self, block: &BlockLockedMut, checkpoint_csn: u64) -> Result<(), Error> { 52 | let mut tgt_block = self.block_allocator.get_free_checkpoint_block(checkpoint_csn)?; 53 | tgt_block.copy_from(&block); 54 | tgt_block.set_original_id(block.get_id()); 55 | self.block_mgr.set_checkpoint_block_id(block.get_buf_idx(), tgt_block.get_id()); 56 | self.block_mgr.set_checkpoint_written(block.get_buf_idx(), false); 57 | 58 | Ok(()) 59 | } 60 | 61 | /// Return iterator over checkpoint store blocks. 62 | pub fn get_iter(&self, checkpoint_csn: u64) -> Result { 63 | self.block_mgr.get_checkpoint_files(&mut self.file_info.borrow_mut()); 64 | Ok(Iterator::new(&self.block_mgr, self.file_info.borrow(), checkpoint_csn)) 65 | } 66 | } 67 | 68 | /// Iterator over blocks of checkpoint store for certain checkpoint sequence number. 69 | pub struct Iterator<'a> { 70 | block_mgr: &'a BlockMgr, 71 | file_desc: Ref<'a, Vec>, 72 | cur_extent_id: u16, 73 | cur_block_id: u16, 74 | checkpoint_csn: u64, 75 | cur_file_idx: usize, 76 | } 77 | 78 | impl<'a> Iterator<'a> { 79 | 80 | fn new(block_mgr: &'a BlockMgr, file_desc: Ref<'a, Vec>, checkpoint_csn: u64) -> Self { 81 | Iterator { 82 | block_mgr, 83 | file_desc, 84 | cur_extent_id: (checkpoint_csn & 0x1) as u16 + 1, // avoid extent 0 by adding 1 85 | cur_block_id: 0, 86 | checkpoint_csn, 87 | cur_file_idx: 0, 88 | } 89 | } 90 | 91 | pub fn get_next(&mut self) -> Result, Error> { 92 | while let Some(block_id) = self.calc_next_block_id() { 93 | let block = self.block_mgr.get_block_mut_no_lock(&block_id)?; 94 | if block.get_checkpoint_csn() == self.checkpoint_csn { 95 | return Ok(Some((block.get_original_id(), block))); 96 | } else { 97 | break; 98 | } 99 | } 100 | 101 | Ok(None) 102 | } 103 | 104 | fn calc_next_block_id(&mut self) -> Option { 105 | self.cur_block_id += 1; 106 | if self.cur_block_id == self.file_desc[self.cur_file_idx].extent_size { 107 | self.cur_block_id = 0; 108 | self.cur_extent_id += 2; 109 | if self.cur_extent_id >= self.file_desc[self.cur_file_idx].extent_num { 110 | self.cur_extent_id = (self.checkpoint_csn & 0x1) as u16; 111 | self.cur_file_idx += 1; 112 | if self.cur_file_idx == self.file_desc.len() { 113 | return None; 114 | } 115 | } 116 | } 117 | 118 | Some(BlockId { 119 | file_id: self.file_desc[self.cur_file_idx].file_id, 120 | extent_id: self.cur_extent_id, 121 | block_id: self.cur_block_id, 122 | }) 123 | } 124 | } 125 | 126 | 127 | #[cfg(test)] 128 | mod tests { 129 | 130 | use super::*; 131 | use crate::storage::datastore::DataStore; 132 | use crate::storage::datastore::FileType; 133 | use crate::storage::datastore::FileDesc; 134 | use 
crate::storage::datastore::FileState; 135 | use crate::buf_mgr::buf_writer::BufWriter; 136 | use crate::system::config::ConfigMt; 137 | use std::time::Duration; 138 | use std::path::Path; 139 | 140 | 141 | fn init_datastore(dspath: &str, block_size: usize) -> Vec { 142 | 143 | if Path::new(&dspath).exists() { 144 | std::fs::remove_dir_all(&dspath).expect("Failed to delete test dir on cleanup"); 145 | } 146 | std::fs::create_dir(&dspath).expect("Failed to create test dir"); 147 | 148 | let mut fdset = vec![]; 149 | let desc1 = FileDesc { 150 | state: FileState::InUse, 151 | file_id: 3, 152 | extent_size: 16, 153 | extent_num: 3, 154 | max_extent_num: 65500, 155 | file_type: FileType::DataStoreFile, 156 | }; 157 | let desc2 = FileDesc { 158 | state: FileState::InUse, 159 | file_id: 4, 160 | extent_size: 10, 161 | extent_num: 3, 162 | max_extent_num: 65500, 163 | file_type: FileType::VersioningStoreFile, 164 | }; 165 | let desc3 = FileDesc { 166 | state: FileState::InUse, 167 | file_id: 5, 168 | extent_size: 10, 169 | extent_num: 5, 170 | max_extent_num: 65500, 171 | file_type: FileType::CheckpointStoreFile, 172 | }; 173 | 174 | fdset.push(desc1); 175 | fdset.push(desc2); 176 | fdset.push(desc3); 177 | 178 | DataStore::initialize_datastore(dspath, block_size, &fdset).expect("Failed to init datastore"); 179 | fdset 180 | } 181 | 182 | fn check_added_num(expected_cnt: usize, cs: &CheckpointStore, checkpoint_csn: u64) { 183 | let mut block_num = 0; 184 | 185 | let mut iter = cs.get_iter(checkpoint_csn).expect("Failed to get iterator"); 186 | 187 | while let Some(_block) = iter.get_next().expect("Failed to get next block") { 188 | block_num += 1; 189 | } 190 | 191 | assert_eq!(expected_cnt, block_num); 192 | } 193 | 194 | fn add_block(file_id: u16, extent_id: u16, block_id: u16, cs: &CheckpointStore, block_mgr: &BlockMgr, checkpoint_csn: u64) -> usize { 195 | let block_id = BlockId::init(file_id, extent_id, block_id); 196 | let mut block = block_mgr.get_block_mut(&block_id).expect("Failed to get block"); 197 | let ret = block.get_buf_idx(); 198 | block.set_checkpoint_csn(checkpoint_csn); 199 | cs.add_block(&block, checkpoint_csn).expect("Failed to add block"); 200 | ret 201 | } 202 | 203 | fn flush_blocks(block_mgr: &BlockMgr, idxs: &[usize]) { 204 | 205 | let mut i =0; 206 | assert!(loop { 207 | std::thread::sleep(Duration::new(2,0)); 208 | let mut dirty = false; 209 | for idx in idxs.iter() { 210 | let desc = block_mgr.get_block_desc(*idx).unwrap(); 211 | if desc.dirty { 212 | dirty = true; 213 | } 214 | } 215 | if ! 
dirty { 216 | break true; 217 | } 218 | i += 1; 219 | if i == 30 { 220 | break false; 221 | } 222 | }, "Writers couldn't complete in 60 secs"); 223 | } 224 | 225 | #[test] 226 | fn test_checkpoint_store() { 227 | let dspath = "/tmp/test_checkpoint_store_68343467"; 228 | let block_size = 8192; 229 | let block_num = 100; 230 | let writer_num = 2; 231 | 232 | let conf = ConfigMt::new(); 233 | let mut c = conf.get_conf(); 234 | c.set_datastore_path(dspath.to_owned()); 235 | c.set_block_mgr_n_lock(10); 236 | c.set_free_info_n_file_lock(10); 237 | c.set_free_info_n_extent_lock(10); 238 | c.set_block_buf_size(block_num*block_size as u64); 239 | c.set_checkpoint_data_threshold(10*1024); 240 | c.set_version_retain_time(10_000); 241 | c.set_writer_num(2); 242 | drop(c); 243 | 244 | let _init_fdesc = init_datastore(dspath, block_size); 245 | 246 | let block_mgr = Rc::new(BlockMgr::new(conf.clone()).expect("Failed to create block mgr")); 247 | let block_allocator = Rc::new(BlockAllocator::new(conf.clone(), block_mgr.clone())); 248 | let buf_writer = BufWriter::new(&block_mgr, writer_num).expect("Failed to create buf writer"); 249 | let cs = CheckpointStore::new(block_mgr.clone(), block_allocator).expect("Failed to create checkpoint store"); 250 | 251 | let mut checkpoint_csn = 1; 252 | let add_cnt = 7; 253 | let mut idxs = vec![]; 254 | 255 | for i in 0..add_cnt { 256 | idxs.push(add_block(3, 1, 1 + i, &cs, &block_mgr, checkpoint_csn)); 257 | } 258 | flush_blocks(&block_mgr, &idxs); 259 | 260 | check_added_num(add_cnt as usize, &cs, checkpoint_csn); 261 | 262 | idxs.truncate(0); 263 | idxs.push(add_block(3, 1, 1 + add_cnt, &cs, &block_mgr, checkpoint_csn)); 264 | flush_blocks(&block_mgr, &idxs); 265 | 266 | // emulate restart, otherwise buf can contain unsynced block data previously read from checkpint store. 267 | drop(cs); 268 | buf_writer.terminate(); 269 | if let Ok(bm) = Rc::try_unwrap(block_mgr) { 270 | drop(bm); 271 | } else { 272 | panic!("Failed to unwrap block mgr"); 273 | } 274 | 275 | let block_mgr = Rc::new(BlockMgr::new(conf.clone()).expect("Failed to create block mgr")); 276 | let block_allocator = Rc::new(BlockAllocator::new(conf.clone(), block_mgr.clone())); 277 | let buf_writer = BufWriter::new(&block_mgr, writer_num).expect("Failed to create buf writer"); 278 | let cs = CheckpointStore::new(block_mgr.clone(), block_allocator).expect("Failed to create checkpoint store"); 279 | 280 | check_added_num(add_cnt as usize + 1, &cs, checkpoint_csn); 281 | 282 | 283 | checkpoint_csn += 1; 284 | 285 | idxs.truncate(0); 286 | for i in 0..add_cnt { 287 | idxs.push(add_block(3, 1, 1 + i, &cs, &block_mgr, checkpoint_csn)); 288 | idxs.push(add_block(3, 2, 1 + i, &cs, &block_mgr, checkpoint_csn)); 289 | } 290 | 291 | flush_blocks(&block_mgr, &idxs); 292 | 293 | check_added_num(add_cnt as usize * 2, &cs, checkpoint_csn); 294 | buf_writer.terminate(); 295 | } 296 | } 297 | -------------------------------------------------------------------------------- /src/block_mgr/allocator.rs: -------------------------------------------------------------------------------- 1 | /// Searh of free blocks and allocation of new blocks. 
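The typical round trip through the allocator is: ask for a block with free space (adding a new extent to a file if necessary), write into it, and keep the free-space bitmaps in sync. A condensed sketch of that flow, based on the unit test at the end of this file (the write step is a placeholder; other imports follow the top of the file):

```rust
use crate::block_mgr::block::BasicBlock; // for get_id()

fn store_row(ba: &BlockAllocator, file_id: u16) -> Result<(), Error> {
    let block = ba.get_free(file_id)?;   // block with free space, or a newly allocated one
    let block_id = block.get_id();
    // ... write data into `block` here ...
    drop(block);                          // release the block lock

    // Once the block fills past the configured fill ratio,
    // hide it from future free-space searches.
    ba.set_free_info_used(&block_id)?;
    Ok(())
}
```

If deletes later free up space in the block, `ba.set_free_info_free(&block_id)` makes it searchable again.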
2 | 3 | 4 | use crate::common::errors::Error; 5 | use crate::common::errors::ErrorKind; 6 | use crate::common::defs::BlockId; 7 | use crate::common::defs::Sequence; 8 | use crate::storage::datastore::FileDesc; 9 | use crate::system::config::ConfigMt; 10 | use crate::block_mgr::block_mgr::BlockMgr; 11 | use crate::block_mgr::block::DataBlock; 12 | use crate::block_mgr::block::BlockLockedMut; 13 | use crate::block_mgr::free_info::FreeInfo; 14 | use crate::block_mgr::free_info::FiData; 15 | use crate::block_mgr::free_info::FreeInfoSharedState; 16 | use crate::buf_mgr::buf_mgr::BlockType; 17 | use std::cell::RefCell; 18 | use std::rc::Rc; 19 | 20 | 21 | /// Shared state that can be sent to other threads. 22 | pub struct BlockAllocatorSharedState { 23 | fi_ss: FreeInfoSharedState, 24 | checkpoint_store_seq: Sequence, 25 | } 26 | 27 | 28 | pub struct BlockAllocator { 29 | block_mgr: Rc, 30 | file_desc_buf: RefCell>, 31 | free_info: FreeInfo, 32 | file_fi_data: RefCell, 33 | extent_fi_data: RefCell, 34 | checkpoint_store_seq: Sequence, 35 | } 36 | 37 | impl BlockAllocator { 38 | 39 | pub fn new(conf: ConfigMt, block_mgr: Rc) -> Self { 40 | let free_info = FreeInfo::new(conf.clone(), block_mgr.clone()); 41 | let file_desc_buf = RefCell::new(vec![]); 42 | let file_fi_data = RefCell::new(FiData:: new()); 43 | let extent_fi_data = RefCell::new(FiData:: new()); 44 | let checkpoint_store_seq = Sequence::new(1); 45 | 46 | BlockAllocator { 47 | block_mgr, 48 | free_info, 49 | file_desc_buf, 50 | file_fi_data, 51 | extent_fi_data, 52 | checkpoint_store_seq, 53 | } 54 | } 55 | 56 | /// Build instance from shared state. 57 | pub fn from_shared_state(block_mgr: Rc, ss: BlockAllocatorSharedState) -> Result { 58 | let BlockAllocatorSharedState { fi_ss, checkpoint_store_seq } = ss; 59 | 60 | let free_info = FreeInfo::from_shared_state(block_mgr.clone(), fi_ss)?; 61 | let file_desc_buf = RefCell::new(vec![]); 62 | let file_fi_data = RefCell::new(FiData:: new()); 63 | let extent_fi_data = RefCell::new(FiData:: new()); 64 | 65 | Ok(BlockAllocator { 66 | block_mgr, 67 | free_info, 68 | file_desc_buf, 69 | file_fi_data, 70 | extent_fi_data, 71 | checkpoint_store_seq, 72 | }) 73 | } 74 | 75 | /// Return shared state that can be sent to other threads. 76 | pub fn get_shared_state(&self) -> BlockAllocatorSharedState { 77 | BlockAllocatorSharedState { 78 | fi_ss: self.free_info.get_shared_state(), 79 | checkpoint_store_seq: self.checkpoint_store_seq.clone(), 80 | } 81 | } 82 | 83 | /// Find or allocate a new data block. 84 | pub fn get_free(&self, file_id: u16) -> Result, Error> { 85 | if let Some(block) = self.find_free_block(file_id)? { 86 | Ok(block) 87 | } else { 88 | self.allocate_block(file_id) 89 | } 90 | } 91 | 92 | /// mark an extent as full. 93 | pub fn mark_extent_full(&self, file_id: u16, extent_id: u16) -> Result<(), Error> { 94 | self.free_info.set_extent_bit(file_id, extent_id, true) 95 | } 96 | 97 | /// In some of datastore files add a new extent, return first free block from that extent. 98 | pub fn allocate_block(&self, file_id: u16) -> Result, Error> { 99 | // in some of data files add a new extent; 100 | // try to get free block in that extent and return the block; 101 | // if free block was not found then add extent and repeat attempt. 
102 | let desc = self.block_mgr.get_file_desc(file_id).ok_or(Error::file_does_not_exist())?; 103 | // try adding a new extent to datastore file 104 | self.free_info.get_fi_for_file(desc.file_id, &mut self.file_fi_data.borrow_mut())?; 105 | if self.file_fi_data.borrow().size() < desc.max_extent_num { 106 | self.block_mgr.add_extent(desc.file_id)?; 107 | self.free_info.add_extent(desc.file_id)?; 108 | let extent_id = self.file_fi_data.borrow().size(); 109 | if let Some(block) = self.find_free_block_in_extent(desc.file_id, extent_id)? { 110 | return Ok(block); 111 | } 112 | } 113 | 114 | return Err(Error::db_size_limit_reached()); 115 | } 116 | 117 | /// Mark block as full in free info section. 118 | pub fn set_free_info_used(&self, block_id: &BlockId) -> Result<(), Error> { 119 | // lock extent free info and set the bit for the block accordingly 120 | // check if extent changed from full to free or wise versa 121 | // if extent changed then lock file free info, and then release extent lock 122 | // update file free info accordingly 123 | self.free_info.set_block_bit(block_id, true) 124 | } 125 | 126 | /// Mark block as having free space in free info section. 127 | pub fn set_free_info_free(&self, block_id: &BlockId) -> Result<(), Error> { 128 | // lock extent free info and set the bit for the block accordingly 129 | // check if extent changed from full to free or wise versa 130 | // if extent changed then lock file free info, and then release extent lock 131 | // update file free info accordingly 132 | self.free_info.set_block_bit(block_id, false) 133 | } 134 | 135 | /// Return free data block from checkpoint store. 136 | pub fn get_free_checkpoint_block(&self, checkpoint_csn: u64) -> Result { 137 | // determine next available block_id; 138 | // find a free block from buffer and assign it to block_id; 139 | // return the block. 140 | let block_id = self.get_next_checkpoint_block_id(checkpoint_csn); 141 | self.block_mgr.allocate_on_cache_mut_no_lock(block_id, BlockType::CheckpointBlock) 142 | } 143 | 144 | /// Allocate extent in the versioning store. 145 | pub fn allocate_versioning_extent(&self) -> Result<(u16, u16, u16), Error> { 146 | self.block_mgr.get_versioning_files(&mut self.file_desc_buf.borrow_mut()); 147 | let file_desc_set = &self.file_desc_buf.borrow(); 148 | for desc in file_desc_set.iter() { 149 | if desc.extent_num < desc.max_extent_num { 150 | self.block_mgr.add_extent(desc.file_id)?; 151 | return Ok((desc.file_id, desc.extent_num, desc.extent_size)); 152 | } 153 | } 154 | return Err(Error::db_size_limit_reached()); 155 | } 156 | 157 | /// Allocate block with specified id. 158 | pub fn allocate_block_with_id(&self, block_id: &BlockId) -> Result<(), Error> { 159 | self.block_mgr.get_data_files(&mut self.file_desc_buf.borrow_mut()); 160 | let file_desc_set = &self.file_desc_buf.borrow(); 161 | for desc in file_desc_set.iter() { 162 | if desc.file_id == block_id.file_id { 163 | let mut extent_num = desc.extent_num; 164 | while extent_num <= block_id.extent_id { 165 | if desc.extent_num >= desc.max_extent_num { 166 | return Err(Error::db_size_limit_reached()); 167 | } else { 168 | self.block_mgr.add_extent(desc.file_id)?; 169 | self.free_info.add_extent(desc.file_id)?; 170 | } 171 | extent_num += 1; 172 | } 173 | 174 | return Ok(()); 175 | } 176 | } 177 | return Err(Error::file_does_not_exist()); 178 | } 179 | 180 | // return next generated checkpoint block_id. 
181 | fn get_next_checkpoint_block_id(&self, checkpoint_csn: u64) -> BlockId { 182 | // use fake block_id because it is not considered by writer when writer 183 | // writes checkpoint block to disk. 184 | let seq_num = self.checkpoint_store_seq.get_next(); 185 | let file_id = (checkpoint_csn & 0x1) as u16; // file_id 0 and 1 are reserved for checkpointing store file ids. 186 | let block_id = (seq_num & 0xffff) as u16; 187 | let seq_num = seq_num >> 16; 188 | let extent_id = (seq_num & 0xffff) as u16; 189 | 190 | BlockId { 191 | file_id, 192 | extent_id, 193 | block_id, 194 | } 195 | } 196 | 197 | // find and return block with free space. 198 | fn find_free_block(&self, file_id: u16) -> Result>, Error> { 199 | // find exntents with free blocks; 200 | // try getting the block. 201 | let desc = self.block_mgr.get_file_desc(file_id).ok_or(Error::file_does_not_exist())?; 202 | self.free_info.get_fi_for_file(desc.file_id, &mut self.file_fi_data.borrow_mut())?; 203 | let file_fi_data = self.file_fi_data.borrow(); 204 | let mut free_iter = file_fi_data.free_iter(); 205 | while let Some(extent_id) = free_iter.next() { 206 | if let Some(block) = self.find_free_block_in_extent(desc.file_id, extent_id)? { 207 | return Ok(Some(block)); 208 | } 209 | } 210 | Ok(None) 211 | } 212 | 213 | fn find_free_block_in_extent(&self, file_id: u16, extent_id: u16) -> Result>, Error> { 214 | // find a free block in given extent; 215 | // try locking the block; 216 | // if the block is already locked then continue search, 217 | // else check the used space in the block; 218 | // if block is full then unlock and continue search, 219 | // else return the block. 220 | self.free_info.get_fi_for_extent(file_id, extent_id, &mut self.extent_fi_data.borrow_mut())?; 221 | let extent_fi_data = self.extent_fi_data.borrow(); 222 | let mut free_iter = extent_fi_data.free_iter(); 223 | while let Some(block_id) = free_iter.next() { 224 | let blid = BlockId { 225 | file_id, 226 | extent_id: extent_id, 227 | block_id, 228 | }; 229 | 230 | match self.block_mgr.get_block_for_write::(&blid, DataBlock::new, true, 0) { 231 | Ok(block) => { 232 | if block.get_used_space() < self.block_mgr.block_fill_size() { 233 | return Ok(Some(block)); 234 | } 235 | drop(block); 236 | }, 237 | Err(e) => { 238 | match e.kind() { 239 | ErrorKind::TryLockError => {}, 240 | _ => return Err(e) 241 | } 242 | }, 243 | }; 244 | } 245 | Ok(None) 246 | } 247 | } 248 | 249 | #[cfg(test)] 250 | mod tests { 251 | 252 | use super::*; 253 | use crate::storage::datastore::DataStore; 254 | use crate::storage::datastore::FileType; 255 | use crate::storage::datastore::FileState; 256 | use crate::block_mgr::block::BasicBlock; 257 | use std::path::Path; 258 | 259 | 260 | fn init_datastore(dspath: &str, block_size: usize) -> Vec { 261 | 262 | if Path::new(&dspath).exists() { 263 | std::fs::remove_dir_all(&dspath).expect("Failed to delete test dir on cleanup"); 264 | } 265 | std::fs::create_dir(&dspath).expect("Failed to create test dir"); 266 | 267 | let mut fdset = vec![]; 268 | let desc1 = FileDesc { 269 | state: FileState::InUse, 270 | file_id: 3, 271 | extent_size: 16, 272 | extent_num: 3, 273 | max_extent_num: 65500, 274 | file_type: FileType::DataStoreFile, 275 | }; 276 | let desc2 = FileDesc { 277 | state: FileState::InUse, 278 | file_id: 4, 279 | extent_size: 10, 280 | extent_num: 3, 281 | max_extent_num: 65500, 282 | file_type: FileType::VersioningStoreFile, 283 | }; 284 | let desc3 = FileDesc { 285 | state: FileState::InUse, 286 | file_id: 5, 287 | extent_size: 10, 
288 | extent_num: 3, 289 | max_extent_num: 65500, 290 | file_type: FileType::CheckpointStoreFile, 291 | }; 292 | 293 | fdset.push(desc1); 294 | fdset.push(desc2); 295 | fdset.push(desc3); 296 | 297 | DataStore::initialize_datastore(dspath, block_size, &fdset).expect("Failed to init datastore"); 298 | fdset 299 | } 300 | 301 | #[test] 302 | fn test_allocator() { 303 | let dspath = "/tmp/test_allocator_655637"; 304 | let block_size = 8192; 305 | let block_num = 100; 306 | 307 | let conf = ConfigMt::new(); 308 | let mut c = conf.get_conf(); 309 | c.set_datastore_path(dspath.to_owned()); 310 | c.set_block_mgr_n_lock(10); 311 | c.set_block_buf_size(block_num*block_size as u64); 312 | drop(c); 313 | 314 | let _init_fdesc = init_datastore(dspath, block_size); 315 | 316 | let block_mgr = Rc::new(BlockMgr::new(conf.clone()).expect("Failed to create instance")); 317 | 318 | let ba = BlockAllocator::new(conf.clone(), block_mgr.clone()); 319 | let ss = ba.get_shared_state(); 320 | let ba = BlockAllocator::from_shared_state(block_mgr.clone(), ss).expect("Failed to get block allocator"); 321 | 322 | let checkpoint_csn = 34544; 323 | 324 | let file_id = 3; 325 | let block_id = BlockId::init(file_id, 1, 1); 326 | let block = ba.get_free(file_id).unwrap(); 327 | assert_eq!(block_id, block.get_id()); 328 | drop(block); 329 | ba.set_free_info_used(&block_id).expect("Failed to set block bit"); 330 | let block = ba.get_free(file_id).unwrap(); 331 | assert_eq!(BlockId::init(file_id, 1, 2), block.get_id()); 332 | drop(block); 333 | ba.set_free_info_free(&block_id).expect("Failed to set block bit"); 334 | let block = ba.get_free(file_id).unwrap(); 335 | assert_eq!(block_id, block.get_id()); 336 | drop(block); 337 | 338 | 339 | let block_id = BlockId::init(file_id, 3, 1); 340 | let block = ba.allocate_block(file_id).expect("Failed to allocate block"); 341 | assert_eq!(block_id, block.get_id()); 342 | drop(block); 343 | 344 | let block = ba.get_free_checkpoint_block(checkpoint_csn).expect("Failed to get checkpoint block"); 345 | assert_eq!(BlockId::init(0, 0, 2), block.get_id()); 346 | drop(block); 347 | let block = ba.get_free_checkpoint_block(checkpoint_csn+1).expect("Failed to get checkpoint block"); 348 | assert_eq!(BlockId::init(1, 0, 3), block.get_id()); 349 | drop(block); 350 | 351 | 352 | let block_id = BlockId::init(3, 5, 1); 353 | assert!(block_mgr.get_block(&block_id).is_err()); 354 | ba.allocate_block_with_id(&block_id).expect("Failed to allocate block"); 355 | assert!(block_mgr.get_block(&block_id).is_ok()); 356 | } 357 | } 358 | -------------------------------------------------------------------------------- /src/log_mgr/fs.rs: -------------------------------------------------------------------------------- 1 | /// All about transaction log on a file system 2 | 3 | use crate::common::errors::Error; 4 | use crate::common::intercom::SyncNotification; 5 | use crate::log_mgr::buf::DoubleBuf; 6 | use crate::log_mgr::buf::Slice; 7 | use std::fs::File; 8 | use std::io::{Write, Read}; 9 | use std::io::{Seek, SeekFrom}; 10 | use std::fs::OpenOptions; 11 | use log::info; 12 | use std::path::{PathBuf, Path}; 13 | use std::sync::{Arc, atomic::AtomicBool, atomic::Ordering}; 14 | use std::thread::JoinHandle; 15 | use log::error; 16 | use std::time::Duration; 17 | 18 | 19 | 20 | const LOG_FILE_PREFIX: &str = "log"; 21 | const LOG_FILE_MAGIC: [u8; 3] = [b'L', b'G', 0xAB]; 22 | const LOG_FILE_ONLINE_BIT: u8 = 0xCD; 23 | const RETRY_DURATION_SEC: u64 = 1; 24 | 25 | 26 | /// FileStream is read & write interface for 
transaction log in file system. 27 | pub struct FileStream { 28 | f: File, 29 | max_file_size: u32, 30 | log_dir: String, 31 | rotation: Option, 32 | file_id: u32, 33 | offset: u32, 34 | } 35 | 36 | 37 | impl FileStream { 38 | 39 | pub fn new(log_dir: String, max_file_size: u32, file_id: u32, start_pos: u32, enable_rotation: bool, read: bool) -> Result { 40 | 41 | let file = FileOps::build_file_name(&log_dir, file_id); 42 | 43 | let mut f = OpenOptions::new() 44 | .create(false) 45 | .read(read) 46 | .write(true) 47 | .truncate(false) 48 | .open(file)?; 49 | 50 | let min_offset = (LOG_FILE_MAGIC.len() + std::mem::size_of::()) as u32; 51 | let offset = if start_pos < min_offset { min_offset } else { start_pos }; 52 | 53 | let rotation = if enable_rotation { 54 | if !FileOps::build_file_name(&log_dir, file_id + 1).exists() { 55 | let _next_file = FileOps::create_log_file(&log_dir, file_id+1, max_file_size, false); 56 | } 57 | Some(FileRotation::new(&log_dir, max_file_size)) 58 | } else { 59 | None 60 | }; 61 | 62 | f.seek(SeekFrom::Start(offset as u64))?; 63 | 64 | Ok(FileStream { 65 | f, 66 | max_file_size, 67 | log_dir, 68 | rotation, 69 | file_id, 70 | offset, 71 | }) 72 | } 73 | 74 | pub fn get_cur_pos(&mut self) -> std::io::Result { 75 | Ok(self.f.seek(SeekFrom::Current(0))?) 76 | } 77 | 78 | fn reopen(&mut self) -> std::io::Result<()> { 79 | let file_id = self.file_id + 1; 80 | 81 | let mut file = PathBuf::from(&self.log_dir); 82 | file.push(LOG_FILE_PREFIX); 83 | file.set_extension(file_id.to_string()); 84 | 85 | if let Some(rotation) = &self.rotation { 86 | if !file.exists() { 87 | rotation.wait_for_file(file_id); 88 | } 89 | 90 | rotation.request_new_file(file_id + 1); 91 | } 92 | 93 | let mut f = OpenOptions::new() 94 | .create(false) 95 | .write(true) 96 | .truncate(false) 97 | .open(file)?; 98 | 99 | // mark online 100 | f.seek(SeekFrom::Start(3))?; 101 | f.write_all(&[LOG_FILE_ONLINE_BIT])?; 102 | f.flush()?; 103 | 104 | self.f = f; 105 | self.offset = self.get_cur_pos()? 
as u32; 106 | self.file_id = file_id; 107 | 108 | Ok(()) 109 | } 110 | 111 | pub fn terminate(self) { 112 | if let Some(rotation) = self.rotation { 113 | rotation.terminate(); 114 | } 115 | } 116 | } 117 | 118 | impl Write for FileStream { 119 | 120 | fn write(&mut self, buf: &[u8]) -> std::io::Result { 121 | 122 | if self.offset > self.max_file_size { 123 | self.reopen()?; 124 | } 125 | 126 | let ret = self.f.write(buf); 127 | if let Ok(written) = ret { 128 | self.offset += written as u32; 129 | } 130 | ret 131 | } 132 | 133 | fn flush(&mut self) -> std::io::Result<()> { 134 | self.f.flush() 135 | } 136 | } 137 | 138 | impl Read for FileStream { 139 | 140 | fn read(&mut self, buf: &mut [u8]) -> std::io::Result { 141 | self.f.read(buf) 142 | } 143 | } 144 | 145 | impl Seek for FileStream { 146 | 147 | fn seek(&mut self, pos: SeekFrom) -> std::io::Result { 148 | self.f.seek(pos) 149 | } 150 | } 151 | 152 | 153 | /// Wrapper around FileStream for direct read & buffered write to a transaction log 154 | #[derive(Clone)] 155 | pub struct BufferedFileStream { 156 | writer_thread: Arc>, 157 | terminate: Arc, 158 | buf: DoubleBuf, 159 | } 160 | 161 | impl BufferedFileStream { 162 | 163 | pub fn new(log_dir: String, max_file_size: u32, buf_sz: usize, file_id: u32, start_pos: u32) -> Result { 164 | 165 | let terminate = Arc::new(AtomicBool::new(false)); 166 | 167 | let terminate2 = terminate.clone(); 168 | 169 | let db = DoubleBuf::new(buf_sz)?; 170 | 171 | let db2 = db.clone(); 172 | 173 | let fs = FileStream::new(log_dir, max_file_size, file_id, start_pos, true, false)?; 174 | 175 | let retry_duration = Duration::new(RETRY_DURATION_SEC, 0); 176 | 177 | let writer_thread = std::thread::spawn(move || { 178 | 179 | Self::write_log_loop(fs, db2, terminate2, retry_duration); 180 | }); 181 | 182 | Ok(BufferedFileStream { 183 | writer_thread: Arc::new(writer_thread), 184 | terminate, 185 | buf: db, 186 | }) 187 | } 188 | 189 | pub fn get_for_write(&self, reserve_size: usize) -> Result, ()> { 190 | self.buf.reserve_slice(reserve_size, false) 191 | } 192 | 193 | pub fn flush(&self) { 194 | self.buf.flush() 195 | } 196 | 197 | pub fn terminate(self) { 198 | if let Ok(jh) = Arc::try_unwrap(self.writer_thread) { 199 | self.terminate.store(true, Ordering::Relaxed); 200 | 201 | self.buf.seal_buffers(); 202 | 203 | jh.join().unwrap(); 204 | } 205 | } 206 | 207 | fn write_log_loop(mut fs: FileStream, buf: DoubleBuf, terminate: Arc, retry_duration: Duration) { 208 | 209 | let mut terminated_cnt = 0; 210 | 211 | loop { 212 | 213 | let (slice, buf_id) = buf.reserve_for_read(); 214 | 215 | while let Err(e) = fs.write_all(slice) { 216 | error!("Failed to write to transaction log file: {}", e); 217 | std::thread::sleep(retry_duration); 218 | } 219 | 220 | while let Err(e) = fs.flush() { 221 | error!("Failed to flush transaction log file: {}", e); 222 | std::thread::sleep(retry_duration); 223 | } 224 | 225 | if terminate.load(Ordering::Relaxed) { 226 | buf.set_buf_terminated(buf_id); 227 | 228 | terminated_cnt += 1; 229 | 230 | if terminated_cnt == buf.get_buf_cnt() { 231 | fs.terminate(); 232 | break; 233 | } 234 | 235 | } else { 236 | 237 | buf.set_buf_appendable(buf_id); 238 | } 239 | } 240 | } 241 | } 242 | 243 | 244 | 245 | /// Log file rotation 246 | struct FileRotation { 247 | new_file_req: SyncNotification, 248 | file_created: SyncNotification, 249 | rotation_thread: JoinHandle<()>, 250 | } 251 | 252 | impl FileRotation { 253 | 254 | pub fn new(log_dir: &String, max_file_size: u32) -> FileRotation { 255 | let 
new_file_req = SyncNotification::new(FileRotationReq::Noop); 256 | let file_created = SyncNotification::new(0); 257 | 258 | let new_file_req2 = new_file_req.clone(); 259 | let file_created2 = file_created.clone(); 260 | 261 | let log_dir2 = log_dir.clone(); 262 | 263 | let rotation_thread = std::thread::spawn(move || { 264 | 265 | let mut terminate = false; 266 | 267 | loop { 268 | let mut file_id = 0; 269 | 270 | let mut check = |val: &FileRotationReq| -> bool { 271 | match *val { 272 | FileRotationReq::CreateFile(val) => { 273 | file_id = val; 274 | return false; 275 | }, 276 | FileRotationReq::Terminate => { 277 | terminate = true; 278 | return false; 279 | }, 280 | FileRotationReq::Noop => return true, 281 | } 282 | }; 283 | 284 | let mut locked_val = new_file_req2.wait_for(&mut check); 285 | *locked_val = FileRotationReq::Noop; 286 | if terminate { 287 | return; 288 | } else { 289 | let res = FileOps::create_log_file(&log_dir2, file_id, max_file_size, false); 290 | if let Err(e) = res { 291 | error!("Failed to create a new log file {}", e); 292 | } else { 293 | file_created2.send(file_id, true); 294 | } 295 | } 296 | } 297 | }); 298 | 299 | FileRotation { 300 | new_file_req, 301 | file_created, 302 | rotation_thread, 303 | } 304 | } 305 | 306 | pub fn wait_for_file(&self, file_id: u32) { 307 | let mut check = |val: &u32| -> bool {*val != file_id}; 308 | let mut locked_val = self.file_created.wait_for(&mut check); 309 | *locked_val = 0; 310 | } 311 | 312 | pub fn request_new_file(&self, file_id: u32) { 313 | self.new_file_req.send(FileRotationReq::CreateFile(file_id), true); 314 | } 315 | 316 | pub fn terminate(self) { 317 | self.new_file_req.send(FileRotationReq::Terminate, true); 318 | self.rotation_thread.join().unwrap(); 319 | } 320 | } 321 | 322 | 323 | #[derive(Clone, Copy, PartialEq, Debug)] 324 | enum FileRotationReq { 325 | Noop, 326 | CreateFile(u32), 327 | Terminate, 328 | } 329 | 330 | 331 | /// Log file related utility operations 332 | pub struct FileOps { } 333 | 334 | impl FileOps { 335 | 336 | pub fn find_latest_log_file(log_dir: &str) -> Result { 337 | 338 | let mut max_id = 0; 339 | 340 | for entry in std::fs::read_dir(log_dir)? { 341 | 342 | let entry = entry?; 343 | let path = entry.path(); 344 | 345 | if let Ok(ftype) = entry.file_type() { 346 | if ftype.is_file() { 347 | if let Some(stem) = path.file_stem() { 348 | if stem == LOG_FILE_PREFIX { 349 | if let Some(extension) = path.extension() { 350 | if let Ok(num) = extension.to_string_lossy().parse::() { 351 | if num > max_id { 352 | if FileOps::check_if_online(&path)? 
{ 353 | max_id = num; 354 | } 355 | } 356 | } else { 357 | info!("Skipping entry in transaction log directory {:?}: extension is not i32 number", entry.path()); 358 | } 359 | } else { 360 | info!("Skipping entry in transaction log directory {:?}: no file extension", entry.path()); 361 | } 362 | } else { 363 | info!("Skipping entry in transaction log directory {:?}: file name doesn't match {}", entry.path(), LOG_FILE_PREFIX); 364 | } 365 | } else { 366 | info!("Skipping entry in transaction log directory {:?}: no stem in file name", entry.path()); 367 | } 368 | } else { 369 | info!("Skipping entry in transaction log directory {:?}: entry is not a file", entry.path()); 370 | } 371 | } else { 372 | info!("Skipping entry in transaction log directory {:?}: unable to determine file type", entry.path()); 373 | } 374 | } 375 | 376 | Ok(max_id) 377 | } 378 | 379 | pub fn build_file_name(log_dir: &str, file_id: u32) -> PathBuf { 380 | 381 | let mut file = PathBuf::from(log_dir); 382 | file.push(LOG_FILE_PREFIX); 383 | file.set_extension(file_id.to_string()); 384 | file 385 | } 386 | 387 | pub fn create_log_file(log_dir: &str, file_id: u32, size: u32, mark_online: bool) -> Result<(), Error> { 388 | let file = FileOps::build_file_name(log_dir, file_id); 389 | 390 | let mut f = OpenOptions::new() 391 | .create_new(true) 392 | .write(true) 393 | .truncate(false) 394 | .open(file)?; 395 | 396 | f.set_len(size as u64)?; 397 | 398 | f.write_all(&LOG_FILE_MAGIC)?; 399 | if mark_online { 400 | f.write_all(&[LOG_FILE_ONLINE_BIT])?; 401 | } 402 | 403 | f.sync_all()?; 404 | 405 | Ok(()) 406 | } 407 | 408 | pub fn check_if_online(path: &Path) -> std::io::Result { 409 | let mut f = OpenOptions::new() 410 | .create(false) 411 | .read(true) 412 | .write(false) 413 | .truncate(false) 414 | .open(path)?; 415 | 416 | let mut magic = [0,0,0]; 417 | let mut online_bit = [0]; 418 | f.read_exact(&mut magic)?; 419 | for i in 0..magic.len() { 420 | if magic[i] != LOG_FILE_MAGIC[i] { 421 | return Ok(false); 422 | } 423 | } 424 | 425 | f.read_exact(&mut online_bit)?; 426 | 427 | Ok(online_bit[0] == LOG_FILE_ONLINE_BIT) 428 | } 429 | 430 | pub fn init_file_logging(log_dir: &str, file_size: u32) -> Result { 431 | 432 | let file_id = FileOps::find_latest_log_file(log_dir)?; 433 | if file_id == 0 { 434 | if !FileOps::build_file_name(&log_dir, 1).exists() { 435 | FileOps::create_log_file(log_dir, 1, file_size, true)?; 436 | } 437 | Ok(1) 438 | } else { 439 | Ok(file_id) 440 | } 441 | } 442 | } 443 | 444 | 445 | #[cfg(test)] 446 | mod tests { 447 | 448 | use super::*; 449 | 450 | #[test] 451 | fn test_file_stream() { 452 | let log_dir = "/tmp/test_fs_435354345"; 453 | let max_file_size = 100; 454 | let file_id = 1; 455 | let start_pos = 0; 456 | let enable_rotation = true; 457 | let buf_sz = 100; 458 | 459 | if Path::new(log_dir).exists() { 460 | std::fs::remove_dir_all(log_dir).expect("Failed to delete test dir on cleanup"); 461 | } 462 | std::fs::create_dir(log_dir).expect("Failed to create test dir"); 463 | 464 | 465 | FileOps::init_file_logging(log_dir, max_file_size).expect("Failed to init logging"); 466 | 467 | 468 | // FileStream 469 | 470 | 471 | let mut fs = FileStream::new(log_dir.to_owned(), max_file_size, file_id, start_pos, enable_rotation, false).expect("Failed to create file stream"); 472 | assert_eq!(fs.get_cur_pos().unwrap(), 4); 473 | 474 | let buf = [1,2,3,4,5,6,7,8,0,1,2,3,4,5]; 475 | fs.write_all(&buf).unwrap(); 476 | fs.flush().unwrap(); 477 | 478 | assert_eq!(fs.get_cur_pos().unwrap(), buf.len() as u64 + 4); 
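// The offsets asserted above come from FileStream::new: a start_pos of 0 is clamped up to the
// 4-byte file header (the 3-byte LOG_FILE_MAGIC plus the 1-byte online marker), so writing
// begins at offset 4 and advances by buf.len() per write. The loop below pushes the offset
// past max_file_size (100 bytes), which exercises reopen() and the FileRotation path.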
479 | 480 | for _ in 0..20 { 481 | fs.write_all(&buf).unwrap(); 482 | } 483 | fs.flush().unwrap(); 484 | 485 | fs.terminate(); 486 | 487 | 488 | // BufferedFileStream 489 | 490 | 491 | let bfs = BufferedFileStream::new(log_dir.to_owned(), max_file_size, buf_sz, 2, (11 * buf.len() + 4) as u32).expect("Failed to create BufferedFileStream"); 492 | 493 | for _ in 0..20 { 494 | let mut slice = bfs.get_for_write(buf.len()).unwrap(); 495 | (&mut slice).copy_from_slice(&buf); 496 | } 497 | bfs.flush(); 498 | 499 | bfs.terminate(); 500 | } 501 | 502 | } 503 | -------------------------------------------------------------------------------- /src/log_mgr/io.rs: -------------------------------------------------------------------------------- 1 | /// Transaction log writer & reader 2 | 3 | use crate::common::errors::Error; 4 | use crate::common::errors::ErrorKind; 5 | use crate::common::misc::SliceToIntConverter; 6 | use crate::common::crc32; 7 | use crate::common::defs::Sequence; 8 | use crate::common::defs::Vector; 9 | use crate::common::defs::VECTOR_DATA_LENGTH; 10 | use crate::log_mgr::fs::BufferedFileStream; 11 | use crate::log_mgr::fs::FileStream; 12 | use crate::common::defs::ObjectId; 13 | use std::io::Read; 14 | use std::io::Write; 15 | use std::io::Seek; 16 | use std::io::SeekFrom; 17 | 18 | 19 | const OBJECT_ID_WRITE_SZ: usize = 2 * 4; 20 | const LRH_WRITE_SZ: usize = 8 * 4 + 1 + 4; 21 | 22 | 23 | #[derive(Debug)] 24 | pub struct LogRecordHeader { 25 | pub lsn: u64, 26 | pub csn: u64, 27 | pub checkpoint_csn: u64, 28 | pub tsn: u64, 29 | pub rec_type: RecType, 30 | pub crc32: u32, 31 | } 32 | 33 | impl LogRecordHeader { 34 | fn new() -> Self { 35 | LogRecordHeader { 36 | lsn: 0, 37 | csn: 0, 38 | checkpoint_csn: 0, 39 | tsn: 0, 40 | rec_type: RecType::Unspecified, 41 | crc32: 0, 42 | } 43 | } 44 | } 45 | 46 | #[derive(Eq, Hash, PartialEq, Clone, Copy, Debug)] 47 | pub enum RecType { 48 | Unspecified = 0, 49 | Commit = 1, 50 | Rollback = 2, 51 | Data = 3, 52 | Delete = 4, 53 | CheckpointBegin = 5, 54 | CheckpointCompleted = 6, 55 | } 56 | 57 | 58 | 59 | /// Log writer 60 | #[derive(Clone)] 61 | pub struct LogWriter { 62 | out_stream: BufferedFileStream, 63 | lsn: Sequence, 64 | } 65 | 66 | impl LogWriter { 67 | 68 | pub fn new(out_stream: BufferedFileStream, lsn: Sequence) -> Result { 69 | Ok(LogWriter { 70 | out_stream, 71 | lsn, 72 | }) 73 | } 74 | 75 | pub fn write_data(&self, csn: u64, checkpoint_csn: u64, tsn: u64, obj: &ObjectId, vector: &mut Vector, data: &[u8]) -> Result<(), Error> { 76 | let mut lrh = self.prepare_lrh(csn, checkpoint_csn, tsn, RecType::Data); 77 | LogOps::calc_obj_id_crc(&mut lrh.crc32, obj); 78 | let v = vector.to_data(); 79 | lrh.crc32 = crc32::crc32_arr(lrh.crc32, v); 80 | lrh.crc32 = crc32::crc32_num(lrh.crc32, data.len() as u32); 81 | lrh.crc32 = crc32::crc32_arr(lrh.crc32, data); 82 | lrh.crc32 = crc32::crc32_finalize(lrh.crc32); 83 | 84 | let mut dst_locked = self.out_stream.get_for_write(LRH_WRITE_SZ + OBJECT_ID_WRITE_SZ + VECTOR_DATA_LENGTH + 4 + data.len()).unwrap(); 85 | let mut slice: &mut [u8] = &mut dst_locked; 86 | 87 | self.write_header(&lrh, &mut slice)?; 88 | self.write_obj_id(obj, &mut slice)?; 89 | slice.write_all(v)?; 90 | slice.write_all(&(data.len() as u32).to_ne_bytes())?; 91 | slice.write_all(data)?; 92 | slice.flush()?; 93 | 94 | drop(dst_locked); 95 | 96 | Ok(()) 97 | } 98 | 99 | pub fn write_commit(&self, csn: u64, tsn: u64) -> Result<(), Error> { 100 | let _lsn = self.write_header_only_rec(csn, 0, tsn, RecType::Commit)?; 101 | 
self.out_stream.flush(); 102 | Ok(()) 103 | } 104 | 105 | pub fn write_rollback(&self, csn: u64, tsn: u64) -> Result<(), Error> { 106 | self.write_header_only_rec(csn, 0, tsn, RecType::Rollback)?; 107 | Ok(()) 108 | } 109 | 110 | pub fn write_checkpoint_begin(&self, checkpoint_csn: u64, latest_commit_csn: u64) -> Result<(), Error> { 111 | let mut lrh = self.prepare_lrh(0, checkpoint_csn, 0, RecType::CheckpointBegin); 112 | lrh.crc32 = crc32::crc32_num(lrh.crc32, latest_commit_csn); 113 | lrh.crc32 = crc32::crc32_finalize(lrh.crc32); 114 | 115 | let mut dst_locked = self.out_stream.get_for_write(LRH_WRITE_SZ + 8).unwrap(); 116 | let mut slice: &mut [u8] = &mut dst_locked; 117 | 118 | self.write_header(&lrh, &mut slice)?; 119 | slice.write_all(&latest_commit_csn.to_ne_bytes())?; 120 | slice.flush()?; 121 | drop(dst_locked); 122 | 123 | self.out_stream.flush(); 124 | 125 | Ok(()) 126 | } 127 | 128 | pub fn write_checkpoint_completed(&self, checkpoint_csn: u64, latest_commit_csn: u64, current_tsn: u64) -> Result<(), Error> { 129 | let mut lrh = self.prepare_lrh(0, checkpoint_csn, current_tsn, RecType::CheckpointCompleted); 130 | lrh.crc32 = crc32::crc32_num(lrh.crc32, latest_commit_csn); 131 | lrh.crc32 = crc32::crc32_finalize(lrh.crc32); 132 | 133 | let mut dst_locked = self.out_stream.get_for_write(LRH_WRITE_SZ + 8).unwrap(); 134 | let mut slice: &mut [u8] = &mut dst_locked; 135 | 136 | self.write_header(&lrh, &mut slice)?; 137 | slice.write_all(&latest_commit_csn.to_ne_bytes())?; 138 | slice.flush()?; 139 | drop(dst_locked); 140 | 141 | self.out_stream.flush(); 142 | 143 | Ok(()) 144 | } 145 | 146 | 147 | pub fn write_delete(&self, csn: u64, checkpoint_csn: u64, tsn: u64, obj: &ObjectId) -> Result<(), Error> { 148 | let mut lrh = self.prepare_lrh(csn, checkpoint_csn, tsn, RecType::Delete); 149 | LogOps::calc_obj_id_crc(&mut lrh.crc32, obj); 150 | lrh.crc32 = crc32::crc32_finalize(lrh.crc32); 151 | 152 | let mut dst_locked = self.out_stream.get_for_write(LRH_WRITE_SZ + 8).unwrap(); 153 | let mut slice: &mut [u8] = &mut dst_locked; 154 | 155 | self.write_header(&lrh, &mut slice)?; 156 | self.write_obj_id(obj, &mut slice)?; 157 | 158 | drop(dst_locked); 159 | 160 | Ok(()) 161 | } 162 | 163 | fn write_header_only_rec(&self, csn: u64, checkpoint_csn: u64, tsn: u64, rec_type: RecType) -> Result { 164 | let mut lrh = self.prepare_lrh(csn, checkpoint_csn, tsn, rec_type); 165 | lrh.crc32 = crc32::crc32_finalize(lrh.crc32); 166 | 167 | let mut dst_locked = self.out_stream.get_for_write(LRH_WRITE_SZ).unwrap(); 168 | let mut slice: &mut [u8] = &mut dst_locked; 169 | self.write_header(&lrh, &mut slice)?; 170 | drop(dst_locked); 171 | 172 | Ok(lrh.lsn) 173 | } 174 | 175 | fn write_header(&self, lrh: &LogRecordHeader, slice: &mut &mut [u8]) -> std::io::Result<()> { 176 | (*slice).write_all(&lrh.lsn.to_ne_bytes())?; 177 | (*slice).write_all(&lrh.csn.to_ne_bytes())?; 178 | (*slice).write_all(&lrh.checkpoint_csn.to_ne_bytes())?; 179 | (*slice).write_all(&lrh.tsn.to_ne_bytes())?; 180 | (*slice).write_all(&[(lrh.rec_type as u8)])?; 181 | (*slice).write_all(&lrh.crc32.to_ne_bytes())?; 182 | (*slice).flush()?; 183 | 184 | Ok(()) 185 | } 186 | 187 | fn write_obj_id(&self, obj: &ObjectId, slice: &mut &mut [u8]) -> std::io::Result<()> { 188 | (*slice).write_all(&obj.file_id.to_ne_bytes())?; 189 | (*slice).write_all(&obj.extent_id.to_ne_bytes())?; 190 | (*slice).write_all(&obj.block_id.to_ne_bytes())?; 191 | (*slice).write_all(&obj.entry_id.to_ne_bytes())?; 192 | (*slice).flush()?; 193 | 194 | Ok(()) 195 | } 196 | 
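// For reference, the fixed-size prefix emitted by write_header() and write_obj_id() above,
// with all integers in native byte order (to_ne_bytes):
//
//   lsn u64 | csn u64 | checkpoint_csn u64 | tsn u64 | rec_type u8 | crc32 u32  = LRH_WRITE_SZ (37 bytes)
//   file_id u16 | extent_id u16 | block_id u16 | entry_id u16                   = OBJECT_ID_WRITE_SZ (8 bytes)
//
// Data records additionally carry the vector bytes, a u32 payload length, and the payload
// itself, which is exactly the amount reserved in write_data().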
197 | fn prepare_lrh(&self, csn: u64, checkpoint_csn: u64, tsn: u64, rec_type: RecType) -> LogRecordHeader { 198 | let mut lrh = LogRecordHeader::new(); 199 | let mut crc32; 200 | 201 | lrh.lsn = self.lsn.get_next(); 202 | lrh.tsn = tsn; 203 | lrh.csn = csn; 204 | lrh.checkpoint_csn = checkpoint_csn; 205 | lrh.rec_type = rec_type; 206 | 207 | crc32 = crc32::crc32_begin(); 208 | LogOps::calc_header_crc(&mut crc32, &lrh); 209 | lrh.crc32 = crc32; 210 | 211 | lrh 212 | } 213 | 214 | pub fn terminate(self) { 215 | self.out_stream.terminate(); 216 | } 217 | } 218 | 219 | 220 | // Log reader 221 | pub struct LogReader { 222 | fs: FileStream, 223 | data_buf: Vec, 224 | vector: Vector, 225 | obj: ObjectId, 226 | stop_pos: u32, 227 | lsn: u64, 228 | csn: u64, 229 | latest_commit_csn: u64, 230 | checkpoint_csn: u64, 231 | } 232 | 233 | impl LogReader { 234 | 235 | pub fn new(fs: FileStream) -> Result { 236 | 237 | Ok(LogReader { 238 | fs, 239 | data_buf: Vec::new(), 240 | vector: Vector::new(), 241 | obj: ObjectId::new(), 242 | stop_pos: 0, 243 | lsn: 1, 244 | csn: 1, 245 | latest_commit_csn: 1, 246 | checkpoint_csn: 1, 247 | }) 248 | } 249 | 250 | pub fn get_vector(&mut self) -> &mut Vector { 251 | &mut self.vector 252 | } 253 | 254 | pub fn find_write_position(&mut self) -> Result<(u32, u64, u64, u64), Error> { 255 | while let Some(_lrh) = self.read_next()? { } 256 | 257 | Ok((self.stop_pos, self.lsn, self.csn, self.latest_commit_csn)) 258 | } 259 | 260 | pub fn seek_to_latest_checkpoint(&mut self) -> Result, Error> { 261 | 262 | let mut seek_pos = 0; 263 | let mut start_seek_pos = 0; 264 | let mut csn = 0; 265 | let mut tsn = 0; 266 | 267 | // find latest completed checkpoint 268 | while let Some(lrh) = self.read_next()? { 269 | 270 | if lrh.rec_type == RecType::CheckpointBegin { 271 | start_seek_pos = self.fs.get_cur_pos()?; 272 | csn = lrh.checkpoint_csn; 273 | } else if lrh.rec_type == RecType::CheckpointCompleted { 274 | if csn == lrh.checkpoint_csn { 275 | seek_pos = start_seek_pos; 276 | self.checkpoint_csn = csn; 277 | tsn = lrh.tsn; 278 | } 279 | } 280 | } 281 | 282 | if seek_pos > 0 { 283 | self.fs.seek(SeekFrom::Start(seek_pos))?; 284 | 285 | Ok(Some((self.checkpoint_csn, tsn))) 286 | } else { 287 | Ok(None) 288 | } 289 | } 290 | 291 | pub fn read_next(&mut self) -> Result, Error> { 292 | let data_len: u32; 293 | let mut crc32; 294 | let mut u32_buf = [0u8; 4]; 295 | let mut u64_buf = [0u8; 8]; 296 | let mut latest_commit_csn = 0; 297 | let mut checkpoint_csn = 0; 298 | 299 | match self.read_header() { 300 | Ok(lrh) => { 301 | crc32 = crc32::crc32_begin(); 302 | LogOps::calc_header_crc(&mut crc32, &lrh); 303 | 304 | match lrh.rec_type { 305 | RecType::Commit => { 306 | latest_commit_csn = lrh.csn; 307 | }, 308 | RecType::Rollback => { 309 | }, 310 | RecType::CheckpointBegin => { 311 | self.fs.read_exact(&mut u64_buf)?; 312 | checkpoint_csn = u64::from_ne_bytes(u64_buf); 313 | crc32 = crc32::crc32_num(crc32, checkpoint_csn); 314 | 315 | if self.latest_commit_csn < checkpoint_csn { 316 | latest_commit_csn = checkpoint_csn; 317 | } 318 | }, 319 | RecType::CheckpointCompleted => { 320 | self.fs.read_exact(&mut u64_buf)?; 321 | checkpoint_csn = u64::from_ne_bytes(u64_buf); 322 | crc32 = crc32::crc32_num(crc32, checkpoint_csn); 323 | 324 | if self.latest_commit_csn < checkpoint_csn { 325 | latest_commit_csn = checkpoint_csn; 326 | } 327 | }, 328 | RecType::Data => { 329 | self.obj = self.read_object_id()?; 330 | LogOps::calc_obj_id_crc(&mut crc32, &self.obj); 331 | 332 | 
self.fs.read_exact(self.vector.buf_mut())?; 333 | self.vector.update_from_buf(); 334 | crc32 = crc32::crc32_arr(crc32, &self.vector.buf()); 335 | 336 | self.fs.read_exact(&mut u32_buf)?; 337 | data_len = u32::from_ne_bytes(u32_buf); 338 | crc32 = crc32::crc32_num(crc32, data_len); 339 | 340 | if data_len > 0 { 341 | self.data_buf.resize(data_len as usize, 0); 342 | 343 | self.fs.read_exact(&mut self.data_buf)?; 344 | crc32 = crc32::crc32_arr(crc32, &self.data_buf); 345 | } 346 | }, 347 | RecType::Delete => { 348 | self.obj = self.read_object_id()?; 349 | LogOps::calc_obj_id_crc(&mut crc32, &self.obj); 350 | }, 351 | RecType::Unspecified => {}, 352 | } 353 | 354 | crc32 = crc32::crc32_finalize(crc32); 355 | 356 | if crc32 == lrh.crc32 { 357 | self.stop_pos = self.fs.get_cur_pos()? as u32; 358 | self.lsn = lrh.lsn; 359 | self.csn = lrh.csn; 360 | if lrh.rec_type == RecType::Commit || 361 | lrh.rec_type == RecType::CheckpointBegin || 362 | lrh.rec_type == RecType::CheckpointCompleted 363 | { 364 | self.latest_commit_csn = latest_commit_csn; 365 | if lrh.rec_type == RecType::CheckpointBegin || lrh.rec_type == RecType::CheckpointCompleted { 366 | self.checkpoint_csn = checkpoint_csn; 367 | } 368 | } 369 | 370 | return Ok(Some(lrh)); 371 | } else { 372 | return Ok(None); 373 | } 374 | }, 375 | Err(e) => { 376 | if ErrorKind::IoError == e.kind() { 377 | let ioe = e.io_err().unwrap(); 378 | if ioe.kind() == std::io::ErrorKind::UnexpectedEof { 379 | return Ok(None); 380 | } else { 381 | return Err(Error::io_error(ioe)); 382 | } 383 | } 384 | 385 | return Err(e); 386 | } 387 | } 388 | } 389 | 390 | pub fn get_object_id(&self) -> ObjectId { 391 | self.obj 392 | } 393 | 394 | pub fn get_data(&self) -> &[u8] { 395 | &self.data_buf 396 | } 397 | 398 | fn read_header(&mut self) -> Result { 399 | let mut lrh = LogRecordHeader::new(); 400 | 401 | let mut u32_buf = [0u8; 4]; 402 | let mut u64_buf = [0u8; 8]; 403 | let mut byte = [0u8]; 404 | self.fs.read_exact(&mut u64_buf)?; 405 | lrh.lsn = u64::from_ne_bytes(u64_buf); 406 | self.fs.read_exact(&mut u64_buf)?; 407 | lrh.csn = u64::from_ne_bytes(u64_buf); 408 | self.fs.read_exact(&mut u64_buf)?; 409 | lrh.checkpoint_csn = u64::from_ne_bytes(u64_buf); 410 | self.fs.read_exact(&mut u64_buf)?; 411 | lrh.tsn = u64::from_ne_bytes(u64_buf); 412 | self.fs.read(&mut byte)?; 413 | lrh.rec_type = match byte[0] { 414 | 0 => RecType::Unspecified, 415 | 1 => RecType::Commit, 416 | 2 => RecType::Rollback, 417 | 3 => RecType::Data, 418 | 4 => RecType::Delete, 419 | 5 => RecType::CheckpointBegin, 420 | 6 => RecType::CheckpointCompleted, 421 | _ => panic!("Unexpected record type in the log"), 422 | }; 423 | self.fs.read_exact(&mut u32_buf)?; 424 | lrh.crc32 = u32::from_ne_bytes(u32_buf); 425 | 426 | Ok(lrh) 427 | } 428 | 429 | fn read_object_id(&mut self) -> Result { 430 | let mut u64_buf = [0u8; 8]; 431 | let mut ret = ObjectId::new(); 432 | 433 | self.fs.read_exact(&mut u64_buf)?; 434 | 435 | ret.file_id = u16::slice_to_int(&u64_buf[0..2]).unwrap(); 436 | ret.extent_id = u16::slice_to_int(&u64_buf[2..4]).unwrap(); 437 | ret.block_id = u16::slice_to_int(&u64_buf[4..6]).unwrap(); 438 | ret.entry_id = u16::slice_to_int(&u64_buf[6..8]).unwrap(); 439 | 440 | Ok(ret) 441 | } 442 | } 443 | 444 | 445 | /// Utility functions for working with log 446 | pub struct LogOps {} 447 | 448 | impl LogOps { 449 | 450 | fn calc_header_crc(crc32: &mut u32, lrh: &LogRecordHeader) { 451 | *crc32 = crc32::crc32_num(*crc32, lrh.lsn); 452 | *crc32 = crc32::crc32_num(*crc32, lrh.csn); 453 | *crc32 = 
crc32::crc32_num(*crc32, lrh.checkpoint_csn); 454 | *crc32 = crc32::crc32_num(*crc32, lrh.tsn); 455 | *crc32 = crc32::crc32_num(*crc32, lrh.rec_type as u8); 456 | } 457 | 458 | fn calc_obj_id_crc(crc32: &mut u32, obj: &ObjectId) { 459 | *crc32 = crc32::crc32_num(*crc32, obj.file_id); 460 | *crc32 = crc32::crc32_num(*crc32, obj.extent_id); 461 | *crc32 = crc32::crc32_num(*crc32, obj.block_id); 462 | *crc32 = crc32::crc32_num(*crc32, obj.entry_id); 463 | } 464 | } 465 | 466 | 467 | -------------------------------------------------------------------------------- /src/buf_mgr/buf_writer.rs: -------------------------------------------------------------------------------- 1 | /// Write data blocks from the buffer to database file 2 | 3 | 4 | use crate::common::errors::Error; 5 | use crate::common::defs::Sequence; 6 | use crate::common::defs::BlockId; 7 | use crate::common::intercom::SyncNotification; 8 | use crate::storage::datastore::FileDesc; 9 | use crate::buf_mgr::buf_mgr::BlockType; 10 | use crate::block_mgr::block_mgr::BlockMgr; 11 | use crate::block_mgr::block::BasicBlock; 12 | use crate::block_mgr::block::BlockLockedMut; 13 | use crate::block_mgr::block::DataBlock; 14 | use std::sync::Arc; 15 | use std::sync::atomic::AtomicBool; 16 | use std::sync::atomic::AtomicU64; 17 | use std::sync::atomic::Ordering; 18 | use std::sync::RwLock; 19 | use std::thread::JoinHandle; 20 | use std::time::Duration; 21 | use log::error; 22 | 23 | 24 | const CONDVAR_WAIT_INTERVAL_MS: u64 = 1000; 25 | const WAKE_WRITER_THREADS_INTERVAL_MS: u64 = 1000; 26 | 27 | 28 | #[derive(Clone)] 29 | pub struct BufWriter { 30 | writer_threads: Arc>>, 31 | terminate: Arc, 32 | write_ready: SyncNotification, 33 | chkpnt_allocators: CheckpointStoreBlockAllocators, 34 | waker: Arc> 35 | } 36 | 37 | 38 | impl BufWriter { 39 | 40 | pub fn new(block_mgr: &BlockMgr, writer_num: usize) -> Result { 41 | let terminate = Arc::new(AtomicBool::new(false)); 42 | let mut writer_threads = Vec::with_capacity(writer_num); 43 | let write_ready = SyncNotification::new(0); 44 | let chkpnt_allocators = CheckpointStoreBlockAllocators::new(); 45 | 46 | for _ in 0..writer_num { 47 | let terminate2 = terminate.clone(); 48 | 49 | let block_mgr2 = block_mgr.clone()?; 50 | 51 | let write_ready2 = write_ready.clone(); 52 | 53 | let chkpnt_allocators2 = chkpnt_allocators.clone(); 54 | 55 | let handle = std::thread::spawn(move || { 56 | Self::writer_thread(block_mgr2, terminate2, write_ready2, chkpnt_allocators2); 57 | }); 58 | 59 | writer_threads.push(handle); 60 | } 61 | 62 | let writer_threads = Arc::new(writer_threads); 63 | let wr = write_ready.clone(); 64 | let wt = writer_threads.clone(); 65 | let tm = terminate.clone(); 66 | let waker = Arc::new(std::thread::spawn(move || { 67 | Self::waker_thread(wr, wt, tm); 68 | })); 69 | 70 | Ok(BufWriter { 71 | writer_threads, 72 | terminate, 73 | write_ready, 74 | chkpnt_allocators, 75 | waker, 76 | }) 77 | } 78 | 79 | // wake writers with time interval 80 | fn waker_thread(write_ready: SyncNotification, 81 | writer_threads: Arc>>, 82 | terminate: Arc) { 83 | while !terminate.load(Ordering::Relaxed) { 84 | std::thread::sleep(Duration::from_millis(WAKE_WRITER_THREADS_INTERVAL_MS)); 85 | write_ready.send(writer_threads.len(), true); 86 | loop { 87 | if let Some(lock) = write_ready.wait_for_interruptable( 88 | &mut (|count| -> bool { *count != 0 }), 89 | &mut (|| -> bool { terminate.load(Ordering::Relaxed) }), 90 | Duration::from_millis(CONDVAR_WAIT_INTERVAL_MS) 91 | ) { 92 | if *lock == 0 { 93 | break; 94 | } 
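// This inner loop completes the hand-shake: send() above published writer_threads.len()
// wake-up tokens, each writer thread consumes one and scans the buffer for dirty blocks,
// and the waker leaves the loop once the count is back to zero or the wait is interrupted
// by termination, then sleeps until the next interval.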
95 | } else { 96 | break; 97 | } 98 | } 99 | } 100 | } 101 | 102 | pub fn terminate(self) { 103 | if let Ok(waker) = Arc::try_unwrap(self.waker) { 104 | self.terminate.store(true, Ordering::Relaxed); 105 | waker.join().unwrap(); 106 | 107 | let mut writer_threads = Arc::try_unwrap(self.writer_threads).unwrap(); 108 | for jh in writer_threads.drain(..) { 109 | jh.join().unwrap(); 110 | } 111 | } 112 | } 113 | 114 | /// Write just one data block in current thread. 115 | /// Check if block dirty and write it. If block has not written checkpoint copy the checkpoint 116 | /// copy must be written to disk first. 117 | pub fn write_data_block(&self, block: &mut BlockLockedMut, block_mgr: &BlockMgr, leave_dirty: bool) -> Result<(), Error> { 118 | Self::write_block(block, block_mgr, &self.chkpnt_allocators, leave_dirty) 119 | } 120 | 121 | fn writer_thread(block_mgr: BlockMgr, terminate: Arc, write_ready: SyncNotification, mut chkpnt_allocators: CheckpointStoreBlockAllocators) { 122 | 123 | loop { 124 | if let Some(mut lock) = write_ready.wait_for_interruptable( 125 | &mut (|count| -> bool { *count == 0 }), 126 | &mut (|| -> bool { terminate.load(Ordering::Relaxed) }), 127 | Duration::from_millis(CONDVAR_WAIT_INTERVAL_MS) 128 | ) { 129 | if *lock > 0 { 130 | *lock -= 1; 131 | drop(lock); 132 | if let Err(e) = Self::write_blocks(&block_mgr, &mut chkpnt_allocators) { 133 | error!("Failed to perform block write: {}", e); 134 | } 135 | } 136 | } else { 137 | break; 138 | } 139 | } 140 | } 141 | 142 | fn write_blocks(block_mgr: &BlockMgr, chkpnt_allocators: &CheckpointStoreBlockAllocators) -> Result<(), Error> { 143 | 144 | let mut iter = block_mgr.get_iter(); 145 | 146 | while let Some(desc) = iter.next() { 147 | if desc.dirty && desc.block_type != BlockType::CheckpointBlock { 148 | if let Some(mut block) = block_mgr.get_block_by_idx(desc.id, desc.block_id, desc.block_type) { 149 | if block.get_id() == desc.block_id { 150 | Self::write_block(&mut block, block_mgr, chkpnt_allocators, false)?; 151 | } 152 | } 153 | } 154 | } 155 | 156 | Ok(()) 157 | } 158 | 159 | fn write_block(mut block: &mut BlockLockedMut, block_mgr: &BlockMgr, chkpnt_allocators: &CheckpointStoreBlockAllocators, leave_dirty: bool) -> Result<(), Error> { 160 | 161 | let desc = block_mgr.get_block_desc(block.get_buf_idx()).unwrap(); 162 | 163 | if desc.dirty { 164 | if desc.block_type == BlockType::DataBlock { 165 | if !desc.checkpoint_written { 166 | let mut checkpoint_block = block_mgr.get_block_mut_no_lock(&desc.checkpoint_block_id)?; 167 | let checkpoint_csn = checkpoint_block.get_checkpoint_csn(); 168 | let actual_block_id = chkpnt_allocators.get_next_block_id(checkpoint_csn, block_mgr)?; 169 | checkpoint_block.set_id(actual_block_id); 170 | block_mgr.write_block(&mut checkpoint_block)?; 171 | block_mgr.set_checkpoint_written(desc.id, true); 172 | } 173 | block_mgr.write_block(&mut block)?; 174 | if !leave_dirty { 175 | block_mgr.set_dirty(desc.id, false); 176 | } 177 | } else if desc.block_type == BlockType::VersionBlock { 178 | block_mgr.write_block(&mut block)?; 179 | if !leave_dirty { 180 | block_mgr.set_dirty(desc.id, false); 181 | } 182 | } 183 | } 184 | 185 | Ok(()) 186 | } 187 | } 188 | 189 | 190 | /// Array of two allocators. 
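// get_next_block_id() below selects allocators[checkpoint_csn & 1], and the chosen allocator
// folds the same parity into the extent number (extent_id = n / extent_size * 2 + parity + 1),
// so blocks of two consecutive checkpoints land in disjoint extent sets of the checkpoint
// store instead of overwriting one another.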
191 | #[derive(Clone)] 192 | struct CheckpointStoreBlockAllocators { 193 | allocators: [CheckpointStoreBlockAllocator;2], 194 | } 195 | 196 | impl CheckpointStoreBlockAllocators { 197 | pub fn new() -> Self { 198 | CheckpointStoreBlockAllocators { 199 | allocators: [CheckpointStoreBlockAllocator::new(), CheckpointStoreBlockAllocator::new()], 200 | } 201 | } 202 | 203 | /// Return next block id for writing block to checkpoint store. 204 | pub fn get_next_block_id(&self, checkpoint_csn: u64, block_mgr: &BlockMgr) -> Result { 205 | let allocator_id = checkpoint_csn as usize & 0x1; 206 | self.allocators[allocator_id].get_next_block_id(checkpoint_csn, &block_mgr) 207 | } 208 | } 209 | 210 | 211 | struct AllocatorState { 212 | file_info: Vec, 213 | parity: u64, 214 | } 215 | 216 | 217 | /// Block allocator determines to which block on disk in the checkpoint datastore a block 218 | /// from the buffer can be written. 219 | #[derive(Clone)] 220 | struct CheckpointStoreBlockAllocator { 221 | seqn: Sequence, 222 | lock: Arc>, 223 | cur_checkpoint_csn: Arc, 224 | } 225 | 226 | impl CheckpointStoreBlockAllocator { 227 | 228 | pub fn new() -> Self { 229 | let seqn = Sequence::new(0); 230 | let file_info = Vec::::new(); 231 | let parity = 0; 232 | 233 | let allocator_state = AllocatorState { 234 | file_info, 235 | parity, 236 | }; 237 | let lock = Arc::new(RwLock::new(allocator_state)); 238 | 239 | let cur_checkpoint_csn = Arc::new(AtomicU64::new(0)); 240 | 241 | CheckpointStoreBlockAllocator { 242 | seqn, 243 | lock, 244 | cur_checkpoint_csn, 245 | } 246 | } 247 | 248 | /// Return next block id for writing block to checkpoint store. 249 | pub fn get_next_block_id(&self, checkpoint_csn: u64, block_mgr: &BlockMgr) -> Result { 250 | 251 | if self.cur_checkpoint_csn.load(Ordering::Relaxed) != checkpoint_csn { 252 | self.next_checkpoint(checkpoint_csn, block_mgr); 253 | } 254 | 255 | let sl = self.lock.read().unwrap(); 256 | 257 | let n = self.seqn.get_next(); 258 | let fid_shift = (n % sl.file_info.len() as u64) as usize; 259 | 260 | let filen = sl.file_info.len(); 261 | for i in fid_shift..filen+fid_shift { 262 | 263 | let fid = if i >= filen { 264 | i - filen 265 | } else { 266 | i 267 | }; 268 | 269 | let fi = sl.file_info[fid]; 270 | 271 | let file_id = fi.file_id; 272 | let n = n / filen as u64; 273 | let extent_size = fi.extent_size as u64; 274 | let extent_id = (n / extent_size * 2 + sl.parity) as u16 + 1; // avoid extent 0 by adding 1 275 | let block_id = (n % extent_size as u64) as u16; 276 | if extent_id >= fi.extent_num { 277 | if fi.extent_num == fi.max_extent_num { 278 | continue; 279 | } else { 280 | block_mgr.add_extent(file_id)?; 281 | let mut inc = 1; 282 | if extent_id > fi.extent_num { 283 | if fi.extent_num <= fi.max_extent_num - 2 { 284 | block_mgr.add_extent(file_id)?; 285 | inc += 1; 286 | } else { 287 | continue; 288 | } 289 | } 290 | drop(sl); 291 | let mut xl = self.lock.write().unwrap(); 292 | xl.file_info[fid].extent_num += inc; 293 | drop(xl); 294 | } 295 | } 296 | 297 | return Ok(BlockId { 298 | file_id, 299 | extent_id, 300 | block_id, 301 | }); 302 | } 303 | 304 | Err(Error::checkpoint_store_size_limit_reached()) 305 | } 306 | 307 | // The function is called when new checkpoint is created. 
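// Under the write lock it flips the stored parity to checkpoint_csn & 1, resets the block-id
// sequence to zero, refreshes the checkpoint store file list from the block manager, and
// records the new checkpoint_csn; the second check of cur_checkpoint_csn inside the lock
// keeps the transition idempotent when several writer threads race to switch checkpoints.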
308 | fn next_checkpoint(&self, checkpoint_csn: u64, block_mgr: &BlockMgr) { 309 | let mut xl = self.lock.write().unwrap(); 310 | 311 | if self.cur_checkpoint_csn.load(Ordering::Relaxed) != checkpoint_csn { 312 | xl.parity = checkpoint_csn & 0x1; 313 | 314 | self.seqn.set(0); 315 | 316 | block_mgr.get_checkpoint_files(&mut xl.file_info); 317 | 318 | self.cur_checkpoint_csn.store(checkpoint_csn, Ordering::Relaxed); 319 | } 320 | } 321 | } 322 | 323 | 324 | #[cfg(test)] 325 | mod tests { 326 | 327 | use super::*; 328 | use crate::storage::datastore::DataStore; 329 | use crate::storage::datastore::FileType; 330 | use crate::storage::datastore::FileState; 331 | use crate::block_mgr::block::BasicBlock; 332 | use crate::system::config::ConfigMt; 333 | use crate::buf_mgr::buf_mgr::Pinned; 334 | use crate::buf_mgr::buf_mgr::BlockArea; 335 | use std::time::Duration; 336 | use std::path::Path; 337 | use std::rc::Rc; 338 | use std::cell::Ref; 339 | 340 | 341 | fn init_datastore(dspath: &str, block_size: usize) -> Vec { 342 | 343 | if Path::new(&dspath).exists() { 344 | std::fs::remove_dir_all(&dspath).expect("Failed to delete test dir on cleanup"); 345 | } 346 | std::fs::create_dir(&dspath).expect("Failed to create test dir"); 347 | 348 | let mut fdset = vec![]; 349 | let desc1 = FileDesc { 350 | state: FileState::InUse, 351 | file_id: 3, 352 | extent_size: 16, 353 | extent_num: 3, 354 | max_extent_num: 65500, 355 | file_type: FileType::DataStoreFile, 356 | }; 357 | let desc2 = FileDesc { 358 | state: FileState::InUse, 359 | file_id: 4, 360 | extent_size: 10, 361 | extent_num: 3, 362 | max_extent_num: 65500, 363 | file_type: FileType::DataStoreFile, 364 | }; 365 | let desc3 = FileDesc { 366 | state: FileState::InUse, 367 | file_id: 5, 368 | extent_size: 10, 369 | extent_num: 3, 370 | max_extent_num: 65500, 371 | file_type: FileType::CheckpointStoreFile, 372 | }; 373 | let desc4 = FileDesc { 374 | state: FileState::InUse, 375 | file_id: 6, 376 | extent_size: 10, 377 | extent_num: 2, 378 | max_extent_num: 65500, 379 | file_type: FileType::VersioningStoreFile, 380 | }; 381 | 382 | fdset.push(desc1); 383 | fdset.push(desc2); 384 | fdset.push(desc3); 385 | fdset.push(desc4); 386 | 387 | DataStore::initialize_datastore(dspath, block_size, &fdset).expect("Failed to init datastore"); 388 | fdset 389 | } 390 | 391 | #[test] 392 | fn test_buf_writer() { 393 | let writer_num = 2; 394 | let dspath = "/tmp/test_buf_writer_34554654"; 395 | let block_size = 8192; 396 | let block_num = 100; 397 | 398 | let conf = ConfigMt::new(); 399 | let mut c = conf.get_conf(); 400 | c.set_datastore_path(dspath.to_owned()); 401 | c.set_block_mgr_n_lock(10); 402 | c.set_block_buf_size(block_num*block_size as u64); 403 | drop(c); 404 | 405 | let _init_fdesc = init_datastore(dspath, block_size); 406 | 407 | let block_mgr = Rc::new(BlockMgr::new(conf.clone()).expect("Failed to create instance")); 408 | 409 | let bw = BufWriter::new(&block_mgr.clone(), writer_num).expect("Failed to create buffer writers"); 410 | 411 | // prepare dirty blocks 412 | let mut idxs = vec![]; 413 | for i in 1..16 { 414 | let block_id = BlockId::init(3,1,i); 415 | let mut block = block_mgr.get_block_mut(&block_id).expect("Failed to get block for write"); 416 | block.set_checkpoint_csn(1); 417 | block.add_entry(148); 418 | let idx = block.get_buf_idx(); 419 | idxs.push(idx); 420 | let desc = block_mgr.get_block_desc(idx).unwrap(); 421 | assert!(desc.dirty); 422 | 423 | // add checkpoint block to it 424 | let mut c_block = 
block_mgr.allocate_on_cache_mut_no_lock(BlockId::init(0,0,i), BlockType::CheckpointBlock).expect("Failed to allocate block"); 425 | c_block.copy_from(&block); 426 | c_block.set_original_id(block.get_id()); 427 | block_mgr.set_checkpoint_block_id(block.get_buf_idx(), c_block.get_id()); 428 | block_mgr.set_checkpoint_written(block.get_buf_idx(), false); 429 | 430 | drop(c_block); 431 | drop(block); 432 | } 433 | 434 | std::thread::sleep(Duration::from_millis(2*WAKE_WRITER_THREADS_INTERVAL_MS)); 435 | 436 | let mut i =0; 437 | assert!(loop { 438 | std::thread::sleep(Duration::new(2,0)); 439 | let mut dirty = false; 440 | for idx in idxs.iter() { 441 | let desc = block_mgr.get_block_desc(*idx).unwrap(); 442 | if desc.dirty { 443 | dirty = true; 444 | } 445 | } 446 | if ! dirty { 447 | break true; 448 | } 449 | i += 1; 450 | if i == 30 { 451 | break false; 452 | } 453 | }, "Writers couldn't complete in 60 secs"); 454 | 455 | 456 | // direct block write 457 | let block_id = BlockId::init(4,1,1); 458 | let mut block = block_mgr.get_block_mut(&block_id).expect("Failed to get versioning for write"); 459 | let idx = block.get_buf_idx(); 460 | block.set_checkpoint_csn(123); 461 | 462 | bw.write_data_block(&mut block, &block_mgr, true).expect("Failed to write versioning block"); 463 | let desc = block_mgr.get_block_desc(idx).unwrap(); 464 | assert!(desc.dirty); 465 | 466 | bw.write_data_block(&mut block, &block_mgr, false).expect("Failed to write versioning block"); 467 | let desc = block_mgr.get_block_desc(idx).unwrap(); 468 | assert!(!desc.dirty); 469 | 470 | 471 | bw.terminate(); 472 | 473 | drop(block); 474 | drop(block_mgr); 475 | 476 | 477 | let ds = DataStore::new(conf).expect("Failed to create datastore"); 478 | let stub_pin = AtomicU64::new(1000); 479 | for i in 1..16 { 480 | let block_id = BlockId::init(3,1,i); 481 | let ba: Ref = ds.load_block(&block_id, FileState::InUse).expect("Failed to load block"); 482 | let db = DataBlock::new(block_id, 0, Pinned::::new(ba.clone(), &stub_pin)); 483 | assert_eq!(db.get_checkpoint_csn(), 1); 484 | assert!(db.has_entry(0)); 485 | drop(db); 486 | drop(ba); 487 | 488 | let block_id = BlockId::init(5, if i<10 {2} else {4}, if i<10 {i} else {i - 10}); 489 | let ba: Ref = ds.load_block(&block_id, FileState::InUse).expect("Failed to load block"); 490 | let db = DataBlock::new(block_id, 0, Pinned::::new(ba.clone(), &stub_pin)); 491 | assert_eq!(db.get_checkpoint_csn(), 1); 492 | assert!(db.has_entry(0)); 493 | drop(db); 494 | drop(ba); 495 | } 496 | 497 | let block_id = BlockId::init(4,1,1); 498 | let ba: Ref = ds.load_block(&block_id, FileState::InUse).expect("Failed to load block"); 499 | let db = DataBlock::new(block_id, 0, Pinned::::new(ba.clone(), &stub_pin)); 500 | assert_eq!(db.get_checkpoint_csn(), 123); 501 | } 502 | } 503 | 504 | -------------------------------------------------------------------------------- /src/log_mgr/buf.rs: -------------------------------------------------------------------------------- 1 | //! Log buffer 2 | 3 | use std::sync::atomic::{AtomicUsize, Ordering, compiler_fence}; 4 | use std::sync::{Mutex, Condvar, Arc}; 5 | use std::cell::RefCell; 6 | use std::ops::{Deref, DerefMut}; 7 | use crate::common::errors::{Error}; 8 | 9 | 10 | #[cfg(feature = "metrics")] 11 | use std::sync::atomic::AtomicU64; 12 | 13 | 14 | thread_local! 
{ 15 | static CUR_BUF: RefCell = RefCell::new(0); 16 | } 17 | 18 | 19 | #[repr(align(64))] 20 | struct CacheAligned (T); 21 | 22 | 23 | /// Shared buffer which allows reserving mutable areas of memory concurrently. 24 | /// # Safety 25 | /// This structure is internal, and can't be considered safe by itself. 26 | pub struct Buf { 27 | acquire_size: CacheAligned, 28 | done_size: CacheAligned, 29 | used_size: AtomicUsize, 30 | ptr: *mut T, 31 | size: usize, 32 | } 33 | 34 | impl Buf { 35 | 36 | /// Create a new instance with allocation of `size` items. The memory is never deallocated 37 | /// after the allocation. 38 | /// `size` must be > 0 and <= std::isize::MAX. 39 | /// # Errors 40 | /// Returns `Err` if allocation has failed. 41 | fn new(size: usize) -> Result, Error> { 42 | 43 | let dt_sz = std::mem::size_of::(); 44 | 45 | if size == 0 || dt_sz == 0 || size > (std::isize::MAX as usize) / dt_sz { 46 | 47 | return Err(Error::incorrect_allocation_size()); 48 | } 49 | 50 | let ptr: *mut T; 51 | 52 | unsafe { 53 | 54 | let align = std::mem::align_of::(); 55 | ptr = std::alloc::alloc( 56 | std::alloc::Layout::from_size_align(size * dt_sz, align) 57 | .map_err(|e| { Error::incorrect_layout(e) })? 58 | ) as *mut T; 59 | } 60 | 61 | if ptr.is_null() { 62 | 63 | Err(Error::allocation_failure()) 64 | 65 | } else { 66 | 67 | Ok(Buf { 68 | acquire_size: CacheAligned(AtomicUsize::new(0)), 69 | done_size: CacheAligned(AtomicUsize::new(0)), 70 | used_size: AtomicUsize::new(0), 71 | ptr, 72 | size, 73 | }) 74 | } 75 | } 76 | 77 | 78 | /// Sets used space count to zero. 79 | fn reset(&self) { 80 | 81 | self.used_size.store(0, Ordering::Relaxed); 82 | self.done_size.0.store(0, Ordering::Relaxed); 83 | 84 | compiler_fence(Ordering::SeqCst); 85 | 86 | self.acquire_size.0.store(0, Ordering::Relaxed); 87 | } 88 | 89 | 90 | /// Returns Slice instance, and "notify writer" 91 | fn reserve_slice(&self, reserve_size: usize, relaxed: bool) -> (Option<&mut [T]>, bool) { 92 | 93 | if reserve_size == 0 { 94 | 95 | return (Some(&mut []), false); 96 | } 97 | 98 | if reserve_size > self.size || reserve_size > std::usize::MAX - self.size { 99 | 100 | return (None, false); 101 | } 102 | 103 | let mut prev_acq_size = self.acquire_size.0.load(Ordering::Relaxed); 104 | 105 | loop { 106 | 107 | if prev_acq_size > self.size { 108 | 109 | return (None, false); 110 | } 111 | 112 | let cur_acq_size = self.acquire_size.0.compare_and_swap( 113 | prev_acq_size, 114 | prev_acq_size + reserve_size, 115 | Ordering::Relaxed, 116 | ); 117 | 118 | if cur_acq_size == prev_acq_size { 119 | 120 | if cur_acq_size + reserve_size > self.size { 121 | 122 | if self.size > cur_acq_size { 123 | let done_size = self.size - cur_acq_size; 124 | if relaxed { 125 | self.used_size.fetch_add(done_size, Ordering::Relaxed); 126 | 127 | let slice; 128 | unsafe { 129 | 130 | slice = std::slice::from_raw_parts_mut(self.ptr.offset(cur_acq_size as isize), done_size); 131 | } 132 | 133 | return (Some(slice), true); 134 | } else { 135 | let total_done = self.done_size.0.fetch_add(done_size, Ordering::Relaxed) + done_size; 136 | return (None, total_done == self.size); 137 | } 138 | } 139 | 140 | return (None, false); 141 | 142 | } else { 143 | 144 | self.used_size.fetch_add(reserve_size, Ordering::Relaxed); 145 | 146 | let slice; 147 | unsafe { 148 | 149 | slice = std::slice::from_raw_parts_mut(self.ptr.offset(cur_acq_size as isize), reserve_size); 150 | } 151 | 152 | return (Some(slice), true); 153 | } 154 | 155 | } else { 156 | 157 | prev_acq_size = cur_acq_size; 
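// The compare_and_swap returned a value other than the expected one, meaning another
// thread reserved space first; retry using the size that thread published as the new
// expected value.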
158 | } 159 | } 160 | } 161 | 162 | 163 | #[inline] 164 | fn inc_done_size(&self, reserve_size: usize) -> usize { 165 | return self.done_size.0.fetch_add(reserve_size, Ordering::Relaxed) + reserve_size; 166 | } 167 | 168 | /// try to take up all remaining space, return true if to notify writer 169 | fn reserve_rest(&self) -> bool { 170 | 171 | let reserve_size = self.size + 1; 172 | 173 | let mut prev_acq_size = self.acquire_size.0.load(Ordering::Relaxed); 174 | 175 | loop { 176 | 177 | if prev_acq_size > self.size { 178 | 179 | return false; 180 | } 181 | 182 | let cur_acq_size = self.acquire_size.0.compare_and_swap( 183 | prev_acq_size, 184 | prev_acq_size + reserve_size, 185 | Ordering::Relaxed, 186 | ); 187 | 188 | if cur_acq_size == prev_acq_size { 189 | 190 | if self.size > cur_acq_size { 191 | 192 | let done_size = self.size - cur_acq_size; 193 | let total_done = self.done_size.0.fetch_add(done_size, Ordering::Relaxed) + done_size; 194 | 195 | return total_done == self.size; 196 | } 197 | 198 | return false; 199 | 200 | } else { 201 | 202 | prev_acq_size = cur_acq_size; 203 | } 204 | } 205 | } 206 | 207 | /// Returns buffer for reading. 208 | fn acquire_for_read(&self) -> &mut [T] { 209 | 210 | let total_written = self.used_size.load(Ordering::Relaxed); 211 | 212 | let ret; 213 | 214 | unsafe { 215 | 216 | ret = std::slice::from_raw_parts_mut(self.ptr, total_written); 217 | }; 218 | 219 | ret 220 | } 221 | } 222 | 223 | 224 | impl Drop for Buf { 225 | 226 | fn drop(&mut self) { 227 | 228 | let align = std::mem::align_of::(); 229 | 230 | unsafe { 231 | 232 | std::alloc::dealloc(self.ptr as *mut u8, std::alloc::Layout::from_size_align(self.size, align).unwrap()); 233 | } 234 | } 235 | } 236 | 237 | 238 | unsafe impl Sync for Buf {} 239 | unsafe impl Send for Buf {} 240 | 241 | /// Metrics values. 242 | #[derive(Debug)] 243 | pub struct Metrics { 244 | wait_time: u64, 245 | wait_count: u64, 246 | } 247 | 248 | #[cfg(feature = "metrics")] 249 | struct MetricsInternal { 250 | wait_time: CacheAligned, 251 | wait_count: CacheAligned, 252 | } 253 | 254 | 255 | 256 | /// Doubled Buf instances (flip-flop buffer) 257 | pub struct DoubleBuf { 258 | bufs: Arc>>, 259 | #[cfg(feature = "metrics")] 260 | metrics: Arc, 261 | buf_state: Arc<(Mutex<[BufState; 2]>, Condvar, Condvar)>, 262 | size: usize, 263 | } 264 | 265 | 266 | impl DoubleBuf { 267 | 268 | 269 | /// Create an instance of buffer pair, each of size `sz`. 
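// A minimal usage sketch of the flip-flop protocol, mirroring how BufferedFileStream in
// fs.rs drives this type (error handling and termination are omitted for brevity):
//
//     let buf: DoubleBuf<u8> = DoubleBuf::new(1024).unwrap();
//     let reader = buf.clone();
//     std::thread::spawn(move || loop {
//         let (data, buf_id) = reader.reserve_for_read(); // blocks until a buffer is full
//         // ... persist `data` somewhere ...
//         reader.set_buf_appendable(buf_id);              // hand the buffer back to writers
//     });
//     let mut slice = buf.reserve_slice(4, false).unwrap();
//     slice.copy_from_slice(&[1, 2, 3, 4]);
//     drop(slice);  // releases the reservation; the buffer becomes readable once it fills up
//     buf.flush();  // seals the partially filled buffer and waits for the reader to drain it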
270 | pub fn new(sz: usize) -> Result, Error> { 271 | 272 | let bufs = Arc::new(vec![Buf::::new(sz)?, Buf::new(sz)?]); 273 | 274 | let buf_state = Arc::new((Mutex::new([BufState::Appendable, BufState::Appendable]), Condvar::new(), Condvar::new())); 275 | 276 | #[cfg(feature = "metrics")] 277 | let metrics = Arc::new(MetricsInternal { 278 | wait_time: CacheAligned(AtomicU64::new(0)), 279 | wait_count: CacheAligned(AtomicU64::new(0)), 280 | }); 281 | 282 | Ok(DoubleBuf { 283 | bufs, 284 | #[cfg(feature = "metrics")] 285 | metrics, 286 | buf_state, 287 | size: sz, 288 | }) 289 | } 290 | 291 | /// return number of buffers 292 | #[inline] 293 | pub fn get_buf_cnt(&self) -> usize { 294 | 295 | self.bufs.len() 296 | } 297 | 298 | fn try_reserve(&self, buf_id: usize, reserve_size: usize, relaxed: bool) -> Option> { 299 | 300 | match self.bufs[buf_id].reserve_slice(reserve_size, relaxed) { 301 | (None, notify) => { 302 | if notify { 303 | self.set_buf_readable(buf_id); 304 | } 305 | return None; 306 | }, 307 | (Some(slice), _) => { 308 | CUR_BUF.with( |v| { 309 | *v.borrow_mut() = buf_id; 310 | }); 311 | 312 | return Some(Slice { 313 | slice, 314 | parent: self, 315 | buf_id, 316 | }); 317 | } 318 | } 319 | } 320 | 321 | /// Reserve slice for write. 322 | pub fn reserve_slice(&self, reserve_size: usize, relaxed: bool) -> Result, ()> { 323 | 324 | let mut cur_buf = 0; 325 | 326 | CUR_BUF.with( |v| { 327 | cur_buf = *v.borrow(); 328 | }); 329 | 330 | let mut appendable = 0; 331 | 332 | loop { 333 | 334 | if let Some(slice) = self.try_reserve(cur_buf, reserve_size, relaxed) { 335 | 336 | return Ok(slice); 337 | 338 | } else if let Some(slice) = self.try_reserve(1 - cur_buf, reserve_size, relaxed) { 339 | 340 | return Ok(slice); 341 | 342 | } else { 343 | 344 | if appendable > 0 { 345 | 346 | #[cfg(feature = "metrics")] 347 | let now = std::time::Instant::now(); 348 | 349 | std::thread::yield_now(); 350 | 351 | if appendable > 10000 { 352 | std::thread::sleep(std::time::Duration::new(0,10_000_000)); 353 | appendable = 0; 354 | } 355 | 356 | #[cfg(feature = "metrics")] 357 | self.inc_metrics(1, std::time::Instant::now().duration_since(now).as_nanos() as u64); 358 | } 359 | 360 | let (buf_id, state) = self.wait_for(BufState::Appendable as u32 | BufState::Terminated as u32); 361 | 362 | if state == BufState::Terminated { 363 | return Err(()); 364 | } 365 | 366 | cur_buf = buf_id; 367 | 368 | appendable += 1; 369 | } 370 | } 371 | } 372 | 373 | 374 | /// Return buffer slice with data and bffer id. 375 | /// If the buffer is not full/ready the method blocks until buffer is ready for processing. 376 | pub fn reserve_for_read(&self) -> (&mut [T], usize) { 377 | 378 | let (buf_id, _) = self.wait_for(BufState::Readable as u32); 379 | 380 | return (self.bufs[buf_id].acquire_for_read(), buf_id); 381 | } 382 | 383 | 384 | /// Wait until one of the buffers has certain `state`. 385 | /// Return buffer id with that state. 
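// Two condition variables back this wait: cvar_r is signalled by set_buf_readable() and wakes
// the reader blocked in reserve_for_read(), while cvar_a is signalled by set_buf_appendable()
// and set_buf_terminated() and wakes producers blocked in reserve_slice() or flush();
// determine_cvar() picks the right one from the requested state mask.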
386 | fn wait_for(&self, state: u32) -> (usize, BufState) { 387 | 388 | let (ref lock, ref cvar_a, ref cvar_r) = *self.buf_state; 389 | 390 | let mut cur_state = lock.lock().unwrap(); 391 | 392 | let cvar = DoubleBuf::::determine_cvar(state, cvar_a, cvar_r); 393 | 394 | loop { 395 | 396 | for i in 0..cur_state.len() { 397 | 398 | if 0 != (cur_state[i] as u32 & state) { 399 | 400 | return (i, cur_state[i]); 401 | } 402 | } 403 | 404 | #[cfg(feature = "metrics")] { 405 | let now = std::time::Instant::now(); 406 | cur_state = cvar.wait(cur_state).unwrap(); 407 | self.inc_metrics(1, std::time::Instant::now().duration_since(now).as_nanos() as u64); 408 | } 409 | 410 | #[cfg(not(feature = "metrics"))] { 411 | cur_state = cvar.wait(cur_state).unwrap(); 412 | } 413 | } 414 | } 415 | 416 | 417 | #[cfg(feature = "metrics")] 418 | #[inline] 419 | fn inc_metrics(&self, wait_cnt: u64, wait_time: u64) { 420 | self.metrics.wait_time.0.fetch_add(wait_time, Ordering::Relaxed); 421 | self.metrics.wait_count.0.fetch_add(wait_cnt, Ordering::Relaxed); 422 | } 423 | 424 | 425 | #[cfg(feature = "metrics")] 426 | pub fn get_metrics(&self) -> Metrics { 427 | #[cfg(feature = "metrics")] { 428 | Metrics { 429 | wait_time: self.metrics.wait_time.0.load(Ordering::Relaxed), 430 | wait_count: self.metrics.wait_count.0.load(Ordering::Relaxed), 431 | } 432 | } 433 | 434 | #[cfg(not(feature = "metrics"))] { 435 | Metrics { 436 | wait_time: 0, 437 | wait_count: 0, 438 | } 439 | } 440 | } 441 | 442 | 443 | /// Wait until i'th buffer has certain `state`. 444 | fn wait_for_buf(&self, state: u32, buf_id: usize) -> BufState { 445 | 446 | let (ref lock, ref cvar_a, ref cvar_r) = *self.buf_state; 447 | 448 | let mut cur_state = lock.lock().unwrap(); 449 | 450 | let cvar = DoubleBuf::::determine_cvar(state, cvar_a, cvar_r); 451 | 452 | loop { 453 | 454 | if 0 != (cur_state[buf_id] as u32 & state) { 455 | 456 | return cur_state[buf_id]; 457 | } 458 | 459 | cur_state = cvar.wait(cur_state).unwrap(); 460 | } 461 | } 462 | 463 | fn set_buf_readable(&self, buf_id: usize) { 464 | 465 | let (ref lock, ref _cvar_a, ref cvar_r) = *self.buf_state; 466 | 467 | let mut cur_state = lock.lock().unwrap(); 468 | 469 | cur_state[buf_id] = BufState::Readable; 470 | 471 | cvar_r.notify_all(); 472 | } 473 | 474 | 475 | pub fn set_buf_terminated(&self, buf_id: usize) { 476 | 477 | let (ref lock, ref cvar_a, ref _cvar_r) = *self.buf_state; 478 | 479 | let mut cur_state = lock.lock().unwrap(); 480 | 481 | cur_state[buf_id] = BufState::Terminated; 482 | 483 | cvar_a.notify_all(); 484 | } 485 | 486 | 487 | pub fn set_buf_appendable(&self, buf_id: usize) { 488 | 489 | let (ref lock, ref cvar_a, ref _cvar_r) = *self.buf_state; 490 | 491 | let mut cur_state = lock.lock().unwrap(); 492 | 493 | compiler_fence(Ordering::SeqCst); 494 | self.bufs[buf_id].reset(); 495 | compiler_fence(Ordering::SeqCst); 496 | 497 | cur_state[buf_id] = BufState::Appendable; 498 | 499 | cvar_a.notify_all(); 500 | } 501 | 502 | 503 | #[inline] 504 | fn determine_cvar<'a>(state: u32, cvar_a: &'a Condvar, cvar_r: &'a Condvar) -> &'a Condvar { 505 | 506 | if 0 != state & (BufState::Readable as u32) { cvar_r } else { cvar_a } 507 | } 508 | 509 | 510 | /// Flush the last used in current thread buffer 511 | pub fn flush(&self) { 512 | 513 | let mut buf_id = 0; 514 | 515 | CUR_BUF.with( |v| { 516 | buf_id = *v.borrow(); 517 | }); 518 | 519 | self.flush_buf(buf_id); 520 | 521 | CUR_BUF.with( |v| { 522 | *v.borrow_mut() = 1 - buf_id; 523 | }); 524 | } 525 | 526 | fn flush_buf(&self, buf_id: 
usize) { 527 | 528 | if self.bufs[buf_id].reserve_rest() { 529 | 530 | self.set_buf_readable(buf_id); 531 | 532 | self.wait_for_buf(BufState::Appendable as u32, buf_id); 533 | } 534 | } 535 | 536 | /// Prevent buffers from writing 537 | pub fn seal_buffers(&self) { 538 | 539 | let mut sealed = [false; 2]; 540 | 541 | loop { 542 | 543 | for buf_id in 0..2 { 544 | 545 | if ! sealed[buf_id] { 546 | if self.bufs[buf_id].reserve_rest() { 547 | self.set_buf_readable(buf_id); 548 | } 549 | } 550 | } 551 | 552 | for buf_id in 0..2 { 553 | 554 | if ! sealed[buf_id] { 555 | 556 | let state = self.wait_for_buf(BufState::Terminated as u32 | BufState::Appendable as u32, buf_id); 557 | 558 | sealed[buf_id] = state == BufState::Terminated; 559 | } 560 | } 561 | 562 | if sealed[0] && sealed[1] { 563 | 564 | break; 565 | 566 | } else { 567 | 568 | std::thread::sleep(std::time::Duration::new(0,10_000_000)); 569 | } 570 | } 571 | } 572 | } 573 | 574 | 575 | impl Clone for DoubleBuf { 576 | fn clone(&self) -> Self { 577 | DoubleBuf { 578 | bufs: self.bufs.clone(), 579 | #[cfg(feature = "metrics")] 580 | metrics: self.metrics.clone(), 581 | buf_state: self.buf_state.clone(), 582 | size: self.size, 583 | } 584 | } 585 | } 586 | 587 | /// Buffer states for buffer tracking 588 | #[derive(Copy, Clone, PartialEq, Debug)] 589 | enum BufState { 590 | Appendable = 0b001, 591 | Readable = 0b010, 592 | Terminated = 0b100, 593 | } 594 | 595 | 596 | /// Wrapper for writable slice of [T]. 597 | pub struct Slice<'a, T> { 598 | slice: &'a mut [T], 599 | parent: &'a DoubleBuf, 600 | buf_id: usize, 601 | } 602 | 603 | 604 | impl<'a, T> Deref for Slice<'a, T> { 605 | 606 | type Target = [T]; 607 | 608 | fn deref(&self) -> &Self::Target { 609 | self.slice 610 | } 611 | } 612 | 613 | impl<'a, T> DerefMut for Slice<'a, T> { 614 | 615 | fn deref_mut(&mut self) -> &mut Self::Target { 616 | &mut self.slice 617 | } 618 | } 619 | 620 | impl<'a, T> Drop for Slice<'a, T> { 621 | 622 | fn drop(&mut self) { 623 | 624 | let buf = &self.parent.bufs[self.buf_id]; 625 | let total_done = buf.inc_done_size(self.slice.len()); 626 | if total_done == self.parent.size { 627 | self.parent.set_buf_readable(self.buf_id); 628 | } 629 | } 630 | } 631 | 632 | 633 | 634 | #[cfg(test)] 635 | mod tests { 636 | use super::*; 637 | use std; 638 | 639 | #[test] 640 | fn too_big_size() { 641 | if let Ok(_) = Buf::::new(std::isize::MAX as usize) { 642 | panic!("Buf::new takes size value more than expected"); 643 | } 644 | 645 | if let Ok(_) = Buf::::new((std::isize::MAX as usize) / std::mem::size_of::() + 1) { 646 | panic!("Buf::new takes size value more than expected"); 647 | } 648 | } 649 | 650 | #[test] 651 | fn zero_size() { 652 | if let Ok(_) = Buf::::new(0) { 653 | panic!("Buf::new takes zero size"); 654 | } 655 | 656 | struct Tst {} 657 | 658 | if let Ok(_) = Buf::::new(123) { 659 | panic!("Buf::new takes zero size"); 660 | } 661 | } 662 | } 663 | -------------------------------------------------------------------------------- /src/buf_mgr/buf_mgr.rs: -------------------------------------------------------------------------------- 1 | /// Data buffer management 2 | 3 | use crate::common::errors::Error; 4 | use crate::common::defs::BlockId; 5 | use crate::common::misc::alloc_buf; 6 | use crate::common::misc::dealloc_buf; 7 | use crate::common::intercom::RwLockLw; 8 | use std::collections::HashMap; 9 | use std::sync::RwLock; 10 | use std::sync::Mutex; 11 | use std::sync::Arc; 12 | use std::sync::atomic::Ordering; 13 | use std::sync::atomic::AtomicU64; 14 | 
use std::ops::Deref; 15 | use std::ops::DerefMut; 16 | use std::sync::RwLockWriteGuard; 17 | 18 | 19 | const DIRTY_BIT: u64 = 0x4000000000000000; // if set then block is dirty 20 | const PINLOCK_BIT: u64 = 0x8000000000000000; // if set then block can't be pinned 21 | const PIN_COUNTER_MASK: u64 = 0x0fffffffffffffff; // bits dedicated for counting pins. 22 | 23 | 24 | #[derive(Copy, Clone, Debug)] 25 | pub struct BlockDesc { 26 | pub id: usize, 27 | pub block_id: BlockId, 28 | pub dirty: bool, 29 | pub block_type: BlockType, 30 | pub checkpoint_block_id: BlockId, 31 | pub checkpoint_written: bool, 32 | } 33 | 34 | 35 | #[derive(Clone, Copy, PartialEq, Hash, Debug)] 36 | pub enum BlockType { 37 | NotUsed, 38 | DataBlock, 39 | VersionBlock, 40 | CheckpointBlock, 41 | } 42 | 43 | unsafe impl Send for BufMgr {} 44 | unsafe impl Sync for BufMgr {} 45 | 46 | 47 | /// Block memory area accessible as slice. 48 | pub struct BlockArea { 49 | data: *mut u8, 50 | block_size: usize, 51 | } 52 | 53 | impl BlockArea { 54 | 55 | pub fn new(data: *mut u8, block_size: usize) -> Self { 56 | BlockArea { 57 | data, 58 | block_size, 59 | } 60 | } 61 | 62 | pub fn data_ptr(&self) -> *mut u8 { 63 | self.data 64 | } 65 | 66 | pub fn size(&self) -> usize { 67 | self.block_size 68 | } 69 | } 70 | 71 | unsafe impl Send for BlockArea {} 72 | unsafe impl Sync for BlockArea {} 73 | 74 | 75 | impl Deref for BlockArea { 76 | 77 | type Target = [u8]; 78 | 79 | fn deref(&self) -> &[u8] { 80 | unsafe { 81 | std::slice::from_raw_parts(self.data as *const u8, self.block_size) 82 | } 83 | } 84 | } 85 | 86 | impl DerefMut for BlockArea { 87 | 88 | fn deref_mut(&mut self) -> &mut [u8] { 89 | unsafe { 90 | std::slice::from_raw_parts_mut(self.data, self.block_size) 91 | } 92 | } 93 | } 94 | 95 | impl Clone for BlockArea { 96 | 97 | fn clone(&self) -> Self { 98 | BlockArea { 99 | data: self.data, 100 | block_size: self.block_size, 101 | } 102 | } 103 | } 104 | 105 | 106 | 107 | /// Buffer of blocks is preallocated continuous region of memory divided by blocks. 108 | /// Buffer allows search by block id, eviction of unused blocks and allocation of new blocks in 109 | /// place of evicted according to eviction mechanism provided for buffer manager. 
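///
/// Memory layout note: `mem` below is a single contiguous allocation of
/// `block_num * block_size` bytes; block `i` occupies the byte range
/// `[i * block_size, (i + 1) * block_size)` and is exposed to callers as a
/// `BlockArea`. Each block also has a companion `AtomicU64` in `pins` that
/// packs the pin counter (`PIN_COUNTER_MASK` bits) together with the
/// `DIRTY_BIT` and `PINLOCK_BIT` flags declared at the top of this module.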
110 | pub struct BufMgr { 111 | mem: *mut u8, 112 | block_desc: Arc>>, 113 | block_map: Arc>>, 114 | eviction_mech: Arc>, 115 | pins: Arc>, 116 | block_size: usize, 117 | block_num: usize, 118 | } 119 | 120 | impl Clone for BufMgr { 121 | 122 | fn clone(&self) -> Self { 123 | BufMgr { 124 | mem: self.mem, 125 | block_desc: self.block_desc.clone(), 126 | block_map: self.block_map.clone(), 127 | eviction_mech: self.eviction_mech.clone(), 128 | pins: self.pins.clone(), 129 | block_size: self.block_size, 130 | block_num: self.block_num, 131 | } 132 | } 133 | } 134 | 135 | impl BufMgr 136 | where I: CacheItem<(usize, BlockId)>, 137 | { 138 | pub fn new(block_size: usize, block_num: usize) -> Result 139 | where E: EvictionMech<(usize, BlockId), I, Q>, Q: CacheItemIterator<(usize, BlockId), I> 140 | { 141 | let mem = alloc_buf(block_num*block_size)?; 142 | 143 | let mut block_desc = Vec::with_capacity(block_num); 144 | 145 | for i in 0..block_num { 146 | let bd = RwLockLw::new(BlockDesc { 147 | id: i, 148 | block_id: BlockId::new(), 149 | dirty: false, 150 | block_type: BlockType::NotUsed, 151 | checkpoint_block_id: BlockId::new(), 152 | checkpoint_written: true, 153 | }); 154 | block_desc.push(bd); 155 | }; 156 | 157 | let block_map = RwLock::new(HashMap::with_capacity(block_num * 2)); 158 | 159 | let mut eviction_mech = E::new((0, BlockId::new())); 160 | for i in 1..block_num { 161 | eviction_mech.add_item((i, BlockId::new())); 162 | } 163 | 164 | let mut pins = Vec::with_capacity(block_num); 165 | for _ in 0..block_num { 166 | pins.push(AtomicU64::new(0)); 167 | } 168 | 169 | Ok(BufMgr:: { 170 | mem, 171 | block_desc: Arc::new(block_desc), 172 | block_map: Arc::new(block_map), 173 | eviction_mech: Arc::new(Mutex::new(eviction_mech)), 174 | pins: Arc::new(pins), 175 | block_size, 176 | block_num, 177 | }) 178 | } 179 | 180 | 181 | /// return block for reading, return None if block is not in cache 182 | pub fn get_block(&self, block_id: &BlockId) -> Option<(Pinned, usize)> 183 | where E: EvictionMech<(usize, BlockId), I, Q>, Q: CacheItemIterator<(usize, BlockId), I> 184 | { 185 | // acquire read lock 186 | // get block from hash map 187 | // pin the block 188 | // update lru position for the block 189 | // unlock and return block 190 | 191 | let block_map = self.block_map.read().unwrap(); 192 | 193 | if let Some(item) = block_map.get(block_id) { 194 | let buf_idx = item.get_value().0; 195 | if let Some(pinned_block) = self.try_pin(buf_idx) { 196 | 197 | let mut em = self.eviction_mech.lock().unwrap(); 198 | em.on_access(item.clone()); 199 | 200 | return Some((pinned_block, buf_idx)); 201 | } 202 | } 203 | 204 | None 205 | } 206 | 207 | /// put block on cache (if it not yet there) 208 | pub fn allocate_on_cache(&self, block_id: &BlockId, block_type: BlockType) -> Option<(Pinned, usize)> 209 | where E: EvictionMech<(usize, BlockId), I, Q>, Q: CacheItemIterator<(usize, BlockId), I> 210 | { 211 | // acquire write lock 212 | // check if block is already there 213 | // if block is there then pin the block, unlock, and return block 214 | // otherwise, find a free block 215 | // add block to hash map 216 | // pin the block 217 | // update lru position 218 | // unlock and return 219 | 220 | let mut block_map = self.block_map.write().unwrap(); 221 | 222 | if let Some(mut item) = self.get_free_block(&mut block_map) { 223 | item.get_value_mut().1 = *block_id; 224 | Some(self.put_on_map(block_map, item, block_type)) 225 | } else { 226 | None 227 | } 228 | } 229 | 230 | /// get block by index 231 | pub fn 
get_block_by_idx(&self, idx: usize) -> Option> { 232 | // acquire read lock 233 | // pin the block 234 | // unlock and return block 235 | // (lru position is not updated since this function is used only by writers to disk) 236 | 237 | if idx < self.block_num { 238 | let lock = self.block_map.read().unwrap(); 239 | let ret = self.try_pin(idx); 240 | drop(lock); 241 | ret 242 | } else { 243 | None 244 | } 245 | } 246 | 247 | /// get block descriptor by index 248 | pub fn get_bdesc_by_idx(&self, idx: usize) -> Option { 249 | if idx < self.block_num { 250 | let mut bdesc = *(self.block_desc[idx].read_lock()); 251 | bdesc.dirty = 0 != (self.pins[idx].load(Ordering::Relaxed) & DIRTY_BIT); 252 | Some(bdesc) 253 | } else { 254 | None 255 | } 256 | } 257 | 258 | /// Mark the block in buffer if checkpoint block is written to disk or not. 259 | pub fn set_checkpoint_written(&self, idx: usize, state: bool) { 260 | let mut bdesc = self.block_desc[idx].write_lock(); 261 | (*bdesc).checkpoint_written = state; 262 | } 263 | 264 | /// Set checkpoint block id 265 | pub fn set_checkpoint_block_id(&self, idx: usize, block_id: BlockId) { 266 | let mut bdesc = self.block_desc[idx].write_lock(); 267 | (*bdesc).checkpoint_block_id = block_id; 268 | } 269 | 270 | /// Mark the block in buffer if it is dirty or not. 271 | pub fn set_dirty(&self, idx: usize, state: bool) { 272 | let mut cur = self.pins[idx].load(Ordering::Relaxed); 273 | loop { 274 | // check if set 275 | if (state && (cur & DIRTY_BIT != 0)) || 276 | (!state && (cur & DIRTY_BIT == 0)) { 277 | return; 278 | } 279 | 280 | // apply bit 281 | let new_val = if state { 282 | cur | DIRTY_BIT 283 | } else { 284 | cur & (!DIRTY_BIT) 285 | }; 286 | 287 | if let Err(cur2) = self.pins[idx].compare_exchange(cur, new_val, Ordering::Relaxed, Ordering::Relaxed) { 288 | cur = cur2; 289 | } else { 290 | return; 291 | } 292 | } 293 | } 294 | 295 | 296 | fn get_block_area(&self, idx: usize) -> BlockArea { 297 | BlockArea { 298 | data: unsafe { self.mem.offset(idx as isize * self.block_size as isize) }, 299 | block_size: self.block_size, 300 | } 301 | } 302 | 303 | fn put_on_map(&self, mut block_map: RwLockWriteGuard>, val: I, block_type: BlockType) -> (Pinned, usize) 304 | where E: EvictionMech<(usize, BlockId), I, Q>, Q: CacheItemIterator<(usize, BlockId), I> 305 | { 306 | let block_id = val.get_value().1; 307 | 308 | if let Some(item) = block_map.get_mut(&block_id) { 309 | *item = val.clone(); 310 | } else { 311 | block_map.insert(block_id, val.clone()); 312 | }; 313 | 314 | let id = val.get_value().0; 315 | 316 | self.pinunlock_pin(id); 317 | 318 | let mut bdesc = self.block_desc[id].write_lock(); 319 | bdesc.block_type = block_type; 320 | bdesc.block_id = block_id; 321 | bdesc.checkpoint_block_id = BlockId::new(); 322 | bdesc.checkpoint_written = true; 323 | drop(bdesc); 324 | 325 | (Pinned::new(self.get_block_area(id), &self.pins[id]), id) 326 | } 327 | 328 | fn try_remove_from_map(&self, buf_idx: usize, block_id: &BlockId, block_map: &mut HashMap) -> bool { 329 | if self.try_pinlock(buf_idx) { 330 | block_map.remove(block_id); 331 | return true; 332 | } 333 | false 334 | } 335 | 336 | fn get_free_block(&self, block_map: &mut HashMap) -> Option 337 | where E: EvictionMech<(usize, BlockId), I, Q>, Q: CacheItemIterator<(usize, BlockId), I> 338 | { 339 | // lock linked list. 340 | // go throgh linked list of lru starting from the head. 
341 | // for each block of ll: 342 | // skip block if it is dirty 343 | // check if block is pinned, and if not try to prevent it from pinning atomically. 344 | // after block is marked as not pinnable remove it from hash map. 345 | // move correponding node to the tail of the lined list. 346 | // unlock linked list. 347 | // return index of the free block. 348 | let mut em = (&self).eviction_mech.lock().unwrap(); 349 | 350 | let mut iter: Q = em.iter(); 351 | while let Some(item) = iter.next() { 352 | let (buf_idx, block_id) = item.get_value(); 353 | 354 | if (&self).try_remove_from_map(*buf_idx, &block_id, block_map) { 355 | em.on_access(item.clone()); 356 | return Some(item); 357 | } 358 | } 359 | drop(em); 360 | 361 | None 362 | } 363 | 364 | // try pinning a block. 365 | fn try_pin(&self, buf_idx: usize) -> Option> { 366 | let mut cur = self.pins[buf_idx].load(Ordering::Relaxed); 367 | loop { 368 | if cur & PINLOCK_BIT != 0 { 369 | return None; 370 | } 371 | 372 | if let Err(cur2) = self.pins[buf_idx].compare_exchange(cur, cur+1, Ordering::Relaxed, Ordering::Relaxed) { 373 | cur = cur2; 374 | } else { 375 | return Some(Pinned::new(self.get_block_area(buf_idx), &self.pins[buf_idx])); 376 | } 377 | } 378 | } 379 | 380 | // try to mark block as not avaialable for pinning. 381 | fn try_pinlock(&self, buf_idx: usize) -> bool { 382 | let cur = self.pins[buf_idx].load(Ordering::Relaxed); 383 | if cur & (PINLOCK_BIT | PIN_COUNTER_MASK) != 0 { 384 | return false; 385 | } else { 386 | self.pins[buf_idx].compare_exchange(cur, cur | PINLOCK_BIT, Ordering::Relaxed, Ordering::Relaxed).is_ok() 387 | } 388 | } 389 | 390 | // mark a block as avilable for pinning, and pin once. 391 | fn pinunlock_pin(&self, buf_idx: usize) { 392 | self.pins[buf_idx].store(1, Ordering::Relaxed); 393 | } 394 | } 395 | 396 | 397 | impl Drop for BufMgr { 398 | 399 | fn drop(&mut self) { 400 | let BufMgr { 401 | mem, 402 | block_desc, 403 | block_map: _, 404 | eviction_mech: _, 405 | pins: _, 406 | block_size, 407 | block_num, 408 | } = self; 409 | 410 | if Arc::strong_count(&block_desc) == 1 { 411 | // we are the last instance, deallocate memory 412 | dealloc_buf(*mem, (*block_size) * (*block_num)); 413 | } 414 | } 415 | } 416 | 417 | 418 | 419 | /// A block pinned in buffer. 420 | /// Pin counter decreases by 1 when struct instance is dropped. 421 | pub struct Pinned<'a, T> { 422 | value: T, 423 | pin: &'a AtomicU64, 424 | } 425 | 426 | impl<'a, T> Pinned<'a, T> { 427 | pub fn new(value: T, pin: &'a AtomicU64) -> Self { 428 | Pinned::<'a, T> { 429 | value, 430 | pin, 431 | } 432 | } 433 | } 434 | 435 | impl<'a, T> Drop for Pinned<'a, T> { 436 | fn drop(&mut self) { 437 | self.pin.fetch_sub(1, Ordering::Relaxed); 438 | } 439 | } 440 | 441 | 442 | impl Deref for Pinned<'_, T> { 443 | type Target = T; 444 | fn deref(&self) -> &Self::Target { 445 | &self.value 446 | } 447 | } 448 | 449 | impl DerefMut for Pinned<'_, T> { 450 | fn deref_mut(&mut self) -> &mut Self::Target { 451 | &mut self.value 452 | } 453 | } 454 | 455 | 456 | // Eviction mechanism specification. 457 | // Client first adds items to the eviction mech, then uses on_access function to refresh item 458 | // state when item is accessed. When client needs item to be evicted and reused it uses iterator over 459 | // items and chooses suitable item. After item is chosen it can be modified and on_access can be 460 | // called once more to reflect new state of the item. 
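//
// For illustration only (hypothetical helper, not part of the crate API): a generic
// client of the specification above. It walks the candidates in eviction order,
// returns the first item the caller's predicate accepts, and refreshes that item's
// priority via `on_access`, since its slot is about to be reused. The generic arity
// of the traits declared below (`EvictionMech<T, I, Q>`, `CacheItem<T>`,
// `CacheItemIterator<T, I>`) is assumed from the bounds used in `impl BufMgr` above.
#[allow(dead_code)]
fn pick_eviction_candidate<T, I, Q, E, F>(em: &mut E, mut suitable: F) -> Option<I>
where
    I: CacheItem<T>,
    Q: CacheItemIterator<T, I>,
    E: EvictionMech<T, I, Q>,
    F: FnMut(&I) -> bool,
{
    // Scan candidates in the order the eviction mechanism proposes them.
    let mut iter = em.iter();
    while let Some(item) = iter.next() {
        if suitable(&item) {
            // The chosen slot will be reused, so record the access before returning it.
            em.on_access(item.clone());
            return Some(item);
        }
    }
    None
}
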
461 | pub trait EvictionMech 462 | where I: CacheItem, Q: CacheItemIterator 463 | { 464 | // create an instance. 465 | fn new(value: T) -> Self; 466 | 467 | // register an additional item in cache. 468 | fn add_item(&mut self, value: T); 469 | 470 | // updates eviction priority of the item. 471 | fn on_access(&mut self, item: I); 472 | 473 | // iterator of items for potential eviction. 474 | fn iter(&self) -> Q; 475 | } 476 | 477 | pub trait CacheItem { 478 | 479 | // return associated value. 480 | fn get_value(&self) -> &T; 481 | 482 | // return associated value. 483 | fn get_value_mut(&mut self) -> &mut T; 484 | 485 | // clone item reference. 486 | fn clone(&self) -> Self; 487 | } 488 | 489 | pub trait CacheItemIterator> { 490 | 491 | // return next item or None if no items left. 492 | fn next(&mut self) -> Option; 493 | } 494 | 495 | #[cfg(test)] 496 | mod tests { 497 | 498 | use super::*; 499 | use crate::buf_mgr::lru::LruList; 500 | use crate::buf_mgr::lru::LruNodeRef; 501 | 502 | #[test] 503 | fn test() { 504 | assert_eq!(1, 1); 505 | } 506 | 507 | #[test] 508 | fn test_buf_mgr() { 509 | let block_size = 8192; 510 | let block_num = 100; 511 | let bm = BufMgr::, LruList<(usize, BlockId)>>::new(block_size, block_num).expect("Failed to create BufMgr"); 512 | 513 | let mut block_id = BlockId { 514 | file_id: 0, 515 | extent_id: 0, 516 | block_id: 0, 517 | }; 518 | assert!(bm.get_block(&block_id).is_none()); 519 | 520 | let mut block_buf = [0u8; 8192]; 521 | block_buf[0] = 1; 522 | 523 | let (mut block, _buf_idx) = bm.allocate_on_cache(&block_id, BlockType::DataBlock).expect("Failed to allocate block"); 524 | block.copy_from_slice(&block_buf); 525 | assert!(bm.get_block(&block_id).is_some()); 526 | drop(block); 527 | 528 | for _i in 0..block_num { 529 | block_id.block_id += 1; 530 | block_buf[0] += 1; 531 | let (mut block, _buf_idx) = bm.allocate_on_cache(&block_id, BlockType::DataBlock).expect("Failed to allocate block"); 532 | block.copy_from_slice(&block_buf); 533 | drop(block); 534 | } 535 | block_id.block_id = 0; 536 | assert!(bm.get_block(&block_id).is_none()); 537 | 538 | for i in 0..block_num { 539 | block_id.block_id += 1; 540 | let (block, _buf_idx) = bm.get_block(&block_id).expect("Block was not found"); 541 | assert_eq!(i + 2, (&block)[0] as usize); 542 | drop(block); 543 | } 544 | 545 | block_id.block_id = 10; 546 | let (block, _buf_idx) = bm.get_block(&block_id).expect("Block was not found"); 547 | drop(block); 548 | 549 | block_id.block_id = 1000; 550 | for _i in 1..block_num { 551 | block_id.block_id += 1; 552 | let (mut block, _buf_idx) = bm.allocate_on_cache(&block_id, BlockType::DataBlock).expect("Failed to allocate block"); 553 | block.copy_from_slice(&block_buf); 554 | assert!(bm.get_block(&block_id).is_some()); 555 | drop(block); 556 | } 557 | 558 | block_id.block_id = 0; 559 | for _i in 0..block_num { 560 | block_id.block_id += 1; 561 | if block_id.block_id == 10 { 562 | bm.get_block(&block_id).expect("Block was not found"); 563 | } else { 564 | assert!(bm.get_block(&block_id).is_none()); 565 | } 566 | } 567 | 568 | for i in 0..block_num { 569 | assert!(bm.get_block_by_idx(i).is_some()); 570 | } 571 | 572 | let bdesc = bm.get_bdesc_by_idx(0).expect("No block description found"); 573 | assert!(!bdesc.dirty); 574 | assert!(bdesc.checkpoint_written); 575 | bm.set_dirty(1, true); 576 | let bdesc = bm.get_bdesc_by_idx(0).expect("No block description found"); 577 | assert!(!bdesc.dirty); 578 | assert!(bdesc.checkpoint_written); 579 | bm.set_dirty(0, true); 580 | let bdesc 
= bm.get_bdesc_by_idx(0).expect("No block description found"); 581 | assert!(bdesc.dirty); 582 | assert!(bdesc.checkpoint_written); 583 | 584 | bm.set_checkpoint_written(0, false); 585 | let bdesc = bm.get_bdesc_by_idx(0).expect("No block description found"); 586 | assert!(!bdesc.checkpoint_written); 587 | 588 | bm.set_dirty(0, false); 589 | let bdesc = bm.get_bdesc_by_idx(0).expect("No block description found"); 590 | assert!(!bdesc.dirty); 591 | 592 | let chkbid = BlockId::init(1,2,3); 593 | bm.set_checkpoint_block_id(0, chkbid); 594 | let bdesc = bm.get_bdesc_by_idx(0).expect("No block description found"); 595 | assert_eq!(bdesc.checkpoint_block_id, chkbid); 596 | } 597 | } 598 | 599 | 600 | --------------------------------------------------------------------------------
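
For reference, below is a minimal self-contained sketch of the pin-word technique that `BufMgr::try_pin`, `try_pinlock`, and `set_dirty` rely on: a single `AtomicU64` packs a pin counter together with a dirty flag and a pin-lock flag, and every transition is a compare-exchange loop, so readers pinning blocks and the evictor locking them out never take a mutex. The constant values mirror the ones at the top of `buf_mgr.rs`; the `PinWord` type and its methods are illustrative only and not part of the crate.

use std::sync::atomic::{AtomicU64, Ordering};

// Bit layout mirroring buf_mgr.rs: the low bits count pins, the two high bits are flags.
const DIRTY_BIT: u64 = 0x4000000000000000;
const PINLOCK_BIT: u64 = 0x8000000000000000;
const PIN_COUNTER_MASK: u64 = 0x0fffffffffffffff;

// Illustrative wrapper (not part of the crate) around one pin word.
struct PinWord(AtomicU64);

impl PinWord {
    fn new() -> Self { PinWord(AtomicU64::new(0)) }

    // Try to take a pin; fails if the block is locked against pinning.
    fn try_pin(&self) -> bool {
        let mut cur = self.0.load(Ordering::Relaxed);
        loop {
            if cur & PINLOCK_BIT != 0 {
                return false;
            }
            match self.0.compare_exchange(cur, cur + 1, Ordering::Relaxed, Ordering::Relaxed) {
                Ok(_) => return true,
                Err(actual) => cur = actual,
            }
        }
    }

    // Drop one pin (what dropping a Pinned guard does).
    fn unpin(&self) {
        self.0.fetch_sub(1, Ordering::Relaxed);
    }

    // Lock the block against pinning, but only if nobody currently holds a pin.
    fn try_pinlock(&self) -> bool {
        let cur = self.0.load(Ordering::Relaxed);
        if cur & (PINLOCK_BIT | PIN_COUNTER_MASK) != 0 {
            false
        } else {
            self.0.compare_exchange(cur, cur | PINLOCK_BIT, Ordering::Relaxed, Ordering::Relaxed).is_ok()
        }
    }

    // Set or clear the dirty flag without disturbing the pin counter.
    fn set_dirty(&self, state: bool) {
        let mut cur = self.0.load(Ordering::Relaxed);
        loop {
            let new_val = if state { cur | DIRTY_BIT } else { cur & !DIRTY_BIT };
            if new_val == cur {
                return;
            }
            match self.0.compare_exchange(cur, new_val, Ordering::Relaxed, Ordering::Relaxed) {
                Ok(_) => return,
                Err(actual) => cur = actual,
            }
        }
    }
}

fn main() {
    let w = PinWord::new();
    assert!(w.try_pin());       // one reader pins the block
    assert!(!w.try_pinlock());  // eviction is refused while the block is pinned
    w.set_dirty(true);
    w.unpin();
    w.set_dirty(false);
    assert!(w.try_pinlock());   // now the block can be reclaimed
    assert!(!w.try_pin());      // and can no longer be pinned
    println!("pin-word sketch ok");
}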