├── .github └── workflows │ └── build.yml ├── .gitignore ├── Cargo.toml ├── LICENSE ├── README.md ├── docs ├── api-specification.md ├── figs │ └── io-arch.png ├── members.json └── proposal.md ├── rust-toolchain ├── scripts ├── decode_parquet.py ├── generate_parquet.py └── generate_traces.py ├── storage-client ├── Cargo.toml ├── LICENSE ├── README.md └── src │ ├── bin │ └── driver.rs │ ├── client.rs │ └── lib.rs ├── storage-node ├── Cargo.toml ├── src │ ├── bin │ │ └── storage_node.rs │ ├── cache │ │ ├── data_store_cache │ │ │ ├── memdisk │ │ │ │ ├── data_store │ │ │ │ │ ├── disk.rs │ │ │ │ │ ├── memory.rs │ │ │ │ │ └── mod.rs │ │ │ │ └── mod.rs │ │ │ ├── mod.rs │ │ │ └── sqlite │ │ │ │ ├── blob.rs │ │ │ │ └── mod.rs │ │ ├── mod.rs │ │ └── replacer │ │ │ ├── lru.rs │ │ │ ├── lru_k.rs │ │ │ └── mod.rs │ ├── common │ │ ├── config.rs │ │ ├── hash.rs │ │ └── mod.rs │ ├── disk │ │ ├── disk_manager.rs │ │ ├── disk_manager_sync.rs │ │ ├── mod.rs │ │ └── stream.rs │ ├── error.rs │ ├── lib.rs │ ├── server.rs │ ├── storage_manager.rs │ └── storage_reader │ │ ├── mod.rs │ │ ├── s3.rs │ │ └── s3_diskmock.rs └── tests │ ├── parquet │ ├── small_random_data.parquet │ ├── userdata1.parquet │ └── userdata2.parquet │ └── text │ └── what-can-i-hold-you-with └── tests ├── Cargo.toml └── src ├── client_server_test.rs └── lib.rs /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | name: Build 2 | 3 | on: 4 | push: 5 | branches: [ "main" ] 6 | pull_request: 7 | branches: [ "main" ] 8 | workflow_dispatch: 9 | 10 | env: 11 | CARGO_TERM_COLOR: always 12 | RUSTFLAGS: "-Dwarnings" 13 | 14 | jobs: 15 | build: 16 | 17 | runs-on: self-hosted 18 | continue-on-error: false 19 | 20 | steps: 21 | - uses: actions/checkout@v3 22 | - name: Install Toolchain 23 | run: | 24 | rustup update stable 25 | rustup default stable 26 | rustup component add rustfmt 27 | rustup component add clippy 28 | - name: Install cargo-llvm-cov 29 | uses: taiki-e/install-action@cargo-llvm-cov 30 | - name: Install Sqlite 31 | run: | 32 | apt update 33 | apt install libsqlite3-dev 34 | - name: Format check 35 | run: cargo fmt --all -- --check 36 | - name: Run Clippy 37 | run: cargo clippy --all-targets --all-features 38 | - name: Compile check 39 | run: cargo check --all-targets --all-features 40 | - name: Run tests and Generate code coverage 41 | env: 42 | AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} 43 | AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} 44 | run: cargo llvm-cov --all-features --workspace --codecov --output-path lcov.info 45 | - name: Archive code coverage results 46 | uses: actions/upload-artifact@v4 47 | with: 48 | name: code-coverage-report 49 | path: lcov.info 50 | retention-days: 3 51 | - name: Upload to codecov 52 | uses: codecov/codecov-action@v3 53 | with: 54 | token: be8874e2-10d6-434f-9d52-db6094de31d6 55 | files: lcov.info 56 | name: codecov-umbrella # optional 57 | fail_ci_if_error: true 58 | verbose: true 59 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Generated by Cargo 2 | # will have compiled files and executables 3 | debug/ 4 | target/ 5 | 6 | # Remove Cargo.lock from gitignore if creating an executable, leave it for libraries 7 | # More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html 8 | Cargo.lock 9 | 10 | # These are backup files generated by rustfmt 11 | **/*.rs.bk 
12 | 13 | # MSVC Windows builds of rustc generate these, which store debugging information 14 | *.pdb 15 | 16 | .vscode/ 17 | 18 | **/.DS_Store 19 | 20 | data/ 21 | 22 | **/*.pem 23 | 24 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [workspace] 2 | members = ["storage-node", "storage-client", "tests"] 3 | 4 | resolver = "2" 5 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 CMU Database Group 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 15721-s24-cache1 2 | 15-721 Spring 2024 - Cache #1 3 | -------------------------------------------------------------------------------- /docs/api-specification.md: -------------------------------------------------------------------------------- 1 | # I/O Service API Specification 2 | 3 | ## Overview 4 | 5 | > What commands will the API expose. 6 | 7 | The I/O service will provide the execution engine with a client library, to which they can issue requests for data. We allow the execution engine to query data on different granularities, including table, column, and tuple. We will provide both synchronous and asynchronous methods for the execution engine to get storage data. 8 | 9 | See [this PR](https://github.com/cmu-db/15721-s24-cache1/pull/2) for more details. 10 | 11 | ## Encoding 12 | 13 | > What encoding scheme will the API use for inputs / outputs 14 | 15 | The I/O service will encode the data as [Arrow's `RecordBatch` type](https://docs.rs/arrow/latest/arrow/record_batch/struct.RecordBatch.html) when we transfer the storage data to the execution engine. 16 | 17 | ## Error Handling 18 | 19 | > What errors can the service encounter and how will API handle them (e.g., status codes). 20 | 21 | On error, the I/O service will return `anyhow::Error` to the execution engine with a customized message, which simply denotes that the I/O service is not able to retrieve data from the underlying storage. The execution engine should forward the error to the upper layer. 
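To make the calling convention concrete, here is a minimal, non-normative sketch modeled on the `StorageClientImpl` and the `driver.rs` example elsewhere in this repository. The endpoints and the table id are placeholders, and the snippet assumes `tokio`, `anyhow`, `istziio-client`, and `parpulse-client` as dependencies:

```rust
use istziio_client::client_api::{DataRequest, StorageClient, StorageRequest};
use parpulse_client::client::StorageClientImpl;

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    // Placeholder endpoints: storage server first, catalog server second.
    let client = StorageClientImpl::new("http://127.0.0.1:3030", "http://127.0.0.1:3031")?;
    // Ask for a whole table; the client maps the table id to an S3 bucket and keys.
    let mut receiver = client
        .request_data(StorageRequest::new(0, DataRequest::Table(0)))
        .await?;
    // Decoded Arrow `RecordBatch`es arrive asynchronously over a channel.
    while let Some(batch) = receiver.recv().await {
        println!("received a batch with {} columns", batch.num_columns());
    }
    // Any `anyhow::Error` from the client simply propagates upward via `?`.
    Ok(())
}
```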
22 | 23 | See the [discussion here](https://github.com/cmu-db/15721-s24-cache1/pull/2#issuecomment-1942780360) for more details. -------------------------------------------------------------------------------- /docs/figs/io-arch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cmu-db/15721-s24-cache1/b4e2bc8f2c2fc3ab7a9b9fa3f8e864e25e9c8c40/docs/figs/io-arch.png -------------------------------------------------------------------------------- /docs/members.json: -------------------------------------------------------------------------------- 1 | { 2 | "info": { 3 | "title": "Parpulse: I/O Service for Modern OLAP Database System", 4 | "github": "https://github.com/cmu-db/15721-s24-cache1", 5 | "description": "The goal of this project is to develop an I/O service for an Online Analytical Processing (OLAP) database system. This service will facilitate communication between the execution engine and remote storage solutions such as Amazon S3. Additionally, a local cache will be incorporated to store recently accessed data on the local disk, thereby accelerating future data retrievals.\nThe I/O service is designed to manage requests from the execution engine and fetch pertinent data (e.g., Parquet files) from either the local cache or remote storage. It will process the data and return a stream of the decoded data to the execution engine.\nThe initial phase of this project aims to construct a fully functional I/O service following the specifications outlined above. Further enhancements, such as kernel bypass and integration of io_uring, may be considered in the future.", 6 | "students": [ 7 | { 8 | "name": "Yuanxin Cao", 9 | "url": "https://github.com/xx01cyx" 10 | }, 11 | { 12 | "name": "Kunle Li", 13 | "url": "https://github.com/unw9527" 14 | }, 15 | { 16 | "name": "Lan Lou", 17 | "url": "https://github.com/lanlou1554" 18 | } 19 | ] 20 | } 21 | } -------------------------------------------------------------------------------- /docs/proposal.md: -------------------------------------------------------------------------------- 1 | # I/O Service Project Proposal 2 | 3 | * Yuanxin Cao (yuanxinc) 4 | * Lan Lou (lanlou) 5 | * Kunle Li (kunlel) 6 | 7 | ## Overview 8 | 9 | > What is the goal of this project? What will this component achieve? 10 | 11 | The objective of this project is to develop an Input/Output (I/O) service for an Online Analytical Processing (OLAP) database system. This service will facilitate communication between the execution engine and remote storage solutions such as Amazon S3. Additionally, a local cache will be incorporated to store recently accessed data on the local disk, thereby accelerating future data retrievals. 12 | 13 | The I/O service is designed to manage requests from the execution engine, fetching pertinent data (e.g., Parquet files) from either the local cache or remote storage. It will process the data and return a stream of decoded information as a record batch to the execution engine. 14 | 15 | The initial phase aims to construct a fully functional I/O service following the specifications outlined above. Further enhancements, such as kernel bypass and integration of io_uring, may be considered based on project timeline and requirements. 16 | 17 | 18 | ## Architectural Design 19 | 20 | > Explain the input and output of the component, describe interactions and breakdown the smaller components if any. Include diagrams if appropriate. 
21 | 22 | The I/O service receives input in the form of requested columns (i.e. logical location) from the execution engine and produces an output stream (e.g. [`tokio::Stream`](https://docs.rs/tokio/latest/tokio/stream/index.html)) of Apache Arrow [`RecordBatch`](https://docs.rs/arrow-array/50.0.0/arrow_array/struct.RecordBatch.html). 23 | 24 | ![](./figs/io-arch.png) 25 | 26 | 27 | Our design comprises several key components: 28 | 29 | - Storage Client 30 | - Storage Node 31 | - Storage Manager 32 | - DataStore Cache 33 | - Replacer (LRU, LRU-K) 34 | - DataStore 35 | - MemDiskStore --> File system 36 | - SqliteStore --> SQLite 37 | - Storage Reader 38 | - S3 Reader (Read from S3) 39 | - Mock S3 Reader (Read from file system) 40 | 41 | The Storage Client resides in the compute node, where it establishes connections with the executors from the execution engine. The Storage Manager orchestrates requests from the compute node and then directs them to either the cache or the Storage Reader. The cache works by recording the access timestamp and making evictions of the cached elements, and we plan to use embedded databases such as RocksDB or Redis as our cache. For the cache policy, we plan to incorporate common policies such as LRU-K. The Storage Reader includes several APIs for reading from different storage systems such as Amazon S3 and the local file system. 42 | 43 | The workflow of the I/O service is as follows. Initially, the execution engine invokes the API exposed by the I/O service. The Storage Client will then contact the catalog to retrieve the corresponding physical location based on the logical columns (update: After discussing with the other I/O service team, we used a hashmap to mimic the behavior of catalog for sake of time). Next, the Storage Client transmits the requests via HTTP to the Storage Node. The Storage Manager then verifies whether the data is already present on the local disk by consulting the cache. 44 | 45 | We design two levels of cache. One sits in the memory for small files fast retrieval, and the other uses disk as storage for caching large files. The latter includes a mapping where the key represents the file's physical location in S3, and the value denotes the physical location on the local disk. If the data is found, it is directly returned to the Storage Client. Otherwise, the Storage Reader reads the data from the underlying storage and updates the cache. Finally, the Parquet file is decoded in the Storage Client, and the resulting record batch stream is returned to the execution engine. 46 | 47 | 48 | ## Design Rationale 49 | 50 | > Explain the goals of this design and how the design achieves these goals. Present alternatives considered and document why they are not chosen. 51 | 52 | The design goal of the I/O service is to provide the execution engine with a simple interface to interact with the storage while achieving high performance. The storage client resides in the compute node, which makes it possible to let the execution engine get storage data just by a function call, instead of sending a request over the network. This leaves request processing to the I/O service itself and thus makes the conveying of data or error more straightforward. Moreover, having a storage client residing on the compute node promises more possibilities, including providing a `write_data` interface for the execution engine to store its own persistent states (if there would be any) in the future. 
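Concretely, the traffic between the Storage Client and the Storage Node is a plain HTTP GET on a `/file` route, with the physical location resolved from the catalog carried in the query string. The shape below mirrors the storage-client implementation later in this repository; the host is a placeholder and the bucket/key values are only examples. Multiple keys are joined with commas in the `keys` parameter.

```
GET http://<storage-node-host>:3030/file?bucket=parpulse-test&keys=1m/random_data_1m_1.parquet
```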
53 | 54 | We use HTTP rather than TCP for the interaction between the storage client and the storage node because HTTP provides richer application-level semantics, such as transmitting data at the granularity of Parquet row groups, without our having to frame a raw TCP byte stream ourselves. We opt out of gRPC because the storage client already communicates with the catalog via HTTP, making it simpler to use HTTP for all communication and data transmission within the storage client. 55 | 56 | The storage node, on the other hand, is designed for high performance while remaining easy to extend. We adopt the LRU cache algorithm because it is one of the most widely used cache strategies in industry: it maintains a good hit rate in real-world scenarios while requiring only moderate computation. Besides LRU, we also plan to adopt more cache algorithms to compare the performance of different strategies. 57 | 58 | In addition to disk-based caching, we intend to incorporate SQLite as our primary cache storage solution. We choose SQLite because it works out of the box and is stable to use. 59 | 60 | The storage reader is designed to retrieve data from different storage services. Currently, we plan to support the local file system and Amazon S3, but we can easily add more storage services in the future via the abstraction of the storage reader. 61 | 62 | ## Testing Plan 63 | 64 | > How should the component be tested? 65 | 66 | 1. Correctness Tests 67 | 68 | 1. Unit Tests 69 | 70 | 1. Cache Algorithm Test: focusing on the correctness of cache algorithms, like LRU. 71 | 2. Storage Reader Test: focusing on the correctness of getting data from the underlying storage. 72 | 3. Storage Manager Test: focusing on the correctness of coordinating the cache and the storage reader. 73 | 4. Storage Client Test: focusing on the correctness of getting physical location information from the catalog and forwarding the request to the I/O server. 74 | 75 | (Note: all the above tests should also focus on error handling.) 76 | 77 | 2. Integration Tests 78 | 79 | The integration tests will use the public API of the I/O service. We will call the API of the storage client the way the execution engine does. We will test different request types (table, column, etc.), different storage types (file system, S3), and different request data volumes. We will focus on the availability of the data and the correctness of the contents. The I/O service should also report errors appropriately when an exception occurs. 80 | 81 | 2. Performance Tests (Benchmark) 82 | 83 | We write a Python script to generate random Parquet files of certain sizes for benchmarking. Since the data type of the Parquet files does not affect performance, we generate floating-point numbers for each file. 84 | 85 | The dataset we create is two sets of 10 Parquet files, with one set containing 1 MB files and the other containing 100 MB files. We adopt a Zipfian distribution for the access pattern. 86 | 87 | For benchmarking, we measure the elapsed time of each phase, from the moment the Storage Client receives the request to the moment it returns the data. The machine we use is an AWS EC2 `c5.xlarge` instance, with 4 vCPUs, 8 GB of memory, and 32 GB of disk. We set up one instance for the Storage Client and one for the Storage Server.
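For reference, the access pattern replayed in the benchmark is a small CSV trace produced by `scripts/generate_traces.py` (included later in this repository): each row holds a millisecond timestamp and the index of the file to fetch, with indices drawn from the Zipfian distribution. The values below are illustrative only:

```
timestamp,file_index
0,10
233,10
457,12
601,15
```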
88 | 89 | The way we trigger the benchmarking is through GitHub Actions, which enables benchmarking to be triggered automatically on certain PR or push without manual operations. 90 | 91 | ## Trade-offs and Potential Problems 92 | 93 | > Write down any conscious trade-off you made that can be problematic in the future, or any problems discovered during the design process that remain unaddressed (technical debts). 94 | 95 | The whole design is based on the fact that the database is a static one (i.e. no data manipulation) and we only have read requests on the storage. This assumption makes everything easier, since there will be few concurrency issues for a read-only database. However, if we are going to enable updates, then we should correctly handle the read-write and write-write conflicts, which requires a more complicated design than the current one. 96 | 97 | Moreover, even if all data is ETLed into the database system (this is our assumption), there can still be updates if the user replaces some of the underlying Parquet files. In this case, we might need another service to perform data discovery on the storage to deal with these situations. Also, we have to ensure the consistency of caches in different compute nodes (if we are going to build the cache) and ensure that the data we read is not stale. 98 | 99 | 100 | 102 | 103 | -------------------------------------------------------------------------------- /rust-toolchain: -------------------------------------------------------------------------------- 1 | stable 2 | -------------------------------------------------------------------------------- /scripts/decode_parquet.py: -------------------------------------------------------------------------------- 1 | """ 2 | This file is used to decode a parquet file and display the first n rows of the 3 | file, mainly for visualization purposes. 4 | 5 | Usage: 6 | python scripts/decode_parquet.py --n 7 | 8 | """ 9 | 10 | from IPython.display import display 11 | import pandas as pd 12 | import argparse 13 | 14 | 15 | def decode_parquet(file_path, n=5): 16 | df = pd.read_parquet(file_path) 17 | print('---------- Statistics ----------') 18 | print(df.describe(), '\n') 19 | print(f'---------- First {n} rows of the parquet file ----------') 20 | display(df.head(n), '\n') 21 | 22 | 23 | if __name__ == "__main__": 24 | parser = argparse.ArgumentParser(description='Decode parquet file') 25 | parser.add_argument('file_path', type=str, help='path to parquet file') 26 | parser.add_argument( 27 | '--n', 28 | type=int, 29 | default=5, 30 | help='number of rows to display', 31 | required=False) 32 | args = parser.parse_args() 33 | decode_parquet(args.file_path, args.n) 34 | -------------------------------------------------------------------------------- /scripts/generate_parquet.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script generates random parquet files given the number of rows, columns, 3 | and files. 
4 | 5 | Usage: 6 | python scripts/generate_parquet.py -d data -r 5500 -c 20 -n 5 7 | """ 8 | import pandas as pd 9 | import numpy as np 10 | import pyarrow as pa 11 | import pyarrow.parquet as pq 12 | import os 13 | import argparse 14 | from tqdm import tqdm 15 | 16 | 17 | def generate_random_parquet_files(output_dir, num_rows, num_cols, num_files): 18 | for i in tqdm(range(num_files)): 19 | data = pd.DataFrame(np.random.rand(num_rows, num_cols)) 20 | data.columns = [f'col {i + 1}' for i in range(num_cols)] 21 | table = pa.Table.from_pandas(data) 22 | file_name = os.path.join(output_dir, f"random_data_{i}.parquet") 23 | pq.write_table(table, file_name) 24 | 25 | 26 | if __name__ == "__main__": 27 | parser = argparse.ArgumentParser( 28 | description='Generate random parquet files') 29 | parser.add_argument( 30 | '-d', 31 | '--dir', 32 | type=str, 33 | help='output directory for parquet files') 34 | parser.add_argument( 35 | '-r', 36 | '--row', 37 | type=int, 38 | help='number of rows in each file', 39 | default=5500, # Roughly 1MB 40 | required=False) 41 | parser.add_argument( 42 | '-c', 43 | '--col', 44 | type=int, 45 | help='number of columns in each file', 46 | default=20, 47 | required=False) 48 | parser.add_argument( 49 | '-n', 50 | type=int, 51 | help='number of files to generate', 52 | default=5, 53 | required=False) 54 | args = parser.parse_args() 55 | 56 | generate_random_parquet_files(args.dir, args.row, args.col, args.n) 57 | -------------------------------------------------------------------------------- /scripts/generate_traces.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script generates access pattern for a set of files with a Zipfian distribution. 3 | 4 | Usage: 5 | python scripts/generate_traces.py --num_files 10 --skew_param 1.2 --num_accesses 100 -o 6 | """ 7 | 8 | import numpy as np 9 | import csv 10 | import argparse 11 | import random 12 | 13 | mp = {1: [1, 2, 3, 4, 5, 6, 7, 8, 9], 100: [ 14 | 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]} 15 | 16 | 17 | def generate_access_counts(num_files, s, num_accesses, file_size): 18 | access_counts = [] 19 | # Generate Zipfian distribution probabilities 20 | probabilities = 1 / np.arange(1, num_files + 1) ** s 21 | # Normalize 22 | probabilities /= np.sum(probabilities) 23 | 24 | # Simulate file accesses 25 | for _ in range(num_accesses): 26 | file_index = np.random.choice(mp[file_size], p=probabilities) 27 | access_counts.append(file_index) 28 | 29 | return access_counts 30 | 31 | 32 | def write_to_csv(access_counts, output_file): 33 | with open(output_file, "w") as f: 34 | writer = csv.writer(f) 35 | writer.writerow(["timestamp", "file_index"]) 36 | timestamp = 0 # timestamp is in milliseconds 37 | for file_index in access_counts: 38 | writer.writerow([timestamp, file_index]) 39 | timestamp += random.randint(1, 500) 40 | 41 | 42 | if __name__ == "__main__": 43 | parser = argparse.ArgumentParser( 44 | description="Generate access counts with a Zipfian distribution") 45 | parser.add_argument( 46 | "-s", 47 | "--skew_param", 48 | type=float, 49 | default=1.5, 50 | help="Skew parameter (default: 1.5)") 51 | parser.add_argument( 52 | "--num_accesses", 53 | type=int, 54 | default=20, 55 | help="Number of accesses (default: 20)") 56 | parser.add_argument( 57 | "-o", 58 | "--output_file", 59 | type=str, 60 | default="data/traces/trace_1m.csv", 61 | help="Output CSV file (default: data/trace.csv)") 62 | parser.add_argument( 63 | "--size", 64 | type=int, 65 | default=1, 66 | help="Size 
of the parquet file in MB" 67 | ) 68 | args = parser.parse_args() 69 | 70 | if args.size not in mp: 71 | raise ValueError("Size should be either 1 or 100") 72 | if args.size == 100: 73 | num_files = 10 74 | else: 75 | num_files = 9 76 | 77 | access_counts = generate_access_counts( 78 | num_files, args.skew_param, args.num_accesses, args.size) 79 | write_to_csv(access_counts, args.output_file) 80 | 81 | print("Access counts generated and written to", args.output_file) 82 | -------------------------------------------------------------------------------- /storage-client/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "parpulse-client" 3 | version = "0.1.0" 4 | edition = "2021" 5 | authors = [ 6 | "Yuanxin Cao ", 7 | "Kunle <1041593558@qq.com>", 8 | "Lan Lou ", 9 | ] 10 | description = "Client application for Parpulse OLAP database I/O cache service" 11 | license-file = "LICENSE" 12 | homepage = "https://github.com/cmu-db/15721-s24-cache1" 13 | repository = "https://github.com/cmu-db/15721-s24-cache1" 14 | documentation = "https://github.com/cmu-db/15721-s24-cache1/blob/main/README.md" 15 | readme = "README.md" 16 | include = ["src/client.rs", "src/lib.rs"] 17 | 18 | [dependencies] 19 | anyhow = "1" 20 | hyper = "1" 21 | async-trait = "0.1" 22 | tokio = { version = "1", features = ["full", "rt-multi-thread"] } 23 | futures = "0.3" 24 | reqwest = { version = "0.12", features = ["stream"] } 25 | tempfile = "3.2" 26 | parquet = { version = "50.0.0", features = ["async"] } 27 | arrow = "50.0.0" 28 | log = "0.4" 29 | istziio-client = "0.1.9" 30 | lazy_static = "1.4" 31 | enum-as-inner = "0.6" 32 | serde = { version = "1", features = ["derive"] } 33 | env_logger = "0.11" 34 | 35 | [dev-dependencies] 36 | mockito = "1.4.0" 37 | -------------------------------------------------------------------------------- /storage-client/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 CMU Database Group 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /storage-client/README.md: -------------------------------------------------------------------------------- 1 | # Parpulse Client 2 | 3 | This is the storage client implementation for 15721-s24-cache1. 
The `StorageClientImpl` should implement the `StorageClient` trait agreed between the two teams. 4 | -------------------------------------------------------------------------------- /storage-client/src/bin/driver.rs: -------------------------------------------------------------------------------- 1 | use arrow::array::Float64Array; 2 | use istziio_client::client_api::{DataRequest, StorageClient, StorageRequest}; 3 | use log::info; 4 | use parpulse_client::client::StorageClientImpl; 5 | use std::time::Instant; 6 | 7 | /// This test is for benchmarking. 8 | 9 | #[tokio::main] 10 | async fn main() { 11 | let _ = env_logger::builder() 12 | .filter_level(log::LevelFilter::Info) 13 | .is_test(true) 14 | .try_init(); 15 | 16 | let server_endpoint = 17 | std::env::var("SERVER_URL").unwrap_or(String::from("http://127.0.0.1:3030")); 18 | 19 | let storage_client = StorageClientImpl::new(&server_endpoint, "http://127.0.0.1:3031") 20 | .expect("Failed to create storage client."); 21 | let start_time = Instant::now(); 22 | // Requesting random_data_100m_0.parquet 23 | let request = StorageRequest::new(0, DataRequest::Table(10)); 24 | let mut receiver = storage_client 25 | .request_data(request) 26 | .await 27 | .expect("Failed to get data from the server."); 28 | let mut record_batches = vec![]; 29 | while let Some(record_batch) = receiver.recv().await { 30 | record_batches.push(record_batch); 31 | } 32 | info!("Time taken for 100m file: {:?}", start_time.elapsed()); 33 | 34 | assert!(!record_batches.is_empty()); 35 | 36 | let first_batch = &record_batches[0]; 37 | assert_eq!(first_batch.num_columns(), 20); 38 | 39 | // Check the first 5 columns of the first row. 40 | let real_first_row = [ 41 | 0.869278151694903, 42 | 0.5698583744743971, 43 | 0.5731127546817466, 44 | 0.9509491985107434, 45 | 0.3949108352357301, 46 | ]; 47 | for (i, &real_value) in real_first_row.iter().enumerate() { 48 | let column = first_batch 49 | .column(i) 50 | .as_any() 51 | .downcast_ref::() 52 | .unwrap(); 53 | assert_eq!(column.value(0), real_value); 54 | } 55 | info!("Succeed!") 56 | } 57 | -------------------------------------------------------------------------------- /storage-client/src/client.rs: -------------------------------------------------------------------------------- 1 | use anyhow::{anyhow, Ok, Result}; 2 | use arrow::array::RecordBatch; 3 | use futures::stream::StreamExt; 4 | 5 | use crate::RequestParams; 6 | use hyper::Uri; 7 | use lazy_static::lazy_static; 8 | use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder; 9 | use parquet::arrow::ProjectionMask; 10 | use reqwest::{Client, Response, Url}; 11 | use std::collections::HashMap; 12 | use std::fs::File; 13 | use std::io::Write; 14 | use tempfile::tempdir; 15 | 16 | use tokio::sync::mpsc::{channel, Receiver}; 17 | 18 | use istziio_client::client_api::{DataRequest, StorageClient, StorageRequest, TableId}; 19 | 20 | /// The batch size for the record batch. 21 | const BATCH_SIZE: usize = 1024; 22 | const CHANNEL_CAPACITY: usize = 32; 23 | const PARAM_BUCKET_KEY: &str = "bucket"; 24 | const PARAM_KEYS_KEY: &str = "keys"; 25 | 26 | lazy_static! 
{ 27 | static ref TABLE_FILE_MAP: HashMap = { 28 | let mut m = HashMap::new(); 29 | // For mock s3 30 | m.insert(0, "userdata1.parquet".to_string()); 31 | // All the remainings are for real s3 32 | for i in 1..=9 { 33 | m.insert(i, format!("1m/random_data_1m_{}.parquet", i)); 34 | } 35 | for i in 10..=19 { 36 | m.insert(i, format!("100m/random_data_100m_{}.parquet", i - 10)); 37 | } 38 | m 39 | }; 40 | } 41 | 42 | pub struct StorageClientImpl { 43 | storage_server_endpoint: Uri, 44 | _catalog_server_endpoint: Uri, 45 | } 46 | 47 | impl StorageClientImpl { 48 | pub fn new( 49 | storage_server_endpoint_str: &str, 50 | catalog_server_endpoint_str: &str, 51 | ) -> Result { 52 | let storage_server_endpoint = storage_server_endpoint_str.parse::().map_err(|_| { 53 | anyhow!( 54 | "cannot resolve storage server endpoint: {}", 55 | storage_server_endpoint_str 56 | ) 57 | })?; 58 | let catalog_server_endpoint = catalog_server_endpoint_str.parse::().map_err(|_| { 59 | anyhow!( 60 | "cannot resolve catalog server endpoint: {}", 61 | catalog_server_endpoint_str 62 | ) 63 | })?; 64 | Ok(Self { 65 | storage_server_endpoint, 66 | _catalog_server_endpoint: catalog_server_endpoint, 67 | }) 68 | } 69 | 70 | /// Returns the physical location of the requested data in RequestParams. 71 | async fn get_info_from_catalog(&self, request: StorageRequest) -> Result { 72 | let bucket = "parpulse-test".to_string(); 73 | let table_id = match request.data_request() { 74 | DataRequest::Table(id) => *id, 75 | _ => { 76 | return Err(anyhow!("Only table request is supported.")); 77 | } 78 | }; 79 | let keys = vec![TABLE_FILE_MAP.get(&table_id).unwrap().to_string()]; 80 | Ok(RequestParams::S3((bucket, keys))) 81 | } 82 | 83 | async fn get_data_from_response(response: Response) -> Result> { 84 | if response.status().is_success() { 85 | // Store the streamed Parquet file in a temporary file. 86 | // FIXME: 1. Do we really need streaming here? 87 | // 2. Do we need to store the file in a temporary file? 88 | let temp_dir = tempdir()?; 89 | let file_path = temp_dir.path().join("tmp.parquet"); 90 | let mut file = File::create(&file_path)?; 91 | let mut stream = response.bytes_stream(); 92 | while let Some(chunk) = stream.next().await { 93 | let chunk = chunk?; 94 | file.write_all(&chunk)?; 95 | } 96 | 97 | // Convert the Parquet file to a record batch. 98 | let file = File::open(file_path)?; 99 | let builder = 100 | ParquetRecordBatchReaderBuilder::try_new(file)?.with_batch_size(BATCH_SIZE); 101 | let mask = ProjectionMask::all(); 102 | let mut reader = builder.with_projection(mask).build()?; 103 | 104 | let (tx, rx) = channel(CHANNEL_CAPACITY); 105 | 106 | // Return the record batch as a stream. 107 | tokio::spawn(async move { 108 | while let Some(core::result::Result::Ok(rb)) = reader.next() { 109 | tx.send(rb).await.unwrap(); 110 | } 111 | }); 112 | Ok(rx) 113 | } else { 114 | Err(anyhow::anyhow!( 115 | "Failed to download file. 
Response: {:?}, Body: {}", 116 | response.status(), 117 | response 118 | .text() 119 | .await 120 | .unwrap_or_else(|_| String::from("Failed to read response body")) 121 | )) 122 | } 123 | } 124 | 125 | async fn get_info_from_catalog_test(&self, request: StorageRequest) -> Result { 126 | let bucket = "tests-parquet".to_string(); 127 | let table_id = match request.data_request() { 128 | DataRequest::Table(id) => id, 129 | _ => { 130 | return Err(anyhow!("Only table request is supported.")); 131 | } 132 | }; 133 | let keys = vec![TABLE_FILE_MAP.get(table_id).unwrap().to_string()]; 134 | Ok(RequestParams::MockS3((bucket, keys))) 135 | } 136 | 137 | fn get_request_url_and_params( 138 | &self, 139 | location: (String, Vec), 140 | ) -> Result<(String, Vec<(&str, String)>)> { 141 | let scheme = self 142 | .storage_server_endpoint 143 | .scheme() 144 | .ok_or_else(|| anyhow!("Failed to get the scheme of the storage server endpoint."))? 145 | .to_owned(); 146 | let authority = self 147 | .storage_server_endpoint 148 | .authority() 149 | .ok_or_else(|| anyhow!("Failed to get the authority of the storage server endpoint."))? 150 | .to_owned(); 151 | let path = "/file"; 152 | let url = Uri::builder() 153 | .scheme(scheme) 154 | .authority(authority) 155 | .path_and_query(path) 156 | .build() 157 | .unwrap(); 158 | let params = vec![ 159 | (PARAM_BUCKET_KEY, location.0), 160 | (PARAM_KEYS_KEY, location.1.join(",")), 161 | ]; 162 | Ok((url.to_string(), params)) 163 | } 164 | 165 | pub async fn request_data_test( 166 | &self, 167 | request: StorageRequest, 168 | ) -> Result> { 169 | // First we need to get the location of the parquet file from the catalog server. 170 | let location = match self.get_info_from_catalog_test(request).await? { 171 | RequestParams::MockS3(location) => location, 172 | _ => { 173 | return Err(anyhow!( 174 | "Failed to get location of the file from the catalog server." 175 | )); 176 | } 177 | }; 178 | 179 | // Then we need to send the request to the storage server. 180 | let client = Client::new(); 181 | let (url, mut params) = self.get_request_url_and_params(location)?; 182 | params.push(("is_test", "true".to_owned())); 183 | 184 | let url = Url::parse_with_params(&url, params)?; 185 | let response = client.get(url).send().await?; 186 | 187 | Self::get_data_from_response(response).await 188 | } 189 | } 190 | 191 | #[async_trait::async_trait] 192 | impl StorageClient for StorageClientImpl { 193 | async fn request_data(&self, request: StorageRequest) -> Result> { 194 | // First we need to get the location of the parquet file from the catalog server. 195 | let location = match self.get_info_from_catalog(request).await? { 196 | RequestParams::S3(location) => location, 197 | _ => { 198 | return Err(anyhow!( 199 | "Failed to get location of the file from the catalog server." 200 | )); 201 | } 202 | }; 203 | 204 | // Then we need to send the request to the storage server. 205 | let client = Client::new(); 206 | let (url, params) = self.get_request_url_and_params(location)?; 207 | let url = Url::parse_with_params(&url, params)?; 208 | let response = client.get(url).send().await?; 209 | Self::get_data_from_response(response).await 210 | } 211 | 212 | // TODO (kunle): I don't think this function is necessary. 
213 | async fn request_data_sync(&self, _request: StorageRequest) -> Result> { 214 | todo!() 215 | } 216 | } 217 | 218 | #[cfg(test)] 219 | mod tests { 220 | use super::*; 221 | use arrow::array::StringArray; 222 | use mockito::Server; 223 | 224 | /// WARNING: Put userdata1.parquet in the storage-node/tests/parquet directory before running this test. 225 | #[tokio::test] 226 | async fn test_storage_client_disk() { 227 | // Create a mock server to serve the parquet file. 228 | let mut server = Server::new_async().await; 229 | println!("server host: {}", server.host_with_port()); 230 | server 231 | .mock( 232 | "GET", 233 | "/file?bucket=tests-parquet&keys=userdata1.parquet&is_test=true", 234 | ) 235 | .with_body_from_file("../storage-node/tests/parquet/userdata1.parquet") 236 | .create_async() 237 | .await; 238 | 239 | let server_endpoint = server.url() + "/"; 240 | let storage_client = StorageClientImpl::new(&server_endpoint, "localhost:3031") 241 | .expect("Failed to create storage client."); 242 | // 0 is the table id for userdata1.parquet on local disk. 243 | let request = StorageRequest::new(0, DataRequest::Table(0)); 244 | let mut receiver = storage_client 245 | .request_data_test(request) 246 | .await 247 | .expect("Failed to get data from the server."); 248 | let mut record_batches = vec![]; 249 | while let Some(record_batch) = receiver.recv().await { 250 | record_batches.push(record_batch); 251 | } 252 | assert!(!record_batches.is_empty()); 253 | 254 | let first_batch = &record_batches[0]; 255 | assert_eq!(first_batch.num_columns(), 13); 256 | 257 | let real_first_names = StringArray::from(vec!["Amanda", "Albert", "Evelyn"]); 258 | let read_last_names = StringArray::from(vec!["Jordan", "Freeman", "Morgan"]); 259 | let first_names = first_batch 260 | .column(2) 261 | .as_any() 262 | .downcast_ref::() 263 | .unwrap(); 264 | let last_names = first_batch 265 | .column(3) 266 | .as_any() 267 | .downcast_ref::() 268 | .unwrap(); 269 | // Check the first three entries in the first and last name columns. 270 | for i in 0..3 { 271 | assert_eq!(first_names.value(i), real_first_names.value(i)); 272 | assert_eq!(last_names.value(i), read_last_names.value(i)); 273 | } 274 | } 275 | } 276 | -------------------------------------------------------------------------------- /storage-client/src/lib.rs: -------------------------------------------------------------------------------- 1 | pub mod client; 2 | 3 | use enum_as_inner::EnumAsInner; 4 | use serde::Deserialize; 5 | 6 | #[derive(Clone, EnumAsInner, Debug)] 7 | pub enum RequestParams { 8 | /// S3 bucket and keys. 9 | S3((String, Vec)), 10 | /// Mock S3 bucket and keys. 11 | /// This is used for testing purposes. 12 | MockS3((String, Vec)), 13 | } 14 | 15 | #[derive(Deserialize)] 16 | pub struct S3Request { 17 | pub bucket: String, 18 | /// Cannot deserialize a vector of strings, might need to customize a deserializer later. 
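    /// The storage client joins multiple object keys with commas before sending the request,
    /// so this field holds something like `"a.parquet,b.parquet"` (illustrative key names).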
19 | pub keys: String, 20 | #[serde(default)] 21 | pub is_test: bool, 22 | } 23 | -------------------------------------------------------------------------------- /storage-node/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "storage-node" 3 | version = "0.1.0" 4 | edition = "2021" 5 | 6 | [dependencies] 7 | bytes = "1" 8 | hyper = "1" 9 | tokio = { version = "1", features = ["rt", "rt-multi-thread", "macros"] } 10 | hashlink = "0.8" 11 | enum-as-inner = "0.6" 12 | futures = { version = "0.3", features = ["alloc"] } 13 | thiserror = "1" 14 | aws-sdk-s3 = "1" 15 | aws-config = "1" 16 | aws-smithy-runtime-api = "1" 17 | async-trait = "0.1" 18 | parpulse-client = { path = "../storage-client" } 19 | warp = "0.3" 20 | tokio-util = "0.7" 21 | reqwest = "0.12" 22 | tempfile = "3.10.1" 23 | rand = "0.8" 24 | tokio-stream = "0.1" 25 | rusqlite = { version = "0.31", features = ["blob"] } 26 | log = "0.4" 27 | env_logger = "0.11" 28 | crc32fast = "1.4.0" 29 | clap = { version = "4.5", features = ["derive"] } 30 | serde = { version = "1", features = ["derive"] } 31 | 32 | [dev-dependencies] 33 | serial_test = "3.1" 34 | -------------------------------------------------------------------------------- /storage-node/src/bin/storage_node.rs: -------------------------------------------------------------------------------- 1 | use clap::Parser; 2 | use log::info; 3 | use storage_node::{common::config::ParpulseConfig, server::storage_node_serve}; 4 | 5 | #[tokio::main] 6 | async fn main() { 7 | // Init log. 8 | if let Err(e) = env_logger::builder() 9 | .filter_level(log::LevelFilter::Info) 10 | .try_init() 11 | { 12 | println!("Failed to init logger: {:?}", e); 13 | } 14 | info!("starting storage node server..."); 15 | let config = ParpulseConfig::parse(); 16 | storage_node_serve("0.0.0.0", 3030, config).await.unwrap(); 17 | } 18 | -------------------------------------------------------------------------------- /storage-node/src/cache/data_store_cache/memdisk/data_store/disk.rs: -------------------------------------------------------------------------------- 1 | use std::{fs, sync::Arc}; 2 | 3 | use bytes::Bytes; 4 | use futures::StreamExt; 5 | use log::info; 6 | use tokio::sync::{mpsc::Receiver, Mutex}; 7 | 8 | use crate::{ 9 | cache::{ 10 | data_store_cache::memdisk::{MemDiskStoreReplacerKey, MemDiskStoreReplacerValue}, 11 | replacer::DataStoreReplacer, 12 | }, 13 | disk::disk_manager::DiskManager, 14 | error::ParpulseResult, 15 | storage_reader::StorageReaderStream, 16 | }; 17 | 18 | const DEFAULT_DISK_CHANNEL_BUFFER_SIZE: usize = 512; 19 | 20 | /// [`DiskStore`] stores the contents of remote objects on the local disk. 21 | pub struct DiskStore { 22 | disk_manager: DiskManager, 23 | /// The path to the directory where the data is stored on the disk. 
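    /// `DiskStore::new` appends a trailing `/` if the configured path lacks one, so cache file
    /// paths can be formed by simple concatenation (see `data_store_key` below).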
24 | base_path: String, 25 | max_disk_reader_buffer_size: usize, 26 | } 27 | 28 | impl Drop for DiskStore { 29 | fn drop(&mut self) { 30 | if fs::metadata(&self.base_path).is_ok() { 31 | fs::remove_dir_all(self.base_path.clone()).expect("remove cache files failed"); 32 | info!("cache files removed: {}", self.base_path); 33 | } 34 | } 35 | } 36 | 37 | impl DiskStore { 38 | pub fn new( 39 | disk_manager: DiskManager, 40 | base_path: String, 41 | max_disk_reader_buffer_size: usize, 42 | ) -> Self { 43 | let mut final_base_path = base_path; 44 | if !final_base_path.ends_with('/') { 45 | final_base_path += "/"; 46 | } 47 | Self { 48 | disk_manager, 49 | base_path: final_base_path, 50 | max_disk_reader_buffer_size, 51 | } 52 | } 53 | } 54 | 55 | impl DiskStore { 56 | /// Reads data from the disk store. The method returns a stream of data read from the disk 57 | /// store. 58 | pub async fn read_data( 59 | &self, 60 | key: &str, 61 | disk_replacer: Arc>, 62 | key_replacer: String, 63 | ) -> ParpulseResult>>> 64 | where 65 | R: DataStoreReplacer + 'static, 66 | { 67 | // TODO(lanlou): we later may consider the remaining space to decide the buffer size 68 | let mut buffer_size = self.disk_manager.file_size(key).await? as usize; 69 | if buffer_size > self.max_disk_reader_buffer_size { 70 | buffer_size = self.max_disk_reader_buffer_size; 71 | } 72 | // FIXME: Shall we consider the situation where the data is not found? 73 | let mut disk_stream = self.disk_manager.disk_read_stream(key, buffer_size).await?; 74 | let (tx, rx) = tokio::sync::mpsc::channel(DEFAULT_DISK_CHANNEL_BUFFER_SIZE); 75 | tokio::spawn(async move { 76 | loop { 77 | match disk_stream.next().await { 78 | Some(Ok(bytes_read)) => { 79 | tx.send(Ok(Bytes::from(disk_stream.buffer()[..bytes_read].to_vec()))) 80 | .await 81 | .unwrap(); 82 | } 83 | Some(Err(e)) => tx.send(Err(e)).await.unwrap(), 84 | None => { 85 | // TODO(lanlou): when second read, so there is no need to unpin, how to improve? 86 | disk_replacer.lock().await.unpin(&key_replacer); 87 | break; 88 | } 89 | } 90 | } 91 | }); 92 | Ok(Some(rx)) 93 | } 94 | 95 | /// Writes data to the disk store. The method accepts a stream of data to write to the disk 96 | /// store. 97 | /// TODO: We may need to push the response writer down to the disk store as well. 98 | pub async fn write_data( 99 | &self, 100 | key: String, 101 | bytes_vec: Option>, 102 | stream: Option, 103 | ) -> ParpulseResult { 104 | // NOTE(Yuanxin): Shall we spawn a task to write the data to disk? 105 | let bytes_written = self 106 | .disk_manager 107 | .write_bytes_and_stream_to_disk(bytes_vec, stream, &key) 108 | .await?; 109 | Ok(bytes_written) 110 | } 111 | 112 | /// Cleans the data from the disk store. 113 | pub async fn clean_data(&self, key: &str) -> ParpulseResult<()> { 114 | self.disk_manager.remove_file(key).await 115 | } 116 | 117 | /// Returns the key for the disk store. The key should be cached in the disk store cache. 
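    /// For example, with `base_path = "cache/"` and `remote_location = "bucket-data.parquet"`
    /// (illustrative values), the returned key is `"cache/bucket-data.parquet"`.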
118 | pub fn data_store_key(&self, remote_location: &str) -> String { 119 | format!("{}{}", self.base_path, remote_location) 120 | } 121 | } 122 | -------------------------------------------------------------------------------- /storage-node/src/cache/data_store_cache/memdisk/data_store/memory.rs: -------------------------------------------------------------------------------- 1 | use std::{collections::HashMap, sync::Arc}; 2 | 3 | use bytes::Bytes; 4 | use tokio::sync::{mpsc::Receiver, Mutex}; 5 | 6 | use crate::{ 7 | cache::{ 8 | data_store_cache::memdisk::{MemDiskStoreReplacerKey, MemDiskStoreReplacerValue}, 9 | replacer::DataStoreReplacer, 10 | }, 11 | error::ParpulseResult, 12 | }; 13 | 14 | const DEFAULT_MEM_CHANNEL_BUFFER_SIZE: usize = 1024; 15 | 16 | pub struct MemStore { 17 | /// data: remote_location -> (data, size) 18 | data: HashMap, usize)>, 19 | max_file_size: usize, 20 | } 21 | 22 | impl MemStore { 23 | pub fn new(max_file_size: usize) -> Self { 24 | Self { 25 | data: HashMap::new(), 26 | max_file_size, 27 | } 28 | } 29 | 30 | pub fn read_data( 31 | &self, 32 | key: &str, 33 | mem_replacer: Arc>, 34 | ) -> ParpulseResult>>> 35 | where 36 | R: DataStoreReplacer + 'static, 37 | { 38 | let key_value = self.data.get(key); 39 | if key_value.is_none() { 40 | return Ok(None); 41 | } 42 | let data_vec = key_value.unwrap().0.clone(); 43 | let (tx, rx) = tokio::sync::mpsc::channel(DEFAULT_MEM_CHANNEL_BUFFER_SIZE); 44 | let key_str = key.to_string().clone(); 45 | tokio::spawn(async move { 46 | for data in data_vec.iter() { 47 | tx.send(Ok(data.clone())).await.unwrap(); 48 | } 49 | // TODO(lanlou): when second read, so there is no need to unpin, how to improve? 50 | mem_replacer.lock().await.unpin(&key_str); 51 | }); 52 | Ok(Some(rx)) 53 | } 54 | 55 | /// Writes data to the memory store, also tracks the size. If the size for one key is too large, 56 | /// we will delete the data from the memory store and return all the data to the caller. 57 | /// If return value is None, it means successful write. Otherwise, it means unsuccessful write. 58 | /// TODO(lanlou): the key type should be &str maybe? 59 | pub fn write_data(&mut self, key: String, bytes: Bytes) -> Option<(Vec, usize)> { 60 | let (bytes_vec, size) = self.data.entry(key.clone()).or_insert((Vec::new(), 0)); 61 | *size += bytes.len(); 62 | bytes_vec.push(bytes); 63 | if *size > self.max_file_size { 64 | let size_copy = *size; 65 | let bytes_vec_copy = bytes_vec.clone(); 66 | self.data.remove(&key); 67 | Some((bytes_vec_copy, size_copy)) 68 | } else { 69 | None 70 | } 71 | } 72 | 73 | pub fn clean_data(&mut self, key: &str) -> Option<(Vec, usize)> { 74 | self.data.remove(key) 75 | } 76 | } 77 | 78 | #[cfg(test)] 79 | mod tests { 80 | use crate::cache::replacer::lru::LruReplacer; 81 | 82 | use super::*; 83 | 84 | #[test] 85 | fn test_large_write() { 86 | // max_file_size is 10 bytes per file. 
87 | let max_file_size = 10; 88 | let mut mem_store = MemStore::new(max_file_size); 89 | let key = "large_write_key".to_string(); 90 | 91 | let bytes1 = Bytes::from(vec![1, 2, 3, 4]); 92 | let bytes2 = Bytes::from(vec![5, 6, 7, 8]); 93 | let bytes3 = Bytes::from(vec![9, 10, 11, 12]); 94 | 95 | let bytes1_cp = bytes1.clone(); 96 | let bytes2_cp = bytes2.clone(); 97 | let bytes3_cp = bytes3.clone(); 98 | 99 | let res1 = mem_store.write_data(key.clone(), bytes1); 100 | assert!(res1.is_none()); 101 | let res2 = mem_store.write_data(key.clone(), bytes2); 102 | assert!(res2.is_none()); 103 | let res3 = mem_store.write_data(key.clone(), bytes3); 104 | assert!(res3.is_some()); 105 | assert_eq!(res3.as_ref().unwrap().0.len(), 3); 106 | assert_eq!(res3.as_ref().unwrap().1, 12); 107 | assert_eq!(res3.as_ref().unwrap().0[0], bytes1_cp); 108 | assert_eq!(res3.as_ref().unwrap().0[1], bytes2_cp); 109 | assert_eq!(res3.as_ref().unwrap().0[2], bytes3_cp); 110 | 111 | let dummy_replacer = Arc::new(Mutex::new(LruReplacer::new(0))); 112 | let read_res = mem_store.read_data(key.as_str(), dummy_replacer); 113 | assert!(read_res.is_ok()); 114 | assert!(read_res.unwrap().is_none()); 115 | } 116 | 117 | #[tokio::test] 118 | async fn test_write_read() { 119 | let max_file_size = 10; 120 | let mut mem_store = MemStore::new(max_file_size); 121 | let key = "write_read_key".to_string(); 122 | let bytes = Bytes::from(vec![1, 2, 3, 4]); 123 | let bytes_cp = bytes.clone(); 124 | let res = mem_store.write_data(key.clone(), bytes); 125 | assert!(res.is_none()); 126 | let dummy_replacer = Arc::new(Mutex::new(LruReplacer::new(0))); 127 | let read_res = mem_store.read_data(key.as_str(), dummy_replacer); 128 | assert!(read_res.is_ok()); 129 | let mut rx = read_res.unwrap().unwrap(); 130 | let mut bytes_vec = Vec::new(); 131 | let mut data_size: usize = 0; 132 | while let Some(data) = rx.recv().await { 133 | let data = data.unwrap(); 134 | data_size += data.len(); 135 | bytes_vec.push(data); 136 | } 137 | assert_eq!(bytes_vec.len(), 1); 138 | assert_eq!(bytes_vec[0], bytes_cp); 139 | assert_eq!(data_size, bytes_cp.len()); 140 | } 141 | } 142 | -------------------------------------------------------------------------------- /storage-node/src/cache/data_store_cache/memdisk/data_store/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod disk; 2 | pub mod memory; 3 | -------------------------------------------------------------------------------- /storage-node/src/cache/data_store_cache/mod.rs: -------------------------------------------------------------------------------- 1 | use async_trait::async_trait; 2 | use bytes::Bytes; 3 | use parpulse_client::RequestParams; 4 | use tokio::sync::mpsc::Receiver; 5 | 6 | use crate::error::ParpulseResult; 7 | 8 | pub mod memdisk; 9 | pub mod sqlite; 10 | 11 | #[async_trait] 12 | pub trait DataStoreCache { 13 | async fn get_data_from_cache( 14 | &self, 15 | request_param: &RequestParams, 16 | ) -> ParpulseResult>>>; 17 | 18 | /// Put data to cache. Accepts a stream of bytes and returns the number of bytes written. 19 | /// The data_size parameter is optional and can be used to hint the cache about the size of the data. 20 | /// If the data_size is not provided, the cache implementation should try to determine the size of 21 | /// the data. 
22 | async fn put_data_to_cache(&self, request_param: &RequestParams) -> ParpulseResult; 23 | } 24 | 25 | pub fn cache_key_from_request(request_param: &RequestParams) -> String { 26 | match request_param { 27 | RequestParams::S3((bucket, keys)) => { 28 | format!("{}-{}", bucket, keys.join(",")) 29 | } 30 | RequestParams::MockS3((bucket, keys)) => { 31 | format!("{}-{}", bucket, keys.join(",")) 32 | } 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /storage-node/src/cache/data_store_cache/sqlite/blob.rs: -------------------------------------------------------------------------------- 1 | use std::io::Read; 2 | 3 | use bytes::BytesMut; 4 | use rusqlite::{blob::Blob, Connection, DatabaseName}; 5 | 6 | use crate::error::ParpulseResult; 7 | 8 | use super::{SQLITE_CACHE_COLUMN_NAME, SQLITE_CACHE_TABLE_NAME}; 9 | 10 | pub type SqliteBlobKey = i64; 11 | 12 | pub struct SqliteBlob<'a> { 13 | blob: Blob<'a>, 14 | } 15 | 16 | impl<'a> SqliteBlob<'a> { 17 | pub fn new(blob: Blob<'a>) -> Self { 18 | Self { blob } 19 | } 20 | 21 | pub fn read(&mut self, buffer: &mut [u8]) -> ParpulseResult { 22 | self.blob.read(buffer).map_err(Into::into) 23 | } 24 | 25 | pub fn write_at(&mut self, data: &[u8], offset: usize) -> ParpulseResult<()> { 26 | self.blob.write_at(data, offset).map_err(Into::into) 27 | } 28 | } 29 | 30 | unsafe impl<'a> Send for SqliteBlob<'a> {} 31 | 32 | pub struct SqliteBlobReader<'a> { 33 | blob: SqliteBlob<'a>, 34 | buffer: BytesMut, 35 | } 36 | 37 | impl<'a> SqliteBlobReader<'a> { 38 | pub fn new( 39 | db: &'a Connection, 40 | blob_key: SqliteBlobKey, 41 | buffer_size: usize, 42 | ) -> ParpulseResult { 43 | let blob = db.blob_open( 44 | DatabaseName::Main, 45 | SQLITE_CACHE_TABLE_NAME, 46 | SQLITE_CACHE_COLUMN_NAME, 47 | blob_key, 48 | true, 49 | )?; 50 | Ok(Self { 51 | blob: SqliteBlob::new(blob), 52 | buffer: BytesMut::zeroed(buffer_size), 53 | }) 54 | } 55 | 56 | pub fn buffer(&self) -> &[u8] { 57 | &self.buffer 58 | } 59 | } 60 | 61 | impl Iterator for SqliteBlobReader<'_> { 62 | type Item = ParpulseResult; 63 | 64 | fn next(&mut self) -> Option { 65 | match self.blob.read(self.buffer.as_mut()) { 66 | Ok(bytes_read) => { 67 | if bytes_read > 0 { 68 | Some(Ok(bytes_read)) 69 | } else { 70 | None 71 | } 72 | } 73 | Err(e) => Some(Err(e)), 74 | } 75 | } 76 | } 77 | 78 | #[cfg(test)] 79 | mod tests { 80 | use super::*; 81 | 82 | use bytes::Bytes; 83 | use tempfile::tempdir; 84 | 85 | #[test] 86 | fn test_sqlite_blob_reader() { 87 | let poem = Bytes::from_static( 88 | b"What can I hold you with? 89 | I offer you lean streets, desperate sunsets, the 90 | moon of the jagged suburbs. 91 | I offer you the bitterness of a man who has looked 92 | long and long at the lonely moon. 93 | I offer you my ancestors, my dead men, the ghosts 94 | that living men have honoured in bronze. 95 | I offer you whatever insight my books may hold, 96 | whatever manliness or humour my life. 97 | I offer you the loyalty of a man who has never 98 | been loyal. 99 | I offer you that kernel of myself that I have saved, 100 | somehow-the central heart that deals not 101 | in words, traffics not with dreams, and is 102 | untouched by time, by joy, by adversities. 103 | I offer you the memory of a yellow rose seen at 104 | sunset, years before you were born. 105 | I offer you explanations of yourself, theories about 106 | yourself, authentic and surprising news of 107 | yourself. 
108 | I can give you my loneliness, my darkness, the 109 | hunger of my heart; I am trying to bribe you 110 | with uncertainty, with danger, with defeat. 111 | ", 112 | ); 113 | let temp_dir = tempdir().unwrap(); 114 | let db_path = temp_dir.path().join("tmp.db"); 115 | 116 | let db = Connection::open(&db_path).unwrap(); 117 | db.execute( 118 | &format!( 119 | "CREATE TABLE {} ({})", 120 | SQLITE_CACHE_TABLE_NAME, SQLITE_CACHE_COLUMN_NAME 121 | ), 122 | [], 123 | ) 124 | .unwrap(); 125 | db.execute( 126 | &format!( 127 | "INSERT INTO {} ({}) VALUES (ZEROBLOB({}))", 128 | SQLITE_CACHE_TABLE_NAME, 129 | SQLITE_CACHE_COLUMN_NAME, 130 | poem.len() 131 | ), 132 | [], 133 | ) 134 | .unwrap(); 135 | let blob_key = db.last_insert_rowid(); 136 | 137 | { 138 | let mut writer = db 139 | .blob_open( 140 | DatabaseName::Main, 141 | SQLITE_CACHE_TABLE_NAME, 142 | SQLITE_CACHE_COLUMN_NAME, 143 | blob_key, 144 | false, 145 | ) 146 | .unwrap(); 147 | writer.write_at(&poem, 0).unwrap(); 148 | } 149 | // FLush the result so that the blob is visible to another connnection. 150 | db.cache_flush().unwrap(); 151 | 152 | let db2 = Connection::open(&db_path).unwrap(); 153 | let buffer_size = 100; 154 | let mut reader = SqliteBlobReader::new(&db2, blob_key, buffer_size).unwrap(); 155 | 156 | let mut total_bytes_read = 0; 157 | let mut read_count = 0; 158 | let mut result = String::new(); 159 | while let Some(bytes_read) = reader.next() { 160 | let bytes_read = bytes_read.unwrap(); 161 | println!("bytes_read: {}", bytes_read); 162 | println!("buffer: {:?}", reader.buffer()[..bytes_read].to_vec()); 163 | result += &String::from_utf8(reader.buffer()[..bytes_read].to_vec()).unwrap(); 164 | total_bytes_read += bytes_read; 165 | read_count += 1; 166 | } 167 | 168 | assert_eq!(result, poem); 169 | assert_eq!(total_bytes_read, 930); 170 | assert_eq!(read_count, 10); 171 | } 172 | } 173 | -------------------------------------------------------------------------------- /storage-node/src/cache/data_store_cache/sqlite/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod blob; 2 | 3 | use std::fs; 4 | 5 | use async_trait::async_trait; 6 | use bytes::Bytes; 7 | use futures::StreamExt; 8 | use log::warn; 9 | use parpulse_client::RequestParams; 10 | use rusqlite::{Connection, DatabaseName, OpenFlags}; 11 | use tokio::sync::{ 12 | mpsc::{channel, Receiver}, 13 | Mutex, 14 | }; 15 | 16 | use crate::{ 17 | cache::replacer::{DataStoreReplacer, ReplacerValue}, 18 | error::ParpulseResult, 19 | storage_reader::{s3::S3Reader, s3_diskmock::MockS3Reader, AsyncStorageReader}, 20 | }; 21 | 22 | use self::blob::{SqliteBlob, SqliteBlobReader}; 23 | 24 | use super::{cache_key_from_request, DataStoreCache}; 25 | 26 | const SQLITE_CACHE_TABLE_NAME: &str = "parpulse_cache"; 27 | const SQLITE_CACHE_COLUMN_NAME: &str = "content"; 28 | const SQLITE_MAX_BLOB_SIZE: usize = 512 * 1024 * 1024; // 512 MB 29 | const SQLITE_BLOB_CHANNEL_CAPACITY: usize = 5; 30 | 31 | pub type SqliteStoreReplacerKey = String; 32 | pub struct SqliteStoreReplacerValue { 33 | pub(crate) row_id: i64, 34 | pub(crate) size: usize, 35 | } 36 | 37 | impl SqliteStoreReplacerValue { 38 | pub fn new(row_id: i64, size: usize) -> Self { 39 | Self { row_id, size } 40 | } 41 | } 42 | 43 | impl ReplacerValue for SqliteStoreReplacerValue { 44 | type Value = i64; 45 | 46 | fn into_value(self) -> Self::Value { 47 | self.row_id 48 | } 49 | 50 | fn as_value(&self) -> &Self::Value { 51 | &self.row_id 52 | } 53 | 54 | fn size(&self) -> usize { 55 | 
self.size 56 | } 57 | } 58 | 59 | pub struct SqliteStoreCache> 60 | { 61 | replacer: Mutex, 62 | sqlite_base_path: String, 63 | reader_buffer_size: usize, 64 | } 65 | 66 | impl> SqliteStoreCache { 67 | pub fn new( 68 | replacer: R, 69 | sqlite_base_path: String, 70 | reader_buffer_size: usize, 71 | ) -> ParpulseResult { 72 | let db = Connection::open(&sqlite_base_path)?; 73 | let create_table_stmt = format!( 74 | "CREATE TABLE IF NOT EXISTS {} ({} BLOB);", 75 | SQLITE_CACHE_TABLE_NAME, SQLITE_CACHE_COLUMN_NAME 76 | ); 77 | db.execute_batch(&create_table_stmt)?; 78 | 79 | Ok(Self { 80 | replacer: Mutex::new(replacer), 81 | sqlite_base_path, 82 | reader_buffer_size, 83 | }) 84 | } 85 | } 86 | 87 | impl> Drop 88 | for SqliteStoreCache 89 | { 90 | fn drop(&mut self) { 91 | if fs::metadata(&self.sqlite_base_path).is_ok() { 92 | fs::remove_file(self.sqlite_base_path.clone()).expect("remove sqlite db files failed"); 93 | } else { 94 | warn!("sqlite db file not found: {}", self.sqlite_base_path); 95 | } 96 | } 97 | } 98 | 99 | #[async_trait] 100 | impl> DataStoreCache 101 | for SqliteStoreCache 102 | { 103 | async fn get_data_from_cache( 104 | &self, 105 | request: &RequestParams, 106 | ) -> ParpulseResult>>> { 107 | let remote_location = cache_key_from_request(request); 108 | let mut replacer = self.replacer.lock().await; 109 | if let Some(replacer_value) = replacer.get(&remote_location) { 110 | let (tx, rx) = channel(SQLITE_BLOB_CHANNEL_CAPACITY); 111 | let row_id = *replacer_value.as_value(); 112 | let sqlite_base_path = self.sqlite_base_path.clone(); 113 | let buffer_size = self.reader_buffer_size; 114 | 115 | tokio::spawn(async move { 116 | let db = 117 | Connection::open_with_flags(sqlite_base_path, OpenFlags::SQLITE_OPEN_READ_ONLY) 118 | .unwrap(); 119 | let mut blob_reader = SqliteBlobReader::new(&db, row_id, buffer_size).unwrap(); 120 | while let Some(result) = blob_reader.next() { 121 | match result { 122 | Ok(bytes_read) => { 123 | let buffer = blob_reader.buffer(); 124 | let bytes = Bytes::copy_from_slice(&buffer[..bytes_read]); 125 | tx.send(Ok(bytes)).await.unwrap() 126 | } 127 | Err(err) => tx.send(Err(err)).await.unwrap(), 128 | } 129 | } 130 | }); 131 | Ok(Some(rx)) 132 | } else { 133 | Ok(None) 134 | } 135 | } 136 | 137 | async fn put_data_to_cache(&self, request: &RequestParams) -> ParpulseResult { 138 | let remote_location = cache_key_from_request(request); 139 | let (mut data_stream, blob_size) = { 140 | match request { 141 | RequestParams::S3((bucket, keys)) => { 142 | let reader = S3Reader::new(bucket.clone(), keys.clone().to_vec()).await; 143 | let data_size = reader.get_object_size().await; 144 | (reader.into_stream().await?, data_size) 145 | } 146 | RequestParams::MockS3((bucket, keys)) => { 147 | let reader = MockS3Reader::new(bucket.clone(), keys.clone().to_vec()).await; 148 | let data_size = reader.get_object_size().await; 149 | (reader.into_stream().await?, data_size) 150 | } 151 | } 152 | }; 153 | let blob_size = blob_size.unwrap_or(SQLITE_MAX_BLOB_SIZE); 154 | let mut replacer = self.replacer.lock().await; 155 | let sqlite_base_path = self.sqlite_base_path.clone(); 156 | let db = Connection::open(sqlite_base_path)?; 157 | let insert_blob_stmt = format!( 158 | "INSERT INTO {} ({}) VALUES (ZEROBLOB({}))", 159 | SQLITE_CACHE_TABLE_NAME, SQLITE_CACHE_COLUMN_NAME, blob_size 160 | ); 161 | db.execute(&insert_blob_stmt, [])?; 162 | let blob_key = db.last_insert_rowid(); 163 | let mut blob = SqliteBlob::new(db.blob_open( 164 | DatabaseName::Main, 165 | 
SQLITE_CACHE_TABLE_NAME, 166 | SQLITE_CACHE_COLUMN_NAME, 167 | blob_key, 168 | false, 169 | )?); 170 | 171 | let mut size = 0; 172 | while let Some(data) = data_stream.next().await { 173 | let data = data?; 174 | blob.write_at(&data, size)?; 175 | size += data.len(); 176 | } 177 | replacer.put( 178 | remote_location, 179 | SqliteStoreReplacerValue::new(blob_key, size), 180 | ); 181 | Ok(size) 182 | } 183 | } 184 | 185 | #[cfg(test)] 186 | mod tests { 187 | use std::path::Path; 188 | 189 | use crate::cache::replacer::lru::LruReplacer; 190 | 191 | use super::*; 192 | 193 | #[tokio::test] 194 | async fn test_sqlite_store_cache() { 195 | let tmp = tempfile::tempdir().unwrap(); 196 | let sqlite_base_path = tmp.path().to_owned().join(Path::new("sqlite_test.db")); 197 | let replacer = LruReplacer::new(1024); 198 | let buffer_size = 100; 199 | let cache = SqliteStoreCache::new( 200 | replacer, 201 | sqlite_base_path.to_str().unwrap().to_string(), 202 | buffer_size, 203 | ) 204 | .expect("create sqlite store cache failed"); 205 | 206 | let bucket = "tests-text".to_string(); 207 | let keys = vec!["what-can-i-hold-you-with".to_string()]; 208 | let request = RequestParams::MockS3((bucket, keys)); 209 | let bytes_written = cache 210 | .put_data_to_cache(&request) 211 | .await 212 | .expect("put data to cache failed"); 213 | assert_eq!(bytes_written, 930); 214 | 215 | let mut rx = cache 216 | .get_data_from_cache(&request) 217 | .await 218 | .expect("get data from cache failed") 219 | .expect("data not found in cache"); 220 | 221 | let mut result = String::new(); 222 | let mut total_bytes_read = 0; 223 | while let Some(bytes) = rx.recv().await { 224 | let bytes = bytes.expect("read data from cache failed"); 225 | total_bytes_read += bytes.len(); 226 | result += &String::from_utf8(bytes.to_vec()).expect("convert bytes to string failed"); 227 | } 228 | assert_eq!(total_bytes_read, 930); 229 | } 230 | } 231 | -------------------------------------------------------------------------------- /storage-node/src/cache/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod data_store_cache; 2 | pub mod replacer; 3 | -------------------------------------------------------------------------------- /storage-node/src/cache/replacer/lru.rs: -------------------------------------------------------------------------------- 1 | use hashlink::linked_hash_map; 2 | use hashlink::LinkedHashMap; 3 | use log::{debug, warn}; 4 | 5 | use super::DataStoreReplacer; 6 | use super::ReplacerKey; 7 | use super::ReplacerValue; 8 | 9 | /// [`LruReplacer`] adopts the least-recently-used algorithm to cache sized 10 | /// objects. The replacer will start evicting if a new object comes that makes 11 | /// the replacer's size exceeds its max capacity, from the oldest to the newest. 
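///
/// A minimal usage sketch (hypothetical keys; the `(String, usize)` values mirror
/// the unit tests below, with `DataStoreReplacer` in scope and the `usize` being
/// the object size):
/// ```ignore
/// let mut replacer = LruReplacer::new(10);
/// replacer.put("key1".to_string(), ("value1".to_string(), 4));
/// assert!(replacer.get(&"key1".to_string()).is_some());
/// ```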
12 | pub struct LruReplacer { 13 | // usize is pin count 14 | cache_map: LinkedHashMap, 15 | max_capacity: usize, 16 | size: usize, 17 | } 18 | 19 | impl LruReplacer { 20 | pub fn new(max_capacity: usize) -> LruReplacer { 21 | LruReplacer { 22 | cache_map: LinkedHashMap::new(), 23 | max_capacity, 24 | size: 0, 25 | } 26 | } 27 | 28 | fn get_value(&mut self, key: &K) -> Option<&V> { 29 | match self.cache_map.raw_entry_mut().from_key(key) { 30 | linked_hash_map::RawEntryMut::Occupied(mut entry) => { 31 | entry.to_back(); 32 | Some(&entry.into_mut().0) 33 | } 34 | linked_hash_map::RawEntryMut::Vacant(_) => None, 35 | } 36 | } 37 | 38 | fn put_value(&mut self, key: K, value: V) -> Option> { 39 | if value.size() > self.max_capacity { 40 | // If the object size is greater than the max capacity, we do not insert the 41 | // object into the replacer. 42 | warn!("The size of the value is greater than the max capacity",); 43 | warn!( 44 | "Key: {:?}, Value: {:?}, Value size: {:?}, Max capacity: {:?}", 45 | key, 46 | value.as_value(), 47 | value.size(), 48 | self.max_capacity 49 | ); 50 | return None; 51 | } 52 | if let Some(cache_value) = self.cache_map.get(&key) { 53 | // If the key already exists, update the replacer size. 54 | self.size -= cache_value.0.size(); 55 | } 56 | let mut evicted_keys = Vec::new(); 57 | let mut iter = self.cache_map.iter(); 58 | let mut current_size = self.size; 59 | while (current_size + value.size()) > self.max_capacity { 60 | match iter.next() { 61 | Some((key, (value, pin_count))) => { 62 | if *pin_count > 0 { 63 | // If the key is pinned, we do not evict the key. 64 | continue; 65 | } 66 | evicted_keys.push(key.clone()); 67 | current_size -= value.size(); 68 | } 69 | None => { 70 | return None; 71 | } 72 | } 73 | } 74 | 75 | for key in &evicted_keys { 76 | if let Some(cache_value) = self.cache_map.remove(key) { 77 | debug!("-------- Evicting Key: {:?} --------", key); 78 | self.size -= cache_value.0.size(); 79 | } else { 80 | return None; 81 | } 82 | } 83 | 84 | self.size += value.size(); 85 | self.cache_map.insert(key.clone(), (value, 0)); 86 | Some(evicted_keys) 87 | } 88 | 89 | fn pin_key(&mut self, key: &K, count: usize) -> bool { 90 | match self.cache_map.get_mut(key) { 91 | Some((_, pin_count)) => { 92 | *pin_count += count; 93 | true 94 | } 95 | None => false, 96 | } 97 | } 98 | 99 | fn unpin_key(&mut self, key: &K) -> bool { 100 | match self.cache_map.get_mut(key) { 101 | Some((_, pin_count)) => { 102 | if *pin_count == 0 { 103 | return false; 104 | } 105 | *pin_count -= 1; 106 | true 107 | } 108 | None => false, 109 | } 110 | } 111 | 112 | fn peek_value(&self, key: &K) -> Option<&V> { 113 | match self.cache_map.get(key) { 114 | Some((value, _)) => Some(value), 115 | None => None, 116 | } 117 | } 118 | } 119 | 120 | impl DataStoreReplacer for LruReplacer { 121 | fn get(&mut self, key: &K) -> Option<&V> { 122 | self.get_value(key) 123 | } 124 | 125 | fn put(&mut self, key: K, value: V) -> Option> { 126 | self.put_value(key, value) 127 | } 128 | 129 | fn pin(&mut self, key: &K, count: usize) -> bool { 130 | self.pin_key(key, count) 131 | } 132 | 133 | fn unpin(&mut self, key: &K) -> bool { 134 | self.unpin_key(key) 135 | } 136 | 137 | fn peek(&self, key: &K) -> Option<&V> { 138 | self.peek_value(key) 139 | } 140 | 141 | fn len(&self) -> usize { 142 | self.cache_map.len() 143 | } 144 | 145 | fn is_empty(&self) -> bool { 146 | self.cache_map.is_empty() 147 | } 148 | 149 | fn size(&self) -> usize { 150 | self.size 151 | } 152 | 153 | fn max_capacity(&self) -> 
usize { 154 | self.max_capacity 155 | } 156 | 157 | fn set_max_capacity(&mut self, capacity: usize) { 158 | self.max_capacity = capacity; 159 | } 160 | 161 | fn clear(&mut self) { 162 | self.cache_map.clear(); 163 | self.size = 0; 164 | } 165 | } 166 | 167 | #[cfg(test)] 168 | mod tests { 169 | use crate::cache::replacer::{ 170 | tests::{ParpulseTestReplacerKey, ParpulseTestReplacerValue}, 171 | DataStoreReplacer, 172 | }; 173 | 174 | use super::LruReplacer; 175 | 176 | #[test] 177 | fn test_new() { 178 | let replacer = LruReplacer::::new(10); 179 | assert_eq!(replacer.max_capacity(), 10); 180 | assert_eq!(replacer.size(), 0); 181 | } 182 | 183 | #[test] 184 | fn test_peek_and_set() { 185 | let mut replacer = 186 | LruReplacer::::new(10); 187 | replacer.put("key1".to_string(), ("value1".to_string(), 1)); 188 | replacer.put("key2".to_string(), ("value2".to_string(), 2)); 189 | replacer.put("key3".to_string(), ("value3".to_string(), 3)); 190 | replacer.put("key4".to_string(), ("value4".to_string(), 4)); 191 | replacer.set_max_capacity(14); 192 | replacer.put("key5".to_string(), ("value5".to_string(), 5)); 193 | assert_eq!(replacer.peek(&"key1".to_string()), None); 194 | assert_eq!( 195 | replacer.peek(&"key2".to_string()), 196 | Some(&("value2".to_string(), 2)) 197 | ); 198 | assert_eq!( 199 | replacer.peek(&"key3".to_string()), 200 | Some(&("value3".to_string(), 3)) 201 | ); 202 | assert_eq!( 203 | replacer.peek(&"key4".to_string()), 204 | Some(&("value4".to_string(), 4)) 205 | ); 206 | assert_eq!( 207 | replacer.peek(&"key5".to_string()), 208 | Some(&("value5".to_string(), 5)) 209 | ); 210 | } 211 | 212 | #[test] 213 | fn test_put_different_keys() { 214 | let mut replacer = 215 | LruReplacer::::new(10); 216 | replacer.put("key1".to_string(), ("value1".to_string(), 1)); 217 | assert_eq!(replacer.size(), 1); 218 | replacer.put("key2".to_string(), ("value2".to_string(), 2)); 219 | assert_eq!(replacer.size(), 3); 220 | replacer.put("key3".to_string(), ("value3".to_string(), 3)); 221 | assert_eq!(replacer.size(), 6); 222 | replacer.put("key4".to_string(), ("value4".to_string(), 4)); 223 | assert_eq!(replacer.size(), 10); 224 | replacer.put("key5".to_string(), ("value5".to_string(), 5)); 225 | assert_eq!(replacer.size(), 9); // Only key4 and key5 are in the replacer 226 | assert_eq!(replacer.len(), 2); 227 | assert!(!replacer.is_empty()); 228 | replacer.clear(); 229 | assert!(replacer.is_empty()); 230 | assert_eq!(replacer.size(), 0); 231 | assert_eq!(replacer.len(), 0); 232 | } 233 | 234 | #[test] 235 | fn test_put_same_key() { 236 | let mut replacer = 237 | LruReplacer::::new(10); 238 | replacer.put("key1".to_string(), ("value1".to_string(), 1)); 239 | replacer.put("key1".to_string(), ("value2".to_string(), 2)); 240 | replacer.put("key1".to_string(), ("value3".to_string(), 3)); 241 | assert_eq!(replacer.len(), 1); 242 | assert_eq!(replacer.size(), 3); 243 | replacer.put("key1".to_string(), ("value4".to_string(), 100)); // Should not be inserted 244 | assert_eq!( 245 | replacer.get(&"key1".to_string()), 246 | Some(&("value3".to_string(), 3)) 247 | ); 248 | assert_eq!(replacer.get(&("key2".to_string())), None); 249 | } 250 | 251 | #[test] 252 | fn test_evict_pinned_key() { 253 | let mut replacer = 254 | LruReplacer::::new(10); 255 | replacer.put("key1".to_string(), ("value1".to_string(), 9)); 256 | assert!(replacer.pin(&"key1".to_string(), 1)); 257 | assert!(replacer 258 | .put("key2".to_string(), ("value2".to_string(), 2)) 259 | .is_none()); 260 | assert_eq!(replacer.size(), 9); 261 | 
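// "key1" is pinned a second time below, so both pins must be released (two
// unpin() calls) before the entry becomes evictable again.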
assert!(replacer.pin(&"key1".to_string(), 1)); 262 | assert!(replacer.unpin(&"key1".to_string())); 263 | assert!(replacer 264 | .put("key2".to_string(), ("value2".to_string(), 2)) 265 | .is_none()); 266 | assert!(replacer.unpin(&"key1".to_string())); 267 | assert!(replacer 268 | .put("key2".to_string(), ("value2".to_string(), 2)) 269 | .is_some()); 270 | assert_eq!(replacer.size(), 2); 271 | assert!(replacer.pin(&"key2".to_string(), 1)); 272 | replacer.put("key3".to_string(), ("value3".to_string(), 8)); 273 | assert_eq!(replacer.size(), 10); 274 | replacer.put("key4".to_string(), ("value4".to_string(), 7)); 275 | assert_eq!(replacer.size(), 9); 276 | assert!(replacer.get(&"key2".to_string()).is_some()); 277 | assert!(replacer.get(&"key4".to_string()).is_some()); 278 | assert!(replacer.get(&"key3".to_string()).is_none()); 279 | } 280 | } 281 | -------------------------------------------------------------------------------- /storage-node/src/cache/replacer/lru_k.rs: -------------------------------------------------------------------------------- 1 | /// LRU-K replacer implementation. 2 | /// Credit: https://doi.org/10.1145/170036.170081 3 | use log::{debug, warn}; 4 | use std::collections::HashMap; 5 | use std::collections::VecDeque; 6 | 7 | use super::DataStoreReplacer; 8 | use super::ReplacerKey; 9 | use super::ReplacerValue; 10 | 11 | type Timestamp = i32; 12 | 13 | /// Represents a node in the LRU-K replacer. 14 | /// 15 | /// Each node contains a value of type `V` and a history of timestamps. 16 | /// The history is stored as a `VecDeque`, where the most recent 17 | /// timestamps are at the front of the deque. 18 | struct LruKNode { 19 | value: V, 20 | history: VecDeque, 21 | pin_count: usize, 22 | } 23 | 24 | /// Represents an LRU-K replacer. 25 | /// 26 | /// The LRU-K algorithm evicts a node whose backward k-distance is maximum of all 27 | /// nodes. Backward k-distance is computed as the difference in time between current 28 | /// timestamp and the timestamp of kth previous access. A node with fewer than k 29 | /// historical accesses is given +inf as its backward k-distance. When multiple nodes 30 | /// have +inf backward k-distance, the replacer evicts the node with the earliest 31 | /// overall timestamp (i.e., the frame whose least-recent recorded access is the 32 | /// overall least recent access, overall, out of all nodes). 
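///
/// A minimal sketch of the eviction order with `k = 2` (hypothetical keys; the
/// `(String, usize)` values mirror the unit tests below, with `DataStoreReplacer`
/// in scope):
/// ```ignore
/// let mut replacer = LruKReplacer::new(2, 2); // capacity 2, k = 2
/// replacer.put("a".to_string(), ("va".to_string(), 1)); // "a" history: [0]
/// replacer.put("b".to_string(), ("vb".to_string(), 1)); // "b" history: [1]
/// replacer.get(&"a".to_string());                       // "a" history: [0, 2]
/// // "b" has fewer than k accesses (+inf backward k-distance), so it is evicted
/// // when the next insert would exceed the capacity.
/// replacer.put("c".to_string(), ("vc".to_string(), 1));
/// assert!(replacer.peek(&"b".to_string()).is_none());
/// ```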
33 | pub struct LruKReplacer { 34 | cache_map: HashMap>, 35 | max_capacity: usize, 36 | size: usize, 37 | curr_timestamp: Timestamp, 38 | k: usize, // The k value for LRU-K 39 | } 40 | 41 | impl LruKReplacer { 42 | pub fn new(max_capacity: usize, k: usize) -> LruKReplacer { 43 | LruKReplacer { 44 | cache_map: HashMap::new(), 45 | max_capacity, 46 | size: 0, 47 | curr_timestamp: 0, 48 | k, 49 | } 50 | } 51 | 52 | fn evict(&mut self, new_key: &K) -> Option { 53 | let mut found = false; 54 | let mut max_k_dist = 0; 55 | let mut k_dist; 56 | let mut earliest_timestamp = 0; 57 | let mut key_to_evict: Option = None; 58 | for (key, node) in self.cache_map.iter() { 59 | if key == new_key { 60 | continue; 61 | } 62 | let history = &node.history; 63 | if let Some(kth_timestamp) = history.front() { 64 | k_dist = if history.len() < self.k { 65 | std::i32::MAX 66 | } else { 67 | self.curr_timestamp - kth_timestamp 68 | }; 69 | if ((k_dist > max_k_dist) 70 | || (k_dist == max_k_dist && kth_timestamp < &earliest_timestamp)) 71 | && node.pin_count == 0 72 | { 73 | found = true; 74 | max_k_dist = k_dist; 75 | earliest_timestamp = *kth_timestamp; 76 | key_to_evict = Some(key.clone()); 77 | } 78 | } 79 | } 80 | if found { 81 | if let Some(key) = key_to_evict { 82 | // TODO: Should have better logging 83 | debug!("-------- Evicting Key: {:?} --------", key); 84 | if let Some(node) = self.cache_map.remove(&key) { 85 | self.size -= node.value.size(); 86 | } 87 | return Some(key); 88 | } 89 | } 90 | None 91 | } 92 | 93 | fn record_access(&mut self, node: &mut LruKNode) { 94 | node.history.push_back(self.curr_timestamp); 95 | if node.history.len() > self.k { 96 | node.history.pop_front(); 97 | } 98 | self.curr_timestamp += 1; 99 | } 100 | 101 | fn get_value(&mut self, key: &K) -> Option<&V> { 102 | if let Some(mut node) = self.cache_map.remove(key) { 103 | self.record_access(&mut node); 104 | self.cache_map.insert(key.clone(), node); 105 | return self.cache_map.get(key).map(|node| &node.value); 106 | } 107 | None 108 | } 109 | 110 | fn put_value(&mut self, key: K, value: V) -> Option> { 111 | if value.size() > self.max_capacity { 112 | // If the object size is greater than the max capacity, we do not insert the 113 | // object into the replacer. 
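// Returning `None` here (and below, when eviction cannot free enough space)
// signals the caller that nothing was inserted.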
114 | warn!("The size of the value is greater than the max capacity",); 115 | warn!( 116 | "Key: {:?}, Value: {:?}, Value size: {:?}, Max capacity: {:?}", 117 | key, 118 | value.as_value(), 119 | value.size(), 120 | self.max_capacity 121 | ); 122 | return None; 123 | } 124 | let updated_size = value.size(); 125 | let mut new_history: VecDeque = VecDeque::new(); 126 | if let Some(mut node) = self.cache_map.remove(&key) { 127 | self.record_access(&mut node); 128 | self.size -= node.value.size(); 129 | new_history = node.history; 130 | } else { 131 | new_history.push_back(self.curr_timestamp); 132 | self.curr_timestamp += 1; 133 | } 134 | let mut evicted_keys = Vec::new(); 135 | while (self.size + updated_size) > self.max_capacity { 136 | let key_to_evict = self.evict(&key); 137 | // If key_to_evict is none, return none 138 | key_to_evict.as_ref()?; 139 | if let Some(evicted_key) = key_to_evict { 140 | evicted_keys.push(evicted_key); 141 | } 142 | } 143 | self.cache_map.insert( 144 | key.clone(), 145 | LruKNode { 146 | value, 147 | history: new_history, 148 | pin_count: 0, 149 | }, 150 | ); 151 | self.size += updated_size; 152 | Some(evicted_keys) 153 | } 154 | 155 | fn pin_value(&mut self, key: &K, count: usize) -> bool { 156 | match self.cache_map.get_mut(key) { 157 | Some(node) => { 158 | node.pin_count += count; 159 | true 160 | } 161 | None => false, 162 | } 163 | } 164 | 165 | fn unpin_value(&mut self, key: &K) -> bool { 166 | match self.cache_map.get_mut(key) { 167 | Some(node) => { 168 | if node.pin_count == 0 { 169 | return false; 170 | } 171 | node.pin_count -= 1; 172 | true 173 | } 174 | None => false, 175 | } 176 | } 177 | 178 | fn peek_value(&self, key: &K) -> Option<&V> { 179 | if let Some(node) = self.cache_map.get(key) { 180 | let cache_value = &node.value; 181 | Some(cache_value) 182 | } else { 183 | None 184 | } 185 | } 186 | 187 | #[allow(dead_code)] 188 | fn current_timestamp(&self) -> Timestamp { 189 | self.curr_timestamp 190 | } 191 | } 192 | 193 | impl DataStoreReplacer for LruKReplacer { 194 | fn get(&mut self, key: &K) -> Option<&V> { 195 | self.get_value(key) 196 | } 197 | 198 | fn put(&mut self, key: K, value: V) -> Option> { 199 | self.put_value(key, value) 200 | } 201 | 202 | fn pin(&mut self, key: &K, count: usize) -> bool { 203 | self.pin_value(key, count) 204 | } 205 | 206 | fn unpin(&mut self, key: &K) -> bool { 207 | self.unpin_value(key) 208 | } 209 | 210 | fn peek(&self, key: &K) -> Option<&V> { 211 | self.peek_value(key) 212 | } 213 | 214 | fn len(&self) -> usize { 215 | self.cache_map.len() 216 | } 217 | 218 | fn is_empty(&self) -> bool { 219 | self.cache_map.is_empty() 220 | } 221 | 222 | fn size(&self) -> usize { 223 | self.size 224 | } 225 | 226 | fn max_capacity(&self) -> usize { 227 | self.max_capacity 228 | } 229 | 230 | fn set_max_capacity(&mut self, capacity: usize) { 231 | self.max_capacity = capacity; 232 | } 233 | 234 | fn clear(&mut self) { 235 | self.cache_map.clear(); 236 | self.size = 0; 237 | } 238 | } 239 | 240 | #[cfg(test)] 241 | mod tests { 242 | use crate::cache::replacer::{ 243 | tests::{ParpulseTestReplacerKey, ParpulseTestReplacerValue}, 244 | DataStoreReplacer, 245 | }; 246 | 247 | use super::LruKReplacer; 248 | 249 | #[test] 250 | fn test_new() { 251 | let mut replacer = 252 | LruKReplacer::::new(10, 2); 253 | assert_eq!(replacer.max_capacity(), 10); 254 | assert_eq!(replacer.size(), 0); 255 | replacer.set_max_capacity(20); 256 | assert_eq!(replacer.max_capacity(), 20); 257 | } 258 | 259 | #[test] 260 | fn test_peek_and_set() { 
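// peek() should return the cached value without recording an access
// (unlike get(), which updates the LRU-K history).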
261 | let mut replacer = 262 | LruKReplacer::::new(10, 2); 263 | let key = "key1".to_string(); 264 | let value = "value1".to_string(); 265 | assert_eq!(replacer.peek(&key), None); 266 | assert!(replacer.put(key.clone(), (value.clone(), 1)).is_some()); 267 | assert_eq!(replacer.peek(&key), Some(&(value.clone(), 1))); 268 | assert_eq!(replacer.len(), 1); 269 | assert_eq!(replacer.size(), 1); 270 | assert!(!replacer.is_empty()); 271 | replacer.clear(); 272 | assert!(replacer.is_empty()); 273 | } 274 | 275 | #[test] 276 | fn test_evict() { 277 | let mut replacer = 278 | LruKReplacer::::new(13, 2); 279 | let key1 = "key1".to_string(); 280 | let key2 = "key2".to_string(); 281 | let key3 = "key3".to_string(); 282 | let key4 = "key4".to_string(); 283 | let key5 = "key5".to_string(); 284 | let value1 = "value1".to_string(); 285 | let value2 = "value2".to_string(); 286 | let value3 = "value3".to_string(); 287 | let value4 = "value4".to_string(); 288 | let value5 = "value5".to_string(); 289 | replacer.put(key1.clone(), (value1.clone(), 1)); 290 | replacer.put(key2.clone(), (value2.clone(), 2)); 291 | replacer.put(key3.clone(), (value3.clone(), 3)); 292 | replacer.put(key4.clone(), (value4.clone(), 4)); 293 | assert_eq!(replacer.current_timestamp(), 4); 294 | assert_eq!(replacer.get(&key3), Some(&(value3.clone(), 3))); 295 | assert_eq!(replacer.get(&key4), Some(&(value4.clone(), 4))); 296 | assert_eq!(replacer.get(&key1), Some(&(value1.clone(), 1))); 297 | assert_eq!(replacer.get(&key2), Some(&(value2.clone(), 2))); 298 | assert_eq!(replacer.current_timestamp(), 8); 299 | // Now the kth (i.e. 2nd) order from old to new is [1, 2, 3, 4] 300 | replacer.put(key5.clone(), (value5.clone(), 4)); 301 | assert_eq!(replacer.get(&key1), None); // key1 should be evicted 302 | 303 | assert_eq!(replacer.get(&key2), Some(&(value2.clone(), 2))); 304 | assert_eq!(replacer.get(&key4), Some(&(value4.clone(), 4))); 305 | assert_eq!(replacer.get(&key3), Some(&(value3.clone(), 3))); 306 | assert_eq!(replacer.get(&key5), Some(&(value5.clone(), 4))); 307 | // Now the kth (i.e. 
2nd) order from old to new is [3, 4, 2, 5] 308 | replacer.put(key1.clone(), (value1.clone(), 1)); 309 | assert_eq!(replacer.get(&key3), None); // key3 should be evicted 310 | assert_eq!(replacer.current_timestamp(), 14); // When get fails, the timestamp should not be updated 311 | } 312 | 313 | #[test] 314 | fn test_infinite() { 315 | let mut replacer = 316 | LruKReplacer::::new(6, 2); 317 | let key1 = "key1".to_string(); 318 | let key2 = "key2".to_string(); 319 | let key3 = "key3".to_string(); 320 | let key4 = "key4".to_string(); 321 | let value1 = "value1".to_string(); 322 | let value2 = "value2".to_string(); 323 | let value3 = "value3".to_string(); 324 | let value4 = "value4".to_string(); 325 | replacer.put(key1.clone(), (value1.clone(), 1)); 326 | replacer.put(key2.clone(), (value2.clone(), 2)); 327 | replacer.put(key3.clone(), (value3.clone(), 3)); 328 | replacer.put(key4.clone(), (value4.clone(), 4)); 329 | assert_eq!(replacer.current_timestamp(), 4); 330 | assert_eq!(replacer.get(&key1), None); // Key1 should be evicted as it has infinite k distance and the earliest overall timestamp, same for key2 and key3 331 | assert_eq!(replacer.get(&key2), None); 332 | assert_eq!(replacer.get(&key3), None); 333 | assert_eq!(replacer.size(), 4); // Only key4 should be in the replacer 334 | } 335 | 336 | #[test] 337 | fn test_put_same_key() { 338 | let mut replacer = 339 | LruKReplacer::::new(10, 2); 340 | replacer.put("key1".to_string(), ("value1".to_string(), 1)); 341 | replacer.put("key1".to_string(), ("value2".to_string(), 2)); 342 | replacer.put("key1".to_string(), ("value3".to_string(), 3)); 343 | replacer.put("key1".to_string(), ("value3".to_string(), 4)); 344 | assert_eq!(replacer.len(), 1); 345 | assert_eq!(replacer.size(), 4); 346 | replacer.put("key1".to_string(), ("value4".to_string(), 100)); // Should not be inserted 347 | assert_eq!( 348 | replacer.get(&"key1".to_string()), 349 | Some(&("value3".to_string(), 4)) 350 | ); 351 | assert_eq!(replacer.get(&("key2".to_string())), None); 352 | } 353 | 354 | #[test] 355 | fn test_evict_pinned_key() { 356 | let mut replacer = 357 | LruKReplacer::::new(10, 2); 358 | replacer.put("key1".to_string(), ("value1".to_string(), 9)); 359 | assert!(replacer.pin(&"key1".to_string(), 1)); 360 | assert!(replacer 361 | .put("key2".to_string(), ("value2".to_string(), 2)) 362 | .is_none()); 363 | assert_eq!(replacer.size(), 9); 364 | assert!(replacer.pin(&"key1".to_string(), 1)); 365 | assert!(replacer.unpin(&"key1".to_string())); 366 | assert!(replacer 367 | .put("key2".to_string(), ("value2".to_string(), 2)) 368 | .is_none()); 369 | assert!(replacer.unpin(&"key1".to_string())); 370 | assert!(replacer 371 | .put("key2".to_string(), ("value2".to_string(), 2)) 372 | .is_some()); 373 | assert_eq!(replacer.size(), 2); 374 | assert!(replacer.pin(&"key2".to_string(), 1)); 375 | replacer.put("key3".to_string(), ("value3".to_string(), 8)); 376 | assert_eq!(replacer.size(), 10); 377 | replacer.put("key4".to_string(), ("value4".to_string(), 7)); 378 | assert_eq!(replacer.size(), 9); 379 | assert!(replacer.get(&"key2".to_string()).is_some()); 380 | assert!(replacer.get(&"key4".to_string()).is_some()); 381 | assert!(replacer.get(&"key3".to_string()).is_none()); 382 | } 383 | } 384 | -------------------------------------------------------------------------------- /storage-node/src/cache/replacer/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod lru; 2 | pub mod lru_k; 3 | use std::fmt::Debug; 4 | use std::hash::Hash; 5 
| 6 | /// [`ReplacerKey`] is the key type for data store replacers using different 7 | /// policies in the system. 8 | pub trait ReplacerKey: Hash + Eq + Clone + Debug + Send + Sync {} 9 | impl ReplacerKey for T {} 10 | /// [`ReplacerValue`] is the value type for data store caches using different 11 | /// policies in the system. 12 | /// It might represent a logical object and we can get the actual size for this 13 | /// logical object by calling `size()`. 14 | pub trait ReplacerValue: Send + Sync { 15 | type Value: Debug; 16 | 17 | fn into_value(self) -> Self::Value; 18 | fn as_value(&self) -> &Self::Value; 19 | fn size(&self) -> usize; 20 | } 21 | 22 | /// [`DataStoreReplacer`] records objects' locations in the data store. For example, we cache 23 | /// the contents of s3's remote object `userdata.parquet` in the local disk. Then we may 24 | /// store the local file system path of `userdata.parquet` in `DataStoreCache`. By querying 25 | /// `DataStoreCache`, we can get the local file system path of `userdata.parquet` and read the 26 | /// contents from the local disk. 27 | /// 28 | /// There are different policies for the data store replacer, such as LRU, LRU-K, etc. See 29 | /// other files in this module for more details. 30 | pub trait DataStoreReplacer: Send + Sync { 31 | /// Gets a value from the replacer. Might has side effect on the replacer (e.g. 32 | /// modifying some bookkeeping fields in the replacer). 33 | fn get(&mut self, key: &K) -> Option<&V>; 34 | 35 | /// Puts a value into the replacer. 36 | /// Returns `None`: insertion failed. 37 | /// Returns `Some`: insertion successful with a list of keys that are evicted from the cache. 38 | fn put(&mut self, key: K, value: V) -> Option>; 39 | 40 | fn pin(&mut self, key: &K, count: usize) -> bool; 41 | 42 | fn unpin(&mut self, key: &K) -> bool; 43 | 44 | /// Returns a reference to the value in the replacer with no side effect on the 45 | /// replacer. 46 | fn peek(&self, key: &K) -> Option<&V>; 47 | 48 | /// Returns the number of the objects in the replacer. 49 | fn len(&self) -> usize; 50 | 51 | /// Returns the total size of the objects in the replacer. 
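/// This is the sum of `ReplacerValue::size()` over all entries (a byte count for
/// the replacers in this crate), as opposed to `len()`, which counts entries.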
52 | fn size(&self) -> usize; 53 | 54 | fn is_empty(&self) -> bool; 55 | 56 | fn max_capacity(&self) -> usize; 57 | 58 | fn set_max_capacity(&mut self, capacity: usize); 59 | 60 | fn clear(&mut self); 61 | } 62 | 63 | #[cfg(test)] 64 | mod tests { 65 | use super::ReplacerValue; 66 | 67 | pub type ParpulseTestReplacerKey = String; 68 | pub type ParpulseTestReplacerValue = (String, usize); 69 | 70 | impl ReplacerValue for ParpulseTestReplacerValue { 71 | type Value = String; 72 | 73 | fn into_value(self) -> Self::Value { 74 | self.0 75 | } 76 | 77 | fn as_value(&self) -> &Self::Value { 78 | &self.0 79 | } 80 | 81 | fn size(&self) -> usize { 82 | self.1 83 | } 84 | } 85 | } 86 | -------------------------------------------------------------------------------- /storage-node/src/common/config.rs: -------------------------------------------------------------------------------- 1 | use clap::Parser; 2 | use serde::Serialize; 3 | 4 | #[derive(clap::ValueEnum, Clone, Default, Debug, Serialize)] 5 | pub enum ParpulseConfigDataStore { 6 | #[default] 7 | Memdisk, 8 | Disk, 9 | Sqlite, 10 | } 11 | 12 | #[derive(clap::ValueEnum, Clone, Default, Debug, Serialize)] 13 | pub enum ParpulseConfigCachePolicy { 14 | #[default] 15 | Lru, 16 | Lruk, 17 | } 18 | 19 | #[derive(Parser, Default)] 20 | pub struct ParpulseConfig { 21 | #[clap(long, default_value_t, value_enum)] 22 | pub cache_policy: ParpulseConfigCachePolicy, 23 | 24 | #[clap(long, default_value = None)] 25 | pub cache_lru_k: Option, 26 | 27 | #[clap(long, default_value_t, value_enum)] 28 | pub data_store: ParpulseConfigDataStore, 29 | 30 | #[clap( long, default_value = None)] 31 | pub data_store_cache_num: Option, 32 | 33 | #[clap(long, default_value = None)] 34 | pub mem_cache_size: Option, 35 | 36 | #[clap(long, default_value = None)] 37 | pub mem_cache_file_size: Option, 38 | 39 | #[clap(long, default_value = None)] 40 | pub disk_cache_size: Option, 41 | 42 | #[clap(long, default_value = None)] 43 | pub sqlite_cache_size: Option, 44 | 45 | #[clap(long, default_value = None)] 46 | pub cache_path: Option, 47 | 48 | #[clap(long, default_value = None)] 49 | pub max_disk_reader_buffer_size: Option, 50 | 51 | #[clap(long, default_value = None)] 52 | pub sqlite_blob_reader_buffer_size: Option, 53 | } 54 | -------------------------------------------------------------------------------- /storage-node/src/common/hash.rs: -------------------------------------------------------------------------------- 1 | use std::hash::Hasher; 2 | 3 | pub fn calculate_hash_default(data: &[u8]) -> usize { 4 | let mut hasher = std::collections::hash_map::DefaultHasher::new(); 5 | hasher.write(data); 6 | hasher.finish() as usize 7 | } 8 | 9 | pub fn calculate_hash_crc32fast(data: &[u8]) -> usize { 10 | let mut hasher = crc32fast::Hasher::new(); 11 | hasher.update(data); 12 | hasher.finalize() as usize 13 | } 14 | -------------------------------------------------------------------------------- /storage-node/src/common/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod config; 2 | pub mod hash; 3 | -------------------------------------------------------------------------------- /storage-node/src/disk/disk_manager.rs: -------------------------------------------------------------------------------- 1 | use bytes::Bytes; 2 | use futures::stream::StreamExt; 3 | use futures::{future::TryFutureExt, join}; 4 | 5 | use std::future::IntoFuture; 6 | use std::io::SeekFrom; 7 | 8 | use std::path::{Path, PathBuf}; 9 | use std::pin::Pin; 10 | 
11 | use tokio::fs::{self, File, OpenOptions}; 12 | use tokio::io::AsyncSeekExt; 13 | use tokio::io::{self, AsyncReadExt, AsyncWriteExt}; 14 | 15 | use crate::error::{ParpulseError, ParpulseResult}; 16 | use crate::storage_reader::StorageReaderStream; 17 | 18 | use super::stream::DiskReadStream; 19 | 20 | /// [`DiskManager`] is responsible for reading and writing data to disk. The default 21 | /// version is async. We keep this struct to add lock. 22 | /// 23 | /// TODO: Do we need to put disk_root_path into DiskManager? 24 | #[derive(Default)] 25 | pub struct DiskManager {} 26 | 27 | impl DiskManager { 28 | pub async fn open_or_create(&self, path: &str, append: bool) -> ParpulseResult { 29 | let path_buf: PathBuf = PathBuf::from(path); 30 | if let Some(parent) = path_buf.parent() { 31 | if !parent.exists() { 32 | fs::create_dir_all(parent).await?; 33 | } 34 | } 35 | let mut options = OpenOptions::new(); 36 | options.write(true); 37 | if !path_buf.exists() { 38 | options.create(true); 39 | } 40 | options.append(append); 41 | Ok(options.open(&path_buf).await?) 42 | } 43 | 44 | pub async fn write_disk_all(&self, path: &str, content: &[u8]) -> ParpulseResult<()> { 45 | let mut file = self.open_or_create(path, false).await?; 46 | file.write_all(content).await?; 47 | Ok(file.flush().await?) 48 | } 49 | 50 | pub async fn read_disk_all(&self, path: &str) -> ParpulseResult<(usize, Bytes)> { 51 | let mut file = File::open(path).await?; 52 | let mut buffer = Vec::with_capacity(file.metadata().await?.len() as usize); 53 | 54 | let bytes_read = file.read_to_end(&mut buffer).await?; 55 | Ok((bytes_read, Bytes::from(buffer))) 56 | } 57 | 58 | pub async fn read_disk( 59 | &self, 60 | path: &str, 61 | start_pos: u64, 62 | bytes_to_read: usize, 63 | ) -> ParpulseResult<(usize, Bytes)> { 64 | let mut file = File::open(path).await?; 65 | file.seek(SeekFrom::Start(start_pos)).await?; 66 | 67 | let mut buffer = vec![0; bytes_to_read]; 68 | let bytes_read = file.read(&mut buffer).await?; 69 | buffer.truncate(bytes_read); 70 | Ok((bytes_read, Bytes::from(buffer))) 71 | } 72 | 73 | // If needs to record statistics, use disk_read_stream, if not, please directly new DiskReadStream. 74 | pub async fn disk_read_stream( 75 | &self, 76 | path: &str, 77 | buffer_size: usize, 78 | ) -> ParpulseResult>> { 79 | let disk_read_stream = DiskReadStream::new(path, buffer_size).await?; 80 | Ok(Box::pin(disk_read_stream)) 81 | } 82 | 83 | /// This function will try to **first** write `bytes_vec` to disk if applicable, and write all the (remaining) 84 | /// data polled from the `stream` to disk. The function will return the total bytes written to disk. 85 | /// 86 | /// Note these in current implementation: 87 | /// 1. When writing evicted data from memory cache, bytes_vec should be Some and stream should be None. 88 | /// 2. When memory cache is disabled, bytes_vec should be None and stream should be Some. 89 | /// 3. When writing data which cannot be written to memory cache, both bytes_vec and stream should be Some. 90 | /// 91 | /// FIXME: disk_path should not exist, otherwise throw an error 92 | /// TODO(lanlou): we must handle write-write conflict correctly in the future. 93 | /// One way is using `write commit` to handle read-write conflict, then there is no w-w conflict. 94 | /// TODO(lanlou): We need to write data to disk & send data to network at the same time. 
95 | /// TODO(lanlou): S3 stream now returns 10^5 bytes one time, and do we need to group all the bytes for 96 | /// one file and write all of them to disk at once? 97 | pub async fn write_bytes_and_stream_to_disk( 98 | &self, 99 | bytes_vec: Option>, 100 | stream: Option, 101 | disk_path: &str, 102 | ) -> ParpulseResult { 103 | if Path::new(disk_path).exists() { 104 | return Err(io::Error::new( 105 | io::ErrorKind::AlreadyExists, 106 | "disk file to write already exists", 107 | ) 108 | .into()); 109 | } 110 | let mut file = self.open_or_create(disk_path, true).await?; 111 | let mut bytes_written = 0; 112 | 113 | if let Some(bytes_vec) = bytes_vec { 114 | for bytes in bytes_vec { 115 | file.write_all(&bytes).await?; 116 | bytes_written += bytes.len(); 117 | } 118 | } 119 | 120 | if let Some(mut stream) = stream { 121 | let bytes_cur = stream.next().await; 122 | if bytes_cur.is_none() { 123 | file.flush().await?; 124 | return Ok(bytes_written); 125 | } 126 | let mut bytes_cur = bytes_cur.unwrap()?; 127 | loop { 128 | let disk_write_fut = TryFutureExt::into_future(file.write_all(&bytes_cur)); 129 | let bytes_next_fut = stream.next().into_future(); 130 | match join!(disk_write_fut, bytes_next_fut) { 131 | (Ok(_), Some(Ok(bytes_next))) => { 132 | bytes_written += bytes_cur.len(); 133 | bytes_cur = bytes_next; 134 | } 135 | (Ok(_), None) => { 136 | bytes_written += bytes_cur.len(); 137 | break; 138 | } 139 | (Err(e), _) => return Err(ParpulseError::Disk(e)), 140 | (Ok(_), Some(Err(e))) => return Err(e), 141 | } 142 | } 143 | } 144 | // FIXME: do we need a flush here? 145 | file.flush().await?; 146 | Ok(bytes_written) 147 | } 148 | 149 | pub async fn file_size(&self, path: &str) -> ParpulseResult { 150 | let metadata = fs::metadata(path).await?; 151 | Ok(metadata.len()) 152 | } 153 | 154 | pub fn file_size_sync(&self, path: &str) -> ParpulseResult { 155 | let metadata = std::fs::metadata(path)?; 156 | Ok(metadata.len()) 157 | } 158 | 159 | pub async fn remove_file(&self, path: &str) -> ParpulseResult<()> { 160 | Ok(fs::remove_file(path).await?) 161 | } 162 | } 163 | 164 | #[cfg(test)] 165 | mod tests { 166 | use crate::disk::stream::RandomDiskReadStream; 167 | 168 | use super::*; 169 | #[tokio::test] 170 | async fn test_simple_write_read() { 171 | let disk_manager = DiskManager {}; 172 | let tmp = tempfile::tempdir().unwrap(); 173 | let dir = tmp.path().to_owned(); 174 | let path = &dir.join("test_disk_manager1.txt").display().to_string(); 175 | let content = "Hello, world!"; 176 | disk_manager 177 | .write_disk_all(path, content.as_bytes()) 178 | .await 179 | .expect("write_disk_all failed"); 180 | let mut file = disk_manager 181 | .open_or_create(path, true) 182 | .await 183 | .expect("open_or_create failed"); 184 | file.write_all(content.as_bytes()).await.unwrap(); 185 | // Without this code, this test will fail sometimes. 186 | // But even if we add this code, this test is not likely to fail in the sync version. 
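// (tokio's `File` performs the write on a background blocking task, so
// `flush().await` is what guarantees the bytes are in the file before we read
// it back.)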
187 | file.flush().await.unwrap(); 188 | 189 | let file_size = disk_manager 190 | .file_size(path) 191 | .await 192 | .expect("file_size failed"); 193 | assert_eq!(file_size, 2 * content.len() as u64); 194 | 195 | let (bytes_read, bytes) = disk_manager 196 | .read_disk_all(path) 197 | .await 198 | .expect("read_disk_all failed"); 199 | assert_eq!(bytes_read, 2 * content.len()); 200 | assert_eq!(bytes, Bytes::from(content.to_owned() + content)); 201 | 202 | let (bytes_read, bytes) = disk_manager 203 | .read_disk(path, content.len() as u64, content.len()) 204 | .await 205 | .expect("read_disk_all failed"); 206 | assert_eq!(bytes_read, content.len()); 207 | assert_eq!(bytes, Bytes::from(content)); 208 | } 209 | 210 | #[tokio::test] 211 | async fn test_iterator_read() { 212 | let disk_manager = DiskManager {}; 213 | let tmp = tempfile::tempdir().unwrap(); 214 | let dir = tmp.path().to_owned(); 215 | let path = &dir.join("test_disk_manager2.txt").display().to_string(); 216 | let content = "bhjoilkmnkbhaoijsdklmnjkbhiauosdjikbhjoilkmnkbhaoijsdklmnjkbhiauosdjik"; 217 | disk_manager 218 | .write_disk_all(path, content.as_bytes()) 219 | .await 220 | .expect("write_disk_all failed"); 221 | let mut stream = disk_manager 222 | .disk_read_stream(path, 2) 223 | .await 224 | .expect("disk_read_iterator failed"); 225 | let mut start_pos = 0; 226 | loop { 227 | if start_pos >= content.len() { 228 | break; 229 | } 230 | let bytes_read = stream 231 | .next() 232 | .await 233 | .expect("iterator early ended") 234 | .expect("iterator read failed"); 235 | let buffer = stream.buffer(); 236 | assert_eq!( 237 | &content.as_bytes()[start_pos..start_pos + bytes_read], 238 | &buffer[..bytes_read] 239 | ); 240 | start_pos += bytes_read; 241 | } 242 | assert_eq!(start_pos, content.len()); 243 | } 244 | 245 | #[tokio::test] 246 | async fn test_write_reader_to_disk() { 247 | let disk_manager = DiskManager {}; 248 | let tmp = tempfile::tempdir().unwrap(); 249 | let dir = tmp.path().to_owned(); 250 | let path = &dir.join("test_disk_manager3.txt").display().to_string(); 251 | let content = "bhjoilkmnkbhaoijsdklmnjkbhiauosdjikbhjoilkmnkbhaoijsdklmnjkbhiauosdjik"; 252 | disk_manager 253 | .write_disk_all(path, content.as_bytes()) 254 | .await 255 | .expect("write_disk_all failed"); 256 | let stream = RandomDiskReadStream::new(path, 2, 4).unwrap().boxed(); 257 | let output_path = &dir 258 | .join("test_disk_manager3_output.txt") 259 | .display() 260 | .to_string(); 261 | let bytes_written = disk_manager 262 | .write_bytes_and_stream_to_disk(None, Some(stream), output_path) 263 | .await 264 | .expect("write_reader_to_disk failed"); 265 | assert_eq!(bytes_written, content.len()); 266 | 267 | let (bytes_read, bytes) = disk_manager 268 | .read_disk_all(output_path) 269 | .await 270 | .expect("read_disk_all failed"); 271 | assert_eq!(bytes_read, content.len()); 272 | assert_eq!(bytes, Bytes::from(content)); 273 | let file_size = disk_manager 274 | .file_size(output_path) 275 | .await 276 | .expect("file_size failed"); 277 | assert_eq!(file_size, content.len() as u64); 278 | } 279 | 280 | #[tokio::test] 281 | async fn test_write_bytes_to_disk() { 282 | let disk_manager = DiskManager {}; 283 | let tmp = tempfile::tempdir().unwrap(); 284 | let dir = tmp.path().to_owned(); 285 | let path = &dir.join("test_disk_manager4.txt").display().to_string(); 286 | let content1 = "Hello, world!"; 287 | let content2 = "Bye, CMU!"; 288 | let bytes_written = disk_manager 289 | .write_bytes_and_stream_to_disk( 290 | Some(vec![Bytes::from(content1), 
Bytes::from(content2)]), 291 | None, 292 | path, 293 | ) 294 | .await 295 | .expect("write_bytes_to_disk failed"); 296 | assert_eq!(bytes_written, content1.len() + content2.len()); 297 | let (bytes_read, bytes) = disk_manager 298 | .read_disk_all(path) 299 | .await 300 | .expect("read_disk_all failed"); 301 | assert_eq!(bytes_read, content1.len() + content2.len()); 302 | assert_eq!(bytes, Bytes::from(content1.to_owned() + content2)); 303 | } 304 | 305 | #[tokio::test] 306 | async fn test_write_bytes_and_stream_to_disk() { 307 | let disk_manager = DiskManager {}; 308 | let tmp = tempfile::tempdir().unwrap(); 309 | let dir = tmp.path().to_owned(); 310 | let path = &dir.join("test_disk_manager5.txt").display().to_string(); 311 | let content = "bhjoilkmnkbhaoijsdklmnjkbhiauosdjikbhjoilkmnkbhaoijsdklmnjkbhiauosdjik"; 312 | disk_manager 313 | .write_disk_all(path, content.as_bytes()) 314 | .await 315 | .expect("write_disk_all failed"); 316 | let mut stream = RandomDiskReadStream::new(path, 2, 4).unwrap().boxed(); 317 | 318 | let mut bytes_vec: Vec = Vec::new(); 319 | for _ in 0..3 { 320 | let stream_data = stream.next().await.unwrap().unwrap(); 321 | bytes_vec.push(stream_data); 322 | } 323 | 324 | let output_path = &dir 325 | .join("test_disk_manager5_output.txt") 326 | .display() 327 | .to_string(); 328 | let bytes_written = disk_manager 329 | .write_bytes_and_stream_to_disk(Some(bytes_vec), Some(stream), output_path) 330 | .await 331 | .expect("write_reader_to_disk failed"); 332 | assert_eq!(bytes_written, content.len()); 333 | 334 | let (bytes_read, bytes) = disk_manager 335 | .read_disk_all(output_path) 336 | .await 337 | .expect("read_disk_all failed"); 338 | assert_eq!(bytes_read, content.len()); 339 | assert_eq!(bytes, Bytes::from(content)); 340 | let file_size = disk_manager 341 | .file_size(output_path) 342 | .await 343 | .expect("file_size failed"); 344 | assert_eq!(file_size, content.len() as u64); 345 | } 346 | 347 | #[tokio::test] 348 | async fn test_remove_file() { 349 | let disk_manager = DiskManager {}; 350 | let tmp = tempfile::tempdir().unwrap(); 351 | let dir = tmp.path().to_owned(); 352 | let path = &dir.join("test_disk_manager6.txt").display().to_string(); 353 | let content = "Hello, world!"; 354 | disk_manager 355 | .write_disk_all(path, content.as_bytes()) 356 | .await 357 | .expect("write_disk_all failed"); 358 | disk_manager 359 | .remove_file(path) 360 | .await 361 | .expect("remove_file failed"); 362 | assert!(!Path::new(path).exists()); 363 | } 364 | } 365 | -------------------------------------------------------------------------------- /storage-node/src/disk/disk_manager_sync.rs: -------------------------------------------------------------------------------- 1 | use bytes::{Bytes, BytesMut}; 2 | use std::fs::{self, File, OpenOptions}; 3 | use std::io::{self, Read, Seek, SeekFrom, Write}; 4 | use std::path::{Path, PathBuf}; 5 | 6 | use crate::error::ParpulseResult; 7 | use crate::storage_manager::ParpulseReaderIterator; 8 | 9 | /// [`DiskManagerSync`] contains the common logic to read from or write to a disk. 10 | /// 11 | /// TODO: Record statistics (maybe in statistics manager). 12 | #[derive(Default)] 13 | pub struct DiskManagerSync {} 14 | 15 | // TODO: Make each method accepting `&self` instead of `&mut self`. 
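//
// A minimal usage sketch (hypothetical path, error handling elided; assumes the
// caller returns a `ParpulseResult`):
//
//     let mut dm = DiskManagerSync::default();
//     dm.write_disk_all("/tmp/parpulse/example.bin", b"hello")?;
//     let (n, bytes) = dm.read_disk_all("/tmp/parpulse/example.bin")?;
//     assert_eq!(n, bytes.len());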
16 | impl DiskManagerSync { 17 | pub fn open_or_create(&self, path: &str, append: bool) -> ParpulseResult { 18 | let path_buf: PathBuf = PathBuf::from(path); 19 | if let Some(parent) = path_buf.parent() { 20 | if !parent.exists() { 21 | fs::create_dir_all(parent)?; 22 | } 23 | } 24 | let mut options = OpenOptions::new(); 25 | options.write(true); 26 | if !path_buf.exists() { 27 | options.create(true); 28 | } 29 | options.append(append); 30 | Ok(options.open(&path_buf)?) 31 | } 32 | 33 | // FIXME: `mut` allows future statistics computation 34 | pub fn write_disk_all(&mut self, path: &str, content: &[u8]) -> ParpulseResult<()> { 35 | // TODO: when path exists, we directly overwrite it, should we notify cache? 36 | let mut file = self.open_or_create(path, false)?; 37 | file.write_all(content)?; 38 | Ok(file.flush()?) 39 | } 40 | 41 | // FIXME: do we need to record statistics for read? 42 | pub fn read_disk_all(&self, path: &str) -> ParpulseResult<(usize, Bytes)> { 43 | let mut file = File::open(path)?; 44 | let mut buffer = Vec::with_capacity(file.metadata()?.len() as usize); 45 | let bytes_read = file.read_to_end(&mut buffer)?; 46 | Ok((bytes_read, Bytes::from(buffer))) 47 | } 48 | 49 | pub fn read_disk( 50 | &self, 51 | path: &str, 52 | start_pos: u64, 53 | bytes_to_read: usize, 54 | ) -> ParpulseResult<(usize, Bytes)> { 55 | let mut file = File::open(path)?; 56 | file.seek(SeekFrom::Start(start_pos))?; 57 | 58 | let mut buffer = vec![0; bytes_to_read]; 59 | let bytes_read = file.read(&mut buffer)?; 60 | buffer.truncate(bytes_read); 61 | Ok((bytes_read, Bytes::from(buffer))) 62 | } 63 | 64 | // If needs to record statistics, use disk_read_iterator, if not, please directly new DiskReadIterator 65 | pub fn disk_read_iterator( 66 | &self, 67 | path: &str, 68 | buffer_size: usize, 69 | ) -> ParpulseResult { 70 | DiskReadIterator::new(path, buffer_size) 71 | } 72 | 73 | // FIXME: disk_path should not exist, otherwise throw an error 74 | pub fn write_iterator_reader_to_disk( 75 | &mut self, 76 | mut iterator: T, 77 | disk_path: &str, 78 | ) -> ParpulseResult 79 | where 80 | T: ParpulseReaderIterator, 81 | { 82 | if Path::new(disk_path).exists() { 83 | return Err(io::Error::new( 84 | io::ErrorKind::AlreadyExists, 85 | "disk file to write already exists", 86 | ) 87 | .into()); 88 | } 89 | let mut file = self.open_or_create(disk_path, true)?; 90 | let mut bytes_written = 0; 91 | loop { 92 | match iterator.next() { 93 | Some(Ok(bytes_read)) => { 94 | let buffer = iterator.buffer(); 95 | file.write_all(&buffer[..bytes_read])?; 96 | bytes_written += bytes_read; 97 | } 98 | Some(Err(e)) => return Err(e), 99 | None => break, 100 | } 101 | } 102 | // FIXME: do we need to flush? 103 | file.flush()?; 104 | Ok(bytes_written) 105 | } 106 | 107 | pub fn file_size(&self, path: &str) -> ParpulseResult { 108 | let metadata = fs::metadata(path)?; 109 | Ok(metadata.len()) 110 | } 111 | 112 | pub fn remove_file(&mut self, path: &str) -> ParpulseResult<()> { 113 | Ok(fs::remove_file(path)?) 
114 | } 115 | } 116 | 117 | /// FIXME: iterator for sync, stream for async 118 | pub struct DiskReadIterator { 119 | f: File, 120 | pub buffer: BytesMut, 121 | } 122 | 123 | impl DiskReadIterator { 124 | pub fn new(file_path: &str, buffer_size: usize) -> ParpulseResult { 125 | let f = File::open(file_path)?; 126 | 127 | Ok(DiskReadIterator { 128 | f, 129 | buffer: BytesMut::zeroed(buffer_size), 130 | }) 131 | } 132 | } 133 | 134 | impl Iterator for DiskReadIterator { 135 | type Item = ParpulseResult; 136 | 137 | fn next(&mut self) -> Option { 138 | match self.f.read(self.buffer.as_mut()) { 139 | Ok(bytes_read) => { 140 | if bytes_read > 0 { 141 | Some(Ok(bytes_read)) 142 | } else { 143 | None 144 | } 145 | } 146 | Err(e) => Some(Err(e.into())), 147 | } 148 | } 149 | } 150 | 151 | impl ParpulseReaderIterator for DiskReadIterator { 152 | fn buffer(&self) -> &[u8] { 153 | &self.buffer 154 | } 155 | } 156 | 157 | #[cfg(test)] 158 | mod tests { 159 | use super::*; 160 | #[test] 161 | fn test_simple_write_read() { 162 | let mut disk_manager = DiskManagerSync {}; 163 | let tmp = tempfile::tempdir().unwrap(); 164 | let dir = tmp.path().to_owned(); 165 | let path = &dir 166 | .join("test_disk_manager_sync1.txt") 167 | .display() 168 | .to_string(); 169 | let content = "Hello, world!"; 170 | disk_manager 171 | .write_disk_all(path, content.as_bytes()) 172 | .expect("write_disk_all failed"); 173 | let mut file = disk_manager 174 | .open_or_create(path, true) 175 | .expect("open_or_create failed"); 176 | file.write_all(content.as_bytes()).unwrap(); 177 | file.flush().unwrap(); 178 | 179 | let file_size = disk_manager.file_size(path).expect("file_size failed"); 180 | assert_eq!(file_size, 2 * content.len() as u64); 181 | 182 | let (bytes_read, bytes) = disk_manager 183 | .read_disk_all(path) 184 | .expect("read_disk_all failed"); 185 | assert_eq!(bytes_read, 2 * content.len()); 186 | assert_eq!(bytes, Bytes::from(content.to_owned() + content)); 187 | 188 | let (bytes_read, bytes) = disk_manager 189 | .read_disk(path, content.len() as u64, content.len()) 190 | .expect("read_disk_all failed"); 191 | assert_eq!(bytes_read, content.len()); 192 | assert_eq!(bytes, Bytes::from(content)); 193 | } 194 | 195 | #[test] 196 | fn test_iterator_read() { 197 | let mut disk_manager = DiskManagerSync {}; 198 | let tmp = tempfile::tempdir().unwrap(); 199 | let dir = tmp.path().to_owned(); 200 | let path = &dir 201 | .join("test_disk_manager_sync2.txt") 202 | .display() 203 | .to_string(); 204 | let content = "bhjoilkmnkbhaoijsdklmnjkbhiauosdjikbhjoilkmnkbhaoijsdklmnjkbhiauosdjik"; 205 | disk_manager 206 | .write_disk_all(path, content.as_bytes()) 207 | .expect("write_disk_all failed"); 208 | let mut iterator = disk_manager 209 | .disk_read_iterator(path, 2) 210 | .expect("disk_read_iterator failed"); 211 | let mut start_pos = 0; 212 | loop { 213 | if start_pos >= content.len() { 214 | break; 215 | } 216 | let bytes_read = iterator 217 | .next() 218 | .expect("iterator early ended") 219 | .expect("iterator read failed"); 220 | let buffer = iterator.buffer(); 221 | assert_eq!( 222 | &content.as_bytes()[start_pos..start_pos + bytes_read], 223 | &buffer[..bytes_read] 224 | ); 225 | start_pos += bytes_read; 226 | } 227 | assert_eq!(start_pos, content.len()); 228 | } 229 | 230 | #[test] 231 | fn test_write_iterator_reader_to_disk() { 232 | let mut disk_manager = DiskManagerSync {}; 233 | let tmp = tempfile::tempdir().unwrap(); 234 | let dir = tmp.path().to_owned(); 235 | let path = &dir 236 | 
.join("test_disk_manager_sync3.txt") 237 | .display() 238 | .to_string(); 239 | let content = "bhjoilkmnkbhaoijsdklmnjkbhiauosdjikbhjoilkmnkbhaoijsdklmnjkbhiauosdjik"; 240 | disk_manager 241 | .write_disk_all(path, content.as_bytes()) 242 | .expect("write_disk_all failed"); 243 | let iterator = disk_manager 244 | .disk_read_iterator(path, 1) 245 | .expect("disk_read_iterator failed"); 246 | let output_path = &dir 247 | .join("test_disk_manager3_output.txt") 248 | .display() 249 | .to_string(); 250 | let bytes_written = disk_manager 251 | .write_iterator_reader_to_disk::(iterator, output_path) 252 | .expect("write_reader_to_disk failed"); 253 | assert_eq!(bytes_written, content.len()); 254 | 255 | let (bytes_read, bytes) = disk_manager 256 | .read_disk_all(output_path) 257 | .expect("read_disk_all failed"); 258 | assert_eq!(bytes_read, content.len()); 259 | assert_eq!(bytes, Bytes::from(content)); 260 | let file_size = disk_manager 261 | .file_size(output_path) 262 | .expect("file_size failed"); 263 | assert_eq!(file_size, content.len() as u64); 264 | } 265 | 266 | #[test] 267 | fn test_remove_file() { 268 | let mut disk_manager = DiskManagerSync {}; 269 | let tmp = tempfile::tempdir().unwrap(); 270 | let dir = tmp.path().to_owned(); 271 | let path = &dir 272 | .join("test_disk_manager_sync5.txt") 273 | .display() 274 | .to_string(); 275 | let content = "Hello, world!"; 276 | disk_manager 277 | .write_disk_all(path, content.as_bytes()) 278 | .expect("write_disk_all failed"); 279 | disk_manager.remove_file(path).expect("remove_file failed"); 280 | assert!(!Path::new(path).exists()); 281 | } 282 | } 283 | -------------------------------------------------------------------------------- /storage-node/src/disk/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod disk_manager; 2 | pub mod disk_manager_sync; 3 | pub mod stream; 4 | -------------------------------------------------------------------------------- /storage-node/src/disk/stream.rs: -------------------------------------------------------------------------------- 1 | use bytes::{Bytes, BytesMut}; 2 | use futures::{FutureExt, Stream}; 3 | 4 | use rand::Rng; 5 | use std::ops::DerefMut; 6 | use std::pin::Pin; 7 | use std::task::{Context, Poll}; 8 | use tokio::fs::File; 9 | use tokio::io::AsyncReadExt; 10 | 11 | use crate::error::{ParpulseError, ParpulseResult}; 12 | 13 | /// [`DiskReadStream`] reads data from disk as a stream. 14 | pub struct DiskReadStream { 15 | /// The file to read from. 16 | f: File, 17 | /// Contains the data read from the file. 18 | /// Note that the buffer may not be fully filled with data read from the file. 
19 | buffer: BytesMut, 20 | } 21 | 22 | impl DiskReadStream { 23 | pub fn new_sync(file_path: &str, buffer_size: usize) -> ParpulseResult { 24 | let f: std::fs::File = std::fs::File::open(file_path)?; 25 | 26 | Ok(DiskReadStream { 27 | f: File::from_std(f), 28 | buffer: BytesMut::zeroed(buffer_size), 29 | }) 30 | } 31 | 32 | pub async fn new(file_path: &str, buffer_size: usize) -> ParpulseResult { 33 | let f = File::open(file_path).await?; 34 | 35 | Ok(DiskReadStream { 36 | f, 37 | buffer: BytesMut::zeroed(buffer_size), 38 | }) 39 | } 40 | 41 | pub fn buffer(&self) -> &[u8] { 42 | &self.buffer 43 | } 44 | } 45 | 46 | impl Stream for DiskReadStream { 47 | type Item = ParpulseResult; 48 | 49 | fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context) -> Poll> { 50 | let deref_self = self.deref_mut(); 51 | match deref_self 52 | .f 53 | .read(deref_self.buffer.as_mut()) 54 | .boxed() 55 | .poll_unpin(cx) 56 | { 57 | Poll::Ready(Ok(bytes_read)) => { 58 | if bytes_read > 0 { 59 | Poll::Ready(Some(Ok(bytes_read))) 60 | } else { 61 | Poll::Ready(None) 62 | } 63 | } 64 | Poll::Ready(Err(e)) => Poll::Ready(Some(Err(e.into()))), 65 | Poll::Pending => Poll::Pending, 66 | } 67 | } 68 | } 69 | 70 | /// [`RandomDiskReadStream`] is used by `MockS3Reader` to simulate the read from S3. 71 | /// Since every time we pull data from S3, the number of bytes read is random, we 72 | /// need to simulate this behavior. 73 | /// 74 | /// NOTE: The byte range here is only a hint. Due to the implementation of tokio's 75 | /// `AsyncReadExt` trait, the actual number of bytes read may be less than `min_read_bytes`. 76 | /// It is acceptable here because we just use this `RandomDiskReadStream` to simulate 77 | /// the read from S3. 78 | /// 79 | /// `RandomDiskReadStream` should only be used for testing purposes. 80 | pub struct RandomDiskReadStream { 81 | f: File, 82 | min_read_bytes: usize, 83 | max_read_bytes: usize, 84 | buffer: BytesMut, 85 | } 86 | 87 | impl RandomDiskReadStream { 88 | pub fn new( 89 | file_path: &str, 90 | min_read_bytes: usize, 91 | max_read_bytes: usize, 92 | ) -> ParpulseResult { 93 | let f: std::fs::File = std::fs::File::open(file_path)?; 94 | if min_read_bytes >= max_read_bytes { 95 | return Err(ParpulseError::Internal( 96 | "`min_read_bytes` must be less than `max_read_bytes` in `RandomDiskReadStream`" 97 | .to_string(), 98 | )); 99 | } 100 | 101 | Ok(RandomDiskReadStream { 102 | f: File::from_std(f), 103 | min_read_bytes, 104 | max_read_bytes, 105 | buffer: BytesMut::new(), 106 | }) 107 | } 108 | } 109 | 110 | impl Stream for RandomDiskReadStream { 111 | type Item = ParpulseResult; 112 | 113 | fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context) -> Poll> { 114 | let bytes_to_read = rand::thread_rng().gen_range(self.min_read_bytes..self.max_read_bytes); 115 | self.buffer.resize(bytes_to_read, 0); 116 | let deref_self = self.deref_mut(); 117 | 118 | let read_result = deref_self 119 | .f 120 | .read(deref_self.buffer.as_mut()) 121 | .boxed() 122 | .poll_unpin(cx); 123 | match read_result { 124 | Poll::Ready(Ok(bytes_read)) => { 125 | if bytes_read > 0 { 126 | // Though we have resized the buffer to `bytes_to_read` before, tokio's 127 | // implementation doesn't ensure that `bytes_to_read` bytes have been read 128 | // into the buffer. It's likely that fewer bytes have been read. So we 129 | // truncate the buffer to the actual number of bytes read here. 
130 | deref_self.buffer.truncate(bytes_read); 131 | Poll::Ready(Some(Ok(deref_self.buffer.clone().freeze()))) 132 | } else { 133 | Poll::Ready(None) 134 | } 135 | } 136 | Poll::Ready(Err(e)) => Poll::Ready(Some(Err(e.into()))), 137 | Poll::Pending => Poll::Pending, 138 | } 139 | } 140 | } 141 | 142 | #[cfg(test)] 143 | mod tests { 144 | use super::*; 145 | use futures::stream::StreamExt; 146 | 147 | #[tokio::test] 148 | async fn test_disk_read_stream() { 149 | let poem = "What can I hold you with? 150 | I offer you lean streets, desperate sunsets, the 151 | moon of the jagged suburbs. 152 | I offer you the bitterness of a man who has looked 153 | long and long at the lonely moon. 154 | I offer you my ancestors, my dead men, the ghosts 155 | that living men have honoured in bronze. 156 | I offer you whatever insight my books may hold, 157 | whatever manliness or humour my life. 158 | I offer you the loyalty of a man who has never 159 | been loyal. 160 | I offer you that kernel of myself that I have saved, 161 | somehow-the central heart that deals not 162 | in words, traffics not with dreams, and is 163 | untouched by time, by joy, by adversities. 164 | I offer you the memory of a yellow rose seen at 165 | sunset, years before you were born. 166 | I offer you explanations of yourself, theories about 167 | yourself, authentic and surprising news of 168 | yourself. 169 | I can give you my loneliness, my darkness, the 170 | hunger of my heart; I am trying to bribe you 171 | with uncertainty, with danger, with defeat. 172 | "; 173 | 174 | let buffer_size = 102; 175 | let mut disk_read_stream = 176 | DiskReadStream::new("tests/text/what-can-i-hold-you-with", buffer_size) 177 | .await 178 | .unwrap(); 179 | 180 | let mut total_bytes_read = 0; 181 | let mut read_count = 0; 182 | let mut result = String::new(); 183 | while let Some(bytes_read) = disk_read_stream.next().await { 184 | let bytes_read = bytes_read.unwrap(); 185 | result += &String::from_utf8(disk_read_stream.buffer()[..bytes_read].to_vec()).unwrap(); 186 | total_bytes_read += bytes_read; 187 | read_count += 1; 188 | } 189 | 190 | assert_eq!(result, poem); 191 | assert_eq!(total_bytes_read, 930); 192 | assert_eq!(read_count, 10); 193 | } 194 | 195 | #[tokio::test] 196 | async fn test_random_disk_read_stream() { 197 | let poem = "What can I hold you with? 198 | I offer you lean streets, desperate sunsets, the 199 | moon of the jagged suburbs. 200 | I offer you the bitterness of a man who has looked 201 | long and long at the lonely moon. 202 | I offer you my ancestors, my dead men, the ghosts 203 | that living men have honoured in bronze. 204 | I offer you whatever insight my books may hold, 205 | whatever manliness or humour my life. 206 | I offer you the loyalty of a man who has never 207 | been loyal. 208 | I offer you that kernel of myself that I have saved, 209 | somehow-the central heart that deals not 210 | in words, traffics not with dreams, and is 211 | untouched by time, by joy, by adversities. 212 | I offer you the memory of a yellow rose seen at 213 | sunset, years before you were born. 214 | I offer you explanations of yourself, theories about 215 | yourself, authentic and surprising news of 216 | yourself. 217 | I can give you my loneliness, my darkness, the 218 | hunger of my heart; I am trying to bribe you 219 | with uncertainty, with danger, with defeat. 
220 | "; 221 | 222 | let mut random_disk_read_stream = 223 | RandomDiskReadStream::new("tests/text/what-can-i-hold-you-with", 150, 250).unwrap(); 224 | 225 | let mut total_bytes_read = 0; 226 | let mut result = String::new(); 227 | while let Some(bytes) = random_disk_read_stream.next().await { 228 | let bytes = bytes.unwrap(); 229 | total_bytes_read += bytes.len(); 230 | result += &String::from_utf8(bytes.to_vec()).unwrap(); 231 | } 232 | 233 | assert_eq!(result, poem); 234 | assert_eq!(total_bytes_read, 930); 235 | } 236 | } 237 | -------------------------------------------------------------------------------- /storage-node/src/error.rs: -------------------------------------------------------------------------------- 1 | use aws_sdk_s3::primitives::ByteStreamError; 2 | use thiserror::Error; 3 | 4 | #[derive(Debug, Error)] 5 | pub enum ParpulseError { 6 | #[error("Disk error: {0}")] 7 | Disk(#[source] std::io::Error), 8 | #[error("S3 error: {0}")] 9 | S3(#[source] Box), 10 | #[error("Internal error: {0}")] 11 | Internal(String), 12 | #[error("SQLite error: {0}")] 13 | Sqlite(#[source] rusqlite::Error), 14 | } 15 | 16 | impl From for ParpulseError { 17 | fn from(e: std::io::Error) -> Self { 18 | ParpulseError::Disk(e) 19 | } 20 | } 21 | 22 | impl From> for ParpulseError 23 | where 24 | E: std::error::Error + Send + Sync + 'static, 25 | R: std::fmt::Debug + Send + Sync + 'static, 26 | { 27 | fn from(e: aws_smithy_runtime_api::client::result::SdkError) -> Self { 28 | ParpulseError::S3(Box::new(e)) 29 | } 30 | } 31 | 32 | impl From for ParpulseError { 33 | fn from(e: ByteStreamError) -> Self { 34 | ParpulseError::Internal(e.to_string()) 35 | } 36 | } 37 | 38 | impl From for ParpulseError { 39 | fn from(e: rusqlite::Error) -> Self { 40 | ParpulseError::Sqlite(e) 41 | } 42 | } 43 | 44 | pub type ParpulseResult = std::result::Result; 45 | 46 | unsafe impl Send for ParpulseError {} 47 | unsafe impl Sync for ParpulseError {} 48 | -------------------------------------------------------------------------------- /storage-node/src/lib.rs: -------------------------------------------------------------------------------- 1 | #![allow(clippy::new_without_default)] 2 | 3 | pub mod cache; 4 | pub mod common; 5 | pub mod disk; 6 | pub mod error; 7 | pub mod server; 8 | pub mod storage_manager; 9 | pub mod storage_reader; 10 | -------------------------------------------------------------------------------- /storage-node/src/server.rs: -------------------------------------------------------------------------------- 1 | use log::{info, warn}; 2 | use parpulse_client::{RequestParams, S3Request}; 3 | use std::net::IpAddr; 4 | use std::sync::Arc; 5 | use tokio_stream::wrappers::ReceiverStream; 6 | use warp::{Filter, Rejection}; 7 | 8 | use crate::{ 9 | cache::{ 10 | data_store_cache::{memdisk::MemDiskStoreCache, sqlite::SqliteStoreCache}, 11 | replacer::{lru::LruReplacer, lru_k::LruKReplacer}, 12 | }, 13 | common::config::{ParpulseConfig, ParpulseConfigCachePolicy, ParpulseConfigDataStore}, 14 | error::ParpulseResult, 15 | storage_manager::{StorageManager, StorageManagerImpl}, 16 | }; 17 | 18 | const CACHE_BASE_PATH: &str = "parpulse-cache"; 19 | const DEFAULT_DATA_STORE_CACHE_NUM: usize = 3; 20 | const DEFAULT_MEM_CACHE_SIZE: usize = 100 * 1024; 21 | const DEFAULT_DISK_CACHE_SIZE: usize = 1024 * 1024 * 1024; 22 | const DEFAULT_SQLITE_CACHE_SIZE: usize = 200 * 1024 * 1024; 23 | const DEFAULT_MEM_CACHE_MAX_FILE_SIZE: usize = 10 * 1024 * 1024 + 1; 24 | const DEFAULT_LRU_K_VALUE: usize = 2; 25 | const 
DEFAULT_MAX_DISK_READER_BUFFER_SIZE: usize = 100 * 1024 * 1024; 26 | const DEFAULT_SQLITE_BLOB_READER_BUFFER_SIZE: usize = 1024; 27 | 28 | async fn route(storage_manager: Arc, ip_addr: &str, port: u16) { 29 | let route = warp::path!("file") 30 | .and(warp::path::end()) 31 | .and(warp::query::()) 32 | .and_then(move |params: S3Request| { 33 | let storage_manager = storage_manager.clone(); 34 | if params.is_test { 35 | info!( 36 | "Received test request for bucket: {}, keys: {:?}", 37 | params.bucket, params.keys 38 | ); 39 | } else { 40 | info!( 41 | "Received request for bucket: {}, keys: {:?}", 42 | params.bucket, params.keys 43 | ); 44 | } 45 | async move { 46 | let bucket = params.bucket; 47 | let keys = params.keys; 48 | let request = if params.is_test { 49 | RequestParams::MockS3((bucket, vec![keys])) 50 | } else { 51 | RequestParams::S3((bucket, vec![keys])) 52 | }; 53 | 54 | let result = storage_manager.get_data(request).await; 55 | match result { 56 | Ok(data_rx) => { 57 | let stream = ReceiverStream::new(data_rx); 58 | let body = warp::hyper::Body::wrap_stream(stream); 59 | let response = warp::http::Response::builder() 60 | .header("Content-Type", "text/plain") 61 | .body(body) 62 | .unwrap(); 63 | Ok::<_, Rejection>(warp::reply::with_status( 64 | response, 65 | warp::http::StatusCode::OK, 66 | )) 67 | } 68 | Err(e) => { 69 | let error_message = format!("Failed to get data: {}", e); 70 | let response = warp::http::Response::builder() 71 | .status(warp::http::StatusCode::INTERNAL_SERVER_ERROR) 72 | .body(error_message.into()) 73 | .unwrap(); 74 | Ok::<_, Rejection>(warp::reply::with_status( 75 | response, 76 | warp::http::StatusCode::INTERNAL_SERVER_ERROR, 77 | )) 78 | } 79 | } 80 | } 81 | }); 82 | 83 | let heartbeat = warp::path!("heartbeat").map(|| warp::http::StatusCode::OK); 84 | 85 | // Catch a request that does not match any of the routes above. 86 | let catch_all = warp::any() 87 | .and(warp::path::full()) 88 | .map(|path: warp::path::FullPath| { 89 | warn!("Catch all route hit. 
Path: {}", path.as_str()); 90 | warp::http::StatusCode::NOT_FOUND 91 | }); 92 | 93 | let routes = route.or(heartbeat).or(catch_all); 94 | let ip_addr: IpAddr = ip_addr.parse().unwrap(); 95 | warp::serve(routes).run((ip_addr, port)).await; 96 | } 97 | 98 | pub async fn storage_node_serve( 99 | ip_addr: &str, 100 | port: u16, 101 | config: ParpulseConfig, 102 | ) -> ParpulseResult<()> { 103 | let data_store_cache_num = config 104 | .data_store_cache_num 105 | .unwrap_or(DEFAULT_DATA_STORE_CACHE_NUM); 106 | match config.data_store { 107 | ParpulseConfigDataStore::Memdisk => { 108 | let disk_cache_size = config.disk_cache_size.unwrap_or(DEFAULT_DISK_CACHE_SIZE); 109 | let mem_cache_size = config.mem_cache_size.unwrap_or(DEFAULT_MEM_CACHE_SIZE); 110 | let mem_cache_file_size = config 111 | .mem_cache_file_size 112 | .unwrap_or(DEFAULT_MEM_CACHE_MAX_FILE_SIZE); 113 | let max_disk_reader_buffer_size = config 114 | .max_disk_reader_buffer_size 115 | .unwrap_or(DEFAULT_MAX_DISK_READER_BUFFER_SIZE); 116 | let cache_base_path = config.cache_path.unwrap_or(CACHE_BASE_PATH.to_string()); 117 | match config.cache_policy { 118 | ParpulseConfigCachePolicy::Lru => { 119 | info!("starting storage node with {} mem-disk cache(s) and LRU cache policy, disk cache size: {}, mem cache size: {}, mem cache file size: {}, max disk reader buffer size: {}", data_store_cache_num, disk_cache_size, mem_cache_size, mem_cache_file_size, max_disk_reader_buffer_size); 120 | let mut data_store_caches = Vec::new(); 121 | for i in 0..data_store_cache_num { 122 | let disk_replacer = LruReplacer::new(disk_cache_size); 123 | let mem_replacer = LruReplacer::new(mem_cache_size); 124 | let data_store_cache = MemDiskStoreCache::new( 125 | disk_replacer, 126 | i.to_string() + &cache_base_path, 127 | Some(mem_replacer), 128 | Some(mem_cache_file_size), 129 | max_disk_reader_buffer_size, 130 | ); 131 | data_store_caches.push(data_store_cache); 132 | } 133 | let storage_manager = Arc::new(StorageManagerImpl::new(data_store_caches)); 134 | route(storage_manager, ip_addr, port).await; 135 | } 136 | ParpulseConfigCachePolicy::Lruk => { 137 | info!("starting storage node with {} mem-disk cache(s) and LRU-K cache policy, disk cache size: {}, mem cache size: {}, mem cache file size: {}, max disk reader buffer size: {}", data_store_cache_num, disk_cache_size, mem_cache_size, mem_cache_file_size, max_disk_reader_buffer_size); 138 | let mut data_store_caches = Vec::new(); 139 | let k = config.cache_lru_k.unwrap_or(DEFAULT_LRU_K_VALUE); 140 | for i in 0..data_store_cache_num { 141 | let disk_replacer = LruKReplacer::new(disk_cache_size, k); 142 | let mem_replacer = LruKReplacer::new(mem_cache_size, k); 143 | let data_store_cache = MemDiskStoreCache::new( 144 | disk_replacer, 145 | i.to_string() + &cache_base_path, 146 | Some(mem_replacer), 147 | Some(mem_cache_file_size), 148 | max_disk_reader_buffer_size, 149 | ); 150 | data_store_caches.push(data_store_cache); 151 | } 152 | let storage_manager = Arc::new(StorageManagerImpl::new(data_store_caches)); 153 | route(storage_manager, ip_addr, port).await; 154 | } 155 | }; 156 | } 157 | ParpulseConfigDataStore::Disk => { 158 | let disk_cache_size = config.disk_cache_size.unwrap_or(DEFAULT_DISK_CACHE_SIZE); 159 | let cache_base_path = config.cache_path.unwrap_or(CACHE_BASE_PATH.to_string()); 160 | let max_disk_reader_buffer_size = config 161 | .max_disk_reader_buffer_size 162 | .unwrap_or(DEFAULT_MAX_DISK_READER_BUFFER_SIZE); 163 | match config.cache_policy { 164 | ParpulseConfigCachePolicy::Lru => { 165 | 
info!("starting storage node with {} disk-only cache(s) and LRU cache policy, disk cache size: {}, max disk reader buffer size: {}", data_store_cache_num, disk_cache_size, max_disk_reader_buffer_size); 166 | let mut data_store_caches = Vec::new(); 167 | for i in 0..data_store_cache_num { 168 | let disk_replacer = LruReplacer::new(disk_cache_size); 169 | let data_store_cache = MemDiskStoreCache::new( 170 | disk_replacer, 171 | i.to_string() + &cache_base_path, 172 | None, 173 | None, 174 | max_disk_reader_buffer_size, 175 | ); 176 | data_store_caches.push(data_store_cache); 177 | } 178 | let storage_manager = Arc::new(StorageManagerImpl::new(data_store_caches)); 179 | route(storage_manager, ip_addr, port).await; 180 | } 181 | ParpulseConfigCachePolicy::Lruk => { 182 | info!("starting storage node with {} disk-only cache(s) and LRU-K cache policy, disk cache size: {}, max disk reader buffer size: {}", data_store_cache_num, disk_cache_size, max_disk_reader_buffer_size); 183 | let mut data_store_caches = Vec::new(); 184 | let k = config.cache_lru_k.unwrap_or(DEFAULT_LRU_K_VALUE); 185 | for i in 0..data_store_cache_num { 186 | let disk_replacer = LruKReplacer::new(disk_cache_size, k); 187 | let data_store_cache = MemDiskStoreCache::new( 188 | disk_replacer, 189 | i.to_string() + &cache_base_path, 190 | None, 191 | None, 192 | max_disk_reader_buffer_size, 193 | ); 194 | data_store_caches.push(data_store_cache); 195 | } 196 | let storage_manager = Arc::new(StorageManagerImpl::new(data_store_caches)); 197 | route(storage_manager, ip_addr, port).await; 198 | } 199 | } 200 | } 201 | ParpulseConfigDataStore::Sqlite => { 202 | let sqlite_base_path = 203 | config.cache_path.unwrap_or(CACHE_BASE_PATH.to_string()) + "sqlite.db3"; 204 | let sqlite_cache_size = config.mem_cache_size.unwrap_or(DEFAULT_SQLITE_CACHE_SIZE); 205 | let sqlite_blob_reader_buffer_size = config 206 | .sqlite_blob_reader_buffer_size 207 | .unwrap_or(DEFAULT_SQLITE_BLOB_READER_BUFFER_SIZE); 208 | match config.cache_policy { 209 | ParpulseConfigCachePolicy::Lru => { 210 | info!("starting storage node with {} sqlite cache(s) and LRU cache policy, cache size: {}, blob reader buffer size: {}", data_store_cache_num, sqlite_cache_size, sqlite_blob_reader_buffer_size); 211 | let mut data_store_caches = Vec::new(); 212 | for i in 0..data_store_cache_num { 213 | let replacer = LruReplacer::new(sqlite_cache_size); 214 | let sqlite_data_cache = SqliteStoreCache::new( 215 | replacer, 216 | i.to_string() + &sqlite_base_path, 217 | sqlite_blob_reader_buffer_size, 218 | )?; 219 | data_store_caches.push(sqlite_data_cache); 220 | } 221 | let storage_manager = Arc::new(StorageManagerImpl::new(data_store_caches)); 222 | route(storage_manager, ip_addr, port).await; 223 | } 224 | ParpulseConfigCachePolicy::Lruk => { 225 | info!("starting storage node with {} sqlite cache(s) and LRU-K cache policy, cache size: {}, blob reader buffer size: {}", data_store_cache_num, sqlite_cache_size, sqlite_blob_reader_buffer_size); 226 | let k = config.cache_lru_k.unwrap_or(DEFAULT_LRU_K_VALUE); 227 | let mut data_store_caches = Vec::new(); 228 | for i in 0..data_store_cache_num { 229 | let replacer = LruKReplacer::new(sqlite_cache_size, k); 230 | let sqlite_data_cache = SqliteStoreCache::new( 231 | replacer, 232 | i.to_string() + &sqlite_base_path, 233 | sqlite_blob_reader_buffer_size, 234 | )?; 235 | data_store_caches.push(sqlite_data_cache); 236 | } 237 | let storage_manager = Arc::new(StorageManagerImpl::new(data_store_caches)); 238 | route(storage_manager, ip_addr, 
port).await; 239 | } 240 | } 241 | } 242 | }; 243 | 244 | Ok(()) 245 | } 246 | 247 | #[cfg(test)] 248 | mod tests { 249 | 250 | use super::*; 251 | use reqwest::Client; 252 | use std::fs; 253 | use std::io::Write; 254 | use tempfile::tempdir; 255 | 256 | /// WARNING: Put userdata1.parquet in the storage-node/tests/parquet directory before running this test. 257 | #[tokio::test] 258 | #[allow(clippy::field_reassign_with_default)] 259 | async fn test_server() { 260 | let original_file_path = "tests/parquet/userdata1.parquet"; 261 | let mut config = ParpulseConfig::default(); 262 | config.data_store_cache_num = Some(6); 263 | // Start the server 264 | let server_handle = tokio::spawn(async move { 265 | storage_node_serve("127.0.0.1", 3030, config).await.unwrap(); 266 | }); 267 | 268 | // Give the server some time to start 269 | tokio::time::sleep(std::time::Duration::from_secs(1)).await; 270 | 271 | // Test1: test_download_file 272 | let url = 273 | "http://localhost:3030/file?bucket=tests-parquet&keys=userdata1.parquet&is_test=true"; 274 | let client = Client::new(); 275 | let mut response = client 276 | .get(url) 277 | .send() 278 | .await 279 | .expect("Failed to get response from the server."); 280 | assert!( 281 | response.status().is_success(), 282 | "Failed to download file. Status code: {}", 283 | response.status() 284 | ); 285 | 286 | let temp_dir = tempdir().unwrap(); 287 | let file_path = temp_dir.path().join("userdata1.parquet"); 288 | let mut file = fs::File::create(&file_path).unwrap(); 289 | 290 | // Stream the response body and write to the file 291 | while let Some(chunk) = response.chunk().await.unwrap() { 292 | file.write_all(&chunk).unwrap(); 293 | } 294 | assert!(file_path.exists(), "File not found after download"); 295 | 296 | // Check if file sizes are equal 297 | assert_eq!( 298 | fs::metadata(original_file_path).unwrap().len(), 299 | fs::metadata(file_path.clone()).unwrap().len() 300 | ); 301 | 302 | assert_eq!(fs::metadata(file_path).unwrap().len(), 113629); 303 | 304 | // Test2: test_file_not_exist 305 | let url = 306 | "http://localhost:3030/file?bucket=tests-parquet&keys=not_exist.parquet&is_test=true"; 307 | let client = Client::new(); 308 | let response = client 309 | .get(url) 310 | .send() 311 | .await 312 | .expect("Failed to get response from the server."); 313 | 314 | assert!( 315 | response.status().is_server_error(), 316 | "Expected 500 status code" 317 | ); 318 | 319 | server_handle.abort(); 320 | } 321 | } 322 | -------------------------------------------------------------------------------- /storage-node/src/storage_manager.rs: -------------------------------------------------------------------------------- 1 | use crate::{ 2 | cache::data_store_cache::{cache_key_from_request, DataStoreCache}, 3 | common::hash::calculate_hash_crc32fast, 4 | error::ParpulseResult, 5 | }; 6 | 7 | use async_trait::async_trait; 8 | use bytes::Bytes; 9 | use log::debug; 10 | use parpulse_client::RequestParams; 11 | use tokio::sync::mpsc::Receiver; 12 | 13 | #[async_trait] 14 | pub trait StorageManager: Send + Sync { 15 | async fn get_data( 16 | &self, 17 | request: RequestParams, 18 | ) -> ParpulseResult>>; 19 | } 20 | 21 | /// [`StorageManager`] handles the request from the storage client. 22 | /// 23 | /// We should allow concurrent requests fed into the storage manager, 24 | /// which should be responsible for handling multiple requests at the 25 | /// same time. 
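///
/// A minimal usage sketch (illustrative only; the cache type, sizes, and the request
/// below are placeholders that mirror the tests at the bottom of this file):
///
/// ```ignore
/// let cache = MemDiskStoreCache::new(
///     LruReplacer::new(1024 * 1024),
///     "cache-dir".to_string(),
///     None,
///     None,
///     1024 * 1024,
/// );
/// let storage_manager = StorageManagerImpl::new(vec![cache]);
/// let request = RequestParams::MockS3((
///     "tests-parquet".to_string(),
///     vec!["userdata1.parquet".to_string()],
/// ));
/// let mut rx = storage_manager.get_data(request).await?;
/// while let Some(chunk) = rx.recv().await {
///     let bytes = chunk?; // each item is a `ParpulseResult<Bytes>` chunk of the object
/// }
/// ```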
26 | pub struct StorageManagerImpl { 27 | /// We don't use lock here because `data_store_cache` itself should handle the concurrency. 28 | data_store_caches: Vec, 29 | } 30 | 31 | impl StorageManagerImpl { 32 | pub fn new(data_store_caches: Vec) -> Self { 33 | Self { data_store_caches } 34 | } 35 | 36 | pub async fn get_data( 37 | &self, 38 | request: RequestParams, 39 | ) -> ParpulseResult>> { 40 | // 1. Try to get data from the cache first. 41 | // 2. If cache miss, then go to storage reader to fetch the data from 42 | // the underlying storage. 43 | // 3. If needed, update the cache with the data fetched from the storage reader. 44 | 45 | // TODO: Support more request types. 46 | 47 | // FIXME: Cache key should be . Might refactor the underlying S3 48 | // reader as one S3 key for one reader. 49 | let cache_key = cache_key_from_request(&request); 50 | let hash = calculate_hash_crc32fast(cache_key.as_bytes()); 51 | let cache_index = hash % self.data_store_caches.len(); 52 | let data_store_cache = self.data_store_caches.get(cache_index).unwrap(); 53 | 54 | debug!( 55 | "For cache key: {}, the corresponding data_store_cache index {}", 56 | cache_key, cache_index 57 | ); 58 | 59 | let data_rx = data_store_cache.get_data_from_cache(&request).await?; 60 | if let Some(data_rx) = data_rx { 61 | Ok(data_rx) 62 | } else { 63 | data_store_cache.put_data_to_cache(&request).await?; 64 | // TODO (kunle): Push down the response writer rather than calling get_data_from_cache again. 65 | let data_rx = data_store_cache.get_data_from_cache(&request).await?; 66 | if data_rx.is_none() { 67 | panic!("Data should be in the cache now. {}", cache_key.clone()); 68 | } 69 | Ok(data_rx.unwrap()) 70 | } 71 | } 72 | } 73 | 74 | #[async_trait] 75 | impl StorageManager for StorageManagerImpl { 76 | async fn get_data( 77 | &self, 78 | request: RequestParams, 79 | ) -> ParpulseResult>> { 80 | self.get_data(request).await 81 | } 82 | } 83 | 84 | /// fn buffer(&self) -> &[u8]; ensures Iterator has a buffer 85 | /// This buffer function returns the starting point of the result. 86 | /// **NOTE**: The result buffer must be **CONTINUOUS** in bytes with the size in Item as its length. 
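///
/// A consumption sketch (illustrative only; `iterator` stands for any implementor of
/// this trait): each `next()` yields the number of valid bytes, and `buffer()[..n]`
/// is the contiguous region holding them.
///
/// ```ignore
/// while let Some(bytes_read) = iterator.next() {
///     let bytes_read = bytes_read?;
///     let chunk = &iterator.buffer()[..bytes_read];
///     // consume `chunk` before the next call, as the buffer may be overwritten
/// }
/// ```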
87 | pub trait ParpulseReaderIterator: Iterator> { 88 | fn buffer(&self) -> &[u8]; 89 | } 90 | 91 | #[cfg(test)] 92 | mod tests { 93 | use futures::join; 94 | use std::{sync::Arc, time::Instant}; 95 | 96 | use crate::cache::{data_store_cache::memdisk::MemDiskStoreCache, replacer::lru::LruReplacer}; 97 | 98 | use super::*; 99 | 100 | async fn consume_receiver(mut rx: Receiver>) -> usize { 101 | let mut total_bytes = 0; 102 | while let Some(data) = rx.recv().await { 103 | match data { 104 | Ok(bytes) => { 105 | total_bytes += bytes.len(); 106 | } 107 | Err(e) => panic!("Error receiving data: {:?}", e), 108 | } 109 | } 110 | total_bytes 111 | } 112 | 113 | #[tokio::test] 114 | async fn test_storage_manager_disk_only() { 115 | let dummy_size = 1000000; 116 | let cache = LruReplacer::new(dummy_size); 117 | 118 | let tmp = tempfile::tempdir().unwrap(); 119 | let dir = tmp.path().to_owned(); 120 | let cache_base_path = dir.join("test-storage-manager"); 121 | 122 | let data_store_cache = MemDiskStoreCache::new( 123 | cache, 124 | cache_base_path.display().to_string(), 125 | None, 126 | None, 127 | 100 * 1024 * 1024, 128 | ); 129 | let storage_manager = StorageManagerImpl::new(vec![data_store_cache]); 130 | 131 | let bucket = "tests-parquet".to_string(); 132 | let keys = vec!["userdata1.parquet".to_string()]; 133 | let request = RequestParams::MockS3((bucket, keys)); 134 | 135 | let mut start_time = Instant::now(); 136 | let result = storage_manager.get_data(request.clone()).await; 137 | assert!(result.is_ok()); 138 | let mut data_rx = result.unwrap(); 139 | let mut total_bytes = 0; 140 | while let Some(data) = data_rx.recv().await { 141 | match data { 142 | Ok(bytes) => { 143 | total_bytes += bytes.len(); 144 | } 145 | Err(e) => panic!("Error receiving data: {:?}", e), 146 | } 147 | } 148 | assert_eq!(total_bytes, 113629); 149 | let delta_time_miss = Instant::now() - start_time; 150 | 151 | start_time = Instant::now(); 152 | let result = storage_manager.get_data(request).await; 153 | assert!(result.is_ok()); 154 | let data_rx = result.unwrap(); 155 | assert_eq!(consume_receiver(data_rx).await, 113629); 156 | let delta_time_hit = Instant::now() - start_time; 157 | 158 | println!( 159 | "Delta time miss: {:?}, delta time hit: {:?}", 160 | delta_time_miss, delta_time_hit 161 | ); 162 | assert!(delta_time_miss > delta_time_hit); 163 | } 164 | 165 | #[tokio::test] 166 | async fn test_storage_manager_mem_disk_1() { 167 | // 1. get small data (-> memory) 168 | // 2. get large data (-> disk) 169 | // 3. get small data again 170 | // 4. get large data again 171 | // 5. 
compare time 172 | let dummy_size = 1000000; 173 | let disk_cache = LruReplacer::new(dummy_size); 174 | let mem_cache = LruReplacer::new(dummy_size); 175 | 176 | let tmp = tempfile::tempdir().unwrap(); 177 | let disk_cache_base_path = tmp.path().to_owned(); 178 | 179 | let data_store_cache = MemDiskStoreCache::new( 180 | disk_cache, 181 | disk_cache_base_path.display().to_string(), 182 | Some(mem_cache), 183 | Some(950), 184 | 100 * 1024 * 1024, 185 | ); 186 | let storage_manager = StorageManagerImpl::new(vec![data_store_cache]); 187 | 188 | let request_path_small_bucket = "tests-text".to_string(); 189 | let request_path_small_keys = vec!["what-can-i-hold-you-with".to_string()]; 190 | let request_small = 191 | RequestParams::MockS3((request_path_small_bucket, request_path_small_keys)); 192 | 193 | let result = storage_manager.get_data(request_small.clone()).await; 194 | assert!(result.is_ok()); 195 | assert_eq!(consume_receiver(result.unwrap()).await, 930); 196 | 197 | let request_path_large_bucket = "tests-parquet".to_string(); 198 | let request_path_large_keys = vec!["userdata2.parquet".to_string()]; 199 | let request_large = 200 | RequestParams::MockS3((request_path_large_bucket, request_path_large_keys)); 201 | 202 | let result = storage_manager.get_data(request_large.clone()).await; 203 | assert!(result.is_ok()); 204 | assert_eq!(consume_receiver(result.unwrap()).await, 112193); 205 | 206 | // Get data again. 207 | let mut start_time = Instant::now(); 208 | let result = storage_manager.get_data(request_large).await; 209 | assert!(result.is_ok()); 210 | assert_eq!(consume_receiver(result.unwrap()).await, 112193); 211 | let delta_time_hit_disk = Instant::now() - start_time; 212 | 213 | start_time = Instant::now(); 214 | let result = storage_manager.get_data(request_small).await; 215 | assert!(result.is_ok()); 216 | assert_eq!(consume_receiver(result.unwrap()).await, 930); 217 | let delta_time_hit_mem = Instant::now() - start_time; 218 | 219 | println!( 220 | "For small and large files, Delta time hit mem: {:?}, delta time hit disk: {:?}", 221 | delta_time_hit_mem, delta_time_hit_disk 222 | ); 223 | assert!(delta_time_hit_disk > delta_time_hit_mem); 224 | } 225 | 226 | #[tokio::test] 227 | async fn test_storage_manager_mem_disk_2() { 228 | // 1. get large data1 (-> memory) 229 | // 2. get large data2 (-> memory, and evict data1 to disk) 230 | // 3. get data1 again 231 | // 4. get data2 again 232 | // 5. 
compare time 233 | let disk_cache = LruReplacer::new(1000000); 234 | let mem_cache = LruReplacer::new(120000); 235 | 236 | let tmp = tempfile::tempdir().unwrap(); 237 | let disk_cache_base_path = tmp.path().to_owned(); 238 | 239 | let data_store_cache = MemDiskStoreCache::new( 240 | disk_cache, 241 | disk_cache_base_path.display().to_string(), 242 | Some(mem_cache), 243 | Some(120000), 244 | 100 * 1024 * 1024, 245 | ); 246 | let storage_manager = StorageManagerImpl::new(vec![data_store_cache]); 247 | 248 | let request_path_bucket1 = "tests-parquet".to_string(); 249 | let request_path_keys1 = vec!["userdata1.parquet".to_string()]; 250 | let request_data1 = RequestParams::MockS3((request_path_bucket1, request_path_keys1)); 251 | 252 | let result = storage_manager.get_data(request_data1.clone()).await; 253 | assert!(result.is_ok()); 254 | assert_eq!(consume_receiver(result.unwrap()).await, 113629); 255 | 256 | let request_path_bucket2 = "tests-parquet".to_string(); 257 | let request_path_keys2 = vec!["userdata2.parquet".to_string()]; 258 | let request_data2 = RequestParams::MockS3((request_path_bucket2, request_path_keys2)); 259 | 260 | let result = storage_manager.get_data(request_data2.clone()).await; 261 | assert!(result.is_ok()); 262 | assert_eq!(consume_receiver(result.unwrap()).await, 112193); 263 | 264 | // Get data again. Now data2 in memory and data1 in disk. 265 | let mut start_time = Instant::now(); 266 | let result = storage_manager.get_data(request_data1).await; 267 | assert!(result.is_ok()); 268 | assert_eq!(consume_receiver(result.unwrap()).await, 113629); 269 | let delta_time_hit_disk = Instant::now() - start_time; 270 | 271 | start_time = Instant::now(); 272 | let result = storage_manager.get_data(request_data2).await; 273 | assert!(result.is_ok()); 274 | assert_eq!(consume_receiver(result.unwrap()).await, 112193); 275 | let delta_time_hit_mem = Instant::now() - start_time; 276 | 277 | println!( 278 | "For almost same files, delta time hit mem: {:?}, delta time hit disk: {:?}", 279 | delta_time_hit_mem, delta_time_hit_disk 280 | ); 281 | assert!(delta_time_hit_disk > delta_time_hit_mem); 282 | } 283 | 284 | #[tokio::test] 285 | async fn test_storage_manager_parallel_1() { 286 | let disk_cache = LruReplacer::new(1000000); 287 | 288 | let tmp = tempfile::tempdir().unwrap(); 289 | let disk_cache_base_path = tmp.path().to_owned(); 290 | 291 | let data_store_cache = MemDiskStoreCache::new( 292 | disk_cache, 293 | disk_cache_base_path.display().to_string(), 294 | None, 295 | None, 296 | 100 * 1024 * 1024, 297 | ); 298 | let storage_manager = Arc::new(StorageManagerImpl::new(vec![data_store_cache])); 299 | 300 | let request_path_bucket1 = "tests-parquet".to_string(); 301 | let request_path_keys1 = vec!["userdata1.parquet".to_string()]; 302 | let request_data1 = RequestParams::MockS3((request_path_bucket1, request_path_keys1)); 303 | 304 | let request_path_bucket2 = "tests-parquet".to_string(); 305 | let request_path_keys2 = vec!["userdata2.parquet".to_string()]; 306 | let request_data2 = RequestParams::MockS3((request_path_bucket2, request_path_keys2)); 307 | 308 | let storage_manager_1 = storage_manager.clone(); 309 | let request_data1_1 = request_data1.clone(); 310 | let get_data_fut_1 = 311 | tokio::spawn(async move { storage_manager_1.get_data(request_data1_1).await }); 312 | 313 | let storage_manager_2 = storage_manager.clone(); 314 | let request_data1_2 = request_data1.clone(); 315 | let get_data_fut_2 = 316 | tokio::spawn(async move { 
storage_manager_2.get_data(request_data1_2).await }); 317 | 318 | let storage_manager_3 = storage_manager.clone(); 319 | let request_data2_3 = request_data2.clone(); 320 | let get_data_fut_3 = 321 | tokio::spawn(async move { storage_manager_3.get_data(request_data2_3).await }); 322 | 323 | let storage_manager_4 = storage_manager.clone(); 324 | let request_data1_4 = request_data1.clone(); 325 | let get_data_fut_4 = 326 | tokio::spawn(async move { storage_manager_4.get_data(request_data1_4).await }); 327 | 328 | let result = join!( 329 | get_data_fut_1, 330 | get_data_fut_2, 331 | get_data_fut_3, 332 | get_data_fut_4 333 | ); 334 | assert!(result.0.is_ok()); 335 | assert_eq!(consume_receiver(result.0.unwrap().unwrap()).await, 113629); 336 | assert!(result.1.is_ok()); 337 | assert_eq!(consume_receiver(result.1.unwrap().unwrap()).await, 113629); 338 | assert!(result.2.is_ok()); 339 | assert_eq!(consume_receiver(result.2.unwrap().unwrap()).await, 112193); 340 | assert!(result.3.is_ok()); 341 | assert_eq!(consume_receiver(result.3.unwrap().unwrap()).await, 113629); 342 | } 343 | 344 | #[tokio::test] 345 | async fn test_storage_manager_parallel_2() { 346 | let disk_cache = LruReplacer::new(1000000); 347 | let mem_cache = LruReplacer::new(120000); 348 | 349 | let tmp = tempfile::tempdir().unwrap(); 350 | let disk_cache_base_path = tmp.path().to_owned(); 351 | 352 | let data_store_cache = MemDiskStoreCache::new( 353 | disk_cache, 354 | disk_cache_base_path.display().to_string(), 355 | Some(mem_cache), 356 | Some(120000), 357 | 100 * 1024 * 1024, 358 | ); 359 | let storage_manager = Arc::new(StorageManagerImpl::new(vec![data_store_cache])); 360 | 361 | let request_path_bucket1 = "tests-parquet".to_string(); 362 | let request_path_keys1 = vec!["userdata2.parquet".to_string()]; 363 | let request_data1 = RequestParams::MockS3((request_path_bucket1, request_path_keys1)); 364 | 365 | let request_path_bucket2 = "tests-parquet".to_string(); 366 | let request_path_keys2 = vec!["userdata1.parquet".to_string()]; 367 | let request_data2 = RequestParams::MockS3((request_path_bucket2, request_path_keys2)); 368 | 369 | let mut start_time = Instant::now(); 370 | 371 | let storage_manager_1 = storage_manager.clone(); 372 | let request_data1_1 = request_data1.clone(); 373 | let get_data_fut_1 = 374 | tokio::spawn(async move { storage_manager_1.get_data(request_data1_1).await }); 375 | 376 | let storage_manager_2 = storage_manager.clone(); 377 | let request_data1_2 = request_data1.clone(); 378 | let get_data_fut_2 = 379 | tokio::spawn(async move { storage_manager_2.get_data(request_data1_2).await }); 380 | 381 | let storage_manager_3 = storage_manager.clone(); 382 | let request_data2_3 = request_data2.clone(); 383 | let get_data_fut_3 = 384 | tokio::spawn(async move { storage_manager_3.get_data(request_data2_3).await }); 385 | 386 | let storage_manager_4 = storage_manager.clone(); 387 | let request_data2_4 = request_data2.clone(); 388 | let get_data_fut_4 = 389 | tokio::spawn(async move { storage_manager_4.get_data(request_data2_4).await }); 390 | 391 | let storage_manager_5 = storage_manager.clone(); 392 | let request_data1_5 = request_data1.clone(); 393 | let get_data_fut_5 = 394 | tokio::spawn(async move { storage_manager_5.get_data(request_data1_5).await }); 395 | 396 | let result = join!( 397 | get_data_fut_1, 398 | get_data_fut_2, 399 | get_data_fut_3, 400 | get_data_fut_4, 401 | get_data_fut_5 402 | ); 403 | assert!(result.0.is_ok()); 404 | assert_eq!(consume_receiver(result.0.unwrap().unwrap()).await, 
112193); 405 | assert!(result.1.is_ok()); 406 | assert_eq!(consume_receiver(result.1.unwrap().unwrap()).await, 112193); 407 | assert!(result.2.is_ok()); 408 | assert_eq!(consume_receiver(result.2.unwrap().unwrap()).await, 113629); 409 | assert!(result.3.is_ok()); 410 | assert_eq!(consume_receiver(result.3.unwrap().unwrap()).await, 113629); 411 | assert!(result.4.is_ok()); 412 | assert_eq!(consume_receiver(result.4.unwrap().unwrap()).await, 112193); 413 | 414 | let delta_time_miss = Instant::now() - start_time; 415 | 416 | start_time = Instant::now(); 417 | 418 | let storage_manager_1 = storage_manager.clone(); 419 | let request_data2_1 = request_data2.clone(); 420 | let get_data_fut_1 = 421 | tokio::spawn(async move { storage_manager_1.get_data(request_data2_1).await }); 422 | 423 | let storage_manager_2 = storage_manager.clone(); 424 | let request_data1_2 = request_data1.clone(); 425 | let get_data_fut_2 = 426 | tokio::spawn(async move { storage_manager_2.get_data(request_data1_2).await }); 427 | 428 | let storage_manager_3 = storage_manager.clone(); 429 | let request_data2_3 = request_data2.clone(); 430 | let get_data_fut_3 = 431 | tokio::spawn(async move { storage_manager_3.get_data(request_data2_3).await }); 432 | 433 | let storage_manager_4 = storage_manager.clone(); 434 | let request_data1_4 = request_data1.clone(); 435 | let get_data_fut_4 = 436 | tokio::spawn(async move { storage_manager_4.get_data(request_data1_4).await }); 437 | 438 | let storage_manager_5 = storage_manager.clone(); 439 | let request_data1_5 = request_data1.clone(); 440 | let get_data_fut_5 = 441 | tokio::spawn(async move { storage_manager_5.get_data(request_data1_5).await }); 442 | 443 | let result = join!( 444 | get_data_fut_1, 445 | get_data_fut_2, 446 | get_data_fut_3, 447 | get_data_fut_4, 448 | get_data_fut_5 449 | ); 450 | assert!(result.0.is_ok()); 451 | assert_eq!(consume_receiver(result.0.unwrap().unwrap()).await, 113629); 452 | assert!(result.1.is_ok()); 453 | assert_eq!(consume_receiver(result.1.unwrap().unwrap()).await, 112193); 454 | assert!(result.2.is_ok()); 455 | assert_eq!(consume_receiver(result.2.unwrap().unwrap()).await, 113629); 456 | assert!(result.3.is_ok()); 457 | assert_eq!(consume_receiver(result.3.unwrap().unwrap()).await, 112193); 458 | assert!(result.4.is_ok()); 459 | assert_eq!(consume_receiver(result.4.unwrap().unwrap()).await, 112193); 460 | 461 | let delta_time_hit = Instant::now() - start_time; 462 | 463 | println!( 464 | "For parallel test 2, delta time miss: {:?}, delta time miss: {:?}", 465 | delta_time_miss, delta_time_hit 466 | ); 467 | assert!(delta_time_miss > delta_time_hit); 468 | } 469 | 470 | #[tokio::test] 471 | async fn test_fanout_cache() { 472 | let data_store_cache_num = 6; 473 | let mut data_store_caches = Vec::new(); 474 | for _ in 0..data_store_cache_num { 475 | let disk_cache = LruReplacer::new(1000000); 476 | let mem_cache = LruReplacer::new(120000); 477 | 478 | let tmp = tempfile::tempdir().unwrap(); 479 | let disk_cache_base_path = tmp.path().to_owned(); 480 | 481 | let data_store_cache = MemDiskStoreCache::new( 482 | disk_cache, 483 | disk_cache_base_path.display().to_string(), 484 | Some(mem_cache), 485 | Some(120000), 486 | 100 * 1024 * 1024, 487 | ); 488 | data_store_caches.push(data_store_cache); 489 | } 490 | let storage_manager = Arc::new(StorageManagerImpl::new(data_store_caches)); 491 | 492 | let request_path_bucket1 = "tests-parquet".to_string(); 493 | let request_path_keys1 = vec!["userdata1.parquet".to_string()]; 494 | let request_data1 = 
RequestParams::MockS3((request_path_bucket1, request_path_keys1)); 495 | 496 | let result = storage_manager.get_data(request_data1.clone()).await; 497 | assert!(result.is_ok()); 498 | assert_eq!(consume_receiver(result.unwrap()).await, 113629); 499 | let request_path_bucket2 = "tests-parquet".to_string(); 500 | let request_path_keys2 = vec!["userdata2.parquet".to_string()]; 501 | let request_data2 = RequestParams::MockS3((request_path_bucket2, request_path_keys2)); 502 | let result = storage_manager.get_data(request_data2.clone()).await; 503 | assert!(result.is_ok()); 504 | assert_eq!(consume_receiver(result.unwrap()).await, 112193); 505 | 506 | let request_path_bucket3 = "tests-text".to_string(); 507 | let request_path_keys3: Vec = vec!["what-can-i-hold-you-with".to_string()]; 508 | let request_data3 = RequestParams::MockS3((request_path_bucket3, request_path_keys3)); 509 | let result = storage_manager.get_data(request_data3.clone()).await; 510 | assert!(result.is_ok()); 511 | assert_eq!(consume_receiver(result.unwrap()).await, 930); 512 | 513 | let request_path_bucket4 = "tests-parquet".to_string(); 514 | let request_path_keys4: Vec = vec!["small_random_data.parquet".to_string()]; 515 | let request_data4 = RequestParams::MockS3((request_path_bucket4, request_path_keys4)); 516 | let result = storage_manager.get_data(request_data4.clone()).await; 517 | assert!(result.is_ok()); 518 | assert_eq!(consume_receiver(result.unwrap()).await, 2013); 519 | } 520 | 521 | #[tokio::test] 522 | async fn test_fanout_cach_parallel() { 523 | let data_store_cache_num = 6; 524 | let mut data_store_caches = Vec::new(); 525 | for _ in 0..data_store_cache_num { 526 | let disk_cache = LruReplacer::new(1000000); 527 | let mem_cache = LruReplacer::new(120000); 528 | 529 | let tmp = tempfile::tempdir().unwrap(); 530 | let disk_cache_base_path = tmp.path().to_owned(); 531 | 532 | let data_store_cache = MemDiskStoreCache::new( 533 | disk_cache, 534 | disk_cache_base_path.display().to_string(), 535 | Some(mem_cache), 536 | Some(120000), 537 | 100 * 1024 * 1024, 538 | ); 539 | data_store_caches.push(data_store_cache); 540 | } 541 | let storage_manager = Arc::new(StorageManagerImpl::new(data_store_caches)); 542 | 543 | let request_path_bucket1 = "tests-parquet".to_string(); 544 | let request_path_keys1 = vec!["userdata2.parquet".to_string()]; 545 | let request_data1 = RequestParams::MockS3((request_path_bucket1, request_path_keys1)); 546 | 547 | let request_path_bucket2 = "tests-parquet".to_string(); 548 | let request_path_keys2 = vec!["userdata1.parquet".to_string()]; 549 | let request_data2 = RequestParams::MockS3((request_path_bucket2, request_path_keys2)); 550 | 551 | let request_path_bucket3 = "tests-text".to_string(); 552 | let request_path_keys3 = vec!["what-can-i-hold-you-with".to_string()]; 553 | let request_data3 = RequestParams::MockS3((request_path_bucket3, request_path_keys3)); 554 | 555 | let storage_manager_1 = storage_manager.clone(); 556 | let request_data1_1 = request_data1.clone(); 557 | let get_data_fut_1 = 558 | tokio::spawn(async move { storage_manager_1.get_data(request_data1_1).await }); 559 | 560 | let storage_manager_2 = storage_manager.clone(); 561 | let request_data1_2 = request_data1.clone(); 562 | let get_data_fut_2 = 563 | tokio::spawn(async move { storage_manager_2.get_data(request_data1_2).await }); 564 | 565 | let storage_manager_3 = storage_manager.clone(); 566 | let request_data3_3 = request_data3.clone(); 567 | let get_data_fut_3 = 568 | tokio::spawn(async move { 
storage_manager_3.get_data(request_data3_3).await }); 569 | 570 | let storage_manager_4 = storage_manager.clone(); 571 | let request_data2_4 = request_data2.clone(); 572 | let get_data_fut_4 = 573 | tokio::spawn(async move { storage_manager_4.get_data(request_data2_4).await }); 574 | 575 | let storage_manager_5 = storage_manager.clone(); 576 | let request_data2_5 = request_data2.clone(); 577 | let get_data_fut_5 = 578 | tokio::spawn(async move { storage_manager_5.get_data(request_data2_5).await }); 579 | 580 | let storage_manager_6 = storage_manager.clone(); 581 | let request_data1_6 = request_data1.clone(); 582 | let get_data_fut_6 = 583 | tokio::spawn(async move { storage_manager_6.get_data(request_data1_6).await }); 584 | 585 | let storage_manager_7 = storage_manager.clone(); 586 | let request_data3_7 = request_data3.clone(); 587 | let get_data_fut_7 = 588 | tokio::spawn(async move { storage_manager_7.get_data(request_data3_7).await }); 589 | 590 | let result = join!( 591 | get_data_fut_1, 592 | get_data_fut_2, 593 | get_data_fut_3, 594 | get_data_fut_4, 595 | get_data_fut_5, 596 | get_data_fut_6, 597 | get_data_fut_7 598 | ); 599 | assert!(result.0.is_ok()); 600 | assert_eq!(consume_receiver(result.0.unwrap().unwrap()).await, 112193); 601 | assert!(result.1.is_ok()); 602 | assert_eq!(consume_receiver(result.1.unwrap().unwrap()).await, 112193); 603 | assert!(result.2.is_ok()); 604 | assert_eq!(consume_receiver(result.2.unwrap().unwrap()).await, 930); 605 | assert!(result.3.is_ok()); 606 | assert_eq!(consume_receiver(result.3.unwrap().unwrap()).await, 113629); 607 | assert!(result.4.is_ok()); 608 | assert_eq!(consume_receiver(result.4.unwrap().unwrap()).await, 113629); 609 | assert!(result.5.is_ok()); 610 | assert_eq!(consume_receiver(result.5.unwrap().unwrap()).await, 112193); 611 | assert!(result.6.is_ok()); 612 | assert_eq!(consume_receiver(result.6.unwrap().unwrap()).await, 930); 613 | } 614 | 615 | #[tokio::test] 616 | async fn test_evict_disk() { 617 | let disk_cache = LruReplacer::new(120000); 618 | 619 | let tmp = tempfile::tempdir().unwrap(); 620 | let disk_cache_base_path = tmp.path().to_owned(); 621 | 622 | let data_store_cache = MemDiskStoreCache::new( 623 | disk_cache, 624 | disk_cache_base_path.display().to_string(), 625 | None, 626 | None, 627 | 100 * 1024 * 1024, 628 | ); 629 | let storage_manager = Arc::new(StorageManagerImpl::new(vec![data_store_cache])); 630 | 631 | let request_path_bucket1 = "tests-parquet".to_string(); 632 | let request_path_keys1 = vec!["userdata2.parquet".to_string()]; 633 | let request_data1 = RequestParams::MockS3((request_path_bucket1, request_path_keys1)); 634 | 635 | let request_path_bucket2 = "tests-parquet".to_string(); 636 | let request_path_keys2 = vec!["userdata1.parquet".to_string()]; 637 | let request_data2 = RequestParams::MockS3((request_path_bucket2, request_path_keys2)); 638 | 639 | let res1 = storage_manager.get_data(request_data1.clone()).await; 640 | assert!(res1.is_ok()); 641 | assert_eq!(consume_receiver(res1.unwrap()).await, 112193); 642 | let res2 = storage_manager.get_data(request_data2.clone()).await; 643 | assert!(res2.is_ok()); 644 | assert_eq!(consume_receiver(res2.unwrap()).await, 113629); 645 | let res3 = storage_manager.get_data(request_data1.clone()).await; 646 | assert!(res3.is_ok()); 647 | assert_eq!(consume_receiver(res3.unwrap()).await, 112193); 648 | } 649 | 650 | #[tokio::test] 651 | async fn test_evict_mem() { 652 | let disk_cache = LruReplacer::new(10); 653 | let mem_cache = LruReplacer::new(120000); 654 | 
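        // Each userdata parquet file is roughly 110 KB, so the 120 KB memory cache can
        // hold only one of them at a time; fetching the second file forces the first
        // one out of memory.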
655 | let tmp = tempfile::tempdir().unwrap(); 656 | let disk_cache_base_path = tmp.path().to_owned(); 657 | 658 | let data_store_cache = MemDiskStoreCache::new( 659 | disk_cache, 660 | disk_cache_base_path.display().to_string(), 661 | Some(mem_cache), 662 | Some(120000), 663 | 100 * 1024 * 1024, 664 | ); 665 | let storage_manager = Arc::new(StorageManagerImpl::new(vec![data_store_cache])); 666 | 667 | let request_path_bucket1 = "tests-parquet".to_string(); 668 | let request_path_keys1 = vec!["userdata2.parquet".to_string()]; 669 | let request_data1 = RequestParams::MockS3((request_path_bucket1, request_path_keys1)); 670 | 671 | let request_path_bucket2 = "tests-parquet".to_string(); 672 | let request_path_keys2 = vec!["userdata1.parquet".to_string()]; 673 | let request_data2 = RequestParams::MockS3((request_path_bucket2, request_path_keys2)); 674 | 675 | let res1 = storage_manager.get_data(request_data1.clone()).await; 676 | assert!(res1.is_ok()); 677 | assert_eq!(consume_receiver(res1.unwrap()).await, 112193); 678 | let res2 = storage_manager.get_data(request_data2.clone()).await; 679 | assert!(res2.is_ok()); 680 | assert_eq!(consume_receiver(res2.unwrap()).await, 113629); 681 | let res3 = storage_manager.get_data(request_data1.clone()).await; 682 | assert!(res3.is_ok()); 683 | assert_eq!(consume_receiver(res3.unwrap()).await, 112193); 684 | } 685 | } 686 | -------------------------------------------------------------------------------- /storage-node/src/storage_reader/mod.rs: -------------------------------------------------------------------------------- 1 | use async_trait::async_trait; 2 | use bytes::Bytes; 3 | use futures::stream::BoxStream; 4 | 5 | use crate::{error::ParpulseResult, storage_manager::ParpulseReaderIterator}; 6 | 7 | pub mod s3; 8 | // TODO: We can use `use mockall::automock;` to mock s3. 9 | // (https://docs.aws.amazon.com/sdk-for-rust/latest/dg/testing.html) 10 | // pub mod s3_automock; 11 | pub mod s3_diskmock; 12 | 13 | pub trait SyncStorageReader { 14 | type ReaderIterator: ParpulseReaderIterator; 15 | fn read_all(&self) -> ParpulseResult; 16 | fn into_iterator(self) -> ParpulseResult; 17 | } 18 | 19 | /// [`StorageReaderStream`] is a stream of data read from the underlying storage. 20 | /// Each storage reader should implement `Stream` trait to provide a stream of data. 21 | pub type StorageReaderStream = BoxStream<'static, ParpulseResult>; 22 | 23 | // TODO: Merge `StorageReader` with `AsyncStorageReader`. 24 | #[async_trait] 25 | pub trait AsyncStorageReader { 26 | /// Read all data at once from the underlying storage. 27 | /// 28 | /// NEVER call this method if you do not know the size of the data -- collecting 29 | /// all data into one buffer might lead to OOM. 30 | async fn read_all(&self) -> ParpulseResult>; 31 | 32 | /// Read data from the underlying storage as a stream. 
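    ///
    /// A consumption sketch (illustrative only; `reader` stands for any implementor of
    /// this trait, and `futures::StreamExt` is assumed to be in scope):
    ///
    /// ```ignore
    /// let mut stream = reader.into_stream().await?;
    /// while let Some(chunk) = stream.next().await {
    ///     let bytes = chunk?; // `Bytes` read from the underlying storage
    /// }
    /// ```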
33 | async fn into_stream(self) -> ParpulseResult; 34 | } 35 | -------------------------------------------------------------------------------- /storage-node/src/storage_reader/s3.rs: -------------------------------------------------------------------------------- 1 | use std::{ 2 | pin::Pin, 3 | task::{Context, Poll}, 4 | }; 5 | 6 | use async_trait::async_trait; 7 | use aws_config::{meta::region::RegionProviderChain, BehaviorVersion}; 8 | use aws_sdk_s3::{ 9 | operation::get_object::{GetObjectError, GetObjectOutput}, 10 | primitives::ByteStream, 11 | Client, 12 | }; 13 | use aws_smithy_runtime_api::{client::result::SdkError, http::Response}; 14 | use bytes::Bytes; 15 | use futures::{future::BoxFuture, ready, FutureExt, Stream}; 16 | 17 | use crate::error::{ParpulseError, ParpulseResult}; 18 | 19 | use super::{AsyncStorageReader, StorageReaderStream}; 20 | 21 | /// [`S3Reader`] is a reader for retrieving data from S3. It can either read the 22 | /// data once at all or read the data in an asynchronous stream. 23 | pub struct S3Reader { 24 | client: Client, 25 | bucket: String, 26 | keys: Vec, 27 | } 28 | 29 | impl S3Reader { 30 | pub async fn new(bucket: String, keys: Vec) -> Self { 31 | let region_provider = RegionProviderChain::default_provider().or_else("us-east-1"); 32 | let config = aws_config::defaults(BehaviorVersion::latest()) 33 | .region(region_provider) 34 | .load() 35 | .await; 36 | let client = Client::new(&config); 37 | Self { 38 | client, 39 | bucket, 40 | keys, 41 | } 42 | } 43 | 44 | pub async fn get_object_size(&self) -> ParpulseResult { 45 | let mut size = 0; 46 | for key in &self.keys { 47 | let obj = self 48 | .client 49 | .head_object() 50 | .bucket(&self.bucket) 51 | .key(key) 52 | .send() 53 | .await?; 54 | size += obj 55 | .content_length 56 | .map(|l| l as usize) 57 | .ok_or_else(|| ParpulseError::S3("fail to get object size".into()))?; 58 | } 59 | Ok(size) 60 | } 61 | } 62 | 63 | /// [`S3DataStream`] is a stream for reading data from S3. It reads the data in 64 | /// chunks and returns the data in a stream. Currently it uses non-fixed buffer, 65 | /// which means it will be consumed and extended. 66 | /// 67 | /// If we want to use fixed buffer for benchmark, we can add self.last_read_size and 68 | /// self.current_buffer_pos. 
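///
/// Typical usage goes through [`S3Reader::into_stream`]; the sketch below is
/// illustrative only (the bucket and key names are placeholders):
///
/// ```ignore
/// let reader = S3Reader::new("some-bucket".to_string(), vec!["some-key".to_string()]).await;
/// let mut stream = reader.into_stream().await?;
/// while let Some(chunk) = stream.next().await {
///     let bytes = chunk?;
///     // consume `bytes`
/// }
/// ```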
69 | pub struct S3ReaderStream { 70 | client: Client, 71 | bucket: String, 72 | keys: Vec, 73 | current_key: usize, 74 | 75 | object_fut: 76 | Option>>>, 77 | object_body: Option, 78 | } 79 | 80 | impl S3ReaderStream { 81 | pub fn new(client: Client, bucket: String, keys: Vec) -> Self { 82 | assert!(!keys.is_empty(), "keys should not be empty"); 83 | Self { 84 | client, 85 | bucket, 86 | keys, 87 | current_key: 0, 88 | object_fut: None, 89 | object_body: None, 90 | } 91 | } 92 | } 93 | 94 | impl Stream for S3ReaderStream { 95 | type Item = ParpulseResult; 96 | 97 | fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context) -> Poll> { 98 | if let Some(object_fut) = self.object_fut.as_mut() { 99 | match ready!(object_fut.poll_unpin(cx)) { 100 | Ok(object) => { 101 | self.object_fut.take(); 102 | self.object_body = Some(object.body); 103 | self.poll_next(cx) 104 | } 105 | Err(e) => Poll::Ready(Some(Err(ParpulseError::from(e)))), 106 | } 107 | } else if let Some(object_body) = self.object_body.as_mut() { 108 | let poll_result = object_body.try_next().boxed().poll_unpin(cx); 109 | match poll_result { 110 | Poll::Ready(ready_result) => match ready_result { 111 | Ok(Some(bytes)) => Poll::Ready(Some(Ok(bytes))), 112 | Ok(None) => { 113 | self.object_body = None; 114 | self.poll_next(cx) 115 | } 116 | Err(e) => Poll::Ready(Some(Err(ParpulseError::from(e)))), 117 | }, 118 | Poll::Pending => Poll::Pending, 119 | } 120 | } else if self.current_key >= self.keys.len() { 121 | // No more data to read in S3. 122 | Poll::Ready(None) 123 | } else { 124 | // There are more files to read in S3. Fetch the next object. 125 | let fut = self 126 | .client 127 | .get_object() 128 | .bucket(&self.bucket) 129 | .key(&self.keys[self.current_key]) 130 | .send() 131 | .boxed(); 132 | self.object_fut = Some(fut); 133 | self.current_key += 1; 134 | self.poll_next(cx) 135 | } 136 | } 137 | } 138 | 139 | #[async_trait] 140 | impl AsyncStorageReader for S3Reader { 141 | /// NEVER call this method if you do not know the size of the data -- collecting 142 | /// all data into one buffer might lead to OOM. 143 | async fn read_all(&self) -> ParpulseResult> { 144 | let mut bytes_vec = Vec::with_capacity(self.keys.len()); 145 | for key in &self.keys { 146 | let object = self 147 | .client 148 | .get_object() 149 | .bucket(&self.bucket) 150 | .key(key) 151 | .send() 152 | .await 153 | .map_err(ParpulseError::from)?; 154 | bytes_vec.push( 155 | object 156 | .body 157 | .collect() 158 | .await 159 | .map_err(ParpulseError::from)? 
160 | .into_bytes(), 161 | ); 162 | } 163 | Ok(bytes_vec) 164 | } 165 | 166 | async fn into_stream(self) -> ParpulseResult { 167 | let s3_stream = S3ReaderStream::new(self.client, self.bucket, self.keys); 168 | Ok(Box::pin(s3_stream)) 169 | } 170 | } 171 | 172 | #[cfg(test)] 173 | mod tests { 174 | use futures::StreamExt; 175 | 176 | use super::*; 177 | 178 | #[tokio::test] 179 | async fn test_s3_read_all() { 180 | let bucket = "parpulse-test".to_string(); 181 | let keys = vec!["userdata/userdata1.parquet".to_string()]; 182 | let reader = S3Reader::new(bucket, keys).await; 183 | let bytes = reader.read_all().await.unwrap(); 184 | assert_eq!(bytes[0].len(), 113629); 185 | } 186 | 187 | #[tokio::test] 188 | async fn test_s3_read_streaming() { 189 | let bucket = "parpulse-test".to_string(); 190 | let keys = vec![ 191 | "userdata/userdata1.parquet".to_string(), 192 | "userdata/userdata2.parquet".to_string(), 193 | "userdata/userdata3.parquet".to_string(), 194 | "userdata/userdata4.parquet".to_string(), 195 | "userdata/userdata5.parquet".to_string(), 196 | ]; 197 | 198 | let reader = S3Reader::new(bucket, keys).await; 199 | let mut s3_stream = reader.into_stream().await.unwrap(); 200 | 201 | let mut streaming_total_bytes = 0; 202 | while let Some(data) = s3_stream.next().await { 203 | let data = data.unwrap(); 204 | streaming_total_bytes += data.len(); 205 | } 206 | assert_eq!(streaming_total_bytes, 565545); 207 | } 208 | 209 | #[tokio::test] 210 | async fn test_s3_get_object_size() { 211 | let bucket = "parpulse-test".to_string(); 212 | let keys = vec![ 213 | "userdata/userdata1.parquet".to_string(), 214 | "userdata/userdata2.parquet".to_string(), 215 | "userdata/userdata3.parquet".to_string(), 216 | "userdata/userdata4.parquet".to_string(), 217 | "userdata/userdata5.parquet".to_string(), 218 | ]; 219 | 220 | let reader = S3Reader::new(bucket, keys).await; 221 | let size = reader.get_object_size().await.unwrap(); 222 | assert_eq!(size, 565545); 223 | } 224 | } 225 | -------------------------------------------------------------------------------- /storage-node/src/storage_reader/s3_diskmock.rs: -------------------------------------------------------------------------------- 1 | use std::{ 2 | env, 3 | pin::Pin, 4 | task::{Context, Poll}, 5 | thread, 6 | time::Duration, 7 | }; 8 | 9 | use async_trait::async_trait; 10 | use bytes::Bytes; 11 | use futures::{ready, Stream, StreamExt}; 12 | 13 | use crate::{ 14 | disk::{disk_manager::DiskManager, stream::RandomDiskReadStream}, 15 | error::ParpulseResult, 16 | }; 17 | 18 | use super::{AsyncStorageReader, StorageReaderStream}; 19 | 20 | const DELAY: Option = Some(Duration::from_millis(1)); 21 | const MIN_DISK_READ_SIZE: usize = 1024 * 512; 22 | const MAX_DISK_READ_SIZE: usize = 1024 * 1024; 23 | 24 | /// Please DON'T use `MockS3Reader` to test performance, only use it to 25 | /// test the correctness!!! 26 | /// There is no chunksize in `MockS3Reader`. 27 | /// If we want to make big change to s3.rs, please also change s3_diskmock.rs 28 | /// TODO: We can also use automock to mock s3. (so there is no need to manually sync changes) 29 | pub struct MockS3Reader { 30 | file_paths: Vec, 31 | disk_manager: DiskManager, 32 | } 33 | 34 | impl MockS3Reader { 35 | // Async here is to be consistent with S3Reader. 36 | pub async fn new(bucket: String, keys: Vec) -> Self { 37 | // Get the absolute path instead of relative path. 
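        // For example, when the tests run from a direct child of the repository root
        // (such as the `tests` crate directory), a request for bucket "tests-parquet"
        // and key "userdata1.parquet" resolves to
        // `<repo>/storage-node/tests/parquet/userdata1.parquet`.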
38 | let base_path = env::current_dir() 39 | .ok() 40 | .and_then(|current_path| { 41 | current_path 42 | .parent() 43 | .map(|root_path| root_path.join("storage-node")) 44 | }) 45 | .and_then(|joined_path| joined_path.to_str().map(|s| s.to_string())) 46 | .unwrap_or_default(); 47 | 48 | let file_paths: Vec = keys 49 | .iter() 50 | .map(|key| format!("{}/{}/{}", base_path, bucket.replace('-', "/"), key)) 51 | .collect(); 52 | MockS3Reader { 53 | file_paths, 54 | disk_manager: DiskManager::default(), 55 | } 56 | } 57 | 58 | pub async fn get_object_size(&self) -> ParpulseResult { 59 | let mut size = 0; 60 | for file_path in &self.file_paths { 61 | size += self.disk_manager.file_size(file_path).await? as usize; 62 | } 63 | Ok(size) 64 | } 65 | } 66 | 67 | pub struct MockS3ReaderStream { 68 | current_disk_stream: Option>>, 69 | file_paths: Vec, 70 | current_key: usize, 71 | } 72 | 73 | impl MockS3ReaderStream { 74 | pub fn new(file_paths: Vec) -> Self { 75 | MockS3ReaderStream { 76 | current_disk_stream: None, 77 | file_paths, 78 | current_key: 0, 79 | } 80 | } 81 | } 82 | 83 | impl Stream for MockS3ReaderStream { 84 | type Item = ParpulseResult; 85 | 86 | fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context) -> Poll> { 87 | if let Some(current_disk_stream) = self.current_disk_stream.as_mut() { 88 | match ready!(current_disk_stream.poll_next_unpin(cx)) { 89 | Some(Ok(bytes)) => { 90 | if let Some(delay) = DELAY { 91 | thread::sleep(delay); 92 | } 93 | Poll::Ready(Some(Ok(bytes))) 94 | } 95 | Some(Err(e)) => Poll::Ready(Some(Err(e))), 96 | None => { 97 | self.current_key += 1; 98 | self.current_disk_stream.take(); 99 | self.poll_next(cx) 100 | } 101 | } 102 | } else { 103 | // We need to create a new disk_stream since there is no last disk_stream, or it has 104 | // been consumed. 
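            // Once every file path has been streamed we end the stream; otherwise we
            // open a `RandomDiskReadStream` for the next file and immediately poll it
            // again through the recursive `poll_next` call below.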
105 |             if self.current_key >= self.file_paths.len() {
106 |                 return Poll::Ready(None);
107 |             }
108 |             let file_path = self.file_paths[self.current_key].clone();
109 |             match RandomDiskReadStream::new(&file_path, MIN_DISK_READ_SIZE, MAX_DISK_READ_SIZE) {
110 |                 Ok(disk_stream) => {
111 |                     self.current_disk_stream = Some(Box::pin(disk_stream));
112 |                 }
113 |                 Err(e) => return Poll::Ready(Some(Err(e))),
114 |             }
115 |             self.poll_next(cx)
116 |         }
117 |     }
118 | }
119 |
120 | #[async_trait]
121 | impl AsyncStorageReader for MockS3Reader {
122 |     async fn read_all(&self) -> ParpulseResult<Vec<Bytes>> {
123 |         let mut bytes_vec = Vec::with_capacity(self.file_paths.len());
124 |         for file_path in &self.file_paths {
125 |             let (_, data) = self.disk_manager.read_disk_all(file_path).await?;
126 |             bytes_vec.push(data);
127 |         }
128 |         Ok(bytes_vec)
129 |     }
130 |
131 |     async fn into_stream(self) -> ParpulseResult<StorageReaderStream> {
132 |         Ok(Box::pin(MockS3ReaderStream::new(self.file_paths)))
133 |     }
134 | }
135 |
136 | #[cfg(test)]
137 | mod tests {
138 |     use super::*;
139 |     #[tokio::test]
140 |     async fn test_simple_write_read() {
141 |         let bucket = "tests-parquet".to_string();
142 |         let keys = vec![
143 |             "userdata1.parquet".to_string(),
144 |             "userdata2.parquet".to_string(),
145 |         ];
146 |         let reader = MockS3Reader::new(bucket, keys).await;
147 |         let bytes = reader.read_all().await.unwrap();
148 |         assert_eq!(bytes[0].len() + bytes[1].len(), 113629 + 112193);
149 |     }
150 |
151 |     #[tokio::test]
152 |     async fn test_mock_s3_read_streaming() {
153 |         let bucket = "tests-parquet".to_string();
154 |         let keys = vec![
155 |             "userdata1.parquet".to_string(),
156 |             "userdata2.parquet".to_string(),
157 |         ];
158 |
159 |         let reader = MockS3Reader::new(bucket, keys).await;
160 |         let mut s3_stream = reader.into_stream().await.unwrap();
161 |
162 |         let mut streaming_total_bytes = 0;
163 |         while let Some(data) = s3_stream.next().await {
164 |             let data = data.unwrap();
165 |             streaming_total_bytes += data.len();
166 |         }
167 |         assert_eq!(streaming_total_bytes, 113629 + 112193);
168 |     }
169 | }
170 |
--------------------------------------------------------------------------------
/storage-node/tests/parquet/small_random_data.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cmu-db/15721-s24-cache1/b4e2bc8f2c2fc3ab7a9b9fa3f8e864e25e9c8c40/storage-node/tests/parquet/small_random_data.parquet
--------------------------------------------------------------------------------
/storage-node/tests/parquet/userdata1.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cmu-db/15721-s24-cache1/b4e2bc8f2c2fc3ab7a9b9fa3f8e864e25e9c8c40/storage-node/tests/parquet/userdata1.parquet
--------------------------------------------------------------------------------
/storage-node/tests/parquet/userdata2.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cmu-db/15721-s24-cache1/b4e2bc8f2c2fc3ab7a9b9fa3f8e864e25e9c8c40/storage-node/tests/parquet/userdata2.parquet
--------------------------------------------------------------------------------
/storage-node/tests/text/what-can-i-hold-you-with:
--------------------------------------------------------------------------------
1 | What can I hold you with?
2 | I offer you lean streets, desperate sunsets, the
3 | moon of the jagged suburbs.
4 | I offer you the bitterness of a man who has looked
5 | long and long at the lonely moon.
6 | I offer you my ancestors, my dead men, the ghosts
7 | that living men have honoured in bronze.
8 | I offer you whatever insight my books may hold,
9 | whatever manliness or humour my life.
10 | I offer you the loyalty of a man who has never
11 | been loyal.
12 | I offer you that kernel of myself that I have saved,
13 | somehow-the central heart that deals not
14 | in words, traffics not with dreams, and is
15 | untouched by time, by joy, by adversities.
16 | I offer you the memory of a yellow rose seen at
17 | sunset, years before you were born.
18 | I offer you explanations of yourself, theories about
19 | yourself, authentic and surprising news of
20 | yourself.
21 | I can give you my loneliness, my darkness, the
22 | hunger of my heart; I am trying to bribe you
23 | with uncertainty, with danger, with defeat.
24 |
--------------------------------------------------------------------------------
/tests/Cargo.toml:
--------------------------------------------------------------------------------
1 | [package]
2 | name = "tests"
3 | version = "0.1.0"
4 | edition = "2021"
5 |
6 | [dependencies]
7 | storage-node = { path = "../storage-node" }
8 | parpulse-client = { path = "../storage-client" }
9 | tokio = { version = "1", features = ["rt", "rt-multi-thread", "macros"] }
10 | arrow = "50.0.0"
11 | log = "0.4"
12 | istziio-client = "0.1"
13 |
14 | [dev-dependencies]
15 | serial_test = "3.1"
16 | env_logger = "0.11"
17 |
--------------------------------------------------------------------------------
/tests/src/client_server_test.rs:
--------------------------------------------------------------------------------
1 | /// This file serves as an integration test for the client and server.
2 | /// WARNING: This test assumes that the data returned from the server is userdata1.parquet.
3 | extern crate parpulse_client;
4 | extern crate storage_node;
5 |
6 | #[cfg(test)]
7 | mod tests {
8 |     use arrow::array::{Float64Array, StringArray};
9 |     use istziio_client::client_api::{DataRequest, StorageClient, StorageRequest};
10 |     use parpulse_client::client::StorageClientImpl;
11 |     use serial_test::serial;
12 |     use std::time::Instant;
13 |     use storage_node::{common::config::ParpulseConfig, server::storage_node_serve};
14 |
15 |     #[test]
16 |     fn setup() {
17 |         let _ = env_logger::builder()
18 |             .filter_level(log::LevelFilter::Info)
19 |             .is_test(true)
20 |             .try_init();
21 |     }
22 |
23 |     #[tokio::test]
24 |     #[serial]
25 |     async fn test_client_server_disk() {
26 |         // The file dir should start from storage-node.
27 |         // Start the server
28 |         let server_handle = tokio::spawn(async move {
29 |             storage_node_serve("127.0.0.1", 3030, ParpulseConfig::default())
30 |                 .await
31 |                 .unwrap();
32 |         });
33 |
34 |         // Give the server some time to start
35 |         tokio::time::sleep(std::time::Duration::from_secs(1)).await;
36 |
37 |         let storage_client =
38 |             StorageClientImpl::new("http://127.0.0.1:3030", "http://127.0.0.1:3031")
39 |                 .expect("Failed to create storage client.");
40 |         let start_time = Instant::now();
41 |         let request = StorageRequest::new(0, DataRequest::Table(0));
42 |         let mut receiver = storage_client
43 |             .request_data_test(request)
44 |             .await
45 |             .expect("Failed to get data from the server.");
46 |         let mut record_batches = vec![];
47 |         while let Some(record_batch) = receiver.recv().await {
48 |             record_batches.push(record_batch);
49 |         }
50 |         println!(
51 |             "Time taken for userdata file on disk: {:?}",
52 |             start_time.elapsed()
53 |         );
54 |         assert!(!record_batches.is_empty());
55 |
56 |         let first_batch = &record_batches[0];
57 |         assert_eq!(first_batch.num_columns(), 13);
58 |
59 |         let real_first_names = StringArray::from(vec!["Amanda", "Albert", "Evelyn"]);
60 |         let real_last_names = StringArray::from(vec!["Jordan", "Freeman", "Morgan"]);
61 |         let first_names = first_batch
62 |             .column(2)
63 |             .as_any()
64 |             .downcast_ref::<StringArray>()
65 |             .unwrap();
66 |         let last_names = first_batch
67 |             .column(3)
68 |             .as_any()
69 |             .downcast_ref::<StringArray>()
70 |             .unwrap();
71 |         // Check the first three entries in the first and last name columns.
72 |         for i in 0..3 {
73 |             assert_eq!(first_names.value(i), real_first_names.value(i));
74 |             assert_eq!(last_names.value(i), real_last_names.value(i));
75 |         }
76 |
77 |         server_handle.abort();
78 |     }
79 |
80 |     #[tokio::test]
81 |     #[serial]
82 |     async fn test_client_server_s3() {
83 |         // Start the server
84 |         let server_handle = tokio::spawn(async move {
85 |             storage_node_serve("127.0.0.1", 3030, ParpulseConfig::default())
86 |                 .await
87 |                 .unwrap();
88 |         });
89 |
90 |         // Give the server some time to start
91 |         tokio::time::sleep(std::time::Duration::from_secs(1)).await;
92 |
93 |         let storage_client =
94 |             StorageClientImpl::new("http://127.0.0.1:3030", "http://127.0.0.1:3031")
95 |                 .expect("Failed to create storage client.");
96 |         let start_time = Instant::now();
97 |         // Requesting random_data_1m_1.parquet
98 |         let request = StorageRequest::new(0, DataRequest::Table(1));
99 |         let mut receiver = storage_client
100 |             .request_data(request)
101 |             .await
102 |             .expect("Failed to get data from the server.");
103 |         let mut record_batches = vec![];
104 |         while let Some(record_batch) = receiver.recv().await {
105 |             record_batches.push(record_batch);
106 |         }
107 |
108 |         println!("Time taken for 1m file: {:?}", start_time.elapsed());
109 |         assert!(!record_batches.is_empty());
110 |
111 |         let first_batch = &record_batches[0];
112 |         assert_eq!(first_batch.num_columns(), 20);
113 |
114 |         // Check the first 5 columns of the first row.
115 |         let real_first_row = [
116 |             0.19195386139992177,
117 |             0.4815442611405789,
118 |             0.47078682326631927,
119 |             0.7793912218913533,
120 |             0.21877220521846885,
121 |         ];
122 |         for (i, &real_value) in real_first_row.iter().enumerate() {
123 |             let column = first_batch
124 |                 .column(i)
125 |                 .as_any()
126 |                 .downcast_ref::<Float64Array>()
127 |                 .unwrap();
128 |             assert_eq!(column.value(0), real_value);
129 |         }
130 |
131 |         server_handle.abort();
132 |     }
133 | }
134 |
--------------------------------------------------------------------------------
/tests/src/lib.rs:
--------------------------------------------------------------------------------
1 | pub mod client_server_test;
2 |
--------------------------------------------------------------------------------
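Aside (not part of the repository files above): the TODO in s3_diskmock.rs suggests using automock so the mock never has to be kept in sync with s3.rs by hand. Below is a minimal sketch of that idea. The trait shape is inferred from the implementations above, only the read path is mocked, and `ParpulseResult` here is a simplified stand-in for the crate's real error alias; treat it as an illustration, not the project's actual code.

// Sketch only: generating a mock reader with mockall's automock, assuming the
// `mockall`, `async-trait`, `bytes`, and `tokio` crates are available.
use async_trait::async_trait;
use bytes::Bytes;
use mockall::automock;

// Hypothetical stand-in for the crate's real `ParpulseResult` alias.
type ParpulseResult<T> = Result<T, Box<dyn std::error::Error + Send + Sync>>;

// `#[automock]` must sit above `#[async_trait]` so mockall sees the async method.
#[automock]
#[async_trait]
pub trait AsyncStorageReader {
    async fn read_all(&self) -> ParpulseResult<Vec<Bytes>>;
}

#[tokio::test]
async fn read_all_returns_canned_bytes() {
    // No S3 or disk access: the expectation supplies the data directly.
    let mut reader = MockAsyncStorageReader::new();
    reader
        .expect_read_all()
        .returning(|| Ok(vec![Bytes::from_static(b"hello")]));
    assert_eq!(reader.read_all().await.unwrap()[0].len(), 5);
}

Because the generated `MockAsyncStorageReader` implements the same trait as the real reader, tests written against the trait cannot silently drift from the S3 implementation the way a hand-maintained mock can.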