├── .github └── workflows │ └── ci.yml ├── .gitignore ├── .gitmodules ├── Cargo.lock ├── Cargo.toml ├── LICENSE ├── Makefile ├── README-ZH.MD ├── README.MD ├── benches ├── block_device │ └── main.rs ├── grpc │ ├── client.rs │ ├── main.rs │ ├── mod.rs │ └── server.rs ├── local_storage │ └── main.rs ├── mod.rs └── rpc │ ├── client.rs │ ├── main.rs │ ├── mod.rs │ └── server.rs ├── build.rs ├── docker ├── client │ └── Dockerfile ├── manager │ └── Dockerfile └── server │ └── Dockerfile ├── docs ├── RDMA.md ├── README-ZH.MD ├── README.MD ├── images │ └── architecture.jpg └── specification.md ├── examples ├── hello_client.rs ├── hello_server.rs ├── manager.yaml ├── rdma_client.rs └── rdma_server.rs ├── intercept ├── Cargo.toml ├── build.rs └── src │ ├── client.rs │ ├── file_desc.rs │ ├── lib.rs │ ├── path.rs │ ├── syscall_intercept.rs │ └── test_log.rs ├── proto └── test.proto ├── scripts ├── add_node.sh ├── close_all_instances.sh ├── delete_node.sh ├── read_files.sh ├── test.sh └── test_run_all.sh ├── src ├── bin │ ├── client.rs │ ├── manager.rs │ └── server.rs ├── client │ ├── daemon.rs │ ├── fuse_client.rs │ └── mod.rs ├── common │ ├── byte.rs │ ├── cache.rs │ ├── errors.rs │ ├── hash_ring.rs │ ├── info_syncer.rs │ ├── mod.rs │ ├── sender.rs │ ├── serialization.rs │ └── util.rs ├── lib.rs ├── manager │ ├── core.rs │ ├── manager_service.rs │ └── mod.rs ├── rpc │ ├── callback.rs │ ├── client.rs │ ├── connection.rs │ ├── mod.rs │ ├── protocol.rs │ ├── rdma │ │ ├── client.rs │ │ ├── mod.rs │ │ └── server.rs │ └── server.rs └── server │ ├── distributed_engine.rs │ ├── mod.rs │ ├── storage_engine │ ├── block_engine │ │ ├── allocator.rs │ │ ├── index.rs │ │ ├── io.rs │ │ └── mod.rs │ ├── file_engine.rs │ ├── meta_engine.rs │ └── mod.rs │ └── transfer_manager.rs └── test_io500.sh /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: Continuous integration 2 | 3 | on: 4 | push: 5 | branches: [ "main" ] 6 | 
paths-ignore: 7 | - "docs/**" 8 | - "**/*.md" 9 | - "**/*.yaml" 10 | pull_request: 11 | branches: [ "main" ] 12 | paths-ignore: 13 | - "docs/**" 14 | - "**/*.md" 15 | - "**/*.yaml" 16 | 17 | env: 18 | CARGO_TERM_COLOR: always 19 | RUST_TOOLCHAIN: 1.68.0 20 | 21 | jobs: 22 | build: 23 | name: Continuous integration 24 | runs-on: self-hosted 25 | steps: 26 | - uses: actions/checkout@v3 27 | with: 28 | submodules: true 29 | - uses: actions-rs/toolchain@v1.0.6 30 | with: 31 | profile: minimal 32 | toolchain: ${{ env.RUST_TOOLCHAIN }} 33 | override: true 34 | 35 | - name: Install dependences 36 | id: install_deps 37 | run: make install_deps 38 | 39 | - name: Copy cache 40 | run: | 41 | mkdir -p /data/action/_work/sealfs/sealfs/target 42 | if [ -d /data/backup/debug ]; then 43 | mv /data/backup/debug /data/action/_work/sealfs/sealfs/target 44 | fi 45 | 46 | - name: Build 47 | id: make-build 48 | run: | 49 | make build 50 | # make build features=mem-db 51 | continue-on-error: true 52 | 53 | - name: Check 54 | id: cargo-check 55 | run: | 56 | cargo check --features=disk-db 57 | # cargo check --features=mem-db 58 | continue-on-error: true 59 | 60 | 61 | - name: Test Suite 62 | id: make-test 63 | run: | 64 | sudo rm -rf /tmp/test* 65 | make test 66 | ./test_io500.sh /data 67 | # make test features=mem-db 68 | continue-on-error: true 69 | 70 | - name: Clippy 71 | id: cargo-clippy 72 | run: | 73 | rustup component add clippy 74 | cargo clippy --features=disk-db -- -D warnings 75 | # cargo clippy --features=mem-db -- -D warnings 76 | continue-on-error: true 77 | 78 | - name: Rustfmt 79 | id: cargo-fmt 80 | run: | 81 | rustup component add rustfmt 82 | cargo fmt --all -- --check 83 | continue-on-error: true 84 | 85 | - if: always() 86 | name: Backup Temporary 87 | run: | 88 | if [ -d /data/action/_work/sealfs/sealfs/target/debug ]; then 89 | if [ -d /data/backup/debug ]; then 90 | rm -rf /data/backup/debug 91 | fi 92 | mv /data/action/_work/sealfs/sealfs/target/debug 
/data/backup 93 | fi 94 | 95 | - if: ${{ steps.install_deps.outcome == 'success' }} 96 | name: Summary 97 | run: | 98 | echo "Build: ${{ steps.make-build.outcome }}" 99 | echo "Check: ${{ steps.cargo-check.outcome }}" 100 | echo "Test: ${{ steps.make-test.outcome }}" 101 | echo "Clippy: ${{ steps.cargo-clippy.outcome }}" 102 | echo "Rustfmt: ${{ steps.cargo-fmt.outcome }}" 103 | if [ "${{ steps.make-build.outcome }}" != "success" ] || [ "${{ steps.cargo-check.outcome }}" != "success" ] || [ "${{ steps.make-test.outcome }}" != "success" ] || [ "${{ steps.cargo-clippy.outcome }}" != "success" ] || [ "${{ steps.cargo-fmt.outcome }}" != "success" ]; then 104 | exit 1 105 | fi -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | /.vscode -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "syscall_intercept"] 2 | path = syscall_intercept 3 | url = https://github.com/pmem/syscall_intercept.git 4 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "sealfs" 3 | version = "0.1.0" 4 | edition = "2021" 5 | authors = ["The Sealfs Developers"] 6 | license = "Apache-2.0" 7 | 8 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 9 | 10 | [dependencies] 11 | tokio = { version = "1.21.2", features = ["full"] } 12 | lazy_static = "1.4.0" 13 | anyhow = "1.0.66" 14 | thiserror = "1.0.37" 15 | log = "0.4.17" 16 | clap = { version = "=4.0.18", features = ["derive"] } 17 | env_logger = "0.9.1" 18 | prost = "0.11.0" 19 | serde = { version = "1", features = ["derive"] } 20 | serde_yaml = "0.9.14" 21 | # tonic-health = 
"0.7.1" 22 | dashmap = "5.4.0" 23 | async-trait = "0.1.73" 24 | nix = "0.26.1" 25 | rocksdb = "0.19.0" 26 | bincode = "1.3.3" 27 | ahash = "0.8.3" 28 | parking_lot = "0.12.1" 29 | fuser = "0.11.1" 30 | libc = "0.2" 31 | wyhash = "0.5.0" 32 | kanal = "0.1.0-pre8" 33 | rand = "0.8.5" 34 | pegasusdb = { git = "https://github.com/uran0sH/pegasusdb.git" } 35 | bytes = "1.4.0" 36 | ibv = { git = "https://github.com/mond77/ibv.git" } 37 | conhash = '0.5.0' 38 | spin = "0.5" 39 | 40 | [build-dependencies] 41 | tonic-build = "0.8" 42 | 43 | [dev-dependencies] 44 | tonic = "0.8.2" 45 | core_affinity = "0.8.0" 46 | criterion = "0.4" 47 | 48 | [[bin]] 49 | name = "client" 50 | path = "src/bin/client.rs" 51 | 52 | [[bin]] 53 | name = "server" 54 | path = "src/bin/server.rs" 55 | 56 | [[bin]] 57 | name = "manager" 58 | path = "src/bin/manager.rs" 59 | 60 | [workspace] 61 | members = [ 62 | "intercept", 63 | ] 64 | 65 | [features] 66 | disk-db = [] 67 | mem-db = [] 68 | 69 | [[bench]] 70 | name = "rpc" 71 | harness = false 72 | 73 | [[bench]] 74 | name = "grpc" 75 | harness = false 76 | 77 | [[bench]] 78 | name = "local_storage" 79 | harness = false 80 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. 
For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | target_dir = target/debug 2 | features := disk-db 3 | flags += --workspace --verbose --features=$(features) 4 | deps = pkg-config protobuf-compiler clang libfuse-dev libcapstone-dev \ 5 | iproute2 perftest build-essential net-tools cmake pandoc \ 6 | libnl-3-dev libnl-route-3-dev libibverbs-dev 7 | 8 | all_release: install_deps release 9 | all_debug: install_deps debug 10 | 11 | install_deps: 12 | sudo apt update && sudo apt-get install $(deps) -y 13 | 14 | release: 15 | cargo build $(flags) --release 16 | 17 | build: 18 | cargo build $(flags) 19 | 20 | test: 21 | cargo test --features=$(features) 22 | 23 | images: manager-image server-image client-image 24 | 25 | manager-image: 26 | docker build -t manager -f docker/manager/Dockerfile . --no-cache 27 | 28 | server-image: 29 | docker build -t server -f docker/server/Dockerfile . 30 | 31 | client-image: 32 | docker build -t client -f docker/client/Dockerfile . 
-------------------------------------------------------------------------------- /README-ZH.MD: -------------------------------------------------------------------------------- 1 | # SEALFS 2 | 3 | [sealos](https://github.com/labring/sealos)的存储底座,我们希望构建一个适用于云原生的高性能,高可靠,弹性伸缩的分布式文件存储 4 | 5 | ## 系统架构 6 | sealfs的架构为无中心架构,且无独立的元数据节点,我们希望尽最大可能提升读写性能和解决存储海量小文件问题 7 | 8 | - server负责文件以及元数据存储,元数据无疑是分布式文件系统的热点文件,所以我们采用分挂盘的方式对数据和元数据进行存储,用户可以选择更好的硬件对元数据进行存储。 9 | - client实现用户态的文件系统,对文件请求进行拦截并通过哈希算法进行存储寻址。 10 | - manager负责协调集群。 11 | 12 | 设计图如下: 13 | ![](docs/images/architecture.jpg) 14 | 15 | ### 全链路用户态 16 | 我们希望结合特定硬件从客户端文件请求劫持到网络到存储打造一个全链路用户态的分布式文件存储系统,从而获得极致的性能体验。 17 | 18 | 更多设计可以参考: 19 | ### 设计文档 20 | [设计文档](https://github.com/labring/sealfs/blob/main/docs/README-ZH.MD) 21 | 22 | ## 实现计划 23 | 目前我们致力于全链路性能提升,对于其他方面的建设如高可靠性以及高可用性的优先级会低一些 24 | - 第一版功能实现: 25 | - 客户端: 26 | - [ ] fuse文件系统接口 27 | - [ ] 系统调用劫持(用户态文件系统) 28 | - [ ] 选址算法 29 | - [ ] 批处理 30 | 31 | - 服务端: 32 | - [ ] 绕过本地文件系统 33 | - [ ] 磁盘管理 34 | - [ ] 目录管理 35 | - [ ] 元数据持久化内存存储 36 | - [ ] 文件索引 37 | - [ ] 文件锁 38 | - [ ] 持久化数据结构 39 | 40 | - 协调节点: 41 | - [ ] 心跳管理 42 | 43 | - 网络: 44 | - [ ] RDMA 45 | - [ ] socket网络通信 46 | 47 | - 测试 48 | - [ ] IO500 49 | - [ ] 功能测试 50 | 51 | ## 编译 52 | 53 | rust版本 1.68 54 | 55 | ```bash 56 | cargo build 57 | ``` 58 | 59 | ## 快速使用 60 | 61 | ### 启动管理节点 62 | 63 | ```bash 64 | # edit manager.yaml 65 | vi examples/manager.yaml 66 | 67 | # start manager with manager.yaml 68 | SEALFS_CONFIG_PATH=./examples ./target/debug/manager & 69 | ``` 70 | 71 | ### 启动一个服务器 72 | 73 | ```bash 74 | ./target/debug/server --manager-address <manager_ip>:<manager_port> --server-address <server_ip>:<server_port> --database-path <database_dir> --storage-path <storage_dir> --log-level warn & 75 | ``` 76 | 77 | ### 启动一个客户端 78 | 79 | ```bash 80 | ./target/debug/client --manager-address <manager_ip>:<manager_port> --log-level warn daemon 81 | ``` 82 | 83 | ### 创建并挂载磁盘 84 | 85 | ```bash 86 | ./target/debug/client --manager-address <manager_ip>:<manager_port> --log-level warn create test1 100000 87 | ./target/debug/client --log-level warn mount ~/fs test1 
88 | ``` 89 | 90 | ## 开源协议 91 | [Apache License 2.0](https://github.com/labring/sealfs/blob/main/LICENSE) -------------------------------------------------------------------------------- /README.MD: -------------------------------------------------------------------------------- 1 | # SEALFS 2 | [English](https://github.com/labring/sealfs/blob/main/README.MD) | [简体 3 | 中文](https://github.com/labring/sealfs/blob/main/README-ZH.MD) 4 | 5 | The storage system of [sealos](https://github.com/labring/sealos), 6 | aiming to be a high-performance, highly reliable and auto-scalable 7 | distributed file system which fits the cloud native environment. 8 | 9 | ## System Architecture 10 | 11 | The architecture of sealfs is decentralized, and there is no single 12 | metadata node. sealfs hopes to improve the read and write performance 13 | as much as possible and solve the problems of storing large amounts of 14 | small files. 15 | 16 | ### Main Components 17 | Sealfs consists of the following three components: 18 | 19 | #### Server 20 | 21 | Server component is responsible for storing files and metadata. sealfs 22 | separates data and metadata into different disks, since metadata is 23 | undoubtedly the hot data in a distributed file system. This way, users 24 | can choose better hardware to store metadata. 25 | 26 | #### Client 27 | 28 | Client component implements the file-system in user mode. It 29 | intercepts file requests and locates their storage through hash 30 | algorithms. 31 | 32 | #### Manager 33 | 34 | Manager component is responsible for coordinating the cluster. 35 | 36 | 37 | The System Architecture is shown as follows: 38 | ![](docs/images/architecture.jpg) 39 | 40 | ### User Mode All The Way 41 | 42 | With specific hardware, sealos hopes to support user-mode completely, 43 | from file request hijacking on the client side, to the network, and to 44 | the storage, for maximum performance improvement. 
45 | 46 | For more design details, see: 47 | ### Design Document 48 | [design document](https://github.com/labring/sealfs/blob/main/docs/README.MD) 49 | 50 | ## RoadMap 51 | Currently, we are committed to improving the performance 52 | thoroughly. For other design aspects, such as high reliability and 53 | high availability, the priority would be lower. 54 | 55 | - first version Function: 56 | - Client: 57 | - [ ] fuse file system interface 58 | - [ ] System call hijacking (file system in user mode) 59 | - [ ] location algorithm 60 | - [ ] batch process 61 | 62 | - Server: 63 | - [ ] bypass file system 64 | - [ ] file Storage 65 | - [ ] disk manager 66 | - [ ] catalogue manager 67 | - [ ] Metadata persistent memory storage 68 | - [ ] file index 69 | - [ ] file lock 70 | - [ ] Persistent data structure 71 | 72 | - Manager: 73 | - [ ] heartbeat manager 74 | 75 | - Network: 76 | - [ ] RDMA 77 | - [ ] socket network 78 | 79 | - Test 80 | - [ ] IO500 81 | - [ ] function test 82 | 83 | ## Compile 84 | 85 | rust version 1.68 86 | 87 | ```bash 88 | make build 89 | ``` 90 | 91 | ## Quick Start 92 | 93 | ### Start Manager 94 | 95 | ```bash 96 | # edit manager.yaml 97 | vi examples/manager.yaml 98 | 99 | # start manager with manager.yaml 100 | SEALFS_CONFIG_PATH=./examples ./target/debug/manager & 101 | ``` 102 | 103 | ### Start Servers on a Node 104 | 105 | ```bash 106 | ./target/debug/server --manager-address <manager_ip>:<manager_port> --server-address <server_ip>:<server_port> --database-path <database_dir> --storage-path <storage_dir> --log-level warn & 107 | ``` 108 | 109 | ### Start Client on a Node 110 | 111 | ```bash 112 | ./target/debug/client --log-level warn daemon 113 | ``` 114 | 115 | ### Create & Mount Disk 116 | 117 | ```bash 118 | ./target/debug/client --log-level warn create test1 100000 119 | ./target/debug/client --log-level warn mount ~/fs test1 120 | ``` 121 | 122 | ## LICENSE 123 | [Apache License 2.0](https://github.com/labring/sealfs/blob/main/LICENSE) 124 | 
-------------------------------------------------------------------------------- /benches/block_device/main.rs: -------------------------------------------------------------------------------- 1 | //! run the benchmark with: 2 | //! cargo bench --bench block_device --features=disk-db 3 | 4 | use std::sync::Arc; 5 | 6 | use criterion::{criterion_group, criterion_main, Criterion}; 7 | use sealfs::server::storage_engine::{block_engine::BlockEngine, meta_engine, StorageEngine}; 8 | use std::process::Command; 9 | 10 | fn write_file(engine: &BlockEngine, n: isize) { 11 | (0..n).for_each(|_| { 12 | let bytes = vec![1u8; 10240]; 13 | engine.write_file("test", bytes.as_slice(), 0).unwrap(); 14 | }) 15 | } 16 | 17 | fn read_file(engine: &BlockEngine, n: isize) { 18 | (0..n * 10).for_each(|_| { 19 | engine.read_file("test", 10240, 0).unwrap(); 20 | }) 21 | } 22 | 23 | fn criterion_benchmark(c: &mut Criterion) { 24 | Command::new("bash") 25 | .arg("-c") 26 | .arg("dd if=/dev/zero of=node1 bs=4M count=1") 27 | .output() 28 | .unwrap(); 29 | Command::new("bash") 30 | .arg("-c") 31 | .arg("losetup /dev/loop8 node1") 32 | .output() 33 | .unwrap(); 34 | let meta_engine = Arc::new(meta_engine::MetaEngine::new( 35 | "/tmp/bench/db", 36 | 128 << 20, 37 | 128 * 1024 * 1024, 38 | )); 39 | let engine = BlockEngine::new("/dev/loop8", meta_engine); 40 | 41 | c.bench_function("block device test", |b| { 42 | b.iter(|| { 43 | write_file(&engine, 512); 44 | read_file(&engine, 512); 45 | }) 46 | }); 47 | Command::new("bash") 48 | .arg("-c") 49 | .arg("losetup -d /dev/loop8") 50 | .output() 51 | .unwrap(); 52 | Command::new("bash") 53 | .arg("-c") 54 | .arg("rm node1") 55 | .output() 56 | .unwrap(); 57 | } 58 | 59 | criterion_group!(benches, criterion_benchmark); 60 | criterion_main!(benches); 61 | -------------------------------------------------------------------------------- /benches/grpc/client.rs: -------------------------------------------------------------------------------- 1 | 
#![allow(unused)] 2 | 3 | use hello_world::greeter_client::GreeterClient; 4 | use hello_world::HelloRequest; 5 | 6 | use tonic::{transport::Server, Request, Response, Status}; 7 | 8 | pub mod hello_world { 9 | tonic::include_proto!("helloworld"); 10 | } 11 | 12 | pub fn gcli(total: u32) { 13 | let mut rt = tokio::runtime::Runtime::new().unwrap(); 14 | 15 | let client = rt 16 | .block_on(GreeterClient::connect("http://[::1]:50051")) 17 | .unwrap(); 18 | let mut handles = Vec::with_capacity(50); 19 | 20 | let mut data = [0u8; 50]; 21 | let data = String::from_utf8(data.to_vec()).unwrap(); 22 | for i in 0..total { 23 | let out = data.clone(); 24 | let client_clone = client.clone(); 25 | handles.push(rt.spawn(async move { 26 | let request = tonic::Request::new(HelloRequest { 27 | id: i, 28 | r#type: 0, 29 | flags: 0, 30 | filename: "".to_string(), 31 | meta_data: "".to_string(), 32 | data: out, 33 | }); 34 | let response = client_clone.to_owned().say_hello(request).await.unwrap(); 35 | // debug!("call_remote, result: {:?}", result); 36 | let reply = response.into_inner(); 37 | if reply.id != i || reply.status != 1 { 38 | println!("Error reply") 39 | } 40 | })); 41 | } 42 | for handle in handles { 43 | rt.block_on(handle).unwrap(); 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /benches/grpc/main.rs: -------------------------------------------------------------------------------- 1 | //! run the benchmark with: 2 | //! 
cargo bench --bench grpc 3 | 4 | mod client; 5 | mod server; 6 | 7 | use client::gcli; 8 | use criterion::{criterion_group, criterion_main, Criterion}; 9 | use server::server; 10 | 11 | extern crate core_affinity; 12 | 13 | fn grpc_benchmark(c: &mut Criterion) { 14 | std::thread::spawn(|| { 15 | let core_ids = core_affinity::get_core_ids().unwrap(); 16 | let core_id0 = core_ids[0]; 17 | core_affinity::set_for_current(core_id0); 18 | match server() { 19 | Ok(()) => println!("server start succeed!"), 20 | Err(_) => println!("server start failed!"), 21 | }; 22 | }); 23 | // wait for server to start 24 | std::thread::sleep(std::time::Duration::from_secs(5)); 25 | // c.bench_function("grpc_bench0", |b| b.iter(|| gcli(0))); 26 | // c.bench_function("grpc_bench10", |b| b.iter(|| gcli(10))); 27 | // c.bench_function("grpc_bench100", |b| b.iter(|| gcli(100))); 28 | // c.bench_function("grpc_bench1000", |b| b.iter(|| gcli(1000))); 29 | // c.bench_function("grpc_bench10000", |b| b.iter(|| gcli(10000))); 30 | c.bench_function("grpc_bench100000", |b| b.iter(|| gcli(100000))); 31 | } 32 | 33 | criterion_group!( 34 | name=benches; 35 | config=Criterion::default().significance_level(0.1).sample_size(10); 36 | targets = grpc_benchmark 37 | ); 38 | criterion_main!(benches); 39 | -------------------------------------------------------------------------------- /benches/grpc/mod.rs: -------------------------------------------------------------------------------- 1 | mod client; 2 | mod server; 3 | -------------------------------------------------------------------------------- /benches/grpc/server.rs: -------------------------------------------------------------------------------- 1 | #![allow(unused)] 2 | use hello_world::greeter_server::{Greeter, GreeterServer}; 3 | use hello_world::{HelloReply, HelloRequest}; 4 | 5 | use tonic::{transport::Server, Request, Response, Status}; 6 | 7 | pub mod hello_world { 8 | tonic::include_proto!("helloworld"); 9 | } 10 | 11 | #[derive(Debug, 
Default)] 12 | pub struct MyGreeter {} 13 | 14 | #[tonic::async_trait] 15 | impl Greeter for MyGreeter { 16 | async fn say_hello( 17 | &self, 18 | request: Request, 19 | ) -> Result, Status> { 20 | let message = request.into_inner(); 21 | // println!("Got a request: {} {}", message.id, &message.data[0..5]); 22 | 23 | let reply = hello_world::HelloReply { 24 | id: message.id, 25 | status: 1, 26 | flags: 0, 27 | meta_data: "".to_string(), 28 | data: "".to_string(), 29 | }; 30 | 31 | Ok(Response::new(reply)) 32 | } 33 | } 34 | 35 | #[tokio::main] 36 | pub async fn server() -> Result<(), Box> { 37 | let addr = "[::1]:50051".parse()?; 38 | let greeter = MyGreeter::default(); 39 | 40 | Server::builder() 41 | .add_service(GreeterServer::new(greeter)) 42 | .serve(addr) 43 | .await?; 44 | 45 | Ok(()) 46 | } 47 | -------------------------------------------------------------------------------- /benches/local_storage/main.rs: -------------------------------------------------------------------------------- 1 | //! run the benchmark with: 2 | //! 
cargo bench --bench local_storage 3 | 4 | use std::sync::Arc; 5 | 6 | use criterion::{criterion_group, criterion_main, Criterion}; 7 | use rand::prelude::*; 8 | use sealfs::server::storage_engine::{ 9 | file_engine::{self, FileEngine}, 10 | meta_engine, StorageEngine, 11 | }; 12 | 13 | fn create_file(engine: &FileEngine, n: isize) { 14 | let mode = 0o777; 15 | let oflag = libc::O_CREAT | libc::O_RDWR; 16 | (0..n).for_each(|i| { 17 | engine 18 | .create_file(i.to_string().as_str(), oflag, 0, mode) 19 | .unwrap(); 20 | }) 21 | } 22 | 23 | fn delete_file(engine: &FileEngine, n: isize) { 24 | (0..n).for_each(|i| { 25 | engine.delete_file(i.to_string().as_str()).unwrap(); 26 | }) 27 | } 28 | 29 | fn write_file(engine: &FileEngine, n: isize) { 30 | (0..n).for_each(|_| { 31 | let mut rng = rand::thread_rng(); 32 | let i: usize = rng.gen::() % n as usize; 33 | let bytes = vec![1u8; 10240]; 34 | engine 35 | .write_file(i.to_string().as_str(), bytes.as_slice(), 0) 36 | .unwrap(); 37 | }) 38 | } 39 | 40 | fn read_file(engine: &FileEngine, n: isize) { 41 | (0..n * 10).for_each(|_| { 42 | let mut rng = rand::thread_rng(); 43 | let i: usize = rng.gen::() % n as usize; 44 | let _ = engine.read_file(i.to_string().as_str(), 10240, 0).unwrap(); 45 | }) 46 | } 47 | 48 | fn criterion_benchmark(c: &mut Criterion) { 49 | let meta_engine = Arc::new(meta_engine::MetaEngine::new( 50 | "/tmp/bench/db", 51 | 128 << 20, 52 | 128 * 1024 * 1024, 53 | )); 54 | let engine = file_engine::FileEngine::new("/tmp/bench/root", meta_engine); 55 | 56 | c.bench_function("default engine file 512", |b| { 57 | b.iter(|| { 58 | create_file(&engine, 5120); 59 | write_file(&engine, 5120); 60 | read_file(&engine, 5120); 61 | delete_file(&engine, 5120); 62 | }) 63 | }); 64 | } 65 | 66 | criterion_group!(benches, criterion_benchmark); 67 | criterion_main!(benches); 68 | -------------------------------------------------------------------------------- /benches/mod.rs: 
-------------------------------------------------------------------------------- 1 | mod grpc; 2 | mod rpc; 3 | -------------------------------------------------------------------------------- /benches/rpc/client.rs: -------------------------------------------------------------------------------- 1 | #![allow(unused)] 2 | 3 | use sealfs::rpc::client::{RpcClient, TcpStreamCreator}; 4 | use std::{sync::Arc, time::Duration}; 5 | 6 | pub fn cli(total: u32) { 7 | let runtime = tokio::runtime::Builder::new_multi_thread() 8 | .enable_all() 9 | .build() 10 | .unwrap(); 11 | runtime.block_on(run_cli_without_data(total)); 12 | } 13 | 14 | pub fn cli_size(total: u32, size: usize) { 15 | let runtime = tokio::runtime::Builder::new_multi_thread() 16 | .enable_all() 17 | .build() 18 | .unwrap(); 19 | runtime.block_on(run_cli_with_data_size(total, size)); 20 | } 21 | 22 | pub async fn run_cli_without_data(total: u32) { 23 | let rt = tokio::runtime::Handle::current(); 24 | let mut handles = Vec::with_capacity(total as usize); 25 | 26 | let server_address = "127.0.0.1:50052"; 27 | let client: Arc< 28 | RpcClient< 29 | tokio::net::tcp::OwnedReadHalf, 30 | tokio::net::tcp::OwnedWriteHalf, 31 | TcpStreamCreator, 32 | >, 33 | > = Arc::new(RpcClient::new()); 34 | client.add_connection(server_address).await; 35 | 36 | for i in 0..total { 37 | let new_client = client.clone(); 38 | handles.push(rt.spawn(async move { 39 | let mut status = 0; 40 | let mut rsp_flags = 0; 41 | let mut recv_meta_data_length = 0; 42 | let mut recv_data_length = 0; 43 | let mut recv_meta_data = vec![]; 44 | let mut recv_data = vec![]; 45 | // debug!("call_remote, start"); 46 | let result = new_client 47 | .call_remote( 48 | server_address, 49 | 0, 50 | i, 51 | "", 52 | &[], 53 | &[], 54 | &mut status, 55 | &mut rsp_flags, 56 | &mut recv_meta_data_length, 57 | &mut recv_data_length, 58 | &mut recv_meta_data, 59 | &mut recv_data, 60 | Duration::from_secs(10), 61 | ) 62 | .await; 63 | // debug!("call_remote, result: 
{:?}", result); 64 | match result { 65 | Ok(_) => { 66 | if status == 0 { 67 | // let data = String::from_utf8(recv_data).unwrap(); 68 | // println!("result: {}, data: {}", i, data); 69 | } else { 70 | println!("Error: {}", status); 71 | } 72 | } 73 | Err(e) => { 74 | println!("Error: {}", e); 75 | } 76 | } 77 | })); 78 | } 79 | for handle in handles { 80 | handle.await; 81 | } 82 | client.close(); 83 | } 84 | async fn run_cli_with_data_size(total: u32, size: usize) { 85 | let rt = tokio::runtime::Handle::current(); 86 | let mut handles = Vec::with_capacity(total as usize); 87 | 88 | let server_address = "127.0.0.1:50052"; 89 | let client: Arc< 90 | RpcClient< 91 | tokio::net::tcp::OwnedReadHalf, 92 | tokio::net::tcp::OwnedWriteHalf, 93 | TcpStreamCreator, 94 | >, 95 | > = Arc::new(RpcClient::new()); 96 | client.add_connection(server_address).await; 97 | let data = vec![0u8; size]; 98 | for i in 0..total { 99 | let new_client = client.clone(); 100 | let data = data.clone(); 101 | handles.push(rt.spawn(async move { 102 | let mut status = 0; 103 | let mut rsp_flags = 0; 104 | let mut recv_meta_data_length = 0; 105 | let mut recv_data_length = 0; 106 | let mut recv_meta_data = vec![]; 107 | let mut recv_data = vec![]; 108 | // debug!("call_remote, start"); 109 | let result = new_client 110 | .call_remote( 111 | server_address, 112 | 0, 113 | i, 114 | "", 115 | &[], 116 | &data, 117 | &mut status, 118 | &mut rsp_flags, 119 | &mut recv_meta_data_length, 120 | &mut recv_data_length, 121 | &mut recv_meta_data, 122 | &mut recv_data, 123 | Duration::from_secs(10), 124 | ) 125 | .await; 126 | // debug!("call_remote, result: {:?}", result); 127 | match result { 128 | Ok(_) => { 129 | if status == 0 { 130 | // let data = String::from_utf8(recv_data).unwrap(); 131 | // println!("result: {}, data: {}", i, data); 132 | } else { 133 | println!("Error: {}", status); 134 | } 135 | } 136 | Err(e) => { 137 | println!("Error: {}", e); 138 | } 139 | } 140 | })); 141 | } 142 | for handle 
in handles { 143 | handle.await; 144 | } 145 | client.close(); 146 | } 147 | -------------------------------------------------------------------------------- /benches/rpc/main.rs: -------------------------------------------------------------------------------- 1 | //! run the benchmark with: 2 | //! cargo bench --bench rpc 3 | 4 | #![allow(unused)] 5 | mod client; 6 | mod server; 7 | use client::{cli, cli_size}; 8 | use criterion::{criterion_group, criterion_main, Criterion}; 9 | 10 | use server::server; 11 | fn rpc_benchmark(c: &mut Criterion) { 12 | std::thread::spawn(|| { 13 | let core_ids = core_affinity::get_core_ids().unwrap(); 14 | let core_id0 = core_ids[0]; 15 | core_affinity::set_for_current(core_id0); 16 | match server() { 17 | Ok(()) => println!("server start succeed!"), 18 | Err(_) => println!("server start failed!"), 19 | }; 20 | }); 21 | // wait for server to start 22 | std::thread::sleep(std::time::Duration::from_secs(5)); 23 | // c.bench_function("rpc_bench0", |b| b.iter(|| cli(0))); 24 | // c.bench_function("rpc_bench10", |b| b.iter(|| cli(10))); 25 | // c.bench_function("rpc_bench100", |b| b.iter(|| cli(100))); 26 | // c.bench_function("rpc_bench1000", |b| b.iter(|| cli(1000))); 27 | c.bench_function("rpc_bench100000", |b| b.iter(|| cli(100000))); 28 | // c.bench_function("rpc_bench100000_without_data", |b| b.iter(|| cli(100000))); 29 | // c.bench_function("rpc_bench100000_data_size_1024", |b| { 30 | // b.iter(|| cli_size(100000, 1024)) 31 | // }); 32 | // c.bench_function("rpc_bench100000_data_size_1024_4", |b| { 33 | // b.iter(|| cli_size(100000, 1024 * 4)) 34 | // }); 35 | // c.bench_function("rpc_bench100000_data_size_1024_16", |b| { 36 | // b.iter(|| cli_size(100000, 1024 * 16)) 37 | // }); 38 | // c.bench_function("rpc_bench100000_data_size_1024_64", |b| { 39 | // b.iter(|| cli_size(100000, 1024 * 64)) 40 | // }); 41 | // c.bench_function("rpc_bench100000_data_size_1024_256", |b| { 42 | // b.iter(|| cli_size(100000,1024*256)) 43 | // }); 44 
| // c.bench_function("rpc_bench100000_data_size_1024_1024", |b| { 45 | // b.iter(|| cli_size(100000,1024*1024)) 46 | // }); 47 | } 48 | 49 | criterion_group!( 50 | name=benches; 51 | config=Criterion::default().significance_level(0.1).sample_size(10); 52 | targets = rpc_benchmark 53 | ); 54 | criterion_main!(benches); 55 | -------------------------------------------------------------------------------- /benches/rpc/mod.rs: -------------------------------------------------------------------------------- 1 | mod client; 2 | mod server; 3 | -------------------------------------------------------------------------------- /benches/rpc/server.rs: -------------------------------------------------------------------------------- 1 | #![allow(unused)] 2 | 3 | use async_trait::async_trait; 4 | use sealfs::rpc::server::{Handler, RpcServer}; 5 | use std::{sync::Arc, vec}; 6 | use tokio::sync::Mutex; 7 | pub struct HelloHandler {} 8 | 9 | impl HelloHandler { 10 | pub fn new() -> Self { 11 | Self {} 12 | } 13 | } 14 | 15 | // lazy_static::lazy_static! 
{ 16 | // static ref HELLO_COUNT: Arc> = Arc::new(Mutex::new(0)); 17 | // } 18 | 19 | #[async_trait] 20 | impl Handler for HelloHandler { 21 | async fn dispatch( 22 | &self, 23 | _conn_id: u32, 24 | operation_type: u32, 25 | _flags: u32, 26 | _path: Vec, 27 | _data: Vec, 28 | _metadata: Vec, 29 | ) -> anyhow::Result<(i32, u32, usize, usize, Vec, Vec)> { 30 | // debug!("dispatch, operation_type: {}", operation_type); 31 | // debug!("dispatch, path: {:?}", path); 32 | // debug!("dispatch, data: {:?}", data); 33 | match operation_type { 34 | 0 => { 35 | // let success = String::from("Success").into_bytes(); 36 | Ok((0, 0, 0, 0, vec![], vec![])) 37 | } 38 | _ => { 39 | todo!() 40 | } 41 | } 42 | } 43 | } 44 | 45 | #[tokio::main] 46 | pub async fn server() -> anyhow::Result<()> { 47 | // let mut builder = env_logger::Builder::from_default_env(); 48 | // builder 49 | // .format_timestamp(None) 50 | // .filter(None, log::LevelFilter::Debug); 51 | // builder.init(); 52 | 53 | let server = RpcServer::new(Arc::new(HelloHandler::new()), "127.0.0.1:50052"); 54 | server.run().await?; 55 | Ok(()) 56 | } 57 | -------------------------------------------------------------------------------- /build.rs: -------------------------------------------------------------------------------- 1 | fn main() -> Result<(), Box> { 2 | tonic_build::compile_protos("proto/test.proto")?; 3 | Ok(()) 4 | } 5 | -------------------------------------------------------------------------------- /docker/client/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM debian:bullseye-20221205 2 | 3 | RUN apt update && apt upgrade -y && apt-mark unhold libcap2 && \ 4 | apt install -y fuse3 libfuse3-3 libfuse2 libibverbs1 && \ 5 | apt clean && \ 6 | rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* 7 | 8 | COPY target/debug/client /usr/local/bin/client 9 | 10 | ENTRYPOINT ["/usr/local/bin/client"] 11 | 
-------------------------------------------------------------------------------- /docker/manager/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM debian:bullseye-20221205 2 | 3 | RUN apt update && apt upgrade -y && apt-mark unhold libcap2 && \ 4 | apt install -y libfuse3-3 libfuse2 libibverbs1 && \ 5 | apt clean && \ 6 | rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* 7 | 8 | COPY target/debug/manager /usr/local/bin/manager 9 | 10 | ENTRYPOINT ["/usr/local/bin/manager"] 11 | -------------------------------------------------------------------------------- /docker/server/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM debian:bullseye-20221205 2 | 3 | RUN apt update && apt upgrade -y && apt-mark unhold libcap2 && \ 4 | apt install -y libfuse3-3 libfuse2 libibverbs1 && \ 5 | apt clean && \ 6 | rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* 7 | 8 | COPY target/debug/server /usr/local/bin/server 9 | 10 | ENTRYPOINT ["/usr/local/bin/server"] 11 | -------------------------------------------------------------------------------- /docs/RDMA.md: -------------------------------------------------------------------------------- 1 | ## Communication Principle of RDMA in sealfs 2 | 3 | GitHub Repository: https://github.com/mond77/ibv.git 4 | 5 | ### Connection Establishment 6 | 7 | The endpoint device addresses and RecvBuffer memory addresses are exchanged via TCP. 8 | 9 | RemoteBufManager is the allocator of the remote RecvBuffer. 10 | 11 | Both SendBuffer and RecvBuffer are memory regions that have been registered with `ibv_reg_mr`. 12 | 13 | ### send_msg() Method 14 | 15 | `fn send_msg(&self, msg: &[IoSlice<'_>]) -> io::Result<()> ` 16 | 17 | The main process of the send_msg() method includes: 18 | 19 | 1. Allocation of SendBuffer 20 | 2. Locking 21 | 3. Allocation of RemoteBuf 22 | 4. Issuing of WR (work request) 23 | 5. 
Unlocking 24 | 25 | Each request/response corresponds to a write_with_imm operation, which generates a WC (work completion) on both the local and remote ends. write_with_imm consumes one RQE (Receive Queue Element) on the remote end. The WC type of the remote end is write_with_imm, and the WC type of the local end is write. 26 | 27 | ### recv_msg() Method 28 | 29 | `fn recv_msg(&self) -> io::Result<&[u8]> ` 30 | 31 | The main process of the recv_msg() method includes: 32 | 33 | 1. A task in the polling background that polls a CQ (Completion Queue) notifies the task blocked in recv_msg() when a type write_with_imm WC is received. 34 | 2. The task then reads the data of this write. 35 | 3. The returned & [u8] points to the data located on the RecvBuffer. 36 | 37 | ### Memory Management 38 | 39 | #### SendBuffer 40 | 41 | SendBuffer is linearly allocated, and when released, it is marked with AtomicBool, greatly reducing the complexity of memory management. A release_task is used to maintain the linear release order of each allocated memory block and determine whether it is released or not using AtomicBool. 42 | 43 | #### RecvBuffer 44 | 45 | RecvBuffer is linearly allocated and released. 46 | 47 | -------------------------------------------------------------------------------- /docs/README-ZH.MD: -------------------------------------------------------------------------------- 1 | # sealfs设计文档 2 | 3 | ## 系统架构 4 | sealfs的架构为无中心架构,且无独立的元数据节点 5 | sealfs包含三个组件, 6 | - server负责文件以及元数据存储,元数据无疑是分布式文件系统的热点文件,所以我们采用分挂盘的方式对数据和元数据进行存储,用户可以选择更好的硬件对元数据进行存储。 7 | - client实现用户态的文件系统,对文件请求进行拦截并通过哈希算法进行存储寻址。 8 | - manager负责协调集群。 9 | 设计图如下: 10 | ![](images/architecture.jpg) 11 | 12 | ### 全链路用户态 13 | 我们希望结合特定硬件从客户端文件请求劫持到网络到存储打造一个全链路用户态的分布式文件存储系统,从而获得极致的性能体验。 14 | 15 | ## 客户端 16 | 在客户端处,我们支持了两种类型的文件系统,一种是比较常见的fuse,另一种是用户态的文件系统,我们希望以这种方式提升性能。 17 | 18 | ### fuse 19 | #### 内核文件系统 20 | 为了减少调用次数,一种实现方案则是直接使用内核态实现文件系统 21 | 22 | 1. 用户态请求 23 | 2. VFS 24 | 3. 内核态文件系统 25 | 4. 
网络传输 26 | 27 | 这种方案可以将内核态和用户态切换次数减少,但缺点也显而易见: 28 | 1. 内核编程的调试复杂 29 | 2. 需要为客户端安装额外内核模块 30 | 3. 文件系统崩溃影响其他进程 31 | 32 | #### fuse 33 | 为了避免上述问题,第一种方式我们采用容易实现且易于使用的fuse。 34 | 35 | ![alt fuse](https://imgconvert.csdnimg.cn/aHR0cHM6Ly9tbWJpei5xcGljLmNuL21tYml6X3BuZy9kNGhvWUpseE9qTnNvaWNRQkUwM01aRDBrWjNmY3VpYWVRZzJmV1RlNFlWV3RUYko5aWN1cG1iZ1IwZGd1RUlrTTloTzZzaWJQdU80VTlFNzlpYWczWWljdlE4US82NDA?x-oss-process=image/format,png) 36 | 37 | 在fuse中,通过网络进行文件存储的一次调用流程包括: 38 | 1. 用户态请求 39 | 2. VFS 40 | 3. fuse driver 41 | 4. fuse library 42 | 5. 网络传输 43 | 44 | 需要注意的是,在避免上述问题的同时,fuse也降低了文件系统的性能。 45 | 46 | 47 | ### 系统调用劫持 48 | 这是我们实现的第二种方案,即实现一个用户态的文件系统。在上图中,可以看到,用户请求并不是直接交给linux内核的,而是经过了glibc(或其他libc库)来提交系统调用,这意味着可以在libc层替换系统调用的地址,实现系统调用劫持。 49 | 1. 用户态请求 50 | 2. 系统调用劫持 51 | 3. 网络传输 52 | 53 | ## 网络 54 | 对于网络部分,同样提供了两种网络传输方式,一种是RPC的方式,一种是通过RDMA的方式。 55 | 56 | ### RPC 57 | ![image](https://user-images.githubusercontent.com/14962503/189853670-d10c29e8-34d7-468e-baa6-36c8fa65a3c9.png) 58 | 59 | #### 寻址算法 60 | 一个文件请求会被客户端使用一致性哈希算法映射到一个服务器,通过长链接进行传输。 61 | 62 | #### 请求流程 63 | 64 | 1. client接收请求,创建处理线程。创建处理线程的工作是由libfuse实现的,sealfs实现的函数可以认为已经是独立线程。 65 | 2. 计算文件所在的服务器。 66 | 3. 向server发送文件请求,hold线程。发送请求的过程要考虑多个请求并行处理的情况。为每个请求建立一个socket是最简单的实现,但创建连接的延迟过高,网络连接数也可能会过多。保持多个长连接保证了创建连接的延迟问题,但在大并发的情况下,依旧无法解决网络连接数量过多,同时代码的实现也稍显复杂。所以采用了一个长连接共享多个文件请求的方式。一个线程发送请求需要包含该请求的id与数据长度,同时需要实现一个额外的线程安全的队列用于保存发送请求后的线程的锁。 67 | 4. server处理文件请求,并将请求结果返回给client。处理过程中始终保持了请求的id。 68 | 5. 
client接收数据,激活请求线程并处理返回值。一个(或有限多个)独立的线程用于接收请求结果,其中包含了请求的id,需要在队列中查询请求id所对应的线程锁,写入结果并释放该线程锁,激活原请求线程并将结果返回给应用。 69 | 70 | #### 内存拷贝 71 | 72 | 采用了多个请求共享同一线程的方式,socket发送请求时,由于数据不定长,需要提前发送一个长度变量,才能避免粘包。这里有两种不同的方案: 73 | 一种是使用多个socket实现连接池,每次发送一个请求使用一个socket,该方案不存在数据包连续性的问题,可以多次发送。 74 | 另一种是用同一个socket,但是要保证数据连续性,需要进行字符串拼接,涉及内存拷贝,开销会变大,那为了避免这个问题,每次发送数据需要给线程加锁,这个是第一个阶段的实现方案。 75 | 76 | ### RDMA 77 | 78 | ## 管理节点 79 | 管理节点用于管理server集群。 80 | 81 | ### 心跳管理 82 | server节点上下线的时候,会将心跳信息上报给管理节点,客户端会订阅心跳信息用于选址计算,同时,服务端也会订阅心跳信息用于数据迁移等方面。 83 | 84 | ## 服务端 85 | server主要存储两种类型的数据,一种是文件的元数据信息,以及文件本身的内容。 86 | 87 | 对于分布式文件存储而言,元数据无疑是热点数据,因此,我们采用分开挂盘的方式,将元数据和文件数据挂载在不同的盘中,一种经济的办法是将元数据数据挂载在SSD盘中,而对普通的文件数据存储在hdd中。当然,可以随意搭配。 88 | 89 | ### 元数据管理 90 | 91 | >在大数据环境下,元数据的体量也非常大,元数据的存取性能是整个分布式文件系统性能的关键。常见的元数据管理可以分为集中式和分布式元数据管理架构。集中式元数据管理架构采用单一的元数据服务器,实现简单.但是存在单点故障等问题。分布式元数据管理架构则将元数据分散在多个结点上.进而解决了元数据服务器的性能瓶颈等问题.并提高了元数据管理架构的可扩展性,但实现较为复杂,并引入了元数据一致性的问题。另外,还有一种无元数据服务器的分布式架构,通过在线算法组织数据,不需要专用的元数据服务器。但是该架构对数据一致性的保障很困难.实现较为复杂。文件目录遍历操作效率低下,并且缺乏文件系统全局监控管理功能。 92 | 93 | sealfs目前选择的是元数据节点的架构,即避免了元数据节点单点故障的问题,但是对于元数据的遍历成为了一个难题。 94 | 95 | #### 元数据持久化内存存储 96 | 97 | 为了提高元数据的性能,我们打算结合支持持久化内存的硬件对元数据进行存储设计。 98 | 99 | ### 数据存储 100 | 101 | #### 绕过本地文件系统 102 | 为了提升性能,sealfs直接跨过文件系统进行文件存储。当然这会带来更多的复杂性。 103 | 104 | #### 适配不同的硬件 105 | 对于不同的固态硬盘有不同的特性,我们会对不同的硬件都进行适配,设计不同的数据结构,希望对用户使用的每一种硬件都达到比较好的效果。 106 | 107 | ## 一些其他的扩展 108 | 这些拓展点暂时不在第一版计划实现中 109 | - 数据可靠性与高可用 110 | - 多副本 111 | 多副本暂时计划采用raft协议,由一致性hash算法计算副本位置分布到多个节点上实现replica 112 | - 纠删码 113 | 114 | - 数据扩缩容 115 | 基于一致性hash实现扩缩容,具体细节暂时先不讲。需要明确的是添加或删除节点后集群会进行rebalance,这是一致性hash本身需要做的,无需额外设计。rebalance期间会导致集群性能下降,且可能耗费较长时间,但对于可以持续提供服务。在rebalance期间需要做的工作如下: 116 | 117 | | 开始扩容 | 迁移数据 | 扩容完成 | 118 | | ---- | ---- | ---- | 119 | | 更新集群元数据 | client进行二次请求,确认迁移后数据和迁移前数据一致性,并将数据写于新节点;同时迁移任务进行数据迁移和同步 | 确认集群元数据 | 120 | 121 | - 租户管理 122 | 对于不同的client申请挂载的磁盘,进行容量限制隔离 123 | 124 | 125 | 126 | 127 | -------------------------------------------------------------------------------- 
/docs/README.MD: -------------------------------------------------------------------------------- 1 | # Sealfs Design Document 2 | 3 | ## System Architecture 4 | The architecture of sealfs is decentralized, and there is no dedicated metadata node. 5 | 6 | Sealfs consists of three components: 7 | - server: Responsible for storing files and metadata. Metadata is undoubtedly the hot data of a distributed file system, so we store data and metadata on separate disks. Users can choose better hardware to store metadata. 8 | - client: Implements the file system in user mode, intercepts file requests, and locates their storage node through a hash algorithm. 9 | - manager: Responsible for coordinating the cluster. 10 | 11 | The system architecture is shown below: 12 | ![](images/architecture.jpg) 13 | 14 | ### User Mode in Overall Chain 15 | We hope to build a distributed file storage system that runs in user mode along the whole chain, from client file request hijacking to network to storage, combined with specific hardware, so as to obtain the ultimate performance experience. 16 | 17 | ## Client 18 | On the client side, we support two types of file systems: the more common fuse file system, and a user-mode file system with which we hope to improve performance. 19 | 20 | ### Fuse 21 | 22 | #### Kernel File System 23 | In order to reduce the number of calls, one implementation scheme is to implement the file system, including its network requests, directly in kernel mode. 24 | 1. User-mode request 25 | 2. VFS 26 | 3. Kernel file system 27 | 4. network 28 | 29 | This scheme can reduce the number of switches between user mode and kernel mode to 2, but its disadvantages are also obvious: 30 | 1. The debugging of kernel programming is complex 31 | 2. You need to install additional kernel modules for the client 32 | 3. File system crash affects other processes 33 | 34 | #### fuse 35 | There are different ways to implement network file storage, and fuse is an easy way to implement and use. 
36 | 37 | ![alt fuse](https://imgconvert.csdnimg.cn/aHR0cHM6Ly9tbWJpei5xcGljLmNuL21tYml6X3BuZy9kNGhvWUpseE9qTnNvaWNRQkUwM01aRDBrWjNmY3VpYWVRZzJmV1RlNFlWV3RUYko5aWN1cG1iZ1IwZGd1RUlrTTloTzZzaWJQdU80VTlFNzlpYWczWWljdlE4US82NDA?x-oss-process=image/format,png) 38 | 39 | In fuse, a call process for file storage through the network includes: 40 | 1. User status request 41 | 2. VFS (switching) 42 | 3. fuse driver 43 | 4. fuse library 44 | 5. network 45 | 46 | It should be noted that fuse also reduces the performance of the file system while avoiding the above problems. 47 | 48 | ### System Call Hijacking 49 | This is the second scheme we implemented, namely, to implement a user mode file system. In the figure in the previous section, we can see that user requests are not directly handed over to the Linux kernel, but are submitted through glibc (or other libc libraries). This means that the address of system calls can be replaced at the libc layer to achieve system call hijacking. 50 | 1. User status request 51 | 2. System call hijacking client 52 | 3. network 53 | 54 | ## Network 55 | For the network part, two network transmission modes are also provided, RPC and RDMA. 56 | 57 | ### RPC 58 | ![image](https://user-images.githubusercontent.com/14962503/189853670-d10c29e8-34d7-468e-baa6-36c8fa65a3c9.png) 59 | 60 | #### Location algorithm 61 | A file request will be mapped to a server by the client using a hash algorithm and transmitted through the socket link. 62 | 63 | #### Request Process 64 | 65 | 1. The client receives the request and creates a processing thread. The work of creating processing threads is implemented by libfuse, and the functions implemented by sealfs can be considered as independent threads. 66 | 2. The server where the calculation file is located, and the content is in metadata management, which is not detailed in this section. 67 | 3. Send a file request to the server and hold the thread. 
The process of sending requests should consider the parallel processing of multiple requests. Setting up a socket for each request is the simplest implementation, but the connection creation delay is too high, and the number of network connections may be too large. Maintaining multiple long connections ensures the delay of connection creation. However, in the case of large concurrency, it is still unable to solve the problem of excessive network connections. At the same time, the code implementation is slightly complicated. Therefore, a long connection is used to share multiple file requests. When a thread sends a request, it needs to include the request ID and data length. At the same time, it needs to implement an additional thread safe queue to store the lock of the thread after sending the request. 68 | 4. The server processes the file request and returns the request result to the client. The requested id is always maintained during processing. 69 | 5. The client receives data, activates the request thread, and processes the return value. One (or a limited number of) independent threads are used to receive the request result, which contains the request ID, so it is necessary to query the thread lock corresponding to the request ID in the queue, write the result and release the thread lock, activate the original request thread and return the result to the application. 70 | 71 | #### Memory Copy 72 | 73 | The method that multiple requests share the same thread is adopted. When the socket sends a request, it needs to send a length variable in advance to avoid packet sticking due to the variable length of the data. There are two different solutions: 74 | One is to use multiple sockets to realize the connection pool. One socket is used to send one request each time. This scheme does not have the problem of packet continuity and can be sent multiple times. 75 | The other is to use the same socket, but to ensure data continuity, string splicing is required. 
This involves memory copying, which increases the overhead. To avoid this problem, the sending thread needs to take a lock each time data is sent. This is the implementation scheme of the first phase. 76 | 77 | ### RDMA 78 | 79 | ## Manager 80 | The manager is used to manage the server cluster. 81 | 82 | ### Heartbeat Management 83 | When a server node goes online or offline, it reports heartbeat information to the manager node. Clients subscribe to the heartbeat information for location calculation. At the same time, servers subscribe to the heartbeat information for data migration and other purposes. 84 | 85 | ## Server 86 | The server mainly stores two types of data: the metadata of files, and the content of the files themselves. 87 | 88 | As far as distributed file storage is concerned, metadata is undoubtedly hot data. Therefore, we mount metadata and file data on separate disks. An economical approach is to store metadata on SSDs, while ordinary file data is stored on HDDs. Of course, any combination can be used. 89 | 90 | ### Metadata Management 91 | 92 | >In the big data environment, the volume of metadata is also very large, and the access performance of metadata is the key to the performance of the entire distributed file system. Common metadata management can be divided into centralized and distributed metadata management architectures. The centralized metadata management architecture uses a single metadata server, which is simple to implement, but has a single point of failure and other problems. The distributed metadata management architecture disperses metadata on multiple nodes, thus solving the performance bottleneck of the metadata server and improving the scalability of the metadata management architecture, but the implementation is more complex and introduces the problem of metadata consistency. 
In addition, there is a distributed architecture without metadata server, which organizes data through online algorithms, and does not require a dedicated metadata server. However, it is difficult to guarantee the data consistency of this architecture. The implementation is more complex. The file directory traversal operation is inefficient and lacks the global monitoring and management function of the file system. 93 | 94 | At present, sealfs chooses the metadata node architecture, which avoids the single point of failure of the metadata node, but metadata traversal becomes a difficult problem. 95 | 96 | #### Metadata persistent memory storage 97 | 98 | In order to improve the performance of metadata, we plan to combine the hardware supporting persistent memory to design metadata storage. 99 | 100 | ### Data storage 101 | 102 | #### Bypass local file system 103 | To improve performance, sealfs directly stores files across file systems. Of course, this will bring more complexity. 104 | 105 | #### Adapt to different hardware 106 | Different SSDs have different characteristics. We will adapt different hardware, design different data structures, and hope to achieve better results for each type of hardware used by users. 107 | 108 | ## Some Other Extensions 109 | These expansion points are not implemented in the first version of the plan 110 | - Data reliability and high availability 111 | - multi-replica 112 | 113 | The multi replica is temporarily planned to use the raft protocol, and the consistent hash algorithm is used to calculate the replica locations and distribute them to multiple nodes to achieve replica. 114 | - erasure coding 115 | - Data expansion 116 | 117 | Capacity expansion and reduction are implemented based on consistency hash. The details will not be discussed for the time being. It needs to be clear that after adding or deleting nodes, the cluster will be rebalanced. This is what consistency hash itself needs to do without additional design. 
Rebalance will cause the cluster performance to decline and may take a long time, but it can provide services continuously. The work to be done during rebalance is as follows: 118 | 119 | | Start capacity expansion | Migrate data | Complete capacity expansion | 120 | | ---- | ---- | ---- | 121 | |Updating cluster metadata | The client makes a second request to confirm the consistency of the data after migration and the data before migration, and writes the data to the new node; Simultaneous migration task for data migration and synchronization | Confirm cluster metadata| 122 | 123 | - Tenant Management 124 | 125 | For the disks that different clients apply to mount, perform capacity limitation isolation 126 | 127 | -------------------------------------------------------------------------------- /docs/images/architecture.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/labring/sealfs/41e1ad08ab587cb78e8b7610a74a2eb373172f13/docs/images/architecture.jpg -------------------------------------------------------------------------------- /docs/specification.md: -------------------------------------------------------------------------------- 1 | # specifications 2 | 3 | ## benchmark 4 | 5 | we use library [criterion](https://github.com/bheisler/criterion.rs) for benchmark. 6 | 7 | add benchmark like such: 8 | 9 | ```toml 10 | # Cargo.toml 11 | 12 | [[bench]] 13 | name = "rpc" 14 | harness = false 15 | ``` 16 | 17 | create the file benches/rpc/main.rs or benches/rpc.rs. 18 | 19 | ```rust 20 | // benches/rpc/main.rs or benches/rpc.rs 21 | // ...... 22 | use criterion::{criterion_group, criterion_main, Criterion}; 23 | 24 | fn rpc_benchmark(c: &mut Criterion) { 25 | // add your bench like below. 26 | c.bench_function("rpc_bench100000", |b| b.iter(|| cli(100000))); 27 | } 28 | 29 | //define benchmark configuration like below. 
30 | criterion_group!( 31 | name=benches; 32 | config=Criterion::default().significance_level(0.1).sample_size(10); 33 | targets = rpc_benchmark 34 | ); 35 | criterion_main!(benches); 36 | ``` 37 | 38 | To run a benchmark: 39 | `cargo bench --bench <name>` 40 | 41 | ```shell 42 | cargo bench --bench rpc 43 | ``` 44 | 45 | ## log 46 | 47 | We use the [env_logger](https://docs.rs/env_logger/0.10.0/env_logger/) library, which provides five log levels: "ERROR", "WARN", "INFO", "DEBUG", "TRACE". 48 | 49 | For flexible usage, you can specify the log level with `./target/debug/server --log-level info`. The default log level is set in `examples/*.yaml`. 50 | 51 | Logging principles: 52 | 53 | 1. Log key change information; 54 | 2. Apply the right level of logging; 55 | 3. Avoid duplication of logging information; 56 | 4. ...... 57 | 58 | Details about the log levels: 59 | 60 | **error** : Designates very serious errors. 61 | 62 | The error log generally refers to program-level errors or serious business errors that do not affect the operation of the program. 63 | 64 | **warn** : Designates hazardous situations. 65 | 66 | The warn log indicates a situation that needs attention, although it is not certain that an error has occurred. For example, a user connection is closed abnormally, the relevant configuration cannot be found and only the default configuration can be used, retry after XX seconds, etc. 67 | 68 | **info** : Designates useful information. 69 | 70 | The info log is often used to record information about the operation of a program, such as user operations or changes in status, connection establishment and termination. 71 | 72 | **debug** : Designates lower priority information. 73 | 74 | The debug log is used for detailed information, such as user request details tracking and the configuration information that was read. 75 | 76 | **trace** : Designates very low priority, often extremely verbose, information. 
77 | -------------------------------------------------------------------------------- /examples/hello_client.rs: -------------------------------------------------------------------------------- 1 | //! hello_client and hello_server demos show how rpc process the message sent by client 2 | //! and the usage of 'call_remote' and 'dispatch' APIs. 3 | //! 4 | //! After starting server: 5 | //! 6 | //! cargo run --example hello_server --features=disk-db 7 | //! 8 | //! You can try this example by running: 9 | //! 10 | //! cargo run --example hello_client --features=disk-db 11 | 12 | use log::debug; 13 | use sealfs::rpc::client::{RpcClient, TcpStreamCreator}; 14 | use std::sync::Arc; 15 | use std::time::Duration; 16 | 17 | #[tokio::main] 18 | pub async fn main() { 19 | let mut builder = env_logger::Builder::from_default_env(); 20 | builder 21 | .format_timestamp(None) 22 | .filter(None, log::LevelFilter::Info); 23 | builder.init(); 24 | let total = 10000; 25 | let elapsed = cli(total).await; 26 | println!("elapsed: {:?}", elapsed); 27 | } 28 | 29 | pub async fn cli(total: u32) -> Duration { 30 | let client: Arc< 31 | RpcClient< 32 | tokio::net::tcp::OwnedReadHalf, 33 | tokio::net::tcp::OwnedWriteHalf, 34 | TcpStreamCreator, 35 | >, 36 | > = Arc::new(RpcClient::default()); 37 | let server_address = "127.0.0.1:50051"; 38 | client.add_connection(server_address).await.unwrap(); 39 | // sleep for 1 second to wait for server to start 40 | // tokio::time::sleep(tokio::time::Duration::from_millis(100)).await; 41 | let mut handles = vec![]; 42 | let start = tokio::time::Instant::now(); 43 | for _ in 0..total { 44 | let new_client = client.clone(); 45 | handles.push(tokio::spawn(async move { 46 | let mut status = 0; 47 | let mut rsp_flags = 0; 48 | let mut recv_meta_data_length = 0; 49 | let mut recv_data_length = 0; 50 | let mut recv_meta_data = vec![0u8; 4]; 51 | let mut recv_data = vec![0u8; 4]; 52 | debug!("call_remote, start"); 53 | let result = new_client 54 | .call_remote( 
55 | server_address, 56 | 0, 57 | 0, 58 | "", 59 | &[], 60 | &[0u8; 10], 61 | &mut status, 62 | &mut rsp_flags, 63 | &mut recv_meta_data_length, 64 | &mut recv_data_length, 65 | &mut recv_meta_data, 66 | &mut recv_data, 67 | Duration::from_secs(10), 68 | ) 69 | .await; 70 | debug!("call_remote, result: {:?}", result); 71 | match result { 72 | Ok(_) => { 73 | if status == 0 { 74 | println!("Success"); 75 | } else { 76 | println!("Error: {}", status); 77 | } 78 | } 79 | Err(e) => { 80 | println!("Error: {}", e); 81 | } 82 | } 83 | })); 84 | } 85 | for handle in handles { 86 | if let Err(e) = handle.await { 87 | println!("Error: {}", e); 88 | } 89 | } 90 | let elapsed = start.elapsed(); 91 | client.close(); 92 | elapsed 93 | } 94 | -------------------------------------------------------------------------------- /examples/hello_server.rs: -------------------------------------------------------------------------------- 1 | //! hello_client and hello_server demos show how rpc process the message sent by client 2 | //! and the usage of 'call_remote' and 'dispatch' APIs. 3 | //! 4 | //! You can try this example by running: 5 | //! 6 | //! cargo run --example hello_server --features=disk-db 7 | //! 8 | //! And then start client in another terminal by running: 9 | //! 10 | //! cargo run --example hello_client --features=disk-db 11 | 12 | #![allow(unused)] 13 | use async_trait::async_trait; 14 | use log::debug; 15 | use sealfs::rpc::server::{Handler, RpcServer}; 16 | use std::sync::Arc; 17 | use tokio::sync::Mutex; 18 | pub struct HelloHandler {} 19 | 20 | impl HelloHandler { 21 | pub fn new() -> Self { 22 | Self {} 23 | } 24 | } 25 | 26 | // lazy_static::lazy_static! 
{ 27 | // static ref HELLO_COUNT: Arc> = Arc::new(Mutex::new(0)); 28 | // } 29 | 30 | #[async_trait] 31 | impl Handler for HelloHandler { 32 | async fn dispatch( 33 | &self, 34 | _conn_id: u32, 35 | operation_type: u32, 36 | _flags: u32, 37 | path: Vec, 38 | data: Vec, 39 | _metadata: Vec, 40 | ) -> anyhow::Result<(i32, u32, usize, usize, Vec, Vec)> { 41 | // debug!("dispatch, operation_type: {}", operation_type); 42 | // debug!("dispatch, path: {:?}", path); 43 | // debug!("dispatch, data: {:?}", data); 44 | match operation_type { 45 | 0 => { 46 | // let mut count = HELLO_COUNT.lock().await; 47 | // let buf = format!("Hello, {}!", count).into_bytes(); 48 | // *count += 1; 49 | Ok((0, 0, 4, 4, vec![1, 2, 3, 4], vec![5, 6, 7, 8])) 50 | } 51 | _ => { 52 | todo!() 53 | } 54 | } 55 | } 56 | } 57 | 58 | #[tokio::main] 59 | pub async fn main() -> anyhow::Result<()> { 60 | let mut builder = env_logger::Builder::from_default_env(); 61 | builder 62 | .format_timestamp(None) 63 | .filter(None, log::LevelFilter::Info); 64 | builder.init(); 65 | let server = RpcServer::new(Arc::new(HelloHandler::new()), "127.0.0.1:50051"); 66 | server.run().await?; 67 | Ok(()) 68 | } 69 | -------------------------------------------------------------------------------- /examples/manager.yaml: -------------------------------------------------------------------------------- 1 | address: 2 | 127.0.0.1:8081 3 | all_servers_address: 4 | - 127.0.0.1:8085 5 | - 127.0.0.1:8086 6 | - 127.0.0.1:8087 7 | - 127.0.0.1:8088 8 | - 127.0.0.1:8089 9 | virtual_nodes: 10 | 100 11 | log_level: 12 | warn 13 | -------------------------------------------------------------------------------- /examples/rdma_client.rs: -------------------------------------------------------------------------------- 1 | //! cargo run --example rdma_client --features=disk-db 2 | //! 
3 | 4 | use log::debug; 5 | use sealfs::rpc::rdma::client::Client; 6 | use std::{sync::Arc, time::Duration}; 7 | 8 | #[tokio::main] 9 | pub async fn main() { 10 | let mut builder = env_logger::Builder::from_default_env(); 11 | builder 12 | .format_timestamp(None) 13 | .filter(None, log::LevelFilter::Info); 14 | builder.init(); 15 | let total = 10000; 16 | let elapsed = cli(total).await; 17 | println!("elapsed: {:?}", elapsed); 18 | } 19 | 20 | pub async fn cli(total: u32) -> Duration { 21 | let client = Arc::new(Client::new()); 22 | let server_address = "127.0.0.1:7777"; 23 | client.add_connection(server_address).await; 24 | // sleep for 1 second to wait for server to start 25 | // tokio::time::sleep(tokio::time::Duration::from_millis(100)).await; 26 | let mut handles = vec![]; 27 | let start = tokio::time::Instant::now(); 28 | for _ in 0..total { 29 | let new_client = client.clone(); 30 | handles.push(tokio::spawn(async move { 31 | let mut status = 0; 32 | let mut rsp_flags = 0; 33 | let mut recv_meta_data_length = 0; 34 | let mut recv_data_length = 0; 35 | let mut recv_meta_data = vec![0u8; 4]; 36 | let mut recv_data = vec![0u8; 4]; 37 | debug!("call_remote, start"); 38 | let result = new_client 39 | .call_remote( 40 | server_address, 41 | 0, 42 | 0, 43 | "", 44 | &[], 45 | &[0u8; 10], 46 | &mut status, 47 | &mut rsp_flags, 48 | &mut recv_meta_data_length, 49 | &mut recv_data_length, 50 | &mut recv_meta_data, 51 | &mut recv_data, 52 | Duration::from_secs(10), 53 | ) 54 | .await; 55 | debug!("call_remote, result: {:?}", result); 56 | match result { 57 | Ok(_) => { 58 | if status == 0 { 59 | // // print recv_metadata and recv_data 60 | // println!( 61 | // "result: {}, recv_meta_data: {:?}, recv_data: {:?}", 62 | // i, recv_meta_data, recv_data 63 | // ); 64 | } else { 65 | println!("Error: {}", status); 66 | } 67 | } 68 | Err(e) => { 69 | println!("Error: {}", e); 70 | } 71 | } 72 | })); 73 | } 74 | for handle in handles { 75 | if let Err(e) = handle.await { 76 | 
println!("Error: {}", e); 77 | } 78 | } 79 | let elapsed = start.elapsed(); 80 | client.close(); 81 | elapsed 82 | } 83 | -------------------------------------------------------------------------------- /examples/rdma_server.rs: -------------------------------------------------------------------------------- 1 | //! cargo run --example rdma_server --features=disk-db 2 | //! 3 | 4 | use async_trait::async_trait; 5 | use sealfs::rpc::{rdma::server::Server, server::Handler}; 6 | use std::sync::Arc; 7 | pub struct HelloHandler {} 8 | 9 | impl HelloHandler { 10 | pub fn new() -> Self { 11 | Self {} 12 | } 13 | } 14 | 15 | // lazy_static::lazy_static! { 16 | // static ref HELLO_COUNT: Arc> = Arc::new(Mutex::new(0)); 17 | // } 18 | 19 | #[async_trait] 20 | impl Handler for HelloHandler { 21 | async fn dispatch( 22 | &self, 23 | _conn_id: u32, 24 | operation_type: u32, 25 | _flags: u32, 26 | _path: Vec, 27 | _data: Vec, 28 | _metadata: Vec, 29 | ) -> anyhow::Result<(i32, u32, usize, usize, Vec, Vec)> { 30 | // println!("metadata: {:?}", metadata); 31 | // println!("data: {:?}", data); 32 | match operation_type { 33 | 0 => Ok((0, 0, 4, 4, vec![1, 2, 3, 4], vec![5, 6, 7, 8])), 34 | _ => { 35 | todo!() 36 | } 37 | } 38 | } 39 | } 40 | 41 | #[tokio::main] 42 | pub async fn main() -> anyhow::Result<()> { 43 | let server = Server::new("127.0.0.1:7777".to_string(), Arc::new(HelloHandler::new())).await; 44 | server.run().await?; 45 | Ok(()) 46 | } 47 | -------------------------------------------------------------------------------- /intercept/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "intercept" 3 | version = "0.1.0" 4 | edition = "2021" 5 | authors = ["The Sealfs Developers"] 6 | license = "Apache-2.0" 7 | 8 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 9 | 10 | [dependencies] 11 | libc = "0.2" 12 | tokio = {version = "1.22.0", features = ["full"]} 13 | 
dashmap = "5.4.0" 14 | lazy_static = "1.4.0" 15 | crossbeam-channel = "0.5" 16 | log = "0.4.17" 17 | env_logger = "0.9.1" 18 | bincode = "1.3.3" 19 | nix = "0.26.1" 20 | serde = { version = "1", features = ["derive"] } 21 | serde_yaml = "0.9.14" 22 | async-trait = "0.1.59" 23 | spin = "0.5" 24 | sealfs = { path = "../" } 25 | 26 | [build-dependencies] 27 | cmake = "0.1" 28 | 29 | [dev-dependencies] 30 | libc = "0.2" 31 | 32 | [lib] 33 | crate-type = ["cdylib"] -------------------------------------------------------------------------------- /intercept/build.rs: -------------------------------------------------------------------------------- 1 | fn main() { 2 | let dst = cmake::build("../syscall_intercept"); 3 | 4 | println!("cargo:rustc-link-search=native={}/lib", dst.display()); 5 | println!("cargo:rustc-link-lib=static=syscall_intercept"); 6 | println!("cargo:rustc-link-lib=capstone"); 7 | } 8 | -------------------------------------------------------------------------------- /intercept/src/file_desc.rs: -------------------------------------------------------------------------------- 1 | use crossbeam_channel::{bounded, Receiver, Sender}; 2 | use dashmap::DashMap; 3 | 4 | #[derive(PartialEq, Debug, Clone)] 5 | pub enum FdType { 6 | File, 7 | Dir, 8 | } 9 | 10 | #[derive(Clone)] 11 | pub struct FdAttr { 12 | pub pathname: String, 13 | pub r#type: FdType, 14 | pub offset: i64, 15 | pub flags: i32, 16 | } 17 | 18 | lazy_static::lazy_static! 
{ 19 | static ref IDLE_FD: (Sender, Receiver) = { 20 | let (s, r) = bounded(1024); 21 | for i in 10000..11024 { 22 | s.send(i).unwrap(); 23 | } 24 | (s, r) 25 | }; 26 | static ref FD_TB: DashMap = DashMap::new(); 27 | } 28 | 29 | pub fn insert_attr(attr: FdAttr) -> Option { 30 | let fd = match IDLE_FD.1.recv() { 31 | Ok(value) => value, 32 | Err(_) => return None, 33 | }; 34 | 35 | FD_TB.insert(fd, attr); 36 | return Some(fd); 37 | } 38 | 39 | pub fn remove_attr(fd: i32) -> bool { 40 | match FD_TB.remove(&fd) { 41 | Some(_) => { 42 | IDLE_FD.0.send(fd).unwrap(); 43 | true 44 | } 45 | None => false, 46 | } 47 | } 48 | 49 | pub fn get_attr(fd: i32) -> Option { 50 | match FD_TB.get(&fd) { 51 | Some(value) => Some((*value).clone()), 52 | None => None, 53 | } 54 | } 55 | 56 | pub fn set_attr(fd: i32, attr: FdAttr) -> bool { 57 | match FD_TB.get_mut(&fd) { 58 | Some(mut value) => { 59 | *value = attr; 60 | true 61 | } 62 | None => false, 63 | } 64 | } 65 | 66 | pub fn set_offset(fd: i32, offset: i64) { 67 | FD_TB.get_mut(&fd).unwrap().offset = offset as i64 68 | } 69 | -------------------------------------------------------------------------------- /intercept/src/path.rs: -------------------------------------------------------------------------------- 1 | lazy_static::lazy_static! 
{ 2 | pub static ref CURRENT_DIR: String = std::env::current_dir() 3 | .unwrap() 4 | .to_str() 5 | .unwrap() 6 | .to_string(); 7 | pub static ref MOUNT_POINT: String = { 8 | let mut value = 9 | std::env::var("SEALFS_MOUNT_POINT").unwrap_or_else(|_| "/mnt/fs".to_string()); 10 | if value.ends_with('/') { 11 | value.pop(); 12 | } 13 | value 14 | }; 15 | pub static ref VOLUME_NAME: String = { 16 | std::env::var("SEALFS_VOLUME_NAME").unwrap_or_else(|_| "sealfs".to_string()) 17 | }; 18 | } 19 | 20 | fn get_realpath(path: &str) -> Option { 21 | // An absolute pathname 22 | let path = if path.starts_with('/') { 23 | path.to_string() 24 | } else { 25 | let mut cwd = CURRENT_DIR.to_string(); 26 | cwd.push('/'); 27 | cwd.push_str(path); 28 | cwd 29 | }; 30 | let mut start = 0; 31 | let mut end; 32 | let path_bytes = path.as_bytes(); 33 | let mut result = String::new(); 34 | while start < path_bytes.len() { 35 | while start < path_bytes.len() && path_bytes[start] == b'/' { 36 | start += 1; 37 | } 38 | end = start; 39 | while end < path_bytes.len() && path_bytes[end] != b'/' { 40 | end += 1; 41 | } 42 | let len = end - start; 43 | if len == 0 { 44 | break; 45 | } else if len == 1 && path_bytes[start] == b'.' { 46 | /* nothing */ 47 | } else if len == 2 && path_bytes[start] == b'.' && path_bytes[start + 1] == b'.' 
{ 48 | while result.len() > 0 && !result.ends_with('/') { 49 | result.pop(); 50 | } 51 | if result.len() == 0 { 52 | return None; 53 | } 54 | result.pop(); 55 | } else { 56 | result.push('/'); 57 | result.push_str(&String::from_utf8(path_bytes[start..end].to_vec()).unwrap()); 58 | } 59 | start = end; 60 | } 61 | Some(result) 62 | } 63 | 64 | pub fn get_absolutepath(dir_path: &str, file_path: &str) -> Result { 65 | // An absolute pathname 66 | if file_path.starts_with('/') { 67 | match get_realpath(file_path) { 68 | Some(value) => return Ok(value), 69 | None => return Err(0), 70 | } 71 | } 72 | 73 | // By file descriptor 74 | if file_path.is_empty() { 75 | match get_realpath(dir_path) { 76 | Some(value) => return Ok(value), 77 | None => return Err(0), 78 | } 79 | } 80 | 81 | // By file descriptor 82 | match get_realpath(&(dir_path.to_string() + "/" + &file_path)) { 83 | Some(value) => Ok(value), 84 | None => Err(0), 85 | } 86 | } 87 | 88 | pub fn get_remotepath(path: &str) -> Option { 89 | if path.starts_with(MOUNT_POINT.as_str()) { 90 | let mut remotepath = VOLUME_NAME.clone(); 91 | remotepath.push_str(&path[MOUNT_POINT.len()..]); 92 | if remotepath.len() > 1 && remotepath.ends_with('/') { 93 | remotepath.pop(); 94 | } 95 | return Some(remotepath); 96 | } 97 | None 98 | } 99 | -------------------------------------------------------------------------------- /intercept/src/syscall_intercept.rs: -------------------------------------------------------------------------------- 1 | #[link(name = "syscall_intercept")] 2 | extern "C" { 3 | static mut intercept_hook_point: Option; 4 | 5 | pub fn syscall_no_intercept(num: isize, ...) -> isize; 6 | } 7 | 8 | /// Set syscall intercept hook function. 9 | /// 10 | /// # Safety 11 | /// 12 | /// This function will change all syscall behavior! 13 | pub unsafe fn set_hook_fn(f: HookFn) { 14 | intercept_hook_point = Some(f); 15 | } 16 | 17 | /// Clear syscall intercept hook function. 
18 | /// 19 | /// # Safety 20 | /// 21 | /// This function will change all syscall behavior! 22 | pub unsafe fn unset_hook_fn() { 23 | intercept_hook_point = None; 24 | } 25 | 26 | /// The type of hook function. 27 | pub type HookFn = extern "C" fn( 28 | num: isize, 29 | a0: isize, 30 | a1: isize, 31 | a2: isize, 32 | a3: isize, 33 | a4: isize, 34 | a5: isize, 35 | result: &mut isize, 36 | ) -> InterceptResult; 37 | 38 | /// The return value of hook function. 39 | #[repr(i32)] 40 | pub enum InterceptResult { 41 | /// The user takes over the system call. The return value should be set via `result`. 42 | Hook = 0, 43 | /// The specific system call was ignored by the user and the original syscall should be executed. 44 | Forward = 1, 45 | } 46 | -------------------------------------------------------------------------------- /intercept/src/test_log.rs: -------------------------------------------------------------------------------- 1 | use libc::SYS_write; 2 | 3 | use crate::syscall_intercept::syscall_no_intercept; 4 | 5 | struct CStrPointer { 6 | _p: *const u8, 7 | } 8 | 9 | unsafe impl std::marker::Sync for CStrPointer {} 10 | unsafe impl std::marker::Send for CStrPointer {} 11 | lazy_static::lazy_static! 
{ 12 | static ref LOG_BUF: Vec = vec![0u8; 10]; 13 | } 14 | 15 | pub unsafe fn _print_log(mut num: i32, pre_char: char, suf_char: char) { 16 | let c = LOG_BUF.as_slice().as_ptr() as *mut u8; 17 | let mut cnt = 1; 18 | *c = pre_char as u8; 19 | while num != 0 { 20 | *c.offset(cnt) = ((num % 10) + 48) as u8; 21 | num /= 10; 22 | cnt += 1; 23 | } 24 | *c.offset(cnt) = suf_char as u8; 25 | *c.offset(cnt + 1) = 0 as u8; 26 | syscall_no_intercept(SYS_write as isize, 1, c, cnt + 2); 27 | } 28 | -------------------------------------------------------------------------------- /proto/test.proto: -------------------------------------------------------------------------------- 1 | syntax = "proto3"; 2 | package helloworld; 3 | service Greeter { 4 | rpc SayHello (HelloRequest) returns (HelloReply); 5 | } 6 | 7 | message HelloRequest { 8 | uint32 id = 1; 9 | uint32 type = 2; 10 | uint32 flags = 3; 11 | string filename = 4; 12 | string meta_data = 5; 13 | string data = 6; 14 | } 15 | 16 | message HelloReply { 17 | uint32 id = 1; 18 | int32 status = 2; 19 | uint32 flags = 3; 20 | string meta_data = 5; 21 | string data = 6; 22 | } -------------------------------------------------------------------------------- /scripts/add_node.sh: -------------------------------------------------------------------------------- 1 | for ((i=1; i<=10; i++)) 2 | do 3 | mkdir ~/fs/test_rm$i 4 | echo "test" >> ~/fs/test_rm$i/test.log 5 | sleep 0.1 6 | done 7 | 8 | target/debug/client --log-level info add 127.0.0.1:8090 9 | ./target/debug/server --server-address 127.0.0.1:8090 --database-path /data/database5/ --storage-path /data/storage5/ --log-level info 10 | 11 | 12 | for ((i=1; i<=10; i++)) 13 | do 14 | rm ~/fs/test_rm$i/test.log 15 | echo heart 16 | rm -r ~/fs/test_rm$i 17 | sleep 0.1 18 | done -------------------------------------------------------------------------------- /scripts/close_all_instances.sh: -------------------------------------------------------------------------------- 1 | set +e 2 
| # ps and kill the process start by this command "target/debug/server" 3 | ps -ef | grep "target/debug/server" | grep -v grep | awk '{print $2}' | xargs kill -9 4 | # ps and kill the process start by this command "target/debug/manager" 5 | ps -ef | grep "target/debug/manager" | grep -v grep | awk '{print $2}' | xargs kill -9 6 | # ps and kill the process start by this command "target/debug/client" 7 | ps -ef | grep "target/debug/client" | grep -v grep | awk '{print $2}' | xargs kill -9 8 | 9 | # ps and kill the process start by this command "target/release/server" 10 | ps -ef | grep "target/release/server" | grep -v grep | awk '{print $2}' | xargs kill -9 11 | # ps and kill the process start by this command "target/release/manager" 12 | ps -ef | grep "target/release/manager" | grep -v grep | awk '{print $2}' | xargs kill -9 13 | # ps and kill the process start by this command "target/release/client" 14 | ps -ef | grep "target/release/client" | grep -v grep | awk '{print $2}' | xargs kill -9 15 | 16 | # ps and kill the process start by this command "target/release/client" 17 | ps -ef | grep "run_all" | grep -v grep | awk '{print $2}' | xargs kill -9 -------------------------------------------------------------------------------- /scripts/delete_node.sh: -------------------------------------------------------------------------------- 1 | target/debug/client --log-level info delete 127.0.0.1:8089 -------------------------------------------------------------------------------- /scripts/read_files.sh: -------------------------------------------------------------------------------- 1 | 2 | for ((l=1; l<=100; l++)) 3 | do 4 | for ((i=1; i<=10; i++)) 5 | do 6 | mkdir ~/fs/test_rm$i 7 | echo "test" >> ~/fs/test_rm$i/test.log 8 | sleep 0.1 9 | done 10 | 11 | for ((i=1; i<=10; i++)) 12 | do 13 | cat ~/fs/test_rm$i/test.log 14 | echo "test" >> ~/fs/test_rm$i/test.log 15 | sleep 0.1 16 | done 17 | 18 | for ((i=1; i<=10; i++)) 19 | do 20 | sleep 0.1 21 | done 22 | 23 | for 
((i=1; i<=10; i++)) 24 | do 25 | rm ~/fs/test_rm$i/test.log 26 | echo heart 27 | rm -r ~/fs/test_rm$i 28 | sleep 0.1 29 | done 30 | done 31 | 32 | kill $(jobs -p) -------------------------------------------------------------------------------- /scripts/test.sh: -------------------------------------------------------------------------------- 1 | 2 | s=(SyncNewHashRing PreTransfer Transferring PreFinish Finishing Idle) 3 | 4 | for i in {0..5} 5 | do 6 | echo "test $i" 7 | echo "" 8 | 9 | scripts/close_all_instances.sh 10 | scripts/run_all.sh /data warn& 11 | sleep 10 12 | 13 | mkdir ~/fs/test_rm5 14 | 15 | target/debug/client --log-level info delete 127.0.0.1:8089 16 | 17 | while true 18 | do 19 | status=`target/debug/client status` 20 | #echo $status+${s[$i]} 21 | if [[ $status == *${s[$i]}* ]]; 22 | then 23 | break 24 | fi 25 | sleep 0.1 26 | done 27 | 28 | echo "test" >> ~/fs/test_rm5/test.log 29 | 30 | rm ~/fs/test_rm5/test.log 31 | 32 | while true 33 | do 34 | status=`target/debug/client status` 35 | #echo $status+${s[$i]} 36 | if [[ $status == *${s[5]}* ]]; 37 | then 38 | break 39 | fi 40 | sleep 0.1 41 | done 42 | 43 | rm -r ~/fs/test_rm5 44 | 45 | kill $(jobs -p) 46 | done -------------------------------------------------------------------------------- /scripts/test_run_all.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | function finish() { 4 | set +e 5 | trap 'kill $(jobs -p)' EXIT 6 | set -e 7 | exit $1 8 | } 9 | 10 | trap 'onCtrlC' INT 11 | function onCtrlC () { 12 | finish 0 13 | } 14 | 15 | function green_font() { 16 | echo -e "\033[32m$1\033[0m\c" 17 | } 18 | 19 | echo "start fuse_client_run" 20 | 21 | # exit with 1 if no argument 22 | if [ $# -eq 0 ] 23 | then 24 | echo "no argument" 25 | exit 1 26 | fi 27 | 28 | set +e 29 | 30 | rm /tmp/sealfs.sock 31 | rm /tmp/sealfs.index 32 | sudo umount ~/fs 33 | mkdir -p ~/fs 34 | 35 | set -e 36 | 37 | # check if $2 is empty, if empty, let $log_level = 
warn, else $log_level = $2 38 | if [ -z $2 ]; then 39 | log_level=info 40 | else 41 | log_level=$2 42 | fi 43 | 44 | SEALFS_CONFIG_PATH=./examples ./target/debug/manager --log-level $log_level & 45 | 46 | sudo rm -rf $1/database* 47 | sudo rm -rf $1/storage* 48 | for ((i=0; i<5; i++)) 49 | do 50 | port=$[8085+$i] 51 | ./target/debug/server --server-address 127.0.0.1:${port} --database-path $1/database${i}/ --storage-path $1/storage${i}/ --log-level $log_level & 52 | done 53 | 54 | sleep 3 55 | 56 | ./target/debug/client --log-level $log_level create-volume test1 100000 57 | 58 | ./target/debug/client --log-level $log_level daemon& 59 | sleep 3 60 | 61 | ./target/debug/client --log-level $log_level mount ~/fs test1 62 | sleep 3 63 | 64 | echo "press ctrl+c to stop" 65 | sleep 100000 -------------------------------------------------------------------------------- /src/bin/client.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2022 labring. All rights reserved. 2 | // 3 | // SPDX-License-Identifier: Apache-2.0 4 | 5 | use sealfs::client; 6 | 7 | #[tokio::main] 8 | async fn main() -> Result<(), Box> { 9 | if let Err(e) = client::run_command().await { 10 | println!("Error: {}", e); 11 | return Err(e); 12 | } 13 | Ok(()) 14 | } 15 | -------------------------------------------------------------------------------- /src/bin/manager.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2022 labring. All rights reserved. 
2 | // 3 | // SPDX-License-Identifier: Apache-2.0 4 | 5 | use clap::Parser; 6 | use env_logger::fmt; 7 | use log::{error, info, warn}; 8 | use sealfs::manager::manager_service::update_server_status; 9 | use sealfs::{manager::manager_service::ManagerService, rpc::server::RpcServer}; 10 | use serde::{Deserialize, Serialize}; 11 | use std::fs; 12 | use std::io::Read; 13 | use std::str::FromStr; 14 | use std::{fmt::Debug, sync::Arc}; 15 | 16 | #[derive(Parser, Debug)] 17 | #[command(author, version, about, long_about = None)] 18 | struct Args { 19 | #[arg(long)] 20 | address: Option, 21 | #[arg(long)] 22 | config_file: Option, 23 | /// To use customized configuration or not. If this flag is used, please provide a config file through --config_file 24 | #[arg(long)] 25 | use_config_file: bool, 26 | #[arg(long)] 27 | log_level: Option, 28 | #[arg(long)] 29 | all_servers_address: Option>, 30 | #[arg(long)] 31 | virtual_nodes: Option, 32 | } 33 | 34 | #[derive(Debug, Serialize, Deserialize)] 35 | struct Properties { 36 | address: String, 37 | all_servers_address: Vec, 38 | virtual_nodes: usize, 39 | log_level: String, 40 | } 41 | 42 | #[tokio::main] 43 | async fn main() -> anyhow::Result<()> { 44 | let mut builder = env_logger::Builder::from_default_env(); 45 | 46 | // read from default configuration. 47 | let config_path = std::env::var("SEALFS_CONFIG_PATH").unwrap_or("~".to_string()); 48 | 49 | let mut config_file = std::fs::File::open(format!("{}/{}", config_path, "manager.yaml")) 50 | .expect("manager.yaml open failed!"); 51 | 52 | let mut config_str = String::new(); 53 | 54 | config_file 55 | .read_to_string(&mut config_str) 56 | .expect("manager.yaml read failed!"); 57 | 58 | let default_properties: Properties = 59 | serde_yaml::from_str(&config_str).expect("manager.yaml serializa failed!"); 60 | 61 | // read from command line. 
62 | let args: Args = Args::parse(); 63 | let properties: Properties = match args.use_config_file { 64 | true => { 65 | // read from user-provided config file 66 | match args.config_file { 67 | Some(c) => { 68 | let yaml_str = fs::read_to_string(c).expect("Couldn't read from file. The file is either missing or you don't have enough permissions!"); 69 | let mut result: Properties = 70 | serde_yaml::from_str(&yaml_str).expect("manager.yaml read failed!"); 71 | if args.log_level.is_some() { 72 | result.log_level = args.log_level.unwrap(); 73 | } 74 | result 75 | } 76 | _ => { 77 | warn!( 78 | "No custom configuration provided, fallback to the default configuration." 79 | ); 80 | default_properties 81 | } 82 | } 83 | } 84 | false => Properties { 85 | address: args.address.unwrap_or(default_properties.address), 86 | all_servers_address: args 87 | .all_servers_address 88 | .unwrap_or(default_properties.all_servers_address), 89 | virtual_nodes: args 90 | .virtual_nodes 91 | .unwrap_or(default_properties.virtual_nodes), 92 | log_level: args.log_level.unwrap_or(default_properties.log_level), 93 | }, 94 | }; 95 | 96 | builder 97 | .format_timestamp(Some(fmt::TimestampPrecision::Millis)) 98 | .filter( 99 | None, 100 | log::LevelFilter::from_str(&properties.log_level).unwrap(), 101 | ); 102 | builder.init(); 103 | 104 | info!("Starting manager with log level: {}", properties.log_level); 105 | 106 | let address = properties.address; 107 | 108 | let servers_address = properties 109 | .all_servers_address 110 | .iter() 111 | .map(|s| (s.to_string(), properties.virtual_nodes)) 112 | .collect::>(); 113 | 114 | info!("All servers address: {:?}", servers_address); 115 | 116 | let manager = Arc::new(ManagerService::new(servers_address.clone())); 117 | 118 | let server = Arc::new(RpcServer::new(manager.clone(), &address)); 119 | 120 | info!("Manager started at {}", address); 121 | 122 | let new_manager = manager.clone(); 123 | 124 | tokio::spawn(async move { 125 | if let Err(e) = 
server.run().await { 126 | error!("Manager server error: {}", e); 127 | new_manager 128 | .manager 129 | .closed 130 | .store(true, std::sync::atomic::Ordering::Relaxed); 131 | } 132 | }); 133 | 134 | update_server_status(manager.manager.clone()).await; 135 | 136 | Ok(()) 137 | } 138 | -------------------------------------------------------------------------------- /src/bin/server.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2022 labring. All rights reserved. 2 | // 3 | // SPDX-License-Identifier: Apache-2.0 4 | 5 | use clap::Parser; 6 | use env_logger::fmt; 7 | use log::info; 8 | use sealfs::server; 9 | use serde::{Deserialize, Serialize}; 10 | use std::fmt::Debug; 11 | use std::str::FromStr; 12 | 13 | const _SERVER_FLAG: u32 = 1; 14 | 15 | #[derive(Parser, Debug)] 16 | #[command(author, version, about, long_about = None)] 17 | struct Args { 18 | #[arg(long)] 19 | manager_address: Option, 20 | #[arg(required = true, long)] 21 | server_address: Option, 22 | #[arg(required = true, long)] 23 | database_path: Option, 24 | #[arg(long)] 25 | cache_capacity: Option, 26 | #[arg(long)] 27 | write_buffer_size: Option, 28 | #[arg(required = true, long)] 29 | storage_path: Option, 30 | #[arg(long)] 31 | log_level: Option, 32 | } 33 | 34 | #[derive(Debug, Serialize, Deserialize)] 35 | struct Properties { 36 | manager_address: String, 37 | server_address: String, 38 | database_path: String, 39 | cache_capacity: usize, 40 | write_buffer_size: usize, 41 | storage_path: String, 42 | log_level: String, 43 | } 44 | 45 | #[tokio::main] 46 | async fn main() -> anyhow::Result<(), Box> { 47 | // read from command line. 48 | let args: Args = Args::parse(); 49 | // if the user provides the config file, parse it and use the arguments from the config file. 
50 | let properties: Properties = Properties { 51 | manager_address: args.manager_address.unwrap_or("127.0.0.1:8081".to_owned()), 52 | server_address: args.server_address.unwrap(), 53 | database_path: args.database_path.unwrap(), 54 | cache_capacity: args.cache_capacity.unwrap_or(13421772), 55 | write_buffer_size: args.write_buffer_size.unwrap_or(0x4000000), 56 | storage_path: args.storage_path.unwrap(), 57 | log_level: args.log_level.unwrap_or("warn".to_owned()), 58 | }; 59 | 60 | let mut builder = env_logger::Builder::from_default_env(); 61 | builder 62 | .format_timestamp(Some(fmt::TimestampPrecision::Millis)) 63 | .filter( 64 | None, 65 | match log::LevelFilter::from_str(&properties.log_level) { 66 | Ok(level) => level, 67 | Err(_) => log::LevelFilter::Warn, 68 | }, 69 | ); 70 | builder.init(); 71 | 72 | info!("start server with properties: {:?}", properties); 73 | 74 | let manager_address = properties.manager_address; 75 | let server_address = properties.server_address.clone(); 76 | 77 | server::run( 78 | properties.database_path, 79 | properties.storage_path, 80 | server_address, 81 | manager_address, 82 | properties.cache_capacity, 83 | properties.write_buffer_size, 84 | ) 85 | .await?; 86 | Ok(()) 87 | } 88 | -------------------------------------------------------------------------------- /src/common/byte.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2022 labring. All rights reserved. 2 | // 3 | // SPDX-License-Identifier: Apache-2.0 4 | 5 | pub fn array2u32(array: &[u8]) -> u32 { 6 | (array[0] as u32) 7 | + ((array[1] as u32) << 8) 8 | + ((array[2] as u32) << 16) 9 | + ((array[3] as u32) << 24) 10 | } 11 | 12 | pub const CHUNK_SIZE: i64 = 65536; 13 | -------------------------------------------------------------------------------- /src/common/cache.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2022 labring. All rights reserved. 
2 | // 3 | // SPDX-License-Identifier: Apache-2.0 4 | 5 | use dashmap::DashMap; 6 | use parking_lot::Mutex; 7 | use std::sync::atomic::{AtomicUsize, Ordering}; 8 | use std::{marker::PhantomData, mem, ptr::NonNull}; 9 | 10 | #[derive(Copy, Clone)] 11 | struct NodePointer(Option>>); 12 | 13 | unsafe impl Send for NodePointer {} 14 | unsafe impl Sync for NodePointer {} 15 | 16 | impl Default for NodePointer { 17 | fn default() -> Self { 18 | NodePointer(None) 19 | } 20 | } 21 | 22 | pub struct Node { 23 | val: T, 24 | next: Mutex>, 25 | prev: Mutex>, 26 | } 27 | 28 | impl Node { 29 | fn new(val: T) -> Self { 30 | Self { 31 | val, 32 | next: Mutex::new(NodePointer::default()), 33 | prev: Mutex::new(NodePointer::default()), 34 | } 35 | } 36 | 37 | fn into_val(self) -> T { 38 | self.val 39 | } 40 | } 41 | 42 | pub struct LinkedList 43 | where 44 | T: std::fmt::Debug, 45 | { 46 | length: AtomicUsize, 47 | head: Mutex>, 48 | tail: Mutex>, 49 | _marker: PhantomData>>, 50 | } 51 | 52 | impl Default for LinkedList 53 | where 54 | T: std::fmt::Debug, 55 | { 56 | fn default() -> Self { 57 | Self { 58 | length: 0.into(), 59 | head: Mutex::new(NodePointer::default()), 60 | tail: Mutex::new(NodePointer::default()), 61 | _marker: PhantomData, 62 | } 63 | } 64 | } 65 | 66 | impl LinkedList 67 | where 68 | T: std::fmt::Debug, 69 | { 70 | pub fn new() -> Self { 71 | Self { 72 | length: 0.into(), 73 | head: Mutex::new(NodePointer::default()), 74 | tail: Mutex::new(NodePointer::default()), 75 | _marker: PhantomData, 76 | } 77 | } 78 | 79 | pub fn insert_front(&self, val: T) { 80 | let node = Box::new(Node::new(val)); 81 | let node = NonNull::new(Box::into_raw(node)).unwrap(); 82 | self.insert_front_raw(node); 83 | } 84 | 85 | pub fn insert_front_raw(&self, mut node: NonNull>) { 86 | let mut head_locked = self.head.lock(); 87 | unsafe { 88 | node.as_mut().next.lock().0 = head_locked.0; 89 | node.as_mut().prev = Mutex::new(NodePointer::default()); 90 | } 91 | 92 | match head_locked.0 { 93 
// Link the new node in front of the current head, or make it the tail if the list is empty.
            Some(head) => unsafe {
                (*head.as_ptr()).prev.lock().0 = Some(node);
            },
            None => {
                self.tail.lock().0 = Some(node);
            }
        }
        head_locked.0 = Some(node);
        self.length.fetch_add(1, Ordering::Relaxed);
    }

    /// Unlinks `node` from the list, frees its allocation and returns the value it held.
    ///
    /// `node` is dereferenced and then reclaimed with `Box::from_raw`, so it must point
    /// at a node that is currently linked into this list and is not used afterwards.
    pub fn remove(&self, mut node: NonNull<Node<T>>) -> T {
        let node_mut = unsafe { node.as_mut() };
        self.length.fetch_sub(1, Ordering::Relaxed);
        // Bypass the node on the "next" chain (or move the head if it was first).
        match node_mut.prev.lock().0 {
            Some(prev) => unsafe { (*prev.as_ptr()).next.lock().0 = node_mut.next.lock().0 },
            None => {
                self.head.lock().0 = node_mut.next.lock().0;
            }
        }
        // Bypass the node on the "prev" chain (or move the tail if it was last).
        match node_mut.next.lock().0 {
            Some(next) => unsafe { (*next.as_ptr()).prev.lock().0 = node_mut.prev.lock().0 },
            None => self.tail.lock().0 = node_mut.prev.lock().0,
        }
        unsafe {
            let n = Box::from_raw(node.as_ptr());
            n.into_val()
        }
    }

    /// Moves `node` to the front of the list; does nothing if it already is the head.
    ///
    /// The node is unlinked in place (not freed) and handed to `insert_front_raw`,
    /// so the decrement below is balanced by the increment done on re-insertion.
    pub fn reinsert_front(&self, mut node: NonNull<Node<T>>) {
        {
            let head_locked = self.head.lock();
            if head_locked.0 == Some(node) {
                return;
            }
        }
        let node_mut = unsafe { node.as_mut() };
        self.length.fetch_sub(1, Ordering::Relaxed);
        match node_mut.prev.lock().0 {
            Some(prev) => unsafe { (*prev.as_ptr()).next.lock().0 = node_mut.next.lock().0 },
            None => {
                self.head.lock().0 = node_mut.next.lock().0;
            }
        }
        match node_mut.next.lock().0 {
            Some(next) => unsafe { (*next.as_ptr()).prev.lock().0 = node_mut.prev.lock().0 },
            None => self.tail.lock().0 = node_mut.prev.lock().0,
        }
        self.insert_front_raw(node);
    }

    /// Removes the last node and returns its value, or `None` if the list is empty.
    pub fn remove_tail(&self) -> Option<T> {
        let mut tail_locked = self.tail.lock();
        match tail_locked.0 {
            Some(tail) => unsafe {
                // Only count a removal that actually happens: decrementing on an empty
                // list would wrap `length` around to `usize::MAX`.
                self.length.fetch_sub(1, Ordering::Relaxed);
                let node = Box::from_raw(tail.as_ptr());
                {
                    // The guard borrows `node`; keep it in its own scope so it is
                    // released before the node is consumed below.
                    let prev_node_locked = node.prev.lock();
                    tail_locked.0 = prev_node_locked.0;
                    match tail_locked.0 {
                        Some(t) => {
                            (*t.as_ptr()).next.lock().0 = None;
                        }
                        None => {
                            // That was the only node: the list is now empty.
                            self.head.lock().0 = None;
                        }
                    }
                }
                Some(node.into_val())
            },
            None => {
                let mut head_locked = self.head.lock();
                head_locked.0 = None;
                None
            }
        }
    }

    /// Returns an iterator over the values from head to tail.
    ///
    /// The start node and the element count are sampled once, when the iterator is created.
    pub fn iter(&self) -> Iter<'_, T> {
        Iter {
            head: NodePointer(self.head.lock().0),
            len: self.length.load(Ordering::Relaxed),
            _marker: PhantomData,
        }
    }
}

impl<T> Drop for LinkedList<T>
where
    T: std::fmt::Debug,
{
    /// Frees every remaining node.
    fn drop(&mut self) {
        /// Keeps draining the list if dropping one of the values panics.
        struct DropGuard<'a, T>(&'a mut LinkedList<T>)
        where
            T: std::fmt::Debug;
        impl<'a, T> Drop for DropGuard<'a, T>
        where
            T: std::fmt::Debug,
        {
            fn drop(&mut self) {
                while self.0.remove_tail().is_some() {}
            }
        }

        while let Some(node) = self.remove_tail() {
            let guard = DropGuard(self);
            drop(node);
            // The value dropped without panicking, so the guard must not run.
            mem::forget(guard);
        }
    }
}

/// Borrowing iterator produced by `LinkedList::iter`.
pub struct Iter<'a, T: 'a> {
    head: NodePointer<T>,
    len: usize,
    _marker: PhantomData<&'a Node<T>>,
}

impl<'a, T> Iterator for Iter<'a, T> {
    type Item = &'a T;

    /// Yields the next value; stops after `len` items or at the end of the chain,
    /// whichever comes first.
    #[inline]
    fn next(&mut self) -> Option<Self::Item> {
        if self.len == 0 {
            None
        } else {
            self.head.0.map(|node| {
                self.len -= 1;

                unsafe {
                    let node = &*node.as_ptr();
                    self.head = NodePointer(node.next.lock().0);
                    &node.val
                }
            })
        }
    }
}

impl<T: std::fmt::Debug> std::fmt::Debug for LinkedList<T> {
    /// Writes every value with `{:?}` followed by a single space, head first.
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        for cur in self.iter() {
            write!(f, "{:?} ", cur)?;
        }
        Ok(())
    }
}

/// A cache entry: the value together with a copy of its key, so that an evicted
/// list node can also be removed from the key map.
struct LRUEntry<T> {
    key: Vec<u8>,
    value: T,
}

impl<T> LRUEntry<T>
where
    T: std::fmt::Debug,
{
    /// Builds an entry that owns a copy of `key`.
    pub fn new(key: &[u8], value: T) -> Self {
        Self {
            key: key.to_vec(),
            value,
        }
    }
}

impl<T: std::fmt::Debug> std::fmt::Debug for LRUEntry<T> {
    /// Prints only the value; the key is omitted.
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        write!(f, "{:?}", self.value)?;
        Ok(())
    }
}

/// LRU cache keyed by byte strings.
///
/// `map` finds the list node of a key, `list` keeps the entries ordered from most
/// to least recently used, and `lock` is taken by every public operation.
pub struct LRUCache<T>
where
    T: std::fmt::Debug,
{
    map: DashMap<Vec<u8>, NodePointer<LRUEntry<T>>>,
    list: LinkedList<LRUEntry<T>>,
    capacity: usize,
    lock: Mutex<()>,
}

impl<T> LRUCache<T>
where
    T: std::fmt::Debug,
{
    /// Creates an empty cache that evicts once `capacity` entries are stored.
    pub fn new(capacity: usize) -> Self {
        Self {
            map: DashMap::new(),
            list: LinkedList::new(),
            capacity,
            lock: Mutex::new(()),
        }
    }

    /// Stores `value` under `key` as the most recently used entry.
    ///
    /// Returns the value that had to leave the cache: the previous value of `key`
    /// if it was present, otherwise the evicted least recently used value when the
    /// cache was full, otherwise `None`.
    pub fn insert(&self, key: &[u8], value: T) -> Option<T> {
        let _l = self.lock.lock();
        let new_node = LRUEntry::new(key, value);
        let new_node = Box::new(Node::new(new_node));
        let new_node = NonNull::new(Box::into_raw(new_node)).unwrap();

        let mut val = None;
        match self.map.get(key) {
            Some(entry) => {
                // Replace: drop the old node, the map entry is overwritten below.
                let entry = entry.0.unwrap();
                let value = self.list.remove(entry);
                val = Some(value.value);
                self.list.insert_front_raw(new_node);
            }
            None => {
                if self.list.length.load(Ordering::Relaxed) >= self.capacity {
                    // Full: evict the least recently used entry.
                    if let Some(entry) = self.list.remove_tail() {
                        self.map.remove(&entry.key);
                        val = Some(entry.value);
                    }
                }
                self.list.insert_front_raw(new_node);
            }
        }
        self.map.insert(key.to_vec(), NodePointer(Some(new_node)));
        val
    }

    /// Looks up `key` and marks it as most recently used.
    ///
    /// NOTE(review): the returned reference points into the list node and outlives
    /// the internal lock; a later `insert`/`remove` that frees the node would leave
    /// it dangling. Confirm callers never hold it across such calls.
    pub fn get(&self, key: &[u8]) -> Option<&T> {
        let _l = self.lock.lock();
        match self.map.get(key) {
            Some(node) => unsafe {
                let node = node.0.unwrap();
                let value = &node.as_ref().val.value;
                self.list.reinsert_front(node);
                Some(value)
            },
            None => None,
        }
    }

    /// Removes `key` from the cache if it is present.
    pub fn remove(&self, key: &[u8]) {
        let _l = self.lock.lock();
        if let Some(node) = self.map.get(key) {
            self.list.remove(node.0.unwrap());
        }
        self.map.remove(key);
    }
}

#[cfg(test)]
mod test {
    mod test_linkedlist {
        use super::super::LinkedList;

        #[test]
        fn test_insert() {
            let list: LinkedList<i32> = LinkedList::new();
            list.insert_front(2);
            list.insert_front(3);
            list.insert_front(4);
            let result = format!("{:?}", list);
            assert_eq!("4 3 2 ", result);
        }

        // Regression test: removing from an empty list must not disturb the length
        // counter, otherwise the element inserted afterwards is not iterated.
        #[test]
        fn test_remove_tail_on_empty_list() {
            let list: LinkedList<i32> = LinkedList::new();
            assert_eq!(None, list.remove_tail());
            list.insert_front(1);
            assert_eq!(1, list.iter().count());
        }
    }

    mod test_lru_cache {
        use std::sync::Arc;

        use super::super::LRUCache;
        use rand::prelude::*;

        #[test]
        fn test() {
            let lru = LRUCache::new(5);
            lru.insert(&5_i32.to_le_bytes(), 5);
            println!("{:?}", lru.list);
            lru.insert(&0_i32.to_le_bytes(), 0);
            println!("{:?}", lru.list);
            lru.insert(&2_i32.to_le_bytes(), 2);
            println!("{:?}", lru.list);
            lru.insert(&6_i32.to_le_bytes(), 6);
            println!("{:?}", lru.list);
            lru.insert(&1_i32.to_le_bytes(), 1);
            println!("{:?}", lru.list);
            lru.insert(&6_i32.to_le_bytes(), 6);
            println!("{:?}", lru.list);
            lru.insert(&8_i32.to_le_bytes(), 8);
            println!("{:?}", lru.list);
            lru.insert(&8_i32.to_le_bytes(), 8);
            println!("{:?}", lru.list);
            lru.insert(&7_i32.to_le_bytes(), 7);
            println!("{:?}", lru.list);
            lru.insert(&4_i32.to_le_bytes(), 4);
            println!("{:?}", lru.list);
            lru.insert(&0_i32.to_le_bytes(), 0);
            println!("{:?}", lru.list);
            lru.insert(&0_i32.to_le_bytes(), 0);
            lru.insert(&2_i32.to_le_bytes(), 2);
            lru.insert(&1_i32.to_le_bytes(), 1);
            lru.insert(&0_i32.to_le_bytes(), 0);
            lru.insert(&2_i32.to_le_bytes(), 2);
        }

        #[test]
        fn test_insert() {
            let lru = LRUCache::new(10);
            for _i in 0..50000_usize {
                let mut rng = rand::thread_rng();
                let n: usize = rng.gen::<usize>() % 100;
                print!("{n},");
                lru.insert(&n.to_le_bytes(), n);
            }
        }

        #[test]
        fn test_multithread() {
            // NOTE(review): the element type was lost in extraction; `usize` mirrors
            // `test_insert` above - confirm against the repository.
            let lru: Arc<LRUCache<usize>> = Arc::new(LRUCache::new(10));
            let mut thread_arr = Vec::new();
            for _i in 0..10usize {
                let lru_arc = Arc::clone(&lru);
                let handler = std::thread::spawn(move || {
                    for _i in 0..10000_usize {
                        let mut rng = rand::thread_rng();
                        let n = rng.gen::<usize>() % 100;
                        // println!("thread: {:?} {n}", std::thread::current().id());
                        lru_arc.insert(&n.to_le_bytes(), n);
                    }
                });
                thread_arr.push(handler);
            }
            for thread in thread_arr {
                thread.join().unwrap();
            }
        }
    }
}

--------------------------------------------------------------------------------
/src/common/errors.rs:
--------------------------------------------------------------------------------
// Copyright 2022 labring. All rights reserved.
//
// SPDX-License-Identifier: Apache-2.0

use std::ffi::CStr;

use libc::strerror;

pub const CONNECTION_ERROR: i32 = 10001;
pub const INVALID_CLUSTER_STATUS: i32 = 10002;
pub const DATABASE_ERROR: i32 = 10003;
pub const SERIALIZATION_ERROR: i32 = 10004;

/// Returns a printable name for a status code.
///
/// The four codes defined in this module map to their constant names; every other
/// value is passed to the C library's `strerror`.
pub fn status_to_string(status: i32) -> String {
    match status {
        CONNECTION_ERROR => "CONNECTION_ERROR".to_string(),
        INVALID_CLUSTER_STATUS => "INVALID_CLUSTER_STATUS".to_string(),
        DATABASE_ERROR => "DATABASE_ERROR".to_string(),
        SERIALIZATION_ERROR => "SERIALIZATION_ERROR".to_string(),
        // Lossy conversion: a message that is not valid UTF-8 must not turn error
        // reporting itself into a panic.
        _ => unsafe { CStr::from_ptr(strerror(status)) }
            .to_string_lossy()
            .into_owned(),
    }
}

--------------------------------------------------------------------------------
/src/common/hash_ring.rs:
--------------------------------------------------------------------------------
// Copyright 2022 labring. All rights reserved.
2 | // 3 | // SPDX-License-Identifier: Apache-2.0 4 | 5 | use std::collections::HashMap; 6 | 7 | use conhash::{ConsistentHash, Node}; 8 | 9 | #[derive(Clone)] 10 | pub struct ServerNode { 11 | pub address: String, 12 | } 13 | 14 | impl Node for ServerNode { 15 | fn name(&self) -> String { 16 | self.address.clone() 17 | } 18 | } 19 | 20 | pub struct HashRing { 21 | pub ring: ConsistentHash, 22 | pub servers: HashMap, 23 | } 24 | 25 | impl Clone for HashRing { 26 | fn clone(&self) -> Self { 27 | let servers = self.servers.clone(); 28 | let mut ring = ConsistentHash::::new(); 29 | for (server, weight) in servers.iter() { 30 | ring.add( 31 | &ServerNode { 32 | address: server.clone(), 33 | }, 34 | *weight, 35 | ); 36 | } 37 | HashRing { ring, servers } 38 | } 39 | } 40 | 41 | impl HashRing { 42 | pub fn new(servers: Vec<(String, usize)>) -> Self { 43 | let mut ring = ConsistentHash::::new(); 44 | let mut servers_map = HashMap::new(); 45 | for (server, weight) in servers { 46 | ring.add( 47 | &ServerNode { 48 | address: server.clone(), 49 | }, 50 | weight, 51 | ); 52 | servers_map.insert(server, weight); 53 | } 54 | HashRing { 55 | ring, 56 | servers: servers_map, 57 | } 58 | } 59 | 60 | pub fn get(&self, key: &str) -> Option<&ServerNode> { 61 | self.ring.get_str(key) 62 | } 63 | 64 | pub fn add(&mut self, server: ServerNode, weight: usize) { 65 | self.ring.add(&server, weight); 66 | self.servers.insert(server.address, weight); 67 | } 68 | 69 | pub fn remove(&mut self, server: &ServerNode) { 70 | self.ring.remove(server); 71 | self.servers.remove(&server.address); 72 | } 73 | 74 | pub fn contains(&self, server: &str) -> bool { 75 | self.servers.contains_key(server) 76 | } 77 | 78 | pub fn get_server_lists(&self) -> Vec { 79 | self.servers.keys().cloned().collect() 80 | } 81 | } 82 | -------------------------------------------------------------------------------- /src/common/info_syncer.rs: -------------------------------------------------------------------------------- 
1 | use std::{ 2 | sync::{ 3 | atomic::{AtomicI32, Ordering}, 4 | Arc, 5 | }, 6 | time::Duration, 7 | }; 8 | 9 | use async_trait::async_trait; 10 | use log::{debug, error, info}; 11 | use spin::RwLock; 12 | use tokio::time::sleep; 13 | 14 | use crate::common::errors::{self, status_to_string, CONNECTION_ERROR}; 15 | 16 | use super::{hash_ring::HashRing, sender::Sender, serialization::ClusterStatus}; 17 | 18 | #[async_trait] 19 | pub trait InfoSyncer { 20 | async fn get_cluster_status(&self) -> Result; 21 | fn cluster_status(&self) -> &AtomicI32; 22 | } 23 | 24 | async fn sync_cluster_infos(client: Arc) { 25 | loop { 26 | { 27 | let result = client.get_cluster_status().await; 28 | match result { 29 | Ok(status) => { 30 | let status = status.into(); 31 | if client.cluster_status().load(Ordering::Relaxed) != status { 32 | client.cluster_status().store(status, Ordering::Relaxed); 33 | } 34 | } 35 | Err(e) => { 36 | info!("sync server infos failed, error = {}", e); 37 | } 38 | } 39 | } 40 | sleep(Duration::from_secs(1)).await; 41 | } 42 | } 43 | 44 | #[async_trait] 45 | pub trait ClientStatusMonitor: InfoSyncer { 46 | fn hash_ring(&self) -> &Arc>>; 47 | fn new_hash_ring(&self) -> &Arc>>; 48 | fn sender(&self) -> &Sender; 49 | fn manager_address(&self) -> &Arc>; 50 | 51 | fn get_address(&self, path: &str) -> String { 52 | self.hash_ring() 53 | .read() 54 | .as_ref() 55 | .unwrap() 56 | .get(path) 57 | .unwrap() 58 | .address 59 | .clone() 60 | } 61 | 62 | fn get_new_address(&self, path: &str) -> String { 63 | match self.new_hash_ring().read().as_ref() { 64 | Some(hash_ring) => hash_ring.get(path).unwrap().address.clone(), 65 | None => self.get_address(path), 66 | } 67 | } 68 | 69 | async fn get_hash_ring_info(&self) -> Result, i32> { 70 | self.sender() 71 | .get_hash_ring_info(&self.manager_address().lock().await) 72 | .await 73 | } 74 | async fn get_new_hash_ring_info(&self) -> Result, i32> { 75 | self.sender() 76 | 
.get_new_hash_ring_info(&self.manager_address().lock().await) 77 | .await 78 | } 79 | 80 | fn get_connection_address(&self, path: &str) -> String { 81 | let cluster_status = self.cluster_status().load(Ordering::Acquire); 82 | 83 | // check the ClusterStatus is not Idle 84 | // for efficiency, we use i32 operation to check the ClusterStatus 85 | if cluster_status == 301 { 86 | return self.get_address(path); 87 | } 88 | 89 | match cluster_status.try_into().unwrap() { 90 | ClusterStatus::Initializing => panic!("cluster status is not ready"), 91 | ClusterStatus::Idle => todo!(), 92 | ClusterStatus::NodesStarting => self.get_address(path), 93 | ClusterStatus::SyncNewHashRing => self.get_address(path), 94 | ClusterStatus::PreTransfer => self.get_address(path), 95 | ClusterStatus::Transferring => self.get_address(path), 96 | ClusterStatus::PreFinish => self.get_new_address(path), 97 | ClusterStatus::Finishing => self.get_address(path), 98 | ClusterStatus::StatusError => todo!(), 99 | ClusterStatus::Unkown => todo!(), 100 | } 101 | } 102 | 103 | async fn add_connection(&self, server_address: &str) -> Result<(), i32>; 104 | 105 | async fn connect_to_manager(&self, manager_address: &str) -> Result<(), i32> { 106 | self.manager_address() 107 | .lock() 108 | .await 109 | .push_str(manager_address); 110 | self.add_connection(manager_address).await.map_err(|e| { 111 | error!("add connection failed: {:?}", e); 112 | CONNECTION_ERROR 113 | }) 114 | } 115 | 116 | async fn add_new_servers(&self, new_servers_info: Vec<(String, usize)>) -> Result<(), i32> { 117 | self.sender() 118 | .add_new_servers(&self.manager_address().lock().await, new_servers_info) 119 | .await 120 | } 121 | 122 | async fn connect_servers(&self) -> Result<(), i32> { 123 | debug!("init"); 124 | 125 | let result = async { 126 | loop { 127 | match self 128 | .cluster_status() 129 | .load(Ordering::Acquire) 130 | .try_into() 131 | .unwrap() 132 | { 133 | ClusterStatus::Idle => { 134 | return 
self.get_hash_ring_info().await; 135 | } 136 | ClusterStatus::Initializing => { 137 | info!("cluster is initalling, wait for a while"); 138 | tokio::time::sleep(Duration::from_secs(1)).await; 139 | } 140 | ClusterStatus::PreFinish => { 141 | info!("cluster is initalling, wait for a while"); 142 | tokio::time::sleep(Duration::from_secs(1)).await; 143 | } 144 | s => { 145 | error!("invalid cluster status: {}", s); 146 | return Err(errors::INVALID_CLUSTER_STATUS); 147 | } 148 | } 149 | } 150 | } 151 | .await; 152 | 153 | match result { 154 | Ok(all_servers_address) => { 155 | for server_address in &all_servers_address { 156 | self.add_connection(&server_address.0).await?; 157 | } 158 | self.hash_ring() 159 | .write() 160 | .replace(HashRing::new(all_servers_address.clone())); 161 | Ok(()) 162 | } 163 | Err(e) => Err(e), 164 | } 165 | } 166 | } 167 | 168 | async fn client_watch_status( 169 | client: Arc, 170 | ) { 171 | loop { 172 | match client 173 | .cluster_status() 174 | .load(Ordering::Relaxed) 175 | .try_into() 176 | .unwrap() 177 | { 178 | ClusterStatus::SyncNewHashRing => { 179 | // here I write a long code block to deal with the process from SyncNewHashRing to new Idle status. 180 | // this is because we don't make persistent flags for status, so we could not check a status is finished or not. 181 | // so we have to check the status in a long code block, and we could not use a loop to check the status. 182 | // in the future, we will make persistent flags for status, and we separate the code block for each status. 183 | info!("Transfer: start to sync new hash ring"); 184 | let all_servers_address = match client.get_new_hash_ring_info().await { 185 | Ok(value) => value, 186 | Err(e) => { 187 | panic!("Get Hash Ring Info Failed. 
Error = {}", e); 188 | } 189 | }; 190 | info!("Transfer: get new hash ring info"); 191 | 192 | for value in all_servers_address.iter() { 193 | if client 194 | .hash_ring() 195 | .read() 196 | .as_ref() 197 | .unwrap() 198 | .contains(&value.0) 199 | { 200 | continue; 201 | } 202 | if let Err(e) = client.add_connection(&value.0).await { 203 | // TODO: we should rollback the transfer process 204 | panic!("Add Connection Failed. Error = {}", e); 205 | } 206 | } 207 | client 208 | .new_hash_ring() 209 | .write() 210 | .replace(HashRing::new(all_servers_address)); 211 | info!("Transfer: sync new hash ring finished"); 212 | 213 | // wait for all servers to be PreTransfer 214 | 215 | while >::try_into( 216 | client.cluster_status().load(Ordering::Relaxed), 217 | ) 218 | .unwrap() 219 | == ClusterStatus::SyncNewHashRing 220 | { 221 | sleep(Duration::from_secs(1)).await; 222 | } 223 | assert!( 224 | >::try_into( 225 | client.cluster_status().load(Ordering::Relaxed) 226 | ) 227 | .unwrap() 228 | == ClusterStatus::PreTransfer 229 | ); 230 | 231 | while >::try_into( 232 | client.cluster_status().load(Ordering::Relaxed), 233 | ) 234 | .unwrap() 235 | == ClusterStatus::PreTransfer 236 | { 237 | sleep(Duration::from_secs(1)).await; 238 | } 239 | assert!( 240 | >::try_into( 241 | client.cluster_status().load(Ordering::Relaxed) 242 | ) 243 | .unwrap() 244 | == ClusterStatus::Transferring 245 | ); 246 | 247 | while >::try_into( 248 | client.cluster_status().load(Ordering::Relaxed), 249 | ) 250 | .unwrap() 251 | == ClusterStatus::Transferring 252 | { 253 | sleep(Duration::from_secs(1)).await; 254 | } 255 | assert!( 256 | >::try_into( 257 | client.cluster_status().load(Ordering::Relaxed) 258 | ) 259 | .unwrap() 260 | == ClusterStatus::PreFinish 261 | ); 262 | 263 | let _old_hash_ring = client 264 | .hash_ring() 265 | .write() 266 | .replace(client.new_hash_ring().read().as_ref().unwrap().clone()); 267 | 268 | while >::try_into( 269 | client.cluster_status().load(Ordering::Relaxed), 
270 | ) 271 | .unwrap() 272 | == ClusterStatus::PreFinish 273 | { 274 | sleep(Duration::from_secs(1)).await; 275 | } 276 | assert!( 277 | >::try_into( 278 | client.cluster_status().load(Ordering::Relaxed) 279 | ) 280 | .unwrap() 281 | == ClusterStatus::Finishing 282 | ); 283 | 284 | let _ = client.new_hash_ring().write().take(); 285 | // here we should close connections to old servers, but now we just wait for remote servers to close connections and do nothing 286 | 287 | while >::try_into( 288 | client.cluster_status().load(Ordering::Relaxed), 289 | ) 290 | .unwrap() 291 | == ClusterStatus::Finishing 292 | { 293 | sleep(Duration::from_secs(1)).await; 294 | } 295 | assert!( 296 | >::try_into( 297 | client.cluster_status().load(Ordering::Relaxed) 298 | ) 299 | .unwrap() 300 | == ClusterStatus::Idle 301 | ); 302 | 303 | info!("transferring data finished"); 304 | } 305 | ClusterStatus::Idle => { 306 | sleep(Duration::from_secs(1)).await; 307 | } 308 | ClusterStatus::Initializing => { 309 | sleep(Duration::from_secs(1)).await; 310 | } 311 | ClusterStatus::NodesStarting => { 312 | sleep(Duration::from_secs(1)).await; 313 | } 314 | e => { 315 | panic!("cluster status error: {:?}", e as u32); 316 | } 317 | } 318 | } 319 | } 320 | 321 | pub async fn init_network_connections< 322 | I: ClientStatusMonitor + std::marker::Sync + std::marker::Send + 'static, 323 | >( 324 | manager_address: String, 325 | client: Arc, 326 | ) { 327 | if let Err(e) = client.connect_to_manager(&manager_address).await { 328 | panic!("connect to manager failed, err = {}", status_to_string(e)); 329 | } 330 | tokio::spawn(sync_cluster_infos(client.clone())); 331 | tokio::spawn(client_watch_status(client)); 332 | } 333 | -------------------------------------------------------------------------------- /src/common/mod.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2022 labring. All rights reserved. 
2 | // 3 | // SPDX-License-Identifier: Apache-2.0 4 | 5 | pub mod byte; 6 | pub mod cache; 7 | pub mod errors; 8 | pub mod hash_ring; 9 | pub mod info_syncer; 10 | pub mod sender; 11 | pub mod serialization; 12 | pub mod util; 13 | -------------------------------------------------------------------------------- /src/common/util.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2022 labring. All rights reserved. 2 | // 3 | // SPDX-License-Identifier: Apache-2.0 4 | 5 | use std::time::SystemTime; 6 | 7 | use fuser::{FileAttr, FileType}; 8 | use log::error; 9 | 10 | pub fn get_full_path(parent: &str, name: &str) -> String { 11 | if parent == "/" { 12 | return format!("/{}", name); 13 | } 14 | let path = format!("{}/{}", parent, name); 15 | path 16 | } 17 | 18 | // path_split: the path should not be empty, and it does not end with a slash unless it is the root directory. 19 | pub fn path_split(path: &str) -> Result<(String, String), i32> { 20 | if path.is_empty() { 21 | error!("path is empty"); 22 | return Err(libc::EINVAL); 23 | } 24 | if path == "/" { 25 | error!("path is root"); 26 | return Err(libc::EINVAL); 27 | } 28 | if path.ends_with('/') { 29 | error!("path ends with /"); 30 | return Err(libc::EINVAL); 31 | } 32 | let index = match path.rfind('/') { 33 | Some(value) => value, 34 | None => { 35 | error!("path does not contain /"); 36 | return Err(libc::EINVAL); 37 | } 38 | }; 39 | match index { 40 | 0 => Ok(("/".into(), path[1..].into())), 41 | _ => Ok((path[..index].into(), path[(index + 1)..].into())), 42 | } 43 | } 44 | 45 | pub fn empty_file() -> FileAttr { 46 | FileAttr { 47 | ino: 0, 48 | size: 0, 49 | blocks: 0, 50 | atime: SystemTime::now(), 51 | mtime: SystemTime::now(), 52 | ctime: SystemTime::now(), 53 | crtime: SystemTime::now(), 54 | kind: FileType::RegularFile, 55 | perm: 0, 56 | nlink: 0, 57 | uid: 0, 58 | gid: 0, 59 | rdev: 0, 60 | flags: 0, 61 | blksize: 0, 62 | } 63 | } 64 | 65 | pub fn 
empty_dir() -> FileAttr { 66 | FileAttr { 67 | ino: 0, 68 | size: 4096, 69 | blocks: 0, 70 | atime: SystemTime::now(), 71 | mtime: SystemTime::now(), 72 | ctime: SystemTime::now(), 73 | crtime: SystemTime::now(), 74 | kind: FileType::Directory, 75 | perm: 0, 76 | nlink: 0, 77 | uid: 0, 78 | gid: 0, 79 | rdev: 0, 80 | flags: 0, 81 | blksize: 0, 82 | } 83 | } 84 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2022 labring. All rights reserved. 2 | // 3 | // SPDX-License-Identifier: Apache-2.0 4 | 5 | pub mod client; 6 | pub mod common; 7 | pub mod manager; 8 | pub mod rpc; 9 | pub mod server; 10 | -------------------------------------------------------------------------------- /src/manager/core.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2022 labring. All rights reserved. 2 | // 3 | // SPDX-License-Identifier: Apache-2.0 4 | 5 | use std::sync::atomic::AtomicBool; 6 | use std::sync::{Arc, Mutex, RwLock}; 7 | 8 | use ahash::{HashMap, HashMapExt}; 9 | use anyhow::Error; 10 | use dashmap::DashMap; 11 | use log::{debug, info}; 12 | 13 | use crate::common::hash_ring::{HashRing, ServerNode}; 14 | use crate::common::serialization::{ClusterStatus, ServerStatus, ServerType}; 15 | pub struct Manager { 16 | pub hashring: Arc>>, 17 | pub new_hashring: Arc>>, 18 | pub servers: Arc>>, 19 | pub cluster_status: Arc>, 20 | pub closed: AtomicBool, 21 | _clients: DashMap, 22 | } 23 | 24 | pub struct Server { 25 | pub status: ServerStatus, 26 | r#_type: ServerType, 27 | _replicas: usize, 28 | } 29 | 30 | impl Manager { 31 | pub fn new(servers: Vec<(String, usize)>) -> Self { 32 | let hashring = Arc::new(RwLock::new(Some(HashRing::new(servers.clone())))); 33 | let manager = Manager { 34 | hashring, 35 | new_hashring: Arc::new(RwLock::new(None)), 36 | servers: 
Arc::new(Mutex::new(HashMap::new())), 37 | cluster_status: Arc::new(Mutex::new(ClusterStatus::Initializing)), 38 | closed: AtomicBool::new(false), 39 | _clients: DashMap::new(), 40 | }; 41 | 42 | for (server, weight) in servers { 43 | manager.servers.lock().unwrap().insert( 44 | server, 45 | Server { 46 | status: ServerStatus::Initializing, 47 | r#_type: ServerType::Running, 48 | _replicas: weight, 49 | }, 50 | ); 51 | } 52 | 53 | manager 54 | } 55 | 56 | pub fn get_cluster_status(&self) -> ClusterStatus { 57 | let status = *self.cluster_status.lock().unwrap(); 58 | debug!("get_cluster_status: {:?}", status); 59 | status 60 | } 61 | 62 | pub fn get_hash_ring_info(&self) -> Vec<(String, usize)> { 63 | self.hashring 64 | .read() 65 | .unwrap() 66 | .as_ref() 67 | .unwrap() 68 | .servers 69 | .iter() 70 | .map(|(k, v)| (k.clone(), *v)) 71 | .collect() 72 | } 73 | 74 | pub fn get_new_hash_ring_info(&self) -> Result, Error> { 75 | if let Some(new_hashring) = self.new_hashring.read().unwrap().as_ref() { 76 | Ok(new_hashring 77 | .servers 78 | .iter() 79 | .map(|(k, v)| (k.clone(), *v)) 80 | .collect()) 81 | } else { 82 | Err(anyhow::anyhow!("new hashring is none")) 83 | } 84 | } 85 | 86 | pub fn add_nodes(&self, nodes: Vec<(String, usize)>) -> Option { 87 | info!("add_nodes: {:?}", nodes); 88 | let mut cluster_status = self.cluster_status.lock().unwrap(); 89 | if *cluster_status != ClusterStatus::Idle { 90 | return Some(anyhow::anyhow!("cluster is not idle")); 91 | } 92 | let mut new_hashring = self.hashring.read().unwrap().clone().unwrap(); 93 | let mut servers = self.servers.lock().unwrap(); 94 | for (node, weight) in nodes { 95 | new_hashring.add( 96 | ServerNode { 97 | address: node.clone(), 98 | }, 99 | weight, 100 | ); 101 | servers.insert( 102 | node, 103 | Server { 104 | status: ServerStatus::Initializing, 105 | r#_type: ServerType::Running, 106 | _replicas: weight, 107 | }, 108 | ); 109 | } 110 | 111 | self.new_hashring.write().unwrap().replace(new_hashring); 
112 | *cluster_status = ClusterStatus::NodesStarting; 113 | 114 | None 115 | } 116 | 117 | pub fn delete_nodes(&self, nodes: Vec) -> Option { 118 | let mut cluster_status = self.cluster_status.lock().unwrap(); 119 | if *cluster_status != ClusterStatus::Idle { 120 | return Some(anyhow::anyhow!("cluster is not idle")); 121 | } 122 | let mut new_hashring = self.hashring.read().unwrap().clone().unwrap(); 123 | new_hashring.remove(&ServerNode { 124 | address: nodes[0].clone(), 125 | }); 126 | 127 | self.new_hashring.write().unwrap().replace(new_hashring); 128 | 129 | *cluster_status = ClusterStatus::NodesStarting; 130 | None 131 | } 132 | 133 | pub fn set_server_status(&self, server_id: String, status: ServerStatus) -> Option { 134 | // debug : logs all server_name in self.servers 135 | debug!( 136 | "set_server_status: {:?}", 137 | self.servers 138 | .lock() 139 | .unwrap() 140 | .iter() 141 | .map(|kv| kv.0.clone()) 142 | .collect::>() 143 | ); 144 | 145 | info!("set server status: {} {:?}", server_id, status); 146 | 147 | match status { 148 | ServerStatus::Initializing => { 149 | panic!("cannot set server status to init"); 150 | } 151 | ServerStatus::PreTransfer => { 152 | let cluster_status = self.cluster_status.lock().unwrap(); 153 | if *cluster_status != ClusterStatus::SyncNewHashRing { 154 | return Some(anyhow::anyhow!("cannot pretransfer for server: {}, cluster is not SyncNewHashRing: status: {:?}" , server_id, *cluster_status)); 155 | } 156 | let mut servers = self.servers.lock().unwrap(); 157 | if servers.get(&server_id).unwrap().status != ServerStatus::Finished { 158 | return Some(anyhow::anyhow!( 159 | "cannot pretransfer for server: {}, server is not finish: status: {:?}", 160 | server_id, 161 | servers.get(&server_id).unwrap().status 162 | )); 163 | } 164 | servers.get_mut(&server_id).unwrap().status = ServerStatus::PreTransfer; 165 | None 166 | } 167 | ServerStatus::Transferring => { 168 | let cluster_status = self.cluster_status.lock().unwrap(); 169 | if 
*cluster_status != ClusterStatus::PreTransfer { 170 | return Some(anyhow::anyhow!( 171 | "cannot transfer for server: {}, cluster is not PreTransfer: status: {:?}", 172 | server_id, 173 | *cluster_status 174 | )); 175 | } 176 | let mut servers = self.servers.lock().unwrap(); 177 | if servers.get(&server_id).unwrap().status != ServerStatus::PreTransfer { 178 | return Some(anyhow::anyhow!( 179 | "cannot transfer for server: {}, server is not finish: status: {:?}", 180 | server_id, 181 | servers.get(&server_id).unwrap().status 182 | )); 183 | } 184 | servers.get_mut(&server_id).unwrap().status = ServerStatus::Transferring; 185 | None 186 | } 187 | ServerStatus::PreFinish => { 188 | let cluster_status = self.cluster_status.lock().unwrap(); 189 | if *cluster_status != ClusterStatus::Transferring { 190 | return Some(anyhow::anyhow!("cannot prefinish for server: {}, cluster is not Transferring: status: {:?}" , server_id, *cluster_status)); 191 | } 192 | let mut servers = self.servers.lock().unwrap(); 193 | if servers.get(&server_id).unwrap().status != ServerStatus::Transferring { 194 | return Some(anyhow::anyhow!( 195 | "cannot prefinish for server: {}, server is not transferring: status: {:?}", 196 | server_id, 197 | servers.get(&server_id).unwrap().status 198 | )); 199 | } 200 | servers.get_mut(&server_id).unwrap().status = ServerStatus::PreFinish; 201 | None 202 | } 203 | ServerStatus::Finishing => { 204 | let cluster_status = self.cluster_status.lock().unwrap(); 205 | if *cluster_status != ClusterStatus::PreFinish { 206 | return Some(anyhow::anyhow!("cannot prefinish for server: {}, cluster is not Transferring: status: {:?}" , server_id, *cluster_status)); 207 | } 208 | let mut servers: std::sync::MutexGuard< 209 | std::collections::HashMap, 210 | > = self.servers.lock().unwrap(); 211 | if servers.get(&server_id).unwrap().status != ServerStatus::PreFinish { 212 | return Some(anyhow::anyhow!( 213 | "cannot finish for server: {}, server is not prefinish: status: {:?}", 
214 | server_id, 215 | servers.get(&server_id).unwrap().status 216 | )); 217 | } 218 | servers.get_mut(&server_id).unwrap().status = ServerStatus::Finishing; 219 | None 220 | } 221 | ServerStatus::Finished => { 222 | let cluster_status = self.cluster_status.lock().unwrap(); 223 | match *cluster_status { 224 | ClusterStatus::Finishing => { 225 | let mut servers: std::sync::MutexGuard> = self.servers.lock().unwrap(); 226 | if servers.get(&server_id).unwrap().status != ServerStatus::Finishing { 227 | return Some(anyhow::anyhow!("cannot finish for server: {}, server is not Finishing: status: {:?}", server_id, servers.get(&server_id).unwrap().status)); 228 | } 229 | servers.get_mut(&server_id).unwrap().status = ServerStatus::Finished; 230 | None 231 | } 232 | ClusterStatus::Initializing => { 233 | let mut servers = self.servers.lock().unwrap(); 234 | if servers.get(&server_id).unwrap().status != ServerStatus::Initializing { 235 | return Some(anyhow::anyhow!( 236 | "cannot finish for server: {}, server is not Initializing: status: {:?}", 237 | server_id, 238 | servers.get(&server_id).unwrap().status 239 | )); 240 | } 241 | servers.get_mut(&server_id).unwrap().status = ServerStatus::Finished; 242 | None 243 | } 244 | ClusterStatus::NodesStarting => { 245 | let mut servers = self.servers.lock().unwrap(); 246 | if !self 247 | .new_hashring 248 | .read() 249 | .unwrap() 250 | .as_ref() 251 | .unwrap() 252 | .contains(&server_id) 253 | { 254 | return Some(anyhow::anyhow!( 255 | "cannot finish for server: {}, server is not in new_hashring", 256 | server_id 257 | )); 258 | } 259 | if servers.get(&server_id).unwrap().status != ServerStatus::Initializing { 260 | return Some(anyhow::anyhow!( 261 | "cannot finish for server: {}, server is not Initializing: status: {:?}", 262 | server_id, 263 | servers.get(&server_id).unwrap().status 264 | )); 265 | } 266 | servers.get_mut(&server_id).unwrap().status = ServerStatus::Finished; 267 | None 268 | } 269 | _ => { 270 | 
Some(anyhow::anyhow!( 271 | "cannot finish for server: {}, cluster is not Finishing, Init or AddNodes: status: {:?}", 272 | server_id, 273 | *cluster_status 274 | )) 275 | } 276 | } 277 | } 278 | } 279 | } 280 | } 281 | -------------------------------------------------------------------------------- /src/manager/manager_service.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2022 labring. All rights reserved. 2 | // 3 | // SPDX-License-Identifier: Apache-2.0 4 | 5 | use std::{sync::Arc, time::Duration}; 6 | 7 | use crate::{ 8 | common::serialization::{ 9 | AddNodesSendMetaData, ClusterStatus, DeleteNodesSendMetaData, GetClusterStatusRecvMetaData, 10 | GetHashRingInfoRecvMetaData, ManagerOperationType, ServerStatus, 11 | }, 12 | rpc::server::Handler, 13 | }; 14 | 15 | use super::core::Manager; 16 | 17 | use async_trait::async_trait; 18 | use log::{debug, error, info}; 19 | use serde::{Deserialize, Serialize}; 20 | 21 | pub struct ManagerService { 22 | pub manager: Arc, 23 | } 24 | 25 | #[derive(Serialize, Deserialize)] 26 | pub struct SendHeartRequest { 27 | pub address: String, 28 | pub flags: u32, 29 | pub lifetime: String, 30 | } 31 | 32 | #[derive(Serialize, Deserialize)] 33 | pub struct MetadataRequest { 34 | pub flags: u32, 35 | } 36 | 37 | #[derive(Default, Serialize, Deserialize)] 38 | pub struct MetadataResponse { 39 | pub instances: Vec, 40 | } 41 | 42 | pub async fn update_server_status(manager: Arc) { 43 | loop { 44 | tokio::time::sleep(Duration::from_secs(1)).await; 45 | if manager.closed.load(std::sync::atomic::Ordering::Relaxed) { 46 | break; 47 | } 48 | let status = *manager.cluster_status.lock().unwrap(); 49 | debug!("current cluster status is {:?}", status); 50 | match status { 51 | ClusterStatus::Idle => {} 52 | ClusterStatus::NodesStarting => { 53 | // if all servers is ready, change the cluster status to SyncNewHashRing 54 | let flag = manager 55 | .servers 56 | .lock() 57 | .unwrap() 58 | 
.iter() 59 | .all(|kv| kv.1.status == ServerStatus::Finished); 60 | if flag { 61 | tokio::time::sleep(Duration::from_secs(1)).await; 62 | *manager.cluster_status.lock().unwrap() = ClusterStatus::SyncNewHashRing; 63 | info!("all servers is ready, change the cluster status to SyncNewHashRing"); 64 | }; 65 | } 66 | ClusterStatus::SyncNewHashRing => { 67 | // if all servers is ready, change the cluster status to PreTransfer 68 | let flag = manager 69 | .servers 70 | .lock() 71 | .unwrap() 72 | .iter() 73 | .all(|kv| kv.1.status == ServerStatus::PreTransfer); 74 | if flag { 75 | tokio::time::sleep(Duration::from_secs(1)).await; 76 | *manager.cluster_status.lock().unwrap() = ClusterStatus::PreTransfer; 77 | info!("all servers is ready, change the cluster status to PreTransfer"); 78 | } 79 | } 80 | ClusterStatus::PreTransfer => { 81 | // if all servers is ready, change the cluster status to Transferring 82 | let flag = manager 83 | .servers 84 | .lock() 85 | .unwrap() 86 | .iter() 87 | .all(|kv| kv.1.status == ServerStatus::Transferring); 88 | if flag { 89 | tokio::time::sleep(Duration::from_secs(1)).await; 90 | *manager.cluster_status.lock().unwrap() = ClusterStatus::Transferring; 91 | info!("all servers is ready, change the cluster status to Transferring"); 92 | } 93 | } 94 | ClusterStatus::Transferring => { 95 | // if all servers is ready, change the cluster status to PreFinish 96 | let flag = manager 97 | .servers 98 | .lock() 99 | .unwrap() 100 | .iter() 101 | .all(|kv| kv.1.status == ServerStatus::PreFinish); 102 | if flag { 103 | tokio::time::sleep(Duration::from_secs(1)).await; 104 | *manager.cluster_status.lock().unwrap() = ClusterStatus::PreFinish; 105 | info!("all servers is ready, change the cluster status to PreFinish"); 106 | } 107 | } 108 | ClusterStatus::PreFinish => { 109 | // if all servers is ready, change the cluster status to Finishing 110 | let flag = manager 111 | .servers 112 | .lock() 113 | .unwrap() 114 | .iter() 115 | .all(|kv| kv.1.status == 
ServerStatus::Finishing); 116 | if flag { 117 | tokio::time::sleep(Duration::from_secs(1)).await; 118 | let _ = manager 119 | .hashring 120 | .write() 121 | .unwrap() 122 | .replace(manager.new_hashring.read().unwrap().clone().unwrap()); 123 | *manager.cluster_status.lock().unwrap() = ClusterStatus::Finishing; 124 | info!("all servers is ready, change the cluster status to Finishing"); 125 | } 126 | } 127 | ClusterStatus::Finishing => { 128 | // if all servers is ready, change the cluster status to Idle 129 | let flag = manager 130 | .servers 131 | .lock() 132 | .unwrap() 133 | .iter() 134 | .all(|kv| kv.1.status == ServerStatus::Finished); 135 | if flag { 136 | tokio::time::sleep(Duration::from_secs(1)).await; 137 | let mut new_hashring = manager.new_hashring.write().unwrap(); 138 | manager 139 | .servers 140 | .lock() 141 | .unwrap() 142 | .retain(|k, _| new_hashring.as_ref().unwrap().contains(k)); 143 | // move new_hashring to hashring 144 | let _ = new_hashring.take().unwrap(); 145 | *manager.cluster_status.lock().unwrap() = ClusterStatus::Idle; 146 | info!("all servers is ready, change the cluster status to Idle"); 147 | } 148 | } 149 | ClusterStatus::Initializing => { 150 | // if all servers is ready, change the cluster status to Idle 151 | let flag = manager 152 | .servers 153 | .lock() 154 | .unwrap() 155 | .iter() 156 | .all(|kv| kv.1.status == ServerStatus::Finished); 157 | if flag { 158 | tokio::time::sleep(Duration::from_secs(1)).await; 159 | *manager.cluster_status.lock().unwrap() = ClusterStatus::Idle; 160 | info!("all servers is ready, change the cluster status to Idle"); 161 | } 162 | } 163 | s => panic!("update server status failed, invalid cluster status: {}", s), 164 | } 165 | } 166 | } 167 | 168 | impl ManagerService { 169 | pub fn new(servers: Vec<(String, usize)>) -> Self { 170 | let manager = Arc::new(Manager::new(servers)); 171 | ManagerService { manager } 172 | } 173 | } 174 | 175 | #[async_trait] 176 | impl Handler for ManagerService { 177 
| async fn dispatch( 178 | &self, 179 | id: u32, 180 | operation_type: u32, 181 | _flags: u32, 182 | path: Vec, 183 | _data: Vec, 184 | metadata: Vec, 185 | ) -> anyhow::Result<(i32, u32, usize, usize, Vec, Vec)> { 186 | let r#type = ManagerOperationType::try_from(operation_type).unwrap(); 187 | match r#type { 188 | ManagerOperationType::GetClusterStatus => { 189 | let status = self.manager.get_cluster_status(); 190 | let response_meta_data = 191 | bincode::serialize(&GetClusterStatusRecvMetaData { status }).unwrap(); 192 | 193 | debug!("connection {} get cluster status: {:?}", id, status); 194 | 195 | Ok(( 196 | 0, 197 | 0, 198 | response_meta_data.len(), 199 | 0, 200 | response_meta_data, 201 | Vec::new(), 202 | )) 203 | } 204 | ManagerOperationType::GetHashRing => { 205 | let hash_ring_info = self.manager.get_hash_ring_info(); 206 | 207 | info!("connection {} get hash ring: {:?}", id, hash_ring_info); 208 | 209 | let response_meta_data = 210 | bincode::serialize(&GetHashRingInfoRecvMetaData { hash_ring_info }).unwrap(); 211 | Ok(( 212 | 0, 213 | 0, 214 | response_meta_data.len(), 215 | 0, 216 | response_meta_data, 217 | Vec::new(), 218 | )) 219 | } 220 | ManagerOperationType::GetNewHashRing => match self.manager.get_new_hash_ring_info() { 221 | Ok(hash_ring_info) => { 222 | info!("connection {} get new hash ring: {:?}", id, hash_ring_info); 223 | let response_meta_data = 224 | bincode::serialize(&GetHashRingInfoRecvMetaData { hash_ring_info }) 225 | .unwrap(); 226 | Ok(( 227 | 0, 228 | 0, 229 | response_meta_data.len(), 230 | 0, 231 | response_meta_data, 232 | Vec::new(), 233 | )) 234 | } 235 | Err(e) => { 236 | error!("get new hash ring error: {}", e); 237 | Ok((libc::ENOENT, 0, 0, 0, Vec::new(), Vec::new())) 238 | } 239 | }, 240 | ManagerOperationType::AddNodes => { 241 | let new_servers_info = bincode::deserialize::(&metadata) 242 | .unwrap() 243 | .new_servers_info; 244 | info!("connection {} add nodes: {:?}", id, new_servers_info); 245 | match 
self.manager.add_nodes(new_servers_info) { 246 | None => Ok((0, 0, 0, 0, Vec::new(), Vec::new())), 247 | Some(e) => { 248 | error!("add nodes error: {}", e); 249 | Ok((libc::EIO, 0, 0, 0, Vec::new(), Vec::new())) 250 | } 251 | } 252 | } 253 | ManagerOperationType::RemoveNodes => { 254 | let deleted_servers_info = 255 | bincode::deserialize::(&metadata) 256 | .unwrap() 257 | .deleted_servers_info; 258 | info!("connection {} remove nodes: {:?}", id, deleted_servers_info); 259 | match self.manager.delete_nodes(deleted_servers_info) { 260 | None => Ok((0, 0, 0, 0, Vec::new(), Vec::new())), 261 | Some(e) => { 262 | error!("remove nodes error: {}", e); 263 | Ok((libc::EIO, 0, 0, 0, Vec::new(), Vec::new())) 264 | } 265 | } 266 | } 267 | ManagerOperationType::UpdateServerStatus => { 268 | info!("connection {} update server status", id); 269 | match self.manager.set_server_status( 270 | String::from_utf8(path).unwrap(), 271 | bincode::deserialize(&metadata).unwrap(), 272 | ) { 273 | None => Ok((0, 0, 0, 0, Vec::new(), Vec::new())), 274 | Some(e) => { 275 | error!("update server status error: {}", e); 276 | Ok((libc::EIO, 0, 0, 0, Vec::new(), Vec::new())) 277 | } 278 | } 279 | } 280 | _ => todo!(), 281 | } 282 | } 283 | } 284 | -------------------------------------------------------------------------------- /src/manager/mod.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2022 labring. All rights reserved. 2 | // 3 | // SPDX-License-Identifier: Apache-2.0 4 | 5 | pub mod core; 6 | pub mod manager_service; 7 | -------------------------------------------------------------------------------- /src/rpc/client.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2022 labring. All rights reserved. 
2 | // 3 | // SPDX-License-Identifier: Apache-2.0 4 | 5 | use super::{ 6 | callback::CallbackPool, 7 | connection::ClientConnection, 8 | protocol::{CONNECTION_RETRY_TIMES, SEND_RETRY_TIMES}, 9 | }; 10 | use async_trait::async_trait; 11 | use dashmap::DashMap; 12 | use log::{error, info, warn}; 13 | use std::{marker::PhantomData, sync::Arc, time::Duration}; 14 | use tokio::io::{AsyncReadExt, AsyncWriteExt}; 15 | 16 | #[async_trait] 17 | pub trait StreamCreator< 18 | R: AsyncReadExt + Unpin + std::marker::Sync + std::marker::Send + 'static, 19 | W: AsyncWriteExt + Unpin + std::marker::Sync + std::marker::Send + 'static, 20 | > 21 | { 22 | async fn create_stream(server_address: &str) -> Result<(R, W), String>; 23 | } 24 | 25 | pub struct TcpStreamCreator; 26 | 27 | #[async_trait] 28 | impl StreamCreator 29 | for TcpStreamCreator 30 | { 31 | async fn create_stream( 32 | server_address: &str, 33 | ) -> Result< 34 | ( 35 | tokio::net::tcp::OwnedReadHalf, 36 | tokio::net::tcp::OwnedWriteHalf, 37 | ), 38 | String, 39 | > { 40 | let stream = match tokio::net::TcpStream::connect(server_address).await { 41 | Ok(stream) => stream, 42 | Err(e) => { 43 | return Err(format!("connect to {} error: {}", server_address, e)); 44 | } 45 | }; 46 | Ok(stream.into_split()) 47 | } 48 | } 49 | 50 | pub struct UnixStreamCreator; 51 | 52 | #[async_trait] 53 | impl StreamCreator 54 | for UnixStreamCreator 55 | { 56 | async fn create_stream( 57 | server_address: &str, 58 | ) -> Result< 59 | ( 60 | tokio::net::unix::OwnedReadHalf, 61 | tokio::net::unix::OwnedWriteHalf, 62 | ), 63 | String, 64 | > { 65 | let stream = match tokio::net::UnixStream::connect(server_address).await { 66 | Ok(stream) => stream, 67 | Err(e) => { 68 | return Err(format!("connect to {} error: {}", server_address, e)); 69 | } 70 | }; 71 | Ok(stream.into_split()) 72 | } 73 | } 74 | 75 | pub struct RpcClient< 76 | R: AsyncReadExt + Unpin + std::marker::Sync + std::marker::Send + 'static, 77 | W: AsyncWriteExt + Unpin + 
std::marker::Sync + std::marker::Send + 'static, 78 | S: StreamCreator, 79 | > { 80 | connections: DashMap>>, 81 | pool: Arc, 82 | stream_creator: PhantomData, 83 | } 84 | 85 | impl< 86 | R: AsyncReadExt + Unpin + std::marker::Sync + std::marker::Send + 'static, 87 | W: AsyncWriteExt + Unpin + std::marker::Sync + std::marker::Send + 'static, 88 | S: StreamCreator, 89 | > Default for RpcClient 90 | { 91 | fn default() -> Self { 92 | Self::new() 93 | } 94 | } 95 | 96 | impl< 97 | R: AsyncReadExt + Unpin + std::marker::Sync + std::marker::Send + 'static, 98 | W: AsyncWriteExt + Unpin + std::marker::Sync + std::marker::Send + 'static, 99 | S: StreamCreator, 100 | > RpcClient 101 | { 102 | pub fn new() -> Self { 103 | let mut pool = CallbackPool::new(); 104 | pool.init(); 105 | let pool = Arc::new(pool); 106 | Self { 107 | connections: DashMap::new(), 108 | pool, 109 | stream_creator: PhantomData, 110 | } 111 | } 112 | 113 | pub fn close(&self) { 114 | self.pool.free(); 115 | } 116 | 117 | pub async fn add_connection(&self, server_address: &str) -> Result<(), String> { 118 | for _ in 0..CONNECTION_RETRY_TIMES { 119 | match S::create_stream(server_address).await { 120 | Ok((read_stream, write_stream)) => { 121 | if self.connections.contains_key(server_address) { 122 | warn!("connection already exists: {}", server_address); 123 | return Ok(()); 124 | } 125 | let connection = Arc::new(ClientConnection::new(server_address, write_stream)); 126 | tokio::spawn(parse_response( 127 | read_stream, 128 | connection.clone(), 129 | self.pool.clone(), 130 | )); 131 | self.connections 132 | .insert(server_address.to_string(), connection); 133 | info!("add connection to {} success", server_address); 134 | return Ok(()); 135 | } 136 | Err(e) => { 137 | warn!( 138 | "connect to {} failed: {}, wait for a while", 139 | server_address, e 140 | ); 141 | tokio::time::sleep(Duration::from_secs(1)).await; 142 | } 143 | } 144 | } 145 | Err(format!( 146 | "connect to {} error: connection retry 
times exceed", 147 | server_address 148 | )) 149 | } 150 | 151 | async fn reconnect(&self, server_address: &str) -> Result<(), String> { 152 | info!("reconnect to {}", server_address); 153 | match self.connections.get(server_address) { 154 | Some(connection) => { 155 | if connection.is_connected() { 156 | info!("connection already exists: {}", server_address); 157 | return Ok(()); 158 | } 159 | match S::create_stream(server_address).await { 160 | Ok((read_stream, write_stream)) => { 161 | tokio::spawn(parse_response( 162 | read_stream, 163 | connection.clone(), 164 | self.pool.clone(), 165 | )); 166 | connection.value().reset_connection(write_stream).await; 167 | info!("reconnect to {} success", server_address); 168 | Ok(()) 169 | } 170 | Err(e) => { 171 | warn!( 172 | "reconnect to {} failed: {}, wait for a while", 173 | server_address, e 174 | ); 175 | tokio::time::sleep(Duration::from_secs(1)).await; 176 | Ok(()) 177 | } 178 | } 179 | } 180 | None => Err(format!("connection not exists: {}", server_address)), 181 | } 182 | } 183 | 184 | pub fn remove_connection(&self, server_address: &str) { 185 | self.connections.remove(server_address); 186 | } 187 | 188 | #[allow(clippy::too_many_arguments)] 189 | pub async fn call_remote( 190 | &self, 191 | server_address: &str, 192 | operation_type: u32, 193 | req_flags: u32, 194 | path: &str, 195 | send_meta_data: &[u8], 196 | send_data: &[u8], 197 | status: &mut i32, 198 | rsp_flags: &mut u32, 199 | recv_meta_data_length: &mut usize, 200 | recv_data_length: &mut usize, 201 | recv_meta_data: &mut [u8], 202 | recv_data: &mut [u8], 203 | timeout: Duration, 204 | ) -> Result<(), String> { 205 | for _ in 0..SEND_RETRY_TIMES { 206 | let connection = match self.connections.get(server_address) { 207 | Some(connection) => connection, 208 | None => { 209 | error!("connection not exists: {}", server_address); 210 | return Err(format!("connection not exists: {}", server_address)); 211 | } 212 | }; 213 | let (batch, id) = self 214 | 
.pool 215 | .register_callback(recv_meta_data, recv_data) 216 | .await?; // TODO: unregister callback when error 217 | 218 | if let Err(e) = connection 219 | .send_request( 220 | batch, 221 | id, 222 | operation_type, 223 | req_flags, 224 | path, 225 | send_meta_data, 226 | send_data, 227 | ) 228 | .await 229 | { 230 | error!("send request to {} failed: {}", server_address, e); 231 | connection.disconnect(); 232 | let _lock = connection.get_reconnecting_lock().await; 233 | warn!("connection to {} disconnected", server_address); 234 | match self.reconnect(server_address).await { 235 | Ok(_) => { 236 | continue; 237 | } 238 | Err(e) => { 239 | error!("reconnect to {} failed: {}", server_address, e); 240 | return Err(format!("reconnect to {} failed: {}", server_address, e)); 241 | } 242 | } 243 | } 244 | match self.pool.wait_for_callback(id, timeout).await { 245 | Ok((s, f, meta_data_length, data_length)) => { 246 | *status = s; 247 | *rsp_flags = f; 248 | *recv_meta_data_length = meta_data_length; 249 | *recv_data_length = data_length; 250 | return Ok(()); 251 | } 252 | Err(e) => { 253 | error!("wait for callback failed: {}, batch: {}, id {}, operation type: {}, path: {}", e, batch, id, operation_type, path); 254 | continue; 255 | } 256 | } 257 | } 258 | Err(format!( 259 | "send request to {} error: send retry times exceed", 260 | server_address 261 | )) 262 | } 263 | } 264 | 265 | // parse_response 266 | // try to get response from sequence of connections and write to callbacks 267 | pub async fn parse_response( 268 | mut read_stream: R, 269 | connection: Arc>, 270 | pool: Arc, 271 | ) { 272 | loop { 273 | if !connection.is_connected() { 274 | break; 275 | } 276 | let header = match connection.receive_response_header(&mut read_stream).await { 277 | Ok(header) => header, 278 | Err(e) => { 279 | if e == "early eof" || e == "Connection reset by peer (os error 104)" { 280 | warn!("{:?} disconnected: {}", connection.server_address, e); 281 | break; 282 | } 283 | panic!( 
284 | "parse_response header from {:?} error: {}", 285 | connection.server_address, e 286 | ); 287 | } 288 | }; 289 | let batch = header.batch; 290 | let id = header.id; 291 | let total_length = header.total_length; 292 | 293 | let result = { 294 | match pool.lock_if_not_timeout(batch, id) { 295 | Ok(_) => Ok(()), 296 | Err(e) => Err(e), 297 | } 298 | }; 299 | match result { 300 | Ok(_) => {} 301 | Err(e) => { 302 | error!( 303 | "parse_response lock timeout: {}, batch: {}, id: {}", 304 | e, batch, id 305 | ); 306 | let result = connection 307 | .clean_response(&mut read_stream, total_length) 308 | .await; 309 | match result { 310 | Ok(_) => {} 311 | Err(e) => { 312 | error!("parse_response clean_response error: {}", e); 313 | break; 314 | } 315 | } 316 | continue; 317 | } 318 | } 319 | 320 | if let Err(e) = connection 321 | .receive_response( 322 | &mut read_stream, 323 | pool.get_meta_data_ref(id, header.meta_data_length as usize), 324 | pool.get_data_ref(id, header.data_length as usize), 325 | ) 326 | .await 327 | { 328 | error!("Error receiving response: {}", e); 329 | break; 330 | }; 331 | if let Err(e) = pool 332 | .response( 333 | id, 334 | header.status, 335 | header.flags, 336 | header.meta_data_length as usize, 337 | header.data_length as usize, 338 | ) 339 | .await 340 | { 341 | error!("Error writing response back: {}", e); 342 | break; 343 | }; 344 | } 345 | } 346 | -------------------------------------------------------------------------------- /src/rpc/connection.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2022 labring. All rights reserved. 
2 | // 3 | // SPDX-License-Identifier: Apache-2.0 4 | 5 | use std::{io::IoSlice, marker::PhantomData, sync::atomic::AtomicU32}; 6 | 7 | use super::protocol::{ 8 | RequestHeader, ResponseHeader, MAX_DATA_LENGTH, MAX_FILENAME_LENGTH, MAX_METADATA_LENGTH, 9 | REQUEST_HEADER_SIZE, RESPONSE_HEADER_SIZE, 10 | }; 11 | use log::{error, info}; 12 | use tokio::{ 13 | io::{AsyncReadExt, AsyncWriteExt}, 14 | sync::Mutex, 15 | }; 16 | 17 | const CONNECTED: u32 = 0; 18 | const DISCONNECTED: u32 = 1; 19 | 20 | pub struct ClientConnection { 21 | pub server_address: String, 22 | write_stream: Mutex>, 23 | status: AtomicU32, 24 | reconneting_lock: Mutex<()>, 25 | 26 | phantom_data: PhantomData, 27 | 28 | // lock for send_request 29 | // we need this lock because we will send multiple requests in parallel 30 | // and each request will be sent several data packets due to the partation of data and header. 31 | // now we simply copy the data and header to a buffer and send it in one write call, 32 | // so we do not need to lock the stream(linux kernel will do it for us). 
33 | _send_lock: Mutex<()>, 34 | } 35 | 36 | impl ClientConnection { 37 | pub fn new(server_address: &str, write_stream: W) -> Self { 38 | Self { 39 | server_address: server_address.to_string(), 40 | write_stream: Mutex::new(Some(write_stream)), 41 | status: AtomicU32::new(CONNECTED), 42 | reconneting_lock: Mutex::new(()), 43 | phantom_data: PhantomData, 44 | _send_lock: Mutex::new(()), 45 | } 46 | } 47 | 48 | pub fn disconnect(&self) -> bool { 49 | info!("disconnecting from server {}", self.server_address); 50 | self.status 51 | .compare_exchange( 52 | CONNECTED, 53 | DISCONNECTED, 54 | std::sync::atomic::Ordering::SeqCst, 55 | std::sync::atomic::Ordering::SeqCst, 56 | ) 57 | .is_ok() 58 | } 59 | 60 | pub fn is_connected(&self) -> bool { 61 | self.status.load(std::sync::atomic::Ordering::Acquire) == CONNECTED 62 | } 63 | 64 | pub async fn get_reconnecting_lock(&self) -> tokio::sync::MutexGuard<'_, ()> { 65 | self.reconneting_lock.lock().await 66 | } 67 | 68 | pub async fn reset_connection(&self, write_stream: W) { 69 | self.write_stream.lock().await.replace(write_stream); 70 | self.status 71 | .store(CONNECTED, std::sync::atomic::Ordering::SeqCst); 72 | } 73 | 74 | // request 75 | // | batch | id | type | flags | total_length | file_path_length | meta_data_length | data_length | filename | meta_data | data | 76 | // | 4Byte | 4Byte | 4Byte | 4Byte | 4Byte | 4Byte | 4Byte | 4Byte | 1~4kB | 0~ | 0~ | 77 | #[allow(clippy::too_many_arguments)] 78 | pub async fn send_request( 79 | &self, 80 | batch: u32, 81 | id: u32, 82 | operation_type: u32, 83 | flags: u32, 84 | filename: &str, 85 | meta_data: &[u8], 86 | data: &[u8], 87 | ) -> Result<(), String> { 88 | if !self.is_connected() { 89 | return Err("connection is not connected".to_string()); 90 | } 91 | let filename_length = filename.len(); 92 | let meta_data_length = meta_data.len(); 93 | let data_length = data.len(); 94 | let total_length = filename_length + meta_data_length + data_length; 95 | let mut request = 
Vec::with_capacity(total_length + REQUEST_HEADER_SIZE); 96 | request.extend_from_slice(&batch.to_le_bytes()); 97 | request.extend_from_slice(&id.to_le_bytes()); 98 | request.extend_from_slice(&operation_type.to_le_bytes()); 99 | request.extend_from_slice(&flags.to_le_bytes()); 100 | request.extend_from_slice(&(total_length as u32).to_le_bytes()); 101 | request.extend_from_slice(&(filename_length as u32).to_le_bytes()); 102 | request.extend_from_slice(&(meta_data_length as u32).to_le_bytes()); 103 | request.extend_from_slice(&(data_length as u32).to_le_bytes()); 104 | request.extend_from_slice(filename.as_bytes()); 105 | let mut stream = self.write_stream.lock().await; 106 | let mut offset = 0; 107 | loop { 108 | if offset >= request.len() + meta_data_length + data_length { 109 | break; 110 | } 111 | if offset < request.len() { 112 | let bufs: &[_] = &[ 113 | IoSlice::new(&request[offset..]), 114 | IoSlice::new(meta_data), 115 | IoSlice::new(data), 116 | ]; 117 | offset += stream 118 | .as_mut() 119 | .unwrap() 120 | .write_vectored(bufs) 121 | .await 122 | .map_err(|e| e.to_string())?; 123 | } else if offset < request.len() + meta_data_length { 124 | let bufs: &[_] = &[ 125 | IoSlice::new(&meta_data[offset - request.len()..]), 126 | IoSlice::new(data), 127 | ]; 128 | offset += stream 129 | .as_mut() 130 | .unwrap() 131 | .write_vectored(bufs) 132 | .await 133 | .map_err(|e| e.to_string())?; 134 | } else { 135 | let bufs: &[_] = &[IoSlice::new( 136 | &data[offset - request.len() - meta_data_length..], 137 | )]; 138 | offset += stream 139 | .as_mut() 140 | .unwrap() 141 | .write_vectored(bufs) 142 | .await 143 | .map_err(|e| e.to_string())?; 144 | } 145 | } 146 | Ok(()) 147 | } 148 | 149 | pub async fn receive_response_header( 150 | &self, 151 | read_stream: &mut R, 152 | ) -> Result { 153 | let mut header = [0; RESPONSE_HEADER_SIZE]; 154 | self.receive(read_stream, &mut header).await?; 155 | let batch = u32::from_le_bytes(header[0..4].try_into().unwrap()); 156 | let 
id = u32::from_le_bytes(header[4..8].try_into().unwrap()); 157 | let status = i32::from_le_bytes(header[8..12].try_into().unwrap()); 158 | let flags = u32::from_le_bytes(header[12..16].try_into().unwrap()); 159 | let total_length = u32::from_le_bytes(header[16..20].try_into().unwrap()); 160 | let meta_data_length = u32::from_le_bytes(header[20..24].try_into().unwrap()); 161 | let data_length = u32::from_le_bytes(header[24..28].try_into().unwrap()); 162 | Ok(ResponseHeader { 163 | batch, 164 | id, 165 | status, 166 | flags, 167 | total_length, 168 | meta_data_length, 169 | data_length, 170 | }) 171 | } 172 | 173 | pub async fn receive_response( 174 | &self, 175 | read_stream: &mut R, 176 | meta_data: &mut [u8], 177 | data: &mut [u8], 178 | ) -> Result<(), String> { 179 | let meta_data_length = meta_data.len(); 180 | let data_length = data.len(); 181 | self.receive(read_stream, &mut meta_data[0..meta_data_length]) 182 | .await?; 183 | self.receive(read_stream, &mut data[0..data_length]).await?; 184 | Ok(()) 185 | } 186 | 187 | pub async fn receive(&self, read_stream: &mut R, data: &mut [u8]) -> Result<(), String> { 188 | match read_stream.read_exact(data).await { 189 | Ok(_) => Ok(()), 190 | Err(e) => Err(e.to_string()), 191 | } 192 | } 193 | 194 | pub async fn clean_response( 195 | &self, 196 | read_stream: &mut R, 197 | total_length: u32, 198 | ) -> Result<(), String> { 199 | let mut buffer = vec![0u8; total_length as usize]; 200 | self.receive(read_stream, &mut buffer).await?; 201 | Ok(()) 202 | } 203 | } 204 | 205 | pub struct ServerConnection { 206 | pub id: u32, 207 | name_id: String, 208 | write_stream: Mutex, 209 | 210 | phantom_data: PhantomData, 211 | } 212 | 213 | impl ServerConnection { 214 | pub fn new(write_stream: W, name_id: String, id: u32) -> Self { 215 | ServerConnection { 216 | id, 217 | name_id, 218 | write_stream: Mutex::new(write_stream), 219 | 220 | phantom_data: PhantomData, 221 | } 222 | } 223 | 224 | pub fn name_id(&self) -> String { 225 | 
self.name_id.clone() 226 | } 227 | 228 | pub async fn close(&self) -> Result<(), String> { 229 | let mut stream = self.write_stream.lock().await; 230 | stream.shutdown().await.map_err(|e| e.to_string())?; 231 | info!("close connection {}", self.name_id); 232 | Ok(()) 233 | } 234 | 235 | // response 236 | // | batch | id | status | flags | total_length | meta_data_lenght | data_length | meta_data | data | 237 | // | 4Byte | 4Byte | 4Byte | 4Byte | 4Byte | 4Byte | 4Byte | 0~ | 0~ | 238 | pub async fn send_response( 239 | &self, 240 | batch: u32, 241 | id: u32, 242 | status: i32, 243 | flags: u32, 244 | meta_data: &[u8], 245 | data: &[u8], 246 | ) -> Result<(), String> { 247 | let data_length = data.len(); 248 | let meta_data_length = meta_data.len(); 249 | let total_length = data_length + meta_data_length; 250 | let mut response = Vec::with_capacity(RESPONSE_HEADER_SIZE + total_length); 251 | response.extend_from_slice(&batch.to_le_bytes()); 252 | response.extend_from_slice(&id.to_le_bytes()); 253 | response.extend_from_slice(&status.to_le_bytes()); 254 | response.extend_from_slice(&flags.to_le_bytes()); 255 | response.extend_from_slice(&(total_length as u32).to_le_bytes()); 256 | response.extend_from_slice(&(meta_data_length as u32).to_le_bytes()); 257 | response.extend_from_slice(&(data_length as u32).to_le_bytes()); 258 | let mut stream = self.write_stream.lock().await; 259 | let mut offset = 0; 260 | loop { 261 | if offset >= response.len() + meta_data_length + data_length { 262 | break; 263 | } 264 | if offset < response.len() { 265 | let bufs: &[_] = &[ 266 | IoSlice::new(&response[offset..]), 267 | IoSlice::new(meta_data), 268 | IoSlice::new(data), 269 | ]; 270 | offset += stream 271 | .write_vectored(bufs) 272 | .await 273 | .map_err(|e| e.to_string())?; 274 | } else if offset < response.len() + meta_data_length { 275 | let bufs: &[_] = &[ 276 | IoSlice::new(&meta_data[offset - response.len()..]), 277 | IoSlice::new(data), 278 | ]; 279 | offset += stream 280 
| .write_vectored(bufs) 281 | .await 282 | .map_err(|e| e.to_string())?; 283 | } else { 284 | let bufs: &[_] = &[IoSlice::new( 285 | &data[offset - response.len() - meta_data_length..], 286 | )]; 287 | offset += stream 288 | .write_vectored(bufs) 289 | .await 290 | .map_err(|e| e.to_string())?; 291 | } 292 | } 293 | Ok(()) 294 | } 295 | 296 | pub async fn receive_request_header( 297 | &self, 298 | read_stream: &mut R, 299 | ) -> Result { 300 | let mut header = [0; REQUEST_HEADER_SIZE]; 301 | self.receive(read_stream, &mut header).await?; 302 | let batch = u32::from_le_bytes(header[0..4].try_into().unwrap()); 303 | let id = u32::from_le_bytes(header[4..8].try_into().unwrap()); 304 | let operation_type = u32::from_le_bytes(header[8..12].try_into().unwrap()); 305 | let flags: u32 = u32::from_le_bytes(header[12..16].try_into().unwrap()); 306 | let total_length = u32::from_le_bytes(header[16..20].try_into().unwrap()); 307 | let file_path_length = u32::from_le_bytes(header[20..24].try_into().unwrap()); 308 | let meta_data_length = u32::from_le_bytes(header[24..28].try_into().unwrap()); 309 | let data_length = u32::from_le_bytes(header[28..32].try_into().unwrap()); 310 | Ok(RequestHeader { 311 | batch, 312 | id, 313 | r#type: operation_type, 314 | flags, 315 | total_length, 316 | file_path_length, 317 | meta_data_length, 318 | data_length, 319 | }) 320 | } 321 | 322 | pub async fn receive_request( 323 | &self, 324 | read_stream: &mut R, 325 | header: &RequestHeader, 326 | ) -> Result<(Vec, Vec, Vec), String> { 327 | if header.file_path_length as usize > MAX_FILENAME_LENGTH { 328 | error!("path length is too long: {}", header.file_path_length); 329 | return Err("path length is too long".into()); 330 | } 331 | if header.data_length as usize > MAX_DATA_LENGTH { 332 | error!("data length is too long: {}", header.data_length); 333 | return Err("data length is too long".into()); 334 | } 335 | if header.meta_data_length as usize > MAX_METADATA_LENGTH { 336 | error!("meta data 
length is too long: {}", header.meta_data_length); 337 | return Err("meta data length is too long".into()); 338 | } 339 | let mut path = vec![0u8; header.file_path_length as usize]; 340 | let mut data = vec![0u8; header.data_length as usize]; 341 | let mut meta_data = vec![0u8; header.meta_data_length as usize]; 342 | 343 | self.receive(read_stream, &mut path[0..header.file_path_length as usize]) 344 | .await?; 345 | self.receive( 346 | read_stream, 347 | &mut meta_data[0..header.meta_data_length as usize], 348 | ) 349 | .await?; 350 | self.receive(read_stream, &mut data[0..header.data_length as usize]) 351 | .await?; 352 | 353 | Ok((path, data, meta_data)) 354 | } 355 | 356 | pub async fn receive(&self, read_stream: &mut R, data: &mut [u8]) -> Result<(), String> { 357 | match read_stream.read_exact(data).await { 358 | Ok(_) => Ok(()), 359 | Err(e) => Err(e.to_string()), 360 | } 361 | } 362 | } 363 | -------------------------------------------------------------------------------- /src/rpc/mod.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2022 labring. All rights reserved. 2 | // 3 | // SPDX-License-Identifier: Apache-2.0 4 | 5 | pub mod callback; 6 | pub mod client; 7 | pub mod connection; 8 | pub mod protocol; 9 | pub mod rdma; 10 | pub mod server; 11 | -------------------------------------------------------------------------------- /src/rpc/protocol.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2022 labring. All rights reserved. 
2 | // 3 | // SPDX-License-Identifier: Apache-2.0 4 | 5 | pub const MAX_FILENAME_LENGTH: usize = 4096; 6 | pub const MAX_DATA_LENGTH: usize = 65536 * 65536; 7 | pub const MAX_METADATA_LENGTH: usize = 65536; 8 | pub const MAX_COPY_LENGTH: usize = 1024 * 8; 9 | 10 | pub const CONNECTION_RETRY_TIMES: i32 = 100; 11 | pub const SEND_RETRY_TIMES: i32 = 5; 12 | 13 | // request 14 | // | batch | id | type | flags | total_length | file_path_length | meta_data_length | data_length | filename | meta_data | data | 15 | // | 4Byte | 4Byte | 4Byte | 4Byte | 4Byte | 4Byte | 4Byte | 4Byte | 1~4kB | 0~ | 0~ | 16 | pub const REQUEST_HEADER_SIZE: usize = 4 * 8; 17 | pub const REQUEST_FILENAME_LENGTH_SIZE: usize = 4; 18 | pub const REQUEST_METADATA_LENGTH_SIZE: usize = 4; 19 | pub const REQUEST_DATA_LENGTH_SIZE: usize = 4; 20 | 21 | pub const REQUEST_POOL_SIZE: usize = 65536; 22 | 23 | /* receive operation response and wake up the operation thread using condition variable 24 | response 25 | | batch | id | status | flags | total_length | meta_data_lenght | data_length | meta_data | data | 26 | | 4Byte | 4Byte | 4Byte | 4Byte | 4Byte | 4Byte | 4Byte | 0~ | 0~ | 27 | */ 28 | pub const RESPONSE_HEADER_SIZE: usize = 4 * 7; 29 | 30 | // pub const CLIENT_RESPONSE_TIMEOUT: time::Duration = time::Duration::from_micros(300); // timeout for client response loop 31 | 32 | #[derive(Debug)] 33 | pub struct RequestHeader { 34 | pub batch: u32, 35 | pub id: u32, 36 | pub r#type: u32, 37 | pub flags: u32, 38 | pub total_length: u32, // we use u32 because of the protocol consistency 39 | pub file_path_length: u32, 40 | pub meta_data_length: u32, 41 | pub data_length: u32, 42 | } 43 | 44 | impl RequestHeader { 45 | #[allow(clippy::too_many_arguments)] 46 | pub fn new( 47 | batch: u32, 48 | id: u32, 49 | r#type: u32, 50 | flags: u32, 51 | total_length: u32, 52 | file_path_length: u32, 53 | meta_data_length: u32, 54 | data_length: u32, 55 | ) -> Self { 56 | Self { 57 | batch, 58 | id, 59 | r#type, 60 | 
flags, 61 | total_length, 62 | file_path_length, 63 | meta_data_length, 64 | data_length, 65 | } 66 | } 67 | } 68 | 69 | pub struct ResponseHeader { 70 | pub batch: u32, 71 | pub id: u32, 72 | pub status: i32, 73 | pub flags: u32, 74 | pub total_length: u32, 75 | pub meta_data_length: u32, 76 | pub data_length: u32, 77 | } 78 | 79 | impl ResponseHeader { 80 | pub fn new( 81 | batch: u32, 82 | id: u32, 83 | status: i32, 84 | flags: u32, 85 | total_length: u32, 86 | meta_data_length: u32, 87 | data_length: u32, 88 | ) -> Self { 89 | Self { 90 | batch, 91 | id, 92 | status, 93 | flags, 94 | total_length, 95 | meta_data_length, 96 | data_length, 97 | } 98 | } 99 | } 100 | -------------------------------------------------------------------------------- /src/rpc/rdma/client.rs: -------------------------------------------------------------------------------- 1 | use core::result::Result; 2 | use dashmap::DashMap; 3 | use ibv::connection::conn::{connect, Conn}; 4 | use log::{debug, error}; 5 | use std::{io::IoSlice, sync::Arc, time::Duration}; 6 | 7 | use crate::rpc::{ 8 | callback::CallbackPool, 9 | protocol::{ResponseHeader, RESPONSE_HEADER_SIZE}, 10 | }; 11 | pub struct Client { 12 | connections: DashMap>, 13 | pool: Arc, 14 | } 15 | 16 | impl Client { 17 | pub fn new() -> Self { 18 | let mut pool = CallbackPool::new(); 19 | pool.init(); 20 | let pool = Arc::new(pool); 21 | Client { 22 | connections: DashMap::new(), 23 | pool, 24 | } 25 | } 26 | 27 | pub fn close(&self) { 28 | self.pool.free(); 29 | } 30 | 31 | pub async fn add_connection(&self, addr: &str) { 32 | let conn = Arc::new(connect(addr).await.unwrap()); 33 | debug!("connect to {} success", addr); 34 | let conn1 = conn.clone(); 35 | self.connections.insert(addr.to_string(), conn1); 36 | tokio::spawn(parse_response(conn, self.pool.clone())); 37 | } 38 | 39 | pub fn get_connection(&self, addr: &str) -> Option> { 40 | self.connections.get(addr).map(|conn| conn.value().clone()) 41 | } 42 | 43 | 
#[allow(clippy::too_many_arguments)] 44 | pub async fn call_remote( 45 | &self, 46 | server_address: &str, 47 | operation_type: u32, 48 | req_flags: u32, 49 | path: &str, 50 | send_meta_data: &[u8], 51 | send_data: &[u8], 52 | status: &mut i32, 53 | rsp_flags: &mut u32, 54 | recv_meta_data_length: &mut usize, 55 | recv_data_length: &mut usize, 56 | recv_meta_data: &mut [u8], 57 | recv_data: &mut [u8], 58 | timeout: Duration, 59 | ) -> Result<(), Box> { 60 | let (batch, id) = self 61 | .pool 62 | .register_callback(recv_meta_data, recv_data) 63 | .await?; 64 | debug!( 65 | "call_remote on {:?}, batch {}, id: {}", 66 | server_address, batch, id 67 | ); 68 | // send request to remote 69 | self.send_request( 70 | server_address, 71 | batch, 72 | id, 73 | operation_type, 74 | req_flags, 75 | path, 76 | send_meta_data, 77 | send_data, 78 | ) 79 | .await?; 80 | 81 | let (s, f, meta_data_length, data_length) = 82 | self.pool.wait_for_callback(id, timeout).await?; 83 | debug!( 84 | "call_remote success, id: {}, status: {}, flags: {}, meta_data_length: {}, data_length: {}", 85 | id, s, f, meta_data_length, data_length 86 | ); 87 | *status = s; 88 | *rsp_flags = f; 89 | *recv_meta_data_length = meta_data_length; 90 | *recv_data_length = data_length; 91 | Ok(()) 92 | } 93 | 94 | #[allow(clippy::too_many_arguments)] 95 | pub async fn send_request( 96 | &self, 97 | addr: &str, 98 | batch: u32, 99 | id: u32, 100 | operation_type: u32, 101 | req_flags: u32, 102 | path: &str, 103 | send_meta_data: &[u8], 104 | send_data: &[u8], 105 | ) -> Result<(), Box> { 106 | let conn = self.get_connection(addr).unwrap(); 107 | let mut request = Vec::new(); 108 | let total_length = path.len() + send_meta_data.len() + send_data.len(); 109 | request.extend_from_slice(&batch.to_le_bytes()); 110 | request.extend_from_slice(&id.to_le_bytes()); 111 | request.extend_from_slice(&operation_type.to_le_bytes()); 112 | request.extend_from_slice(&req_flags.to_le_bytes()); 113 | 
request.extend_from_slice(&(total_length as u32).to_le_bytes()); 114 | request.extend_from_slice(&(path.len() as u32).to_le_bytes()); 115 | request.extend_from_slice(&(send_meta_data.len() as u32).to_le_bytes()); 116 | request.extend_from_slice(&(send_data.len() as u32).to_le_bytes()); 117 | request.extend_from_slice(path.as_bytes()); 118 | let request = &[ 119 | IoSlice::new(&request), 120 | IoSlice::new(send_meta_data), 121 | IoSlice::new(send_data), 122 | ]; 123 | debug!("send_request: {:?}", request); 124 | conn.send_msg(request).await?; 125 | Ok(()) 126 | } 127 | } 128 | 129 | impl Default for Client { 130 | fn default() -> Self { 131 | Self::new() 132 | } 133 | } 134 | 135 | pub async fn parse_response(conn: Arc, pool: Arc) { 136 | loop { 137 | let response = conn.recv_msg().await.unwrap(); 138 | debug!("parse_response: recv response: {:?}", response); 139 | // parse response 140 | let header = parse_response_header(response); 141 | 142 | let batch = header.batch; 143 | let id = header.id; 144 | 145 | if pool.lock_if_not_timeout(batch, id).is_err() { 146 | debug!("parse_response: lock timeout"); 147 | continue; 148 | } 149 | debug!("parse_response: lock success"); 150 | 151 | parse_response_body( 152 | response, 153 | pool.get_meta_data_ref(id, header.meta_data_length as usize), 154 | pool.get_data_ref(id, header.data_length as usize), 155 | ); 156 | conn.release(response).await; 157 | if let Err(e) = pool 158 | .response( 159 | id, 160 | header.status, 161 | header.flags, 162 | header.meta_data_length as usize, 163 | header.data_length as usize, 164 | ) 165 | .await 166 | { 167 | error!("Error writing response back: {}", e); 168 | break; 169 | }; 170 | // todo: realease the buf in response 171 | } 172 | } 173 | 174 | pub fn parse_response_header(response: &[u8]) -> ResponseHeader { 175 | let header = &response[0..RESPONSE_HEADER_SIZE]; 176 | let batch = u32::from_le_bytes(header[0..4].try_into().unwrap()); 177 | let id = 
u32::from_le_bytes(header[4..8].try_into().unwrap()); 178 | let status = i32::from_le_bytes(header[8..12].try_into().unwrap()); 179 | let flags = u32::from_le_bytes(header[12..16].try_into().unwrap()); 180 | let total_length = u32::from_le_bytes(header[16..20].try_into().unwrap()); 181 | let meta_data_length = u32::from_le_bytes(header[20..24].try_into().unwrap()); 182 | let data_length = u32::from_le_bytes(header[24..28].try_into().unwrap()); 183 | // debug!( 184 | // "received response_header batch: {}, id: {}, status: {}, flags: {}, total_length: {}, meta_data_length: {}, data_length: {}", 185 | // batch, id, status, flags, total_length, meta_data_length, data_length 186 | // ); 187 | ResponseHeader { 188 | batch, 189 | id, 190 | status, 191 | flags, 192 | total_length, 193 | meta_data_length, 194 | data_length, 195 | } 196 | } 197 | 198 | pub fn parse_response_body(response: &[u8], meta_data: &mut [u8], data: &mut [u8]) { 199 | let meta_data_length = meta_data.len(); 200 | let data_length = data.len(); 201 | debug!( 202 | "waiting for response_meta_data, length: {}", 203 | meta_data_length 204 | ); 205 | // copy response to meta_data 206 | meta_data 207 | .copy_from_slice(&response[RESPONSE_HEADER_SIZE..RESPONSE_HEADER_SIZE + meta_data_length]); 208 | debug!("received reponse_meta_data, meta_data: {:?}", meta_data); 209 | // copy response to data 210 | data.copy_from_slice( 211 | &response[RESPONSE_HEADER_SIZE + meta_data_length 212 | ..RESPONSE_HEADER_SIZE + meta_data_length + data_length], 213 | ); 214 | debug!("received reponse_data, data: {:?}", data); 215 | } 216 | -------------------------------------------------------------------------------- /src/rpc/rdma/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod client; 2 | pub mod server; 3 | -------------------------------------------------------------------------------- /src/rpc/rdma/server.rs: 
-------------------------------------------------------------------------------- 1 | use std::{io::IoSlice, sync::Arc}; 2 | 3 | use ibv::connection::conn::{Conn, MyReceiver}; 4 | use log::debug; 5 | use tokio::sync::mpsc::channel; 6 | 7 | use ibv::connection::conn::run; 8 | 9 | use crate::rpc::{ 10 | protocol::{RequestHeader, REQUEST_HEADER_SIZE, RESPONSE_HEADER_SIZE}, 11 | server::Handler, 12 | }; 13 | pub struct Server { 14 | pub addr: String, 15 | incoming: MyReceiver, 16 | handler: Arc, 17 | } 18 | 19 | impl Server 20 | where 21 | H: Handler + std::marker::Sync + std::marker::Send + 'static, 22 | { 23 | pub async fn new(addr: String, handler: Arc) -> Self { 24 | let (tx, rx) = channel(1000); 25 | let address = addr.clone(); 26 | tokio::spawn(run(address, tx)); 27 | let rx = MyReceiver::new(rx); 28 | Server { 29 | addr, 30 | incoming: rx, 31 | handler, 32 | } 33 | } 34 | 35 | pub async fn accept(&self) -> Conn { 36 | self.incoming.recv().await 37 | } 38 | 39 | pub async fn run(&self) -> anyhow::Result<()> { 40 | loop { 41 | let conn = Arc::new(self.accept().await); 42 | println!("accept a connection"); 43 | let handler = Arc::clone(&self.handler); 44 | tokio::spawn(receive(handler, conn)); 45 | } 46 | } 47 | } 48 | 49 | pub async fn receive( 50 | handler: Arc, 51 | conn: Arc, 52 | ) { 53 | loop { 54 | let request: &[u8] = conn.recv_msg().await.unwrap(); 55 | debug!("receive a request: {:?}", request); 56 | let (header, path, meta_data, data) = parse_request(request); 57 | conn.release(request).await; 58 | 59 | let handler = handler.clone(); 60 | tokio::spawn(handle(handler, conn.clone(), header, path, meta_data, data)); 61 | } 62 | } 63 | 64 | // parse_request_header(): parse the request header 65 | // 1. parse the header from the request 66 | // 2. 
return the header 67 | pub fn parse_request_header(request: &[u8]) -> RequestHeader { 68 | let header = &request[0..REQUEST_HEADER_SIZE]; 69 | let batch = u32::from_le_bytes(header[0..4].try_into().unwrap()); 70 | let id = u32::from_le_bytes(header[4..8].try_into().unwrap()); 71 | let operation_type = u32::from_le_bytes(header[8..12].try_into().unwrap()); 72 | let flags: u32 = u32::from_le_bytes(header[12..16].try_into().unwrap()); 73 | let total_length = u32::from_le_bytes(header[16..20].try_into().unwrap()); 74 | let file_path_length = u32::from_le_bytes(header[20..24].try_into().unwrap()); 75 | let meta_data_length = u32::from_le_bytes(header[24..28].try_into().unwrap()); 76 | let data_length = u32::from_le_bytes(header[28..32].try_into().unwrap()); 77 | RequestHeader { 78 | batch, 79 | id, 80 | r#type: operation_type, 81 | flags, 82 | total_length, 83 | file_path_length, 84 | meta_data_length, 85 | data_length, 86 | } 87 | } 88 | 89 | pub fn parse_request(request: &[u8]) -> (RequestHeader, Vec, Vec, Vec) { 90 | let header = parse_request_header(request); 91 | debug!("parse_request, header: {:?}", header); 92 | let path = 93 | &request[REQUEST_HEADER_SIZE..REQUEST_HEADER_SIZE + header.file_path_length as usize]; 94 | let metadata = &request[REQUEST_HEADER_SIZE + header.file_path_length as usize 95 | ..REQUEST_HEADER_SIZE 96 | + header.file_path_length as usize 97 | + header.meta_data_length as usize]; 98 | let data = &request[REQUEST_HEADER_SIZE 99 | + header.file_path_length as usize 100 | + header.meta_data_length as usize 101 | ..REQUEST_HEADER_SIZE 102 | + header.file_path_length as usize 103 | + header.meta_data_length as usize 104 | + header.data_length as usize]; 105 | (header, path.to_vec(), metadata.to_vec(), data.to_vec()) 106 | } 107 | 108 | // handle(): handle the request 109 | // 1. call the handler to handle the request 110 | // 2. 
send the response back to the client 111 | async fn handle( 112 | handler: Arc, 113 | conn: Arc, 114 | header: RequestHeader, 115 | path: Vec, 116 | metadata: Vec, 117 | data: Vec, 118 | ) { 119 | debug!("handle, id: {}", header.id); 120 | let response = handler 121 | .dispatch(0, header.r#type, header.flags, path, data, metadata) 122 | .await; 123 | debug!("handle, response: {:?}", response); 124 | match response { 125 | Ok(response) => { 126 | let result = send_response( 127 | conn.clone(), 128 | header.batch, 129 | header.id, 130 | response.0, 131 | response.1, 132 | &response.4[0..response.2], 133 | &response.5[0..response.3], 134 | ) 135 | .await; 136 | match result { 137 | Ok(_) => { 138 | // debug!("handle, send response success"); 139 | } 140 | Err(e) => { 141 | debug!("handle, send response error: {}", e); 142 | } 143 | } 144 | } 145 | Err(e) => { 146 | debug!("handle, dispatch error: {}", e); 147 | } 148 | } 149 | } 150 | 151 | pub async fn send_response( 152 | conn: Arc, 153 | batch: u32, 154 | id: u32, 155 | status: i32, 156 | flags: u32, 157 | meta_data: &[u8], 158 | data: &[u8], 159 | ) -> Result<(), Box> { 160 | let data_length = data.len(); 161 | let meta_data_length = meta_data.len(); 162 | let total_length = data_length + meta_data_length; 163 | let mut response = Vec::with_capacity(RESPONSE_HEADER_SIZE + total_length); 164 | response.extend_from_slice(&batch.to_le_bytes()); 165 | response.extend_from_slice(&id.to_le_bytes()); 166 | response.extend_from_slice(&status.to_le_bytes()); 167 | response.extend_from_slice(&flags.to_le_bytes()); 168 | response.extend_from_slice(&(total_length as u32).to_le_bytes()); 169 | response.extend_from_slice(&(meta_data_length as u32).to_le_bytes()); 170 | response.extend_from_slice(&(data_length as u32).to_le_bytes()); 171 | let response = &[ 172 | IoSlice::new(&response), 173 | IoSlice::new(meta_data), 174 | IoSlice::new(data), 175 | ]; 176 | conn.send_msg(response).await?; 177 | Ok(()) 178 | } 179 | 
-------------------------------------------------------------------------------- /src/rpc/server.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2022 labring. All rights reserved. 2 | // 3 | // SPDX-License-Identifier: Apache-2.0 4 | 5 | use std::sync::Arc; 6 | 7 | use async_trait::async_trait; 8 | use log::{error, info, warn}; 9 | use tokio::{ 10 | io::{AsyncReadExt, AsyncWriteExt}, 11 | net::{TcpListener, UnixListener}, 12 | }; 13 | 14 | use super::{connection::ServerConnection, protocol::RequestHeader}; 15 | 16 | #[async_trait] 17 | pub trait Handler { 18 | async fn dispatch( 19 | &self, 20 | id: u32, 21 | operation_type: u32, 22 | flags: u32, 23 | path: Vec, 24 | data: Vec, 25 | metadata: Vec, 26 | ) -> anyhow::Result<(i32, u32, usize, usize, Vec, Vec)>; 27 | } 28 | 29 | pub async fn handle< 30 | H: Handler + std::marker::Sync + std::marker::Send + 'static, 31 | W: AsyncWriteExt + Unpin, 32 | R: AsyncReadExt + Unpin, 33 | >( 34 | handler: Arc, 35 | connection: Arc>, 36 | header: RequestHeader, 37 | path: Vec, 38 | data: Vec, 39 | metadata: Vec, 40 | ) { 41 | let response = handler 42 | .dispatch( 43 | connection.id, 44 | header.r#type, 45 | header.flags, 46 | path.clone(), 47 | data, 48 | metadata, 49 | ) 50 | .await; 51 | match response { 52 | Ok(response) => { 53 | if let Err(e) = connection 54 | .send_response( 55 | header.batch, 56 | header.id, 57 | response.0, 58 | response.1, 59 | &response.4[0..response.2], 60 | &response.5[0..response.3], 61 | ) 62 | .await 63 | { 64 | error!("handle connection: {} , send response error: {}, batch: {}, id: {}, operation_type: {}, flags: {}, path: {:?}", connection.id, e, header.batch, header.id, header.r#type, header.flags, std::str::from_utf8(&path)); 65 | let _ = connection.close().await; 66 | } 67 | } 68 | Err(e) => { 69 | error!( 70 | "handle connection: {} , dispatch error: {}", 71 | connection.id, e 72 | ); 73 | } 74 | } 75 | } 76 | 77 | pub async fn receive< 
78 | H: Handler + std::marker::Sync + std::marker::Send + 'static, 79 | W: AsyncWriteExt + Unpin + std::marker::Sync + std::marker::Send + 'static, 80 | R: AsyncReadExt + Unpin + std::marker::Sync + std::marker::Send + 'static, 81 | >( 82 | handler: Arc, 83 | connection: Arc>, 84 | mut read_stream: R, 85 | ) { 86 | loop { 87 | { 88 | let id = connection.name_id(); 89 | let header = match connection.receive_request_header(&mut read_stream).await { 90 | Ok(header) => header, 91 | Err(e) => { 92 | if e == "early eof" || e == "Connection reset by peer (os error 104)" { 93 | warn!("{:?} receive, connection closed", id); 94 | break; 95 | } 96 | panic!("{:?} parse_request, header error: {}", id, e); 97 | } 98 | }; 99 | let data_result = connection.receive_request(&mut read_stream, &header).await; 100 | let (path, data, metadata) = match data_result { 101 | Ok(data) => data, 102 | Err(e) => { 103 | panic!("{:?} parse_request, data error: {}", id, e); 104 | } 105 | }; 106 | let handler = handler.clone(); 107 | let connection = connection.clone(); 108 | tokio::spawn(handle(handler, connection, header, path, data, metadata)); 109 | } 110 | } 111 | } 112 | 113 | pub struct RpcServer { 114 | // listener: TcpListener, 115 | bind_address: String, 116 | handler: Arc, 117 | } 118 | 119 | impl RpcServer { 120 | pub fn new(handler: Arc, bind_address: &str) -> Self { 121 | Self { 122 | handler, 123 | bind_address: String::from(bind_address), 124 | } 125 | } 126 | 127 | pub async fn run(&self) -> anyhow::Result<()> { 128 | info!("Listening on {:?}", self.bind_address); 129 | let listener = TcpListener::bind(&self.bind_address).await?; 130 | let mut id = 1u32; 131 | loop { 132 | match listener.accept().await { 133 | Ok((stream, _)) => { 134 | let (read_stream, write_stream) = stream.into_split(); 135 | info!("Connection {id} accepted"); 136 | let handler = Arc::clone(&self.handler); 137 | let name_id = format!("{},{}", self.bind_address, id); 138 | let connection = 
Arc::new(ServerConnection::new(write_stream, name_id, id)); 139 | tokio::spawn(async move { 140 | receive(handler, connection, read_stream).await; 141 | }); 142 | id += 1; 143 | } 144 | Err(e) => { 145 | panic!("Failed to create tcp stream, error is {}", e) 146 | } 147 | } 148 | } 149 | } 150 | 151 | pub async fn run_unix_stream(&self) -> anyhow::Result<()> { 152 | info!("Listening on {:?}", self.bind_address); 153 | let listener = match UnixListener::bind(&self.bind_address) { 154 | Ok(listener) => listener, 155 | Err(e) => { 156 | return Err(anyhow::anyhow!( 157 | "Failed to create unix stream at {:?}, error is {}", 158 | self.bind_address, 159 | e 160 | )); 161 | } 162 | }; 163 | let mut id = 1u32; 164 | loop { 165 | match listener.accept().await { 166 | Ok((stream, _)) => { 167 | let (read_stream, write_stream) = stream.into_split(); 168 | info!("Connection {id} accepted"); 169 | let handler = Arc::clone(&self.handler); 170 | let name_id = format!("{},{}", self.bind_address, id); 171 | let connection = Arc::new(ServerConnection::new(write_stream, name_id, id)); 172 | tokio::spawn(async move { 173 | receive(handler, connection, read_stream).await; 174 | }); 175 | id += 1; 176 | } 177 | Err(e) => { 178 | panic!("Failed to create tcp stream, error is {}", e) 179 | } 180 | } 181 | } 182 | } 183 | } 184 | -------------------------------------------------------------------------------- /src/server/storage_engine/block_engine/allocator.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2022 labring. All rights reserved. 
2 | // 3 | // SPDX-License-Identifier: Apache-2.0 4 | 5 | use std::sync::Arc; 6 | 7 | use libc::ioctl; 8 | use nix::fcntl::{open, OFlag}; 9 | use parking_lot::Mutex; 10 | 11 | //#define BLKGETSIZE _IO(0x12,96) /* return device size /512 (long *arg) */ 12 | const BLOCKGETSIZE: u64 = 0x1260; 13 | 14 | pub const CHUNK: u64 = 512 * 8; 15 | const SECTOR: u64 = 512; 16 | 17 | pub(crate) trait Allocator { 18 | fn new(path: &str) -> Self; 19 | fn allocator_space(&self, lenth: u64) -> u64; 20 | } 21 | 22 | /* 23 | * This Allocator use for memory. 24 | */ 25 | #[allow(unused)] 26 | pub(crate) struct BitmapAllocator { 27 | block_space: Arc>, 28 | total_aspce: u64, 29 | } 30 | 31 | impl Allocator for BitmapAllocator { 32 | fn new(path: &str) -> Self { 33 | let blockdevice = BlockDevice::new(path).unwrap(); 34 | Self { 35 | block_space: Arc::new(Mutex::new(0)), 36 | total_aspce: blockdevice.chunk_num, 37 | } 38 | } 39 | 40 | fn allocator_space(&self, lenth: u64) -> u64 { 41 | // todo reduce allocatorc size. 42 | // todo exent space manager. 43 | let mut chunk_size = lenth / CHUNK; 44 | if lenth - chunk_size * CHUNK > 0 { 45 | chunk_size += 1; 46 | } 47 | let mut mutex = self.block_space.lock(); 48 | let begin_allocator_pos = *mutex; 49 | *mutex += chunk_size; 50 | begin_allocator_pos 51 | } 52 | } 53 | 54 | // Block device info. 
55 | struct BlockDevice { 56 | chunk_num: u64, 57 | } 58 | 59 | impl BlockDevice { 60 | fn new(path: &str) -> Result { 61 | let block_num = Self::get_block_info(path)?; 62 | let chunk_num = block_num / (CHUNK / SECTOR); 63 | Ok(BlockDevice { chunk_num }) 64 | } 65 | 66 | fn get_block_info(path: &str) -> Result { 67 | let fd = open(path, OFlag::O_DIRECT, nix::sys::stat::Mode::S_IRWXU).map_err(|_| { 68 | println!("open block device error"); 69 | libc::EIO 70 | })?; 71 | if fd < 0 { 72 | return Err(libc::EEXIST); 73 | } 74 | let block_num = 0; 75 | unsafe { 76 | let result = ioctl(fd, BLOCKGETSIZE, &block_num); 77 | if result < 0 { 78 | return Err(libc::EIO); 79 | } 80 | } 81 | Ok(block_num) 82 | } 83 | } 84 | 85 | #[cfg(feature = "block_test")] 86 | #[cfg(test)] 87 | mod tests { 88 | use std::process::Command; 89 | 90 | use super::{Allocator, BitmapAllocator, BlockDevice}; 91 | 92 | #[test] 93 | fn block_info_test() { 94 | Command::new("bash") 95 | .arg("-c") 96 | .arg("dd if=/dev/zero of=node1 bs=4M count=1") 97 | .output() 98 | .unwrap(); 99 | Command::new("bash") 100 | .arg("-c") 101 | .arg("losetup /dev/loop8 node1") 102 | .output() 103 | .unwrap(); 104 | let block_num = BlockDevice::get_block_info("/dev/loop8"); 105 | assert_eq!(8192, block_num.unwrap()); 106 | Command::new("bash") 107 | .arg("-c") 108 | .arg("losetup -d /dev/loop8") 109 | .output() 110 | .unwrap(); 111 | Command::new("bash") 112 | .arg("-c") 113 | .arg("rm node1") 114 | .output() 115 | .unwrap(); 116 | } 117 | 118 | #[test] 119 | fn allocator_test() { 120 | Command::new("bash") 121 | .arg("-c") 122 | .arg("dd if=/dev/zero of=node1 bs=4M count=1") 123 | .output() 124 | .unwrap(); 125 | Command::new("bash") 126 | .arg("-c") 127 | .arg("losetup /dev/loop8 node1") 128 | .output() 129 | .unwrap(); 130 | let allocator = BitmapAllocator::new("/dev/loop8"); 131 | let length = allocator.allocator_space(512 * 8 * 8); 132 | assert_eq!(length + 8, 8); 133 | Command::new("bash") 134 | .arg("-c") 135 | 
.arg("losetup -d /dev/loop8") 136 | .output() 137 | .unwrap(); 138 | Command::new("bash") 139 | .arg("-c") 140 | .arg("rm node1") 141 | .output() 142 | .unwrap(); 143 | } 144 | } 145 | -------------------------------------------------------------------------------- /src/server/storage_engine/block_engine/index.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2022 labring. All rights reserved. 2 | // 3 | // SPDX-License-Identifier: Apache-2.0 4 | 5 | use dashmap::DashMap; 6 | 7 | pub(crate) struct FileIndex { 8 | index: DashMap>, 9 | } 10 | 11 | impl FileIndex { 12 | pub(crate) fn new() -> Self { 13 | let index = DashMap::new(); 14 | Self { index } 15 | } 16 | 17 | pub(crate) fn search(&self, file_name: &str) -> Vec { 18 | let value = self.index.get(file_name); 19 | match value { 20 | Some(entry) => entry.value().to_vec(), 21 | None => Vec::new(), 22 | } 23 | } 24 | 25 | pub(crate) fn update_index(&self, path: &str, mut vec: Vec) { 26 | let mut index_value_vec = self.search(path); 27 | index_value_vec.append(vec.as_mut()); 28 | self.index.insert(path.to_string(), index_value_vec); 29 | } 30 | } 31 | 32 | #[derive(Clone, Copy)] 33 | #[allow(unused)] 34 | pub(crate) struct IndexEntry { 35 | chunk: u64, 36 | begin: u64, 37 | length: u64, 38 | } 39 | 40 | #[cfg(test)] 41 | mod tests { 42 | use super::FileIndex; 43 | 44 | #[test] 45 | fn search_and_update_index_test() { 46 | let index = FileIndex::new(); 47 | let vec = index.search("test"); 48 | assert_eq!(vec.is_empty(), true); 49 | let mut vec = Vec::new(); 50 | vec.push(1); 51 | index.update_index("test", vec); 52 | let mut vec = index.search("test"); 53 | assert_eq!(vec.pop(), Some(1)); 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /src/server/storage_engine/block_engine/io.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2022 labring. All rights reserved. 
2 | // 3 | // SPDX-License-Identifier: Apache-2.0 4 | 5 | use nix::{ 6 | fcntl::{self, OFlag}, 7 | sys::{ 8 | stat::Mode, 9 | uio::{pread, pwrite}, 10 | }, 11 | }; 12 | 13 | pub(crate) struct Storage { 14 | _fd: i32, 15 | } 16 | 17 | impl Storage { 18 | pub(crate) fn new(path: &str) -> Storage { 19 | let oflags = OFlag::O_RDWR; 20 | let mode = Mode::S_IRUSR 21 | | Mode::S_IWUSR 22 | | Mode::S_IRGRP 23 | | Mode::S_IWGRP 24 | | Mode::S_IROTH 25 | | Mode::S_IWOTH; 26 | let fd = fcntl::open(path, oflags, mode); 27 | match fd { 28 | Ok(fd) => Self { _fd: fd }, 29 | Err(_) => panic!("No Raw blockdevice"), 30 | } 31 | } 32 | 33 | pub(crate) fn _write(&self, data: &[u8], offset: i64) -> Result { 34 | match pwrite(self._fd, data, offset) { 35 | Ok(size) => Ok(size), 36 | Err(_) => Err(libc::EIO), 37 | } 38 | } 39 | 40 | pub(crate) fn _read(&self, size: u32, offset: i64) -> Result, i32> { 41 | let mut data = vec![0; size as usize]; 42 | let length = pread(self._fd, data.as_mut_slice(), offset).map_err(|_| libc::EIO)?; 43 | Ok(data[..length].to_vec()) 44 | } 45 | } 46 | 47 | #[cfg(feature = "block_test")] 48 | #[cfg(test)] 49 | mod tests { 50 | use std::process::Command; 51 | 52 | use crate::server::storage_engine::block_device::io::Storage; 53 | #[test] 54 | fn write_and_read_test() { 55 | Command::new("bash") 56 | .arg("-c") 57 | .arg("dd if=/dev/zero of=node1 bs=4M count=1") 58 | .output() 59 | .unwrap(); 60 | Command::new("bash") 61 | .arg("-c") 62 | .arg("losetup /dev/loop8 node1") 63 | .output() 64 | .unwrap(); 65 | let storage = Storage::new("/dev/loop8"); 66 | let writre_result = storage.write(&b"some bytes"[..], 0).unwrap(); 67 | assert_eq!(writre_result, 10); 68 | let read_result = storage.read(10, 0).unwrap(); 69 | assert_eq!(read_result, &b"some bytes"[..]); 70 | Command::new("bash") 71 | .arg("-c") 72 | .arg("losetup -d /dev/loop8") 73 | .output() 74 | .unwrap(); 75 | Command::new("bash") 76 | .arg("-c") 77 | .arg("rm node1") 78 | .output() 79 | .unwrap(); 80 | } 
81 | } 82 | -------------------------------------------------------------------------------- /src/server/storage_engine/block_engine/mod.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2022 labring. All rights reserved. 2 | // 3 | // SPDX-License-Identifier: Apache-2.0 4 | 5 | pub mod allocator; 6 | /** 7 | *block device is use to bypass filesystem aimed to attain higher performance. 8 | */ 9 | pub mod index; 10 | pub mod io; 11 | 12 | use std::sync::Arc; 13 | 14 | use crate::server::storage_engine::StorageEngine; 15 | 16 | use allocator::{Allocator, BitmapAllocator, CHUNK}; 17 | use index::FileIndex; 18 | use io::Storage; 19 | 20 | use super::meta_engine::MetaEngine; 21 | 22 | #[allow(unused)] 23 | pub struct BlockEngine { 24 | allocator: BitmapAllocator, 25 | index: FileIndex, 26 | storage: Storage, 27 | } 28 | 29 | impl StorageEngine for BlockEngine { 30 | fn new(root: &str, _meta: Arc) -> Self { 31 | let index = FileIndex::new(); 32 | let storage = Storage::new(root); 33 | let allocator = BitmapAllocator::new(root); 34 | Self { 35 | allocator, 36 | index, 37 | storage, 38 | } 39 | } 40 | 41 | fn init(&self) {} 42 | 43 | fn read_file(&self, path: &str, _size: u32, offset: i64) -> Result, i32> { 44 | let index_vec = self.index.search(path); 45 | let real_offset_index = offset as u64 / CHUNK; 46 | let real_offset = index_vec.get(real_offset_index as usize); 47 | match real_offset { 48 | Some(_real_offset) => todo!(), // self.storage.read(size, *real_offset as i64), 49 | None => todo!(), // Err(libc::EIO), 50 | } 51 | } 52 | 53 | fn open_file(&self, _path: &str, _flag: i32, _mode: u32) -> Result<(), i32> { 54 | todo!() 55 | } 56 | 57 | fn write_file(&self, path: &str, data: &[u8], _offset: i64) -> Result { 58 | let pos = self.allocator.allocator_space(data.len() as u64); 59 | let index_value_vec = self.index.search(path); 60 | let mut vec = Vec::new(); 61 | let mut length = (data.len() as u64) / CHUNK; 62 | if 
data.len() as u64 - length * CHUNK > 0 { 63 | length += 1; 64 | } 65 | for n in 0..length { 66 | vec.push(pos + n * CHUNK); 67 | } 68 | self.index.update_index(path, vec); 69 | match index_value_vec.last() { 70 | Some(_last) => todo!(), // self.storage.write(data, (last + pos) as i64), 71 | None => todo!(), // self.storage.write(data, pos as i64), 72 | } 73 | } 74 | 75 | fn create_file( 76 | &self, 77 | _path: &str, 78 | _oflag: i32, 79 | _umask: u32, 80 | _mode: u32, 81 | ) -> Result, i32> { 82 | todo!() 83 | } 84 | 85 | fn delete_file(&self, _path: &str) -> Result<(), i32> { 86 | todo!() 87 | } 88 | 89 | fn truncate_file(&self, _path: &str, _length: i64) -> Result<(), i32> { 90 | todo!() 91 | } 92 | } 93 | 94 | #[cfg(feature = "block_test")] 95 | #[cfg(test)] 96 | mod tests { 97 | use crate::server::storage_engine::StorageEngine; 98 | 99 | use super::BlockEngine; 100 | use std::process::Command; 101 | #[test] 102 | fn write_and_read_test() { 103 | Command::new("bash") 104 | .arg("-c") 105 | .arg("dd if=/dev/zero of=node1 bs=4M count=1") 106 | .output() 107 | .unwrap(); 108 | Command::new("bash") 109 | .arg("-c") 110 | .arg("losetup /dev/loop8 node1") 111 | .output() 112 | .unwrap(); 113 | let engine = BlockEngine::new("", "/dev/loop8"); 114 | let write_size = engine 115 | .write_file("test".to_string(), &b"some bytes"[..], 0) 116 | .unwrap(); 117 | assert_eq!(write_size, 10); 118 | let read = engine.read_file("test".to_string(), 10, 0).unwrap(); 119 | assert_eq!(read, &b"some bytes"[..]); 120 | Command::new("bash") 121 | .arg("-c") 122 | .arg("losetup -d /dev/loop8") 123 | .output() 124 | .unwrap(); 125 | Command::new("bash") 126 | .arg("-c") 127 | .arg("rm node1") 128 | .output() 129 | .unwrap(); 130 | } 131 | } 132 | -------------------------------------------------------------------------------- /src/server/storage_engine/mod.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2022 labring. All rights reserved. 
2 | // 3 | // SPDX-License-Identifier: Apache-2.0 4 | 5 | use std::sync::Arc; 6 | 7 | use self::meta_engine::MetaEngine; 8 | 9 | pub mod block_engine; 10 | pub mod file_engine; 11 | pub mod meta_engine; 12 | 13 | pub trait StorageEngine { 14 | fn new(root: &str, meta_engine: Arc) -> Self; 15 | 16 | fn init(&self); 17 | 18 | fn read_file(&self, path: &str, size: u32, offset: i64) -> Result, i32>; 19 | 20 | fn open_file(&self, path: &str, flag: i32, mode: u32) -> Result<(), i32>; 21 | 22 | fn write_file(&self, path: &str, data: &[u8], offset: i64) -> Result; 23 | 24 | fn create_file(&self, path: &str, oflag: i32, umask: u32, mode: u32) -> Result, i32>; 25 | 26 | fn delete_file(&self, path: &str) -> Result<(), i32>; 27 | 28 | fn truncate_file(&self, path: &str, length: i64) -> Result<(), i32>; 29 | } 30 | -------------------------------------------------------------------------------- /src/server/transfer_manager.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2022 labring. All rights reserved. 
2 | // 3 | // SPDX-License-Identifier: Apache-2.0 4 | 5 | use std::collections::HashMap; 6 | 7 | use dashmap::DashMap; 8 | use tokio::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard}; 9 | 10 | pub struct LockPool { 11 | locks: HashMap>, 12 | } 13 | 14 | pub struct TransferManager { 15 | transferring_locks: *const LockPool, 16 | transferring_status: DashMap, 17 | } 18 | 19 | unsafe impl std::marker::Sync for TransferManager {} 20 | unsafe impl std::marker::Send for TransferManager {} 21 | 22 | impl Default for TransferManager { 23 | fn default() -> Self { 24 | Self::new() 25 | } 26 | } 27 | 28 | impl TransferManager { 29 | pub fn new() -> Self { 30 | TransferManager { 31 | transferring_locks: Box::into_raw(Box::new(LockPool { 32 | locks: HashMap::new(), 33 | })), 34 | transferring_status: DashMap::new(), 35 | } 36 | } 37 | 38 | pub fn get_lock(&self, path: &str) -> &RwLock<()> { 39 | unsafe { (*self.transferring_locks).locks.get(path).unwrap() } 40 | } 41 | 42 | pub fn make_up_files(&self, paths: &Vec) { 43 | self.transferring_status.clear(); 44 | let transferring_locks = unsafe { &mut *(self.transferring_locks as *mut LockPool) }; 45 | transferring_locks.locks.clear(); 46 | for path in paths { 47 | transferring_locks 48 | .locks 49 | .insert(path.clone(), RwLock::new(())); 50 | self.transferring_status.insert(path.clone(), false); 51 | } 52 | } 53 | 54 | pub async fn get_rlock(&self, path: &str) -> RwLockReadGuard<'_, ()> { 55 | let lock = self.get_lock(path); 56 | lock.read().await 57 | } 58 | 59 | pub async fn get_wlock(&self, path: &str) -> RwLockWriteGuard<'_, ()> { 60 | let lock = self.get_lock(path); 61 | lock.write().await 62 | } 63 | 64 | pub fn status(&self, path: &str) -> Option { 65 | self.transferring_status.get(path).map(|status| *status) 66 | } 67 | 68 | pub fn set_status(&self, path: &str, status: bool) { 69 | self.transferring_status.insert(path.to_string(), status); 70 | } 71 | } 72 | 
--------------------------------------------------------------------------------
/test_io500.sh:
--------------------------------------------------------------------------------
#!/bin/bash
#
# Runs the IO500 benchmark against a local sealfs cluster twice: once through
# the FUSE client and once through the LD_PRELOAD intercept library.
#
# Usage: ./test_io500.sh <data-dir>
#   <data-dir>  directory that holds the per-server database*/storage* folders
#
# Exit status: 0 when both runs succeed, 1 otherwise.

# Kill every background job started by this script (manager, servers, daemon).
function kill_background_jobs() {
    kill $(jobs -p) 2>/dev/null || true
}

# Exit with status $1, making sure background jobs are terminated on the way out.
function finish() {
    trap kill_background_jobs EXIT
    exit "$1"
}

# Print $1 in green without a trailing newline.
function green_font() {
    echo -e "\033[32m$1\033[0m\c"
}

# Current time in milliseconds since the epoch.
function now_ms() {
    echo $(( $(date +%s%N) / 1000000 ))
}

# Start the client daemon, mount the volume via FUSE and run IO500 on it.
# Returns the exit status of the benchmark.
function fuse_test() {
    ./target/debug/client --log-level info daemon&
    sleep 3
    ./target/debug/client --log-level info mount ~/fs test1

    start_time=$(now_ms)
    # Run in a subshell so a failed `cd` can never leave the caller in the
    # wrong directory.
    (cd io500 && timeout -s SIGKILL 200 mpirun -np 5 ./io500 config-minimal.ini)
    result=$?
    end_time=$(now_ms)
    result_time=$(( end_time - start_time ))
    echo -e "fuse tests finish, cost: $(green_font ${result_time}ms)"
    return $result
}

# Run IO500 with the syscall-intercept library preloaded.
# Returns the exit status of the benchmark.
function intercept_test() {
    start_time=$(now_ms)
    (cd io500 && SEALFS_LOG_LEVEL=warn SEALFS_VOLUME_NAME=test1 SEALFS_MOUNT_POINT=~/fs LD_PRELOAD=../target/debug/libintercept.so timeout -s SIGKILL 200 mpirun -np 5 ./io500 config-minimal.ini)
    result=$?
    end_time=$(now_ms)
    result_time=$(( end_time - start_time ))
    echo -e "intercept tests finish, cost: $(green_font ${result_time}ms)"
    return $result
}

echo "start fuse_client_run"

# exit with 1 if no argument
if [ $# -eq 0 ]
then
    echo "no argument"
    exit 1
fi

# Best-effort cleanup of a previous run; failures here are expected.
set +e

sudo umount ~/fs
rm /tmp/sealfs.sock
rm /tmp/sealfs.index
mkdir -p ~/fs

sudo rm -rf io500
sudo rm -rf "$1"/database*
sudo rm -rf "$1"/storage*

set -e

# From here on background processes exist; make sure they are killed even if
# `set -e` aborts the script before `finish` is reached.
trap kill_background_jobs EXIT

SEALFS_CONFIG_PATH=./examples ./target/debug/manager --log-level info &

sleep 1

for ((i=0; i<5; i++))
do
    port=$(( 8085 + i ))
    ./target/debug/server --server-address 127.0.0.1:${port} --database-path "$1/database${i}/" --storage-path "$1/storage${i}/" --log-level info &
done

sleep 3


SELF_HOSTED=1

if [ "$SELF_HOSTED" -eq 1 ]
then
    cp -r ~/io500/io500 .
    cd io500
else
    git clone https://github.com/IO500/io500.git
    cd io500
    ./prepare.sh
fi

echo "[global]" > config-minimal.ini
echo "datadir = $HOME/fs" >> config-minimal.ini
echo "" >> config-minimal.ini
echo "[debug]" >> config-minimal.ini
echo "stonewall-time = 2" >> config-minimal.ini

cd ..

# The benchmark runs report their own status; do not abort on the first failure.
set +e

./target/debug/client --log-level info create-volume test1 100000

fuse_test
fuse_result=$?
echo "fuse result: $fuse_result"

intercept_test
intercept_result=$?
echo "intercept result: $intercept_result"
result=$(( fuse_result || intercept_result ))

set -e
finish $result
--------------------------------------------------------------------------------