├── .gitignore ├── Cargo.lock ├── Cargo.toml ├── LICENSE ├── README.md ├── doc └── kmiyc.png ├── include └── serialization.proto ├── run.sh ├── src ├── bin │ ├── rasputinc.rs │ └── rasputind.rs ├── client │ └── mod.rs ├── clock.rs ├── codec.rs ├── etc │ └── loadtest.go ├── lib.rs ├── logging │ └── mod.rs ├── range_bounds.rs ├── serialization.rs └── server │ ├── acked_log.rs │ ├── connset.rs │ ├── mod.rs │ ├── rocksdb.rs │ ├── server.rs │ ├── server_conn.rs │ └── traffic_cop.rs └── test ├── cluster.rs ├── test.rs ├── test_client.rs └── test_paxos.rs /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled files 2 | *.o 3 | *.so 4 | *.rlib 5 | *.dll 6 | *.swp 7 | *.swo 8 | *.swn 9 | 10 | # Executables 11 | *.exe 12 | 13 | # Generated by Cargo 14 | /target/ 15 | 16 | # Test cluster 17 | /_test/ 18 | 19 | # rustfmt 20 | *.bk 21 | -------------------------------------------------------------------------------- /Cargo.lock: -------------------------------------------------------------------------------- 1 | [root] 2 | name = "rasputin" 3 | version = "0.1.0" 4 | dependencies = [ 5 | "bytes 0.2.11 (registry+https://github.com/rust-lang/crates.io-index)", 6 | "docopt 0.6.68 (registry+https://github.com/rust-lang/crates.io-index)", 7 | "eligos 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)", 8 | "lazy_static 0.1.14 (registry+https://github.com/rust-lang/crates.io-index)", 9 | "log 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)", 10 | "mio 0.4.2 (registry+https://github.com/rust-lang/crates.io-index)", 11 | "protobuf 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)", 12 | "quickcheck 0.2.21 (registry+https://github.com/rust-lang/crates.io-index)", 13 | "rand 0.3.10 (registry+https://github.com/rust-lang/crates.io-index)", 14 | "rocksdb 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)", 15 | "rustc-serialize 0.3.15 
(registry+https://github.com/rust-lang/crates.io-index)", 16 | "threadpool 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)", 17 | "time 0.1.31 (registry+https://github.com/rust-lang/crates.io-index)", 18 | "uuid 0.1.17 (registry+https://github.com/rust-lang/crates.io-index)", 19 | ] 20 | 21 | [[package]] 22 | name = "advapi32-sys" 23 | version = "0.1.2" 24 | source = "registry+https://github.com/rust-lang/crates.io-index" 25 | dependencies = [ 26 | "winapi 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)", 27 | "winapi-build 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)", 28 | ] 29 | 30 | [[package]] 31 | name = "aho-corasick" 32 | version = "0.3.0" 33 | source = "registry+https://github.com/rust-lang/crates.io-index" 34 | dependencies = [ 35 | "memchr 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)", 36 | ] 37 | 38 | [[package]] 39 | name = "bitflags" 40 | version = "0.1.1" 41 | source = "registry+https://github.com/rust-lang/crates.io-index" 42 | 43 | [[package]] 44 | name = "bytes" 45 | version = "0.2.11" 46 | source = "registry+https://github.com/rust-lang/crates.io-index" 47 | 48 | [[package]] 49 | name = "clock_ticks" 50 | version = "0.0.5" 51 | source = "registry+https://github.com/rust-lang/crates.io-index" 52 | dependencies = [ 53 | "libc 0.1.8 (registry+https://github.com/rust-lang/crates.io-index)", 54 | ] 55 | 56 | [[package]] 57 | name = "docopt" 58 | version = "0.6.68" 59 | source = "registry+https://github.com/rust-lang/crates.io-index" 60 | dependencies = [ 61 | "regex 0.1.41 (registry+https://github.com/rust-lang/crates.io-index)", 62 | "rustc-serialize 0.3.15 (registry+https://github.com/rust-lang/crates.io-index)", 63 | "strsim 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)", 64 | ] 65 | 66 | [[package]] 67 | name = "eligos" 68 | version = "0.1.0" 69 | source = "registry+https://github.com/rust-lang/crates.io-index" 70 | dependencies = [ 71 | "bytes 0.2.11 
(registry+https://github.com/rust-lang/crates.io-index)", 72 | "mio 0.4.2 (registry+https://github.com/rust-lang/crates.io-index)", 73 | "rand 0.3.10 (registry+https://github.com/rust-lang/crates.io-index)", 74 | ] 75 | 76 | [[package]] 77 | name = "kernel32-sys" 78 | version = "0.1.3" 79 | source = "registry+https://github.com/rust-lang/crates.io-index" 80 | dependencies = [ 81 | "winapi 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)", 82 | "winapi-build 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)", 83 | ] 84 | 85 | [[package]] 86 | name = "lazy_static" 87 | version = "0.1.14" 88 | source = "registry+https://github.com/rust-lang/crates.io-index" 89 | 90 | [[package]] 91 | name = "libc" 92 | version = "0.1.8" 93 | source = "registry+https://github.com/rust-lang/crates.io-index" 94 | 95 | [[package]] 96 | name = "log" 97 | version = "0.3.1" 98 | source = "registry+https://github.com/rust-lang/crates.io-index" 99 | dependencies = [ 100 | "libc 0.1.8 (registry+https://github.com/rust-lang/crates.io-index)", 101 | ] 102 | 103 | [[package]] 104 | name = "memchr" 105 | version = "0.1.3" 106 | source = "registry+https://github.com/rust-lang/crates.io-index" 107 | dependencies = [ 108 | "libc 0.1.8 (registry+https://github.com/rust-lang/crates.io-index)", 109 | ] 110 | 111 | [[package]] 112 | name = "mio" 113 | version = "0.4.2" 114 | source = "registry+https://github.com/rust-lang/crates.io-index" 115 | dependencies = [ 116 | "bytes 0.2.11 (registry+https://github.com/rust-lang/crates.io-index)", 117 | "clock_ticks 0.0.5 (registry+https://github.com/rust-lang/crates.io-index)", 118 | "libc 0.1.8 (registry+https://github.com/rust-lang/crates.io-index)", 119 | "log 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)", 120 | "nix 0.3.9 (registry+https://github.com/rust-lang/crates.io-index)", 121 | "slab 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)", 122 | "winapi 0.1.23 
(registry+https://github.com/rust-lang/crates.io-index)", 123 | ] 124 | 125 | [[package]] 126 | name = "nix" 127 | version = "0.3.9" 128 | source = "registry+https://github.com/rust-lang/crates.io-index" 129 | dependencies = [ 130 | "bitflags 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)", 131 | "libc 0.1.8 (registry+https://github.com/rust-lang/crates.io-index)", 132 | ] 133 | 134 | [[package]] 135 | name = "protobuf" 136 | version = "1.0.4" 137 | source = "registry+https://github.com/rust-lang/crates.io-index" 138 | 139 | [[package]] 140 | name = "quickcheck" 141 | version = "0.2.21" 142 | source = "registry+https://github.com/rust-lang/crates.io-index" 143 | dependencies = [ 144 | "log 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)", 145 | "rand 0.3.10 (registry+https://github.com/rust-lang/crates.io-index)", 146 | ] 147 | 148 | [[package]] 149 | name = "rand" 150 | version = "0.3.10" 151 | source = "registry+https://github.com/rust-lang/crates.io-index" 152 | dependencies = [ 153 | "advapi32-sys 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)", 154 | "libc 0.1.8 (registry+https://github.com/rust-lang/crates.io-index)", 155 | "winapi 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)", 156 | ] 157 | 158 | [[package]] 159 | name = "regex" 160 | version = "0.1.41" 161 | source = "registry+https://github.com/rust-lang/crates.io-index" 162 | dependencies = [ 163 | "aho-corasick 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)", 164 | "memchr 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)", 165 | "regex-syntax 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)", 166 | ] 167 | 168 | [[package]] 169 | name = "regex-syntax" 170 | version = "0.2.1" 171 | source = "registry+https://github.com/rust-lang/crates.io-index" 172 | 173 | [[package]] 174 | name = "rocksdb" 175 | version = "0.1.1" 176 | source = "registry+https://github.com/rust-lang/crates.io-index" 177 | dependencies = 
[ 178 | "libc 0.1.8 (registry+https://github.com/rust-lang/crates.io-index)", 179 | ] 180 | 181 | [[package]] 182 | name = "rustc-serialize" 183 | version = "0.3.15" 184 | source = "registry+https://github.com/rust-lang/crates.io-index" 185 | 186 | [[package]] 187 | name = "slab" 188 | version = "0.1.2" 189 | source = "registry+https://github.com/rust-lang/crates.io-index" 190 | 191 | [[package]] 192 | name = "strsim" 193 | version = "0.3.0" 194 | source = "registry+https://github.com/rust-lang/crates.io-index" 195 | 196 | [[package]] 197 | name = "threadpool" 198 | version = "0.1.4" 199 | source = "registry+https://github.com/rust-lang/crates.io-index" 200 | 201 | [[package]] 202 | name = "time" 203 | version = "0.1.31" 204 | source = "registry+https://github.com/rust-lang/crates.io-index" 205 | dependencies = [ 206 | "kernel32-sys 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)", 207 | "libc 0.1.8 (registry+https://github.com/rust-lang/crates.io-index)", 208 | "winapi 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)", 209 | ] 210 | 211 | [[package]] 212 | name = "uuid" 213 | version = "0.1.17" 214 | source = "registry+https://github.com/rust-lang/crates.io-index" 215 | dependencies = [ 216 | "rand 0.3.10 (registry+https://github.com/rust-lang/crates.io-index)", 217 | "rustc-serialize 0.3.15 (registry+https://github.com/rust-lang/crates.io-index)", 218 | ] 219 | 220 | [[package]] 221 | name = "winapi" 222 | version = "0.1.23" 223 | source = "registry+https://github.com/rust-lang/crates.io-index" 224 | dependencies = [ 225 | "libc 0.1.8 (registry+https://github.com/rust-lang/crates.io-index)", 226 | ] 227 | 228 | [[package]] 229 | name = "winapi" 230 | version = "0.2.1" 231 | source = "registry+https://github.com/rust-lang/crates.io-index" 232 | 233 | [[package]] 234 | name = "winapi-build" 235 | version = "0.1.0" 236 | source = "registry+https://github.com/rust-lang/crates.io-index" 237 | 238 | 
-------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | 3 | name = "rasputin" 4 | description = "Hard to kill transactional distributed database" 5 | version = "0.2.0" 6 | authors = [ 7 | "Tyler Neely ", 8 | "Steve Salevan " 9 | ] 10 | license = "Apache-2.0" 11 | homepage = "https://github.com/the-tetanus-clinic/rasputin" 12 | keywords = ["database", "HA", "transactions", "distributed-systems", "paxos"] 13 | 14 | [[test]] 15 | 16 | name = "test" 17 | path = "test/test.rs" 18 | 19 | [dependencies] 20 | bytes = "0.2.11" 21 | docopt = "0.6.66" 22 | lazy_static="0.1.14" 23 | log = "0.3.1" 24 | mio = "0.4.2" 25 | rand = "0.3" 26 | rocksdb = "~0.1.1" 27 | rustc-serialize = "0.3.15" 28 | time = "0.1" 29 | uuid = "0.1" 30 | protobuf = "1.0.16" 31 | threadpool = "0.1.4" 32 | 33 | [dev-dependencies] 34 | quickcheck = "*" 35 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. 
For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | 203 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Rasputin DB :globe_with_meridians: 2 | 3 | (Significant work is currently happening in the `tyler_ranges` branch) 4 | 5 | flexible linearizable distributed store 6 | 7 | triumvirs: operational clarity, performance and composability 8 | 9 | currently implemented: linearized KV set/get/cas/del. client code is happy-path only, so it's only fit for playing around with at this point! 10 | 11 | current reasons why you don't want to use this beyond playing with it: 12 | 13 | 1. Mostly unimplemented. We don't have support for automatic resharding, real transactions or collection types other than KV yet. These are still in the planning phase. 14 | 1. Possibly incorrect. We have not yet proven the correctness of the core consensus algorithm. We may be able to adapt the Raft Coq proof to this end, as we are essentially replacing Raft's preemptible leadership with a non-preempting lease to improve throughput in the presence of partial partitions. 
15 | 1. Inefficient. The write path involves a TON of copying. We are in the process of designing a much more efficient buffer management system. 16 | 1. Buggy. Client code is a complete hack that only occasionally works. We have a simulator in place for teasing out bugs in the state machine, but we haven't used it for simulating common datacenter conditions like partitions, delayed message arrival, node restarts/shutdowns/pauses, etc... 17 | 1. Undocumented. 18 | 1. Unpopular. No community and no production users (or, at least I hope nobody is using it yet!). 19 | 20 | ## Running 21 | 22 | ###### Run a test cluster 23 | 24 | ``` 25 | cargo build 26 | ./run.sh 27 | tail -f _rasputin_test/*log 28 | ``` 29 | 30 | ###### Run an individual server 31 | 32 | ``` 33 | target/debug/rasputind \ 34 | --peer-port=7777 \ 35 | --cli-port=8888 \ 36 | --seed-peers="127.0.0.1:7777" \ 37 | --storage-dir=/var/lib/rasputin/ \ 38 | --logfile=/var/log/rasputin.log 39 | ``` 40 | 41 | ###### Hit the cluster with a remote client! 
42 | 43 | Cargo.toml: 44 | 45 | ``` 46 | [dependencies] 47 | rasputin = "0.1.0" 48 | ``` 49 | 50 | Code: 51 | ```rust 52 | extern crate rasputin; 53 | 54 | fn main() { 55 | let peers = vec!["127.0.0.1:8888".parse().unwrap()]; 56 | let nthreads = 1; 57 | let mut cli = rasputin::Client::new(peers, nthreads); 58 | 59 | cli.set(b"k1", b"v1").unwrap(); 60 | assert!(cli.get(b"k1").unwrap().get_value() == b"v1"); 61 | 62 | // CAS returns the current value, and sets the success flag accordingly 63 | assert!(cli.cas(b"k1", b"v1", b"v12").unwrap().get_value() == b"v12"); 64 | assert!(cli.cas(b"k1", b"vNever", b"vNever2").unwrap().get_value() == b"v12"); 65 | assert!(cli.cas(b"k1", b"vNever", b"vNever2").unwrap().get_success() == false); 66 | assert!(cli.cas(b"k1", b"v12", b"v13").unwrap().get_value() == b"v13"); 67 | 68 | // deletes return the last value 69 | assert!(cli.del(b"k1").unwrap().get_value() == b"v13"); 70 | assert!(cli.get(b"k1").unwrap().get_success() == false); 71 | } 72 | ``` 73 | 74 | ## Planned Work 75 | 76 | ###### automatic lexicographic resharding 77 | 78 | Rasputin will utilize shard size and request density metrics to facilitate intelligent splitting. 79 | 80 | ###### several simple persistent collection types 81 | 82 | 1. kv: backed by RocksDB 83 | 2. log: Kafka-like sequential segment files 84 | 3. object: files on system VFS 85 | 86 | ###### interest semantics 87 | 88 | * subscribe: in-order mutation stream 89 | * watch: at most once mutation notification 90 | 91 | ###### replication modes (per-collection) 92 | 93 | 1. consensus: mutations block on replication to a quorum 94 | 2. async: mutations return quickly, and are replicated later 95 | 96 | ###### timeseries primitives 97 | 98 | * logarithmically bucketed histograms for efficient aggregation and consumption of extremely high velocity metrics, a la [loghisto](https://github.com/spacejam/loghisto). 
99 | 100 | ## roadmap 101 | - [x] mio event loops 102 | - [x] leader election 103 | - [x] rocksdb persistence layer 104 | - [x] log replication 105 | - [x] multipaxos consensus 106 | - [x] simple KV client operations 107 | - [ ] reconfigurable membership 108 | - [ ] range splitting 109 | - [ ] mesos framework 110 | - [ ] c/jvm/python/ruby/go client libs 111 | 112 | ## Appendix: The Harpoon Consensus Algorithm 113 | 114 | Because Rasputin aims to be as general purpose of a replication mechanism as possible, it needs to be resilient against partitions. We aim to reuse the parts of Raft that work for this as much as we can, and replace the leader election mechanism with a lease-based one that does not preempt in the presence of partial partitions. This obviously needs to be tested extensively, and to that end a comprehensive simulator is being built for testing the state machine (see test/cluster.rs), and fault injection tooling is being built for inducing realistic datacenter conditions on a non-simulated cluster. 115 | 116 | Raft is vulnerable to rapid leader churn when a partial partition exists between the leader and any other node. The partially partitioned node will fire its leader election timer and receive quorum. Because the old leader can't talk to this new leader, it will do the same. Leadership bounces a lot and we have suboptimal throughput. Harpoon is essentially just Raft with a modified election mechanism: candidates and leaders request leases from all peers, extend leadership if they reach quorum, and abdicate if they do not reach a quorum of successful extension request responses by the end of their lease. This prevents leadership churn in scenarios where there is a partial partition, which is common over the open internet, for example. 117 | 118 | Harpoon has not yet been formally verified, but eventually we will adapt the Raft Coq proof for it. 
119 | 120 | -------------------------------------------------------------------------------- /doc/kmiyc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spacejam/rasputin/31249108bc073d2212f9812533a810b6652e2ebd/doc/kmiyc.png -------------------------------------------------------------------------------- /include/serialization.proto: -------------------------------------------------------------------------------- 1 | syntax = "proto2"; 2 | 3 | package rasputin; 4 | 5 | // 6 | // Client <-> rasputin server messages 7 | // 8 | message SetReq { 9 | required bytes key = 1; 10 | required bytes value = 2; 11 | } 12 | 13 | message SetRes { 14 | required bool success = 1; 15 | required uint64 txid = 2; 16 | optional string err = 3; 17 | } 18 | 19 | message GetReq { 20 | required bytes key = 1; 21 | } 22 | 23 | message GetRes { 24 | required bool success = 1; 25 | required uint64 txid = 2; 26 | optional bytes value = 3; 27 | optional string err = 4; 28 | } 29 | 30 | message CASReq { 31 | required bytes key = 1; 32 | optional bytes new_value = 2; 33 | optional bytes old_value = 3; 34 | } 35 | 36 | message CASRes { 37 | required bool success = 1; 38 | required uint64 txid = 2; 39 | optional bytes value = 3; 40 | optional string err = 4; 41 | } 42 | 43 | message DelReq { 44 | required bytes key = 1; 45 | } 46 | 47 | message DelRes { 48 | required bool success = 1; 49 | required uint64 txid = 2; 50 | required bytes value = 3; 51 | optional string err = 4; 52 | } 53 | 54 | message WatchReq { 55 | required bytes key = 1; 56 | required uint64 last_txid = 2; 57 | required bool recursive = 3; 58 | required bool historical = 4; 59 | } 60 | 61 | message WatchRes { 62 | required bool success = 1; 63 | repeated Mutation history = 2; 64 | optional string err = 3; 65 | } 66 | 67 | message RedirectRes { 68 | required bool success = 1; 69 | optional string address = 2; 70 | optional string err = 3; 71 | } 72 | 73 | // 
datatypes 74 | enum MutationType { 75 | KVSET = 1; 76 | KVCAS = 2; 77 | KVDEL = 3; 78 | } 79 | 80 | message Mutation { 81 | required MutationType type = 1; 82 | required Version version = 2; 83 | required bytes key = 3; 84 | optional bytes value = 4; 85 | optional bytes old_value = 5; 86 | } 87 | 88 | message Version { 89 | required uint64 txid = 1; 90 | required uint64 term = 2; 91 | } 92 | 93 | // client top-level API 94 | message CliReq { 95 | required uint64 req_id = 1; 96 | optional GetReq get = 2; 97 | optional SetReq set = 3; 98 | optional CASReq cas = 4; 99 | optional DelReq del = 5; 100 | optional WatchReq watch = 6; 101 | } 102 | 103 | message CliRes { 104 | required uint64 req_id = 1; 105 | optional GetRes get = 2; 106 | optional SetRes set = 3; 107 | optional CASRes cas = 4; 108 | optional DelRes del = 5; 109 | optional WatchRes watch = 6; 110 | optional RedirectRes redirect = 7; 111 | } 112 | 113 | // 114 | // Leadership 115 | // 116 | message VoteReq { 117 | required uint64 term = 1; 118 | required uint64 last_learned_term = 2; 119 | required uint64 last_learned_txid = 3; 120 | required uint64 last_accepted_term = 4; 121 | required uint64 last_accepted_txid = 5; 122 | } 123 | 124 | message VoteRes { 125 | required bool success = 1; 126 | required uint64 term = 2; 127 | } 128 | 129 | // 130 | // Replication 131 | // 132 | message Append { 133 | required uint64 from_txid = 1; 134 | required uint64 from_term = 2; 135 | repeated Mutation batch = 3; 136 | required uint64 last_learned_txid = 4; 137 | } 138 | 139 | message AppendRes { 140 | required bool accepted = 1; 141 | optional uint64 last_accepted_txid = 2; 142 | optional uint64 last_accepted_term = 3; 143 | } 144 | 145 | // server<->server top-level api 146 | message PeerMsg { 147 | required string srvid = 1; 148 | optional VoteReq vote_req = 2; 149 | optional VoteRes vote_res = 3; 150 | optional Append append = 4; 151 | optional AppendRes append_res = 5; 152 | } 153 | 
-------------------------------------------------------------------------------- /run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | peers="127.0.0.1:7770,127.0.0.1:7771,127.0.0.1:7772,127.0.0.1:7773,127.0.0.1:7774" 3 | for i in {0..4}; do 4 | mkdir -p _rasputin_test/$i/data 5 | RUST_BACKTRACE=1 target/debug/rasputind \ 6 | --peer-port=777$i \ 7 | --cli-port=888$i \ 8 | --seed-peers=$peers \ 9 | --storage-dir=_rasputin_test/$i/data \ 10 | --logfile=_rasputin_test/$i.log & 11 | done 12 | -------------------------------------------------------------------------------- /src/bin/rasputinc.rs: -------------------------------------------------------------------------------- 1 | extern crate rustc_serialize; 2 | extern crate docopt; 3 | #[macro_use] extern crate log; 4 | extern crate rasputin; 5 | 6 | use std::net::SocketAddr; 7 | use std::process; 8 | 9 | use rasputin::Client; 10 | use docopt::Docopt; 11 | 12 | static USAGE: &'static str = " 13 | rasputinc - client for rasputin. 14 | 15 | This program is the Rasputin DB command line client. 16 | 17 | Usage: 18 | rasputinc --help 19 | rasputinc [--peers=] [--get=] [--set=,] [--cas=,,] [--del=] 20 | 21 | Options: 22 | --help Show this help message. 23 | --peers= List of comma-delimited peers, e.g: 24 | foo.baz.com:8888,bar.baz.com:8888 25 | --get= Get the current value for , if set. 26 | --set= Set the key to . 27 | --cas= Attempt an atomic compare and swap. 28 | --del= Delete the current value for , if set. 
29 | "; 30 | 31 | fn main() { 32 | let args: Args = Docopt::new(USAGE) 33 | .and_then(|d| d.decode()) 34 | .unwrap_or_else(|e| e.exit()); 35 | 36 | let peers: Vec = args.flag_peers.unwrap_or("127.0.0.1:8888".to_string()) 37 | .split(",") 38 | .map(|s| s.parse().unwrap()) 39 | .collect(); 40 | 41 | let nthreads = 1; 42 | let mut cli = Client::new(peers, nthreads); 43 | 44 | args.flag_set.map(|kv: String| { 45 | let kvs: Vec<&str> = kv.splitn(2, ",").take(2).collect(); 46 | if kvs.len() != 2 { 47 | println!("{}", USAGE); 48 | process::exit(1); 49 | } 50 | let (k, v) = (kvs[0], kvs[1]); 51 | cli.set(k.as_bytes(), v.as_bytes()).unwrap(); 52 | }); 53 | } 54 | 55 | #[derive(Debug, RustcDecodable)] 56 | struct Args { 57 | flag_help: bool, 58 | flag_peers: Option, 59 | flag_set: Option, 60 | flag_get: Option, 61 | flag_cas: Option, 62 | flag_del: Option, 63 | } 64 | -------------------------------------------------------------------------------- /src/bin/rasputind.rs: -------------------------------------------------------------------------------- 1 | extern crate rustc_serialize; 2 | extern crate mio; 3 | extern crate docopt; 4 | #[macro_use] 5 | extern crate log; 6 | extern crate rasputin; 7 | 8 | use std::sync::mpsc::SendError; 9 | 10 | use log::LogLevel; 11 | use docopt::Docopt; 12 | 13 | use rasputin::server::{Server, Envelope}; 14 | use rasputin::RealClock; 15 | 16 | static USAGE: &'static str = " 17 | rasputin - HA transactional store with a focus on usability, stability and performance. 18 | 19 | This program is the Rasputin DB server process. 20 | 21 | Usage: 22 | rasputind --help 23 | rasputind [--cli-port=] [--peer-port=] [--seed-peers=] [--logfile=] [--storage-dir=] 24 | 25 | Options: 26 | --help Show this help message. 27 | --cli-port= Listening port for communication between servers. 28 | --peer-port= Listening port for communication with clients. 
29 | --seed-peers= List of comma-delimited initial peers, e.g: 30 | foo.baz.com:7777,bar.baz.com:7777 31 | --logfile= File to log output to instead of stdout. 32 | --storage-dir= Directory to store the persisted data in; defaults to /var/lib/rasputin 33 | "; 34 | 35 | fn main() { 36 | let args: Args = Docopt::new(USAGE) 37 | .and_then(|d| d.decode()) 38 | .unwrap_or_else(|e| e.exit()); 39 | 40 | rasputin::logging::init_logger(args.flag_logfile, LogLevel::Info).unwrap(); 41 | print_banner(); 42 | 43 | let peer_port: u16 = match args.flag_peer_port { 44 | Some(p) => p, 45 | None => 7770, 46 | }; 47 | 48 | let cli_port: u16 = match args.flag_cli_port { 49 | Some(p) => p, 50 | None => 8880, 51 | }; 52 | 53 | let storage_dir: String = match args.flag_storage_dir { 54 | Some(d) => d, 55 | None => "/var/lib/rasputin".to_string(), 56 | }; 57 | 58 | let seed_peers: Vec = args.flag_seed_peers 59 | .split(",") 60 | .map(|s| s.to_string()) 61 | .filter(|s| s != "") 62 | .collect(); 63 | 64 | Server::>> 65 | ::run(peer_port, cli_port, storage_dir, seed_peers); 66 | } 67 | 68 | #[derive(Debug, RustcDecodable)] 69 | struct Args { 70 | flag_help: bool, 71 | flag_cli_port: Option, 72 | flag_peer_port: Option, 73 | flag_seed_peers: String, 74 | flag_logfile: Option, 75 | flag_storage_dir: Option, 76 | } 77 | 78 | fn print_banner() { 79 | info!(" 80 | )xxxxx[:::::::::> 81 | ______ _______ _______ _____ _ _ _______ _____ __ _ 82 | |_____/ |_____| |______ |_____] | | | | | \\ | 83 | | \\_ | | ______| | |_____| | __|__ | \\_|"); 84 | } 85 | -------------------------------------------------------------------------------- /src/client/mod.rs: -------------------------------------------------------------------------------- 1 | use std::collections::BTreeMap; 2 | use std::io::{self, Error, ErrorKind}; 3 | use std::net::SocketAddr; 4 | use std::sync::mpsc::channel; 5 | 6 | use bytes::{Buf, ByteBuf}; 7 | use threadpool::ThreadPool; 8 | use protobuf::{self, Message}; 9 | use mio::{TryRead, 
TryWrite}; 10 | use mio::tcp::TcpStream; 11 | 12 | use {CliReq, CliRes, GetReq, GetRes, RangeBounds, RedirectRes, SetReq, 13 | SetRes, Version, CASReq, CASRes, DelReq, DelRes}; 14 | use codec::{self, Codec, Framed}; 15 | 16 | pub struct Client { 17 | servers: Vec, 18 | ranges: BTreeMap, 19 | pool: ThreadPool, 20 | req_counter: u64, 21 | } 22 | 23 | impl Client { 24 | pub fn new(servers: Vec, nthreads: usize) -> Client { 25 | Client { 26 | servers: servers, 27 | ranges: BTreeMap::new(), 28 | pool: ThreadPool::new(nthreads), 29 | req_counter: 0, 30 | } 31 | } 32 | 33 | fn get_id(&mut self) -> u64 { 34 | self.req_counter += 1; 35 | self.req_counter 36 | } 37 | 38 | pub fn set<'a>( 39 | &mut self, 40 | key: &'a [u8], 41 | value: &'a [u8] 42 | ) -> io::Result { 43 | 44 | let mut set = SetReq::new(); 45 | set.set_key(key.to_vec()); 46 | set.set_value(value.to_vec()); 47 | let mut req = CliReq::new(); 48 | req.set_set(set); 49 | req.set_req_id(self.get_id()); 50 | 51 | self.req(key.to_vec(), req).map(|cli_res| { 52 | let set_res = cli_res.get_set(); 53 | debug!("got response success: {} txid: {} err: {}", 54 | set_res.get_success(), 55 | set_res.get_txid(), 56 | set_res.get_err()); 57 | cli_res.get_set().clone() 58 | }) 59 | } 60 | 61 | pub fn get<'a>( 62 | &mut self, 63 | key: &'a [u8], 64 | ) -> io::Result { 65 | 66 | let mut get = GetReq::new(); 67 | get.set_key(key.to_vec()); 68 | let mut req = CliReq::new(); 69 | req.set_get(get); 70 | req.set_req_id(self.get_id()); 71 | 72 | self.req(key.to_vec(), req).map(|cli_res| { 73 | let get_res = cli_res.get_get(); 74 | debug!("got response success: {} txid: {} err: {}", 75 | get_res.get_success(), 76 | get_res.get_txid(), 77 | get_res.get_err()); 78 | cli_res.get_get().clone() 79 | }) 80 | } 81 | 82 | pub fn cas<'a>( 83 | &mut self, 84 | key: &'a [u8], 85 | old_value: &'a [u8], 86 | new_value: &'a [u8] 87 | ) -> io::Result { 88 | 89 | let mut cas = CASReq::new(); 90 | cas.set_key(key.to_vec()); 91 | 
cas.set_old_value(old_value.to_vec()); 92 | cas.set_new_value(new_value.to_vec()); 93 | let mut req = CliReq::new(); 94 | req.set_cas(cas); 95 | req.set_req_id(self.get_id()); 96 | 97 | self.req(key.to_vec(), req).map(|cli_res| { 98 | let cas_res = cli_res.get_cas(); 99 | debug!("got response success: {} txid: {} err: {}", 100 | cas_res.get_success(), 101 | cas_res.get_txid(), 102 | cas_res.get_err()); 103 | cli_res.get_cas().clone() 104 | }) 105 | } 106 | 107 | pub fn del<'a>( 108 | &mut self, 109 | key: &'a [u8], 110 | ) -> io::Result { 111 | 112 | let mut del = DelReq::new(); 113 | del.set_key(key.to_vec()); 114 | let mut req = CliReq::new(); 115 | req.set_del(del); 116 | req.set_req_id(self.get_id()); 117 | 118 | self.req(key.to_vec(), req).map(|cli_res| { 119 | let del_res = cli_res.get_del(); 120 | debug!("got response success: {} txid: {} err: {}", 121 | del_res.get_success(), 122 | del_res.get_txid(), 123 | del_res.get_err()); 124 | cli_res.get_del().clone() 125 | }) 126 | } 127 | 128 | fn req(&mut self, key: Vec, req: CliReq) -> io::Result { 129 | // send to a peer, they'll redirect us if we're wrong 130 | for peer in self.servers.iter() { 131 | debug!("trying peer {:?}", peer); 132 | let mut stream_attempt = TcpStream::connect(&peer); 133 | if stream_attempt.is_err() { 134 | continue; 135 | } 136 | 137 | let mut stream = stream_attempt.unwrap(); 138 | let mut codec = Framed::new(); 139 | let mut msg = 140 | codec.encode(ByteBuf::from_slice(&*req.write_to_bytes() 141 | .unwrap())); 142 | 143 | if send_to(&mut stream, &mut msg).is_err() { 144 | debug!("could not send"); 145 | continue; 146 | } 147 | match recv_into(&mut stream, &mut codec) { 148 | Ok(res_buf) => { 149 | let res: &[u8] = res_buf.bytes(); 150 | let cli_res: CliRes = protobuf::parse_from_bytes(res) 151 | .unwrap(); 152 | if cli_res.has_redirect() { 153 | debug!("we got redirect to {}!", 154 | cli_res.get_redirect().get_address()); 155 | // TODO(tyler) try redirected host next 156 | continue; 
157 | } 158 | return Ok(cli_res); 159 | } 160 | Err(e) => { 161 | debug!("got err on recv_into: {}", e); 162 | continue; 163 | } 164 | } 165 | } 166 | Err(Error::new(ErrorKind::Other, "unable to reach any servers!")) 167 | } 168 | } 169 | 170 | fn send_to(stream: &mut TcpStream, buf: &mut ByteBuf) -> io::Result<()> { 171 | loop { 172 | match stream.try_write_buf(buf) { 173 | Ok(None) => { 174 | continue; 175 | } 176 | Ok(Some(r)) => { 177 | if buf.remaining() == 0 { 178 | return Ok(()); 179 | } 180 | } 181 | Err(e) => { 182 | match e.raw_os_error() { 183 | Some(32) => { 184 | debug!("client disconnected"); 185 | } 186 | Some(e) => 187 | debug!("not implemented; client os err={:?}", e), 188 | _ => debug!("not implemented; client err={:?}", e), 189 | }; 190 | // Don't reregister. 191 | return Err(e); 192 | } 193 | } 194 | } 195 | } 196 | 197 | fn recv_into(stream: &mut TcpStream, 198 | codec: &mut Codec) 199 | -> io::Result { 200 | loop { 201 | let mut res_buf = ByteBuf::mut_with_capacity(1024); 202 | match stream.try_read_buf(&mut res_buf) { 203 | Ok(None) => { 204 | //debug!("got readable, but can't read from the socket"); 205 | } 206 | Ok(Some(r)) => { 207 | //debug!("CONN : we read {} bytes!", r); 208 | } 209 | Err(e) => { 210 | debug!("not implemented; client err={:?}", e); 211 | } 212 | } 213 | let mut r: Vec = codec.decode(&mut res_buf.flip()); 214 | if r.len() == 1 { 215 | let res_buf = r.pop().unwrap(); 216 | return Ok(res_buf) 217 | } 218 | } 219 | } 220 | -------------------------------------------------------------------------------- /src/clock.rs: -------------------------------------------------------------------------------- 1 | use std::sync::RwLock; 2 | use std::thread; 3 | 4 | use time; 5 | 6 | pub trait Clock { 7 | fn now(&self) -> time::Timespec; 8 | fn sleep_ms(&self, ms: u32); 9 | } 10 | 11 | pub struct RealClock; 12 | 13 | unsafe impl Sync for RealClock{} 14 | 15 | impl Clock for RealClock { 16 | fn now(&self) -> time::Timespec { 17 | 
time::now().to_timespec() 18 | } 19 | 20 | fn sleep_ms(&self, ms: u32) { 21 | thread::sleep_ms(ms) 22 | } 23 | } 24 | 25 | pub struct TestClock { 26 | inner: RwLock, 27 | } 28 | 29 | impl TestClock { 30 | pub fn new() -> TestClock { 31 | TestClock { inner: RwLock::new(time::Timespec { sec: 0, nsec: 0 }) } 32 | } 33 | } 34 | 35 | impl Clock for TestClock { 36 | fn now(&self) -> time::Timespec { 37 | let inner = self.inner.read().unwrap(); 38 | *inner 39 | } 40 | 41 | fn sleep_ms(&self, ms: u32) { 42 | let mut inner = self.inner.write().unwrap(); 43 | let ns = (ms % 1e6 as u32) * 1e6 as u32; 44 | inner.nsec += ns as i32; 45 | if inner.nsec > 1e9 as i32 { 46 | inner.sec += (inner.nsec / 1e9 as i32) as i64; 47 | inner.nsec = (inner.nsec % 1e9 as i32) as i32; 48 | } 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /src/codec.rs: -------------------------------------------------------------------------------- 1 | use std::ops::Add; 2 | 3 | use bytes::{Buf, ByteBuf, MutBuf, MutByteBuf, alloc}; 4 | use mio::{TryRead, TryWrite}; 5 | 6 | pub trait Codec 7 | { 8 | fn decode(&mut self, buf: &mut In) -> Vec; 9 | fn encode(&self, a: Out) -> In; 10 | } 11 | 12 | pub struct CodecStack { 13 | left: Box>, 14 | right: Box>, 15 | } 16 | 17 | impl Codec for CodecStack { 18 | fn decode(&mut self, buf: &mut In) -> Vec { 19 | self.left 20 | .decode(buf) 21 | .iter_mut() 22 | .flat_map(|mut d| self.right.decode(&mut d)) 23 | .collect() 24 | } 25 | 26 | fn encode(&self, out: Out) -> In { 27 | self.left.encode(self.right.encode(out)) 28 | } 29 | } 30 | 31 | pub struct Framed { 32 | sz_buf: MutByteBuf, 33 | msg: Option, 34 | } 35 | 36 | impl Framed { 37 | pub fn new() -> Framed { 38 | Framed { 39 | sz_buf: ByteBuf::mut_with_capacity(4), 40 | msg: None, 41 | } 42 | } 43 | } 44 | 45 | impl Codec for Framed { 46 | 47 | fn decode(&mut self, buf: &mut ByteBuf) -> Vec { 48 | let mut res = vec![]; 49 | loop { 50 | // read size if we don't have a 
message yet
            if self.msg.is_none() {
                // pull up to 4 header bytes into sz_buf; result intentionally
                // unchecked — progress is judged by sz_buf.remaining() below
                let sz_read = buf.try_read_buf(&mut self.sz_buf);
                // if we've read 4 bytes for the size, create a msg
                if self.sz_buf.remaining() != 0 {
                    // partial header: wait for more input
                    break;
                }

                // header complete: decode the big-endian payload length
                let sz_buf = self.sz_buf.bytes();
                let size = array_to_usize([sz_buf[0], sz_buf[1], sz_buf[2],
                                           sz_buf[3]]);
                self.msg = unsafe {
                    // manually create bytebuf so we can have exact cap and lim
                    Some(ByteBuf::from_mem_ref(alloc::heap(size.next_power_of_two()),
                                               size as u32, // cap
                                               0, // pos
                                               size as u32 /* lim */)
                             .flip())
                };
            }

            if self.msg.is_none() {
                break;
            }

            let mut msg = self.msg.take().unwrap();

            // read actual message
            match buf.try_read_buf(&mut msg) {
                Ok(Some(read)) => {
                    // if we're done, return our Item
                    if msg.remaining() == 0 {
                        // get ready to read a new size
                        self.sz_buf.clear();
                        // return the message
                        res.push(msg.flip())
                    } else {
                        // partial body: stash it and wait for more input
                        self.msg = Some(msg);
                        break
                    }
                }
                // no bytes available (or error): stop decoding for now
                _ => break,
            }
        }
        res
    }

    /// Frames `item` by prefixing its length as a 4-byte big-endian header.
    fn encode(&self, item: ByteBuf) -> ByteBuf {
        let b = item.bytes();
        let mut res = ByteBuf::mut_with_capacity(4 + b.len());
        assert!(res.write_slice(&usize_to_array(b.len())) == 4);
        assert!(res.write_slice(b) == b.len());
        res.flip()
    }
}

/// Encodes the low 32 bits of `u` as 4 big-endian bytes.
/// NOTE(review): silently truncates values above u32::MAX on 64-bit
/// platforms — callers are expected to frame messages smaller than 4 GiB.
pub fn usize_to_array(u: usize) -> [u8; 4] {
    [(u >> 24) as u8, (u >> 16) as u8, (u >> 8) as u8, u as u8]
}

/// Decodes 4 big-endian bytes into a usize (inverse of `usize_to_array`
/// for values that fit in 32 bits).
pub fn array_to_usize(ip: [u8; 4]) -> usize {
    ((ip[0] as usize) << 24) as usize + ((ip[1] as usize) << 16) as usize +
    ((ip[2] as usize) << 8) as usize + (ip[3] as usize)
}

#[cfg(test)]
mod tests {
    extern crate quickcheck;
    use rand::{Rng, thread_rng};

    use codec;
    use codec::Codec;
    use bytes::{Buf, ByteBuf, MutByteBuf};

    // round-trip property: decoding an encoded header recovers the value
    fn array_prop(u: usize) -> bool {
codec::array_to_usize(codec::usize_to_array(u)) == u 126 | } 127 | 128 | #[test] 129 | fn test_usize_to_array_to_usize() { 130 | quickcheck::quickcheck(array_prop as fn(usize) -> bool); 131 | let ip = [250, 1, 2, 3]; 132 | assert!(codec::usize_to_array(codec::array_to_usize(ip)) == ip); 133 | } 134 | 135 | fn framed_prop(sz: usize) -> bool { 136 | if sz == 0 { 137 | // TODO(tyler) currently, feeding an empty slice to 138 | // ByteBuf::from_slice causes a segfault... 139 | return true; 140 | } 141 | let mut rng = thread_rng(); 142 | let mut v: Vec = rng.gen_iter::().take(sz).collect(); 143 | let mut c = codec::Framed::new(); 144 | let mut bytes = ByteBuf::from_slice(&*v); 145 | let mut encoded = c.encode(bytes); 146 | c.decode(&mut encoded).len() == 1 147 | } 148 | 149 | #[test] 150 | fn test_framed_codec() { 151 | quickcheck::quickcheck(framed_prop as fn(usize) -> bool); 152 | } 153 | } 154 | -------------------------------------------------------------------------------- /src/etc/loadtest.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bufio" 5 | "fmt" 6 | "net" 7 | "runtime" 8 | 9 | "github.com/spacejam/loghisto" 10 | ) 11 | 12 | func benchmark(conn net.Conn) { 13 | fmt.Fprintf(conn, "\x00\x00\x00\x03yo\n") 14 | r, err := bufio.NewReader(conn).ReadString('\n') 15 | if err != nil { 16 | fmt.Errorf("could not read response: %v", err) 17 | return 18 | } 19 | if r != "\x00\x00\x00\x03yo\n" { 20 | fmt.Println("bad response") 21 | } 22 | } 23 | 24 | func main() { 25 | numCPU := runtime.NumCPU() 26 | runtime.GOMAXPROCS(numCPU) 27 | 28 | fire := make(chan struct{}) 29 | for i := 0; i < 8; i++ { 30 | go func() { 31 | conn, err := net.Dial("tcp", "localhost:8880") 32 | if err != nil { 33 | fmt.Errorf("could not connect: %v", err) 34 | return 35 | } 36 | for { 37 | <-fire 38 | benchmark(conn) 39 | } 40 | }() 41 | } 42 | 43 | desiredConcurrency := uint(10) 44 | 
loghisto.PrintBenchmark("benchmark1234", desiredConcurrency, func() { fire <- struct{}{} }) 45 | } 46 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | #![crate_id = "rasputin"] 2 | #![crate_type = "lib"] 3 | 4 | pub use serialization::{Append, AppendRes, CASReq, CASRes, CliReq, CliRes, 5 | GetReq, GetRes, Mutation, MutationType, PeerMsg, 6 | RedirectRes, SetReq, SetRes, Version, VoteReq, VoteRes, 7 | WatchReq, WatchRes, DelReq, DelRes}; 8 | 9 | pub use codec::{Codec, Framed}; 10 | 11 | pub use clock::{Clock, RealClock, TestClock}; 12 | 13 | pub use range_bounds::RangeBounds; 14 | 15 | pub use client::Client; 16 | 17 | pub mod client; 18 | pub mod clock; 19 | pub mod codec; 20 | pub mod logging; 21 | pub mod range_bounds; 22 | pub mod serialization; 23 | pub mod server; 24 | 25 | extern crate bytes; 26 | #[macro_use] 27 | extern crate log; 28 | #[macro_use] 29 | extern crate lazy_static; 30 | extern crate mio; 31 | extern crate protobuf; 32 | extern crate rand; 33 | extern crate rocksdb; 34 | extern crate time; 35 | extern crate uuid; 36 | extern crate threadpool; 37 | -------------------------------------------------------------------------------- /src/logging/mod.rs: -------------------------------------------------------------------------------- 1 | use std::fs::{self, File, OpenOptions}; 2 | use std::io::{Error, ErrorKind}; 3 | use std::io::prelude::Write; 4 | use std::path::Path; 5 | use std::sync::{Arc, Mutex}; 6 | 7 | use log::{self, LogLevel, LogLevelFilter, LogMetadata, LogRecord, 8 | SetLoggerError}; 9 | use time; 10 | 11 | struct StdoutLogger { 12 | level: LogLevel, 13 | } 14 | 15 | impl log::Log for StdoutLogger { 16 | fn enabled(&self, metadata: &LogMetadata) -> bool { 17 | metadata.level() <= self.level 18 | } 19 | 20 | fn log(&self, record: &LogRecord) { 21 | if self.enabled(record.metadata()) { 22 | println!("{} {} 
{}:{}] {}", 23 | record.level(), 24 | time::now().to_timespec().sec, // TODO(tyler) logical clock 25 | record.location().file().split("/").last().unwrap(), 26 | record.location().line(), 27 | record.args()); 28 | } 29 | } 30 | } 31 | 32 | struct FileLogger { 33 | file: Arc>, 34 | level: LogLevel, 35 | } 36 | 37 | impl FileLogger { 38 | pub fn new(path: &str, level: LogLevel) -> Result { 39 | let ospath = Path::new(path).parent(); 40 | if ospath.is_none() { 41 | return Err(Error::new(ErrorKind::Other, 42 | format!("Failed to use log directory: {}", 43 | path))); 44 | } 45 | 46 | match fs::create_dir_all(&ospath.unwrap()) { 47 | Err(e) => return Err(Error::new(ErrorKind::Other, 48 | format!("Failed to create log \ 49 | directory: {}", 50 | e))), 51 | Ok(_) => (), 52 | } 53 | 54 | OpenOptions::new() 55 | .create(true) 56 | .write(true) 57 | .append(true) 58 | .open(path) 59 | .map(|file| { 60 | FileLogger { 61 | file: Arc::new(Mutex::new(file)), 62 | level: level, 63 | } 64 | }) 65 | } 66 | } 67 | 68 | impl log::Log for FileLogger { 69 | fn enabled(&self, metadata: &LogMetadata) -> bool { 70 | metadata.level() <= self.level 71 | } 72 | 73 | fn log(&self, record: &LogRecord) { 74 | if self.enabled(record.metadata()) { 75 | let mut logfile = self.file.clone(); 76 | logfile.lock() 77 | .unwrap() 78 | .write_all(format!("{} {} {}:{}] {}\n", 79 | record.level(), 80 | time::now().to_timespec().sec, 81 | record.location() 82 | .file() 83 | .split("/") 84 | .last() 85 | .unwrap(), 86 | record.location().line(), 87 | record.args()) 88 | .as_bytes()); 89 | } 90 | } 91 | } 92 | 93 | pub fn init_logger(path: Option, 94 | level: LogLevel) 95 | -> Result<(), SetLoggerError> { 96 | let logger: Box = match path { 97 | Some(p) => Box::new(FileLogger::new(p.trim_left(), level).unwrap()), 98 | None => Box::new(StdoutLogger { level: level }), 99 | }; 100 | 101 | log::set_logger(|max_log_level| { 102 | max_log_level.set(LogLevelFilter::Debug); 103 | logger 104 | }) 105 | } 106 | 
-------------------------------------------------------------------------------- /src/range_bounds.rs: --------------------------------------------------------------------------------
use std::cmp::Ordering;

/// A non-empty interval `[lower, upper)` over the byte keyspace.
///
/// Ordering treats *overlapping* ranges as `Equal`, which lets a
/// `BTreeMap` keyed by `RangeBounds` be probed with any range that falls
/// inside a stored one.
pub struct RangeBounds {
    lower: Vec<u8>,
    upper: Vec<u8>,
}

impl RangeBounds {
    /// Creates a range, rejecting empty/inverted bounds with `Err`.
    pub fn new(lower: Vec<u8>, upper: Vec<u8>) -> Result<RangeBounds, String> {
        if lower >= upper {
            Err("lower is >= upper, which is incorrect!".to_string())
        } else {
            Ok(RangeBounds {
                lower: lower,
                upper: upper,
            })
        }
    }
}

impl Ord for RangeBounds {
    // Overlap compares Equal (see type-level docs); disjoint ranges order
    // by position in the keyspace.
    fn cmp(&self, other: &Self) -> Ordering {
        assert!(self.upper > self.lower);
        assert!(other.upper > other.lower);
        if self.upper <= other.lower {
            Ordering::Less
        } else if self.lower >= other.upper {
            Ordering::Greater
        } else {
            Ordering::Equal
        }
    }
}

impl PartialOrd for RangeBounds {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(self.cmp(other))
    }
}

impl PartialEq for RangeBounds {
    // BUGFIX: previously compared (lower, upper) for exact equality, which
    // disagreed with `cmp` (overlapping ranges were Equal under Ord but not
    // under PartialEq), violating the Ord contract required by ordered
    // collections. Delegate to `cmp` so the two always agree.
    fn eq(&self, other: &Self) -> bool {
        self.cmp(other) == Ordering::Equal
    }
}

impl Eq for RangeBounds { }
-------------------------------------------------------------------------------- /src/server/acked_log.rs: --------------------------------------------------------------------------------
pub use server::{PeerID, TXID, Term};
use std::fmt;

use std::collections::BTreeMap;

/// A replication log whose entries move from "accepted" (pending) to
/// "learned" (committed) once acknowledged by a quorum.
pub trait AckedLog<T> {
    fn append(&mut self, term: Term, txid: TXID, entry: T);
    fn get(&self, txid: TXID) -> Option<T>;
    fn ack_up_to(&mut self, txid: TXID, peer: PeerID) -> Vec<(Term, TXID)>;
    fn commit_up_to(&mut self, txid: TXID) -> Vec<(Term, TXID)>;
    fn last_learned_term(&self) -> Term;
    fn last_learned_txid(&self) -> TXID;
    fn last_accepted_term(&self) -> Term;
    fn last_accepted_txid(&self) -> TXID;
}

// This should
be used for testing and debugging only. 18 | pub trait ViewableLog { 19 | fn acked(&self) -> Vec<(Term, TXID)>; 20 | fn learned(&self) -> Vec<(Term, TXID)>; 21 | } 22 | 23 | impl fmt::Debug for AckedLog + Send { 24 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 25 | write!(f, 26 | "(lt: {} lx: {} at: {} ax: {})", 27 | self.last_learned_term(), 28 | self.last_learned_txid(), 29 | self.last_accepted_term(), 30 | self.last_accepted_txid()) 31 | } 32 | } 33 | 34 | #[derive(Debug)] 35 | pub struct LogEntry { 36 | txid: TXID, 37 | term: Term, 38 | last_txid: TXID, 39 | last_term: Term, 40 | entry: T, 41 | } 42 | 43 | #[derive(Debug)] 44 | pub struct Acked { 45 | acks: Vec, 46 | inner: T, 47 | } 48 | 49 | // Leaders and Followers have an AckedLog for handling replication. 50 | // Leaders have quorums of cluster_sz / 2 + 1, and Followers have 51 | // a quorum of 1 (need a single subsequent ack from leader) 52 | #[derive(Debug)] 53 | pub struct InMemoryLog { 54 | pub pending: BTreeMap>>, 55 | pub committed: BTreeMap>, 56 | pub quorum: usize, 57 | pub last_learned_txid: TXID, 58 | pub last_learned_term: Term, 59 | pub last_accepted_txid: TXID, 60 | pub last_accepted_term: Term, 61 | } 62 | 63 | unsafe impl Sync for InMemoryLog{} 64 | 65 | impl AckedLog for InMemoryLog { 66 | fn append(&mut self, term: Term, txid: TXID, entry: T) { 67 | //TODO verify txid > last accepted 68 | assert!(txid > self.last_accepted_txid); 69 | self.pending.insert(txid, 70 | Acked { 71 | acks: vec![], 72 | inner: LogEntry { 73 | txid: txid, 74 | term: term, 75 | last_txid: self.last_accepted_txid, 76 | last_term: self.last_accepted_term, 77 | entry: entry, 78 | }, 79 | }); 80 | self.last_accepted_txid = txid; 81 | self.last_accepted_term = term; 82 | } 83 | 84 | fn get(&self, txid: TXID) -> Option { 85 | self.pending 86 | .get(&txid) 87 | .map(|al| al.inner.entry.clone()) 88 | .or(self.committed.get(&txid).map(|l| l.entry.clone())) 89 | } 90 | 91 | // Used by leaders to know when they've 
gotten enough acks. 92 | // returns a set of txid's that have reached quorum 93 | fn ack_up_to(&mut self, txid: TXID, peer: PeerID) -> Vec<(Term, TXID)> { 94 | // append ack 95 | for (txid, ent) in self.pending.iter_mut() { 96 | if ent.inner.txid <= *txid { 97 | if !ent.acks.contains(&peer) { 98 | ent.acks.push(peer) 99 | } 100 | break 101 | } 102 | } 103 | let mut reached_quorum = vec![]; 104 | loop { 105 | if self.pending.len() == 0 { 106 | break; 107 | } 108 | let txid = self.pending.keys().cloned().next().unwrap(); 109 | if self.pending.get(&txid).unwrap().acks.len() < self.quorum { 110 | break; 111 | } 112 | // TODO(tyler) work out persistence story so we don't lose 113 | // logs during server crash between remove and push. 114 | let ent = self.pending.remove(&txid).unwrap(); 115 | self.last_learned_term = ent.inner.term; 116 | self.last_learned_txid = ent.inner.txid; 117 | reached_quorum.push((ent.inner.term, ent.inner.txid)); 118 | self.committed.insert(txid, ent.inner); 119 | } 120 | reached_quorum 121 | } 122 | 123 | // Used by followers to commit where the leader told them they should 124 | // be learning up to. 125 | // returns the set of txids that have reached quorum 126 | fn commit_up_to(&mut self, txid: TXID) -> Vec<(Term, TXID)> { 127 | let mut reached_quorum = vec![]; 128 | loop { 129 | if self.pending.len() == 0 { 130 | break; 131 | } 132 | let next_txid = self.pending.keys().cloned().next().unwrap(); 133 | if next_txid > txid { 134 | break; 135 | } 136 | let ent = self.pending.remove(&next_txid).unwrap(); 137 | 138 | // TODO(tyler) work out persistence story so we don't lose 139 | // logs during server crash between remove and push. 
140 | self.last_learned_term = ent.inner.term; 141 | self.last_learned_txid = ent.inner.txid; 142 | reached_quorum.push((ent.inner.term, ent.inner.txid)); 143 | self.committed.insert(txid, ent.inner); 144 | } 145 | reached_quorum 146 | } 147 | 148 | fn last_learned_term(&self) -> Term { 149 | self.last_learned_term 150 | } 151 | 152 | fn last_learned_txid(&self) -> TXID { 153 | self.last_learned_txid 154 | } 155 | 156 | fn last_accepted_term(&self) -> Term { 157 | self.last_accepted_term 158 | } 159 | 160 | fn last_accepted_txid(&self) -> TXID { 161 | self.last_accepted_txid 162 | } 163 | } 164 | 165 | impl ViewableLog for InMemoryLog>> { 166 | fn acked(&self) -> Vec<(Term, TXID)> { 167 | let mut ret = vec![]; 168 | for (txid, acked) in self.pending.iter() { 169 | ret.push((acked.inner.term, *txid)); 170 | } 171 | ret 172 | } 173 | 174 | fn learned(&self) -> Vec<(Term, TXID)> { 175 | let mut ret = vec![]; 176 | for (txid, learned) in self.committed.iter() { 177 | ret.push((learned.term, *txid)); 178 | } 179 | ret 180 | } 181 | } 182 | -------------------------------------------------------------------------------- /src/server/connset.rs: -------------------------------------------------------------------------------- 1 | use std::io::{Error, ErrorKind}; 2 | use std::io; 3 | use std::sync::mpsc::Sender; 4 | 5 | use mio; 6 | use mio::{EventLoop, EventSet, PollOpt, Token}; 7 | use mio::tcp::{TcpListener, TcpStream}; 8 | use mio::util::Slab; 9 | 10 | use server::Envelope; 11 | use server::server_conn::ServerConn; 12 | use server::traffic_cop::TrafficCop; 13 | 14 | pub struct ConnSet { 15 | pub srv_sock: TcpListener, 16 | pub srv_token: Token, 17 | pub conns: Slab, 18 | pub req_tx: Sender, 19 | } 20 | 21 | impl ConnSet { 22 | pub fn accept(&mut self, 23 | event_loop: &mut EventLoop) 24 | -> io::Result<()> { 25 | 26 | debug!("ConnSet accepting socket"); 27 | 28 | let sock = try!(self.srv_sock.accept()); 29 | self.register(sock.unwrap(), event_loop).map(|_| ()) 30 | } 31 
| 32 | pub fn register(&mut self, 33 | sock: TcpStream, 34 | event_loop: &mut EventLoop) 35 | -> io::Result { 36 | 37 | let conn = ServerConn::new(sock, self.req_tx.clone()); 38 | 39 | // Re-register accepting socket 40 | event_loop.reregister(&self.srv_sock, 41 | self.srv_token, 42 | EventSet::readable(), 43 | PollOpt::edge() | PollOpt::oneshot()); 44 | 45 | self.conns 46 | .insert(conn) 47 | .map(|tok| { 48 | // Register the connection 49 | self.conns[tok].token = Some(tok); 50 | event_loop.register_opt(&self.conns[tok].sock, 51 | tok, 52 | EventSet::readable(), 53 | PollOpt::edge() | PollOpt::oneshot()) 54 | .ok() 55 | .expect("could not register socket with event loop"); 56 | tok 57 | }) 58 | .or_else(|e| { 59 | Err(Error::new(ErrorKind::Other, "All connection slots full.")) 60 | }) 61 | } 62 | 63 | pub fn conn_readable(&mut self, 64 | event_loop: &mut EventLoop, 65 | tok: Token) 66 | -> io::Result<()> { 67 | 68 | debug!("ConnSet conn readable; tok={:?}", tok); 69 | if !self.conns.contains(tok) { 70 | debug!("got conn_readable for non-existent token!"); 71 | return Ok(()); 72 | } 73 | 74 | self.conn(tok).readable(event_loop) 75 | } 76 | 77 | pub fn conn_writable(&mut self, 78 | event_loop: &mut EventLoop, 79 | tok: Token) 80 | -> io::Result<()> { 81 | if !self.conns.contains(tok) { 82 | debug!("got conn_writable for non-existent token!"); 83 | return Ok(()); 84 | } 85 | 86 | debug!("ConnSet conn writable; tok={:?}", tok); 87 | match self.conn(tok).writable(event_loop) { 88 | Err(e) => { 89 | debug!("got err in ConnSet conn_writable: {}", e); 90 | Err(e) 91 | } 92 | w => w, 93 | } 94 | } 95 | 96 | fn conn<'a>(&'a mut self, tok: Token) -> &'a mut ServerConn { 97 | &mut self.conns[tok] 98 | } 99 | } 100 | -------------------------------------------------------------------------------- /src/server/mod.rs: -------------------------------------------------------------------------------- 1 | mod server; 2 | mod connset; 3 | mod server_conn; 4 | mod traffic_cop; 5 | 
// src/server/mod.rs (continued): submodule declarations, public re-exports,
// and the shared vocabulary of the consensus implementation - reserved
// Tokens for the two listening sockets plus a broadcast sentinel
// (Token(usize::MAX)), the TXID/Term/PeerID aliases, the Envelope message
// wrapper, the SendChannel abstraction over mio and mpsc senders, and the
// Peer/RepPeer/State types used by server.rs.
mod acked_log; 6 | pub mod rocksdb; 7 | 8 | pub use server::server::Server; 9 | pub use server::connset::ConnSet; 10 | pub use server::server_conn::ServerConn; 11 | pub use server::acked_log::{AckedLog, InMemoryLog, LogEntry}; 12 | 13 | use std::io::{Error, ErrorKind}; 14 | use std::io; 15 | use std::net::SocketAddr; 16 | use std::ops::{Add, Sub}; 17 | use std::sync::{Arc, Mutex}; 18 | use std::sync::mpsc::{self, Receiver, SendError, Sender}; 19 | use std::thread; 20 | use std::usize; 21 | 22 | use bytes::{Buf, ByteBuf, MutByteBuf, SliceBuf, alloc}; 23 | use mio; 24 | use mio::{EventLoop, EventSet, Handler, NotifyError, PollOpt, Token, TryRead, 25 | TryWrite}; 26 | use mio::tcp::{TcpListener, TcpSocket, TcpStream}; 27 | use mio::util::Slab; 28 | use rand::{Rng, thread_rng}; 29 | use rocksdb::{DB, Writable}; 30 | use protobuf; 31 | use protobuf::Message; 32 | use time; 33 | 34 | pub const SERVER_CLIENTS: Token = Token(0); 35 | pub const SERVER_PEERS: Token = Token(1); 36 | pub const PEER_BROADCAST: Token = Token(usize::MAX); 37 | 38 | lazy_static!
// Leader lease constants: a lease lasts 12 seconds and refresh is
// attempted when within 6 seconds of expiry. Envelope carries one wire
// message plus its source/destination token; its manual Clone exists
// because ByteBuf is not Clone (the payload is re-sliced). State is the
// per-node consensus state machine: Leader/Candidate track collected vote
// tokens (`have`) against the quorum size (`need`) and a lease deadline
// (`until`); Follower remembers the leader's identity, token, address and
// lease; Init is the neutral boot state.
// NOTE(review): generic parameters were stripped by this dump (e.g.
// `pub address: Option,` - presumably Option<SocketAddr>; `have: Vec,` -
// presumably Vec<Token>); confirm against the repository.
{ 39 | pub static ref LEADER_DURATION: time::Duration = 40 | time::Duration::seconds(12); 41 | pub static ref LEADER_REFRESH: time::Duration = 42 | time::Duration::seconds(6); 43 | } 44 | 45 | pub type TXID = u64; 46 | pub type Term = u64; 47 | pub type PeerID = String; 48 | 49 | pub struct Envelope { 50 | pub address: Option, 51 | pub tok: Token, 52 | pub msg: ByteBuf, 53 | } 54 | 55 | impl Clone for Envelope { 56 | fn clone(&self) -> Self { 57 | Envelope { 58 | address: self.address, 59 | tok: self.tok, 60 | msg: ByteBuf::from_slice(self.msg.bytes()), 61 | } 62 | } 63 | } 64 | 65 | pub trait SendChannel { 66 | fn send_msg(&self, msg: M) -> E; 67 | } 68 | 69 | impl SendChannel>> for mio::Sender { 70 | fn send_msg(&self, msg: M) -> Result<(), NotifyError> { 71 | self.send(msg) 72 | } 73 | } 74 | 75 | impl SendChannel>> for Sender { 76 | fn send_msg(&self, msg: M) -> Result<(), SendError> { 77 | self.send(msg) 78 | } 79 | } 80 | 81 | #[derive(Debug, PartialEq, Clone)] 82 | pub struct Peer { 83 | addr: SocketAddr, 84 | sock: Option, 85 | } 86 | 87 | #[derive(Debug, PartialEq)] 88 | pub struct RepPeer { 89 | last_accepted_term: Term, 90 | last_accepted_txid: TXID, 91 | max_sent_txid: TXID, 92 | tok: Token, 93 | id: PeerID, 94 | addr: Option, 95 | } 96 | 97 | #[derive(Debug, Clone)] 98 | pub enum State { 99 | Leader { 100 | term: Term, 101 | have: Vec, 102 | need: u8, 103 | until: time::Timespec, 104 | }, 105 | Candidate { 106 | term: Term, 107 | have: Vec, 108 | need: u8, 109 | until: time::Timespec, 110 | }, 111 | Follower { 112 | term: Term, 113 | id: PeerID, 114 | tok: Token, 115 | leader_addr: SocketAddr, 116 | until: time::Timespec, 117 | }, 118 | Init, 119 | } 120 | 121 | impl State { 122 | fn valid_leader(&self, now: time::Timespec) -> bool { 123 | match *self { 124 | State::Leader{until: until, ..} => now < until, 125 | State::Follower{ 126 | term:_, id:_, leader_addr: _, until: until, tok: _ 127 | } => now < until, 128 | _ => false, 129 | } 130 | } 131 | 132
// impl State (continued): lease-aware predicates. valid_candidate/valid_leader
// compare `now` against the stored `until` deadline; is_* are shape checks;
// should_extend_leadership is true only inside the refresh window
// (within LEADER_REFRESH of expiry but before it); until()/term() project
// the common fields out of whichever variant is active.
// NOTE(review): `following` and `is_following` are duplicate logic (one
// compares `id == *fid`, the other `*lid == id`) and `following` binds an
// unused `until`; candidates for consolidation. Also `can_extend_lead`
// requires strictly MORE than `need` acks (`>`), while the ascension checks
// in server.rs use `>= need` - TODO confirm which quorum comparison is
// intended.
// Also inlined below: src/server/rocksdb.rs - opens the DB with column
// families "storage" and "local_meta", falling back to a plain DB::open
// plus create_cf on first run, and panics if initialization fails; then the
// import head of src/server/server.rs begins.
| fn valid_candidate(&self, now: time::Timespec) -> bool { 133 | match *self { 134 | State::Candidate{until: until, ..} => now < until, 135 | _ => false, 136 | } 137 | } 138 | 139 | pub fn is_leader(&self) -> bool { 140 | match *self { 141 | State::Leader{..} => true, 142 | _ => false, 143 | } 144 | } 145 | 146 | fn is_follower(&self) -> bool { 147 | match *self { 148 | State::Follower{..} => true, 149 | _ => false, 150 | } 151 | } 152 | 153 | fn is_following(&self, id: PeerID) -> bool { 154 | match *self { 155 | State::Follower{id: ref lid, .. } => *lid == id, 156 | _ => false, 157 | } 158 | } 159 | 160 | fn is_candidate(&self) -> bool { 161 | match *self { 162 | State::Candidate{..} => true, 163 | _ => false, 164 | } 165 | } 166 | 167 | fn should_extend_leadership(&self, now: time::Timespec) -> bool { 168 | match *self { 169 | State::Leader{until: until, ..} => { 170 | now.add(*LEADER_REFRESH) >= until && now < until 171 | } 172 | _ => false, 173 | } 174 | } 175 | 176 | fn can_extend_lead(&self) -> bool { 177 | match *self { 178 | State::Candidate{have: ref have, need: need, ..} => 179 | have.len() > need as usize, 180 | State::Leader{have: ref have, need: need, ..} => 181 | have.len() > need as usize, 182 | _ => false, 183 | } 184 | } 185 | 186 | fn following(&self, id: PeerID) -> bool { 187 | match *self { 188 | State::Follower{id: ref fid, until: until, .. } => id == *fid, 189 | _ => false, 190 | } 191 | } 192 | 193 | fn until(&self) -> Option { 194 | match *self { 195 | State::Leader{until: until, ..} => Some(until), 196 | State::Candidate{until: until, ..} => Some(until), 197 | State::Follower{ until: until, .. } => Some(until), 198 | _ => None, 199 | } 200 | } 201 | 202 | pub fn term(&self) -> Option { 203 | match *self { 204 | State::Leader{term: term, ..} => Some(term), 205 | State::Candidate{term: term, ..} => Some(term), 206 | State::Follower{term: term, ..
// (the dump continues mid-match; the Follower arm closes at the start of
// the next physical line)
} => Some(term), 207 | _ => None, 208 | } 209 | } 210 | } 211 | -------------------------------------------------------------------------------- /src/server/rocksdb.rs: -------------------------------------------------------------------------------- 1 | use rocksdb::{DB, Writable}; 2 | use rocksdb::Options as RocksDBOptions; 3 | 4 | pub fn new(storage_dir: String) -> DB { 5 | let mut opts = RocksDBOptions::new(); 6 | let memtable_budget = 1024; 7 | opts.optimize_level_style_compaction(memtable_budget); 8 | opts.create_if_missing(true); 9 | match DB::open_cf(&opts, &storage_dir, &["storage", "local_meta"]) { 10 | Ok(db) => db, 11 | Err(_) => { 12 | info!("Attempting to initialize data directory at {}", storage_dir); 13 | match DB::open(&opts, &storage_dir) { 14 | Ok(mut db) => { 15 | db.create_cf("storage", &RocksDBOptions::new()).unwrap(); 16 | db.create_cf("local_meta", &RocksDBOptions::new()).unwrap(); 17 | db 18 | } 19 | Err(e) => { 20 | error!("failed to create database at {}", storage_dir); 21 | error!("{}", e); 22 | panic!(e); 23 | } 24 | } 25 | } 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /src/server/server.rs: -------------------------------------------------------------------------------- 1 | use std::cmp; 2 | use std::collections::BTreeMap; 3 | use std::net::SocketAddr; 4 | use std::ops::Add; 5 | use std::process; 6 | use std::sync::{Arc, Mutex}; 7 | use std::sync::mpsc; 8 | use std::thread; 9 | 10 | use bytes::{Buf, ByteBuf}; 11 | use mio; 12 | use mio::{EventLoop, Token}; 13 | use rand::{Rng, thread_rng}; 14 | use rocksdb::{DB, DBResult, Writable}; 15 | use protobuf; 16 | use protobuf::Message; 17 | use uuid::Uuid; 18 | 19 | use {Append, AppendRes, CliReq, CliRes, Clock, GetReq, GetRes, Mutation, 20 | MutationType, PeerMsg, RealClock, RedirectRes, SetReq, SetRes, Version, 21 | CASReq, CASRes, DelReq, DelRes, VoteReq, VoteRes}; 22 | use server::{Envelope, LEADER_DURATION, PEER_BROADCAST, State}; 23
// src/server/server.rs (continued): the Server aggregate and its entry
// point. Server owns the clock, node identity, peer/replication-peer
// tables, the notify-channel sender for outbound RPCs, txid/term
// watermarks, the consensus State, the rocksdb handle, the replication
// log, and the map of client requests awaiting a learned txid.
// run() wires everything together: open the DB, build the TrafficCop and
// a single mio EventLoop, then spawn four workers (IO loop, peer request
// handler, cli request handler, cron). Every worker holds a clone of
// thread_exit_tx; the final recv() returning means some worker died, and
// the process panics rather than limp along (see the ZK-bug comment).
// All request handling funnels through one global Mutex<Server>.
// NOTE(review): the JoinHandle Results from thread::Builder::spawn and the
// send(()) results on the exit channel are ignored throughout run().
| use server::{AckedLog, InMemoryLog, LogEntry, PeerID, RepPeer, TXID, Term}; 24 | use server::{SendChannel, rocksdb}; 25 | use server::traffic_cop::TrafficCop; 26 | 27 | pub struct Server { 28 | pub clock: Arc, 29 | pub peer_port: u16, 30 | pub cli_port: u16, 31 | pub id: PeerID, 32 | pub peers: Vec, 33 | pub rep_peers: BTreeMap, 34 | pub rpc_tx: Box + Send>, 35 | pub max_generated_txid: TXID, 36 | pub highest_term: Term, 37 | pub state: State, 38 | pub db: DB, 39 | pub rep_log: Box + Send>, 40 | pub pending: BTreeMap, 41 | } 42 | 43 | unsafe impl Sync for Server{} 44 | 45 | impl Server { 46 | 47 | pub fn run(peer_port: u16, 48 | cli_port: u16, 49 | storage_dir: String, 50 | peers: Vec) { 51 | let db = rocksdb::new(storage_dir); 52 | 53 | // All long-running worker threads get a clone of this 54 | // Sender. When they exit, they send over it. If the 55 | // Receiver ever completes a read, it means something 56 | // unexpectedly exited. It's vital that we shut down 57 | // immediately, so we don't repeat the ZK bug where 58 | // the heartbeater keeps running while other vital threads 59 | // have exited, falsely communicating healthiness. 60 | let (thread_exit_tx, thread_exit_rx) = mpsc::channel(); 61 | 62 | // The TrafficCop manages our sockets, sends deserialized 63 | // messages over the request channel, and receives completed 64 | // responses over the response channel. 65 | let (peer_req_tx, peer_req_rx) = mpsc::channel(); 66 | let (cli_req_tx, cli_req_rx) = mpsc::channel(); 67 | 68 | let mut tc = TrafficCop::new( 69 | peer_port, 70 | cli_port, 71 | peers.clone(), 72 | peer_req_tx, 73 | cli_req_tx 74 | ).unwrap(); 75 | 76 | // A single MIO EventLoop handles our IO 77 | let mut event_loop = EventLoop::new().unwrap(); 78 | 79 | // All RPC's are sent over the event_loop's 80 | // notification channel.
// (run continues: grab the notify-channel sender, seed the first cron
// timeout with a randomized 200-500ms delay, then spawn the workers.)
81 | let rpc_tx = event_loop.channel(); 82 | 83 | // start server periodic tasks 84 | event_loop.timeout_ms((), thread_rng().gen_range(200, 500)).unwrap(); 85 | 86 | // IO event loop thread 87 | let tex1 = thread_exit_tx.clone(); 88 | thread::Builder::new() 89 | .name("IO loop".to_string()) 90 | .spawn(move || { 91 | tc.run_event_loop(event_loop); 92 | tex1.send(()); 93 | }); 94 | 95 | let mut rep_log = Box::new(InMemoryLog { 96 | pending: BTreeMap::new(), 97 | committed: BTreeMap::new(), 98 | quorum: peers.len() / 2 + 1, 99 | last_learned_txid: 0, // TODO(tyler) read from rocksdb 100 | last_learned_term: 0, // TODO(tyler) read from rocksdb 101 | last_accepted_txid: 0, // TODO(tyler) read from rocksdb 102 | last_accepted_term: 0, // TODO(tyler) read from rocksdb 103 | }); 104 | 105 | let clock = Arc::new(RealClock); 106 | 107 | let server = Arc::new(Mutex::new(Server { 108 | clock: clock.clone(), 109 | peer_port: peer_port, 110 | cli_port: cli_port, 111 | id: Uuid::new_v4().to_string(), // TODO(tyler) read from rocksdb 112 | rpc_tx: Box::new(rpc_tx), 113 | max_generated_txid: 0, // TODO(tyler) read from rocksdb 114 | highest_term: 0, // TODO(tyler) read from rocksdb 115 | state: State::Init, 116 | db: db, 117 | rep_log: rep_log, 118 | peers: peers, 119 | rep_peers: BTreeMap::new(), 120 | pending: BTreeMap::new(), 121 | })); 122 | 123 | // peer request handler thread 124 | let srv1 = server.clone(); 125 | let tex2 = thread_exit_tx.clone(); 126 | thread::Builder::new() 127 | .name("peer request handler".to_string()) 128 | .spawn(move || { 129 | for req in peer_req_rx { 130 | match srv1.lock() { 131 | Ok(mut srv) => srv.handle_peer(req), 132 | Err(e) => { 133 | error!("{}", e); 134 | process::exit(1); 135 | } 136 | } 137 | } 138 | tex2.send(()); 139 | }); 140 | 141 | // cli request handler thread 142 | let srv2 = server.clone(); 143 | let tex3 = thread_exit_tx.clone(); 144 | thread::Builder::new() 145 | .name("cli request handler".to_string()) 146 | .spawn(move || {
// (run continues: cli handler loop, cron loop, then the fail-stop recv;
// afterwards update_rep_peers refreshes a peer's replication cursor while
// retaining the previous offsets, and handle_vote_res begins.)
147 | for req in cli_req_rx { 148 | match srv2.lock() { 149 | Ok(mut srv) => srv.handle_cli(req), 150 | Err(e) => { 151 | error!("{}", e); 152 | process::exit(1); 153 | } 154 | } 155 | } 156 | tex3.send(()); 157 | }); 158 | 159 | // cron thread 160 | let srv3 = server.clone(); 161 | let tex4 = thread_exit_tx.clone(); 162 | thread::Builder::new() 163 | .name("server cron".to_string()) 164 | .spawn(move || { 165 | let mut rng = thread_rng(); 166 | loop { 167 | clock.sleep_ms(rng.gen_range(400, 500)); 168 | match srv3.lock() { 169 | Ok(mut srv) => srv.cron(), 170 | Err(e) => { 171 | error!("{}", e); 172 | process::exit(1); 173 | } 174 | } 175 | } 176 | tex4.send(()); 177 | }); 178 | 179 | // this should never receive 180 | thread_exit_rx.recv(); 181 | let msg = "A worker thread unexpectedly exited! Shutting down."; 182 | error!("{}", msg); 183 | panic!("A worker thread unexpectedly exited! Shutting down."); 184 | } 185 | 186 | fn update_rep_peers(&mut self, 187 | peer_id: PeerID, 188 | addr: Option, 189 | tok: Token) { 190 | // don't send replication traffic to self 191 | if self.id == peer_id { 192 | return; 193 | } 194 | 195 | // set up a rep peer for this socket, and 196 | // reset possibly old ones 197 | match self.rep_peers 198 | .insert(peer_id.clone(), 199 | RepPeer { 200 | max_sent_txid: self.rep_log.last_accepted_txid(), 201 | last_accepted_txid: self.rep_log 202 | .last_accepted_txid(), 203 | last_accepted_term: self.rep_log 204 | .last_accepted_term(), 205 | tok: tok, 206 | id: peer_id.clone(), 207 | addr: addr, 208 | }) { 209 | Some(old_rep_peer) => { 210 | // retain previous offset information 211 | let new_rep_peer = self.rep_peers.get_mut(&peer_id).unwrap(); 212 | new_rep_peer.max_sent_txid = old_rep_peer.max_sent_txid; 213 | new_rep_peer.last_accepted_txid = old_rep_peer.last_accepted_txid; 214 | new_rep_peer.last_accepted_term = old_rep_peer.last_accepted_term; 215 | } 216 | _ => (), 217 | } 218 | } 219 | 220 | fn handle_vote_res(&mut self, 221 | env:
// handle_vote_res: process a VoteRes from a peer. Responses for a term
// other than our current one are ignored. As a candidate: any nack resets
// us to Init (and clears rep_peers) - deliberately softer than Raft, per
// the inline comment; an ack adds the peer's token to `have` and, once
// have.len() >= need, we ascend to Leader WITHOUT extending the lease
// deadline. As a valid leader: acks accumulate the same way, and reaching
// quorum extends `until` by LEADER_DURATION. All other combinations are
// logged (nack -> warn, anything else -> error dump of our state).
// NOTE(review): env.address.unwrap() in the opening debug! will panic on an
// Envelope with no address - presumably peer responses always carry one;
// confirm against traffic_cop.rs.
Envelope, 222 | peer_id: PeerID, 223 | vote_res: &VoteRes) { 224 | debug!("{} got response for vote request from {}", 225 | self.id, 226 | env.address.unwrap()); 227 | let term = self.state.term(); 228 | 229 | if term.is_none() || vote_res.get_term() != term.unwrap() { 230 | // got response for a term that is not valid 231 | debug!("invalid term, ignoring vote res"); 232 | return 233 | } 234 | 235 | // Reset if we get any nacks as a candidate. 236 | // This is a difference from Raft, where any node can dethrone 237 | // an otherwise healthy leader with a higher term. We will give 238 | // up on our own if we don't get a majority of unique votes 239 | // by the time our leader lease expires. This protects us against 240 | // a single partially partitioned node from livelocking our cluster. 241 | if self.state.valid_candidate(self.clock.now()) && 242 | !vote_res.get_success() { 243 | // TODO(tyler) set term in rocksdb 244 | if vote_res.get_term() > self.highest_term { 245 | self.highest_term = vote_res.get_term(); 246 | } 247 | self.state = State::Init; 248 | // reset replication peers 249 | self.rep_peers = BTreeMap::new(); 250 | } else if self.state.valid_candidate(self.clock.now()) { 251 | // we're currently a candidate, so see if we can ascend to 252 | // leader or if we need to give up 253 | self.state = match self.state.clone() { 254 | State::Candidate{ 255 | term: term, 256 | until: until, 257 | need: need, 258 | have: ref have, 259 | } => { 260 | let mut new_have = have.clone(); 261 | if !new_have.contains(&env.tok) && 262 | vote_res.get_term() == term { 263 | new_have.push(env.tok); 264 | self.update_rep_peers(peer_id, env.address, env.tok); 265 | } 266 | if new_have.len() >= need as usize { 267 | // we've ascended to leader!
268 | info!("{} transitioning to leader state", self.id); 269 | new_have = vec![]; 270 | let state = State::Leader { 271 | term: term, 272 | until: until, // don't extend until 273 | need: need, 274 | have: new_have, 275 | }; 276 | info!("{:?}", state); 277 | Some(state) 278 | } else { 279 | debug!("need more votes, have {} need {}", 280 | new_have.len(), 281 | need); 282 | // we still need more votes 283 | Some(State::Candidate { 284 | term: term, 285 | until: until, 286 | need: need, 287 | have: new_have, 288 | }) 289 | } 290 | } 291 | _ => None, 292 | } 293 | .unwrap(); 294 | } else if self.state.is_leader() && 295 | self.state.valid_leader(self.clock.now()) && 296 | vote_res.get_success() { 297 | 298 | self.state = match self.state.clone() { 299 | State::Leader{ 300 | term: term, 301 | until: until, 302 | need: need, 303 | have: ref have 304 | } => { 305 | let mut new_until = until; 306 | let mut new_have = have.clone(); 307 | if !new_have.contains(&env.tok) && 308 | vote_res.get_term() == term { 309 | new_have.push(env.tok); 310 | self.update_rep_peers(peer_id, env.address, env.tok); 311 | } 312 | if new_have.len() >= need as usize { 313 | debug!("{} leadership extended", self.id); 314 | new_have = vec![]; 315 | new_until = self.clock.now().add(*LEADER_DURATION); 316 | } 317 | Some(State::Leader { 318 | term: term, 319 | until: new_until, 320 | need: need, 321 | have: new_have, 322 | }) 323 | } 324 | _ => None, 325 | } 326 | .unwrap() 327 | } else if !vote_res.get_success() { 328 | warn!("{} received vote nack from {}", self.id, peer_id); 329 | } else { 330 | // this can happen if a vote res is received by a follower 331 | error!("got vote response, but we can't handle it"); 332 | error!("valid leader: {}", 333 | self.state.valid_leader(self.clock.now())); 334 | error!("is leader: {}", self.state.is_leader()); 335 | error!("valid candidate: {}", 336 | self.state.valid_candidate(self.clock.now())); 337 | error!("is candidate: {}", self.state.is_candidate());
// handle_vote_req (below): answer a peer's VoteReq. Order of checks:
// (1) our own broadcast echo -> ack without becoming a follower;
// (2) already following someone ELSE with a live lease -> nack with our
// term; (3) already following the requestor -> renew the followership
// lease and ack; (4) should_grant_vote passes -> become its Follower and
// ack; (5) otherwise nack, echoing our term when we have one. A reply is
// always sent via self.reply().
338 | error!("res term: {}", vote_res.get_term()); 339 | error!("our term: {}", self.state.term().unwrap()); 340 | } 341 | } 342 | 343 | fn handle_vote_req(&mut self, 344 | env: Envelope, 345 | peer_id: PeerID, 346 | vote_req: &VoteReq) { 347 | let mut res = PeerMsg::new(); 348 | res.set_srvid(self.id.clone()); 349 | let mut vote_res = VoteRes::new(); 350 | vote_res.set_term(vote_req.get_term()); 351 | 352 | if peer_id == self.id { 353 | // if we are this node (broadcast is naive) then all is well 354 | // reply to self but don't change to follower 355 | vote_res.set_success(true); 356 | } else if self.state.valid_leader(self.clock.now()) && 357 | !self.state.following(peer_id.clone()) { 358 | // if we're already following a different node, reject 359 | 360 | warn!("got unwanted vote req from {}", peer_id); 361 | // communicate to the source what our term is so they 362 | // can quickly get followers when we're dead. 363 | vote_res.set_term(self.state.term().unwrap()); 364 | vote_res.set_success(false); 365 | } else if self.state.following(peer_id.clone()) { 366 | // if we're already following this node, keep doing so 367 | debug!("{} extending followership of {}", self.id, peer_id); 368 | self.state = match self.state { 369 | State::Follower{ 370 | term: term, 371 | id: ref id, 372 | leader_addr: leader_addr, 373 | until: _, 374 | tok: tok, 375 | } => Some(State::Follower { 376 | term: term, 377 | id: id.clone(), 378 | leader_addr: leader_addr, 379 | until: self.clock.now().add(*LEADER_DURATION), 380 | tok: tok, 381 | }), 382 | _ => None, 383 | } 384 | .unwrap(); 385 | vote_res.set_success(true); 386 | } else if self.should_grant_vote(vote_req) { 387 | self.highest_term = vote_req.get_term(); 388 | info!("new leader {}", peer_id); 389 | self.state = State::Follower { 390 | id: peer_id.clone(), 391 | term: vote_req.get_term(), 392 | tok: env.tok, 393 | leader_addr: env.address.unwrap(), 394 | until: self.clock.now().add(*LEADER_DURATION), 395 | }; 396 |
// (tail of handle_vote_req, then the replication-stream handlers.)
// handle_append: follower side of replication. Leaders ignore Appends.
// We only accept a batch when we are following the sender AND the batch
// links exactly onto our last accepted (term, txid); each mutation must be
// term-monotonic (panic on decrease) and txid-monotonic (stale txids are
// skipped). After accepting we learn everything up to the leader's
// last_learned_txid. A non-linking batch is nacked with our current
// accepted watermarks so the leader knows where to backfill from.
info!("{:?}", self.state); 397 | vote_res.set_success(true); 398 | } else { 399 | match self.state.term() { 400 | Some(term) => vote_res.set_term(term), 401 | None => (), 402 | } 403 | 404 | vote_res.set_success(false); 405 | } 406 | res.set_vote_res(vote_res); 407 | self.reply(env, ByteBuf::from_slice(&*res.write_to_bytes().unwrap())); 408 | } 409 | 410 | fn handle_append(&mut self, 411 | env: Envelope, 412 | peer_id: PeerID, 413 | append: &Append) { 414 | if self.state.is_leader() { 415 | warn!("Leader got append request! This shouldn't happen."); 416 | return; 417 | } 418 | 419 | let mut res = PeerMsg::new(); 420 | res.set_srvid(self.id.clone()); 421 | let mut append_res = AppendRes::new(); 422 | 423 | // verify that we are following this node 424 | if self.state.is_following(peer_id.clone()) { 425 | // verify that it links 426 | if append.get_from_term() == self.rep_log.last_accepted_term() && 427 | append.get_from_txid() == self.rep_log.last_accepted_txid() { 428 | 429 | let mut max_term = self.rep_log.last_accepted_term(); 430 | let mut max_txid = self.rep_log.last_accepted_txid(); 431 | for mutation in append.get_batch() { 432 | let version = mutation.get_version(); 433 | if version.get_term() < max_term { 434 | error!("mutation term: {} our max: {}", 435 | version.get_term(), 436 | max_term); 437 | panic!("replication stream has decreasing term"); 438 | } 439 | if version.get_txid() <= max_txid { 440 | warn!("mutation txid: {} our max: {}", 441 | version.get_txid(), 442 | max_txid); 443 | continue; 444 | } 445 | max_term = version.get_term(); 446 | max_txid = version.get_txid(); 447 | debug!("accepting message txid {}", version.get_txid()); 448 | self.rep_log.append(version.get_term(), 449 | version.get_txid(), 450 | mutation.clone()); 451 | } 452 | 453 | append_res.set_accepted(true); 454 | append_res.set_last_accepted_term(max_term); 455 | append_res.set_last_accepted_txid(max_txid); 456 | 457 | // Bump up generator for future use if we transition to
leader. 458 | self.max_generated_txid = max_txid; 459 | 460 | for (term, txid) in 461 | self.rep_log.commit_up_to(append.get_last_learned_txid()) { 462 | 463 | debug!("follower learning term {} txid {}", term, txid); 464 | self.learn(term, txid); 465 | debug!("learned"); 466 | } 467 | } else { 468 | // this update doesn't link to our last entry, so tell the 469 | // leader where to replicate from. 470 | warn!("failed to link msg from: {}", append.get_from_txid()); 471 | warn!("{:?}", self.state); 472 | append_res.set_accepted(false); 473 | append_res.set_last_accepted_term(self.rep_log 474 | .last_accepted_term()); 475 | append_res.set_last_accepted_txid(self.rep_log 476 | .last_accepted_txid()); 477 | } 478 | } 479 | 480 | res.set_append_res(append_res); 481 | 482 | self.reply(env, ByteBuf::from_slice(&*res.write_to_bytes().unwrap())); 483 | } 484 | 485 | fn handle_append_res(&mut self, 486 | env: Envelope, 487 | peer_id: PeerID, 488 | append_res: &AppendRes) { 489 | // verify that we are leading 490 | if !self.state.is_leader() { 491 | return; 492 | } 493 | 494 | // update peer's info (which may be divergent!)
// handle_append_res (continued): leader side - refresh the peer's
// accepted watermarks, rewind max_sent_txid on a nack so the next
// replicate() backfills, then ack the log and learn anything that just
// reached quorum. handle_peer: dispatch a decoded PeerMsg to the matching
// handler (parse_from_bytes unwrap: peer traffic is assumed well-formed).
// handle_cli: leader-only client entry point; non-leaders answer with a
// RedirectRes instead.
495 | let mut accepted = vec![]; 496 | match self.rep_peers.get_mut(&peer_id) { 497 | Some(ref mut rep_peer) => { 498 | rep_peer.last_accepted_term = 499 | append_res.get_last_accepted_term(); 500 | rep_peer.last_accepted_txid = 501 | append_res.get_last_accepted_txid(); 502 | 503 | // reset max sent if we need to backfill 504 | if !append_res.get_accepted() { 505 | rep_peer.max_sent_txid = 506 | append_res.get_last_accepted_txid(); 507 | } 508 | 509 | // see if we can mark any updates as accepted 510 | accepted = self.rep_log 511 | .ack_up_to(append_res.get_last_accepted_txid(), 512 | peer_id); 513 | } 514 | None => error!("got AppendRes for non-existent peer!"), 515 | } 516 | for (term, txid) in accepted { 517 | debug!("leader learning txid {}", txid); 518 | self.learn(term, txid); 519 | } 520 | } 521 | 522 | pub fn handle_peer(&mut self, env: Envelope) { 523 | let peer_msg: PeerMsg = protobuf::parse_from_bytes(env.msg.bytes()) 524 | .unwrap(); 525 | let peer_id = peer_msg.get_srvid(); 526 | 527 | if peer_msg.has_vote_res() { 528 | self.handle_vote_res(env, 529 | peer_id.to_string(), 530 | peer_msg.get_vote_res()); 531 | } else if peer_msg.has_vote_req() { 532 | self.handle_vote_req(env, 533 | peer_id.to_string(), 534 | peer_msg.get_vote_req()); 535 | } else if peer_msg.has_append() { 536 | self.handle_append(env, peer_id.to_string(), peer_msg.get_append()); 537 | } else if peer_msg.has_append_res() { 538 | self.handle_append_res(env, 539 | peer_id.to_string(), 540 | peer_msg.get_append_res()); 541 | } else { 542 | error!("got unhandled peer message! {:?}", peer_msg); 543 | } 544 | } 545 | 546 | fn handle_cli(&mut self, req: Envelope) { 547 | let cli_req: CliReq = protobuf::parse_from_bytes(req.msg.bytes()) 548 | .unwrap(); 549 | let mut res = CliRes::new(); 550 | res.set_req_id(cli_req.get_req_id()); 551 | if !self.state.is_leader() { 552 | // If we aren't the leader, we must return some sort of 553 | // a RedirectRes instead of a response.
// handle_cli (body): non-leaders redirect (with the leader's address when
// we are a follower, or success=false when no leader is known). GET is
// served immediately from rocksdb with the last learned txid attached.
// SET/CAS/DEL each mint a fresh txid, build a versioned Mutation, park the
// client Envelope in self.pending keyed by txid, call replicate(), and
// return early - the response is sent from learn() once the txid commits.
// Only the GET/redirect paths fall through to the final reply().
554 | let mut redirect_res = RedirectRes::new(); 555 | // If we're a follower, a leader has been elected, so 556 | // set the return address. 557 | if self.state.is_follower() { 558 | let leader_address = match self.state { 559 | State::Follower{ 560 | term: _, 561 | id: _, 562 | leader_addr: leader_addr, 563 | until: _, 564 | tok: _, 565 | } => Some(leader_addr), 566 | _ => None, 567 | } 568 | .unwrap(); 569 | redirect_res.set_success(true); 570 | redirect_res.set_address(format!("{:?}", leader_address)); 571 | } else { 572 | redirect_res.set_success(false); 573 | redirect_res.set_err("No leader has been elected yet" 574 | .to_string()); 575 | } 576 | res.set_redirect(redirect_res); 577 | } else if cli_req.has_get() { 578 | let get_req = cli_req.get_get(); 579 | let mut get_res = GetRes::new(); 580 | self.db 581 | .get(get_req.get_key()) 582 | .map(|value| { 583 | get_res.set_success(true); 584 | get_res.set_value((*value).to_vec()); 585 | }) 586 | .on_absent(|| { 587 | get_res.set_success(false); 588 | get_res.set_err("Key not found".to_string()) 589 | }) 590 | .on_error(|e| { 591 | error!("Operational problem encountered: {}", e); 592 | get_res.set_success(false); 593 | get_res.set_err("Operational problem encountered" 594 | .to_string()); 595 | }); 596 | get_res.set_txid(self.rep_log.last_learned_txid()); 597 | res.set_get(get_res); 598 | } else if cli_req.has_set() { 599 | let txid = self.new_txid(); 600 | let set_req = cli_req.get_set(); 601 | 602 | // replicate the mutation 603 | let mut version = Version::new(); 604 | version.set_txid(txid); 605 | version.set_term(self.state.term().unwrap()); 606 | 607 | let mut mutation = Mutation::new(); 608 | mutation.set_field_type(MutationType::KVSET); 609 | mutation.set_version(version); 610 | mutation.set_key(set_req.get_key().to_vec()); 611 | mutation.set_value(set_req.get_value().to_vec()); 612 | 613 | info!("adding pending entry for txid {}", txid); 614 | self.pending.insert(txid, (req, cli_req.get_req_id()));
615 | self.replicate(vec![mutation]); 616 | // send a response later after this txid is learned 617 | return; 618 | } else if cli_req.has_cas() { 619 | let txid = self.new_txid(); 620 | let cas_req = cli_req.get_cas(); 621 | 622 | // replicate the mutation 623 | let mut version = Version::new(); 624 | version.set_txid(txid); 625 | version.set_term(self.state.term().unwrap()); 626 | 627 | let mut mutation = Mutation::new(); 628 | mutation.set_field_type(MutationType::KVCAS); 629 | mutation.set_version(version); 630 | mutation.set_key(cas_req.get_key().to_vec()); 631 | mutation.set_value(cas_req.get_new_value().to_vec()); 632 | mutation.set_old_value(cas_req.get_old_value().to_vec()); 633 | 634 | self.pending.insert(txid, (req, cli_req.get_req_id())); 635 | self.replicate(vec![mutation]); 636 | // send a response later after this txid is learned 637 | return; 638 | } else if cli_req.has_del() { 639 | let txid = self.new_txid(); 640 | let del_req = cli_req.get_del(); 641 | 642 | // replicate the mutation 643 | let mut version = Version::new(); 644 | version.set_txid(txid); 645 | version.set_term(self.state.term().unwrap()); 646 | 647 | let mut mutation = Mutation::new(); 648 | mutation.set_field_type(MutationType::KVDEL); 649 | mutation.set_version(version); 650 | mutation.set_key(del_req.get_key().to_vec()); 651 | 652 | self.pending.insert(txid, (req, cli_req.get_req_id())); 653 | self.replicate(vec![mutation]); 654 | // send a response later after this txid is learned 655 | return; 656 | } 657 | 658 | self.reply(req, ByteBuf::from_slice(&*res.write_to_bytes().unwrap())); 659 | } 660 | 661 | pub fn cron(&mut self) { 662 | debug!("{} state: {:?}", self.id, self.state); 663 | debug!("{} log: {:?}", self.id, self.rep_log); 664 | // become candidate if we need to 665 | if !self.state.valid_leader(self.clock.now()) && 666 | !self.state.valid_candidate(self.clock.now()) { 667 | info!("{} transitioning to candidate state", self.id); 668 | self.highest_term += 1; 669 |
// cron (body): when neither our leadership nor our candidacy is still
// valid, bump the term and become a Candidate with quorum need
// peers/2 + 1; then, whether newly candidate or a leader inside its
// refresh window, broadcast a VoteReq carrying our accepted/learned
// watermarks. The heartbeat block stays commented out pending a decision.
// new_txid: monotonically bump and return max_generated_txid.
// reply / peer_broadcast: push an Envelope onto the event loop's notify
// channel, addressed back to the requestor or to PEER_BROADCAST.
// replicate: leader-side fan-out - append each mutation to the local log,
// self-ack it (only reaches quorum on single-replica collections), then
// per rep peer build an Append linking from that peer's accepted
// watermarks with a window of at most 99 entries past max_sent_txid.
self.state = State::Candidate { 670 | term: self.highest_term, 671 | until: self.clock.now().add(*LEADER_DURATION), 672 | need: (self.peers.len() / 2 + 1) as u8, 673 | have: vec![], 674 | }; 675 | info!("{:?}", self.state); 676 | } 677 | 678 | // request or extend leadership 679 | if self.state.should_extend_leadership(self.clock.now()) || 680 | self.state.valid_candidate(self.clock.now()) { 681 | 682 | debug!("broadcasting VoteReq"); 683 | let mut req = PeerMsg::new(); 684 | req.set_srvid(self.id.clone()); 685 | let mut vote_req = VoteReq::new(); 686 | vote_req.set_term(self.state.term().unwrap()); 687 | vote_req.set_last_accepted_term(self.rep_log.last_accepted_term()); 688 | vote_req.set_last_accepted_txid(self.rep_log.last_accepted_txid()); 689 | vote_req.set_last_learned_term(self.rep_log.last_learned_term()); 690 | vote_req.set_last_learned_txid(self.rep_log.last_learned_txid()); 691 | req.set_vote_req(vote_req); 692 | self.peer_broadcast(ByteBuf::from_slice(&*req.write_to_bytes() 693 | .unwrap())); 694 | } 695 | 696 | // TODO(tyler) decide on whether to use heartbeats 697 | /* 698 | // heartbeat 699 | if self.state.is_leader() { 700 | let mut version = Version::new(); 701 | version.set_txid(self.new_txid()); 702 | version.set_term(self.state.term().unwrap()); 703 | 704 | let mut mutation = Mutation::new(); 705 | mutation.set_field_type(MutationType::KVSET); 706 | mutation.set_version(version); 707 | mutation.set_key(b"heartbeat".to_vec()); 708 | mutation.set_value(format!("{}", self.clock.now().sec) 709 | .as_bytes() 710 | .to_vec()); 711 | 712 | 713 | self.replicate(vec![mutation]); 714 | } 715 | */ 716 | } 717 | 718 | fn new_txid(&mut self) -> TXID { 719 | self.max_generated_txid += 1; 720 | info!("generating txid {}, {:?}", 721 | self.max_generated_txid, 722 | self.rep_log); 723 | self.max_generated_txid 724 | } 725 | 726 | fn reply(&mut self, req: Envelope, res_buf: ByteBuf) { 727 | self.rpc_tx.send_msg(Envelope { 728 | address: req.address, 729 | tok:
req.tok, 730 | msg: res_buf, 731 | }); 732 | } 733 | 734 | fn peer_broadcast(&mut self, msg: ByteBuf) { 735 | self.rpc_tx.send_msg(Envelope { 736 | address: None, 737 | tok: PEER_BROADCAST, 738 | msg: msg, 739 | }); 740 | } 741 | 742 | fn replicate(&mut self, mutations: Vec) { 743 | if mutations.len() > 0 { 744 | for mutation in mutations { 745 | let txid = mutation.get_version().get_txid(); 746 | self.rep_log.append(mutation.get_version().get_term(), 747 | txid, 748 | mutation); 749 | 750 | // this should only be learned on single replica collections 751 | let accepted = self.rep_log.ack_up_to(txid, self.id.clone()); 752 | for (term, txid) in accepted { 753 | debug!("leader learning txid {}", txid); 754 | self.learn(term, txid); 755 | } 756 | } 757 | 758 | debug!("in replicate, we have {} rep_peers", self.rep_peers.len()); 759 | 760 | // for each peer, send them their next message 761 | for (_, peer) in self.rep_peers.iter_mut() { 762 | let mut append = Append::new(); 763 | append.set_from_txid(peer.last_accepted_txid); 764 | append.set_from_term(peer.last_accepted_term); 765 | append.set_last_learned_txid(self.rep_log.last_learned_txid()); 766 | let mut batch = vec![]; 767 | for txid in peer.max_sent_txid + 1..peer.max_sent_txid + 100 { 768 | 769 | match self.rep_log.get(txid) { 770 | Some(mutation) => { 771 | // TODO(tyler) can we avoid copies here? 772 | // maybe if multiple Buf implementors could 773 | // hold RC>?
// learn (below): called when (term, txid) has reached quorum - fetch the
// mutation from the rep log (silently skip if we don't hold it yet) and
// apply it to rocksdb, building the matching SetRes/CASRes/DelRes.
774 | batch.push(mutation.clone()); 775 | peer.max_sent_txid = mutation.get_version() 776 | .get_txid(); 777 | } 778 | None => (), 779 | } 780 | } 781 | 782 | append.set_batch(protobuf::RepeatedField::from_vec(batch)); 783 | 784 | let mut peer_msg = PeerMsg::new(); 785 | peer_msg.set_srvid(self.id.clone()); 786 | peer_msg.set_append(append); 787 | 788 | self.rpc_tx.send_msg(Envelope { 789 | address: peer.addr, 790 | tok: peer.tok, 791 | msg: ByteBuf::from_slice(&*peer_msg.write_to_bytes() 792 | .unwrap()), 793 | }); 794 | } 795 | } 796 | 797 | let peer_ids: Vec = self.rep_peers.keys().cloned().collect(); 798 | debug!("accepted: {} learned: {}", 799 | self.rep_log.last_accepted_txid(), 800 | self.rep_log.last_learned_txid()); 801 | debug!("rep log unaccepted len: {:?}", 802 | self.rep_log.last_accepted_txid() - 803 | self.rep_log.last_learned_txid()); 804 | debug!("peers: {:?}", peer_ids); 805 | } 806 | 807 | fn learn(&mut self, term: Term, txid: TXID) { 808 | 809 | debug!("trying to get txid {} in rep log", txid); 810 | let mutation = match self.rep_log.get(txid) { 811 | Some(m) => m, 812 | None => { 813 | debug!("we don't have this tx in our log yet"); 814 | return 815 | } 816 | }; 817 | debug!("got txid {} from rep log", txid); 818 | 819 | let mut res = CliRes::new(); 820 | 821 | info!("matching field type {:?}", mutation.get_field_type()); 822 | match mutation.get_field_type() { 823 | MutationType::KVSET => { 824 | info!("processing set!"); 825 | let mut set_res = SetRes::new(); 826 | match self.db.put(mutation.get_key(), mutation.get_value()) { 827 | Ok(_) => set_res.set_success(true), 828 | Err(e) => { 829 | error!("Operational problem encountered: {}", e); 830 | set_res.set_success(false); 831 | set_res.set_err("Operational problem encountered".to_string()); 832 | } 833 | } 834 | res.set_set(set_res); 835 | }, 836 | MutationType::KVCAS => { 837 | let mut cas_res = CASRes::new(); 838 | match self.db.get(mutation.get_key()) { 839 | DBResult::Some(old_val) => {
840 | if mutation.has_old_value() && 841 | *old_val == *mutation.get_old_value() { 842 | 843 | // compare succeeded, let's try to set 844 | match self.db.put(mutation.get_key(), mutation.get_value()) { 845 | Ok(_) => { 846 | cas_res.set_success(true); 847 | cas_res.set_value(mutation.get_value().to_vec()); 848 | }, 849 | Err(e) => { 850 | error!("Operational problem encountered: {}", e); 851 | cas_res.set_success(false); 852 | cas_res.set_err("Operational problem encountered".to_string()); 853 | cas_res.set_value(old_val.to_vec()); 854 | } 855 | } 856 | } else { 857 | cas_res.set_success(false); 858 | cas_res.set_err("compare failure".to_string()); 859 | cas_res.set_value(old_val.to_vec()); 860 | } 861 | }, 862 | DBResult::None => { 863 | if !mutation.has_old_value() { 864 | match self.db.put(mutation.get_key(), mutation.get_value()) { 865 | Ok(_) => { 866 | cas_res.set_success(true); 867 | cas_res.set_value(mutation.get_value().to_vec()); 868 | }, 869 | Err(e) => { 870 | error!("Operational problem encountered: {}", e); 871 | cas_res.set_success(false); 872 | cas_res.set_err(format!("Operational problem encountered: {}", e)); 873 | } 874 | } 875 | } else { 876 | cas_res.set_success(false); 877 | cas_res.set_err("compare failure".to_string()); 878 | } 879 | }, 880 | DBResult::Error(e) => { 881 | cas_res.set_success(false); 882 | error!("Operational problem encountered: {}", e); 883 | cas_res.set_err(format!("Operational problem encountered: {}", e)); 884 | }, 885 | } 886 | cas_res.set_txid(self.rep_log.last_learned_txid()); 887 | res.set_cas(cas_res); 888 | }, 889 | MutationType::KVDEL => { 890 | let mut del_res = DelRes::new(); 891 | // If the value exists, return it. 
892 | match self.db.get(mutation.get_key()) { 893 | DBResult::Some(old_val) => { 894 | del_res.set_value(old_val.to_vec()); 895 | } 896 | DBResult::None => (), // we don't care 897 | DBResult::Error(e) => (), // we don't care, but we probably should 898 | } 899 | match self.db.delete(mutation.get_key()) { 900 | Ok(_) => del_res.set_success(true), 901 | Err(e) => { 902 | error!("Operational problem encountered: {}", e); 903 | del_res.set_success(false); 904 | del_res.set_err(format!("Operational problem encountered: {}", e)); 905 | } 906 | } 907 | res.set_del(del_res); 908 | }, 909 | } 910 | 911 | // TODO(tyler) use persisted crash-proof logic 912 | let pending = self.pending.remove(&txid); 913 | match pending { 914 | Some((env, req_id)) => { 915 | info!("found pending listener"); 916 | // If there's a pending client request associated with this, 917 | // then send them a response. 918 | res.set_req_id(req_id); 919 | self.reply(env, 920 | ByteBuf::from_slice(&*res.write_to_bytes() 921 | .unwrap())); 922 | } 923 | None => { 924 | info!("could not find pending for this learned request"); 925 | }, 926 | } 927 | } 928 | 929 | // These conditions guarantee that we don't lose acked writes 930 | // as long as a majority of our previous nodes stay alive. 931 | fn should_grant_vote(&self, vote_req: &VoteReq) -> bool { 932 | if self.state.valid_leader(self.clock.now()) { 933 | // we already have (or are) a valid leader 934 | false 935 | } else if vote_req.get_term() < self.rep_log.last_learned_term() { 936 | // This refers to a stale term. Note that we can still vote for 937 | // vote requestors with lower terms than we've accepted but not 938 | // learned, because our acks may not have actually gained quorum. 939 | // This is safe because any vote requestors that receives a quorum 940 | // of votes will have anything that reached quorum in past rounds 941 | // with the same members. 
942 | false 943 | } else { 944 | // at this point, we need to verify one of two conditions: 945 | // 1. that the vote requestor has learned anything in a higher 946 | // term than we have 947 | // 2. that the last term the vote requestor has learned something 948 | // is the same as ours, and the requestor has accepted at least 949 | // as many mutations within that term as we have 950 | if vote_req.get_last_learned_term() > 951 | self.rep_log.last_learned_term() { 952 | // case 1 953 | true 954 | } else if vote_req.get_last_learned_term() == 955 | self.rep_log.last_learned_term() && 956 | vote_req.get_last_accepted_txid() >= 957 | self.rep_log.last_accepted_txid() { 958 | // case 2 959 | true 960 | } else { 961 | // at this point, we know that we have a log that is more 962 | // recent than the vote requestor. 963 | false 964 | } 965 | } 966 | } 967 | } 968 | -------------------------------------------------------------------------------- /src/server/server_conn.rs: -------------------------------------------------------------------------------- 1 | use std::io; 2 | use std::sync::mpsc::Sender; 3 | 4 | use bytes::{Buf, ByteBuf}; 5 | use mio::{EventLoop, EventSet, PollOpt, Token, TryRead, TryWrite}; 6 | use mio::tcp::TcpStream; 7 | 8 | use codec::{self, Codec}; 9 | use server::Envelope; 10 | use server::traffic_cop::TrafficCop; 11 | 12 | pub struct ServerConn { 13 | pub sock: TcpStream, 14 | pub req_tx: Sender, 15 | pub res_bufs: Vec, // TODO(tyler) use proper dequeue 16 | pub res_remaining: usize, 17 | pub req_codec: codec::Framed, 18 | pub token: Option, 19 | pub interest: EventSet, 20 | } 21 | 22 | impl ServerConn { 23 | pub fn new(sock: TcpStream, req_tx: Sender) -> ServerConn { 24 | ServerConn { 25 | sock: sock, 26 | req_tx: req_tx, 27 | req_codec: codec::Framed::new(), 28 | res_bufs: vec![], 29 | res_remaining: 0, 30 | token: None, 31 | interest: EventSet::hup(), 32 | } 33 | } 34 | 35 | pub fn writable(&mut self, 36 | event_loop: &mut EventLoop) 37 | -> 
io::Result<()> { 38 | if self.res_bufs.len() == 0 { 39 | // no responses yet, don't reregister 40 | return Ok(()) 41 | } 42 | let mut res_buf = self.res_bufs.remove(0); 43 | 44 | debug!("res buf: {:?}", res_buf.bytes()); 45 | match self.sock.try_write_buf(&mut res_buf) { 46 | Ok(None) => { 47 | info!("client flushing buf; WOULDBLOCK"); 48 | self.interest.insert(EventSet::writable()); 49 | } 50 | Ok(Some(r)) => { 51 | debug!("CONN : we wrote {} bytes!", r); 52 | self.res_remaining -= r; 53 | debug!("remaining: {}", self.res_remaining); 54 | if self.res_remaining == 0 { 55 | // we've written the whole response, now let's wait to read 56 | self.interest.insert(EventSet::readable()); 57 | self.interest.remove(EventSet::writable()); 58 | } 59 | } 60 | Err(e) => { 61 | match e.raw_os_error() { 62 | Some(32) => { 63 | info!("client disconnected"); 64 | } 65 | Some(e) => info!("not implemented; client os err={:?}", e), 66 | _ => info!("not implemented; client err={:?}", e), 67 | }; 68 | // Don't reregister. 
69 | return Err(e); 70 | } 71 | } 72 | 73 | // push res back if it's not finished 74 | if res_buf.remaining() != 0 { 75 | self.res_bufs.insert(0, res_buf); 76 | } 77 | 78 | event_loop.reregister(&self.sock, 79 | self.token.unwrap(), 80 | self.interest, 81 | PollOpt::edge() | PollOpt::oneshot()) 82 | } 83 | 84 | pub fn readable(&mut self, 85 | event_loop: &mut EventLoop) 86 | -> io::Result<()> { 87 | 88 | // TODO(tyler) get rid of this double copying and read 89 | // directly to codec 90 | let mut req_buf = ByteBuf::mut_with_capacity(1024); 91 | 92 | match self.sock.try_read_buf(&mut req_buf) { 93 | Ok(None) => { 94 | panic!("got readable, but can't read from the socket"); 95 | } 96 | Ok(Some(r)) => { 97 | debug!("CONN : we read {} bytes!", r); 98 | //T self.interest.remove(EventSet::readable()); 99 | } 100 | Err(e) => { 101 | info!("not implemented; client err={:?}", e); 102 | self.interest.remove(EventSet::readable()); 103 | } 104 | }; 105 | 106 | for req in self.req_codec.decode(&mut req_buf.flip()) { 107 | self.req_tx.send(Envelope { 108 | address: Some(self.sock.peer_addr().unwrap()), 109 | tok: self.token.unwrap(), 110 | msg: req, 111 | }); 112 | } 113 | 114 | event_loop.reregister(&self.sock, 115 | self.token.unwrap(), 116 | self.interest, 117 | PollOpt::edge() | PollOpt::oneshot()) 118 | } 119 | } 120 | -------------------------------------------------------------------------------- /src/server/traffic_cop.rs: -------------------------------------------------------------------------------- 1 | use std::io::{Error, ErrorKind}; 2 | use std::io; 3 | use std::sync::mpsc::Sender; 4 | 5 | use bytes::{Buf, ByteBuf, alloc}; 6 | use mio::{EventLoop, EventSet, Handler, PollOpt, Token, TryRead, TryWrite}; 7 | use mio::tcp::{TcpListener, TcpSocket}; 8 | use mio::util::Slab; 9 | use rand::{Rng, thread_rng}; 10 | 11 | use server::*; 12 | use codec; 13 | 14 | pub struct TrafficCop { 15 | peers: Vec, 16 | cli_handler: ConnSet, 17 | peer_handler: ConnSet, 18 | } 19 | 20 | 
impl TrafficCop { 21 | 22 | pub fn new(peer_port: u16, 23 | cli_port: u16, 24 | peer_addrs: Vec, 25 | peer_req_tx: Sender, 26 | cli_req_tx: Sender) 27 | -> io::Result { 28 | 29 | let cli_addr = format!("0.0.0.0:{}", cli_port).parse().unwrap(); 30 | info!("binding to {} for client connections", cli_addr); 31 | let cli_srv_sock = try!(TcpListener::bind(&cli_addr)); 32 | 33 | let peer_addr = format!("0.0.0.0:{}", peer_port).parse().unwrap(); 34 | info!("binding to {} for peer connections", peer_addr); 35 | let peer_srv_sock = try!(TcpListener::bind(&peer_addr)); 36 | 37 | let mut peers = vec![]; 38 | for peer in peer_addrs { 39 | peers.push(Peer { 40 | addr: peer.parse().unwrap(), 41 | sock: None, 42 | }); 43 | } 44 | 45 | Ok(TrafficCop { 46 | peers: peers, 47 | cli_handler: ConnSet { 48 | srv_sock: cli_srv_sock, 49 | srv_token: SERVER_CLIENTS, 50 | conns: Slab::new_starting_at(Token(1024), 4096), 51 | req_tx: cli_req_tx, 52 | }, 53 | peer_handler: ConnSet { 54 | srv_sock: peer_srv_sock, 55 | srv_token: SERVER_PEERS, 56 | conns: Slab::new_starting_at(Token(2), 15), 57 | req_tx: peer_req_tx, 58 | }, 59 | }) 60 | } 61 | 62 | pub fn run_event_loop(&mut self, 63 | mut event_loop: EventLoop) 64 | -> io::Result<()> { 65 | 66 | event_loop.register_opt(&self.cli_handler.srv_sock, 67 | SERVER_CLIENTS, 68 | EventSet::readable(), 69 | PollOpt::edge() | PollOpt::oneshot()) 70 | .unwrap(); 71 | 72 | event_loop.register_opt(&self.peer_handler.srv_sock, 73 | SERVER_PEERS, 74 | EventSet::readable(), 75 | PollOpt::edge() | PollOpt::oneshot()) 76 | .unwrap(); 77 | 78 | event_loop.run(self).unwrap(); 79 | 80 | Err(Error::new(ErrorKind::Other, "event_loop shouldn't have returned.")) 81 | } 82 | 83 | fn tok_to_sc(&mut self, tok: Token) -> Option<&mut ServerConn> { 84 | if tok.as_usize() > 1 && tok.as_usize() <= 128 { 85 | self.peer_handler.conns.get_mut(tok) 86 | } else if tok.as_usize() > 128 && tok.as_usize() <= 4096 { 87 | self.cli_handler.conns.get_mut(tok) 88 | } else { 89 | 
error!("bad event loop notification message envelope"); 90 | None 91 | } 92 | } 93 | } 94 | 95 | impl Handler for TrafficCop { 96 | type Timeout = (); 97 | type Message = Envelope; 98 | 99 | fn ready(&mut self, 100 | event_loop: &mut EventLoop, 101 | token: Token, 102 | events: EventSet) { 103 | if events.is_hup() || events.is_error() { 104 | debug!("clearing error or hup connection"); 105 | match token { 106 | peer if peer.as_usize() >= 2 && peer.as_usize() <= 16 => { 107 | if self.peer_handler.conns.contains(token) { 108 | self.peer_handler.conns.remove(token); 109 | for peer in self.peers.iter_mut() { 110 | if peer.sock == Some(token) { 111 | debug!("dropping disconnected peer socket"); 112 | peer.sock = None; 113 | } 114 | } 115 | } 116 | } 117 | cli if cli.as_usize() >= 1024 && cli.as_usize() <= 4096 => { 118 | if self.cli_handler.conns.contains(token) { 119 | self.cli_handler.conns.remove(token); 120 | } 121 | } 122 | t => panic!("bad token for error/hup: {}", t.as_usize()), 123 | } 124 | } 125 | 126 | if events.is_readable() { 127 | match token { 128 | SERVER_PEERS => { 129 | debug!("got SERVER_PEERS accept"); 130 | self.peer_handler.accept(event_loop).or_else(|e| { 131 | error!("failed to accept peer: all slots full"); 132 | Err(e) 133 | }); 134 | } 135 | SERVER_CLIENTS => { 136 | debug!("got SERVER_CLIENTS accept"); 137 | self.cli_handler.accept(event_loop).or_else(|e| { 138 | error!("failed to accept client: all slots full"); 139 | Err(e) 140 | }); 141 | } 142 | peer if peer.as_usize() >= 2 && peer.as_usize() <= 16 => { 143 | self.peer_handler.conn_readable(event_loop, peer).unwrap(); 144 | } 145 | cli if cli.as_usize() >= 1024 && cli.as_usize() <= 4096 => { 146 | self.cli_handler.conn_readable(event_loop, cli).unwrap(); 147 | } 148 | t => panic!("unknown token: {}", t.as_usize()), 149 | } 150 | } 151 | 152 | if events.is_writable() { 153 | match token { 154 | SERVER_PEERS => panic!("received writable for SERVER_PEERS"), 155 | SERVER_CLIENTS => 156 | 
panic!("received writable for token SERVER_CLIENTS"), 157 | peer if peer.as_usize() > 1 && peer.as_usize() <= 128 => 158 | self.peer_handler.conn_writable(event_loop, peer), 159 | cli if cli.as_usize() > 128 && cli.as_usize() <= 4096 => 160 | self.cli_handler.conn_writable(event_loop, cli), 161 | t => panic!("received writable for out-of-range token: {}", 162 | t.as_usize()), 163 | }; 164 | } 165 | } 166 | 167 | // timeout is triggered periodically to (re)establish connections to peers. 168 | fn timeout(&mut self, 169 | event_loop: &mut EventLoop, 170 | timeout: ()) { 171 | for peer in self.peers.iter_mut() { 172 | if peer.sock.is_none() { 173 | debug!("reestablishing connection with peer"); 174 | let (sock, _) = TcpSocket::v4() 175 | .unwrap() 176 | .connect(&peer.addr) 177 | .unwrap(); 178 | self.peer_handler.register(sock, event_loop).map(|tok| { 179 | peer.sock = Some(tok); 180 | }); 181 | } 182 | } 183 | debug!("have {:?} peer connections", 184 | self.peer_handler.conns.count()); 185 | // if leader is None, try to get promise leases, following-up with 186 | // an abdication if we fail to get quorum after 2s (randomly picked). 
187 | 188 | // if leader is self, renew after 6s 189 | 190 | let mut rng = thread_rng(); 191 | event_loop.timeout_ms((), rng.gen_range(200, 500)).unwrap(); 192 | } 193 | 194 | // notify is used to transmit messages 195 | fn notify(&mut self, 196 | event_loop: &mut EventLoop, 197 | mut msg: Envelope) { 198 | let mut toks = vec![]; 199 | if msg.tok == PEER_BROADCAST { 200 | for peer in self.peers.iter() { 201 | peer.sock.map(|tok| toks.push(tok)); 202 | } 203 | } else { 204 | toks.push(msg.tok); 205 | } 206 | for tok in toks { 207 | let sco = self.tok_to_sc(tok); 208 | if sco.is_none() { 209 | warn!("got notify for invalid token {}", tok.as_usize()); 210 | continue; 211 | } 212 | let mut sc = sco.unwrap(); 213 | let m = msg.msg.bytes(); 214 | 215 | let size = 4 + m.len(); 216 | let mut res = unsafe { 217 | ByteBuf::from_mem_ref(alloc::heap(size.next_power_of_two()), 218 | size as u32, // cap 219 | 0, // pos 220 | size as u32 /* lim */) 221 | .flip() 222 | }; 223 | 224 | assert!(res.write_slice(&codec::usize_to_array(m.len())) == 4); 225 | assert!(res.write_slice(m) == m.len()); 226 | 227 | debug!("adding res to sc.res_bufs: {:?}", res.bytes()); 228 | 229 | sc.res_remaining += res.bytes().len(); 230 | sc.res_bufs.push(res.flip()); 231 | 232 | sc.interest.insert(EventSet::writable()); 233 | 234 | event_loop.reregister(&sc.sock, 235 | tok, 236 | sc.interest, 237 | PollOpt::edge() | PollOpt::oneshot()); 238 | } 239 | } 240 | } 241 | -------------------------------------------------------------------------------- /test/cluster.rs: -------------------------------------------------------------------------------- 1 | extern crate bytes; 2 | extern crate rand; 3 | extern crate mio; 4 | extern crate uuid; 5 | 6 | use std::collections::BTreeMap; 7 | use std::fs; 8 | use std::net::{SocketAddr, SocketAddrV4, Ipv4Addr}; 9 | use std::sync::{Arc, Mutex}; 10 | use std::sync::mpsc::{self, Sender, Receiver, SendError}; 11 | 12 | use self::rand::{StdRng, SeedableRng, Rng}; 13 | use 
self::bytes::{Buf, ByteBuf}; 14 | use self::mio::Token; 15 | use rasputin::server::rocksdb as db; 16 | use rasputin::server::{Server, Envelope, State, Peer, InMemoryLog, 17 | LEADER_DURATION, PEER_BROADCAST}; 18 | use rasputin::{Clock, TestClock, Mutation}; 19 | use self::uuid::Uuid; 20 | 21 | // SimCluster facilitates testing a cluster against network failures. 22 | // This is accomplished by dropping messages, delaying messages, and randomizing 23 | // which surviving ready messages are chosen in which order (but surviving 24 | // messages between the same two nodes preserve ordering, because we use a 25 | // single tcp connection for now) 26 | 27 | enum Condition { 28 | Partition { node1: u16, node2: u16 }, 29 | Paused { node: u16 } 30 | } 31 | 32 | enum Event { 33 | Cron { node: u16 }, 34 | Receive { to: SocketAddr, env: Envelope }, 35 | } 36 | 37 | pub struct SimServer { 38 | path: String, 39 | pub server: Server>>, 40 | clock: Arc, 41 | outbound: Receiver, 42 | pub tok: Token, 43 | addr: SocketAddr, 44 | } 45 | 46 | pub struct SimCluster { 47 | rng: StdRng, 48 | clock: u64, // elapsed time in ms 49 | events: BTreeMap>, // times to events 50 | pub nodes: BTreeMap, 51 | filters: Vec, 52 | } 53 | 54 | impl SimCluster { 55 | pub fn new(dir: &str, num_nodes: u16) -> SimCluster { 56 | let mut logs = vec![]; 57 | for i in 0..num_nodes as usize { 58 | logs.push(InMemoryLog { 59 | pending: BTreeMap::new(), 60 | committed: BTreeMap::new(), 61 | quorum: num_nodes as usize / 2 + 1, 62 | last_learned_txid: 0, 63 | last_learned_term: 0, 64 | last_accepted_txid: 0, 65 | last_accepted_term: 0, 66 | }); 67 | } 68 | SimCluster::new_from_logs(dir, logs) 69 | } 70 | 71 | pub fn new_from_logs(dir: &str, logs: Vec>) -> SimCluster { 72 | let mut peers = vec![]; 73 | let mut peer_strings = vec![]; 74 | for i in 0..logs.len() { 75 | let ip = Ipv4Addr::new(1, 0, (i / 256) as u8, (i % 256) as u8); 76 | let port = i as u16; 77 | peers.push(SocketAddrV4::new(ip, port)); 78 | 
peer_strings.push(format!("{}:{}", ip, port)); 79 | } 80 | 81 | let mut nodes = BTreeMap::new(); 82 | 83 | let mut toks = 0; 84 | for (peer, rep_log) in peers.iter().zip(logs) { 85 | let (tx, rx) = mpsc::channel(); 86 | 87 | let clock = Arc::new(TestClock::new()); 88 | 89 | let state_dir = format!("_rasputin_test/{}/sim_{}", 90 | dir, peer.port()); 91 | let server = Server { 92 | clock: clock.clone(), 93 | peer_port: peer.port(), 94 | cli_port: 65535 - peer.port(), 95 | id: Uuid::new_v4().to_string(), 96 | rpc_tx: Box::new(tx), 97 | max_generated_txid: 0, 98 | highest_term: 0, 99 | state: State::Init, 100 | db: db::new(state_dir.clone()), 101 | rep_log: Box::new(rep_log), 102 | peers: peer_strings.clone(), 103 | rep_peers: BTreeMap::new(), 104 | pending: BTreeMap::new(), 105 | }; 106 | 107 | nodes.insert(peer.port(), SimServer { 108 | path: state_dir.to_string(), 109 | server: server, 110 | addr: SocketAddr::V4(SocketAddrV4::new(*peer.ip(), peer.port())), 111 | clock: clock.clone(), 112 | outbound: rx, 113 | tok: Token(toks), 114 | }); 115 | 116 | toks += 1; 117 | } 118 | 119 | let seed: &[_] = &[0]; 120 | let mut ns = SimCluster{ 121 | rng: SeedableRng::from_seed(seed), 122 | clock: 0, 123 | events: BTreeMap::new(), 124 | nodes: nodes, 125 | filters: vec![], 126 | }; 127 | 128 | // fire up the servers by queuing their cron 129 | for i in 0..ns.nodes.len() { 130 | let time = ns.rng.gen_range(400,500); 131 | ns.push_event( 132 | time, 133 | Event::Cron{ node: i as u16 } 134 | ); 135 | } 136 | ns 137 | } 138 | 139 | pub fn leaders(&self) -> Vec { 140 | self.nodes.iter() 141 | .filter(|&(id, n)| n.server.state.is_leader()) 142 | .map(|(id, n)| *id).collect() 143 | } 144 | 145 | pub fn pause_node(&mut self, node: u16) -> Result<(), ()> { 146 | // TODO 147 | Err(()) 148 | } 149 | 150 | pub fn unpause_node(&mut self, node: u16) -> Result<(), ()> { 151 | // TODO 152 | Err(()) 153 | } 154 | 155 | pub fn partition_two_nodes(&mut self, node1: u16, node2: u16) -> Result<(), 
()> { 156 | // TODO 157 | Err(()) 158 | } 159 | 160 | pub fn unpartition_two_nodes(&mut self, node1: u16, node2: u16) -> Result<(), ()> { 161 | // TODO 162 | Err(()) 163 | } 164 | 165 | pub fn partition_all(&mut self) { 166 | } 167 | 168 | pub fn unpartition_all(&mut self) { 169 | } 170 | 171 | pub fn advance_time(&mut self, ms: u64) { 172 | self.clock += ms; 173 | for (_, node) in self.nodes.iter_mut() { 174 | node.clock.sleep_ms(ms as u32); 175 | } 176 | } 177 | 178 | fn push_event(&mut self, time: u64, event: Event) { 179 | match self.events.get_mut(&time) { 180 | Some(event_vec) => { 181 | event_vec.push(event); 182 | return; 183 | }, 184 | None => (), 185 | }; 186 | self.events.insert(time, vec![event]); 187 | } 188 | 189 | fn pop_event(&mut self) -> (u64, Option>) { 190 | let next_key = self.events.keys().next().unwrap().clone(); 191 | (next_key, self.events.remove(&next_key)) 192 | } 193 | 194 | // step works in two phases: 195 | // 1. handle queued events 196 | // 2. queue rpc's generated in response to those events 197 | pub fn step(&mut self) { 198 | let (time, events) = self.pop_event(); 199 | // move everyone's clocks forward 200 | let before = self.clock.clone(); 201 | self.advance_time(time - before); 202 | let after = self.clock.clone(); 203 | 204 | // Perform event 205 | for event in events.unwrap() { 206 | match event { 207 | Event::Cron{node:node} => { 208 | self.nodes.get_mut(&node).unwrap().server.cron(); 209 | let time = self.rng.gen_range(400,500); 210 | self.push_event( 211 | after + time, 212 | Event::Cron{ node: node } 213 | ); 214 | }, 215 | Event::Receive{to:to, env:env} => { 216 | let node = self.nodes.get_mut(&to.port()).unwrap(); 217 | node.server.handle_peer(env); 218 | }, 219 | } 220 | } 221 | 222 | // Queue up any outbound messages 223 | let mut outbound = vec![]; 224 | for (ip, node) in self.nodes.iter_mut() { 225 | loop { 226 | match node.outbound.try_recv() { 227 | Ok(env) => outbound.push((node.addr, env)), 228 | Err(_) => 
break, // nothing to send 229 | } 230 | } 231 | } 232 | // TODO(tyler) apply filters and node selection randomization 233 | for (addr, env) in outbound { 234 | let env_with_return_address = Envelope { 235 | address: Some(addr), 236 | tok: Token(addr.port() as usize), 237 | msg: ByteBuf::from_slice(env.msg.bytes()), 238 | }; 239 | if env.address.is_none() { 240 | // this is a peer broadcast, which will be attempted to be sent 241 | // to all connected peers. 242 | let ports = self.nodes.len(); 243 | for port in 0..ports { 244 | let arrival = self.clock + 1; 245 | self.push_event(arrival, Event::Receive { 246 | to: u16_to_socketaddr(port as u16), 247 | env: env_with_return_address.clone(), 248 | }); 249 | } 250 | } else { 251 | let arrival = self.clock + 1; 252 | self.push_event(arrival, Event::Receive { 253 | to: u16_to_socketaddr(env.tok.as_usize() as u16), 254 | env: env_with_return_address, 255 | }); 256 | } 257 | } 258 | } 259 | } 260 | 261 | impl Drop for SimServer { 262 | fn drop(&mut self) { 263 | // TODO(tyler) implement this in rocksdb lib 264 | // self.server.db.delete(); 265 | fs::remove_dir_all(&self.path); 266 | } 267 | } 268 | 269 | fn u16_to_socketaddr(from: u16) -> SocketAddr { 270 | let ip = Ipv4Addr::new(1, 0, (from / 256) as u8, (from % 256) as u8); 271 | SocketAddr::V4(SocketAddrV4::new(ip, from)) 272 | } 273 | -------------------------------------------------------------------------------- /test/test.rs: -------------------------------------------------------------------------------- 1 | extern crate rasputin; 2 | 3 | mod cluster; 4 | mod test_paxos; 5 | mod test_client; 6 | -------------------------------------------------------------------------------- /test/test_client.rs: -------------------------------------------------------------------------------- 1 | extern crate log; 2 | use std::sync::mpsc::SendError; 3 | use std::thread; 4 | use std::process; 5 | 6 | use rasputin::Client; 7 | use rasputin::server::Server; 8 | use rasputin::logging; 9 
| use rasputin::server::{Envelope, LEADER_DURATION, PEER_BROADCAST, State}; 10 | use rasputin::RealClock; 11 | use cluster::{SimCluster, SimServer}; 12 | use self::log::LogLevel; 13 | 14 | #[test] 15 | fn client() { 16 | //logging::init_logger(None, LogLevel::Info).unwrap(); 17 | 18 | thread::spawn( move || { 19 | Server::>>::run( 20 | 29999, 21 | 39999, 22 | "_test_client".to_string(), 23 | vec!["127.0.0.1:29999".to_string()] 24 | ); 25 | }); 26 | 27 | thread::sleep_ms(1000); 28 | let peers = vec!["127.0.0.1:39999".parse().unwrap()]; 29 | let nthreads = 1; 30 | let mut cli = Client::new(peers, nthreads); 31 | cli.set(b"k1", b"v1").unwrap(); 32 | assert!(cli.get(b"k1").unwrap().get_value() == b"v1"); 33 | assert!(cli.cas(b"k1", b"v1", b"v12").unwrap().get_value() == b"v12"); 34 | assert!(cli.cas(b"k1", b"vNever", b"vNever2").unwrap().get_value() == b"v12"); 35 | assert!(cli.cas(b"k1", b"vNever", b"vNever2").unwrap().get_success() == false); 36 | assert!(cli.cas(b"k1", b"v12", b"v13").unwrap().get_value() == b"v13"); 37 | assert!(cli.del(b"k1").unwrap().get_value() == b"v13"); 38 | assert!(cli.get(b"k1").unwrap().get_success() == false); 39 | } 40 | -------------------------------------------------------------------------------- /test/test_paxos.rs: -------------------------------------------------------------------------------- 1 | extern crate log; 2 | extern crate quickcheck; 3 | 4 | use std::collections::BTreeMap; 5 | 6 | use rasputin::server::Server; 7 | use rasputin::logging; 8 | use cluster::{SimCluster, SimServer}; 9 | use self::log::LogLevel; 10 | 11 | /* 12 | * Correctness Properties: (Ongaro '14) 13 | * 1. Election Safety: at most one leader can be elected in a given term. 14 | * 2. Leader Append-Only: a leader never overwrites or deletes entries in its 15 | * log; it only appends new entries. 16 | * 3. Log Matching: if two logs contain an entry with the same index and term, 17 | * then the logs are identical in all entries up through the given index. 
18 | * 4. Leader Completeness: if a log entry is committed in a given term, then 19 | * that entry will be present in the logs of the leaders for all 20 | * higher-numbered terms. 21 | * 5. State Machine Safety: if a server has applied a log entry at a given index 22 | * to its state machine, no other server will ever apply a different log 23 | * entry for the same index. 24 | */ 25 | 26 | #[test] 27 | fn election_safety() { 28 | //logging::init_logger(None, LogLevel::Debug).unwrap(); 29 | let mut sim = SimCluster::new("safety", 5); 30 | let mut leaders = BTreeMap::new(); 31 | for i in 0..3000 { 32 | sim.step(); 33 | for (id, n) in sim.nodes.iter() { 34 | if n.server.state.is_leader() { 35 | let term = n.server.state.term().unwrap(); 36 | let tok = n.tok.as_usize(); 37 | assert!(*leaders.entry(term).or_insert(tok) == tok); 38 | } 39 | } 40 | } 41 | } 42 | 43 | #[test] 44 | fn stable_leader_with_no_faults() { 45 | let mut sim = SimCluster::new("stable", 5); 46 | let mut leader = None; 47 | for i in 0..3000 { 48 | sim.step(); 49 | for (id, n) in sim.nodes.iter() { 50 | match n.server.state.term() { 51 | Some(term) => { 52 | if leader.is_none() && n.server.state.is_leader() { 53 | leader = Some(term); 54 | } else if n.server.state.is_leader() { 55 | assert!(leader.unwrap() == term); 56 | } 57 | }, 58 | None => { 59 | // If there's no term, make sure leader was not previously 60 | // elected. 61 | assert!(leader.is_none()); 62 | }, 63 | } 64 | } 65 | } 66 | } 67 | 68 | #[test] 69 | fn leader_append_only() { 70 | 71 | } 72 | 73 | #[test] 74 | fn log_matching() { 75 | 76 | } 77 | 78 | #[test] 79 | fn leader_completeness() { 80 | 81 | } 82 | 83 | #[test] 84 | fn state_machine_safety() { 85 | 86 | } 87 | --------------------------------------------------------------------------------