├── .gitignore ├── .travis.yml ├── Cargo.lock ├── Cargo.toml ├── LICENSE ├── README.md ├── rustfmt.toml ├── scripts ├── sanity.py ├── simple.py └── test_cluster.sh ├── src ├── command.rs ├── config.rs ├── cubes.rs ├── database.rs ├── dht.rs ├── fabric.rs ├── fabric_msg.rs ├── gossip.rs ├── hash.rs ├── inflightmap.rs ├── main.rs ├── metrics.rs ├── resp.rs ├── server.rs ├── storage.rs ├── types.rs ├── utils.rs ├── version_vector.rs ├── vnode.rs ├── vnode_sync.rs └── workers.rs └── sucredb.yaml /.gitignore: -------------------------------------------------------------------------------- 1 | target 2 | *.rs.bk 3 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | dist: trusty 2 | sudo: false 3 | language: rust 4 | rust: nightly 5 | 6 | env: 7 | global: 8 | - RUST_BACKTRACE=1 9 | - RUST_TEST_THREADS=1 10 | - CARGO_BUILD_JOBS=1 11 | - MAKE_PARALLELISM=1 12 | 13 | cache: 14 | - cargo 15 | 16 | install: 17 | - pip install --user redis redis-py-cluster funcy 18 | 19 | script: 20 | - travis_wait sleep 1000000000 & 21 | - cargo test --verbose 22 | - python scripts/sanity.py verbose 2>&1 | tail -n 100 23 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "sucredb" 3 | version = "0.1.0" 4 | authors = ["arthurprs"] 5 | 6 | [dependencies] 7 | rand = "0.5" 8 | serde = "1.0" 9 | serde_derive = "1.0" 10 | log = "0.4" 11 | byteorder="1.0" 12 | tokio-core = "0.1" 13 | tokio-io = "0.1" 14 | tokio-codec = "0.1" 15 | futures = "0.1" 16 | clap="2.0" 17 | crc16="0.4" 18 | metrics="0.2" 19 | lazy_static = "1.0" 20 | serde_yaml = "0.8" 21 | bincode="1.0" 22 | num_cpus="1.0" 23 | roaring="0.5" 24 | crossbeam-channel="0.2" 25 | 26 | [dependencies.log4rs] 27 | version = "0.8" 28 | default-features = false 29 | features 
= ["all_components", "file", "yaml_format"] 30 | 31 | [dependencies.rocksdb] 32 | git = "https://github.com/pingcap/rust-rocksdb.git" 33 | rev = "b011ecb17759d052ae39e2c86addc7b1c7e6c178" 34 | features = ["portable", "sse"] 35 | 36 | [dependencies.linear-map] 37 | version = "1.2" 38 | features = ["serde_impl"] 39 | 40 | [dependencies.bytes] 41 | version = "0.4" 42 | features = ["serde"] 43 | 44 | [dev-dependencies] 45 | env_logger = "0.5.0" 46 | 47 | # enable for profiling 48 | # [profile.release] 49 | # debug=true 50 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Arthur Silva 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Sucredb 2 | 3 | > *A database made of sugar cubes* 4 | 5 | [![Build Status](https://travis-ci.org/arthurprs/sucredb.svg?branch=master)](https://travis-ci.org/arthurprs/sucredb) 6 | 7 | Sucredb is a multi-master key-value distributed database, it provides a dynamo style tunable consistent and causality tracking. 8 | 9 | Any node that owns a partition (replicas) can serve both reads and writes. The database tracks causality using vector-clocks and will NOT drop any conflicting writes unlike LWW (last write wins) and other strategies. Conflicts can and do happen due to races between clients and network partitions. 10 | 11 | Status: Alpha quality with missing pieces. 12 | 13 | # API & Clients 14 | 15 | Theoretically you can use Sucredb with any Redis Cluster clients. 16 | 17 | It implements a tiny subset of Redis commands. Only basic Key-Value/Sets/Hashes operations are supported at this point. 18 | 19 | ### Key Value 20 | 21 | #### GET 22 | 23 | *GET* result(s) is/are returned as an array containing the values (zero, one or more if there's conflicting versions) plus the causal context. The context is an binary string and is always returned as the last item of the array even if no values are present. 24 | 25 | `> GET key {consistency}` 26 | 27 | `< [{value1}, {value2}, .., context]` 28 | 29 | #### MGET 30 | 31 | *MGET* takes the # of keys (N) followed by N keys. Results are returned as an array. 32 | 33 | 34 | `> MGET key_count {key1} {key2} {..} {consistency}` 35 | 36 | `< [[{value1_1}, {value1_2}, .., context], [{value2_1}, {value2_2}, .., context]]` 37 | 38 | #### SET 39 | 40 | *SET*, in addition to the key and value, also takes the causal context. If you're sure it don't exist you can actually omit the context, if you're wrong it'll create a conflicting version. 
41 | 42 | `> SET key value {context} {consistency}` 43 | 44 | `< OK` 45 | 46 | #### GETSET 47 | 48 | *GETSET* is similar to set, but returns the updated value(s) and a new context. Despite the name and the semantics in Redis, the get is always done *after* the set. 49 | 50 | `> GETSET key value context {consistency}` 51 | 52 | `< [{value1}, {value2}, .., context]` 53 | 54 | #### DEL 55 | 56 | *DEL* is like set and also requires a context when dealing with basic values. 57 | Following Redis api *del* works for keys with any datastructure, in these cases the context is ignored (you can use an empty string instead). 58 | 59 | `> DEL key context {consistency}` 60 | 61 | `< 1 OR 0 (if not found)` 62 | 63 | ### Data structures 64 | 65 | Sucredb also supports a tiny subset of commands for Hash and Set datatypes in addition to a dedicated Counter type. These types are [CRDTs](https://en.wikipedia.org/wiki/Conflict-free_replicated_data_type) and don't require a context to be sent along the operation. Mutations depend on the coordinator version of the value and conflicts are handled as follows: 66 | 67 | * Hash: On values conflict the latest write wins. 68 | * Set: On values conflict add wins. 69 | * Counter: Deletes may erase non observed increments. 70 | 71 | ### CGET 72 | 73 | Returns the value for a counter or Nil if none is found. 74 | 75 | `> CGET key {consistency}` 76 | 77 | `< 1011` 78 | 79 | ### CSET 80 | 81 | Sets the value for a counter. 82 | 83 | `> CSET key int_value {consistency}` 84 | 85 | `< OK` 86 | 87 | ### INCRBY 88 | 89 | Increments the value for a counter, the delta can be either positive or negative. 90 | 91 | `> INCRBY key delta_value {consistency}` 92 | 93 | `< resulting_int_value` 94 | 95 | #### HGETALL 96 | 97 | Gets all key value pairs from a hash. 98 | 99 | `> HGETALL key {consistency}` 100 | 101 | `< [{KA, VA}, {KB, VB}, ...]` 102 | 103 | #### HSET 104 | 105 | Sets a key value pair in a hash. 
106 | 107 | `> HSET key hash_key value {consistency}` 108 | 109 | `< 1 OR 0 (if hash_key already existed) ` 110 | 111 | #### HDEL 112 | 113 | Deletes a key from a hash. 114 | 115 | `> HDEL key hash_key {consistency}` 116 | 117 | `< 1 OR 0 (if hash_key didn't exist)` 118 | 119 | #### SMEMBERS 120 | 121 | Gets all values from a set. 122 | 123 | `> SMEMBERS key {consistency}` 124 | 125 | `< [{KA}, {KB}, ...]` 126 | 127 | #### SADD 128 | 129 | Adds a value from the set. 130 | 131 | `> SADD key value {consistency}` 132 | 133 | `< 1 OR 0 (if value already existed) ` 134 | 135 | #### SREM 136 | 137 | Removes a value from the set. 138 | 139 | `> SREM key value {consistency}` 140 | 141 | `< 1 OR 0 (if value didn't exist) ` 142 | 143 | ### MULTI/EXEC Batches 144 | 145 | todo 146 | 147 | ### Other parameters 148 | 149 | #### `context` parameter 150 | 151 | If you don't have a context (from a previous get or getset) you can send an empty string. 152 | 153 | #### `consistency` parameter 154 | 155 | `{consistency}` follows the dynamo/cassandra/riak style: 156 | 157 | * `1`, `o`, `O`: One 158 | * `q`, `Q`: Quorum 159 | * `a`, `A`: All 160 | 161 | # Running 162 | 163 | **Requirements** 164 | 165 | * Needs a reasonably recent Rust (nightly[2]) 166 | * C++ compiler (for Rocksdb). 167 | 168 | **Running** 169 | 170 | * The following setup will use the default settings. 171 | * Clone the repo and enter repository root 172 | * `cargo install .` [3] 173 | * `sucredb --help` 174 | 175 | Single/First instance 176 | 177 | `sucredb -d datadir1 -l 127.0.0.1:6379 -f 127.0.0.1:16379 init` 178 | 179 | The command above will initialize a new cluster containing this node. The cluster will have the default name, partition count and replication factor. 180 | 181 | Second instance 182 | 183 | `sucredb -d datadir2 -l 127.0.0.1:6378 -f 127.0.0.1:16378 -s 127.0.0.1:16379` 184 | 185 | The second instance joins the cluster using the first instance as a seed. 
186 | 187 | Quick test 188 | 189 | `redis-cli CLUSTER SLOTS` 190 | 191 | #### Example 192 | 193 | Quick example using *redis-cli* 194 | 195 | ``` 196 | ➜ ~ redis-cli 197 | 127.0.0.1:6379> GET there 198 | 1) "\x00\x00\x00\x00\x00\x00\x00\x00" 199 | 127.0.0.1:6379> SET there 1 "\x00\x00\x00\x00\x00\x00\x00\x00" 200 | OK 201 | 127.0.0.1:6379> GET there 202 | 1) "1" 203 | 2) "\x01\x00\x00\x00\x00\x00\x00\x00P\xb0n\x83g\xef`\n\x01\x00\x00\x00\x00\x00\x00\x00" 204 | 127.0.0.1:6379> SET there 2 205 | OK 206 | 127.0.0.1:6379> GET there 1 207 | 1) "1" 208 | 2) "2" 209 | 3) "\x01\x00\x00\x00\x00\x00\x00\x00P\xb0n\x83g\xef`\n\x02\x00\x00\x00\x00\x00\x00\x00" 210 | 127.0.0.1:6379> SET there 3 "\x01\x00\x00\x00\x00\x00\x00\x00P\xb0n\x83g\xef`\n\x02\x00\x00\x00\x00\x00\x00\x00" 211 | OK 212 | 127.0.0.1:6379> GET there 213 | 1) "3" 214 | 2) "\x01\x00\x00\x00\x00\x00\x00\x00P\xb0n\x83g\xef`\n\x03\x00\x00\x00\x00\x00\x00\x00" 215 | 127.0.0.1:6379> GETSET there 4 "\x01\x00\x00\x00\x00\x00\x00\x00P\xb0n\x83g\xef`\n\x03\x00\x00\x00\x00\x00\x00\x00" 216 | 1) "4" 217 | 2) "\x01\x00\x00\x00\x00\x00\x00\x00P\xb0n\x83g\xef`\n\x04\x00\x00\x00\x00\x00\x00\x00" 218 | 127.0.0.1:6379> DEL there "\x01\x00\x00\x00\x00\x00\x00\x00P\xb0n\x83g\xef`\n\x04\x00\x00\x00\x00\x00\x00\x00" 219 | 1 220 | 127.0.0.1:6379> GET there q 221 | 1) "\x01\x00\x00\x00\x00\x00\x00\x00P\xb0n\x83g\xef`\n\x05\x00\x00\x00\x00\x00\x00\x00 222 | ``` 223 | 224 | # Configuration 225 | 226 | See `sucredb.yaml` 227 | 228 | To use configuration file use: `sucredb -c sucredb.yaml` 229 | 230 | # CAP theorem 231 | 232 | It behaves mostly like an AP system but not exactly. 233 | 234 | Sucredb doesn't use sloppy quorum or hinted handoff so it can't serve requests that don't satisfy the requested/default consistency level. 235 | 236 | # Performance 237 | 238 | Almost every single new thing claims to be fast or blazing fast. Sucredb makes no claims at this point, but it's probably fast. 
239 | 240 | The data structure operations move the entire collection around the cluster so it's *not* suitable for large values/collections. 241 | 242 | # Ideas worth exploring 243 | 244 | * Improve the data model with a range/clustering key. 245 | 246 | # Background 247 | 248 | Storage takes advantage of RocksDB. 249 | 250 | It uses a variant of version clocks to track causality. The actual algorithm is heavily inspired by [1]. 251 | 252 | ---- 253 | 254 | [1] Gonçalves, Ricardo, et al. "Concise server-wide causality management for eventually consistent data stores." 255 | 256 | [2] Mostly due to the try_from and impl trait features that should be stable soon. 257 | 258 | [3] Be patient. 259 | -------------------------------------------------------------------------------- /rustfmt.toml: -------------------------------------------------------------------------------- 1 | use_try_shorthand=true 2 | error_on_line_overflow=false 3 | -------------------------------------------------------------------------------- /scripts/sanity.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import time 3 | import sys 4 | import random 5 | from itertools import chain 6 | from redis import StrictRedis 7 | from rediscluster import StrictRedisCluster 8 | from funcy import retry 9 | from collections import defaultdict 10 | import shutil 11 | 12 | VERBOSE = False 13 | 14 | 15 | class Instance(object): 16 | BIND = "127.0.0.1" 17 | PORT = 6379 18 | FPORT = 16379 19 | 20 | def __init__(self, i, ii): 21 | super(Instance, self).__init__() 22 | self.i = i 23 | self.ii = ii 24 | self.process = None 25 | self.listen_addr = "{}:{}".format(self.BIND, self.PORT + self.i) 26 | self.fabric_addr = "{}:{}".format(self.BIND, self.FPORT + self.i) 27 | self.data_dir = "n{}".format(self.i) 28 | 29 | @property 30 | def client(self): 31 | return StrictRedis(self.BIND, self.PORT + self.i) 32 | 33 | def clear_data(self): 34 | 
shutil.rmtree(self.data_dir, ignore_errors=True) 35 | 36 | def cluster_init(self): 37 | self.clear_data() 38 | self.start("init") 39 | 40 | def cluster_join(self): 41 | self.clear_data() 42 | self.start() 43 | 44 | def wait_ready(self, callback=lambda c: c.ping(), 45 | timeout=5, sleep=0.1): 46 | @retry(int(timeout / float(sleep) + 0.5), timeout=sleep) 47 | def inner(): 48 | assert callback(self.client) 49 | inner() 50 | 51 | def start(self, *args): 52 | assert not self.process 53 | self.process = subprocess.Popen( 54 | ["cargo", "run", "--", 55 | "-l", self.listen_addr, 56 | "-f", self.fabric_addr, 57 | "-d", self.data_dir] 58 | + list(chain.from_iterable( 59 | ["-s", "{}:{}".format(self.BIND, self.FPORT + i)] 60 | for i in range(self.ii) 61 | if i != self.i 62 | )) 63 | + list(args), 64 | stdin=sys.stdin if VERBOSE else None, 65 | stdout=sys.stdout if VERBOSE else None, 66 | stderr=sys.stderr if VERBOSE else None, 67 | ) 68 | self.wait_ready() 69 | 70 | def __del__(self): 71 | if self.process: 72 | self.process.kill() 73 | 74 | def kill(self): 75 | assert self.process 76 | self.process.kill() 77 | self.process.wait() 78 | self.process = None 79 | 80 | def restart(self): 81 | self.kill() 82 | self.start() 83 | 84 | @property 85 | def running(self): 86 | return bool(self.process) 87 | 88 | def execute(self, *args, **kwargs): 89 | self.client.execute_command(*args, **kwargs) 90 | 91 | 92 | def main(): 93 | global VERBOSE 94 | VERBOSE = "verbose" in sys.argv[1:] 95 | subprocess.check_call(["cargo", "build"]) 96 | cluster_sz = 3 97 | cluster = [Instance(i, cluster_sz) for i in range(cluster_sz)] 98 | cluster[0].cluster_init() 99 | cluster[1].cluster_join() 100 | cluster[2].cluster_join() 101 | cluster[0].execute("CLUSTER", "REBALANCE") 102 | time.sleep(5) 103 | 104 | client = StrictRedisCluster( 105 | startup_nodes=[ 106 | {"host": n.listen_addr.partition(":")[0], 107 | "port": int(n.listen_addr.partition(":")[2])} 108 | for n in cluster 109 | ], 110 | 
decode_responses=False, 111 | socket_timeout=0.5, 112 | ) 113 | 114 | check_map = defaultdict(set) 115 | items = 1000 116 | groups = 100 117 | for i in xrange(items): 118 | k = str(i % groups) 119 | v = str(i) 120 | client.execute_command("SET", k, v, "", "Q") 121 | check_map[k].add(v) 122 | if random.random() < 0.1: 123 | n = random.choice(cluster) 124 | # restart and wait for it to connect to cluster 125 | n.restart() 126 | n.wait_ready(lambda c: c.execute_command("CLUSTER", "CONNECTIONS")) 127 | 128 | # let the syncs settle 129 | time.sleep(5) 130 | 131 | @retry(2, timeout=5) 132 | def test_all_nodes_complete(): 133 | for k, expected in check_map.items(): 134 | values = set(client.get(k)[:-1]) 135 | assert values == expected, "%s %s %s" % (k, expected, values) 136 | for c in cluster: 137 | values = set(c.client.execute_command("GET", k, "1")[:-1]) 138 | assert values == expected, \ 139 | "key %s expected %s got %s (diff %s)" % ( 140 | k, expected, values, expected ^ values) 141 | 142 | test_all_nodes_complete() 143 | 144 | 145 | if __name__ == '__main__': 146 | main() 147 | -------------------------------------------------------------------------------- /scripts/simple.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # python std lib 4 | import time 5 | 6 | # 3rd party imports 7 | import funcy 8 | from docopt import docopt 9 | from redis._compat import xrange 10 | 11 | 12 | def resp(aa): 13 | if len(aa) == 1: 14 | return [], aa[0] 15 | if len(aa) == 2: 16 | return aa[0], aa[1] 17 | raise Exception("multiple values returned ~ " + str(aa)) 18 | 19 | 20 | def loop(rc, reset_last_key=None): 21 | """ 22 | Regular debug loop that can be used to test how redis behaves during changes in the cluster. 
23 | """ 24 | _, last_ctx = resp(rc.get("__last__")) 25 | if reset_last_key: 26 | rc.hset("__last__", 0, last_ctx) 27 | 28 | last = False 29 | while last is False: 30 | try: 31 | last, last_ctx = resp(rc.get("__last__")) 32 | print "last is %s" % last 33 | last = 0 if not last else int(last) 34 | print("starting at foo{0}".format(last)) 35 | except Exception as e: 36 | print("error1 {0}".format(repr(e))) 37 | time.sleep(1) 38 | 39 | for i in xrange(last, 1000000000): # noqa 40 | try: 41 | print("SET foo{} {}".format(i, i)) 42 | rc.set("foo{}".format(i), str(i)) 43 | got, got_ctx = resp(rc.get("foo{}".format(i))) 44 | print("GET foo{} {}".format(i, got)) 45 | assert got == str(i), "%s != %s" % (got, i) 46 | _, last_ctx = rc.execute_command("getset", "__last__", i, last_ctx) 47 | except Exception as e: 48 | print("error2 {}".format(repr(e))) 49 | 50 | 51 | def timeit(rc, itterations=50000): 52 | """ 53 | Time how long it take to run a number of set/get:s 54 | """ 55 | t0 = time.time() 56 | for i in xrange(0, itterations): # noqa 57 | s = "foo{0}".format(i) 58 | rc.set(s, i) 59 | rc.get(s) 60 | 61 | t1 = time.time() - t0 62 | print("{0}k SET/GET operations took: {1} seconds... {2} operations per second".format( 63 | (itterations / 1000) * 2, t1, (itterations / t1) * 2)) 64 | 65 | 66 | def timeit_pipeline(rc, itterations=50000): 67 | """ 68 | Time how long it takes to run a number of set/get:s inside a cluster pipeline 69 | """ 70 | t0 = time.time() 71 | for i in xrange(0, itterations): # noqa 72 | s = "foo{0}".format(i) 73 | 74 | p = rc.pipeline() 75 | p.set(s, i) 76 | p.get(s) 77 | p.execute() 78 | 79 | t1 = time.time() - t0 80 | print("{0}k SET/GET operations inside pipelines took: {1} seconds... 
{2} operations per second".format( 81 | (itterations / 1000) * 2, t1, (itterations / t1) * 2) 82 | ) 83 | 84 | 85 | if __name__ == "__main__": 86 | __docopt__ = """ 87 | Usage: 88 | simple [--host IP] [--port PORT] [--nocluster] [--timeit] [--pipeline] [--resetlastkey] [-h] [--version] 89 | 90 | Options: 91 | --nocluster If flag is set then StrictRedis will be used instead of cluster lib 92 | --host IP Redis server to test against [default: 127.0.0.1] 93 | --port PORT Port on redis server [default: 7000] 94 | --timeit run a mini benchmark to test performance 95 | --pipeline Only usable with --timeit flag. Runs SET/GET inside pipelines. 96 | --resetlastkey reset __last__ key 97 | -h --help show this help and exit 98 | -v --version show version and exit 99 | """ 100 | 101 | args = docopt(__docopt__, version="0.3.0") 102 | 103 | startup_nodes = [{"host": args["--host"], "port": args["--port"]}] 104 | 105 | if not args["--nocluster"]: 106 | from rediscluster import StrictRedisCluster 107 | rc = StrictRedisCluster(startup_nodes=startup_nodes, 108 | max_connections=32, socket_timeout=0.5, 109 | decode_responses=False, skip_full_coverage_check=True) 110 | else: 111 | from redis import StrictRedis 112 | rc = StrictRedis(host=args["--host"], port=args["--port"], 113 | socket_timeout=0.5, decode_responses=False) 114 | 115 | if args["--timeit"]: 116 | test_itterstions = [ 117 | 5000, 118 | 10000, 119 | 20000, 120 | ] 121 | 122 | if args["--pipeline"]: 123 | for itterations in test_itterstions: 124 | timeit_pipeline(rc, itterations=itterations) 125 | else: 126 | for itterations in test_itterstions: 127 | timeit(rc, itterations=itterations) 128 | else: 129 | loop(rc, reset_last_key=args["--resetlastkey"]) 130 | -------------------------------------------------------------------------------- /scripts/test_cluster.sh: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env bash 2 | 3 | trap "trap - SIGTERM && kill -- -$$" SIGINT SIGTERM EXIT 4 | export RUST_BACKTRACE=1 5 | export RUST_LOG=sucredb=info,sucredb::vnode=debug,sucredb::fabric=info 6 | SLEEP=2 7 | cargo build 8 | rm -rf n1 n2 9 | ../target/debug/sucredb -d n1 -l 127.0.0.1:6379 -f 127.0.0.1:16379 init -r 2 > log1.txt 2>&1 & 10 | echo "WAITING $SLEEP" 11 | sleep $SLEEP 12 | ../target/debug/sucredb -d n2 -l 127.0.0.1:6378 -f 127.0.0.1:16378 -s 127.0.0.1:16379 > log2.txt 2>&1 & 13 | 14 | tail -f log1.txt log2.txt 15 | -------------------------------------------------------------------------------- /src/command.rs: -------------------------------------------------------------------------------- 1 | use bincode; 2 | use bytes::Bytes; 3 | use config; 4 | use cubes::{self, Cube}; 5 | use database::{Context, Database}; 6 | use metrics::{self, Meter}; 7 | use resp::RespValue; 8 | use std::convert::TryInto; 9 | use std::net; 10 | use types::*; 11 | use utils::{assume_str, replace_default}; 12 | use version_vector::*; 13 | 14 | #[derive(Debug)] 15 | pub enum CommandError { 16 | Timeout, 17 | ProtocolError, 18 | StorageError, 19 | UnknownCommand, 20 | TooManyVersions, 21 | TypeError, 22 | InvalidContext, 23 | InvalidArgCount, 24 | InvalidKey, 25 | InvalidValue, 26 | InvalidConsistencyValue, 27 | InvalidIntValue, 28 | InvalidExec, 29 | InvalidCommand, 30 | InvalidMultiCommand, 31 | MultiplePartitions, 32 | MultipleKeyMutations, 33 | Unavailable, 34 | } 35 | 36 | impl Into for CommandError { 37 | fn into(self) -> RespValue { 38 | RespValue::Error(format!("{:?}", self).into()) 39 | } 40 | } 41 | 42 | fn parse_int( 43 | try: bool, 44 | args: &[&Bytes], 45 | i: usize, 46 | ) -> Result { 47 | if try { 48 | assume_str(&args[i]) 49 | .parse() 50 | .map_err(|_| CommandError::InvalidIntValue) 51 | } else { 52 | Ok(Default::default()) 53 | } 54 | } 55 | 56 | fn check_arg_count(count: usize, min: usize, max: usize) -> Result<(), CommandError> { 57 | if count < min || count > max { 58 
| Err(CommandError::InvalidArgCount) 59 | } else { 60 | Ok(()) 61 | } 62 | } 63 | 64 | fn check_key_len(key_len: usize) -> Result<(), CommandError> { 65 | if key_len > config::MAX_KEY_LEN { 66 | Err(CommandError::InvalidKey) 67 | } else { 68 | Ok(()) 69 | } 70 | } 71 | 72 | fn check_value_len(value_len: usize) -> Result<(), CommandError> { 73 | if value_len > config::MAX_VALUE_LEN { 74 | Err(CommandError::InvalidValue) 75 | } else { 76 | Ok(()) 77 | } 78 | } 79 | 80 | impl Database { 81 | pub fn handler_cmd(&self, mut context: Context) { 82 | let cmd = context.commands.pop().unwrap(); 83 | if let Err(e) = self.handle_cmd(&mut context, cmd) { 84 | context.clear(); 85 | self.respond_error(&mut context, e); 86 | } 87 | } 88 | 89 | fn handle_cmd(&self, context: &mut Context, cmd: RespValue) -> Result<(), CommandError> { 90 | debug!("Processing ({:?}) {:?}", context.token, cmd); 91 | let mut args = Vec::new(); 92 | match cmd { 93 | RespValue::Array(ref a) => { 94 | args.reserve_exact(a.len()); 95 | for v in a.iter() { 96 | if let &RespValue::Data(ref b) = v { 97 | args.push(b); 98 | } else { 99 | args.clear(); 100 | break; 101 | } 102 | } 103 | } 104 | _ => (), 105 | } 106 | 107 | if args.is_empty() { 108 | return Err(CommandError::ProtocolError); 109 | } 110 | 111 | let arg0 = args[0]; 112 | let args = &args[1..]; 113 | 114 | if context.is_exec_cmd { 115 | match arg0.as_ref() { 116 | b"CSET" | b"cset" => self.cmd_cset(context, args), 117 | b"INCRBY" | b"incrby" => self.cmd_incrby(context, args), 118 | b"SET" | b"set" => self.cmd_set(context, args, false), 119 | b"HSET" | b"hset" => self.cmd_hset(context, args), 120 | b"HDEL" | b"hdel" => self.cmd_hdel(context, args), 121 | b"SADD" | b"sadd" => self.cmd_sadd(context, args), 122 | b"SREM" | b"srem" => self.cmd_srem(context, args), 123 | b"GETSET" | b"getset" => self.cmd_set(context, args, true), 124 | b"DEL" | b"del" => self.cmd_del(context, args), 125 | _ => { 126 | debug!("Unknown command for multi {:?}", cmd); 127 | 
Err(CommandError::InvalidMultiCommand) 128 | } 129 | } 130 | } else if context.is_multi_cmd { 131 | match arg0.as_ref() { 132 | b"EXEC" | b"exec" => self.cmd_exec(context, args), 133 | _ => { 134 | // Enqueue command for later exec 135 | context.commands.push(cmd); 136 | Ok(self.respond_resp(context, RespValue::Status("QUEUED".into()))) 137 | } 138 | } 139 | } else { 140 | match arg0.as_ref() { 141 | b"GET" | b"get" => self.cmd_get(context, args), 142 | b"MGET" | b"mget" => self.cmd_mget(context, args), 143 | b"SET" | b"set" => self.cmd_set(context, args, false), 144 | b"CGET" | b"cget" => self.cmd_cget(context, args), 145 | b"CSET" | b"cset" => self.cmd_cset(context, args), 146 | b"INCRBY" | b"incrby" => self.cmd_incrby(context, args), 147 | b"HGETALL" | b"hgetall" => self.cmd_hgetall(context, args), 148 | b"HSET" | b"hset" => self.cmd_hset(context, args), 149 | b"HDEL" | b"hdel" => self.cmd_hdel(context, args), 150 | b"SMEMBERS" | b"smembers" => self.cmd_smembers(context, args), 151 | b"SADD" | b"sadd" => self.cmd_sadd(context, args), 152 | b"SREM" | b"srem" => self.cmd_srem(context, args), 153 | b"GETSET" | b"getset" => self.cmd_set(context, args, true), 154 | b"DEL" | b"del" => self.cmd_del(context, args), 155 | b"CLUSTER" | b"cluster" => self.cmd_cluster(context, args), 156 | b"TYPE" | b"type" => self.cmd_type(context, args), 157 | b"MULTI" | b"multi" => self.cmd_multi(context, args), 158 | b"EXEC" | b"exec" => self.cmd_exec(context, args), 159 | b"ECHO" | b"echo" => Ok(self.respond_resp(context, cmd)), 160 | b"PING" | b"ping" => Ok(self.respond_resp(context, RespValue::Data("PONG".into()))), 161 | b"ASKING" | b"asking" | b"READWRITE" | b"readwrite" => { 162 | check_arg_count(args.len(), 0, 0).and_then(|_| Ok(self.respond_ok(context))) 163 | } 164 | b"CONFIG" | b"config" => self.cmd_config(context, args), 165 | _ => { 166 | debug!("Unknown command {:?}", cmd); 167 | Err(CommandError::UnknownCommand) 168 | } 169 | } 170 | } 171 | } 172 | 173 | fn parse_vv( 174 
| &self, 175 | try: bool, 176 | args: &[&Bytes], 177 | i: usize, 178 | ) -> Result { 179 | if try && !args[i].is_empty() { 180 | bincode::deserialize(args[i]).map_err(|_| CommandError::InvalidContext) 181 | } else { 182 | Ok(Default::default()) 183 | } 184 | } 185 | 186 | fn parse_consistency( 187 | &self, 188 | try: bool, 189 | args: &[&Bytes], 190 | i: usize, 191 | ) -> Result { 192 | Ok(if try { 193 | args[i] 194 | .as_ref() 195 | .try_into() 196 | .map_err(|_| CommandError::InvalidConsistencyValue)? 197 | } else { 198 | self.config.consistency_read 199 | }) 200 | } 201 | 202 | fn cmd_multi(&self, context: &mut Context, args: &[&Bytes]) -> Result<(), CommandError> { 203 | assert!(!context.is_multi_cmd); 204 | context.is_multi_cmd = true; 205 | check_arg_count(args.len(), 0, 0)?; 206 | Ok(self.respond_ok(context)) 207 | } 208 | 209 | fn cmd_exec(&self, context: &mut Context, args: &[&Bytes]) -> Result<(), CommandError> { 210 | if !context.is_multi_cmd { 211 | return Err(CommandError::InvalidExec); 212 | } 213 | check_arg_count(args.len(), 0, 1)?; 214 | let consistency = self.parse_consistency(args.len() > 0, args, 0)?; 215 | assert!(!context.is_exec_cmd); 216 | context.is_exec_cmd = true; 217 | let mut cmds = replace_default(&mut context.commands); 218 | for cmd in cmds.drain(..) 
{ 219 | debug!("token:{} exec: {:?}", context.token, cmd); 220 | self.handle_cmd(context, cmd)?; 221 | } 222 | context.commands = cmds; 223 | self.set_flush(context, consistency) 224 | } 225 | 226 | fn cmd_config(&self, context: &mut Context, _args: &[&Bytes]) -> Result<(), CommandError> { 227 | Ok(self.respond_resp(context, RespValue::Array(Default::default()))) 228 | } 229 | 230 | fn cmd_hgetall(&self, context: &mut Context, args: &[&Bytes]) -> Result<(), CommandError> { 231 | metrics::REQUEST_GET.mark(1); 232 | check_arg_count(args.len(), 1, 2)?; 233 | check_key_len(args[0].len())?; 234 | let consistency = self.parse_consistency(args.len() > 1, args, 1)?; 235 | self.get(context, args[0], consistency, Box::new(cubes::render_map)) 236 | } 237 | 238 | fn cmd_hset(&self, context: &mut Context, args: &[&Bytes]) -> Result<(), CommandError> { 239 | metrics::REQUEST_SET.mark(1); 240 | check_arg_count(args.len(), 3, 4)?; 241 | check_key_len(args[0].len())?; 242 | check_key_len(args[1].len())?; 243 | check_value_len(args[2].len())?; 244 | let hash_key = args[1].clone(); 245 | let hash_value = args[2].clone(); 246 | let consistency = self.parse_consistency(args.len() > 3, args, 3)?; 247 | self.set( 248 | context, 249 | args[0], 250 | Box::new(move |i, v, c: Cube| { 251 | let mut map = c.into_map().ok_or(CommandError::TypeError)?; 252 | let result = map.insert(i, v, hash_key, hash_value) as i64; 253 | Ok((Cube::Map(map), Some(RespValue::Int(result)))) 254 | }), 255 | consistency, 256 | false, 257 | None, 258 | ) 259 | } 260 | 261 | fn cmd_hdel(&self, context: &mut Context, args: &[&Bytes]) -> Result<(), CommandError> { 262 | metrics::REQUEST_DEL.mark(1); 263 | check_arg_count(args.len(), 2, 3)?; 264 | check_key_len(args[0].len())?; 265 | check_key_len(args[1].len())?; 266 | let hash_key = args[1].clone(); 267 | let consistency = self.parse_consistency(args.len() > 2, args, 2)?; 268 | self.set( 269 | context, 270 | args[0], 271 | Box::new(move |i, v, c: Cube| { 272 | let mut 
map = c.into_map().ok_or(CommandError::TypeError)?; 273 | let result = map.remove(i, v, &hash_key) as i64; 274 | Ok((Cube::Map(map), Some(RespValue::Int(result)))) 275 | }), 276 | consistency, 277 | false, 278 | None, 279 | ) 280 | } 281 | 282 | fn cmd_smembers(&self, context: &mut Context, args: &[&Bytes]) -> Result<(), CommandError> { 283 | metrics::REQUEST_GET.mark(1); 284 | check_arg_count(args.len(), 1, 2)?; 285 | check_key_len(args[0].len())?; 286 | let consistency = self.parse_consistency(args.len() > 1, args, 1)?; 287 | self.get(context, args[0], consistency, Box::new(cubes::render_set)) 288 | } 289 | 290 | fn cmd_sadd(&self, context: &mut Context, args: &[&Bytes]) -> Result<(), CommandError> { 291 | metrics::REQUEST_SET.mark(1); 292 | check_arg_count(args.len(), 2, 3)?; 293 | check_key_len(args[0].len())?; 294 | check_value_len(args[1].len())?; 295 | let set_value = args[1].clone(); 296 | let consistency = self.parse_consistency(args.len() > 2, args, 2)?; 297 | self.set( 298 | context, 299 | args[0], 300 | Box::new(move |i, v, c: Cube| { 301 | let mut set = c.into_set().ok_or(CommandError::TypeError)?; 302 | let result = set.insert(i, v, set_value) as i64; 303 | Ok((Cube::Set(set), Some(RespValue::Int(result)))) 304 | }), 305 | consistency, 306 | false, 307 | None, 308 | ) 309 | } 310 | 311 | fn cmd_srem(&self, context: &mut Context, args: &[&Bytes]) -> Result<(), CommandError> { 312 | metrics::REQUEST_DEL.mark(1); 313 | check_arg_count(args.len(), 2, 3)?; 314 | check_key_len(args[0].len())?; 315 | check_value_len(args[1].len())?; 316 | let set_value = args[1].clone(); 317 | let consistency = self.parse_consistency(args.len() > 2, args, 2)?; 318 | self.set( 319 | context, 320 | args[0], 321 | Box::new(move |i, v, c: Cube| { 322 | let mut set = c.into_set().ok_or(CommandError::TypeError)?; 323 | let result = set.remove(i, v, &set_value) as i64; 324 | Ok((Cube::Set(set), Some(RespValue::Int(result)))) 325 | }), 326 | consistency, 327 | false, 328 | None, 329 
| ) 330 | } 331 | 332 | fn cmd_get(&self, context: &mut Context, args: &[&Bytes]) -> Result<(), CommandError> { 333 | metrics::REQUEST_GET.mark(1); 334 | check_arg_count(args.len(), 1, 2)?; 335 | check_key_len(args[0].len())?; 336 | let consistency = self.parse_consistency(args.len() > 1, args, 1)?; 337 | self.get(context, args[0], consistency, Box::new(cubes::render_value)) 338 | } 339 | 340 | fn cmd_mget(&self, context: &mut Context, args: &[&Bytes]) -> Result<(), CommandError> { 341 | context.is_multi_cmd = true; 342 | metrics::REQUEST_GET.mark(1); 343 | check_arg_count(args.len(), 1, 100)?; 344 | let key_count: usize = parse_int(args.len() > 0, args, 0)?; 345 | if key_count >= args.len() { 346 | return Err(CommandError::InvalidCommand); 347 | } 348 | let keys = &args[1..1 + key_count]; 349 | let consistency = 350 | self.parse_consistency(args.len() > 1 + key_count, args, 1 + key_count)?; 351 | for key in keys { 352 | check_key_len(key.len())?; 353 | } 354 | self.mget(context, keys, consistency, Box::new(cubes::render_value)) 355 | } 356 | 357 | fn cmd_set( 358 | &self, 359 | context: &mut Context, 360 | args: &[&Bytes], 361 | reply_result: bool, 362 | ) -> Result<(), CommandError> { 363 | metrics::REQUEST_SET.mark(1); 364 | check_arg_count(args.len(), 2, 4)?; 365 | check_key_len(args[0].len())?; 366 | check_value_len(args[1].len())?; 367 | let value = args[1].clone(); 368 | let vv = self.parse_vv(args.len() > 2, args, 2)?; 369 | let consistency = self.parse_consistency(args.len() > 3, args, 3)?; 370 | self.set( 371 | context, 372 | args[0], 373 | Box::new(move |i, v, c: Cube| { 374 | let mut cube_value = c.into_value().ok_or(CommandError::TypeError)?; 375 | cube_value.set(i, v, Some(value), &vv); 376 | let resp = if reply_result { 377 | None 378 | } else { 379 | Some(RespValue::Status("OK".into())) 380 | }; 381 | Ok((Cube::Value(cube_value), resp)) 382 | }), 383 | consistency, 384 | reply_result, 385 | if reply_result { 386 | Some(Box::new(cubes::render_value)) 
387 | } else { 388 | None 389 | }, 390 | ) 391 | } 392 | 393 | fn cmd_del(&self, context: &mut Context, args: &[&Bytes]) -> Result<(), CommandError> { 394 | metrics::REQUEST_DEL.mark(1); 395 | check_arg_count(args.len(), 1, 3)?; 396 | check_key_len(args[0].len())?; 397 | let vv = self.parse_vv(args.len() > 1, args, 1)?; 398 | let consistency = self.parse_consistency(args.len() > 2, args, 2)?; 399 | self.set( 400 | context, 401 | args[0], 402 | Box::new(move |i, v, mut c: Cube| { 403 | let result = c.del(i, v, &vv) as i64; 404 | Ok((c, Some(RespValue::Int(result)))) 405 | }), 406 | consistency, 407 | false, 408 | None, 409 | ) 410 | } 411 | 412 | fn cmd_cset(&self, context: &mut Context, args: &[&Bytes]) -> Result<(), CommandError> { 413 | metrics::REQUEST_SET.mark(1); 414 | check_arg_count(args.len(), 2, 3)?; 415 | check_key_len(args[0].len())?; 416 | let value: i64 = parse_int(args.len() > 1, args, 1)?; 417 | let consistency = self.parse_consistency(args.len() > 2, args, 2)?; 418 | self.set( 419 | context, 420 | args[0], 421 | Box::new(move |i, v, c: Cube| { 422 | let mut counter = c.into_counter().ok_or(CommandError::TypeError)?; 423 | counter.clear(i, v); 424 | counter.inc(i, v, value); 425 | Ok((Cube::Counter(counter), Some(RespValue::Status("OK".into())))) 426 | }), 427 | consistency, 428 | false, 429 | None, 430 | ) 431 | } 432 | 433 | fn cmd_cget(&self, context: &mut Context, args: &[&Bytes]) -> Result<(), CommandError> { 434 | metrics::REQUEST_GET.mark(1); 435 | check_arg_count(args.len(), 1, 2)?; 436 | check_key_len(args[0].len())?; 437 | let consistency = self.parse_consistency(args.len() > 1, args, 1)?; 438 | self.get( 439 | context, 440 | args[0], 441 | consistency, 442 | Box::new(cubes::render_counter), 443 | ) 444 | } 445 | 446 | fn cmd_incrby(&self, context: &mut Context, args: &[&Bytes]) -> Result<(), CommandError> { 447 | metrics::REQUEST_SET.mark(1); 448 | check_arg_count(args.len(), 2, 3)?; 449 | check_key_len(args[0].len())?; 450 | let inc: i64 
= parse_int(args.len() > 1, args, 1)?; 451 | let consistency = self.parse_consistency(args.len() > 2, args, 2)?; 452 | self.set( 453 | context, 454 | args[0], 455 | Box::new(move |i, v, c: Cube| { 456 | let mut counter = c.into_counter().ok_or(CommandError::TypeError)?; 457 | counter.inc(i, v, inc); 458 | Ok((Cube::Counter(counter), Some(RespValue::Status("OK".into())))) 459 | }), 460 | consistency, 461 | false, 462 | None, 463 | ) 464 | } 465 | 466 | fn cmd_type(&self, context: &mut Context, args: &[&Bytes]) -> Result<(), CommandError> { 467 | check_arg_count(args.len(), 1, 2)?; 468 | let consistency = self.parse_consistency(args.len() > 1, args, 1)?; 469 | self.get(context, args[0], consistency, Box::new(cubes::render_type)) 470 | } 471 | 472 | fn cmd_cluster(&self, context: &mut Context, args: &[&Bytes]) -> Result<(), CommandError> { 473 | check_arg_count(args.len(), 1, 1)?; 474 | match args[0].as_ref() { 475 | b"CONNECTIONS" | b"connections" => { 476 | let conns = self.fabric.connections(); 477 | let resp_conns = conns.into_iter().map(|x| RespValue::Int(x as _)).collect(); 478 | Ok(self.respond_resp(context, RespValue::Array(resp_conns))) 479 | } 480 | b"REBALANCE" | b"rebalance" => { 481 | self.dht.rebalance().unwrap(); 482 | Ok(self.respond_ok(context)) 483 | } 484 | b"SLOTS" | b"slots" => { 485 | let mut slots = Vec::new(); 486 | for (&(start, end), members) in &self.dht.slots() { 487 | let mut slot = vec![RespValue::Int(start as _), RespValue::Int(end as _)]; 488 | slot.extend(members.iter().map(|&(node, (_, ext_addr))| { 489 | RespValue::Array(vec![ 490 | RespValue::Data(ext_addr.ip().to_string().as_bytes().into()), 491 | RespValue::Int(ext_addr.port() as _), 492 | RespValue::Data(node.to_string().as_bytes().into()), 493 | ]) 494 | })); 495 | slots.push(RespValue::Array(slot)); 496 | } 497 | Ok(self.respond_resp(context, RespValue::Array(slots))) 498 | } 499 | _ => Err(CommandError::UnknownCommand), 500 | } 501 | } 502 | 503 | pub fn respond(&self, 
/// Queues `resp` on the context and flushes the response to the client.
pub fn respond_resp(&self, context: &mut Context, resp: RespValue) {
    context.response.push(resp);
    self.respond(context);
}

/// Convenience: reply with a single RESP integer.
pub fn respond_int(&self, context: &mut Context, int: i64) {
    self.respond_resp(context, RespValue::Int(int));
}

/// Convenience: reply with the simple-string status "OK".
pub fn respond_ok(&self, context: &mut Context) {
    self.respond_resp(context, RespValue::Status("OK".into()));
}

/// Converts a CommandError into its RESP error representation and replies.
pub fn respond_error(&self, context: &mut Context, error: CommandError) {
    self.respond_resp(context, error.into());
}

/// Redis-cluster-style MOVED redirect: the key's vnode permanently lives at `addr`.
pub fn respond_moved(&self, context: &mut Context, vnode: VNodeNo, addr: net::SocketAddr) {
    self.respond_resp(
        context,
        RespValue::Error(format!("MOVED {} {}", vnode, addr).into()),
    );
}

/// Redis-cluster-style ASK redirect: retry this one request at `addr`.
pub fn respond_ask(&self, context: &mut Context, vnode: VNodeNo, addr: net::SocketAddr) {
    self.respond_resp(
        context,
        RespValue::Error(format!("ASK {} {}", vnode, addr).into()),
    );
}
// Defaults applied when neither the config file nor the CLI overrides a value.
pub const DEFAULT_LISTEN_ADDR: &str = "127.0.0.1:6379"; // Redis-compatible client port
pub const DEFAULT_FABRIC_ADDR: &str = "127.0.0.1:16379"; // inter-node fabric port
pub const DEFAULT_CLUSTER_NAME: &str = "default";
pub const DEFAULT_DATA_DIR: &str = "./data";
pub const DEFAULT_REPLICATION_FACTOR: &str = "3";
pub const DEFAULT_PARTITIONS: &str = "64";
// Hard limits enforced by check_key_len / check_value_len on every request.
pub const MAX_KEY_LEN: usize = 500;
pub const MAX_VALUE_LEN: usize = 10 * 1024 * 1024;

/// Runtime configuration, populated from defaults, the yaml config file
/// (see `read_config_file`) and command-line flags.
#[derive(Debug, Clone)]
pub struct Config {
    pub data_dir: PathBuf,
    pub cluster_name: String,
    // Address clients connect to (RESP protocol).
    pub listen_addr: SocketAddr,
    // Address other cluster nodes connect to.
    pub fabric_addr: SocketAddr,
    // When set, initialize a brand new cluster with these parameters.
    pub cmd_init: Option<InitCommand>,
    pub worker_timer: u32,
    pub worker_count: u16,
    // NOTE(review): "incomming" is misspelled but the name is load-bearing —
    // cfi! looks up yaml keys by field name, so renaming would break configs.
    pub sync_incomming_max: u16,
    pub sync_outgoing_max: u16,
    pub sync_auto: bool,
    // Timeouts below are in milliseconds (parsed via parse_duration).
    pub sync_timeout: u32,
    pub sync_msg_timeout: u32,
    pub sync_msg_inflight: u32,
    pub dht_sync_on_connect: bool,
    pub dht_sync_aae: bool,
    pub fabric_timeout: u32,
    pub request_timeout: u32,
    pub client_connection_max: u32,
    // Cap on concurrent versions kept per value before pruning.
    pub value_version_max: u16,
    pub seed_nodes: Vec<SocketAddr>,
    // TODO: these should be in the cluster config instead
    pub consistency_read: ConsistencyLevel,
    pub consistency_write: ConsistencyLevel,
}
57 | Config { 58 | data_dir: DEFAULT_DATA_DIR.into(), 59 | cluster_name: DEFAULT_CLUSTER_NAME.into(), 60 | listen_addr: DEFAULT_LISTEN_ADDR.parse().unwrap(), 61 | fabric_addr: DEFAULT_FABRIC_ADDR.parse().unwrap(), 62 | cmd_init: None, 63 | worker_timer: 500, 64 | worker_count: max(4, num_cpus::get() as u16 * 2), 65 | sync_incomming_max: 10, 66 | sync_outgoing_max: 10, 67 | sync_timeout: 10_000, 68 | sync_msg_timeout: 1000, 69 | sync_msg_inflight: 10, 70 | sync_auto: true, 71 | dht_sync_on_connect: true, 72 | dht_sync_aae: true, 73 | fabric_timeout: 1000, 74 | request_timeout: 1000, 75 | client_connection_max: 100, 76 | value_version_max: 100, 77 | seed_nodes: Vec::new(), 78 | consistency_read: ConsistencyLevel::One, 79 | consistency_write: ConsistencyLevel::One, 80 | } 81 | } 82 | } 83 | 84 | #[derive(Debug, Clone)] 85 | pub struct InitCommand { 86 | pub replication_factor: u8, 87 | pub partitions: u16, 88 | } 89 | 90 | fn split_number_suffix(s: &str) -> Result<(i64, &str), GenericError> { 91 | let digits_end = s 92 | .trim() 93 | .chars() 94 | .position(|c| !c.is_digit(10)) 95 | .unwrap_or(s.len()); 96 | let (digits, suffix) = s.split_at(digits_end); 97 | Ok((digits.parse::()?, suffix.trim_left())) 98 | } 99 | 100 | pub fn parse_duration(duration_text: &str) -> Result { 101 | let (number, suffix) = split_number_suffix(duration_text)?; 102 | let scale = match suffix.to_lowercase().as_ref() { 103 | "ms" => 1, 104 | "s" => 1000, 105 | "m" => 1000 * 60, 106 | "h" => 1000 * 60 * 60, 107 | _ => return Err(format!("Unknown duration suffix `{}`", suffix).into()), 108 | }; 109 | number.checked_mul(scale).ok_or("Overflow error".into()) 110 | } 111 | 112 | pub fn parse_size(size_text: &str) -> Result { 113 | let (number, suffix) = split_number_suffix(size_text)?; 114 | let scale = match suffix.to_lowercase().as_ref() { 115 | "b" => 1, 116 | "k" | "kb" => 1024, 117 | "m" | "mb" => 1024 * 1024, 118 | "g" | "gb" => 1024 * 1024 * 1024, 119 | _ => return Err(format!("Unknown size 
suffix `{}`", suffix).into()), 120 | }; 121 | number.checked_mul(scale).ok_or("Overflow error".into()) 122 | } 123 | 124 | macro_rules! cfi { 125 | ($yaml:ident, $target:ident, $string:ident, $method:ident) => { 126 | if let Some(v) = $yaml.get(stringify!($string)) { 127 | let v = v 128 | .$method() 129 | .expect(concat!("Can't access field with", stringify!($method))); 130 | $target.$string = v.into(); 131 | } 132 | }; 133 | ($yaml:ident, $target:ident, $string:ident, $method:ident,try_into) => { 134 | if let Some(v) = $yaml.get(stringify!($string)) { 135 | let v = v 136 | .$method() 137 | .expect(concat!("Can't access field with", stringify!($method))); 138 | $target.$string = v 139 | .try_into() 140 | .expect(concat!("Can't convert ", stringify!($string))); 141 | } 142 | }; 143 | ($yaml:ident, $target:ident, $string:ident, $method:ident, $convert:expr) => { 144 | if let Some(v) = $yaml.get(stringify!($string)) { 145 | let v = v.$method().expect(concat!( 146 | "Can't access key ", 147 | stringify!($string), 148 | " with", 149 | stringify!($method) 150 | )); 151 | $target.$string = $convert(v) 152 | .expect(concat!( 153 | "Can't convert ", 154 | stringify!($string), 155 | " with ", 156 | stringify!($convert) 157 | )).try_into() 158 | .expect(concat!("Can't convert ", stringify!($string))); 159 | } 160 | }; 161 | } 162 | 163 | pub fn read_config_file(path: &Path, config: &mut Config) { 164 | debug!("Reading config file"); 165 | let yaml = { 166 | let mut s = String::new(); 167 | File::open(path) 168 | .and_then(|mut f| f.read_to_string(&mut s)) 169 | .expect("Error reading config file"); 170 | yaml::from_str::(&s).expect("Error parsing config file") 171 | }; 172 | debug!("Done reading config file: {:?}", config); 173 | 174 | cfi!(yaml, config, data_dir, as_str); 175 | cfi!(yaml, config, cluster_name, as_str); 176 | cfi!(yaml, config, listen_addr, as_str, SocketAddr::from_str); 177 | cfi!(yaml, config, fabric_addr, as_str, SocketAddr::from_str); 178 | // pub 
cmd_init: Option, 179 | cfi!(yaml, config, worker_timer, as_str, parse_duration); 180 | cfi!(yaml, config, worker_count, as_u64, try_into); 181 | cfi!(yaml, config, sync_incomming_max, as_u64, try_into); 182 | cfi!(yaml, config, sync_outgoing_max, as_u64, try_into); 183 | cfi!(yaml, config, sync_auto, as_bool); 184 | cfi!(yaml, config, sync_timeout, as_str, parse_duration); 185 | cfi!(yaml, config, sync_msg_timeout, as_str, parse_duration); 186 | cfi!(yaml, config, sync_msg_inflight, as_u64, try_into); 187 | cfi!(yaml, config, fabric_timeout, as_str, parse_duration); 188 | cfi!(yaml, config, request_timeout, as_str, parse_duration); 189 | cfi!(yaml, config, client_connection_max, as_u64, try_into); 190 | cfi!(yaml, config, value_version_max, as_u64, try_into); 191 | cfi!( 192 | yaml, 193 | config, 194 | consistency_read, 195 | as_str, 196 | ConsistencyLevel::from_str 197 | ); 198 | cfi!( 199 | yaml, 200 | config, 201 | consistency_write, 202 | as_str, 203 | ConsistencyLevel::from_str 204 | ); 205 | 206 | if let Some(v) = yaml.get("seed_nodes") { 207 | config.seed_nodes = v 208 | .as_sequence() 209 | .expect("seed_nodes is not a sequence") 210 | .iter() 211 | .map(|v| { 212 | v.as_str() 213 | .expect("seed_nodes element is not a string") 214 | .parse() 215 | .expect("seed_nodes element can't be parsed") 216 | }).collect(); 217 | } 218 | 219 | if let Some(config_value) = yaml.get("logging") { 220 | setup_logging(config_value); 221 | } 222 | } 223 | 224 | pub fn setup_logging(config_value: &yaml::Value) { 225 | let raw_config: log4rs::file::RawConfig = 226 | yaml::from_value(config_value.clone()).expect("failed to parse logging config"); 227 | 228 | let (appenders, errors) = raw_config.appenders_lossy(&Default::default()); 229 | if !errors.is_empty() { 230 | panic!("failed to configure logging: {:?}", errors); 231 | } 232 | 233 | let (config, errors) = log4rs::config::Config::builder() 234 | .appenders(appenders) 235 | .loggers(raw_config.loggers()) 236 | 
/// Fallback logging used when the config file has no `logging:` section:
/// a stderr console appender, `sucredb` at Info, everything else Off.
pub fn setup_default_logging() {
    let config = log4rs::config::Config::builder()
        .appender(
            log4rs::config::Appender::builder().build(
                "console",
                Box::new(
                    log4rs::append::console::ConsoleAppender::builder()
                        // stderr keeps stdout clean for command output / piping.
                        .target(log4rs::append::console::Target::Stderr)
                        .build(),
                ),
            ),
        ).logger(
            log4rs::config::Logger::builder()
                .appender("console")
                .build("sucredb", log::LevelFilter::Info),
        ).build(log4rs::config::Root::builder().build(log::LevelFilter::Off))
        .expect("failed to setup default logging");

    log4rs::init_config(config).expect("failed to init logging");
}

/// One stored value, in one of four CRDT flavors plus the empty `Void`.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub enum Cube {
    // the order is used to merge different types in a deterministic way
    Counter(Counter),
    Value(Value),
    Map(Map),
    Set(Set),
    // A deleted/absent value that still carries its causal context.
    Void(VersionVector),
}
impl_into{ 25 | ($s:ident, $v:ident) => { 26 | pub fn $s(self) -> Option<$v>{ 27 | match self { 28 | Cube::$v(a) => Some(a), 29 | Cube::Void(a) => Some($v::with(a)), 30 | _ => None, 31 | } 32 | } 33 | } 34 | } 35 | 36 | impl Default for Cube { 37 | fn default() -> Self { 38 | Cube::Void(Default::default()) 39 | } 40 | } 41 | 42 | impl Cube { 43 | pub fn is_subsumed(&self, bvv: &BitmappedVersionVector) -> bool { 44 | use self::Cube::*; 45 | match *self { 46 | Counter(ref a) => a.values.is_empty() && a.vv.contained(bvv), 47 | Value(ref a) => a.values.is_empty() && a.vv.contained(bvv), 48 | Map(ref a) => a.values.is_empty() && a.vv.contained(bvv), 49 | Set(ref a) => a.values.is_empty() && a.vv.contained(bvv), 50 | Void(_) => unreachable!(), 51 | } 52 | } 53 | 54 | impl_into!(into_value, Value); 55 | impl_into!(into_counter, Counter); 56 | impl_into!(into_map, Map); 57 | impl_into!(into_set, Set); 58 | 59 | // minimum set of dots required to assemble this cube 60 | // see comment at the bottom 61 | pub fn for_each_dot(&self, mut cb: CB) { 62 | use self::Cube::*; 63 | match *self { 64 | Counter(ref a) => a.values.iter().for_each(|(&i, &(v, _))| cb(i, v)), 65 | Value(ref a) => a.values.iter().for_each(|(&(i, v), _)| cb(i, v)), 66 | Map(ref a) => a.dots.iter().for_each(|(i, v)| cb(i, v)), 67 | Set(ref a) => a.dots.iter().for_each(|(i, v)| cb(i, v)), 68 | Void(_) => unreachable!(), 69 | } 70 | } 71 | 72 | pub fn new(bvv: &BitmappedVersionVector) -> Cube { 73 | let mut vv = VersionVector::new(); 74 | for (&n, bv) in bvv.iter() { 75 | vv.add(n, bv.base()); 76 | } 77 | Cube::Void(vv) 78 | } 79 | 80 | pub fn del(&mut self, id: Id, version: Version, vv: &VersionVector) -> bool { 81 | use self::Cube::*; 82 | match *self { 83 | Counter(ref mut a) => a.clear(id, version), 84 | Value(ref mut a) => a.set(id, version, None, vv), 85 | Map(ref mut a) => a.clear(id, version), 86 | Set(ref mut a) => a.clear(id, version), 87 | Void(_) => return false, 88 | } 89 | true 90 | } 91 | 92 | pub 
fn merge(self, other: Self) -> Self { 93 | use self::Cube::*; 94 | match (self, other) { 95 | (Counter(a), Counter(b)) => Counter(a.merge(b)), 96 | (Value(a), Value(b)) => Value(a.merge(b)), 97 | (Map(a), Map(b)) => Map(a.merge(b)), 98 | (Set(a), Set(b)) => Set(a.merge(b)), 99 | (Void(vv), a) | (a, Void(vv)) => match a { 100 | Counter(a) => Counter(a.merge(self::Counter::with(vv))), 101 | Value(a) => Value(a.merge(self::Value::with(vv))), 102 | Map(a) => Map(a.merge(self::Map::with(vv))), 103 | Set(a) => Set(a.merge(self::Set::with(vv))), 104 | Void(mut o_vv) => { 105 | o_vv.merge(&vv); 106 | Void(o_vv) 107 | } 108 | }, 109 | (a, b) => { 110 | warn!("Merging Cubes with different types"); 111 | #[allow(unreachable_patterns)] 112 | match (a, b) { 113 | (Counter(a), _) | (_, Counter(a)) => Counter(a), 114 | (Value(a), _) | (_, Value(a)) => Value(a), 115 | (Map(a), _) | (_, Map(a)) => Map(a), 116 | (Set(a), _) | (_, Set(a)) => Set(a), 117 | (Void(_), _) | (_, Void(_)) => unreachable!(), 118 | } 119 | } 120 | } 121 | } 122 | } 123 | 124 | // RWCounter 125 | #[derive(Clone, Debug, Serialize, Deserialize)] 126 | pub struct Counter { 127 | values: LinearMap, 128 | vv: VersionVector, 129 | } 130 | 131 | impl Counter { 132 | fn with(vv: VersionVector) -> Self { 133 | Counter { 134 | values: Default::default(), 135 | vv, 136 | } 137 | } 138 | 139 | pub fn get(&self) -> i64 { 140 | self.values.values().map(|&(_, c)| c).sum() 141 | } 142 | 143 | pub fn inc(&mut self, node: Id, version: Version, by: i64) -> i64 { 144 | self.vv.add(node, version); 145 | let version_counter = self.values.entry(node).or_insert((0, 0)); 146 | version_counter.0 = version; 147 | version_counter.1 += by; 148 | version_counter.1 149 | } 150 | 151 | pub fn clear(&mut self, node: Id, version: Version) { 152 | self.values.clear(); 153 | self.vv.add(node, version); 154 | } 155 | 156 | fn merge(mut self, other: Self) -> Self { 157 | for (id, other) in other.values { 158 | match self.values.entry(id) { 159 | 
LMEntry::Occupied(mut oc) => if other.0 > oc.get().0 { 160 | *oc.get_mut() = other; 161 | }, 162 | LMEntry::Vacant(va) => { 163 | va.insert(other); 164 | } 165 | } 166 | } 167 | self 168 | } 169 | } 170 | 171 | // MultiRegister 172 | #[derive(Clone, Debug, Serialize, Deserialize)] 173 | pub struct Value { 174 | values: DotMap>, 175 | vv: VersionVector, 176 | } 177 | 178 | impl Value { 179 | fn with(vv: VersionVector) -> Self { 180 | Value { 181 | values: Default::default(), 182 | vv, 183 | } 184 | } 185 | 186 | pub fn len(&self) -> usize { 187 | self.values.len() 188 | } 189 | 190 | pub fn set(&mut self, node: Id, version: Version, value: Option, vv: &VersionVector) { 191 | self.values.discard(vv); 192 | self.values.insert(node, version, value); 193 | self.vv.add(node, version); 194 | } 195 | 196 | fn merge(mut self, mut other: Self) -> Self { 197 | self.values.merge(&mut other.values, &self.vv, &other.vv); 198 | self.vv.merge(&other.vv); 199 | self 200 | } 201 | } 202 | 203 | /// Actor Observed removal 204 | /// Add wins on conflict 205 | #[derive(Clone, Debug, Serialize, Deserialize)] 206 | pub struct Set { 207 | values: CausalMap, 208 | dots: VersionVector, 209 | vv: VersionVector, 210 | } 211 | 212 | impl Set { 213 | fn with(vv: VersionVector) -> Self { 214 | Set { 215 | values: Default::default(), 216 | dots: Default::default(), 217 | vv, 218 | } 219 | } 220 | 221 | pub fn insert(&mut self, node: Id, version: Version, item: Bytes) -> bool { 222 | let result = self 223 | .values 224 | .insert(item, DotSet::from_dot((node, version))) 225 | .is_none(); 226 | self.vv.add(node, version); 227 | self.dots.add(node, version); 228 | result 229 | } 230 | 231 | pub fn remove(&mut self, node: Id, version: Version, item: &[u8]) -> bool { 232 | let result = self.values.remove(item).is_some(); 233 | self.vv.add(node, version); 234 | self.dots.add(node, version); 235 | result 236 | } 237 | 238 | pub fn clear(&mut self, node: Id, version: Version) { 239 | self.values.clear(); 
240 | self.vv.add(node, version); 241 | self.dots.add(node, version); 242 | } 243 | 244 | fn merge(mut self, mut other: Self) -> Self { 245 | self.values.merge(&mut other.values, &self.vv, &other.vv); 246 | self.vv.merge(&other.vv); 247 | self.dots.merge(&other.dots); 248 | self 249 | } 250 | } 251 | 252 | // Actor Observed removal 253 | // LWW on value conflict (max as tiebreaker) 254 | #[derive(Clone, Debug, Serialize, Deserialize)] 255 | pub struct Map { 256 | values: CausalMap, 257 | dots: VersionVector, 258 | vv: VersionVector, 259 | } 260 | 261 | impl Map { 262 | fn with(vv: VersionVector) -> Self { 263 | Map { 264 | values: Default::default(), 265 | dots: Default::default(), 266 | vv, 267 | } 268 | } 269 | 270 | pub fn insert(&mut self, node: Id, version: Version, key: Bytes, value: Bytes) -> bool { 271 | let result = self 272 | .values 273 | .insert(key, MapValue::new((node, version), value)) 274 | .is_none(); 275 | self.vv.add(node, version); 276 | self.dots.add(node, version); 277 | result 278 | } 279 | 280 | pub fn remove(&mut self, node: Id, version: Version, key: &[u8]) -> bool { 281 | let result = self.values.remove(key).is_some(); 282 | self.vv.add(node, version); 283 | self.dots.add(node, version); 284 | result 285 | } 286 | 287 | pub fn clear(&mut self, node: Id, version: Version) { 288 | self.values.clear(); 289 | self.vv.add(node, version); 290 | self.dots.add(node, version); 291 | } 292 | 293 | fn merge(mut self, mut other: Self) -> Self { 294 | self.values.merge(&mut other.values, &self.vv, &other.vv); 295 | self.vv.merge(&other.vv); 296 | self.dots.merge(&other.dots); 297 | self 298 | } 299 | } 300 | 301 | #[derive(Clone, Debug, Default, Serialize, Deserialize)] 302 | struct MapValue { 303 | dots: DotSet, 304 | value: Bytes, 305 | timestamp: u64, // millis since epoch 306 | } 307 | 308 | impl MapValue { 309 | fn new(dot: (Id, Version), value: Bytes) -> Self { 310 | let timestamp = time::UNIX_EPOCH.elapsed().unwrap(); 311 | MapValue { 312 | 
impl CausalValue for MapValue {
    /// Merges two concurrent entries for the same key.
    /// Dots merge causally; the payload is LWW by timestamp, with
    /// max(value) as the tiebreaker when timestamps are equal.
    fn merge(&mut self, other: &mut Self, s_vv: &VV, o_vv: &VV) {
        self.dots.merge(&mut other.dots, s_vv, o_vv);
        // resolve possible value collision
        // if timestamps are equal value becomes max(a, b)
        if self.timestamp > other.timestamp {
            // nothing to do
        } else if other.timestamp > self.timestamp || other.value > self.value {
            self.timestamp = other.timestamp;
            // swap (not clone) — `other` is discarded after the merge.
            ::std::mem::swap(&mut self.value, &mut other.value);
        }
    }

    /// An entry with no surviving dots has been causally removed.
    fn is_empty(&self) -> bool {
        self.dots.is_empty()
    }
}

/// Renders a Value cube as [versions..., serialized version-vector].
/// The vv is always the final array element so clients can do causal writes.
pub fn render_value(cube: Cube) -> RespValue {
    match cube {
        Cube::Value(v) => {
            let serialized_vv = bincode::serialize(&v.vv).unwrap();
            let mut values: Vec<_> = v
                .values
                .into_iter()
                // Deleted versions (None payloads) are skipped.
                .filter_map(|(_, ov)| ov.map(RespValue::Data))
                .collect();
            values.push(RespValue::Data(serialized_vv.into()));
            RespValue::Array(values)
        }
        Cube::Void(vv) => {
            // Absent key: reply with just the causal context.
            let serialized_vv = bincode::serialize(&vv).unwrap();
            RespValue::Array(vec![RespValue::Data(serialized_vv.into())])
        }
        _ => CommandError::TypeError.into(),
    }
}

/// Renders a Counter cube as an integer; absent counters render as Nil.
pub fn render_counter(cube: Cube) -> RespValue {
    match cube {
        Cube::Counter(c) => RespValue::Int(c.get()),
        Cube::Void(_vv) => RespValue::Nil,
        _ => CommandError::TypeError.into(),
    }
}

/// Renders the cube's type name, mirroring redis TYPE where possible.
pub fn render_type(cube: Cube) -> RespValue {
    use self::Cube::*;
    let ty = match cube {
        Counter(_) => "counter", // non-standard
        Value(_) => "string",
        Map(_) => "hash",
        Set(_) => "set",
        Void(_) => "none",
    };
    RespValue::Data(ty.into())
}
cube { 379 | Cube::Map(m) => { 380 | let mut array = Vec::with_capacity(m.values.len() * 2); 381 | for (k, v) in m.values.into_iter() { 382 | array.push(RespValue::Data(k)); 383 | array.push(RespValue::Data(v.value)); 384 | } 385 | RespValue::Array(array) 386 | } 387 | Cube::Void(_) => RespValue::Array(vec![]), 388 | _ => CommandError::TypeError.into(), 389 | } 390 | } 391 | 392 | pub fn render_set(cube: Cube) -> RespValue { 393 | match cube { 394 | Cube::Set(s) => { 395 | let array = s 396 | .values 397 | .into_iter() 398 | .map(|(v, _)| RespValue::Data(v)) 399 | .collect(); 400 | RespValue::Array(array) 401 | } 402 | Cube::Void(_) => RespValue::Array(vec![]), 403 | _ => CommandError::TypeError.into(), 404 | } 405 | } 406 | 407 | /* 408 | Using the vv from cubes to track key dots (the latest version from each node) doesn't work, example: 409 | 410 | -> n3 is partitioned out 411 | -> n1 "SET a v" gets dot n1-1 412 | n1: a => [n1-1 v][n1 1] log: n1-1 => a 413 | n2: a => [n1-1 v][n1 1] log: n1-1 => a 414 | n3: -- 415 | 416 | -> n2 "SET a z [n1 1]" dot n2-1 417 | n1: a => [n2-1 z][n1 1, n2 1] log: n1-1 => a, n2-1 => a 418 | n2: a => [n2-1 z][n1 1, n2 1] log: n1-1 => a, n2-1 => a 419 | n3: -- 420 | 421 | -> n2 "SET b y" gets dot n2-2 422 | n1: b => [n2-2 y][n1 1, n2 2] a => [n2-1 z][n1 1, n2 1] log: n1-1 => a, n2-1 => a, n2-2 => b 423 | n2: b => [n2-2 y][n1 1, n2 2] a => [n2-1 z][n1 1, n2 1] log: n1-1 => a, n2-1 => a, n2-2 => b 424 | n3: -- 425 | 426 | -> n3 can receive messages 427 | -> n2 "SET c y" gets dot n2-3 (merges with void cube w/ [n1 1, n2 2]) 428 | n1: c => [n2-3 y][n1 1, n2 3] b => [n2-2 y][n1 1, n2 2] a => [n2-1 z][n1 1, n2 1] log: n1-1 => a, n2-1 => a, n2-2 => b, n2-3 => c 429 | n2: c => [n2-3 y][n1 1, n2 3] b => [n2-2 y][n1 1, n2 2] a => [n2-1 z][n1 1, n2 1] log: n1-1 => a, n2-1 => a, n2-2 => b, n2-3 => c 430 | n3: c => [n2-3 y][n1 1, n2 3] log: n2-3 => c, n1-1 => c 431 | 432 | n3 stores (n1-1 => c) in dot log but that's wrong 433 | 434 | Problem: 435 | 
VV from Voids pollutes the vv of new writes.

Solution:
For the MultiRegister the DotMap with optional values works as its own dot tracker,
even if it could have more than 1 version per actor the number should stay low.
Counters are similar to the MultiRegister case.
Sets/Maps carry an additional VV to track the dots.
Voids don't need any dot, their state is empty and their history is contained in the node clock (previous dots).

*/

/*
Optimized bootstrap doesn't work?

Given n1 with lots of data churn and the only kv left is k => [n1-100 v][n1 100]
node clock says [n1 1000] and logs have all the expected entries

When n2 bootstraps, n1 will send only non-deleted kvs as this is always <= number of dots (optimized bootstrap).
Thus only k => [n1-100 v][n1 100] is sent.
n2 will store k as above and log will contain only n1-100 => k
syncfin will update n2 node clock to [n1 1000]

If asked for any key other than k it'll return the same response as n1.
That is a void with a causal context [n1 1000]

Problem:
What if n3 asks to sync dots n1-101 n1-102 ... with n2? It'll get nothing but it'll get its n1 clock bumped to 1000
If key y was deleted as dot n1-101 the delete will never get propagated to n3.

Fix: on SyncFin sync only the remote (n2) part of the clock, like: n3LocalClock[n2].merge(n2RemoteClock[n2])
It was in the updated paper all the time, I just didn't see it.

*/

/*
AAE based bootstrap any better?

Given n1 with lots of data churn and the only kv left is k => [n1-100 v][n1 100]
node clock says [n1 1000] and logs have all the expected entries

n2 comes up (or it was partitioned the entire time) and wants to sync with n1.
It needs all dots of n1, it'll get k => [n1-100 v][n1 100] and voids for all other keys.
// u32(le) payload len + bincode payload
struct FramedBincodeCodec;

impl codec::Decoder for FramedBincodeCodec {
    type Item = FabricMsg;
    type Error = io::Error;

    /// Tries to decode one length-prefixed bincode frame from `src`.
    /// Returns Ok(None) when the buffer does not yet hold a full frame.
    fn decode(&mut self, src: &mut BytesMut) -> io::Result<Option<Self::Item>> {
        // Compute (bytes consumed, outcome) on a borrowed view first, then
        // advance `src` exactly once below — keeps the borrow checker happy
        // and guarantees consumed == 0 on every partial-frame path.
        let (consumed, result) = {
            let mut bytes: &[u8] = &*src;
            if let Ok(msg_len) = bytes.read_u32::<LittleEndian>() {
                // `bytes` has been advanced past the 4-byte header here.
                if bytes.len() >= msg_len as usize {
                    match bincode::deserialize_from(&mut bytes) {
                        // Header (4) + payload are consumed on success.
                        Ok(v) => (4 + msg_len as usize, Ok(Some(v))),
                        Err(e) => (0, Err(into_io_error(e))),
                    }
                } else {
                    // Payload not fully buffered yet.
                    (0, Ok(None))
                }
            } else {
                // Fewer than 4 bytes buffered: header incomplete.
                (0, Ok(None))
            }
        };
        src.split_to(consumed);
        result
    }
}
bincode::serialized_size(&item).unwrap(); 58 | let mut dst = BytesMut::with_capacity(item_size as usize + 4); 59 | dst.put_u32_le(item_size as u32); 60 | bincode::serialize_into(&mut (&mut dst).writer(), &item).unwrap(); 61 | dst.into() 62 | } 63 | } 64 | 65 | impl codec::Encoder for FramedBincodeCodec { 66 | type Item = Bytes; 67 | type Error = io::Error; 68 | 69 | fn encode(&mut self, item: Self::Item, dst: &mut BytesMut) -> io::Result<()> { 70 | dst.reserve(item.len()); 71 | dst.put(&item); 72 | Ok(()) 73 | } 74 | } 75 | 76 | pub type FabricMsgFn = Box; 77 | pub type FabricConFn = Box; 78 | 79 | type SenderChan = fmpsc::UnboundedSender; 80 | type InitType = io::Result<(Arc, foneshot::Sender<()>)>; 81 | 82 | const FABRIC_KEEPALIVE_MS: u64 = 1000; 83 | const FABRIC_RECONNECT_INTERVAL_MS: u64 = 1000; 84 | 85 | /// The messaging network that encompasses all nodes of the cluster 86 | /// using the fabric you can send messages (best-effort delivery) 87 | /// to any registered node. 88 | /// Currently each node keeps a connection to every other node. Due to the 89 | /// full-duplex nature of tcp this gives 2 pipes to each server, both are 90 | /// used to make better use of the socket buffers (is this a good idea though?). 91 | /// This also helps parallelism as an eventual big message won't affect 92 | /// the latency as much. 
93 | pub struct Fabric { 94 | context: Arc, 95 | loop_thread: Option<( 96 | foneshot::Sender<()>, 97 | thread::JoinHandle>, 98 | )>, 99 | } 100 | 101 | struct ReaderContext { 102 | context: Arc, 103 | peer: NodeId, 104 | } 105 | 106 | struct WriterContext { 107 | context: Arc, 108 | peer: NodeId, 109 | connection_id: usize, 110 | } 111 | 112 | struct SharedContext { 113 | node: NodeId, 114 | addr: SocketAddr, 115 | loop_remote: tokio::reactor::Remote, 116 | msg_handlers: RwLock>, 117 | con_handlers: RwLock>, 118 | // TODO: unify nodes_addr and connections maps 119 | nodes_addr: RwLock>, 120 | connections: RwLock>>, 121 | connection_gen: AtomicUsize, 122 | } 123 | 124 | impl SharedContext { 125 | fn register_node(&self, peer: NodeId, peer_addr: SocketAddr) -> Option { 126 | self.nodes_addr.write().unwrap().insert(peer, peer_addr) 127 | } 128 | 129 | fn remove_node(&self, peer: NodeId) -> Option { 130 | self.nodes_addr.write().unwrap().remove(&peer) 131 | } 132 | 133 | fn register_connection(&self, peer: NodeId, sender: SenderChan) -> usize { 134 | let connection_id = self.connection_gen.fetch_add(1, Ordering::Relaxed); 135 | debug!( 136 | "register_connection peer: {}, id: {:?}", 137 | peer, connection_id 138 | ); 139 | let is_new = { 140 | let mut locked = self.connections.write().unwrap(); 141 | let entry = locked.entry(peer).or_insert_with(Default::default); 142 | let is_new = entry.is_empty(); 143 | entry.push((connection_id, sender)); 144 | is_new 145 | }; 146 | if is_new { 147 | for handler in &*self.con_handlers.read().unwrap() { 148 | handler(peer); 149 | } 150 | } 151 | connection_id 152 | } 153 | 154 | fn remove_connection(&self, peer: NodeId, connection_id: usize) { 155 | debug!("Remove_connection peer: {}, id: {:?}", peer, connection_id); 156 | let mut locked = self.connections.write().unwrap(); 157 | if let HMEntry::Occupied(mut o) = locked.entry(peer) { 158 | let p = o 159 | .get() 160 | .iter() 161 | .position(|x| x.0 == connection_id) 162 | 
.expect("connection_id not found"); 163 | o.get_mut().swap_remove(p); 164 | // cleanup entry if empty 165 | if o.get().is_empty() { 166 | o.remove(); 167 | } 168 | } else { 169 | panic!("Peer not found in connections"); 170 | } 171 | } 172 | } 173 | 174 | impl ReaderContext { 175 | fn new(context: Arc, peer: NodeId) -> Self { 176 | ReaderContext { 177 | context: context, 178 | peer: peer, 179 | } 180 | } 181 | 182 | fn dispatch(&self, msg: FabricMsg) { 183 | let msg_type = msg.get_type(); 184 | if let Some(handler) = self 185 | .context 186 | .msg_handlers 187 | .read() 188 | .unwrap() 189 | .get(&(msg_type as u8)) 190 | { 191 | trace!("recv from {:?} {:?}", self.peer, msg); 192 | handler(self.peer, msg); 193 | } else { 194 | error!("No handler for msg type {:?}", msg_type); 195 | } 196 | } 197 | } 198 | 199 | impl WriterContext { 200 | fn new(context: Arc, peer: NodeId, sender: SenderChan) -> Self { 201 | let connection_id = context.register_connection(peer, sender); 202 | WriterContext { 203 | context: context, 204 | peer: peer, 205 | connection_id: connection_id, 206 | } 207 | } 208 | } 209 | 210 | impl Drop for WriterContext { 211 | fn drop(&mut self) { 212 | self.context 213 | .remove_connection(self.peer, self.connection_id); 214 | } 215 | } 216 | 217 | impl Fabric { 218 | fn listen( 219 | listener: tokio::net::TcpListener, 220 | context: Arc, 221 | handle: tokio::reactor::Handle, 222 | ) -> Box> { 223 | debug!("Starting fabric listener"); 224 | let fut = listener 225 | .incoming() 226 | .for_each(move |(socket, addr)| { 227 | debug!("Accepting connection from {:?}", addr); 228 | let context_cloned = context.clone(); 229 | handle.spawn( 230 | Self::handshake(socket, context_cloned) 231 | .and_then(move |(s, peer_id, context)| { 232 | Self::steady_connection(s, peer_id, context) 233 | }).then(|_| Ok(())), 234 | ); 235 | Ok(()) 236 | }).map_err(|_| ()); 237 | Box::new(fut) 238 | } 239 | 240 | fn connect( 241 | expected_node: Option, 242 | addr: SocketAddr, 243 
| context: Arc, 244 | handle: tokio::reactor::Handle, 245 | ) -> Box> { 246 | debug!("Connecting to node {:?}: {:?}", expected_node, addr); 247 | let context1 = context.clone(); 248 | let handle1 = handle.clone(); 249 | let handle2 = handle.clone(); 250 | 251 | let fut = tokio::net::TcpStream::connect(&addr, &handle) 252 | .select2( 253 | tokio::reactor::Timeout::new( 254 | Duration::from_millis(FABRIC_RECONNECT_INTERVAL_MS), 255 | &handle, 256 | ).expect("Can't create connect timeout"), 257 | ).then(|r| match r { 258 | Ok(Either::A((s, _))) => Ok(s), 259 | Ok(Either::B(_)) => Err(io::ErrorKind::TimedOut.into()), 260 | Err(either) => Err(either.split().0), 261 | }).and_then(move |s| Self::handshake(s, context)) 262 | .and_then(move |(s, peer_id, context)| Self::steady_connection(s, peer_id, context)) 263 | .then(move |_| { 264 | tokio::reactor::Timeout::new( 265 | Duration::from_millis(FABRIC_RECONNECT_INTERVAL_MS), 266 | &handle1, 267 | ).expect("Can't create reconnect timeout") 268 | }).and_then(move |_| { 269 | let node = expected_node.ok_or(io::ErrorKind::NotFound)?; 270 | let addr_opt = { 271 | let locked = context1.nodes_addr.read().unwrap(); 272 | locked.get(&node).cloned() 273 | }; 274 | if let Some(addr) = addr_opt { 275 | debug!("Reconnecting fabric connection to {:?}", addr); 276 | handle2.spawn(Self::connect( 277 | expected_node, 278 | addr, 279 | context1, 280 | handle2.clone(), 281 | )); 282 | } 283 | Ok(()) 284 | }); 285 | Box::new(fut.map_err(|_| ())) 286 | } 287 | 288 | fn handshake( 289 | socket: tokio::net::TcpStream, 290 | context: Arc, 291 | ) -> Box), Error = io::Error>> 292 | { 293 | debug!("Stablished connection with {:?}", socket.peer_addr()); 294 | let _ = socket.set_nodelay(true); 295 | let _ = socket.set_keepalive(Some(Duration::from_millis(FABRIC_KEEPALIVE_MS))); 296 | let mut buffer = [0u8; 8]; 297 | (&mut buffer[..]) 298 | .write_u64::(context.node) 299 | .unwrap(); 300 | let fut = tokio_io::write_all(socket, buffer) 301 | 
.and_then(|(s, b)| tokio_io::read_exact(s, b)) 302 | .and_then(move |(s, b)| { 303 | let peer_id = (&b[..]).read_u64::().unwrap(); 304 | debug!("Identified connection to node {}", peer_id); 305 | Ok((s, peer_id, context)) 306 | }); 307 | 308 | Box::new(fut) 309 | } 310 | 311 | fn steady_connection( 312 | socket: tokio::net::TcpStream, 313 | peer: NodeId, 314 | context: Arc, 315 | ) -> Box> { 316 | let (socket_rx, socket_tx) = socket.split(); 317 | let socket_tx = codec::FramedWrite::new(socket_tx, FramedBincodeCodec); 318 | let socket_rx = codec::FramedRead::new(socket_rx, FramedBincodeCodec); 319 | let (chan_tx, chan_rx) = fmpsc::unbounded(); 320 | 321 | let ctx_rx = ReaderContext::new(context.clone(), peer); 322 | let fut_rx = socket_rx.for_each(move |msg| { 323 | ctx_rx.dispatch(msg); 324 | Ok(()) 325 | }); 326 | 327 | let ctx_tx = WriterContext::new(context, peer, chan_tx); 328 | let fut_tx = socket_tx 329 | .send_all(chan_rx.map_err(|_| io::Error::from(io::ErrorKind::Other))) 330 | .then(move |r| { 331 | // hold onto ctx_tx until the stream is done 332 | drop(ctx_tx); 333 | r.map(|_| ()) 334 | }); 335 | 336 | Box::new(fut_rx.select(fut_tx).map(|_| ()).map_err(|(e, _)| e)) 337 | } 338 | 339 | fn init( 340 | node: NodeId, 341 | config: Config, 342 | handle: tokio::reactor::Handle, 343 | ) -> Result, GenericError> { 344 | let context = Arc::new(SharedContext { 345 | node: node, 346 | addr: config.fabric_addr, 347 | loop_remote: handle.remote().clone(), 348 | nodes_addr: Default::default(), 349 | msg_handlers: Default::default(), 350 | con_handlers: Default::default(), 351 | connections: Default::default(), 352 | connection_gen: Default::default(), 353 | }); 354 | 355 | let listener = tokio::net::TcpListener::bind(&context.addr, &handle)?; 356 | handle.spawn(Self::listen(listener, context.clone(), handle.clone())); 357 | 358 | Ok(context) 359 | } 360 | 361 | pub fn node(&self) -> NodeId { 362 | self.context.node 363 | } 364 | 365 | pub fn addr(&self) -> SocketAddr 
{ 366 | self.context.addr 367 | } 368 | 369 | pub fn new(node: NodeId, config: &Config) -> Result { 370 | let config = config.clone(); 371 | let (init_tx, init_rx) = mpsc::channel(); 372 | let thread = thread::Builder::new() 373 | .name(format!("Fabric:{}", node)) 374 | .spawn(move || { 375 | let mut core = tokio::reactor::Core::new().unwrap(); 376 | let (completer_tx, completer_rx) = foneshot::channel(); 377 | init_tx.send(Self::init(node, config, core.handle()).map(|c| (c, completer_tx)))?; 378 | core.run(completer_rx).map_err(From::from) 379 | }).unwrap(); 380 | let (context, completer) = init_rx.recv()??; 381 | Ok(Fabric { 382 | context: context, 383 | loop_thread: Some((completer, thread)), 384 | }) 385 | } 386 | 387 | pub fn register_msg_handler(&self, msg_type: FabricMsgType, handler: FabricMsgFn) { 388 | self.context 389 | .msg_handlers 390 | .write() 391 | .unwrap() 392 | .insert(msg_type as u8, handler); 393 | } 394 | 395 | pub fn register_con_handler(&self, handler: FabricConFn) { 396 | self.context.con_handlers.write().unwrap().push(handler); 397 | } 398 | 399 | pub fn register_seed(&self, addr: SocketAddr) { 400 | self.start_connect(None, addr) 401 | } 402 | 403 | pub fn register_node(&self, node: NodeId, addr: SocketAddr) { 404 | let prev = self.context.register_node(node, addr); 405 | if prev != Some(addr) { 406 | self.start_connect(Some(node), addr); 407 | } 408 | } 409 | 410 | pub fn remove_node(&self, node: NodeId) { 411 | self.context.remove_node(node); 412 | } 413 | 414 | pub fn connections(&self) -> Vec { 415 | let writers = self.context.connections.read().unwrap(); 416 | writers 417 | .iter() 418 | .filter(|&(_, c)| !c.is_empty()) 419 | .map(|(&n, _)| n) 420 | .collect() 421 | } 422 | 423 | pub fn set_nodes(&self, it: I) 424 | where 425 | I: Iterator, 426 | { 427 | let mut nodes = self.context.nodes_addr.write().unwrap(); 428 | let mut x_nodes = nodes.clone(); 429 | for (node, addr) in it { 430 | if node != self.context.node { 431 | 
x_nodes.remove(&node); 432 | if nodes.insert(node, addr) != Some(addr) { 433 | self.start_connect(Some(node), addr); 434 | } 435 | } 436 | } 437 | for (node, _) in x_nodes { 438 | nodes.remove(&node); 439 | } 440 | } 441 | 442 | fn start_connect(&self, expected_node: Option, addr: SocketAddr) { 443 | let context = self.context.clone(); 444 | let context_cloned = context.clone(); 445 | context 446 | .loop_remote 447 | .spawn(move |h| Self::connect(expected_node, addr, context_cloned, h.clone())); 448 | } 449 | 450 | // TODO: take msgs as references and buffer serialized bytes instead 451 | pub fn send_msg<'a, T: Into>>( 452 | &'a self, 453 | node: NodeId, 454 | msg: T, 455 | ) -> Result<(), FabricError> { 456 | let msg = msg.into(); 457 | debug!("send_msg node:{} {:?}", node, msg); 458 | if node == self.context.node { 459 | panic!("Can't send message to self"); 460 | } 461 | if cfg!(test) { 462 | let droppable = match msg.get_type() { 463 | FabricMsgType::Crud => false, 464 | _ => true, 465 | }; 466 | if droppable { 467 | let fabric_drop = ::std::env::var("FABRIC_DROP") 468 | .ok() 469 | .map(|s| s.parse::().expect("Can't parse FABRIC_DROP")) 470 | .unwrap_or(0.0); 471 | if fabric_drop > 0.0 && thread_rng().gen::() < fabric_drop { 472 | warn!("Fabric msg droped due to FABRIC_DROP: {:?}", msg); 473 | return Ok(()); 474 | } 475 | } 476 | } 477 | 478 | let serialized_msg = FramedBincodeCodec::serialize(msg); 479 | let connections = self.context.connections.read().unwrap(); 480 | if let Some(o) = connections.get(&node) { 481 | if let Some(&(connection_id, ref chan)) = thread_rng().choose::<(_, _)>(o) { 482 | if let Err(_) = chan.unbounded_send(serialized_msg) { 483 | warn!("Can't send to fabric {}-{} chan", node, connection_id,); 484 | } else { 485 | return Ok(()); 486 | } 487 | } else { 488 | warn!("DROPING MSG - No channel available for {:?}", node); 489 | } 490 | } else { 491 | warn!("DROPING MSG - No entry for node {:?}", node); 492 | } 493 | 494 | 
Err(FabricError::NoRoute) 495 | } 496 | } 497 | 498 | impl Drop for Fabric { 499 | fn drop(&mut self) { 500 | warn!("droping fabric"); 501 | if let Some((c, t)) = self.loop_thread.take() { 502 | let _ = c.send(()); 503 | let _ = t.join(); 504 | } 505 | } 506 | } 507 | 508 | #[cfg(test)] 509 | mod tests { 510 | use super::*; 511 | use config::Config; 512 | use env_logger; 513 | use std::sync::{atomic, Arc}; 514 | use std::thread; 515 | use std::time::Duration; 516 | 517 | #[test] 518 | fn test() { 519 | let _ = env_logger::try_init(); 520 | let config1 = Config { 521 | fabric_addr: "127.0.0.1:6481".parse().unwrap(), 522 | ..Default::default() 523 | }; 524 | let config2 = Config { 525 | fabric_addr: "127.0.0.1:6482".parse().unwrap(), 526 | ..Default::default() 527 | }; 528 | let fabric1 = Fabric::new(1, &config1).unwrap(); 529 | let fabric2 = Fabric::new(2, &config2).unwrap(); 530 | fabric1.register_node(2, "127.0.0.1:6482".parse().unwrap()); 531 | fabric2.register_node(1, "127.0.0.1:6481".parse().unwrap()); 532 | thread::sleep(Duration::from_millis(10)); 533 | 534 | let counter = Arc::new(atomic::AtomicUsize::new(0)); 535 | let counter_ = counter.clone(); 536 | fabric2.register_msg_handler( 537 | FabricMsgType::Crud, 538 | Box::new(move |_, _| { 539 | counter_.fetch_add(1, atomic::Ordering::Relaxed); 540 | }), 541 | ); 542 | for _ in 0..3 { 543 | fabric1 544 | .send_msg( 545 | 2, 546 | &MsgRemoteSetAck { 547 | cookie: Default::default(), 548 | vnode: Default::default(), 549 | result: Ok(Vec::new()), 550 | }, 551 | ).unwrap(); 552 | } 553 | thread::sleep(Duration::from_millis(10)); 554 | assert_eq!(counter.load(atomic::Ordering::Relaxed), 3); 555 | } 556 | } 557 | -------------------------------------------------------------------------------- /src/fabric_msg.rs: -------------------------------------------------------------------------------- 1 | use bytes::Bytes; 2 | use cubes::Cube; 3 | use database::*; 4 | use version_vector::*; 5 | 6 | #[derive(Debug, Copy, 
Clone)] 7 | pub enum FabricMsgType { 8 | Crud, 9 | Synch, 10 | DHT, 11 | Unknown, 12 | } 13 | 14 | #[derive(Debug, Serialize, Deserialize, PartialEq, Eq)] 15 | pub enum FabricError { 16 | NoRoute, 17 | CookieNotFound, 18 | BadVNodeStatus, 19 | NotReady, 20 | SyncInterrupted, 21 | StorageError, 22 | } 23 | 24 | #[derive(Debug, Serialize, Deserialize)] 25 | pub enum FabricMsg { 26 | RemoteGet(MsgRemoteGet), 27 | RemoteGetAck(MsgRemoteGetAck), 28 | RemoteSet(MsgRemoteSet), 29 | RemoteSetAck(MsgRemoteSetAck), 30 | SyncStart(MsgSyncStart), 31 | SyncSend(MsgSyncSend), 32 | SyncAck(MsgSyncAck), 33 | SyncFin(MsgSyncFin), 34 | DHTAE(VersionVector), 35 | DHTSync(Bytes), 36 | Unknown, 37 | } 38 | 39 | #[derive(Debug, Serialize)] 40 | pub enum FabricMsgRef<'a> { 41 | RemoteGet(&'a MsgRemoteGet), 42 | RemoteGetAck(&'a MsgRemoteGetAck), 43 | RemoteSet(&'a MsgRemoteSet), 44 | RemoteSetAck(&'a MsgRemoteSetAck), 45 | SyncStart(&'a MsgSyncStart), 46 | SyncSend(&'a MsgSyncSend), 47 | SyncAck(&'a MsgSyncAck), 48 | SyncFin(&'a MsgSyncFin), 49 | DHTAE(&'a VersionVector), 50 | DHTSync(&'a Bytes), 51 | Unknown, 52 | } 53 | 54 | impl FabricMsg { 55 | pub fn get_type(&self) -> FabricMsgType { 56 | match *self { 57 | FabricMsg::RemoteGet(..) 58 | | FabricMsg::RemoteGetAck(..) 59 | | FabricMsg::RemoteSet(..) 60 | | FabricMsg::RemoteSetAck(..) => FabricMsgType::Crud, 61 | FabricMsg::SyncStart(..) 62 | | FabricMsg::SyncSend(..) 63 | | FabricMsg::SyncAck(..) 64 | | FabricMsg::SyncFin(..) => FabricMsgType::Synch, 65 | FabricMsg::DHTSync(..) | FabricMsg::DHTAE(..) => FabricMsgType::DHT, 66 | _ => unreachable!(), 67 | } 68 | } 69 | } 70 | 71 | impl<'a> FabricMsgRef<'a> { 72 | pub fn get_type(&self) -> FabricMsgType { 73 | match *self { 74 | FabricMsgRef::RemoteGet(..) 75 | | FabricMsgRef::RemoteGetAck(..) 76 | | FabricMsgRef::RemoteSet(..) 77 | | FabricMsgRef::RemoteSetAck(..) => FabricMsgType::Crud, 78 | FabricMsgRef::SyncStart(..) 79 | | FabricMsgRef::SyncSend(..) 80 | | FabricMsgRef::SyncAck(..) 
81 | | FabricMsgRef::SyncFin(..) => FabricMsgType::Synch, 82 | FabricMsgRef::DHTSync(..) | FabricMsgRef::DHTAE(..) => FabricMsgType::DHT, 83 | _ => unreachable!(), 84 | } 85 | } 86 | } 87 | 88 | #[derive(Debug, Serialize, Deserialize)] 89 | pub struct MsgRemoteGet { 90 | pub vnode: VNodeNo, 91 | pub cookie: Cookie, 92 | pub keys: Vec, 93 | } 94 | 95 | #[derive(Debug, Serialize, Deserialize)] 96 | pub struct MsgRemoteGetAck { 97 | pub vnode: VNodeNo, 98 | pub cookie: Cookie, 99 | pub result: Result, FabricError>, 100 | } 101 | 102 | #[derive(Debug, Serialize, Deserialize)] 103 | pub struct MsgRemoteSet { 104 | pub vnode: VNodeNo, 105 | pub cookie: Cookie, 106 | pub writes: Vec<(Bytes, Cube, bool)>, 107 | pub reply: bool, 108 | } 109 | 110 | #[derive(Debug, Serialize, Deserialize)] 111 | pub struct MsgRemoteSetAck { 112 | pub vnode: VNodeNo, 113 | pub cookie: Cookie, 114 | pub result: Result>, FabricError>, 115 | } 116 | 117 | #[derive(Debug, Serialize, Deserialize)] 118 | pub struct MsgSyncStart { 119 | pub vnode: VNodeNo, 120 | pub cookie: Cookie, 121 | pub clocks_in_peer: BitmappedVersionVector, 122 | pub target: Option, 123 | } 124 | 125 | #[derive(Debug, Serialize, Deserialize)] 126 | pub struct MsgSyncFin { 127 | pub vnode: VNodeNo, 128 | pub cookie: Cookie, 129 | pub result: Result, 130 | } 131 | 132 | #[derive(Debug, Serialize, Deserialize)] 133 | pub struct MsgSyncSend { 134 | pub vnode: VNodeNo, 135 | pub cookie: Cookie, 136 | pub seq: u64, 137 | pub key: Bytes, 138 | pub value: Cube, 139 | } 140 | 141 | #[derive(Debug, Serialize, Deserialize)] 142 | pub struct MsgSyncAck { 143 | pub vnode: VNodeNo, 144 | pub cookie: Cookie, 145 | pub seq: u64, 146 | } 147 | 148 | impl<'a> Into> for &'a FabricMsg { 149 | fn into(self) -> FabricMsgRef<'a> { 150 | match self { 151 | &FabricMsg::RemoteGet(ref a) => FabricMsgRef::RemoteGet(a), 152 | &FabricMsg::RemoteGetAck(ref a) => FabricMsgRef::RemoteGetAck(a), 153 | &FabricMsg::RemoteSet(ref a) => 
FabricMsgRef::RemoteSet(a), 154 | &FabricMsg::RemoteSetAck(ref a) => FabricMsgRef::RemoteSetAck(a), 155 | &FabricMsg::SyncStart(ref a) => FabricMsgRef::SyncStart(a), 156 | &FabricMsg::SyncSend(ref a) => FabricMsgRef::SyncSend(a), 157 | &FabricMsg::SyncAck(ref a) => FabricMsgRef::SyncAck(a), 158 | &FabricMsg::SyncFin(ref a) => FabricMsgRef::SyncFin(a), 159 | &FabricMsg::DHTSync(ref a) => FabricMsgRef::DHTSync(a), 160 | &FabricMsg::DHTAE(ref a) => FabricMsgRef::DHTAE(a), 161 | _ => unreachable!(), 162 | } 163 | } 164 | } 165 | 166 | macro_rules! impl_into { 167 | ($w:ident, $msg:ident) => { 168 | impl Into for $msg { 169 | fn into(self) -> FabricMsg { 170 | FabricMsg::$w(self) 171 | } 172 | } 173 | impl<'a> Into> for &'a $msg { 174 | fn into(self) -> FabricMsgRef<'a> { 175 | FabricMsgRef::$w(self) 176 | } 177 | } 178 | }; 179 | } 180 | 181 | impl_into!(RemoteGet, MsgRemoteGet); 182 | impl_into!(RemoteGetAck, MsgRemoteGetAck); 183 | impl_into!(RemoteSet, MsgRemoteSet); 184 | impl_into!(RemoteSetAck, MsgRemoteSetAck); 185 | impl_into!(SyncAck, MsgSyncAck); 186 | impl_into!(SyncSend, MsgSyncSend); 187 | impl_into!(SyncFin, MsgSyncFin); 188 | impl_into!(SyncStart, MsgSyncStart); 189 | -------------------------------------------------------------------------------- /src/gossip.rs: -------------------------------------------------------------------------------- 1 | use std::{cmp, thread, io, time, fmt}; 2 | use std::time::{Duration, Instant}; 3 | use std::net::SocketAddr; 4 | use std::marker::PhantomData; 5 | use std::sync::{mpsc, Arc, Mutex}; 6 | use std::collections::HashMap; 7 | 8 | use rand::{thread_rng, Rng}; 9 | use serde::Serialize; 10 | use serde::de::DeserializeOwned; 11 | use bincode; 12 | use futures::{Future, Stream, Sink}; 13 | use futures::sync::mpsc as fmpsc; 14 | use futures::sync::oneshot as foneshot; 15 | use tokio_core as tokio; 16 | 17 | use inflightmap::InFlightMap; 18 | use utils::into_io_error; 19 | 20 | const PACKET_SIZE: usize = 1400; 21 | const 
PING_PERIOD_MS: u64 = 500; 22 | const PING_TIMEOUT_MS: u64 = 1000; 23 | const SUSPECT_TIMEOUT_MS: u64 = 5 * PING_TIMEOUT_MS; 24 | const PING_SYNC_CHANCE: f32 = 0.05f32; 25 | const PING_CANDIDATES: usize = 3; 26 | const PINGREQ_CANDIDATES: usize = 3; 27 | const TIMER_RESOLUTION_MS: u64 = 150; 28 | 29 | // quick implementation of SWIM 30 | // has various limitations 31 | // TODO: piggyback 32 | pub struct Gossiper { 33 | context: Arc>>, 34 | loop_thread: Option<(foneshot::Sender<()>, thread::JoinHandle>)>, 35 | } 36 | 37 | pub enum GossiperMsg { 38 | New(SocketAddr, T), 39 | Alive(SocketAddr, T), 40 | Dead(SocketAddr), 41 | // Left(SocketAddr), 42 | } 43 | 44 | pub type GossiperCallback = Box) + Send>; 45 | 46 | #[derive(Debug, Eq, PartialEq, Copy, Clone, Serialize, Deserialize)] 47 | enum NodeStatus { 48 | Alive, 49 | Suspect, 50 | Dead, 51 | } 52 | 53 | type Seq = u32; 54 | 55 | pub trait Metadata 56 | : Serialize + DeserializeOwned + Clone + PartialEq + Send + fmt::Debug + 'static 57 | { 58 | } 59 | 60 | impl Metadata 61 | for T { 62 | } 63 | 64 | #[derive(Debug)] 65 | struct Node { 66 | incarnation: Seq, 67 | status_change: Instant, 68 | status: NodeStatus, 69 | meta: T, 70 | } 71 | 72 | struct Inner { 73 | addr: SocketAddr, 74 | seq: Seq, 75 | incarnation: Seq, 76 | meta: T, 77 | nodes: HashMap>, 78 | next_alive_probe: Instant, 79 | next_dead_probe: Instant, 80 | pingreq_inflight: InFlightMap, 81 | ping_inflight: InFlightMap, 82 | suspect_inflight: InFlightMap, 83 | send_queue: fmpsc::UnboundedSender<(SocketAddr, Message)>, 84 | broadcast_queue: Vec<(u32, Message)>, 85 | callback: GossiperCallback, 86 | leaving: bool, 87 | bootstraping: bool, 88 | } 89 | 90 | type State = (SocketAddr, Seq, NodeStatus, T); 91 | 92 | #[derive(Debug, Clone, Serialize, Deserialize)] 93 | #[serde(bound = "T: DeserializeOwned")] 94 | enum Message { 95 | Ping { seq: Seq }, 96 | PingReq { seq: Seq, node: SocketAddr }, 97 | PingAck { seq: Seq }, 98 | Suspect { 99 | from: SocketAddr, 100 
| node: SocketAddr, 101 | incarnation: Seq, 102 | }, 103 | Dead { 104 | // named Confirm in original paper 105 | from: SocketAddr, 106 | node: SocketAddr, 107 | incarnation: Seq, 108 | }, 109 | Alive { 110 | incarnation: Seq, 111 | node: SocketAddr, 112 | meta: T, 113 | }, 114 | Sync { state: Vec> }, 115 | SyncAck { state: Vec> }, 116 | } 117 | 118 | struct UdpCodec(PhantomData); 119 | 120 | impl tokio::net::UdpCodec for UdpCodec { 121 | type In = (SocketAddr, Message); 122 | type Out = (SocketAddr, Message); 123 | 124 | fn decode(&mut self, addr: &SocketAddr, buf: &[u8]) -> io::Result { 125 | trace!("decoding {:?}", buf); 126 | match bincode::deserialize(buf) { 127 | Ok(msg) => Ok((*addr, msg)), 128 | Err(err) => { 129 | warn!("decode err: {:?}", err); 130 | Err(into_io_error(err)) 131 | } 132 | } 133 | } 134 | 135 | fn encode(&mut self, addr_msg: Self::Out, buf: &mut Vec) -> SocketAddr { 136 | let (addr, msg) = addr_msg; 137 | trace!("encoding {:?}", msg); 138 | match bincode::serialize_into(buf, &msg, bincode::Infinite) { 139 | Ok(_) => addr, 140 | Err(err) => { 141 | panic!("encode err: {:?}", err); 142 | } 143 | } 144 | } 145 | } 146 | 147 | impl Node { 148 | fn new(status: NodeStatus, incarnation: Seq, meta: T) -> Node { 149 | Node { 150 | incarnation: incarnation, 151 | status_change: Instant::now(), 152 | status: status, 153 | meta: meta, 154 | } 155 | } 156 | 157 | fn set_status(&mut self, status: NodeStatus, incarnation: Seq) -> bool { 158 | if self.status != status { 159 | self.status = status; 160 | self.status_change = Instant::now(); 161 | self.incarnation = incarnation; 162 | true 163 | } else if self.incarnation != incarnation { 164 | self.incarnation = incarnation; 165 | true 166 | } else { 167 | false 168 | } 169 | } 170 | } 171 | 172 | type InitType = io::Result<(Arc>>, foneshot::Sender<()>)>; 173 | 174 | impl Inner { 175 | fn init( 176 | handle: tokio::reactor::Handle, 177 | addr: SocketAddr, 178 | meta: T, 179 | callback: GossiperCallback, 180 
| ) -> io::Result>>> { 181 | let (chan_tx, chan_rx) = fmpsc::unbounded::<(SocketAddr, Message)>(); 182 | 183 | let context = Arc::new(Mutex::new(Inner { 184 | addr: addr, 185 | nodes: Default::default(), 186 | incarnation: 0, 187 | seq: 0, 188 | meta: meta, 189 | next_alive_probe: Instant::now(), 190 | next_dead_probe: Instant::now(), 191 | ping_inflight: InFlightMap::new(), 192 | pingreq_inflight: InFlightMap::new(), 193 | suspect_inflight: InFlightMap::new(), 194 | send_queue: chan_tx, 195 | broadcast_queue: Default::default(), 196 | callback: callback, 197 | leaving: false, 198 | bootstraping: false, 199 | })); 200 | 201 | let socket = tokio::net::UdpSocket::bind(&addr, &handle)?; 202 | let (s_tx, s_rx) = socket.framed(UdpCodec::(PhantomData)).split(); 203 | 204 | let fut_tx = s_tx.send_all(chan_rx.map_err(|_| io::Error::from(io::ErrorKind::Other))) 205 | .map(|_| ()); 206 | 207 | let context2 = context.clone(); 208 | let interval = 209 | tokio::reactor::Interval::new(Duration::from_millis(TIMER_RESOLUTION_MS), &handle) 210 | .expect("Can't create Interval"); 211 | let fut_timer = interval 212 | .for_each(move |_| { 213 | context2.lock().unwrap().on_timer(); 214 | Ok(()) 215 | }) 216 | .then(|r| { 217 | info!("timer fut {:?}", r); 218 | Ok(()) 219 | }); 220 | 221 | let context3 = context.clone(); 222 | let fut_rx = s_rx.for_each(move |(a, m)| { 223 | context3.lock().unwrap().on_message(a, m); 224 | Ok(()) 225 | }); 226 | 227 | let fut_socket = fut_tx.select(fut_rx).map(|_| ()).map_err(|(e, _)| e).then( 228 | |r| { 229 | info!("socket fut: {:?}", r); 230 | Ok(()) 231 | }, 232 | ); 233 | 234 | handle.spawn(fut_timer); 235 | handle.spawn(fut_socket); 236 | 237 | Ok(context) 238 | } 239 | 240 | fn on_timer(self: &mut Inner) { 241 | let now = Instant::now(); 242 | 243 | // gossip to alive nodes 244 | self.maybe_gossip_alive(now); 245 | // gossip to dead nodes possibly resolving partitions, etc 246 | self.maybe_gossip_dead(now); 247 | 248 | // expire pings and fire 
indirect pings 249 | while let Some((seq, node)) = self.ping_inflight.pop_expired(now) { 250 | debug!("{:?} pingreq to {:?}", self.addr, node); 251 | if self.send_ping_reqs(seq, node) == 0 { 252 | // nobody to pingreq!? 253 | self.pingreq_inflight.insert(seq, (self.addr, node), now); 254 | } 255 | } 256 | // expire pingreqs and mark as suspect if we are the originating node 257 | while let Some((_, (from, node))) = self.pingreq_inflight.pop_expired(now) { 258 | debug!("pingreq expired {:?} {:?} - {:?}", from, node, self.addr); 259 | let msg = match self.nodes.get(&node) { 260 | Some(n) if from == self.addr => { 261 | Message::Suspect { 262 | node: node, 263 | incarnation: n.incarnation, 264 | from: self.addr, 265 | } 266 | } 267 | _ => continue, 268 | }; 269 | let addr = self.addr; 270 | self.on_message(addr, msg); 271 | } 272 | 273 | // expire suspicious and mark dead if status didnt change 274 | while let Some((node, status_change)) = self.suspect_inflight.pop_expired(now) { 275 | let msg = match self.nodes.get(&node) { 276 | Some(n) if n.status_change == status_change => { 277 | Message::Dead { 278 | node: node, 279 | incarnation: n.incarnation, 280 | from: self.addr, 281 | } 282 | } 283 | _ => continue, 284 | }; 285 | let addr = self.addr; 286 | self.on_message(addr, msg); 287 | } 288 | 289 | // drain broadcast queue 290 | if !self.broadcast_queue.is_empty() { 291 | let candidates = self.get_candidates(true, !0); 292 | let mut messages = Vec::new(); 293 | let mut counter = 0; 294 | for &mut (ref mut rem, ref msg) in &mut self.broadcast_queue { 295 | let n = cmp::min(candidates.len(), *rem as usize); 296 | *rem -= n as u32; 297 | for _ in 0..n { 298 | messages.push((candidates[counter % candidates.len()], msg.clone())); 299 | counter += 1; 300 | } 301 | } 302 | self.broadcast_queue.retain(|&(r, _)| r > 0); 303 | for (addr, msg) in messages { 304 | self.send(addr, msg); 305 | } 306 | } 307 | } 308 | 309 | fn refute(&mut self, incarnation: Seq) { 310 | 
self.incarnation = cmp::max(self.incarnation, incarnation) + 1; 311 | let msg = Message::Alive { 312 | incarnation: self.incarnation, 313 | node: self.addr, 314 | meta: self.meta.clone(), 315 | }; 316 | self.broadcast(msg); 317 | } 318 | 319 | fn get_candidates(&self, alive: bool, limit: usize) -> Vec { 320 | let mut candidates: Vec<_> = self.nodes 321 | .iter() 322 | .filter_map(|(&k, v)| if (alive && v.status != NodeStatus::Dead) || 323 | (!alive && v.status == NodeStatus::Dead) 324 | { 325 | Some(k) 326 | } else { 327 | None 328 | }) 329 | .collect(); 330 | if candidates.len() > limit { 331 | thread_rng().shuffle(&mut candidates); 332 | candidates.truncate(limit); 333 | } 334 | trace!( 335 | "{:?} nodes are {:?}, returning {} candidates", 336 | self.addr, 337 | self.nodes, 338 | candidates.len() 339 | ); 340 | candidates 341 | } 342 | 343 | fn send_ping_reqs(&mut self, seq: Seq, node: SocketAddr) -> usize { 344 | let now = Instant::now(); 345 | let candidates = self.get_candidates(true, PINGREQ_CANDIDATES); 346 | debug!( 347 | "{} sending indirect pings to {} through {} other nodes", 348 | self.addr, 349 | node, 350 | candidates.len() 351 | ); 352 | for &k in &candidates { 353 | self.pingreq_inflight.insert( 354 | seq, 355 | (self.addr, k), 356 | now + time::Duration::from_millis(PING_TIMEOUT_MS), 357 | ); 358 | self.send( 359 | k, 360 | Message::PingReq { 361 | seq: seq, 362 | node: node, 363 | }, 364 | ); 365 | } 366 | candidates.len() 367 | } 368 | 369 | fn maybe_gossip_alive(&mut self, now: Instant) -> usize { 370 | if now < self.next_alive_probe { 371 | return 0; 372 | } 373 | self.next_alive_probe = now + time::Duration::from_millis(PING_PERIOD_MS); 374 | let candidates = self.get_candidates(true, PING_CANDIDATES); 375 | if !candidates.is_empty() { 376 | debug!( 377 | "{} gossiping to {} alive nodes", 378 | self.addr, 379 | candidates.len() 380 | ); 381 | for &k in &candidates { 382 | // TODO: in case a node is suspect, 383 | // it'd be best to probe with 
a Suspect msg 384 | let (seq, msg) = self.generate_ping_msg(); 385 | self.ping_inflight.insert( 386 | seq, 387 | k, 388 | now + 389 | time::Duration::from_millis( 390 | PING_TIMEOUT_MS, 391 | ), 392 | ); 393 | self.send(k, msg); 394 | // chance to fire a sync message as well 395 | if thread_rng().gen::() < PING_SYNC_CHANCE { 396 | let sync_state = self.generate_sync_state(); 397 | self.send(k, Message::Sync { state: sync_state }); 398 | } 399 | } 400 | } 401 | candidates.len() 402 | } 403 | 404 | fn maybe_gossip_dead(&mut self, now: Instant) -> usize { 405 | // TODO: maybe sync instead 406 | if now < self.next_dead_probe { 407 | return 0; 408 | } 409 | self.next_dead_probe = now + time::Duration::from_secs(PING_PERIOD_MS); 410 | 411 | let candidates = self.get_candidates(false, PING_CANDIDATES); 412 | if candidates.len() != 0 { 413 | debug!("{} gossiping to {} dead nodes", self.addr, candidates.len()); 414 | for &k in &candidates { 415 | // probe with a dead msg so it does have a chance to refute 416 | let msg = Message::Dead { 417 | node: k, 418 | incarnation: self.nodes[&k].incarnation, 419 | from: self.addr, 420 | }; 421 | self.send(k, msg); 422 | } 423 | } 424 | candidates.len() 425 | } 426 | 427 | fn on_message(&mut self, sender: SocketAddr, msg: Message) { 428 | trace!("{} on_message: {:?}", self.addr, msg); 429 | match msg { 430 | Message::Ping { seq } => { 431 | self.send(sender, Message::PingAck { seq: seq }); 432 | } 433 | Message::PingReq { seq, node } => { 434 | self.pingreq_inflight.insert( 435 | seq, 436 | (sender, node), 437 | Instant::now() + time::Duration::from_millis(PING_TIMEOUT_MS), 438 | ); 439 | self.send(node, Message::Ping { seq: seq }); 440 | } 441 | Message::PingAck { seq } => { 442 | if let Some(_) = self.ping_inflight.remove(&seq) { 443 | // good 444 | } else if let Some((from, _)) = self.pingreq_inflight.remove(&seq) { 445 | // send to original sender 446 | self.send(from, msg); 447 | } else { 448 | // do nothing if we dont have it in 
state 449 | }; 450 | } 451 | Message::Alive { 452 | incarnation, 453 | node, 454 | meta, 455 | } => { 456 | if node == self.addr { 457 | if incarnation < self.incarnation || 458 | (incarnation == self.incarnation && meta == self.meta) 459 | { 460 | return; 461 | } 462 | if self.leaving { 463 | // TODO! 464 | return; 465 | } 466 | // refute 467 | debug!("node {:?} REFUTE ALIVE", node); 468 | self.refute(incarnation); 469 | return; 470 | } 471 | 472 | { 473 | let mut existing = true; 474 | let n = self.nodes.entry(node).or_insert_with(|| { 475 | existing = false; 476 | Node::new(NodeStatus::Dead, 0, meta.clone()) 477 | }); 478 | if existing && incarnation <= n.incarnation { 479 | return; 480 | } 481 | debug!("{:?} node {:?} IS ALIVE", self.addr, node); 482 | if existing { 483 | (self.callback)(GossiperMsg::Alive(node, meta.clone())); 484 | } else { 485 | (self.callback)(GossiperMsg::New(node, meta.clone())); 486 | } 487 | n.set_status(NodeStatus::Alive, incarnation); 488 | } 489 | 490 | // help broadcast 491 | self.broadcast(Message::Alive { 492 | incarnation: incarnation, 493 | node: node, 494 | meta: meta, 495 | }); 496 | } 497 | Message::Suspect { 498 | incarnation, 499 | from, 500 | node, 501 | } => { 502 | if node == self.addr { 503 | // ignore old info 504 | if incarnation < self.incarnation { 505 | return; 506 | } 507 | // refute & broadcast 508 | debug!("node {:?} REFUTE SUSPECT", node); 509 | self.refute(incarnation); 510 | return; 511 | } 512 | 513 | if let Some(n) = self.nodes.get_mut(&node) { 514 | // ignore old info or irrelevant 515 | if incarnation < n.incarnation || n.status != NodeStatus::Alive { 516 | return; 517 | } 518 | debug!("{:?} node {:?} IS SUSPECT", self.addr, node); 519 | n.set_status(NodeStatus::Suspect, incarnation); 520 | self.suspect_inflight.insert( 521 | node, 522 | n.status_change, 523 | Instant::now() + 524 | time::Duration::from_millis(SUSPECT_TIMEOUT_MS), 525 | ); 526 | } else { 527 | // about an unknown node!? 
528 | return; 529 | } 530 | 531 | // help broadcast 532 | self.broadcast(Message::Suspect { 533 | incarnation: incarnation, 534 | from: from, 535 | node: node, 536 | }); 537 | } 538 | Message::Dead { 539 | incarnation, 540 | from, 541 | node, 542 | } => { 543 | if node == self.addr { 544 | // ignore old info 545 | if incarnation < self.incarnation { 546 | return; 547 | } 548 | if self.leaving { 549 | // TODO! 550 | return; 551 | } 552 | // refute & broadcast 553 | debug!("node {:?} REFUTE DEAD", node); 554 | self.refute(incarnation); 555 | return; 556 | } 557 | 558 | if let Some(n) = self.nodes.get_mut(&node) { 559 | // ignore old info or irrelevant 560 | if incarnation < n.incarnation || n.status == NodeStatus::Dead { 561 | return; 562 | } 563 | debug!("{:?} node {:?} IS DEAD", self.addr, node); 564 | (self.callback)(GossiperMsg::Dead(node)); 565 | n.set_status(NodeStatus::Dead, incarnation); 566 | } else { 567 | // about an unknown node!? 568 | return; 569 | } 570 | 571 | // help broadcast 572 | self.broadcast(Message::Dead { 573 | incarnation: incarnation, 574 | from: from, 575 | node: node, 576 | }); 577 | } 578 | Message::Sync { state } => { 579 | self.do_sync(state); 580 | let ack_state = self.generate_sync_state(); 581 | self.send(sender, Message::SyncAck { state: ack_state }); 582 | } 583 | Message::SyncAck { state } => { 584 | self.do_sync(state); 585 | } 586 | } 587 | } 588 | 589 | fn generate_sync_state(&mut self) -> Vec> { 590 | let mut state: Vec<_> = self.nodes 591 | .iter() 592 | .map(|(&k, n)| (k, n.incarnation, n.status, n.meta.clone())) 593 | .collect(); 594 | state.push(( 595 | self.addr, 596 | self.incarnation, 597 | NodeStatus::Alive, 598 | self.meta.clone(), 599 | )); 600 | // TODO: worry about size 601 | if state.len() > 20 { 602 | thread_rng().shuffle(&mut state); 603 | state.truncate(20); 604 | } 605 | state 606 | } 607 | 608 | fn do_sync(&mut self, state: Vec>) { 609 | let sender = self.addr; 610 | for (addr, incarnation, status, meta) in 
state { 611 | let msg = match status { 612 | NodeStatus::Alive => { 613 | Message::Alive { 614 | node: addr, 615 | incarnation: incarnation, 616 | meta: meta, 617 | } 618 | } 619 | // threat suspect and dead the same 620 | NodeStatus::Suspect | NodeStatus::Dead => { 621 | Message::Suspect { 622 | from: sender, 623 | node: addr, 624 | incarnation: incarnation, 625 | } 626 | } 627 | }; 628 | self.on_message(sender, msg); 629 | } 630 | } 631 | 632 | fn send(&mut self, to: SocketAddr, msg: Message) { 633 | let _ = self.send_queue.unbounded_send((to, msg)); 634 | } 635 | 636 | fn broadcast(&mut self, msg: Message) { 637 | // self.nodes dont include self, so + 2 638 | let n = ((self.nodes.len() + 2) as f32).log10().ceil() as u32 * 4; 639 | self.broadcast_queue.push((n, msg)); 640 | } 641 | 642 | fn generate_ping_msg(&mut self) -> (Seq, Message) { 643 | let seq = self.seq; 644 | self.seq += 1; 645 | (seq, Message::Ping { seq: seq }) 646 | } 647 | 648 | pub fn update_meta(&mut self, meta: T) { 649 | self.incarnation += 1; 650 | self.meta = meta; 651 | let msg = Message::Alive { 652 | node: self.addr, 653 | incarnation: self.incarnation, 654 | meta: self.meta.clone(), 655 | }; 656 | self.broadcast(msg); 657 | } 658 | 659 | pub fn join(&mut self, seeds: &[SocketAddr]) { 660 | let state = self.generate_sync_state(); 661 | for &seed in seeds { 662 | self.send(seed, Message::Sync { state: state.clone() }); 663 | } 664 | } 665 | } 666 | 667 | impl Gossiper { 668 | pub fn new( 669 | listen_addr: SocketAddr, 670 | meta: T, 671 | callback: GossiperCallback, 672 | ) -> io::Result> { 673 | let (init_tx, init_rx) = mpsc::channel(); 674 | let thread = thread::Builder::new() 675 | .name(format!("Gossiper:{}", listen_addr)) 676 | .spawn(move || { 677 | let mut core = tokio::reactor::Core::new().unwrap(); 678 | let (completer_tx, completer_rx) = foneshot::channel(); 679 | init_tx 680 | .send( 681 | Inner::init(core.handle(), listen_addr, meta, callback) 682 | .map(|c| (c, completer_tx)), 
683 | ) 684 | .map_err(into_io_error)?; 685 | core.run(completer_rx).map_err(into_io_error) 686 | })?; 687 | 688 | let (context, completer) = init_rx.recv().map_err(into_io_error)??; 689 | Ok(Gossiper { 690 | context: context, 691 | loop_thread: Some((completer, thread)), 692 | }) 693 | } 694 | 695 | pub fn join(&self, seeds: &[SocketAddr]) { 696 | self.context.lock().unwrap().join(seeds) 697 | } 698 | 699 | pub fn node_count(&self) -> usize { 700 | self.context.lock().unwrap().nodes.len() + 1 701 | } 702 | 703 | pub fn alive_count(&self) -> usize { 704 | self.context 705 | .lock() 706 | .unwrap() 707 | .nodes 708 | .values() 709 | .filter(|n| n.status != NodeStatus::Dead) 710 | .count() + 1 711 | } 712 | } 713 | 714 | impl Drop for Gossiper { 715 | fn drop(&mut self) { 716 | if let Some((completer, thread)) = self.loop_thread.take() { 717 | let _ = completer.send(()); 718 | let _ = thread.join(); 719 | } 720 | } 721 | } 722 | 723 | #[cfg(test)] 724 | mod tests { 725 | use super::*; 726 | use env_logger; 727 | use std::{time, thread}; 728 | 729 | fn test_converge(n: usize) -> Vec> { 730 | let _ = env_logger::try_init(); 731 | let g: Vec<_> = (0..n) 732 | .map(|i| { 733 | Gossiper::new(format!("0.0.0.0:{}", 9000 + i).parse().unwrap(), ()).unwrap() 734 | }) 735 | .collect(); 736 | let start = Instant::now(); 737 | for (i, g0) in (&g[1..]).iter().enumerate() { 738 | g0.join(&[format!("0.0.0.0:{}", 9000 + i).parse().unwrap()]); 739 | } 740 | for _ in 0..(n * 1000) { 741 | if g.iter().all(|g| g.alive_count() == n) { 742 | break; 743 | } 744 | thread::sleep(time::Duration::from_millis(1)); 745 | } 746 | warn!("{:?} has passed", Instant::now() - start); 747 | assert!( 748 | g.iter().all(|g| g.alive_count() == n), 749 | "{} {:?}", 750 | n, 751 | g.iter().map(|g| g.alive_count()).collect::>() 752 | ); 753 | g 754 | } 755 | 756 | macro_rules! 
test_converge_n { 757 | ($fn_name: ident, $n: expr) => ( 758 | #[test] 759 | fn $fn_name() { 760 | test_converge($n); 761 | } 762 | ); 763 | } 764 | 765 | test_converge_n!(test_converge_1, 1); 766 | test_converge_n!(test_converge_2, 2); 767 | test_converge_n!(test_converge_3, 3); 768 | test_converge_n!(test_converge_5, 5); 769 | test_converge_n!(test_converge_10, 10); 770 | test_converge_n!(test_converge_20, 20); 771 | test_converge_n!(test_converge_30, 30); 772 | test_converge_n!(test_converge_50, 50); 773 | 774 | fn test_dead(n: usize) { 775 | let _ = env_logger::try_init(); 776 | let mut g = test_converge(n); 777 | g.pop(); 778 | let start = Instant::now(); 779 | for _ in 0..(n * 2000) { 780 | if g.iter().all(|g| g.alive_count() == n - 1) { 781 | break; 782 | } 783 | thread::sleep(time::Duration::from_millis(1)); 784 | } 785 | warn!("{:?} has passed", Instant::now() - start); 786 | assert!( 787 | g.iter().all(|g| g.alive_count() == n - 1), 788 | "{} {:?}", 789 | n - 1, 790 | g.iter().map(|g| g.alive_count()).collect::>() 791 | ); 792 | } 793 | 794 | macro_rules! 
test_dead_n { 795 | ($fn_name: ident, $n: expr) => ( 796 | #[test] 797 | fn $fn_name() { 798 | test_dead($n); 799 | } 800 | ); 801 | } 802 | 803 | test_dead_n!(test_dead_1, 1); 804 | test_dead_n!(test_dead_2, 2); 805 | test_dead_n!(test_dead_3, 3); 806 | test_dead_n!(test_dead_5, 5); 807 | test_dead_n!(test_dead_10, 10); 808 | test_dead_n!(test_dead_20, 20); 809 | test_dead_n!(test_dead_30, 30); 810 | test_dead_n!(test_dead_50, 50); 811 | } 812 | -------------------------------------------------------------------------------- /src/hash.rs: -------------------------------------------------------------------------------- 1 | use crc16; 2 | 3 | pub const HASH_SLOTS: u16 = 16384; 4 | 5 | /// RedisCluster style partitioning 6 | pub fn hash_slot(mut key: &[u8]) -> u16 { 7 | if let Some(open) = key.iter().position(|&x| x == b'{') { 8 | // note that close will be relative to open due to the skip() 9 | if let Some(close) = key[open + 1..].iter().position(|&x| x == b'}') { 10 | if close > 0 { 11 | // found { and } with something in between 12 | key = &key[open + 1..open + 1 + close]; 13 | } 14 | } 15 | } 16 | crc16::State::::calculate(key) % HASH_SLOTS 17 | } 18 | 19 | #[cfg(test)] 20 | mod tests { 21 | use super::*; 22 | 23 | fn raw_hash(key: &[u8]) -> u16 { 24 | crc16::State::::calculate(key) % HASH_SLOTS 25 | } 26 | 27 | #[test] 28 | fn test_hash_slot() { 29 | assert_eq!(hash_slot(b"{}"), raw_hash(b"{}")); 30 | assert_eq!(hash_slot(b"_{abc}"), raw_hash(b"abc")); 31 | assert_eq!(hash_slot(b"{abc}_"), raw_hash(b"abc")); 32 | assert_eq!(hash_slot(b"_{abc}_"), raw_hash(b"abc")); 33 | assert_eq!(hash_slot(b"{abc}{def}"), raw_hash(b"abc")); 34 | assert_eq!(hash_slot(b"{}{abc}"), raw_hash(b"{}{abc}")); 35 | assert_eq!(hash_slot(b"{abc}{}"), raw_hash(b"abc")); 36 | assert_eq!(hash_slot(b"{{abc}}"), raw_hash(b"{abc")); 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /src/inflightmap.rs: 
-------------------------------------------------------------------------------- 1 | use std::cmp::Ordering; 2 | use std::collections::hash_map::{Entry, RandomState}; 3 | use std::collections::{BinaryHeap, HashMap}; 4 | use std::fmt; 5 | use std::hash::{BuildHasher, Hash}; 6 | use std::ops::Deref; 7 | 8 | // TODO: need a more efficient implementation and possibly more flexibility 9 | 10 | #[derive(Debug)] 11 | pub struct InFlightMap { 12 | map: HashMap, 13 | heap: BinaryHeap>, 14 | } 15 | 16 | impl 17 | InFlightMap 18 | { 19 | pub fn new() -> Self { 20 | InFlightMap { 21 | map: Default::default(), 22 | heap: Default::default(), 23 | } 24 | } 25 | 26 | pub fn clear(&mut self) { 27 | self.map.clear(); 28 | self.heap.clear(); 29 | } 30 | 31 | pub fn remove(&mut self, key: &K) -> Option { 32 | self.map.remove(key) 33 | } 34 | 35 | pub fn entry_with_timeout(&mut self, key: K, expire: T) -> Entry { 36 | self.heap.push(Pair(expire, key)); 37 | self.map.entry(key) 38 | } 39 | 40 | pub fn entry(&mut self, key: K) -> Entry { 41 | self.map.entry(key) 42 | } 43 | 44 | pub fn insert(&mut self, key: K, value: V, expire: T) -> &mut V { 45 | self.heap.push(Pair(expire, key)); 46 | 47 | let mut inserted = false; 48 | let result = self.map.entry(key).or_insert_with(|| { 49 | inserted = true; 50 | value 51 | }); 52 | 53 | if !inserted { 54 | panic!("{:?} is already present in the map", key); 55 | } 56 | 57 | result 58 | } 59 | 60 | pub fn pop_expired(&mut self, now: T) -> Option<(K, V)> { 61 | loop { 62 | let key = match self.heap.peek() { 63 | Some(&Pair(e, k)) if now >= e => k, 64 | _ => return None, 65 | }; 66 | self.heap.pop(); 67 | if let Some(v) = self.map.remove(&key) { 68 | return Some((key, v)); 69 | } 70 | } 71 | } 72 | 73 | pub fn touch_expired(&mut self, now: T, expire: T) -> Option<(K, &V)> { 74 | loop { 75 | let key = match self.heap.peek() { 76 | Some(&Pair(e, k)) if now >= e => k, 77 | _ => return None, 78 | }; 79 | if let Some(v) = self.map.get(&key) { 80 | 
// Like a 2-tuple but comparison only considers the first item, and is
// REVERSED, so that a std BinaryHeap (a max-heap) of Pairs behaves as a
// min-heap on T — the smallest deadline surfaces first.
// (Generic parameters restored; they were stripped in transcription.)
#[derive(Debug)]
struct Pair<T: Ord, V>(T, V);

impl<T: Ord, V> PartialEq for Pair<T, V> {
    fn eq(&self, other: &Self) -> bool {
        other.0.eq(&self.0)
    }
}

impl<T: Ord, V> Eq for Pair<T, V> {}

impl<T: Ord, V> PartialOrd for Pair<T, V> {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        other.0.partial_cmp(&self.0)
    }
}

impl<T: Ord, V> Ord for Pair<T, V> {
    fn cmp(&self, other: &Self) -> Ordering {
        // Reversed on purpose: see type-level comment.
        other.0.cmp(&self.0)
    }
}
tokio_codec; 38 | extern crate tokio_core; 39 | extern crate tokio_io; 40 | 41 | #[cfg(test)] 42 | extern crate env_logger; 43 | 44 | #[macro_use] 45 | mod utils; 46 | mod types; 47 | mod version_vector; 48 | // mod gossip; 49 | mod cubes; 50 | mod dht; 51 | mod fabric; 52 | mod fabric_msg; 53 | mod hash; 54 | mod inflightmap; 55 | mod storage; 56 | #[macro_use] 57 | mod database; 58 | mod command; 59 | mod config; 60 | mod metrics; 61 | mod resp; 62 | mod server; 63 | mod vnode; 64 | mod vnode_sync; 65 | mod workers; 66 | 67 | fn configure() -> config::Config { 68 | use clap::{App, Arg, SubCommand}; 69 | use config::*; 70 | use std::path::Path; 71 | 72 | let matches = App::new("SucreDB") 73 | .version("0.1") 74 | .about("A database made of sugar cubes") 75 | .arg( 76 | Arg::with_name("config_file") 77 | .short("c") 78 | .long("config") 79 | .takes_value(true) 80 | .help(".yaml config file") 81 | .long_help( 82 | "Path to the .yaml config file. Note that configuration \ 83 | set through the command line will take precedence \ 84 | over the config file.", 85 | ).display_order(0), 86 | ).arg( 87 | Arg::with_name("data_dir") 88 | .short("d") 89 | .long("data") 90 | .takes_value(true) 91 | .help("Data directory"), 92 | ).arg( 93 | Arg::with_name("cluster_name") 94 | .short("n") 95 | .long("cluster") 96 | .help("The cluster name") 97 | .takes_value(true), 98 | ).arg( 99 | Arg::with_name("listen_addr") 100 | .short("l") 101 | .long("listen") 102 | .help("Listen addr") 103 | .takes_value(true), 104 | ).arg( 105 | Arg::with_name("fabric_addr") 106 | .short("f") 107 | .long("fabric") 108 | .help("Fabric listen addr") 109 | .takes_value(true), 110 | ).arg( 111 | Arg::with_name("seed_nodes") 112 | .short("s") 113 | .long("seeds") 114 | .multiple(true) 115 | .takes_value(true) 116 | .require_delimiter(true), 117 | ).subcommand( 118 | SubCommand::with_name("init") 119 | .about("Init and configure the cluster") 120 | .arg( 121 | Arg::with_name("replication_factor") 122 | 
.short("r") 123 | .help("Number of replicas") 124 | .default_value(DEFAULT_REPLICATION_FACTOR), 125 | ).arg( 126 | Arg::with_name("partitions") 127 | .short("p") 128 | .help("Number of partitions") 129 | .long_help( 130 | "Number of partitions, the recommended value is \ 131 | `expected node count * 10` rounded up to the next power of 2.", 132 | ).default_value(DEFAULT_PARTITIONS), 133 | ).display_order(0), 134 | ).get_matches(); 135 | 136 | let mut config = Default::default(); 137 | 138 | if let Some(v) = matches.value_of("config_file") { 139 | read_config_file(Path::new(v), &mut config); 140 | } else { 141 | setup_default_logging(); 142 | } 143 | 144 | if let Some(v) = matches.value_of("data_dir") { 145 | config.data_dir = v.into(); 146 | } 147 | 148 | if let Some(v) = matches.value_of("cluster_name") { 149 | config.cluster_name = v.into(); 150 | } 151 | 152 | if let Some(v) = matches.value_of("listen_addr") { 153 | config.listen_addr = v.parse().expect("Can't parse listen_addr"); 154 | } 155 | 156 | if let Some(v) = matches.values_of("seed_nodes") { 157 | config.seed_nodes = v 158 | .map(|v| v.parse().expect("Can't parse seed_nodes")) 159 | .collect(); 160 | } 161 | 162 | if let Some(v) = matches.value_of("fabric_addr") { 163 | config.fabric_addr = v.parse().expect("Can't parse fabric_addr"); 164 | } 165 | 166 | if let Some(sub) = matches.subcommand_matches("init") { 167 | config.cmd_init = Some(InitCommand { 168 | partitions: sub 169 | .value_of("partitions") 170 | .unwrap() 171 | .parse() 172 | .expect("Can't parse partitions"), 173 | replication_factor: sub 174 | .value_of("replication_factor") 175 | .unwrap() 176 | .parse() 177 | .expect("Can't parse replication_factor"), 178 | }); 179 | } 180 | 181 | config 182 | } 183 | 184 | #[cfg(not(test))] 185 | fn main() { 186 | let server = server::Server::new(configure()); 187 | server.run(); 188 | } 189 | -------------------------------------------------------------------------------- /src/metrics.rs: 
/// Error type for RESP parsing.
#[derive(Eq, PartialEq, Debug)]
pub enum RespError {
    /// More bytes are required to parse a complete value.
    Incomplete,
    /// The input is malformed; the message describes the first problem found.
    Invalid(&'static str),
}

impl From<&'static str> for RespError {
    fn from(from: &'static str) -> Self {
        RespError::Invalid(from)
    }
}

// Type parameter restored (stripped in transcription): every parser method
// in this file returns `RespResult<Something>` with RespError as the error.
pub type RespResult<T> = Result<T, RespError>;
40 | RespValue::Status(ref v) | RespValue::Error(ref v) => { 41 | "+".len() + v.len() + "\r\n".len() 42 | } 43 | } 44 | } 45 | 46 | pub fn serialize_into(self, f: &mut W) -> io::Result<()> { 47 | match self { 48 | RespValue::Nil => write!(f, "$-1\r\n"), 49 | RespValue::Int(v) => write!(f, ":{}\r\n", v), 50 | RespValue::Data(v) => { 51 | write!(f, "${}\r\n", v.len())?; 52 | f.write_all(v.as_ref())?; 53 | write!(f, "\r\n") 54 | } 55 | RespValue::Array(a) => { 56 | write!(f, "*{}\r\n", a.len())?; 57 | for v in a { 58 | v.serialize_into(f)?; 59 | } 60 | Ok(()) 61 | } 62 | RespValue::Status(v) => { 63 | write!(f, "+")?; 64 | f.write_all(v.as_ref())?; 65 | write!(f, "\r\n") 66 | } 67 | RespValue::Error(v) => { 68 | write!(f, "-")?; 69 | f.write_all(v.as_ref())?; 70 | write!(f, "\r\n") 71 | } 72 | } 73 | } 74 | } 75 | 76 | impl From for RespValue { 77 | fn from(from: T) -> Self { 78 | RespValue::Error(format!("{}", from).into()) 79 | } 80 | } 81 | 82 | impl fmt::Debug for RespValue { 83 | fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> { 84 | match *self { 85 | RespValue::Nil => write!(f, "Nil"), 86 | RespValue::Int(v) => write!(f, "Int({:?})", v), 87 | RespValue::Data(ref v) => write!(f, "Data({:?})", v), 88 | RespValue::Array(ref b) => { 89 | write!(f, "Array(")?; 90 | f.debug_list().entries(b).finish()?; 91 | write!(f, ")") 92 | } 93 | RespValue::Status(ref v) => write!(f, "Status({:?})", v), 94 | RespValue::Error(ref v) => write!(f, "Error({:?})", v), 95 | } 96 | } 97 | } 98 | 99 | /// The internal redis response parser. 100 | pub struct Parser { 101 | consumed: usize, 102 | body: Bytes, 103 | } 104 | 105 | impl Parser { 106 | pub fn new>(body: T) -> RespResult { 107 | let valid_to = Self::speculate_buffer(body.as_ref())?; 108 | Ok(Parser { 109 | consumed: 0, 110 | body: body.as_ref()[..valid_to].into(), 111 | }) 112 | } 113 | 114 | // Quickly speculate a buffer, checking whatever it has a complete resp objects or not. 
115 | // If succesfull returns the resp objects length in bytes. 116 | fn speculate_buffer(buf: &[u8]) -> RespResult { 117 | if buf.len() < 3 { 118 | return Err(RespError::Incomplete); 119 | } 120 | if &buf[buf.len() - 2..] == b"\r\n" { 121 | return Ok(buf.len()); 122 | } 123 | let mut valid = 0; 124 | let mut i = 0; 125 | let mut values_pending = 0; 126 | while i < buf.len() { 127 | match buf[i] { 128 | b'$' | b'*' => { 129 | let is_multi = buf[i] == b'*'; 130 | let mut len = 0i64; 131 | i += 1; 132 | while i < buf.len() { 133 | match buf[i] { 134 | b'0'...b'9' => len = len * 10 + (buf[i] - b'0') as i64, 135 | b'-' => { 136 | // only valid negative len is -1 137 | len = -1; 138 | i += 2; 139 | break; 140 | } 141 | b'\r' => break, 142 | _ => return Err(RespError::Invalid("Invalid digit")), 143 | } 144 | i += 1; 145 | } 146 | if len >= 0 { 147 | if is_multi { 148 | values_pending = len + 1; 149 | } else { 150 | i += 2 + len as usize; 151 | } 152 | } 153 | } 154 | b':' | b'+' | b'-' => { 155 | i += 1; 156 | while i < buf.len() && buf[i] != b'\r' { 157 | i += 1; 158 | } 159 | } 160 | b'\r' => { 161 | i += 2; 162 | continue; 163 | } 164 | _ => return Err(RespError::Invalid("Invalid prefix")), 165 | } 166 | // skip delimiter 167 | i += 2; 168 | if values_pending > 0 { 169 | values_pending -= 1; 170 | } 171 | if values_pending == 0 && i <= buf.len() { 172 | valid = i; 173 | } 174 | } 175 | if valid != 0 { 176 | Ok(valid) 177 | } else { 178 | Err(RespError::Incomplete) 179 | } 180 | } 181 | 182 | pub fn consumed(&self) -> usize { 183 | self.consumed 184 | } 185 | 186 | /// parses a single value out of the stream. If there are multiple 187 | /// values you can call this multiple times. 
188 | pub fn parse(&mut self) -> RespResult { 189 | let saved_len = self.body.len(); 190 | let value = self.parse_value(); 191 | if value.is_ok() { 192 | self.consumed += saved_len - self.body.len(); 193 | } 194 | value 195 | } 196 | 197 | fn parse_value(&mut self) -> RespResult { 198 | match self.read_byte()? { 199 | b'$' => self.parse_data(), 200 | b'*' => self.parse_array(), 201 | b':' => self.parse_int(), 202 | b'+' => self.parse_status(), 203 | b'-' => self.parse_error(), 204 | c => { 205 | if c == b'\r' && self.read_byte()? == b'\n' { 206 | return self.parse_value(); 207 | } 208 | debug!( 209 | "Invalid prefix {:?}{:?} when parsing value", 210 | c as char, 211 | String::from_utf8_lossy(self.body.as_ref()) 212 | ); 213 | Err("Invalid prefix when parsing value".into()) 214 | } 215 | } 216 | } 217 | 218 | #[inline] 219 | fn read_byte(&mut self) -> RespResult { 220 | if self.body.len() >= 1 { 221 | let byte = self.body[0]; 222 | self.body = self.body.slice_from(1); 223 | Ok(byte) 224 | } else { 225 | Err(RespError::Incomplete) 226 | } 227 | } 228 | 229 | #[inline] 230 | fn read(&mut self, len: usize) -> RespResult { 231 | if self.body.len() >= len { 232 | Ok(self.body.split_to(len)) 233 | } else { 234 | Err(RespError::Incomplete) 235 | } 236 | } 237 | 238 | fn read_with_separator(&mut self, len: usize) -> RespResult { 239 | let result = self.read(len + 2)?; 240 | if &result[len..] != b"\r\n" { 241 | Err("Invalid line separator".into()) 242 | } else { 243 | Ok(result.slice_to(len)) 244 | } 245 | } 246 | 247 | fn read_line(&mut self) -> RespResult { 248 | let nl_pos = match self.body.iter().position(|&b| b == b'\r') { 249 | Some(nl_pos) => nl_pos, 250 | None => return Err(RespError::Incomplete), 251 | }; 252 | Ok(self.read_with_separator(nl_pos)?) 
253 | } 254 | 255 | fn read_int_line(&mut self) -> RespResult { 256 | let line = self.read_line()?; 257 | match assume_str(line.as_ref()).parse::() { 258 | Err(_) => Err("Expected integer, got garbage".into()), 259 | Ok(value) => Ok(value), 260 | } 261 | } 262 | 263 | fn parse_status(&mut self) -> RespResult { 264 | Ok(RespValue::Status(self.read_line()?)) 265 | } 266 | 267 | fn parse_int(&mut self) -> RespResult { 268 | Ok(RespValue::Int(self.read_int_line()?)) 269 | } 270 | 271 | fn parse_data(&mut self) -> RespResult { 272 | let length = self.read_int_line()?; 273 | if length < 0 { 274 | Ok(RespValue::Nil) 275 | } else { 276 | let data = self.read_with_separator(length as usize)?; 277 | Ok(RespValue::Data(data)) 278 | } 279 | } 280 | 281 | fn parse_array(&mut self) -> RespResult { 282 | let length = self.read_int_line()?; 283 | if length < 0 { 284 | Ok(RespValue::Nil) 285 | } else { 286 | let mut rv = Vec::with_capacity(length as usize); 287 | for _ in 0..length { 288 | rv.push(self.parse_value()?); 289 | } 290 | Ok(RespValue::Array(rv)) 291 | } 292 | } 293 | 294 | fn parse_error(&mut self) -> RespResult { 295 | Ok(RespValue::Error(self.read_line()?)) 296 | } 297 | } 298 | 299 | #[cfg(test)] 300 | mod tests { 301 | use super::{Parser, RespError, RespResult, RespValue}; 302 | 303 | fn parse(slice: &[u8]) -> RespResult { 304 | Parser::new(slice)?.parse() 305 | } 306 | 307 | #[test] 308 | fn parse_incomplete() { 309 | let r = parse(b"*2\r\n$3\r\nfoo"); 310 | assert_eq_repr!(r.unwrap_err(), RespError::Incomplete); 311 | } 312 | 313 | #[test] 314 | fn parse_error() { 315 | let r = parse(b"-foo\r\n"); 316 | assert_eq_repr!(r.unwrap(), RespValue::Error("foo".into())); 317 | 318 | let r = parse(b"-invalid line sep\r\r"); 319 | assert!(if let RespError::Invalid(_) = r.unwrap_err() { 320 | true 321 | } else { 322 | false 323 | }); 324 | } 325 | 326 | #[test] 327 | fn parse_valid_array() { 328 | let r = parse(b"*2\r\n$3\r\nfoo\r\n$4\r\nbarz\r\n"); 329 | assert!(r.is_ok(), 
"{:?} not ok", r.unwrap_err()); 330 | assert_eq_repr!( 331 | r.unwrap(), 332 | RespValue::Array(vec![ 333 | RespValue::Data(b"foo".as_ref().into()), 334 | RespValue::Data(b"barz".as_ref().into()), 335 | ]) 336 | ); 337 | } 338 | 339 | #[test] 340 | fn parser_multiple2() { 341 | let mut parser = Parser::new( 342 | b"*2\r\n$3\r\nfoo\r\n$4\r\nbarz\r\n*2\r\n$3\r\nfoo\r\n$4\r\nbarz\r\n".as_ref(), 343 | ).unwrap(); 344 | for _ in 0..2 { 345 | let r = parser.parse(); 346 | assert!(r.is_ok(), "{:?} not ok", r.unwrap_err()); 347 | assert_eq_repr!( 348 | r.unwrap(), 349 | RespValue::Array(vec![ 350 | RespValue::Data(b"foo".as_ref().into()), 351 | RespValue::Data(b"barz".as_ref().into()), 352 | ]) 353 | ); 354 | } 355 | let r = parser.parse(); 356 | assert_eq_repr!(r.unwrap_err(), RespError::Incomplete); 357 | } 358 | 359 | #[test] 360 | fn message_response() { 361 | let mut parser = Parser::new( 362 | b"*2\r\n*2\r\n:7270781675605147315\r\n$25\r\nmessage 1 from producer 0\r\n*2\r\n:4590316895040267280\r\n$25\r\nmessage 2 from producer 0\r\n" 363 | .as_ref(), 364 | ).unwrap(); 365 | let r = parser.parse(); 366 | assert!(r.is_ok(), "{:?} not ok", r.unwrap_err()); 367 | assert_eq!(parser.body.len(), 0); 368 | } 369 | } 370 | -------------------------------------------------------------------------------- /src/server.rs: -------------------------------------------------------------------------------- 1 | use std::cell::RefCell; 2 | use std::collections::VecDeque; 3 | use std::io; 4 | use std::rc::Rc; 5 | use std::sync::{Arc, Mutex}; 6 | 7 | use bytes::{BufMut, BytesMut}; 8 | use database::{Context as DbContext, Database, Token, WorkerMsg}; 9 | use futures::sync::mpsc as fmpsc; 10 | use futures::{Future, Sink, Stream}; 11 | use tokio_codec as codec; 12 | use tokio_core as tokio; 13 | use tokio_io::AsyncRead; 14 | use workers::WorkerSender; 15 | 16 | use config::Config; 17 | use metrics::{self, Gauge}; 18 | use resp::{self, RespValue}; 19 | use utils::IdHashMap; 20 | 21 | struct 
RespCodec; 22 | 23 | impl codec::Decoder for RespCodec { 24 | type Item = RespValue; 25 | type Error = io::Error; 26 | 27 | fn decode(&mut self, src: &mut BytesMut) -> io::Result> { 28 | let (consumed, result) = resp::Parser::new(&*src) 29 | .and_then(|mut p| match p.parse() { 30 | Ok(v) => Ok((p.consumed(), Ok(Some(v)))), 31 | Err(e) => Err(e), 32 | }).unwrap_or_else(|e| match e { 33 | resp::RespError::Incomplete => (0, Ok(None)), 34 | _ => (0, Err(io::ErrorKind::InvalidData.into())), 35 | }); 36 | src.split_to(consumed); 37 | result 38 | } 39 | } 40 | 41 | impl codec::Encoder for RespCodec { 42 | type Item = RespValue; 43 | type Error = io::Error; 44 | 45 | fn encode(&mut self, item: Self::Item, dst: &mut BytesMut) -> io::Result<()> { 46 | dst.reserve(item.serialized_size()); 47 | item.serialize_into(&mut dst.writer()) 48 | .expect("Failed to serialize into reserved space"); 49 | Ok(()) 50 | } 51 | } 52 | 53 | struct Context { 54 | context: Rc, 55 | token: Token, 56 | requests: VecDeque, 57 | db_context: Option, 58 | } 59 | 60 | struct SharedContext { 61 | database: Arc, 62 | db_sender: RefCell>, 63 | token_chans: Arc>>>, 64 | } 65 | 66 | pub struct Server { 67 | config: Config, 68 | } 69 | 70 | impl Context { 71 | fn new( 72 | context: Rc, 73 | token: Token, 74 | chan_tx: fmpsc::UnboundedSender, 75 | ) -> Self { 76 | metrics::CLIENT_CONNECTION.inc(); 77 | context.token_chans.lock().unwrap().insert(token, chan_tx); 78 | Context { 79 | context: context, 80 | token: token, 81 | db_context: Some(DbContext::new(token)), 82 | requests: VecDeque::new(), 83 | } 84 | } 85 | 86 | fn dispatch(&mut self, req: RespValue) { 87 | if let Some(mut db_context) = self.db_context.take() { 88 | debug!("Dispatched request ({}) {:?}", self.token, req); 89 | db_context.commands.push(req); 90 | self.context 91 | .db_sender 92 | .borrow_mut() 93 | .send(WorkerMsg::Command(db_context)); 94 | } else { 95 | debug!("Enqueued request ({}) {:?}", self.token, req); 96 | 
self.requests.push_back(req); 97 | } 98 | } 99 | 100 | fn dispatch_next(&mut self, mut db_context: DbContext) { 101 | assert!( 102 | self.db_context.is_none(), 103 | "can't cycle if there's nothing inflight" 104 | ); 105 | if let Some(req) = self.requests.pop_front() { 106 | debug!("Dispatched request ({}) {:?}", self.token, req); 107 | db_context.commands.push(req); 108 | self.context 109 | .db_sender 110 | .borrow_mut() 111 | .send(WorkerMsg::Command(db_context)); 112 | } else { 113 | self.db_context = Some(db_context); 114 | } 115 | } 116 | } 117 | 118 | impl Drop for Context { 119 | fn drop(&mut self) { 120 | self.context.token_chans.lock().unwrap().remove(&self.token); 121 | metrics::CLIENT_CONNECTION.dec(); 122 | } 123 | } 124 | 125 | impl Server { 126 | pub fn new(config: Config) -> Server { 127 | Server { config: config } 128 | } 129 | 130 | fn connection( 131 | context: Rc, 132 | token: Token, 133 | socket: tokio::net::TcpStream, 134 | ) -> Box> { 135 | socket.set_nodelay(true).expect("Failed to set nodelay"); 136 | let (sock_rx, sock_tx) = socket.split(); 137 | let sock_tx = codec::FramedWrite::new(sock_tx, RespCodec); 138 | let sock_rx = codec::FramedRead::new(sock_rx, RespCodec); 139 | let (chan_tx, chan_rx) = fmpsc::unbounded(); 140 | let ctx_rx = Rc::new(RefCell::new(Context::new(context, token, chan_tx))); 141 | let ctx_tx = ctx_rx.clone(); 142 | 143 | let fut_rx = sock_rx.for_each(move |request| { 144 | ctx_rx.borrow_mut().dispatch(request); 145 | Ok(()) 146 | }); 147 | 148 | let fut_tx = sock_tx 149 | .send_all( 150 | chan_rx 151 | .map(move |mut context| { 152 | let response = context.take_response(); 153 | context.clear(); 154 | ctx_tx.borrow_mut().dispatch_next(context); 155 | response 156 | }).map_err(|_| io::Error::from(io::ErrorKind::Other)), 157 | ).map(|_| ()); 158 | 159 | Box::new(fut_rx.select(fut_tx).map(|_| ()).map_err(|(e, _)| e)) 160 | } 161 | 162 | pub fn run(self) { 163 | let mut core = tokio::reactor::Core::new().unwrap(); 164 | 
165 | let token_chans: Arc>>> = 166 | Default::default(); 167 | let token_chans_cloned = token_chans.clone(); 168 | let response_fn = Box::new(move |context: DbContext| { 169 | let token = context.token; 170 | if let Some(chan) = token_chans_cloned.lock().unwrap().get_mut(&token) { 171 | if let Err(e) = chan.unbounded_send(context) { 172 | warn!("Can't send to token {} chan: {:?}", token, e); 173 | } 174 | } else { 175 | debug!("Can't find response channel for token {:?}", token); 176 | } 177 | }); 178 | 179 | let database = Database::new(&self.config, response_fn); 180 | 181 | let context = Rc::new(SharedContext { 182 | db_sender: RefCell::new(database.sender()), 183 | database: database, 184 | token_chans: token_chans, 185 | }); 186 | 187 | let mut next_token = 0; 188 | let handle = core.handle(); 189 | let listener = 190 | tokio::net::TcpListener::bind(&self.config.listen_addr, &core.handle()).unwrap(); 191 | let listener_fut = listener.incoming().for_each(|(socket, addr)| { 192 | if context.token_chans.lock().unwrap().len() 193 | >= context.database.config.client_connection_max as usize 194 | { 195 | info!( 196 | "Refusing connection from {:?}, connection limit reached", 197 | addr 198 | ); 199 | return Ok(()); 200 | } 201 | info!("Token {} accepting connection from {:?}", next_token, addr); 202 | let conn_ctx = context.clone(); 203 | handle.spawn( 204 | Self::connection(conn_ctx, next_token, socket).then(move |r| { 205 | info!("Token {} disconnected {:?}", next_token, r); 206 | Ok(()) 207 | }), 208 | ); 209 | next_token = next_token.wrapping_add(1); 210 | Ok(()) 211 | }); 212 | 213 | core.run(listener_fut).unwrap(); 214 | } 215 | } 216 | -------------------------------------------------------------------------------- /src/storage.rs: -------------------------------------------------------------------------------- 1 | use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt}; 2 | use rocksdb::{self, Writable}; 3 | use std::io::Write; 4 | use std::path::Path; 5 | 
use std::sync::Arc; 6 | use std::{mem, str}; 7 | use utils::*; 8 | 9 | struct U16BeSuffixTransform; 10 | 11 | impl rocksdb::SliceTransform for U16BeSuffixTransform { 12 | fn transform<'a>(&mut self, key: &'a [u8]) -> &'a [u8] { 13 | &key[..2] 14 | } 15 | 16 | fn in_domain(&mut self, _key: &[u8]) -> bool { 17 | true 18 | } 19 | } 20 | 21 | pub struct StorageManager { 22 | db: Arc, 23 | } 24 | 25 | #[inline] 26 | fn build_key<'a>(buffer: &'a mut [u8], num: u16, key: &[u8]) -> &'a [u8] { 27 | (&mut buffer[..2]).write_u16::(num).unwrap(); 28 | (&mut buffer[2..]).write_all(key).unwrap(); 29 | &buffer[..2 + key.len()] 30 | } 31 | 32 | #[inline] 33 | fn build_log_key<'a>(buffer: &'a mut [u8], num: u16, log_key: (u64, u64)) -> &'a [u8] { 34 | (&mut buffer[..2]).write_u16::(num).unwrap(); 35 | (&mut buffer[2..2 + 8]) 36 | .write_u64::(log_key.0) 37 | .unwrap(); 38 | (&mut buffer[2 + 8..2 + 8 + 8]) 39 | .write_u64::(log_key.1) 40 | .unwrap(); 41 | &buffer[..2 + 8 + 8] 42 | } 43 | 44 | #[inline] 45 | fn build_log_prefix<'a>(buffer: &'a mut [u8], num: u16, prefix: u64) -> &'a [u8] { 46 | (&mut buffer[..2]).write_u16::(num).unwrap(); 47 | (&mut buffer[2..2 + 8]) 48 | .write_u64::(prefix) 49 | .unwrap(); 50 | &buffer[..2 + 8] 51 | } 52 | 53 | // TODO: support TTL 54 | // TODO: specific comparator for log cf 55 | // TODO: merge operator could be a big win 56 | pub struct Storage { 57 | db: Arc, 58 | cf: &'static rocksdb::CFHandle, 59 | log_cf: &'static rocksdb::CFHandle, 60 | num: u16, 61 | } 62 | 63 | unsafe impl Sync for Storage {} 64 | unsafe impl Send for Storage {} 65 | 66 | pub struct StorageBatch<'a> { 67 | storage: &'a Storage, 68 | wb: rocksdb::WriteBatch, 69 | } 70 | 71 | pub struct SendableStorageBatch(rocksdb::WriteBatch); 72 | 73 | impl<'a> From> for SendableStorageBatch { 74 | fn from(sb: StorageBatch<'a>) -> Self { 75 | SendableStorageBatch(sb.wb) 76 | } 77 | } 78 | 79 | struct GenericIterator { 80 | db: Arc, 81 | iterator: rocksdb::rocksdb::DBIterator>, 82 | 
first: bool, 83 | } 84 | 85 | pub struct StorageIterator(GenericIterator); 86 | 87 | pub struct LogStorageIterator(GenericIterator); 88 | 89 | unsafe impl Send for GenericIterator {} 90 | 91 | impl StorageManager { 92 | pub fn new>(path: P) -> Result { 93 | let mut opts = rocksdb::DBOptions::new(); 94 | opts.create_if_missing(true); 95 | opts.set_max_background_jobs(4); 96 | opts.enable_pipelined_write(true); 97 | let mut def_cf_opts = rocksdb::ColumnFamilyOptions::new(); 98 | def_cf_opts 99 | .set_prefix_extractor("U16BeSuffixTransform", Box::new(U16BeSuffixTransform)) 100 | .unwrap(); 101 | def_cf_opts.compression_per_level(&[ 102 | rocksdb::DBCompressionType::No, 103 | rocksdb::DBCompressionType::No, 104 | rocksdb::DBCompressionType::Lz4, 105 | rocksdb::DBCompressionType::Lz4, 106 | rocksdb::DBCompressionType::Lz4, 107 | rocksdb::DBCompressionType::Lz4, 108 | rocksdb::DBCompressionType::Lz4, 109 | ]); 110 | def_cf_opts.set_write_buffer_size(32 * 1024 * 1024); 111 | def_cf_opts.set_max_bytes_for_level_base(4 * 32 * 1024 * 1024); 112 | def_cf_opts.set_max_write_buffer_number(4); 113 | 114 | let mut block_opts = rocksdb::BlockBasedOptions::new(); 115 | block_opts.set_bloom_filter(10, false); 116 | block_opts.set_lru_cache(4 * 32 * 1024 * 1024, -1, 0, 0f64); 117 | def_cf_opts.set_block_based_table_factory(&block_opts); 118 | 119 | let mut log_cf_opts = rocksdb::ColumnFamilyOptions::new(); 120 | log_cf_opts.compression(rocksdb::DBCompressionType::No); 121 | let mut fifo_opts = rocksdb::FifoCompactionOptions::new(); 122 | fifo_opts.set_ttl(3600 * 72); // 72 hours 123 | log_cf_opts.set_fifo_compaction_options(fifo_opts); 124 | log_cf_opts.set_compaction_style(rocksdb::DBCompactionStyle::Fifo); 125 | log_cf_opts.set_write_buffer_size(32 * 1024 * 1024); 126 | log_cf_opts.set_max_write_buffer_number(4); 127 | 128 | let mut block_opts = rocksdb::BlockBasedOptions::new(); 129 | block_opts.set_bloom_filter(10, false); 130 | block_opts.set_lru_cache(2 * 32 * 1024 * 1024, -1, 
0, 0f64); 131 | log_cf_opts.set_block_based_table_factory(&block_opts); 132 | 133 | // TODO: Rocksdb is complicated, we might want to tune some more options 134 | 135 | let db = rocksdb::DB::open_cf( 136 | opts.clone(), 137 | path.as_ref().to_str().unwrap(), 138 | vec![ 139 | ("default", def_cf_opts.clone()), 140 | ("log", log_cf_opts.clone()), 141 | ], 142 | ).or_else(|_| -> Result<_, String> { 143 | let mut db = rocksdb::DB::open_cf( 144 | opts, 145 | path.as_ref().to_str().unwrap(), 146 | vec![("default", def_cf_opts)], 147 | )?; 148 | 149 | db.create_cf(("log", log_cf_opts))?; 150 | Ok(db) 151 | })?; 152 | 153 | Ok(StorageManager { db: Arc::new(db) }) 154 | } 155 | 156 | pub fn open(&self, db_num: u16) -> Result { 157 | Ok(Storage { 158 | db: self.db.clone(), 159 | cf: unsafe { mem::transmute(self.db.cf_handle("default").unwrap()) }, 160 | log_cf: unsafe { mem::transmute(self.db.cf_handle("log").unwrap()) }, 161 | num: db_num, 162 | }) 163 | } 164 | 165 | pub fn batch_write(&self, batch: SendableStorageBatch) -> Result<(), GenericError> { 166 | Ok(self.db.write(batch.0)?) 
167 | } 168 | } 169 | 170 | impl Drop for StorageManager { 171 | fn drop(&mut self) { 172 | let sc = Arc::strong_count(&self.db); 173 | let wc = Arc::weak_count(&self.db); 174 | assert_eq!(wc, 0); 175 | assert_eq!(sc, 1); 176 | } 177 | } 178 | 179 | impl Storage { 180 | pub fn iterator(&self) -> StorageIterator { 181 | let mut key_prefix = [0u8; 2]; 182 | build_key(&mut key_prefix, self.num, b""); 183 | let mut ro = rocksdb::ReadOptions::new(); 184 | ro.set_total_order_seek(false); 185 | ro.set_prefix_same_as_start(true); 186 | let mut iterator = rocksdb::DBIterator::new_cf(self.db.clone(), self.cf, ro); 187 | iterator.seek(rocksdb::SeekKey::Key(&key_prefix[..])); 188 | StorageIterator(GenericIterator { 189 | db: self.db.clone(), 190 | iterator: iterator, 191 | first: true, 192 | }) 193 | } 194 | 195 | pub fn log_iterator_all(&self) -> LogStorageIterator { 196 | let mut key_prefix = [0u8; 2]; 197 | build_key(&mut key_prefix, self.num, b""); 198 | let mut end_prefix = [0u8; 2]; 199 | build_key(&mut end_prefix, self.num + 1, b""); 200 | let mut ro = rocksdb::ReadOptions::new(); 201 | ro.set_iterate_upper_bound(&end_prefix[..]); 202 | let mut iterator = rocksdb::DBIterator::new_cf(self.db.clone(), self.log_cf, ro); 203 | iterator.seek(rocksdb::SeekKey::Key(&key_prefix[..])); 204 | LogStorageIterator(GenericIterator { 205 | db: self.db.clone(), 206 | iterator: iterator, 207 | first: true, 208 | }) 209 | } 210 | 211 | pub fn log_iterator(&self, prefix: u64, start: u64) -> LogStorageIterator { 212 | let mut end_prefix = [0u8; 2 + 8]; 213 | build_log_prefix(&mut end_prefix, self.num, prefix + 1); 214 | let mut start_key = [0u8; 2 + 8 + 8]; 215 | build_log_key(&mut start_key, self.num, (prefix, start)); 216 | let mut ro = rocksdb::ReadOptions::new(); 217 | ro.set_iterate_upper_bound(&end_prefix[..]); 218 | let mut iterator = rocksdb::DBIterator::new_cf(self.db.clone(), self.log_cf, ro); 219 | iterator.seek(rocksdb::SeekKey::Key(&start_key[..])); 220 | 
LogStorageIterator(GenericIterator { 221 | db: self.db.clone(), 222 | iterator: iterator, 223 | first: true, 224 | }) 225 | } 226 | 227 | pub fn get R>( 228 | &self, 229 | key: &[u8], 230 | callback: F, 231 | ) -> Result, GenericError> { 232 | let mut buffer = [0u8; 512]; 233 | let buffer = build_key(&mut buffer, self.num, key); 234 | let r = self.db.get_cf(self.cf, buffer)?; 235 | trace!( 236 | "get {:?} ({:?} bytes)", 237 | str::from_utf8(key), 238 | r.as_ref().map(|x| x.len()) 239 | ); 240 | Ok(r.map(|r| callback(&*r))) 241 | } 242 | 243 | pub fn log_get R>( 244 | &self, 245 | log_key: (u64, u64), 246 | callback: F, 247 | ) -> Result, GenericError> { 248 | let mut buffer = [0u8; 2 + 8 + 8]; 249 | let buffer = build_log_key(&mut buffer, self.num, log_key); 250 | let r = self.db.get_cf(self.log_cf, buffer)?; 251 | trace!( 252 | "log_get {:?} ({:?} bytes)", 253 | log_key, 254 | r.as_ref().map(|x| x.len()) 255 | ); 256 | Ok(r.map(|r| callback(&*r))) 257 | } 258 | 259 | pub fn get_vec(&self, key: &[u8]) -> Result>, GenericError> { 260 | self.get(key, |v| v.to_owned()) 261 | } 262 | 263 | pub fn log_get_vec(&self, log_key: (u64, u64)) -> Result>, GenericError> { 264 | self.log_get(log_key, |v| v.to_owned()) 265 | } 266 | 267 | pub fn set(&self, key: &[u8], value: &[u8]) -> Result<(), GenericError> { 268 | let mut b = self.batch_new(0); 269 | b.set(key, value); 270 | self.batch_write(b) 271 | } 272 | 273 | pub fn del(&self, key: &[u8]) -> Result<(), GenericError> { 274 | let mut b = self.batch_new(0); 275 | b.del(key); 276 | self.batch_write(b) 277 | } 278 | 279 | pub fn batch_new(&self, reserve: usize) -> StorageBatch { 280 | StorageBatch { 281 | storage: self, 282 | wb: rocksdb::WriteBatch::with_capacity(reserve), 283 | } 284 | } 285 | 286 | pub fn batch_write(&self, batch: StorageBatch) -> Result<(), GenericError> { 287 | Ok(self.db.write(batch.wb)?) 
288 | } 289 | 290 | pub fn clear(&self) { 291 | trace!("clear"); 292 | let mut from = [0u8; 2]; 293 | let mut to = [0u8; 2]; 294 | (&mut from[..]).write_u16::(self.num).unwrap(); 295 | (&mut to[..]).write_u16::(self.num + 1).unwrap(); 296 | 297 | for &cf in &[self.cf, self.log_cf] { 298 | self.db 299 | .delete_files_in_range_cf(cf, &from[..], &to[..], false) 300 | .unwrap(); 301 | let mut ro = rocksdb::ReadOptions::new(); 302 | ro.set_total_order_seek(false); 303 | ro.set_prefix_same_as_start(true); 304 | ro.set_iterate_upper_bound(&to[..]); 305 | let mut iter = self.db.iter_cf_opt(cf, ro); 306 | iter.seek(rocksdb::SeekKey::Key(&from[..])); 307 | while iter.valid() { 308 | self.db.delete_cf(cf, iter.key()).unwrap(); 309 | iter.next(); 310 | } 311 | } 312 | } 313 | 314 | pub fn sync(&self) -> Result<(), GenericError> { 315 | debug!("sync"); 316 | Ok(self.db.sync_wal()?) 317 | } 318 | } 319 | 320 | impl<'a> StorageBatch<'a> { 321 | pub fn is_empty(&self) -> bool { 322 | self.wb.is_empty() 323 | } 324 | 325 | pub fn set(&mut self, key: &[u8], value: &[u8]) { 326 | trace!("set {:?} ({} bytes)", str::from_utf8(key), value.len()); 327 | let mut buffer = [0u8; 512]; 328 | let buffer = build_key(&mut buffer, self.storage.num, key); 329 | self.wb.put_cf(self.storage.cf, buffer, value).unwrap(); 330 | } 331 | 332 | pub fn log_set(&mut self, key: (u64, u64), value: &[u8]) { 333 | trace!("log_set {:?} ({} bytes)", key, value.len()); 334 | let mut buffer = [0u8; 2 + 8 + 8]; 335 | let buffer = build_log_key(&mut buffer, self.storage.num, key); 336 | self.wb.put_cf(self.storage.log_cf, buffer, value).unwrap(); 337 | } 338 | 339 | pub fn del(&mut self, key: &[u8]) { 340 | trace!("del {:?}", str::from_utf8(key)); 341 | let mut buffer = [0u8; 512]; 342 | let buffer = build_key(&mut buffer, self.storage.num, key); 343 | self.wb.delete_cf(self.storage.cf, buffer).unwrap() 344 | } 345 | } 346 | 347 | impl GenericIterator { 348 | pub fn iter<'a>(&'a mut self) -> GenericIteratorIter<'a> 
{ 349 | GenericIteratorIter { it: self } 350 | } 351 | } 352 | 353 | pub struct GenericIteratorIter<'a> { 354 | it: &'a mut GenericIterator, 355 | } 356 | 357 | impl<'a> Iterator for GenericIteratorIter<'a> { 358 | type Item = (u16, &'a [u8], &'a [u8]); 359 | fn next(&mut self) -> Option { 360 | if self.it.first { 361 | self.it.first = false; 362 | } else { 363 | // this iterator isn't fused so we need to check for valid here too 364 | if !self.it.iterator.valid() { 365 | return None; 366 | } 367 | self.it.iterator.next(); 368 | } 369 | if self.it.iterator.valid() { 370 | unsafe { 371 | let key = self.it.iterator.key(); 372 | let value = self.it.iterator.value(); 373 | // FIXME: bogus lifetime as slices are only valid until the next call to next() 374 | Some(( 375 | (&key[..2]).read_u16::().unwrap(), 376 | mem::transmute(&key[2..]), 377 | mem::transmute(value), 378 | )) 379 | } 380 | } else { 381 | None 382 | } 383 | } 384 | } 385 | 386 | pub struct StorageIteratorIter<'a>(GenericIteratorIter<'a>); 387 | 388 | impl StorageIterator { 389 | pub fn iter<'a>(&'a mut self) -> StorageIteratorIter<'a> { 390 | StorageIteratorIter(self.0.iter()) 391 | } 392 | } 393 | 394 | impl<'a> Iterator for StorageIteratorIter<'a> { 395 | type Item = (&'a [u8], &'a [u8]); 396 | fn next(&mut self) -> Option { 397 | self.0.next().map(|(_, k, v)| (k, v)) 398 | } 399 | } 400 | 401 | pub struct LogStorageIteratorIter<'a>(GenericIteratorIter<'a>); 402 | 403 | impl<'a> Iterator for LogStorageIteratorIter<'a> { 404 | type Item = ((u64, u64), &'a [u8]); 405 | fn next(&mut self) -> Option { 406 | self.0.next().map(|(_, key, value)| { 407 | let first = (&key[..8]).read_u64::().unwrap(); 408 | let second = (&key[8..8 + 8]).read_u64::().unwrap(); 409 | ((first, second), value) 410 | }) 411 | } 412 | } 413 | 414 | impl LogStorageIterator { 415 | pub fn iter<'a>(&'a mut self) -> LogStorageIteratorIter<'a> { 416 | LogStorageIteratorIter(self.0.iter()) 417 | } 418 | } 419 | 420 | #[cfg(test)] 421 | mod 
tests { 422 | use super::*; 423 | use std::fs; 424 | 425 | #[test] 426 | fn test_simple() { 427 | let _ = fs::remove_dir_all("t/test_simple"); 428 | let sm = StorageManager::new("t/test_simple").unwrap(); 429 | let storage = sm.open(1).unwrap(); 430 | assert_eq!(storage.get_vec(b"sample").unwrap(), None); 431 | storage.set(b"sample", b"sample_value").unwrap(); 432 | assert_eq!( 433 | storage.get_vec(b"sample").unwrap().unwrap(), 434 | b"sample_value" 435 | ); 436 | storage.del(b"sample").unwrap(); 437 | assert_eq!(storage.get_vec(b"sample").unwrap(), None); 438 | } 439 | 440 | #[test] 441 | fn test_simple_log() { 442 | let _ = fs::remove_dir_all("t/test_simple_log"); 443 | let sm = StorageManager::new("t/test_simple_log").unwrap(); 444 | let storage = sm.open(1).unwrap(); 445 | assert_eq!(storage.get_vec(b"sample").unwrap(), None); 446 | let mut b = storage.batch_new(0); 447 | b.set(b"sample", b"sample_value"); 448 | b.log_set((1, 1), b"sample"); 449 | storage.batch_write(b).unwrap(); 450 | assert_eq!( 451 | storage.get_vec(b"sample").unwrap().unwrap(), 452 | b"sample_value" 453 | ); 454 | assert_eq!(storage.log_get_vec((1, 1)).unwrap().unwrap(), b"sample"); 455 | } 456 | 457 | #[test] 458 | fn test_iter() { 459 | let _ = fs::remove_dir_all("t/test_iter"); 460 | let sm = StorageManager::new("t/test_iter").unwrap(); 461 | for &i in &[0, 1, 2] { 462 | let storage = sm.open(i).unwrap(); 463 | storage.set(b"1", i.to_string().as_bytes()).unwrap(); 464 | storage.set(b"2", i.to_string().as_bytes()).unwrap(); 465 | storage.set(b"3", i.to_string().as_bytes()).unwrap(); 466 | } 467 | for &i in &[0, 1, 2] { 468 | let storage = sm.open(i).unwrap(); 469 | let results: Vec> = storage.iterator().iter().map(|(_, v)| v.into()).collect(); 470 | assert_eq!(results, vec![i.to_string().as_bytes(); 3]); 471 | } 472 | } 473 | 474 | #[test] 475 | fn test_iter_log() { 476 | let _ = fs::remove_dir_all("t/test_iter_log"); 477 | let sm = StorageManager::new("t/test_iter_log").unwrap(); 478 | 
for &i in &[0u64, 1, 2] { 479 | let storage = sm.open(i as u16).unwrap(); 480 | let mut b = storage.batch_new(0); 481 | b.log_set((i, i + 0), (i + 0).to_string().as_bytes()); 482 | b.log_set((i, i + 1), (i + 1).to_string().as_bytes()); 483 | b.log_set((i, i + 2), (i + 2).to_string().as_bytes()); 484 | b.log_set((i + 1, i), b""); 485 | storage.batch_write(b).unwrap(); 486 | } 487 | for &i in &[0u64, 1, 2] { 488 | let storage = sm.open(i as u16).unwrap(); 489 | let results: Vec<(_, Vec)> = storage 490 | .log_iterator(i, i + 1) 491 | .iter() 492 | .map(|(k, v)| (k, v.into())) 493 | .collect(); 494 | assert_eq!( 495 | results, 496 | vec![ 497 | ((i, i + 1), (i + 1).to_string().into_bytes()), 498 | ((i, i + 2), (i + 2).to_string().into_bytes()), 499 | ] 500 | ); 501 | assert_eq!(storage.log_iterator(i, i).iter().count(), 3); 502 | assert_eq!(storage.log_iterator_all().iter().count(), 4); 503 | } 504 | } 505 | 506 | #[test] 507 | fn test_clear() { 508 | let _ = fs::remove_dir_all("t/test_clear"); 509 | let sm = StorageManager::new("t/test_clear").unwrap(); 510 | for &i in &[0u64, 1, 2] { 511 | let storage = sm.open(i as u16).unwrap(); 512 | let mut b = storage.batch_new(0); 513 | b.set(i.to_string().as_bytes(), i.to_string().as_bytes()); 514 | b.log_set((i, i), i.to_string().as_bytes()); 515 | storage.batch_write(b).unwrap(); 516 | } 517 | for &i in &[0u64, 1, 2] { 518 | let storage = sm.open(i as u16).unwrap(); 519 | assert_eq!(storage.iterator().iter().count(), 1); 520 | assert_eq!(storage.log_iterator(i, i).iter().count(), 1); 521 | storage.clear(); 522 | assert_eq!(storage.iterator().iter().count(), 0); 523 | assert_eq!(storage.log_iterator(i, i).iter().count(), 0); 524 | } 525 | } 526 | 527 | #[test] 528 | fn test_open_all() { 529 | let _ = fs::remove_dir_all("t/test_open_all"); 530 | let sm = StorageManager::new("t/test_open_all").unwrap(); 531 | sm.open(1).unwrap(); 532 | sm.open(2).unwrap(); 533 | sm.open(3).unwrap(); 534 | sm.open(1).unwrap(); 535 | 
sm.open(2).unwrap(); 536 | sm.open(3).unwrap(); 537 | } 538 | 539 | } 540 | -------------------------------------------------------------------------------- /src/types.rs: -------------------------------------------------------------------------------- 1 | use std::convert::TryFrom; 2 | use std::fmt; 3 | use std::str::FromStr; 4 | 5 | /// Identifier for a Database instance 6 | /// node id should be a positive i64 to work nicelly with the RESP protocol 7 | pub type NodeId = u64; 8 | /// Identifier for physical node (high u32 of NodeId) 9 | pub type PhysicalNodeId = u32; 10 | /// Identifier for connection with client 11 | pub type Token = u64; 12 | /// Identifier for a vnode 13 | pub type VNodeNo = u16; 14 | 15 | /// Identifier for communication between nodes 16 | #[derive(PartialEq, Eq, Hash, Serialize, Deserialize, Default, Copy, Clone)] 17 | pub struct Cookie(u64, u64); 18 | 19 | impl Cookie { 20 | pub fn new(a: u64, b: u64) -> Self { 21 | Cookie(a, b) 22 | } 23 | } 24 | 25 | impl fmt::Debug for Cookie { 26 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 27 | write!(f, "{:016X}{:016X}", self.0, self.1) 28 | } 29 | } 30 | 31 | /// Consistency Level as in Dynamo/Riak/Cassandra style 32 | #[derive(Copy, Clone, Debug, PartialEq, Eq)] 33 | pub enum ConsistencyLevel { 34 | One, 35 | Quorum, 36 | All, 37 | } 38 | 39 | #[derive(Copy, Clone, Debug)] 40 | pub struct ConsistencyLevelParseError; 41 | 42 | impl<'a> TryFrom<&'a [u8]> for ConsistencyLevel { 43 | type Error = ConsistencyLevelParseError; 44 | fn try_from(bytes: &'a [u8]) -> Result { 45 | if bytes.len() > 0 { 46 | match bytes[0] { 47 | b'1' | b'o' | b'O' => return Ok(ConsistencyLevel::One), 48 | b'q' | b'Q' => return Ok(ConsistencyLevel::Quorum), 49 | b'a' | b'A' => return Ok(ConsistencyLevel::All), 50 | _ => (), 51 | } 52 | } 53 | Err(ConsistencyLevelParseError) 54 | } 55 | } 56 | 57 | impl FromStr for ConsistencyLevel { 58 | type Err = ConsistencyLevelParseError; 59 | fn from_str(s: &str) -> Result { 60 
| Self::try_from(s.as_bytes()) 61 | } 62 | } 63 | 64 | impl ConsistencyLevel { 65 | pub fn required(&self, replicas: u8) -> u8 { 66 | match *self { 67 | ConsistencyLevel::One => 1, 68 | ConsistencyLevel::Quorum => replicas / 2 + 1, 69 | ConsistencyLevel::All => replicas, 70 | } 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /src/utils.rs: -------------------------------------------------------------------------------- 1 | use std::collections::{HashMap, HashSet}; 2 | use std::error::Error; 3 | use std::hash::{BuildHasherDefault, Hasher}; 4 | use std::{fmt, fs, io, path}; 5 | 6 | pub type GenericError = Box; 7 | 8 | pub type IdHasherBuilder = BuildHasherDefault; 9 | pub type IdHashMap = HashMap; 10 | pub type IdHashSet = HashSet; 11 | pub struct IdHasher(u64); 12 | 13 | impl Default for IdHasher { 14 | #[inline] 15 | fn default() -> IdHasher { 16 | IdHasher(0) 17 | } 18 | } 19 | 20 | impl Hasher for IdHasher { 21 | #[inline] 22 | fn finish(&self) -> u64 { 23 | self.0 24 | } 25 | 26 | #[inline] 27 | fn write(&mut self, bytes: &[u8]) { 28 | #[inline] 29 | fn mix(mut x: u64) -> u64 { 30 | // Seahash diffuse method 31 | x = x.wrapping_mul(0x6eed0e9da4d94a4f); 32 | let a = x >> 32; 33 | let b = x >> 60; 34 | x ^= a >> b; 35 | x.wrapping_mul(0x6eed0e9da4d94a4f) 36 | } 37 | 38 | debug_assert!(bytes.len() <= 8); 39 | unsafe { 40 | let mut temp = 0u64; 41 | ::std::ptr::copy_nonoverlapping( 42 | bytes.as_ptr(), 43 | &mut temp as *mut _ as *mut u8, 44 | bytes.len(), 45 | ); 46 | self.0 ^= mix(temp); 47 | } 48 | } 49 | } 50 | 51 | pub fn replace_default(subject: &mut T) -> T { 52 | ::std::mem::replace(subject, Default::default()) 53 | } 54 | 55 | pub fn into_io_error(e: E) -> io::Error { 56 | io::Error::new(io::ErrorKind::Other, e) 57 | } 58 | 59 | pub fn split_u64(uint: u64) -> (u32, u32) { 60 | ((uint >> 32) as u32, uint as u32) 61 | } 62 | 63 | pub fn join_u64(hi: u32, lo: u32) -> u64 { 64 | ((hi as u64) << 32) | (lo as 
u64) 65 | } 66 | 67 | pub fn assume_str(bytes: &[u8]) -> &str { 68 | unsafe { ::std::str::from_utf8_unchecked(bytes) } 69 | } 70 | 71 | pub fn is_dir_empty_or_absent>(path: P) -> io::Result { 72 | match fs::read_dir(path.as_ref()) { 73 | Ok(dir) => Ok(dir.count() == 0), 74 | Err(ref err) if err.kind() == io::ErrorKind::NotFound => Ok(true), 75 | Err(err) => Err(err), 76 | } 77 | } 78 | 79 | #[cfg(test)] 80 | pub fn sleep_ms(ms: u64) { 81 | ::std::thread::sleep(::std::time::Duration::from_millis(ms)); 82 | } 83 | 84 | #[cfg(test)] 85 | macro_rules! assert_eq_repr { 86 | ($left:expr, $right:expr) => {{ 87 | match (format!("{:?}", &$left), format!("{:?}", &$right)) { 88 | (left_val, right_val) => { 89 | if !(left_val == right_val) { 90 | panic!( 91 | "repr assertion failed: `(debug(left) == debug(right))` \ 92 | (left: `{:?}`, right: `{:?}`)", 93 | left_val, right_val 94 | ) 95 | } 96 | } 97 | } 98 | }}; 99 | } 100 | 101 | pub trait LoggerExt { 102 | fn log_error(&self, msg: &str); 103 | fn log_warn(&self, msg: &str); 104 | } 105 | 106 | impl LoggerExt for Result { 107 | fn log_error(&self, msg: &str) { 108 | if let &Err(ref e) = self { 109 | error!("{}: {:?}", msg, e); 110 | } 111 | } 112 | fn log_warn(&self, msg: &str) { 113 | if let &Err(ref e) = self { 114 | warn!("{}: {:?}", msg, e); 115 | } 116 | } 117 | } 118 | -------------------------------------------------------------------------------- /src/vnode_sync.rs: -------------------------------------------------------------------------------- 1 | use bincode; 2 | use bytes::Bytes; 3 | use cubes::Cube; 4 | use database::*; 5 | use fabric::*; 6 | use inflightmap::InFlightMap; 7 | use metrics::{self, Meter}; 8 | use std::collections::{hash_set, HashSet}; 9 | use std::time::{Duration, Instant}; 10 | use utils::IdHasherBuilder; 11 | use version_vector::*; 12 | use vnode::VNodeState; 13 | 14 | #[derive(Debug, Copy, Clone, PartialEq)] 15 | #[must_use] 16 | pub enum SyncResult { 17 | Continue, 18 | Done, 19 | Error, 20 | 
} 21 | 22 | impl From> for SyncResult { 23 | fn from(result: Result) -> Self { 24 | if result.is_ok() { 25 | SyncResult::Continue 26 | } else { 27 | SyncResult::Error 28 | } 29 | } 30 | } 31 | 32 | macro_rules! stry { 33 | ($expr:expr) => {{ 34 | let conv = $expr.into(); 35 | if let SyncResult::Continue = conv { 36 | conv 37 | } else { 38 | return conv; 39 | } 40 | }}; 41 | } 42 | 43 | #[derive(Debug)] 44 | pub enum SyncDirection { 45 | Incomming, 46 | Outgoing, 47 | } 48 | 49 | type IteratorFn = Box Result, ()> + Send>; 50 | 51 | type InFlightSyncMsgMap = InFlightMap; 52 | 53 | struct SyncKeysIterator { 54 | dots_delta: BitmappedVersionVectorDelta, 55 | keys: hash_set::IntoIter, 56 | } 57 | 58 | // TODO: Refactor into trait objects 59 | // trait Synchronization { fn on_.., .. } 60 | // new_sync_sender -> Box 61 | pub enum Synchronization { 62 | SyncSender { 63 | // bvv in peer at the time of sync start 64 | clocks_in_peer: BitmappedVersionVector, 65 | // partial copy of the local bvv at the time of sync start 66 | clocks_snapshot: BitmappedVersionVector, 67 | iterator: IteratorFn, 68 | // TODO: only store keys as resends should be rare 69 | inflight: InFlightSyncMsgMap, 70 | cookie: Cookie, 71 | peer: NodeId, 72 | // count of sent keys (includes inflight) 73 | count: u64, 74 | last_recv: Instant, 75 | last_send: Instant, 76 | }, 77 | SyncReceiver { 78 | // local bvv at the time of sync start 79 | clocks_in_peer: BitmappedVersionVector, 80 | cookie: Cookie, 81 | peer: NodeId, 82 | // aprox count of received keys (includes dups) 83 | recv_count: u64, 84 | last_recv: Instant, 85 | last_send: Instant, 86 | }, 87 | BootstrapSender { 88 | clocks_snapshot: BitmappedVersionVector, 89 | iterator: IteratorFn, 90 | inflight: InFlightSyncMsgMap, 91 | cookie: Cookie, 92 | peer: NodeId, 93 | // count of sent keys (includes inflight) 94 | count: u64, 95 | last_recv: Instant, 96 | last_send: Instant, 97 | }, 98 | BootstrapReceiver { 99 | cookie: Cookie, 100 | peer: NodeId, 101 | 
// aprox count of received keys (includes dups) 102 | recv_count: u64, 103 | last_recv: Instant, 104 | last_send: Instant, 105 | }, 106 | } 107 | 108 | impl SyncKeysIterator { 109 | fn new(dots_delta: BitmappedVersionVectorDelta) -> Self { 110 | SyncKeysIterator { 111 | dots_delta: dots_delta, 112 | keys: HashSet::new().into_iter(), 113 | } 114 | } 115 | 116 | fn next(&mut self, state: &VNodeState) -> Result, ()> { 117 | loop { 118 | if let Some(key) = self.keys.next() { 119 | return Ok(Some(key)); 120 | } 121 | // fetch log in batches of ~1_000 keys 122 | let hint_size = self.dots_delta.size_hint().0.min(1_000); 123 | let mut keys = HashSet::with_capacity(hint_size); 124 | // consider up to 90% of the actual capacity as an alternative limit 125 | let limit = (keys.capacity() * 9 / 10).max(1_000); 126 | for (n, v) in self.dots_delta.by_ref() { 127 | let key = state 128 | .storage 129 | .log_get((n, v), |x| Bytes::from(x)) 130 | .map_err(|_| ())?; 131 | if let Some(key) = key { 132 | keys.insert(key); 133 | if keys.len() >= limit { 134 | break; 135 | } 136 | } else { 137 | warn!("Can't find log key for ({}, {})", n, v); 138 | } 139 | } 140 | if keys.is_empty() { 141 | return Ok(None); 142 | } 143 | debug!("Sync will send key batch with {:?} keys", keys.len()); 144 | self.keys = keys.into_iter(); 145 | } 146 | } 147 | } 148 | 149 | use self::Synchronization::*; 150 | 151 | impl Synchronization { 152 | pub fn new_bootstrap_receiver( 153 | _db: &Database, 154 | _state: &mut VNodeState, 155 | peer: NodeId, 156 | cookie: Cookie, 157 | ) -> Self { 158 | BootstrapReceiver { 159 | cookie: cookie, 160 | peer: peer, 161 | recv_count: 0, 162 | last_recv: Instant::now(), 163 | last_send: Instant::now(), 164 | } 165 | } 166 | 167 | pub fn new_bootstrap_sender( 168 | _db: &Database, 169 | state: &mut VNodeState, 170 | peer: NodeId, 171 | msg: MsgSyncStart, 172 | ) -> Self { 173 | let mut storage_iterator = state.storage.iterator(); 174 | let iterator_fn: IteratorFn = 
Box::new(move |_| { 175 | let next = storage_iterator 176 | .iter() 177 | .map(|(k, v)| { 178 | let cube = bincode::deserialize::(v).map_err(|_| ())?; 179 | Ok((Bytes::from(k), cube)) 180 | }).next(); 181 | 182 | match next { 183 | Some(Ok(r)) => Ok(Some(r)), 184 | None => Ok(None), 185 | Some(Err(e)) => Err(e), 186 | } 187 | }); 188 | 189 | BootstrapSender { 190 | cookie: msg.cookie, 191 | clocks_snapshot: state.clocks.clone(), 192 | iterator: iterator_fn, 193 | inflight: InFlightMap::new(), 194 | peer: peer, 195 | count: 0, 196 | last_recv: Instant::now(), 197 | last_send: Instant::now(), 198 | } 199 | } 200 | 201 | pub fn new_sync_receiver( 202 | _db: &Database, 203 | state: &mut VNodeState, 204 | peer: NodeId, 205 | cookie: Cookie, 206 | ) -> Self { 207 | assert!(state.sync_nodes.insert(peer)); 208 | SyncReceiver { 209 | clocks_in_peer: state.clocks.clone(), 210 | peer: peer, 211 | cookie: cookie, 212 | recv_count: 0, 213 | last_recv: Instant::now(), 214 | last_send: Instant::now(), 215 | } 216 | } 217 | 218 | pub fn new_sync_sender( 219 | db: &Database, 220 | state: &mut VNodeState, 221 | peer: NodeId, 222 | msg: MsgSyncStart, 223 | ) -> Self { 224 | let MsgSyncStart { 225 | target, 226 | cookie, 227 | clocks_in_peer, 228 | .. 229 | } = msg; 230 | assert_eq!(target, Some(db.dht.node())); 231 | 232 | let clocks_snapshot = state.log_clocks.clone(); 233 | let dots_delta = clocks_snapshot.delta(&clocks_in_peer); 234 | 235 | debug!( 236 | "Creating SyncSender {:?} from {:?} to {:?}", 237 | cookie, clocks_snapshot, clocks_in_peer 238 | ); 239 | 240 | let mut sync_keys = SyncKeysIterator::new(dots_delta); 241 | let iterator_fn: IteratorFn = Box::new(move |state| { 242 | if let Some(key) = sync_keys.next(state)? 
{ 243 | let cube = state.storage_get(&key)?; 244 | Ok(Some((key, cube))) 245 | } else { 246 | Ok(None) 247 | } 248 | }); 249 | 250 | SyncSender { 251 | clocks_in_peer: clocks_in_peer, 252 | clocks_snapshot: clocks_snapshot, 253 | iterator: iterator_fn, 254 | inflight: InFlightMap::new(), 255 | cookie: cookie, 256 | peer: peer, 257 | count: 0, 258 | last_recv: Instant::now(), 259 | last_send: Instant::now(), 260 | } 261 | } 262 | 263 | // send SyncStart message, only valid for Receivers 264 | fn send_start(&mut self, db: &Database, state: &mut VNodeState) -> SyncResult { 265 | let (peer, cookie, target, clocks_in_peer) = match *self { 266 | SyncReceiver { 267 | cookie, 268 | peer, 269 | ref mut last_send, 270 | ref clocks_in_peer, 271 | .. 272 | } => { 273 | *last_send = Instant::now(); 274 | (peer, cookie, Some(peer), clocks_in_peer.clone()) 275 | } 276 | BootstrapReceiver { 277 | peer, 278 | cookie, 279 | ref mut last_send, 280 | .. 281 | } => { 282 | *last_send = Instant::now(); 283 | (peer, cookie, None, BitmappedVersionVector::new()) 284 | } 285 | _ => unreachable!(), 286 | }; 287 | 288 | info!("Sending start for {:?}", cookie); 289 | db.fabric 290 | .send_msg( 291 | peer, 292 | &MsgSyncStart { 293 | cookie: cookie, 294 | vnode: state.num(), 295 | clocks_in_peer: clocks_in_peer, 296 | target: target, 297 | }, 298 | ).into() 299 | } 300 | 301 | // Sending Errors always result in Error 302 | fn send_error_fin( 303 | &mut self, 304 | db: &Database, 305 | state: &mut VNodeState, 306 | error: FabricError, 307 | ) -> SyncResult { 308 | match *self { 309 | SyncReceiver { 310 | peer, 311 | cookie, 312 | ref mut last_send, 313 | .. 314 | } 315 | | BootstrapReceiver { 316 | peer, 317 | cookie, 318 | ref mut last_send, 319 | .. 320 | } 321 | | SyncSender { 322 | peer, 323 | cookie, 324 | ref mut last_send, 325 | .. 326 | } 327 | | BootstrapSender { 328 | peer, 329 | cookie, 330 | ref mut last_send, 331 | .. 
332 | } => { 333 | *last_send = Instant::now(); 334 | let _ = db.fabric.send_msg( 335 | peer, 336 | &MsgSyncFin { 337 | cookie: cookie, 338 | vnode: state.num(), 339 | result: Err(error), 340 | }, 341 | ); 342 | SyncResult::Error 343 | } 344 | } 345 | } 346 | 347 | // Senders wait for the Receivers to reply => Continue 348 | // unless there's no route the peer => Error 349 | fn send_sender_success_fin(&mut self, db: &Database, state: &mut VNodeState) -> SyncResult { 350 | match *self { 351 | SyncSender { 352 | peer, 353 | cookie, 354 | ref clocks_snapshot, 355 | ref mut last_send, 356 | .. 357 | } 358 | | BootstrapSender { 359 | peer, 360 | cookie, 361 | ref clocks_snapshot, 362 | ref mut last_send, 363 | .. 364 | } => { 365 | *last_send = Instant::now(); 366 | db.fabric 367 | .send_msg( 368 | peer, 369 | &MsgSyncFin { 370 | cookie: cookie, 371 | vnode: state.num(), 372 | result: Ok(clocks_snapshot.clone()), 373 | }, 374 | ).into() 375 | } 376 | _ => unreachable!(), 377 | } 378 | } 379 | 380 | // send (possibly multiple) SyncSend messages and eventual SyncFin 381 | // (also takes care of expired SyncSend) 382 | fn send_next(&mut self, db: &Database, state: &mut VNodeState) -> SyncResult { 383 | let now = Instant::now(); 384 | let timeout = now + Duration::from_millis(db.config.sync_msg_timeout as _); 385 | let (error, inflight_empty) = match *self { 386 | SyncSender { 387 | peer, 388 | cookie, 389 | ref mut iterator, 390 | ref mut count, 391 | ref mut inflight, 392 | ref mut last_send, 393 | .. 394 | } 395 | | BootstrapSender { 396 | peer, 397 | cookie, 398 | ref mut iterator, 399 | ref mut count, 400 | ref mut inflight, 401 | ref mut last_send, 402 | .. 
403 | } => { 404 | while let Some((seq, msg)) = inflight.touch_expired(now, timeout) { 405 | debug!("resending seq {} for sync/bootstrap {:?}", seq, cookie); 406 | let _ = stry!(db.fabric.send_msg(peer, msg,)); 407 | metrics::SYNC_RESEND.mark(1); 408 | } 409 | let mut error = false; 410 | while inflight.len() < db.config.sync_msg_inflight as usize { 411 | match iterator(state) { 412 | Ok(Some((k, v))) => { 413 | let msg = MsgSyncSend { 414 | cookie: cookie, 415 | vnode: state.num(), 416 | seq: *count, 417 | key: k.clone(), 418 | value: v.clone(), 419 | }; 420 | let _ = stry!(db.fabric.send_msg(peer, &msg,)); 421 | inflight.insert(*count, msg, timeout); 422 | *count += 1; 423 | *last_send = now; 424 | metrics::SYNC_SEND.mark(1); 425 | continue; 426 | } 427 | Ok(None) => { 428 | break; 429 | } 430 | Err(_) => { 431 | error = true; 432 | break; 433 | } 434 | } 435 | } 436 | (error, inflight.is_empty()) 437 | } 438 | _ => unreachable!(), 439 | }; 440 | 441 | if error { 442 | self.send_error_fin(db, state, FabricError::SyncInterrupted) 443 | } else if inflight_empty { 444 | // do not trottle success fin as we don't know if last_send 445 | // was set by MsgSend or MsgFin 446 | self.send_sender_success_fin(db, state) 447 | } else { 448 | SyncResult::Continue 449 | } 450 | } 451 | 452 | // called by vnode when node is transition to an incompatible state 453 | // only valid for Receivers right now 454 | pub fn on_cancel(&mut self, db: &Database, state: &mut VNodeState) { 455 | match *self { 456 | BootstrapReceiver { .. } | SyncReceiver { .. } => { 457 | let _ = self.send_error_fin(db, state, FabricError::BadVNodeStatus); 458 | } 459 | _ => unreachable!(), 460 | } 461 | } 462 | 463 | // called by vnode as soon as the sync is unregistered 464 | pub fn on_remove(self, db: &Database, state: &mut VNodeState) { 465 | match self { 466 | SyncReceiver { peer, .. 
} => { 467 | assert!(state.sync_nodes.remove(&peer)); 468 | } 469 | _ => (), 470 | } 471 | 472 | db.signal_sync_end(self.direction()); 473 | } 474 | 475 | pub fn on_tick(&mut self, db: &Database, state: &mut VNodeState) -> SyncResult { 476 | match *self { 477 | SyncSender { 478 | last_recv, cookie, .. 479 | } 480 | | BootstrapSender { 481 | last_recv, cookie, .. 482 | } => if last_recv.elapsed() > Duration::from_millis(db.config.sync_timeout as _) { 483 | warn!("sync/boostrap sender timed out {:?}", cookie); 484 | SyncResult::Error 485 | } else { 486 | self.send_next(db, state) 487 | }, 488 | SyncReceiver { 489 | last_recv, 490 | recv_count, 491 | last_send, 492 | cookie, 493 | .. 494 | } 495 | | BootstrapReceiver { 496 | last_recv, 497 | recv_count, 498 | last_send, 499 | cookie, 500 | .. 501 | } => if last_recv.elapsed() > Duration::from_millis(db.config.sync_timeout as _) { 502 | warn!("sync/boostrap receiver timed out {:?}", cookie); 503 | SyncResult::Error 504 | } else if recv_count == 0 505 | && last_send.elapsed() > Duration::from_millis(db.config.sync_msg_timeout as _) 506 | { 507 | self.send_start(db, state) 508 | } else { 509 | SyncResult::Continue 510 | }, 511 | } 512 | } 513 | 514 | // called by vnode as soon as the sync is registered (after creation) 515 | pub fn on_start(&mut self, db: &Database, state: &mut VNodeState) { 516 | let _ = match *self { 517 | SyncReceiver { .. } | BootstrapReceiver { .. } => self.send_start(db, state), 518 | SyncSender { .. } | BootstrapSender { .. } => self.send_next(db, state), 519 | }; 520 | } 521 | 522 | pub fn on_msg_fin( 523 | &mut self, 524 | db: &Database, 525 | state: &mut VNodeState, 526 | msg: MsgSyncFin, 527 | ) -> SyncResult { 528 | match *self { 529 | SyncReceiver { peer, .. } | BootstrapReceiver { peer, .. 
} => { 530 | if msg.result.is_ok() { 531 | state.clocks.merge(msg.result.as_ref().unwrap()); 532 | state.save(db, false); 533 | // send it back as a form of ack-ack 534 | let _ = db.fabric.send_msg(peer, &msg); 535 | SyncResult::Done 536 | } else if msg.result.err() == Some(FabricError::NotReady) { 537 | SyncResult::Continue 538 | } else { 539 | SyncResult::Error 540 | } 541 | } 542 | SyncSender { .. } | BootstrapSender { .. } => { 543 | // Senders are always Done on SyncFin messages 544 | SyncResult::Done 545 | } 546 | } 547 | } 548 | 549 | pub fn on_msg_send(&mut self, db: &Database, state: &mut VNodeState, msg: MsgSyncSend) { 550 | match *self { 551 | SyncReceiver { 552 | peer, 553 | ref mut recv_count, 554 | ref mut last_recv, 555 | ref mut last_send, 556 | .. 557 | } 558 | | BootstrapReceiver { 559 | peer, 560 | ref mut recv_count, 561 | ref mut last_recv, 562 | ref mut last_send, 563 | .. 564 | } => { 565 | // TODO: what to do with errors here? 566 | state 567 | .storage_set_remote(db, vec![(msg.key, msg.value, false)]) 568 | .unwrap(); 569 | 570 | let _ = db.fabric.send_msg( 571 | peer, 572 | &MsgSyncAck { 573 | cookie: msg.cookie, 574 | vnode: state.num(), 575 | seq: msg.seq, 576 | }, 577 | ); 578 | 579 | *recv_count += 1; 580 | let now = Instant::now(); 581 | *last_recv = now; 582 | *last_send = now; 583 | metrics::SYNC_RECV.mark(1); 584 | } 585 | _ => unreachable!(), 586 | } 587 | } 588 | 589 | pub fn on_msg_ack(&mut self, db: &Database, state: &mut VNodeState, msg: MsgSyncAck) { 590 | match *self { 591 | SyncSender { 592 | ref mut inflight, 593 | ref mut last_recv, 594 | .. 595 | } 596 | | BootstrapSender { 597 | ref mut inflight, 598 | ref mut last_recv, 599 | .. 600 | } => { 601 | inflight.remove(&msg.seq); 602 | *last_recv = Instant::now(); 603 | } 604 | _ => unreachable!(), 605 | } 606 | let _ = self.send_next(db, state); 607 | } 608 | 609 | pub fn direction(&self) -> SyncDirection { 610 | match *self { 611 | BootstrapReceiver { .. 
} | SyncReceiver { .. } => SyncDirection::Incomming, 612 | BootstrapSender { .. } | SyncSender { .. } => SyncDirection::Outgoing, 613 | } 614 | } 615 | } 616 | -------------------------------------------------------------------------------- /src/workers.rs: -------------------------------------------------------------------------------- 1 | use crossbeam_channel as chan; 2 | use std::sync::atomic::{AtomicUsize, Ordering}; 3 | use std::sync::Arc; 4 | use std::{thread, time}; 5 | 6 | pub trait ExitMsg { 7 | fn exit_msg() -> Self; 8 | fn is_exit(&self) -> bool; 9 | } 10 | 11 | /// A Sender attached to a WorkerManager 12 | /// messages are distributed to threads in a Round-Robin manner. 13 | pub struct WorkerSender { 14 | cursor: AtomicUsize, 15 | alive_threads: Arc, 16 | channels: Vec>, 17 | } 18 | 19 | impl Clone for WorkerSender { 20 | fn clone(&self) -> Self { 21 | WorkerSender { 22 | cursor: Default::default(), 23 | channels: self.channels.clone(), 24 | alive_threads: self.alive_threads.clone(), 25 | } 26 | } 27 | } 28 | 29 | /// A thread pool containing threads prepared to receive WorkerMsg's 30 | pub struct WorkerManager { 31 | thread_count: usize, 32 | threads: Vec>, 33 | name: String, 34 | alive_threads: Arc, 35 | channels: Vec>, 36 | } 37 | 38 | impl WorkerManager { 39 | pub fn new(name: String, thread_count: usize) -> Self { 40 | assert!(thread_count > 0); 41 | WorkerManager { 42 | thread_count: thread_count, 43 | threads: Default::default(), 44 | name: name, 45 | alive_threads: Default::default(), 46 | channels: Default::default(), 47 | } 48 | } 49 | 50 | pub fn start(&mut self, mut worker_fn_gen: F) 51 | where 52 | F: FnMut() -> Box, 53 | { 54 | assert!(self.channels.is_empty()); 55 | for i in 0..self.thread_count { 56 | // since neither closure cloning or Box are stable use Box 57 | let mut worker_fn = worker_fn_gen(); 58 | let (tx, rx) = chan::unbounded(); 59 | let alive_handle = self.alive_threads.clone(); 60 | self.channels.push(tx); 61 | 
self.threads.push( 62 | thread::Builder::new() 63 | .name(format!("Worker:{}:{}", i, self.name)) 64 | .spawn(move || { 65 | alive_handle.fetch_add(1, Ordering::SeqCst); 66 | for m in rx { 67 | if m.is_exit() { 68 | break; 69 | } 70 | worker_fn(m); 71 | } 72 | alive_handle.fetch_sub(1, Ordering::SeqCst); 73 | info!("Exiting worker"); 74 | }).unwrap(), 75 | ); 76 | } 77 | } 78 | 79 | pub fn sender(&self) -> WorkerSender { 80 | assert!(!self.channels.is_empty()); 81 | WorkerSender { 82 | cursor: Default::default(), 83 | channels: self.channels.clone(), 84 | alive_threads: self.alive_threads.clone(), 85 | } 86 | } 87 | } 88 | 89 | impl WorkerSender { 90 | pub fn send(&self, msg: T) -> bool { 91 | let cursor = self.cursor.fetch_add(1, Ordering::Relaxed); 92 | self.send_to(cursor, msg) 93 | } 94 | 95 | pub fn send_to(&self, seed: usize, msg: T) -> bool { 96 | self.channels[seed % self.channels.len()].send(msg); 97 | self.alive_threads.load(Ordering::SeqCst) > 0 98 | } 99 | } 100 | 101 | impl Drop for WorkerManager { 102 | fn drop(&mut self) { 103 | for c in &*self.channels { 104 | let _ = c.send(T::exit_msg()); 105 | } 106 | for t in self.threads.drain(..) 
{ 107 |             let _ = t.join(); 108 |         } 109 |     } 110 | } 111 | 112 | pub fn timer_fn( 113 |     name: String, 114 |     interval: time::Duration, 115 |     mut callback: F, 116 | ) -> thread::JoinHandle<()> 117 | where 118 |     F: FnMut(time::Instant) -> bool + Send + 'static, 119 | { 120 |     thread::Builder::new() 121 |         .name(format!("Timer:{}", name)) 122 |         .spawn(move || loop { 123 |             thread::sleep(interval); 124 |             if !callback(time::Instant::now()) { 125 |                 break; 126 |             } 127 |         }).expect("Can't start timer") 128 | } 129 | -------------------------------------------------------------------------------- /sucredb.yaml: -------------------------------------------------------------------------------- 1 | # NOTE: 2 | # to specify "size" you need to use a suffix, like: 3 | # 4 | # 1b => 1 byte 5 | # 1k | 1kb => 1024 bytes 6 | # 1m | 1mb => 1024*1024 bytes 7 | # 1g | 1gb => 1024*1024*1024 bytes 8 | # 9 | # to specify "time" you need to use a suffix, like: 10 | # 11 | # 1ms => 1 millisecond 12 | # 1s => 1 second 13 | # 1m => 60 seconds 14 | # 1h => 60 minutes 15 | 16 | # ====== GENERAL CONFIGURATION ====== 17 | 18 | # Location of data directory in the file system 19 | data_dir: "./data" 20 | 21 | # Seed nodes when joining a cluster 22 | # seed_nodes: ["123.123.123:16379"] 23 | seed_nodes: [] 24 | 25 | # Cluster name, must be the same for nodes to "see" each other 26 | cluster_name: "default" 27 | 28 | # Ip and port to bind the socket for client connections 29 | listen_addr: "127.0.0.1:6379" 30 | 31 | # Ip and port to bind the socket for internal cluster connections 32 | fabric_addr: "127.0.0.1:16379" 33 | 34 | # Timeout for client requests 35 | # request_timeout: "1000ms" 36 | 37 | # Resolution for internal tasks timer 38 | # worker_timer: "500ms" 39 | 40 | # Number of worker threads 41 | # Defaults to max(4, 1 + cpucount * 2) 42 | # worker_count: 4 43 | 44 | # Maximum number of incoming syncs 45 | # sync_incomming_max: 10 46 | 47 | # Maximum number of outgoing syncs 48 | # sync_outgoing_max:
10 49 | 50 | # Maximum number of client connections 51 | # client_connection_max: 100 52 | 53 | # logging configuration, log4rs style 54 | logging: 55 |   appenders: 56 |     console: 57 |       kind: "console" 58 |       target: "stderr" 59 |     file: 60 |       kind: "file" 61 |       path: "./sucredb.log" 62 | 63 |   loggers: 64 |     sucredb: 65 |       level: "debug" 66 |       appenders: ["file", "console"] 67 | 68 | # ====== ADVANCED CONFIGURATION ====== 69 | 70 | # Amount of time to wait before aborting an unresponsive sync 71 | # sync_timeout: "10s" 72 | 73 | # Timeout for sync messages 74 | # sync_msg_timeout: "1000ms" 75 | 76 | # Maximum number of sync messages inflight (per sync) 77 | # sync_msg_inflight: 10 78 | 79 | # Maximum number of conflicting versions for a given value 80 | # value_version_max: 100 81 | --------------------------------------------------------------------------------