├── .gitignore ├── .travis.yml ├── Cargo.lock ├── Cargo.toml ├── LICENSE ├── README.md ├── rustfmt.toml ├── scripts ├── sanity.py ├── simple.py └── test_cluster.sh ├── src ├── command.rs ├── config.rs ├── cubes.rs ├── database.rs ├── dht.rs ├── fabric.rs ├── fabric_msg.rs ├── gossip.rs ├── hash.rs ├── inflightmap.rs ├── main.rs ├── metrics.rs ├── resp.rs ├── server.rs ├── storage.rs ├── types.rs ├── utils.rs ├── version_vector.rs ├── vnode.rs ├── vnode_sync.rs └── workers.rs └── sucredb.yaml /.gitignore: -------------------------------------------------------------------------------- 1 | target 2 | *.rs.bk 3 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | dist: trusty 2 | sudo: false 3 | language: rust 4 | rust: nightly 5 | 6 | env: 7 | global: 8 | - RUST_BACKTRACE=1 9 | - RUST_TEST_THREADS=1 10 | - CARGO_BUILD_JOBS=1 11 | - MAKE_PARALLELISM=1 12 | 13 | cache: 14 | - cargo 15 | 16 | install: 17 | - pip install --user redis redis-py-cluster funcy 18 | 19 | script: 20 | - travis_wait sleep 1000000000 & 21 | - cargo test --verbose 22 | - python scripts/sanity.py verbose 2>&1 | tail -n 100 23 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "sucredb" 3 | version = "0.1.0" 4 | authors = ["arthurprs"] 5 | 6 | [dependencies] 7 | rand = "0.5" 8 | serde = "1.0" 9 | serde_derive = "1.0" 10 | log = "0.4" 11 | byteorder="1.0" 12 | tokio-core = "0.1" 13 | tokio-io = "0.1" 14 | tokio-codec = "0.1" 15 | futures = "0.1" 16 | clap="2.0" 17 | crc16="0.4" 18 | metrics="0.2" 19 | lazy_static = "1.0" 20 | serde_yaml = "0.8" 21 | bincode="1.0" 22 | num_cpus="1.0" 23 | roaring="0.5" 24 | crossbeam-channel="0.2" 25 | 26 | [dependencies.log4rs] 27 | version = "0.8" 28 | default-features = false 29 | features 
= ["all_components", "file", "yaml_format"] 30 | 31 | [dependencies.rocksdb] 32 | git = "https://github.com/pingcap/rust-rocksdb.git" 33 | rev = "b011ecb17759d052ae39e2c86addc7b1c7e6c178" 34 | features = ["portable", "sse"] 35 | 36 | [dependencies.linear-map] 37 | version = "1.2" 38 | features = ["serde_impl"] 39 | 40 | [dependencies.bytes] 41 | version = "0.4" 42 | features = ["serde"] 43 | 44 | [dev-dependencies] 45 | env_logger = "0.5.0" 46 | 47 | # enable for profiling 48 | # [profile.release] 49 | # debug=true 50 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Arthur Silva 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Sucredb 2 | 3 | > *A database made of sugar cubes* 4 | 5 | [![Build Status](https://travis-ci.org/arthurprs/sucredb.svg?branch=master)](https://travis-ci.org/arthurprs/sucredb) 6 | 7 | Sucredb is a multi-master key-value distributed database, it provides a dynamo style tunable consistent and causality tracking. 8 | 9 | Any node that owns a partition (replicas) can serve both reads and writes. The database tracks causality using vector-clocks and will NOT drop any conflicting writes unlike LWW (last write wins) and other strategies. Conflicts can and do happen due to races between clients and network partitions. 10 | 11 | Status: Alpha quality with missing pieces. 12 | 13 | # API & Clients 14 | 15 | Theoretically you can use Sucredb with any Redis Cluster clients. 16 | 17 | It implements a tiny subset of Redis commands. Only basic Key-Value/Sets/Hashes operations are supported at this point. 18 | 19 | ### Key Value 20 | 21 | #### GET 22 | 23 | *GET* result(s) is/are returned as an array containing the values (zero, one or more if there's conflicting versions) plus the causal context. The context is an binary string and is always returned as the last item of the array even if no values are present. 24 | 25 | `> GET key {consistency}` 26 | 27 | `< [{value1}, {value2}, .., context]` 28 | 29 | #### MGET 30 | 31 | *MGET* takes the # of keys (N) followed by N keys. Results are returned as an array. 32 | 33 | 34 | `> MGET key_count {key1} {key2} {..} {consistency}` 35 | 36 | `< [[{value1_1}, {value1_2}, .., context], [{value2_1}, {value2_2}, .., context]]` 37 | 38 | #### SET 39 | 40 | *SET*, in addition to the key and value, also takes the causal context. If you're sure it don't exist you can actually omit the context, if you're wrong it'll create a conflicting version. 
41 | 42 | `> SET key value {context} {consistency}` 43 | 44 | `< OK` 45 | 46 | #### GETSET 47 | 48 | *GETSET* is similar to set, but returns the updated value(s) and a new context. Despite the name and the semantics in Redis, the get is always done *after* the set. 49 | 50 | `> GETSET key value context {consistency}` 51 | 52 | `< [{value1}, {value2}, .., context]` 53 | 54 | #### DEL 55 | 56 | *DEL* is like set and also requires a context when dealing with basic values. 57 | Following Redis api *del* works for keys with any datastructure, in these cases the context is ignored (you can use an empty string instead). 58 | 59 | `> DEL key context {consistency}` 60 | 61 | `< 1 OR 0 (if not found)` 62 | 63 | ### Data structures 64 | 65 | Sucredb also supports a tiny subset of commands for Hash and Set datatypes in addition to a dedicated Counter type. These types are [CRDTs](https://en.wikipedia.org/wiki/Conflict-free_replicated_data_type) and don't require a context to be sent along the operation. Mutations depend on the coordinator version of the value and conflicts are handled as follows: 66 | 67 | * Hash: On values conflict the latest write wins. 68 | * Set: On values conflict add wins. 69 | * Counter: Deletes may erase non observed increments. 70 | 71 | ### CGET 72 | 73 | Returns the value for a counter or Nil if none is found. 74 | 75 | `> CGET key {consistency}` 76 | 77 | `< 1011` 78 | 79 | ### CSET 80 | 81 | Sets the value for a counter. 82 | 83 | `> CSET key int_value {consistency}` 84 | 85 | `< OK` 86 | 87 | ### INCRBY 88 | 89 | Increments the value for a counter, the delta can be either positive or negative. 90 | 91 | `> INCRBY key delta_value {consistency}` 92 | 93 | `< resulting_int_value` 94 | 95 | #### HGETALL 96 | 97 | Gets all key value pairs from a hash. 98 | 99 | `> HGETALL key {consistency}` 100 | 101 | `< [{KA, VA}, {KB, VB}, ...]` 102 | 103 | #### HSET 104 | 105 | Sets a key value pair in a hash. 
106 | 107 | `> HSET key hash_key value {consistency}` 108 | 109 | `< 1 OR 0 (if hash_key already existed) ` 110 | 111 | #### HDEL 112 | 113 | Deletes a key from a hash. 114 | 115 | `> HDEL key hash_key {consistency}` 116 | 117 | `< 1 OR 0 (if hash_key didn't exist)` 118 | 119 | #### SMEMBERS 120 | 121 | Gets all values from a set. 122 | 123 | `> SMEMBERS key {consistency}` 124 | 125 | `< [{KA}, {KB}, ...]` 126 | 127 | #### SADD 128 | 129 | Adds a value from the set. 130 | 131 | `> SADD key value {consistency}` 132 | 133 | `< 1 OR 0 (if value already existed) ` 134 | 135 | #### SREM 136 | 137 | Removes a value from the set. 138 | 139 | `> SREM key value {consistency}` 140 | 141 | `< 1 OR 0 (if value didn't exist) ` 142 | 143 | ### MULTI/EXEC Batches 144 | 145 | todo 146 | 147 | ### Other parameters 148 | 149 | #### `context` parameter 150 | 151 | If you don't have a context (from a previous get or getset) you can send an empty string. 152 | 153 | #### `consistency` parameter 154 | 155 | `{consistency}` follows the dynamo/cassandra/riak style: 156 | 157 | * `1`, `o`, `O`: One 158 | * `q`, `Q`: Quorum 159 | * `a`, `A`: All 160 | 161 | # Running 162 | 163 | **Requirements** 164 | 165 | * Needs a reasonably recent Rust (nightly[2]) 166 | * C++ compiler (for Rocksdb). 167 | 168 | **Running** 169 | 170 | * The following setup will use the default settings. 171 | * Clone the repo and enter repository root 172 | * `cargo install .` [3] 173 | * `sucredb --help` 174 | 175 | Single/First instance 176 | 177 | `sucredb -d datadir1 -l 127.0.0.1:6379 -f 127.0.0.1:16379 init` 178 | 179 | The command above will initialize a new cluster containing this node. The cluster will have the default name, partition count and replication factor. 180 | 181 | Second instance 182 | 183 | `sucredb -d datadir2 -l 127.0.0.1:6378 -f 127.0.0.1:16378 -s 127.0.0.1:16379` 184 | 185 | The second instance joins the cluster using the first instance as a seed. 
186 | 187 | Quick test 188 | 189 | `redis-cli CLUSTER SLOTS` 190 | 191 | #### Example 192 | 193 | Quick example using *redis-cli* 194 | 195 | ``` 196 | ➜ ~ redis-cli 197 | 127.0.0.1:6379> GET there 198 | 1) "\x00\x00\x00\x00\x00\x00\x00\x00" 199 | 127.0.0.1:6379> SET there 1 "\x00\x00\x00\x00\x00\x00\x00\x00" 200 | OK 201 | 127.0.0.1:6379> GET there 202 | 1) "1" 203 | 2) "\x01\x00\x00\x00\x00\x00\x00\x00P\xb0n\x83g\xef`\n\x01\x00\x00\x00\x00\x00\x00\x00" 204 | 127.0.0.1:6379> SET there 2 205 | OK 206 | 127.0.0.1:6379> GET there 1 207 | 1) "1" 208 | 2) "2" 209 | 3) "\x01\x00\x00\x00\x00\x00\x00\x00P\xb0n\x83g\xef`\n\x02\x00\x00\x00\x00\x00\x00\x00" 210 | 127.0.0.1:6379> SET there 3 "\x01\x00\x00\x00\x00\x00\x00\x00P\xb0n\x83g\xef`\n\x02\x00\x00\x00\x00\x00\x00\x00" 211 | OK 212 | 127.0.0.1:6379> GET there 213 | 1) "3" 214 | 2) "\x01\x00\x00\x00\x00\x00\x00\x00P\xb0n\x83g\xef`\n\x03\x00\x00\x00\x00\x00\x00\x00" 215 | 127.0.0.1:6379> GETSET there 4 "\x01\x00\x00\x00\x00\x00\x00\x00P\xb0n\x83g\xef`\n\x03\x00\x00\x00\x00\x00\x00\x00" 216 | 1) "4" 217 | 2) "\x01\x00\x00\x00\x00\x00\x00\x00P\xb0n\x83g\xef`\n\x04\x00\x00\x00\x00\x00\x00\x00" 218 | 127.0.0.1:6379> DEL there "\x01\x00\x00\x00\x00\x00\x00\x00P\xb0n\x83g\xef`\n\x04\x00\x00\x00\x00\x00\x00\x00" 219 | 1 220 | 127.0.0.1:6379> GET there q 221 | 1) "\x01\x00\x00\x00\x00\x00\x00\x00P\xb0n\x83g\xef`\n\x05\x00\x00\x00\x00\x00\x00\x00 222 | ``` 223 | 224 | # Configuration 225 | 226 | See `sucredb.yaml` 227 | 228 | To use configuration file use: `sucredb -c sucredb.yaml` 229 | 230 | # CAP theorem 231 | 232 | It behaves mostly like an AP system but not exactly. 233 | 234 | Sucredb doesn't use sloppy quorum or hinted handoff so it can't serve requests that don't satisfy the requested/default consistency level. 235 | 236 | # Performance 237 | 238 | Almost every single new thing claims to be fast or blazing fast. Sucredb makes no claims at this point, but it's probably fast. 
239 | 240 | The data structure operations move the entire collection around the cluster so it's *not* suitable for large values/collections. 241 | 242 | # Ideas worth exploring 243 | 244 | * Improve the data model with a range/clustering key. 245 | 246 | # Background 247 | 248 | Storage takes advantage of RocksDB. 249 | 250 | It uses a variant of version clocks to track causality. The actual algorithm is heavily inspired by [1]. 251 | 252 | ---- 253 | 254 | [1] Gonçalves, Ricardo, et al. "Concise server-wide causality management for eventually consistent data stores." 255 | 256 | [2] Mostly due to the try_from and impl trait features that should be stable soon. 257 | 258 | [3] Be patient. 259 | -------------------------------------------------------------------------------- /rustfmt.toml: -------------------------------------------------------------------------------- 1 | use_try_shorthand=true 2 | error_on_line_overflow=false 3 | -------------------------------------------------------------------------------- /scripts/sanity.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import time 3 | import sys 4 | import random 5 | from itertools import chain 6 | from redis import StrictRedis 7 | from rediscluster import StrictRedisCluster 8 | from funcy import retry 9 | from collections import defaultdict 10 | import shutil 11 | 12 | VERBOSE = False 13 | 14 | 15 | class Instance(object): 16 | BIND = "127.0.0.1" 17 | PORT = 6379 18 | FPORT = 16379 19 | 20 | def __init__(self, i, ii): 21 | super(Instance, self).__init__() 22 | self.i = i 23 | self.ii = ii 24 | self.process = None 25 | self.listen_addr = "{}:{}".format(self.BIND, self.PORT + self.i) 26 | self.fabric_addr = "{}:{}".format(self.BIND, self.FPORT + self.i) 27 | self.data_dir = "n{}".format(self.i) 28 | 29 | @property 30 | def client(self): 31 | return StrictRedis(self.BIND, self.PORT + self.i) 32 | 33 | def clear_data(self): 34 | 
shutil.rmtree(self.data_dir, ignore_errors=True) 35 | 36 | def cluster_init(self): 37 | self.clear_data() 38 | self.start("init") 39 | 40 | def cluster_join(self): 41 | self.clear_data() 42 | self.start() 43 | 44 | def wait_ready(self, callback=lambda c: c.ping(), 45 | timeout=5, sleep=0.1): 46 | @retry(int(timeout / float(sleep) + 0.5), timeout=sleep) 47 | def inner(): 48 | assert callback(self.client) 49 | inner() 50 | 51 | def start(self, *args): 52 | assert not self.process 53 | self.process = subprocess.Popen( 54 | ["cargo", "run", "--", 55 | "-l", self.listen_addr, 56 | "-f", self.fabric_addr, 57 | "-d", self.data_dir] 58 | + list(chain.from_iterable( 59 | ["-s", "{}:{}".format(self.BIND, self.FPORT + i)] 60 | for i in range(self.ii) 61 | if i != self.i 62 | )) 63 | + list(args), 64 | stdin=sys.stdin if VERBOSE else None, 65 | stdout=sys.stdout if VERBOSE else None, 66 | stderr=sys.stderr if VERBOSE else None, 67 | ) 68 | self.wait_ready() 69 | 70 | def __del__(self): 71 | if self.process: 72 | self.process.kill() 73 | 74 | def kill(self): 75 | assert self.process 76 | self.process.kill() 77 | self.process.wait() 78 | self.process = None 79 | 80 | def restart(self): 81 | self.kill() 82 | self.start() 83 | 84 | @property 85 | def running(self): 86 | return bool(self.process) 87 | 88 | def execute(self, *args, **kwargs): 89 | self.client.execute_command(*args, **kwargs) 90 | 91 | 92 | def main(): 93 | global VERBOSE 94 | VERBOSE = "verbose" in sys.argv[1:] 95 | subprocess.check_call(["cargo", "build"]) 96 | cluster_sz = 3 97 | cluster = [Instance(i, cluster_sz) for i in range(cluster_sz)] 98 | cluster[0].cluster_init() 99 | cluster[1].cluster_join() 100 | cluster[2].cluster_join() 101 | cluster[0].execute("CLUSTER", "REBALANCE") 102 | time.sleep(5) 103 | 104 | client = StrictRedisCluster( 105 | startup_nodes=[ 106 | {"host": n.listen_addr.partition(":")[0], 107 | "port": int(n.listen_addr.partition(":")[2])} 108 | for n in cluster 109 | ], 110 | 
decode_responses=False, 111 | socket_timeout=0.5, 112 | ) 113 | 114 | check_map = defaultdict(set) 115 | items = 1000 116 | groups = 100 117 | for i in xrange(items): 118 | k = str(i % groups) 119 | v = str(i) 120 | client.execute_command("SET", k, v, "", "Q") 121 | check_map[k].add(v) 122 | if random.random() < 0.1: 123 | n = random.choice(cluster) 124 | # restart and wait for it to connect to cluster 125 | n.restart() 126 | n.wait_ready(lambda c: c.execute_command("CLUSTER", "CONNECTIONS")) 127 | 128 | # let the syncs settle 129 | time.sleep(5) 130 | 131 | @retry(2, timeout=5) 132 | def test_all_nodes_complete(): 133 | for k, expected in check_map.items(): 134 | values = set(client.get(k)[:-1]) 135 | assert values == expected, "%s %s %s" % (k, expected, values) 136 | for c in cluster: 137 | values = set(c.client.execute_command("GET", k, "1")[:-1]) 138 | assert values == expected, \ 139 | "key %s expected %s got %s (diff %s)" % ( 140 | k, expected, values, expected ^ values) 141 | 142 | test_all_nodes_complete() 143 | 144 | 145 | if __name__ == '__main__': 146 | main() 147 | -------------------------------------------------------------------------------- /scripts/simple.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # python std lib 4 | import time 5 | 6 | # 3rd party imports 7 | import funcy 8 | from docopt import docopt 9 | from redis._compat import xrange 10 | 11 | 12 | def resp(aa): 13 | if len(aa) == 1: 14 | return [], aa[0] 15 | if len(aa) == 2: 16 | return aa[0], aa[1] 17 | raise Exception("multiple values returned ~ " + str(aa)) 18 | 19 | 20 | def loop(rc, reset_last_key=None): 21 | """ 22 | Regular debug loop that can be used to test how redis behaves during changes in the cluster. 
23 | """ 24 | _, last_ctx = resp(rc.get("__last__")) 25 | if reset_last_key: 26 | rc.hset("__last__", 0, last_ctx) 27 | 28 | last = False 29 | while last is False: 30 | try: 31 | last, last_ctx = resp(rc.get("__last__")) 32 | print "last is %s" % last 33 | last = 0 if not last else int(last) 34 | print("starting at foo{0}".format(last)) 35 | except Exception as e: 36 | print("error1 {0}".format(repr(e))) 37 | time.sleep(1) 38 | 39 | for i in xrange(last, 1000000000): # noqa 40 | try: 41 | print("SET foo{} {}".format(i, i)) 42 | rc.set("foo{}".format(i), str(i)) 43 | got, got_ctx = resp(rc.get("foo{}".format(i))) 44 | print("GET foo{} {}".format(i, got)) 45 | assert got == str(i), "%s != %s" % (got, i) 46 | _, last_ctx = rc.execute_command("getset", "__last__", i, last_ctx) 47 | except Exception as e: 48 | print("error2 {}".format(repr(e))) 49 | 50 | 51 | def timeit(rc, itterations=50000): 52 | """ 53 | Time how long it take to run a number of set/get:s 54 | """ 55 | t0 = time.time() 56 | for i in xrange(0, itterations): # noqa 57 | s = "foo{0}".format(i) 58 | rc.set(s, i) 59 | rc.get(s) 60 | 61 | t1 = time.time() - t0 62 | print("{0}k SET/GET operations took: {1} seconds... {2} operations per second".format( 63 | (itterations / 1000) * 2, t1, (itterations / t1) * 2)) 64 | 65 | 66 | def timeit_pipeline(rc, itterations=50000): 67 | """ 68 | Time how long it takes to run a number of set/get:s inside a cluster pipeline 69 | """ 70 | t0 = time.time() 71 | for i in xrange(0, itterations): # noqa 72 | s = "foo{0}".format(i) 73 | 74 | p = rc.pipeline() 75 | p.set(s, i) 76 | p.get(s) 77 | p.execute() 78 | 79 | t1 = time.time() - t0 80 | print("{0}k SET/GET operations inside pipelines took: {1} seconds... 
{2} operations per second".format( 81 | (itterations / 1000) * 2, t1, (itterations / t1) * 2) 82 | ) 83 | 84 | 85 | if __name__ == "__main__": 86 | __docopt__ = """ 87 | Usage: 88 | simple [--host IP] [--port PORT] [--nocluster] [--timeit] [--pipeline] [--resetlastkey] [-h] [--version] 89 | 90 | Options: 91 | --nocluster If flag is set then StrictRedis will be used instead of cluster lib 92 | --host IP Redis server to test against [default: 127.0.0.1] 93 | --port PORT Port on redis server [default: 7000] 94 | --timeit run a mini benchmark to test performance 95 | --pipeline Only usable with --timeit flag. Runs SET/GET inside pipelines. 96 | --resetlastkey reset __last__ key 97 | -h --help show this help and exit 98 | -v --version show version and exit 99 | """ 100 | 101 | args = docopt(__docopt__, version="0.3.0") 102 | 103 | startup_nodes = [{"host": args["--host"], "port": args["--port"]}] 104 | 105 | if not args["--nocluster"]: 106 | from rediscluster import StrictRedisCluster 107 | rc = StrictRedisCluster(startup_nodes=startup_nodes, 108 | max_connections=32, socket_timeout=0.5, 109 | decode_responses=False, skip_full_coverage_check=True) 110 | else: 111 | from redis import StrictRedis 112 | rc = StrictRedis(host=args["--host"], port=args["--port"], 113 | socket_timeout=0.5, decode_responses=False) 114 | 115 | if args["--timeit"]: 116 | test_itterstions = [ 117 | 5000, 118 | 10000, 119 | 20000, 120 | ] 121 | 122 | if args["--pipeline"]: 123 | for itterations in test_itterstions: 124 | timeit_pipeline(rc, itterations=itterations) 125 | else: 126 | for itterations in test_itterstions: 127 | timeit(rc, itterations=itterations) 128 | else: 129 | loop(rc, reset_last_key=args["--resetlastkey"]) 130 | -------------------------------------------------------------------------------- /scripts/test_cluster.sh: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env bash 2 | 3 | trap "trap - SIGTERM && kill -- -$$" SIGINT SIGTERM EXIT 4 | export RUST_BACKTRACE=1 5 | export RUST_LOG=sucredb=info,sucredb::vnode=debug,sucredb::fabric=info 6 | SLEEP=2 7 | cargo build 8 | rm -rf n1 n2 9 | ../target/debug/sucredb -d n1 -l 127.0.0.1:6379 -f 127.0.0.1:16379 init -r 2 > log1.txt 2>&1 & 10 | echo "WAITING $SLEEP" 11 | sleep $SLEEP 12 | ../target/debug/sucredb -d n2 -l 127.0.0.1:6378 -f 127.0.0.1:16378 -s 127.0.0.1:16379 > log2.txt 2>&1 & 13 | 14 | tail -f log1.txt log2.txt 15 | -------------------------------------------------------------------------------- /src/command.rs: -------------------------------------------------------------------------------- 1 | use bincode; 2 | use bytes::Bytes; 3 | use config; 4 | use cubes::{self, Cube}; 5 | use database::{Context, Database}; 6 | use metrics::{self, Meter}; 7 | use resp::RespValue; 8 | use std::convert::TryInto; 9 | use std::net; 10 | use types::*; 11 | use utils::{assume_str, replace_default}; 12 | use version_vector::*; 13 | 14 | #[derive(Debug)] 15 | pub enum CommandError { 16 | Timeout, 17 | ProtocolError, 18 | StorageError, 19 | UnknownCommand, 20 | TooManyVersions, 21 | TypeError, 22 | InvalidContext, 23 | InvalidArgCount, 24 | InvalidKey, 25 | InvalidValue, 26 | InvalidConsistencyValue, 27 | InvalidIntValue, 28 | InvalidExec, 29 | InvalidCommand, 30 | InvalidMultiCommand, 31 | MultiplePartitions, 32 | MultipleKeyMutations, 33 | Unavailable, 34 | } 35 | 36 | impl Into for CommandError { 37 | fn into(self) -> RespValue { 38 | RespValue::Error(format!("{:?}", self).into()) 39 | } 40 | } 41 | 42 | fn parse_int( 43 | try: bool, 44 | args: &[&Bytes], 45 | i: usize, 46 | ) -> Result { 47 | if try { 48 | assume_str(&args[i]) 49 | .parse() 50 | .map_err(|_| CommandError::InvalidIntValue) 51 | } else { 52 | Ok(Default::default()) 53 | } 54 | } 55 | 56 | fn check_arg_count(count: usize, min: usize, max: usize) -> Result<(), CommandError> { 57 | if count < min || count > max { 58 
| Err(CommandError::InvalidArgCount) 59 | } else { 60 | Ok(()) 61 | } 62 | } 63 | 64 | fn check_key_len(key_len: usize) -> Result<(), CommandError> { 65 | if key_len > config::MAX_KEY_LEN { 66 | Err(CommandError::InvalidKey) 67 | } else { 68 | Ok(()) 69 | } 70 | } 71 | 72 | fn check_value_len(value_len: usize) -> Result<(), CommandError> { 73 | if value_len > config::MAX_VALUE_LEN { 74 | Err(CommandError::InvalidValue) 75 | } else { 76 | Ok(()) 77 | } 78 | } 79 | 80 | impl Database { 81 | pub fn handler_cmd(&self, mut context: Context) { 82 | let cmd = context.commands.pop().unwrap(); 83 | if let Err(e) = self.handle_cmd(&mut context, cmd) { 84 | context.clear(); 85 | self.respond_error(&mut context, e); 86 | } 87 | } 88 | 89 | fn handle_cmd(&self, context: &mut Context, cmd: RespValue) -> Result<(), CommandError> { 90 | debug!("Processing ({:?}) {:?}", context.token, cmd); 91 | let mut args = Vec::new(); 92 | match cmd { 93 | RespValue::Array(ref a) => { 94 | args.reserve_exact(a.len()); 95 | for v in a.iter() { 96 | if let &RespValue::Data(ref b) = v { 97 | args.push(b); 98 | } else { 99 | args.clear(); 100 | break; 101 | } 102 | } 103 | } 104 | _ => (), 105 | } 106 | 107 | if args.is_empty() { 108 | return Err(CommandError::ProtocolError); 109 | } 110 | 111 | let arg0 = args[0]; 112 | let args = &args[1..]; 113 | 114 | if context.is_exec_cmd { 115 | match arg0.as_ref() { 116 | b"CSET" | b"cset" => self.cmd_cset(context, args), 117 | b"INCRBY" | b"incrby" => self.cmd_incrby(context, args), 118 | b"SET" | b"set" => self.cmd_set(context, args, false), 119 | b"HSET" | b"hset" => self.cmd_hset(context, args), 120 | b"HDEL" | b"hdel" => self.cmd_hdel(context, args), 121 | b"SADD" | b"sadd" => self.cmd_sadd(context, args), 122 | b"SREM" | b"srem" => self.cmd_srem(context, args), 123 | b"GETSET" | b"getset" => self.cmd_set(context, args, true), 124 | b"DEL" | b"del" => self.cmd_del(context, args), 125 | _ => { 126 | debug!("Unknown command for multi {:?}", cmd); 127 | 
Err(CommandError::InvalidMultiCommand) 128 | } 129 | } 130 | } else if context.is_multi_cmd { 131 | match arg0.as_ref() { 132 | b"EXEC" | b"exec" => self.cmd_exec(context, args), 133 | _ => { 134 | // Enqueue command for later exec 135 | context.commands.push(cmd); 136 | Ok(self.respond_resp(context, RespValue::Status("QUEUED".into()))) 137 | } 138 | } 139 | } else { 140 | match arg0.as_ref() { 141 | b"GET" | b"get" => self.cmd_get(context, args), 142 | b"MGET" | b"mget" => self.cmd_mget(context, args), 143 | b"SET" | b"set" => self.cmd_set(context, args, false), 144 | b"CGET" | b"cget" => self.cmd_cget(context, args), 145 | b"CSET" | b"cset" => self.cmd_cset(context, args), 146 | b"INCRBY" | b"incrby" => self.cmd_incrby(context, args), 147 | b"HGETALL" | b"hgetall" => self.cmd_hgetall(context, args), 148 | b"HSET" | b"hset" => self.cmd_hset(context, args), 149 | b"HDEL" | b"hdel" => self.cmd_hdel(context, args), 150 | b"SMEMBERS" | b"smembers" => self.cmd_smembers(context, args), 151 | b"SADD" | b"sadd" => self.cmd_sadd(context, args), 152 | b"SREM" | b"srem" => self.cmd_srem(context, args), 153 | b"GETSET" | b"getset" => self.cmd_set(context, args, true), 154 | b"DEL" | b"del" => self.cmd_del(context, args), 155 | b"CLUSTER" | b"cluster" => self.cmd_cluster(context, args), 156 | b"TYPE" | b"type" => self.cmd_type(context, args), 157 | b"MULTI" | b"multi" => self.cmd_multi(context, args), 158 | b"EXEC" | b"exec" => self.cmd_exec(context, args), 159 | b"ECHO" | b"echo" => Ok(self.respond_resp(context, cmd)), 160 | b"PING" | b"ping" => Ok(self.respond_resp(context, RespValue::Data("PONG".into()))), 161 | b"ASKING" | b"asking" | b"READWRITE" | b"readwrite" => { 162 | check_arg_count(args.len(), 0, 0).and_then(|_| Ok(self.respond_ok(context))) 163 | } 164 | b"CONFIG" | b"config" => self.cmd_config(context, args), 165 | _ => { 166 | debug!("Unknown command {:?}", cmd); 167 | Err(CommandError::UnknownCommand) 168 | } 169 | } 170 | } 171 | } 172 | 173 | fn parse_vv( 174 
| &self, 175 | try: bool, 176 | args: &[&Bytes], 177 | i: usize, 178 | ) -> Result { 179 | if try && !args[i].is_empty() { 180 | bincode::deserialize(args[i]).map_err(|_| CommandError::InvalidContext) 181 | } else { 182 | Ok(Default::default()) 183 | } 184 | } 185 | 186 | fn parse_consistency( 187 | &self, 188 | try: bool, 189 | args: &[&Bytes], 190 | i: usize, 191 | ) -> Result { 192 | Ok(if try { 193 | args[i] 194 | .as_ref() 195 | .try_into() 196 | .map_err(|_| CommandError::InvalidConsistencyValue)? 197 | } else { 198 | self.config.consistency_read 199 | }) 200 | } 201 | 202 | fn cmd_multi(&self, context: &mut Context, args: &[&Bytes]) -> Result<(), CommandError> { 203 | assert!(!context.is_multi_cmd); 204 | context.is_multi_cmd = true; 205 | check_arg_count(args.len(), 0, 0)?; 206 | Ok(self.respond_ok(context)) 207 | } 208 | 209 | fn cmd_exec(&self, context: &mut Context, args: &[&Bytes]) -> Result<(), CommandError> { 210 | if !context.is_multi_cmd { 211 | return Err(CommandError::InvalidExec); 212 | } 213 | check_arg_count(args.len(), 0, 1)?; 214 | let consistency = self.parse_consistency(args.len() > 0, args, 0)?; 215 | assert!(!context.is_exec_cmd); 216 | context.is_exec_cmd = true; 217 | let mut cmds = replace_default(&mut context.commands); 218 | for cmd in cmds.drain(..) 
{ 219 | debug!("token:{} exec: {:?}", context.token, cmd); 220 | self.handle_cmd(context, cmd)?; 221 | } 222 | context.commands = cmds; 223 | self.set_flush(context, consistency) 224 | } 225 | 226 | fn cmd_config(&self, context: &mut Context, _args: &[&Bytes]) -> Result<(), CommandError> { 227 | Ok(self.respond_resp(context, RespValue::Array(Default::default()))) 228 | } 229 | 230 | fn cmd_hgetall(&self, context: &mut Context, args: &[&Bytes]) -> Result<(), CommandError> { 231 | metrics::REQUEST_GET.mark(1); 232 | check_arg_count(args.len(), 1, 2)?; 233 | check_key_len(args[0].len())?; 234 | let consistency = self.parse_consistency(args.len() > 1, args, 1)?; 235 | self.get(context, args[0], consistency, Box::new(cubes::render_map)) 236 | } 237 | 238 | fn cmd_hset(&self, context: &mut Context, args: &[&Bytes]) -> Result<(), CommandError> { 239 | metrics::REQUEST_SET.mark(1); 240 | check_arg_count(args.len(), 3, 4)?; 241 | check_key_len(args[0].len())?; 242 | check_key_len(args[1].len())?; 243 | check_value_len(args[2].len())?; 244 | let hash_key = args[1].clone(); 245 | let hash_value = args[2].clone(); 246 | let consistency = self.parse_consistency(args.len() > 3, args, 3)?; 247 | self.set( 248 | context, 249 | args[0], 250 | Box::new(move |i, v, c: Cube| { 251 | let mut map = c.into_map().ok_or(CommandError::TypeError)?; 252 | let result = map.insert(i, v, hash_key, hash_value) as i64; 253 | Ok((Cube::Map(map), Some(RespValue::Int(result)))) 254 | }), 255 | consistency, 256 | false, 257 | None, 258 | ) 259 | } 260 | 261 | fn cmd_hdel(&self, context: &mut Context, args: &[&Bytes]) -> Result<(), CommandError> { 262 | metrics::REQUEST_DEL.mark(1); 263 | check_arg_count(args.len(), 2, 3)?; 264 | check_key_len(args[0].len())?; 265 | check_key_len(args[1].len())?; 266 | let hash_key = args[1].clone(); 267 | let consistency = self.parse_consistency(args.len() > 2, args, 2)?; 268 | self.set( 269 | context, 270 | args[0], 271 | Box::new(move |i, v, c: Cube| { 272 | let mut 
map = c.into_map().ok_or(CommandError::TypeError)?; 273 | let result = map.remove(i, v, &hash_key) as i64; 274 | Ok((Cube::Map(map), Some(RespValue::Int(result)))) 275 | }), 276 | consistency, 277 | false, 278 | None, 279 | ) 280 | } 281 | 282 | fn cmd_smembers(&self, context: &mut Context, args: &[&Bytes]) -> Result<(), CommandError> { 283 | metrics::REQUEST_GET.mark(1); 284 | check_arg_count(args.len(), 1, 2)?; 285 | check_key_len(args[0].len())?; 286 | let consistency = self.parse_consistency(args.len() > 1, args, 1)?; 287 | self.get(context, args[0], consistency, Box::new(cubes::render_set)) 288 | } 289 | 290 | fn cmd_sadd(&self, context: &mut Context, args: &[&Bytes]) -> Result<(), CommandError> { 291 | metrics::REQUEST_SET.mark(1); 292 | check_arg_count(args.len(), 2, 3)?; 293 | check_key_len(args[0].len())?; 294 | check_value_len(args[1].len())?; 295 | let set_value = args[1].clone(); 296 | let consistency = self.parse_consistency(args.len() > 2, args, 2)?; 297 | self.set( 298 | context, 299 | args[0], 300 | Box::new(move |i, v, c: Cube| { 301 | let mut set = c.into_set().ok_or(CommandError::TypeError)?; 302 | let result = set.insert(i, v, set_value) as i64; 303 | Ok((Cube::Set(set), Some(RespValue::Int(result)))) 304 | }), 305 | consistency, 306 | false, 307 | None, 308 | ) 309 | } 310 | 311 | fn cmd_srem(&self, context: &mut Context, args: &[&Bytes]) -> Result<(), CommandError> { 312 | metrics::REQUEST_DEL.mark(1); 313 | check_arg_count(args.len(), 2, 3)?; 314 | check_key_len(args[0].len())?; 315 | check_value_len(args[1].len())?; 316 | let set_value = args[1].clone(); 317 | let consistency = self.parse_consistency(args.len() > 2, args, 2)?; 318 | self.set( 319 | context, 320 | args[0], 321 | Box::new(move |i, v, c: Cube| { 322 | let mut set = c.into_set().ok_or(CommandError::TypeError)?; 323 | let result = set.remove(i, v, &set_value) as i64; 324 | Ok((Cube::Set(set), Some(RespValue::Int(result)))) 325 | }), 326 | consistency, 327 | false, 328 | None, 329 
| ) 330 | } 331 | 332 | fn cmd_get(&self, context: &mut Context, args: &[&Bytes]) -> Result<(), CommandError> { 333 | metrics::REQUEST_GET.mark(1); 334 | check_arg_count(args.len(), 1, 2)?; 335 | check_key_len(args[0].len())?; 336 | let consistency = self.parse_consistency(args.len() > 1, args, 1)?; 337 | self.get(context, args[0], consistency, Box::new(cubes::render_value)) 338 | } 339 | 340 | fn cmd_mget(&self, context: &mut Context, args: &[&Bytes]) -> Result<(), CommandError> { 341 | context.is_multi_cmd = true; 342 | metrics::REQUEST_GET.mark(1); 343 | check_arg_count(args.len(), 1, 100)?; 344 | let key_count: usize = parse_int(args.len() > 0, args, 0)?; 345 | if key_count >= args.len() { 346 | return Err(CommandError::InvalidCommand); 347 | } 348 | let keys = &args[1..1 + key_count]; 349 | let consistency = 350 | self.parse_consistency(args.len() > 1 + key_count, args, 1 + key_count)?; 351 | for key in keys { 352 | check_key_len(key.len())?; 353 | } 354 | self.mget(context, keys, consistency, Box::new(cubes::render_value)) 355 | } 356 | 357 | fn cmd_set( 358 | &self, 359 | context: &mut Context, 360 | args: &[&Bytes], 361 | reply_result: bool, 362 | ) -> Result<(), CommandError> { 363 | metrics::REQUEST_SET.mark(1); 364 | check_arg_count(args.len(), 2, 4)?; 365 | check_key_len(args[0].len())?; 366 | check_value_len(args[1].len())?; 367 | let value = args[1].clone(); 368 | let vv = self.parse_vv(args.len() > 2, args, 2)?; 369 | let consistency = self.parse_consistency(args.len() > 3, args, 3)?; 370 | self.set( 371 | context, 372 | args[0], 373 | Box::new(move |i, v, c: Cube| { 374 | let mut cube_value = c.into_value().ok_or(CommandError::TypeError)?; 375 | cube_value.set(i, v, Some(value), &vv); 376 | let resp = if reply_result { 377 | None 378 | } else { 379 | Some(RespValue::Status("OK".into())) 380 | }; 381 | Ok((Cube::Value(cube_value), resp)) 382 | }), 383 | consistency, 384 | reply_result, 385 | if reply_result { 386 | Some(Box::new(cubes::render_value)) 
387 | } else { 388 | None 389 | }, 390 | ) 391 | } 392 | 393 | fn cmd_del(&self, context: &mut Context, args: &[&Bytes]) -> Result<(), CommandError> { 394 | metrics::REQUEST_DEL.mark(1); 395 | check_arg_count(args.len(), 1, 3)?; 396 | check_key_len(args[0].len())?; 397 | let vv = self.parse_vv(args.len() > 1, args, 1)?; 398 | let consistency = self.parse_consistency(args.len() > 2, args, 2)?; 399 | self.set( 400 | context, 401 | args[0], 402 | Box::new(move |i, v, mut c: Cube| { 403 | let result = c.del(i, v, &vv) as i64; 404 | Ok((c, Some(RespValue::Int(result)))) 405 | }), 406 | consistency, 407 | false, 408 | None, 409 | ) 410 | } 411 | 412 | fn cmd_cset(&self, context: &mut Context, args: &[&Bytes]) -> Result<(), CommandError> { 413 | metrics::REQUEST_SET.mark(1); 414 | check_arg_count(args.len(), 2, 3)?; 415 | check_key_len(args[0].len())?; 416 | let value: i64 = parse_int(args.len() > 1, args, 1)?; 417 | let consistency = self.parse_consistency(args.len() > 2, args, 2)?; 418 | self.set( 419 | context, 420 | args[0], 421 | Box::new(move |i, v, c: Cube| { 422 | let mut counter = c.into_counter().ok_or(CommandError::TypeError)?; 423 | counter.clear(i, v); 424 | counter.inc(i, v, value); 425 | Ok((Cube::Counter(counter), Some(RespValue::Status("OK".into())))) 426 | }), 427 | consistency, 428 | false, 429 | None, 430 | ) 431 | } 432 | 433 | fn cmd_cget(&self, context: &mut Context, args: &[&Bytes]) -> Result<(), CommandError> { 434 | metrics::REQUEST_GET.mark(1); 435 | check_arg_count(args.len(), 1, 2)?; 436 | check_key_len(args[0].len())?; 437 | let consistency = self.parse_consistency(args.len() > 1, args, 1)?; 438 | self.get( 439 | context, 440 | args[0], 441 | consistency, 442 | Box::new(cubes::render_counter), 443 | ) 444 | } 445 | 446 | fn cmd_incrby(&self, context: &mut Context, args: &[&Bytes]) -> Result<(), CommandError> { 447 | metrics::REQUEST_SET.mark(1); 448 | check_arg_count(args.len(), 2, 3)?; 449 | check_key_len(args[0].len())?; 450 | let inc: i64 
= parse_int(args.len() > 1, args, 1)?; 451 | let consistency = self.parse_consistency(args.len() > 2, args, 2)?; 452 | self.set( 453 | context, 454 | args[0], 455 | Box::new(move |i, v, c: Cube| { 456 | let mut counter = c.into_counter().ok_or(CommandError::TypeError)?; 457 | counter.inc(i, v, inc); 458 | Ok((Cube::Counter(counter), Some(RespValue::Status("OK".into())))) 459 | }), 460 | consistency, 461 | false, 462 | None, 463 | ) 464 | } 465 | 466 | fn cmd_type(&self, context: &mut Context, args: &[&Bytes]) -> Result<(), CommandError> { 467 | check_arg_count(args.len(), 1, 2)?; 468 | let consistency = self.parse_consistency(args.len() > 1, args, 1)?; 469 | self.get(context, args[0], consistency, Box::new(cubes::render_type)) 470 | } 471 | 472 | fn cmd_cluster(&self, context: &mut Context, args: &[&Bytes]) -> Result<(), CommandError> { 473 | check_arg_count(args.len(), 1, 1)?; 474 | match args[0].as_ref() { 475 | b"CONNECTIONS" | b"connections" => { 476 | let conns = self.fabric.connections(); 477 | let resp_conns = conns.into_iter().map(|x| RespValue::Int(x as _)).collect(); 478 | Ok(self.respond_resp(context, RespValue::Array(resp_conns))) 479 | } 480 | b"REBALANCE" | b"rebalance" => { 481 | self.dht.rebalance().unwrap(); 482 | Ok(self.respond_ok(context)) 483 | } 484 | b"SLOTS" | b"slots" => { 485 | let mut slots = Vec::new(); 486 | for (&(start, end), members) in &self.dht.slots() { 487 | let mut slot = vec![RespValue::Int(start as _), RespValue::Int(end as _)]; 488 | slot.extend(members.iter().map(|&(node, (_, ext_addr))| { 489 | RespValue::Array(vec![ 490 | RespValue::Data(ext_addr.ip().to_string().as_bytes().into()), 491 | RespValue::Int(ext_addr.port() as _), 492 | RespValue::Data(node.to_string().as_bytes().into()), 493 | ]) 494 | })); 495 | slots.push(RespValue::Array(slot)); 496 | } 497 | Ok(self.respond_resp(context, RespValue::Array(slots))) 498 | } 499 | _ => Err(CommandError::UnknownCommand), 500 | } 501 | } 502 | 503 | pub fn respond(&self, 
/// Queues `resp` on the context and flushes the response to the client.
pub fn respond_resp(&self, context: &mut Context, resp: RespValue) {
    context.response.push(resp);
    self.respond(context);
}

/// Convenience: reply with a single RESP integer.
pub fn respond_int(&self, context: &mut Context, int: i64) {
    self.respond_resp(context, RespValue::Int(int));
}

/// Convenience: reply with the simple-string status "OK".
pub fn respond_ok(&self, context: &mut Context) {
    self.respond_resp(context, RespValue::Status("OK".into()));
}

/// Converts a CommandError into its RESP error representation and replies.
pub fn respond_error(&self, context: &mut Context, error: CommandError) {
    self.respond_resp(context, error.into());
}

/// Redis-cluster-style MOVED redirect: the key's vnode permanently lives at `addr`.
pub fn respond_moved(&self, context: &mut Context, vnode: VNodeNo, addr: net::SocketAddr) {
    self.respond_resp(
        context,
        RespValue::Error(format!("MOVED {} {}", vnode, addr).into()),
    );
}

/// Redis-cluster-style ASK redirect: retry this one request at `addr`.
pub fn respond_ask(&self, context: &mut Context, vnode: VNodeNo, addr: net::SocketAddr) {
    self.respond_resp(
        context,
        RespValue::Error(format!("ASK {} {}", vnode, addr).into()),
    );
}
// Defaults applied when neither the config file nor the CLI overrides a value.
pub const DEFAULT_LISTEN_ADDR: &str = "127.0.0.1:6379"; // Redis-compatible client port
pub const DEFAULT_FABRIC_ADDR: &str = "127.0.0.1:16379"; // inter-node fabric port
pub const DEFAULT_CLUSTER_NAME: &str = "default";
pub const DEFAULT_DATA_DIR: &str = "./data";
pub const DEFAULT_REPLICATION_FACTOR: &str = "3";
pub const DEFAULT_PARTITIONS: &str = "64";
// Hard limits enforced by check_key_len / check_value_len on every request.
pub const MAX_KEY_LEN: usize = 500;
pub const MAX_VALUE_LEN: usize = 10 * 1024 * 1024;

/// Runtime configuration, populated from defaults, the yaml config file
/// (see `read_config_file`) and command-line flags.
#[derive(Debug, Clone)]
pub struct Config {
    pub data_dir: PathBuf,
    pub cluster_name: String,
    // Address clients connect to (RESP protocol).
    pub listen_addr: SocketAddr,
    // Address other cluster nodes connect to.
    pub fabric_addr: SocketAddr,
    // When set, initialize a brand new cluster with these parameters.
    pub cmd_init: Option<InitCommand>,
    pub worker_timer: u32,
    pub worker_count: u16,
    // NOTE(review): "incomming" is misspelled but the name is load-bearing —
    // cfi! looks up yaml keys by field name, so renaming would break configs.
    pub sync_incomming_max: u16,
    pub sync_outgoing_max: u16,
    pub sync_auto: bool,
    // Timeouts below are in milliseconds (parsed via parse_duration).
    pub sync_timeout: u32,
    pub sync_msg_timeout: u32,
    pub sync_msg_inflight: u32,
    pub dht_sync_on_connect: bool,
    pub dht_sync_aae: bool,
    pub fabric_timeout: u32,
    pub request_timeout: u32,
    pub client_connection_max: u32,
    // Cap on concurrent versions kept per value before pruning.
    pub value_version_max: u16,
    pub seed_nodes: Vec<SocketAddr>,
    // TODO: these should be in the cluster config instead
    pub consistency_read: ConsistencyLevel,
    pub consistency_write: ConsistencyLevel,
}
57 | Config { 58 | data_dir: DEFAULT_DATA_DIR.into(), 59 | cluster_name: DEFAULT_CLUSTER_NAME.into(), 60 | listen_addr: DEFAULT_LISTEN_ADDR.parse().unwrap(), 61 | fabric_addr: DEFAULT_FABRIC_ADDR.parse().unwrap(), 62 | cmd_init: None, 63 | worker_timer: 500, 64 | worker_count: max(4, num_cpus::get() as u16 * 2), 65 | sync_incomming_max: 10, 66 | sync_outgoing_max: 10, 67 | sync_timeout: 10_000, 68 | sync_msg_timeout: 1000, 69 | sync_msg_inflight: 10, 70 | sync_auto: true, 71 | dht_sync_on_connect: true, 72 | dht_sync_aae: true, 73 | fabric_timeout: 1000, 74 | request_timeout: 1000, 75 | client_connection_max: 100, 76 | value_version_max: 100, 77 | seed_nodes: Vec::new(), 78 | consistency_read: ConsistencyLevel::One, 79 | consistency_write: ConsistencyLevel::One, 80 | } 81 | } 82 | } 83 | 84 | #[derive(Debug, Clone)] 85 | pub struct InitCommand { 86 | pub replication_factor: u8, 87 | pub partitions: u16, 88 | } 89 | 90 | fn split_number_suffix(s: &str) -> Result<(i64, &str), GenericError> { 91 | let digits_end = s 92 | .trim() 93 | .chars() 94 | .position(|c| !c.is_digit(10)) 95 | .unwrap_or(s.len()); 96 | let (digits, suffix) = s.split_at(digits_end); 97 | Ok((digits.parse::()?, suffix.trim_left())) 98 | } 99 | 100 | pub fn parse_duration(duration_text: &str) -> Result { 101 | let (number, suffix) = split_number_suffix(duration_text)?; 102 | let scale = match suffix.to_lowercase().as_ref() { 103 | "ms" => 1, 104 | "s" => 1000, 105 | "m" => 1000 * 60, 106 | "h" => 1000 * 60 * 60, 107 | _ => return Err(format!("Unknown duration suffix `{}`", suffix).into()), 108 | }; 109 | number.checked_mul(scale).ok_or("Overflow error".into()) 110 | } 111 | 112 | pub fn parse_size(size_text: &str) -> Result { 113 | let (number, suffix) = split_number_suffix(size_text)?; 114 | let scale = match suffix.to_lowercase().as_ref() { 115 | "b" => 1, 116 | "k" | "kb" => 1024, 117 | "m" | "mb" => 1024 * 1024, 118 | "g" | "gb" => 1024 * 1024 * 1024, 119 | _ => return Err(format!("Unknown size 
suffix `{}`", suffix).into()), 120 | }; 121 | number.checked_mul(scale).ok_or("Overflow error".into()) 122 | } 123 | 124 | macro_rules! cfi { 125 | ($yaml:ident, $target:ident, $string:ident, $method:ident) => { 126 | if let Some(v) = $yaml.get(stringify!($string)) { 127 | let v = v 128 | .$method() 129 | .expect(concat!("Can't access field with", stringify!($method))); 130 | $target.$string = v.into(); 131 | } 132 | }; 133 | ($yaml:ident, $target:ident, $string:ident, $method:ident,try_into) => { 134 | if let Some(v) = $yaml.get(stringify!($string)) { 135 | let v = v 136 | .$method() 137 | .expect(concat!("Can't access field with", stringify!($method))); 138 | $target.$string = v 139 | .try_into() 140 | .expect(concat!("Can't convert ", stringify!($string))); 141 | } 142 | }; 143 | ($yaml:ident, $target:ident, $string:ident, $method:ident, $convert:expr) => { 144 | if let Some(v) = $yaml.get(stringify!($string)) { 145 | let v = v.$method().expect(concat!( 146 | "Can't access key ", 147 | stringify!($string), 148 | " with", 149 | stringify!($method) 150 | )); 151 | $target.$string = $convert(v) 152 | .expect(concat!( 153 | "Can't convert ", 154 | stringify!($string), 155 | " with ", 156 | stringify!($convert) 157 | )).try_into() 158 | .expect(concat!("Can't convert ", stringify!($string))); 159 | } 160 | }; 161 | } 162 | 163 | pub fn read_config_file(path: &Path, config: &mut Config) { 164 | debug!("Reading config file"); 165 | let yaml = { 166 | let mut s = String::new(); 167 | File::open(path) 168 | .and_then(|mut f| f.read_to_string(&mut s)) 169 | .expect("Error reading config file"); 170 | yaml::from_str::(&s).expect("Error parsing config file") 171 | }; 172 | debug!("Done reading config file: {:?}", config); 173 | 174 | cfi!(yaml, config, data_dir, as_str); 175 | cfi!(yaml, config, cluster_name, as_str); 176 | cfi!(yaml, config, listen_addr, as_str, SocketAddr::from_str); 177 | cfi!(yaml, config, fabric_addr, as_str, SocketAddr::from_str); 178 | // pub 
cmd_init: Option, 179 | cfi!(yaml, config, worker_timer, as_str, parse_duration); 180 | cfi!(yaml, config, worker_count, as_u64, try_into); 181 | cfi!(yaml, config, sync_incomming_max, as_u64, try_into); 182 | cfi!(yaml, config, sync_outgoing_max, as_u64, try_into); 183 | cfi!(yaml, config, sync_auto, as_bool); 184 | cfi!(yaml, config, sync_timeout, as_str, parse_duration); 185 | cfi!(yaml, config, sync_msg_timeout, as_str, parse_duration); 186 | cfi!(yaml, config, sync_msg_inflight, as_u64, try_into); 187 | cfi!(yaml, config, fabric_timeout, as_str, parse_duration); 188 | cfi!(yaml, config, request_timeout, as_str, parse_duration); 189 | cfi!(yaml, config, client_connection_max, as_u64, try_into); 190 | cfi!(yaml, config, value_version_max, as_u64, try_into); 191 | cfi!( 192 | yaml, 193 | config, 194 | consistency_read, 195 | as_str, 196 | ConsistencyLevel::from_str 197 | ); 198 | cfi!( 199 | yaml, 200 | config, 201 | consistency_write, 202 | as_str, 203 | ConsistencyLevel::from_str 204 | ); 205 | 206 | if let Some(v) = yaml.get("seed_nodes") { 207 | config.seed_nodes = v 208 | .as_sequence() 209 | .expect("seed_nodes is not a sequence") 210 | .iter() 211 | .map(|v| { 212 | v.as_str() 213 | .expect("seed_nodes element is not a string") 214 | .parse() 215 | .expect("seed_nodes element can't be parsed") 216 | }).collect(); 217 | } 218 | 219 | if let Some(config_value) = yaml.get("logging") { 220 | setup_logging(config_value); 221 | } 222 | } 223 | 224 | pub fn setup_logging(config_value: &yaml::Value) { 225 | let raw_config: log4rs::file::RawConfig = 226 | yaml::from_value(config_value.clone()).expect("failed to parse logging config"); 227 | 228 | let (appenders, errors) = raw_config.appenders_lossy(&Default::default()); 229 | if !errors.is_empty() { 230 | panic!("failed to configure logging: {:?}", errors); 231 | } 232 | 233 | let (config, errors) = log4rs::config::Config::builder() 234 | .appenders(appenders) 235 | .loggers(raw_config.loggers()) 236 | 
/// Fallback logging used when the config file has no `logging:` section:
/// a stderr console appender, `sucredb` at Info, everything else Off.
pub fn setup_default_logging() {
    let config = log4rs::config::Config::builder()
        .appender(
            log4rs::config::Appender::builder().build(
                "console",
                Box::new(
                    log4rs::append::console::ConsoleAppender::builder()
                        // stderr keeps stdout clean for command output / piping.
                        .target(log4rs::append::console::Target::Stderr)
                        .build(),
                ),
            ),
        ).logger(
            log4rs::config::Logger::builder()
                .appender("console")
                .build("sucredb", log::LevelFilter::Info),
        ).build(log4rs::config::Root::builder().build(log::LevelFilter::Off))
        .expect("failed to setup default logging");

    log4rs::init_config(config).expect("failed to init logging");
}

/// One stored value, in one of four CRDT flavors plus the empty `Void`.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub enum Cube {
    // the order is used to merge different types in a deterministic way
    Counter(Counter),
    Value(Value),
    Map(Map),
    Set(Set),
    // A deleted/absent value that still carries its causal context.
    Void(VersionVector),
}
impl_into{ 25 | ($s:ident, $v:ident) => { 26 | pub fn $s(self) -> Option<$v>{ 27 | match self { 28 | Cube::$v(a) => Some(a), 29 | Cube::Void(a) => Some($v::with(a)), 30 | _ => None, 31 | } 32 | } 33 | } 34 | } 35 | 36 | impl Default for Cube { 37 | fn default() -> Self { 38 | Cube::Void(Default::default()) 39 | } 40 | } 41 | 42 | impl Cube { 43 | pub fn is_subsumed(&self, bvv: &BitmappedVersionVector) -> bool { 44 | use self::Cube::*; 45 | match *self { 46 | Counter(ref a) => a.values.is_empty() && a.vv.contained(bvv), 47 | Value(ref a) => a.values.is_empty() && a.vv.contained(bvv), 48 | Map(ref a) => a.values.is_empty() && a.vv.contained(bvv), 49 | Set(ref a) => a.values.is_empty() && a.vv.contained(bvv), 50 | Void(_) => unreachable!(), 51 | } 52 | } 53 | 54 | impl_into!(into_value, Value); 55 | impl_into!(into_counter, Counter); 56 | impl_into!(into_map, Map); 57 | impl_into!(into_set, Set); 58 | 59 | // minimum set of dots required to assemble this cube 60 | // see comment at the bottom 61 | pub fn for_each_dot(&self, mut cb: CB) { 62 | use self::Cube::*; 63 | match *self { 64 | Counter(ref a) => a.values.iter().for_each(|(&i, &(v, _))| cb(i, v)), 65 | Value(ref a) => a.values.iter().for_each(|(&(i, v), _)| cb(i, v)), 66 | Map(ref a) => a.dots.iter().for_each(|(i, v)| cb(i, v)), 67 | Set(ref a) => a.dots.iter().for_each(|(i, v)| cb(i, v)), 68 | Void(_) => unreachable!(), 69 | } 70 | } 71 | 72 | pub fn new(bvv: &BitmappedVersionVector) -> Cube { 73 | let mut vv = VersionVector::new(); 74 | for (&n, bv) in bvv.iter() { 75 | vv.add(n, bv.base()); 76 | } 77 | Cube::Void(vv) 78 | } 79 | 80 | pub fn del(&mut self, id: Id, version: Version, vv: &VersionVector) -> bool { 81 | use self::Cube::*; 82 | match *self { 83 | Counter(ref mut a) => a.clear(id, version), 84 | Value(ref mut a) => a.set(id, version, None, vv), 85 | Map(ref mut a) => a.clear(id, version), 86 | Set(ref mut a) => a.clear(id, version), 87 | Void(_) => return false, 88 | } 89 | true 90 | } 91 | 92 | pub 
fn merge(self, other: Self) -> Self { 93 | use self::Cube::*; 94 | match (self, other) { 95 | (Counter(a), Counter(b)) => Counter(a.merge(b)), 96 | (Value(a), Value(b)) => Value(a.merge(b)), 97 | (Map(a), Map(b)) => Map(a.merge(b)), 98 | (Set(a), Set(b)) => Set(a.merge(b)), 99 | (Void(vv), a) | (a, Void(vv)) => match a { 100 | Counter(a) => Counter(a.merge(self::Counter::with(vv))), 101 | Value(a) => Value(a.merge(self::Value::with(vv))), 102 | Map(a) => Map(a.merge(self::Map::with(vv))), 103 | Set(a) => Set(a.merge(self::Set::with(vv))), 104 | Void(mut o_vv) => { 105 | o_vv.merge(&vv); 106 | Void(o_vv) 107 | } 108 | }, 109 | (a, b) => { 110 | warn!("Merging Cubes with different types"); 111 | #[allow(unreachable_patterns)] 112 | match (a, b) { 113 | (Counter(a), _) | (_, Counter(a)) => Counter(a), 114 | (Value(a), _) | (_, Value(a)) => Value(a), 115 | (Map(a), _) | (_, Map(a)) => Map(a), 116 | (Set(a), _) | (_, Set(a)) => Set(a), 117 | (Void(_), _) | (_, Void(_)) => unreachable!(), 118 | } 119 | } 120 | } 121 | } 122 | } 123 | 124 | // RWCounter 125 | #[derive(Clone, Debug, Serialize, Deserialize)] 126 | pub struct Counter { 127 | values: LinearMap, 128 | vv: VersionVector, 129 | } 130 | 131 | impl Counter { 132 | fn with(vv: VersionVector) -> Self { 133 | Counter { 134 | values: Default::default(), 135 | vv, 136 | } 137 | } 138 | 139 | pub fn get(&self) -> i64 { 140 | self.values.values().map(|&(_, c)| c).sum() 141 | } 142 | 143 | pub fn inc(&mut self, node: Id, version: Version, by: i64) -> i64 { 144 | self.vv.add(node, version); 145 | let version_counter = self.values.entry(node).or_insert((0, 0)); 146 | version_counter.0 = version; 147 | version_counter.1 += by; 148 | version_counter.1 149 | } 150 | 151 | pub fn clear(&mut self, node: Id, version: Version) { 152 | self.values.clear(); 153 | self.vv.add(node, version); 154 | } 155 | 156 | fn merge(mut self, other: Self) -> Self { 157 | for (id, other) in other.values { 158 | match self.values.entry(id) { 159 | 
LMEntry::Occupied(mut oc) => if other.0 > oc.get().0 { 160 | *oc.get_mut() = other; 161 | }, 162 | LMEntry::Vacant(va) => { 163 | va.insert(other); 164 | } 165 | } 166 | } 167 | self 168 | } 169 | } 170 | 171 | // MultiRegister 172 | #[derive(Clone, Debug, Serialize, Deserialize)] 173 | pub struct Value { 174 | values: DotMap>, 175 | vv: VersionVector, 176 | } 177 | 178 | impl Value { 179 | fn with(vv: VersionVector) -> Self { 180 | Value { 181 | values: Default::default(), 182 | vv, 183 | } 184 | } 185 | 186 | pub fn len(&self) -> usize { 187 | self.values.len() 188 | } 189 | 190 | pub fn set(&mut self, node: Id, version: Version, value: Option, vv: &VersionVector) { 191 | self.values.discard(vv); 192 | self.values.insert(node, version, value); 193 | self.vv.add(node, version); 194 | } 195 | 196 | fn merge(mut self, mut other: Self) -> Self { 197 | self.values.merge(&mut other.values, &self.vv, &other.vv); 198 | self.vv.merge(&other.vv); 199 | self 200 | } 201 | } 202 | 203 | /// Actor Observed removal 204 | /// Add wins on conflict 205 | #[derive(Clone, Debug, Serialize, Deserialize)] 206 | pub struct Set { 207 | values: CausalMap, 208 | dots: VersionVector, 209 | vv: VersionVector, 210 | } 211 | 212 | impl Set { 213 | fn with(vv: VersionVector) -> Self { 214 | Set { 215 | values: Default::default(), 216 | dots: Default::default(), 217 | vv, 218 | } 219 | } 220 | 221 | pub fn insert(&mut self, node: Id, version: Version, item: Bytes) -> bool { 222 | let result = self 223 | .values 224 | .insert(item, DotSet::from_dot((node, version))) 225 | .is_none(); 226 | self.vv.add(node, version); 227 | self.dots.add(node, version); 228 | result 229 | } 230 | 231 | pub fn remove(&mut self, node: Id, version: Version, item: &[u8]) -> bool { 232 | let result = self.values.remove(item).is_some(); 233 | self.vv.add(node, version); 234 | self.dots.add(node, version); 235 | result 236 | } 237 | 238 | pub fn clear(&mut self, node: Id, version: Version) { 239 | self.values.clear(); 
240 | self.vv.add(node, version); 241 | self.dots.add(node, version); 242 | } 243 | 244 | fn merge(mut self, mut other: Self) -> Self { 245 | self.values.merge(&mut other.values, &self.vv, &other.vv); 246 | self.vv.merge(&other.vv); 247 | self.dots.merge(&other.dots); 248 | self 249 | } 250 | } 251 | 252 | // Actor Observed removal 253 | // LWW on value conflict (max as tiebreaker) 254 | #[derive(Clone, Debug, Serialize, Deserialize)] 255 | pub struct Map { 256 | values: CausalMap, 257 | dots: VersionVector, 258 | vv: VersionVector, 259 | } 260 | 261 | impl Map { 262 | fn with(vv: VersionVector) -> Self { 263 | Map { 264 | values: Default::default(), 265 | dots: Default::default(), 266 | vv, 267 | } 268 | } 269 | 270 | pub fn insert(&mut self, node: Id, version: Version, key: Bytes, value: Bytes) -> bool { 271 | let result = self 272 | .values 273 | .insert(key, MapValue::new((node, version), value)) 274 | .is_none(); 275 | self.vv.add(node, version); 276 | self.dots.add(node, version); 277 | result 278 | } 279 | 280 | pub fn remove(&mut self, node: Id, version: Version, key: &[u8]) -> bool { 281 | let result = self.values.remove(key).is_some(); 282 | self.vv.add(node, version); 283 | self.dots.add(node, version); 284 | result 285 | } 286 | 287 | pub fn clear(&mut self, node: Id, version: Version) { 288 | self.values.clear(); 289 | self.vv.add(node, version); 290 | self.dots.add(node, version); 291 | } 292 | 293 | fn merge(mut self, mut other: Self) -> Self { 294 | self.values.merge(&mut other.values, &self.vv, &other.vv); 295 | self.vv.merge(&other.vv); 296 | self.dots.merge(&other.dots); 297 | self 298 | } 299 | } 300 | 301 | #[derive(Clone, Debug, Default, Serialize, Deserialize)] 302 | struct MapValue { 303 | dots: DotSet, 304 | value: Bytes, 305 | timestamp: u64, // millis since epoch 306 | } 307 | 308 | impl MapValue { 309 | fn new(dot: (Id, Version), value: Bytes) -> Self { 310 | let timestamp = time::UNIX_EPOCH.elapsed().unwrap(); 311 | MapValue { 312 | 
impl CausalValue for MapValue {
    /// Merges two concurrent entries for the same key.
    /// Dots merge causally; the payload is LWW by timestamp, with
    /// max(value) as the tiebreaker when timestamps are equal.
    fn merge(&mut self, other: &mut Self, s_vv: &VV, o_vv: &VV) {
        self.dots.merge(&mut other.dots, s_vv, o_vv);
        // resolve possible value collision
        // if timestamps are equal value becomes max(a, b)
        if self.timestamp > other.timestamp {
            // nothing to do
        } else if other.timestamp > self.timestamp || other.value > self.value {
            self.timestamp = other.timestamp;
            // swap (not clone) — `other` is discarded after the merge.
            ::std::mem::swap(&mut self.value, &mut other.value);
        }
    }

    /// An entry with no surviving dots has been causally removed.
    fn is_empty(&self) -> bool {
        self.dots.is_empty()
    }
}

/// Renders a Value cube as [versions..., serialized version-vector].
/// The vv is always the final array element so clients can do causal writes.
pub fn render_value(cube: Cube) -> RespValue {
    match cube {
        Cube::Value(v) => {
            let serialized_vv = bincode::serialize(&v.vv).unwrap();
            let mut values: Vec<_> = v
                .values
                .into_iter()
                // Deleted versions (None payloads) are skipped.
                .filter_map(|(_, ov)| ov.map(RespValue::Data))
                .collect();
            values.push(RespValue::Data(serialized_vv.into()));
            RespValue::Array(values)
        }
        Cube::Void(vv) => {
            // Absent key: reply with just the causal context.
            let serialized_vv = bincode::serialize(&vv).unwrap();
            RespValue::Array(vec![RespValue::Data(serialized_vv.into())])
        }
        _ => CommandError::TypeError.into(),
    }
}

/// Renders a Counter cube as an integer; absent counters render as Nil.
pub fn render_counter(cube: Cube) -> RespValue {
    match cube {
        Cube::Counter(c) => RespValue::Int(c.get()),
        Cube::Void(_vv) => RespValue::Nil,
        _ => CommandError::TypeError.into(),
    }
}

/// Renders the cube's type name, mirroring redis TYPE where possible.
pub fn render_type(cube: Cube) -> RespValue {
    use self::Cube::*;
    let ty = match cube {
        Counter(_) => "counter", // non-standard
        Value(_) => "string",
        Map(_) => "hash",
        Set(_) => "set",
        Void(_) => "none",
    };
    RespValue::Data(ty.into())
}
cube { 379 | Cube::Map(m) => { 380 | let mut array = Vec::with_capacity(m.values.len() * 2); 381 | for (k, v) in m.values.into_iter() { 382 | array.push(RespValue::Data(k)); 383 | array.push(RespValue::Data(v.value)); 384 | } 385 | RespValue::Array(array) 386 | } 387 | Cube::Void(_) => RespValue::Array(vec![]), 388 | _ => CommandError::TypeError.into(), 389 | } 390 | } 391 | 392 | pub fn render_set(cube: Cube) -> RespValue { 393 | match cube { 394 | Cube::Set(s) => { 395 | let array = s 396 | .values 397 | .into_iter() 398 | .map(|(v, _)| RespValue::Data(v)) 399 | .collect(); 400 | RespValue::Array(array) 401 | } 402 | Cube::Void(_) => RespValue::Array(vec![]), 403 | _ => CommandError::TypeError.into(), 404 | } 405 | } 406 | 407 | /* 408 | Using the vv from cubes to track key dots (the latest version from each node) doesn't work, example: 409 | 410 | -> n3 is partitioned out 411 | -> n1 "SET a v" gets dot n1-1 412 | n1: a => [n1-1 v][n1 1] log: n1-1 => a 413 | n2: a => [n1-1 v][n1 1] log: n1-1 => a 414 | n3: -- 415 | 416 | -> n2 "SET a z [n1 1]" dot n2-1 417 | n1: a => [n2-1 z][n1 1, n2 1] log: n1-1 => a, n2-1 => a 418 | n2: a => [n2-1 z][n1 1, n2 1] log: n1-1 => a, n2-1 => a 419 | n3: -- 420 | 421 | -> n2 "SET b y" gets dot n2-2 422 | n1: b => [n2-2 y][n1 1, n2 2] a => [n2-1 z][n1 1, n2 1] log: n1-1 => a, n2-1 => a, n2-2 => b 423 | n2: b => [n2-2 y][n1 1, n2 2] a => [n2-1 z][n1 1, n2 1] log: n1-1 => a, n2-1 => a, n2-2 => b 424 | n3: -- 425 | 426 | -> n3 can receive messages 427 | -> n2 "SET c y" gets dot n2-3 (merges with void cube w/ [n1 1, n2 2]) 428 | n1: c => [n2-3 y][n1 1, n2 3] b => [n2-2 y][n1 1, n2 2] a => [n2-1 z][n1 1, n2 1] log: n1-1 => a, n2-1 => a, n2-2 => b, n2-3 => c 429 | n2: c => [n2-3 y][n1 1, n2 3] b => [n2-2 y][n1 1, n2 2] a => [n2-1 z][n1 1, n2 1] log: n1-1 => a, n2-1 => a, n2-2 => b, n2-3 => c 430 | n3: c => [n2-3 y][n1 1, n2 3] log: n2-3 => c, n1-1 => c 431 | 432 | n3 stores (n1-1 => c) in dot log but that's wrong 433 | 434 | Problem: 435 | 
VV from Voids pollutes the vv of new writes.

Solution:
For the MultiRegister the DotMap with optional values works as its own dot tracker,
even if it could have more than 1 version per actor the number should stay low.
Counters are similar to the MultiRegister case.
Sets/Maps carry an additional VV to track the dots.
Voids don't need any dot, their state is empty and their history is contained in the node clock (previous dots).

*/

/*
Optimized bootstrap doesn't work?

Given n1 with lots of data churn and the only kv left is k => [n1-100 v][n1 100]
node clock says [n1 1000] and logs have all the expected entries

When n2 bootstraps, n1 will send only non-deleted kvs as this is always <= number of dots (optimized bootstrap).
Thus only k => [n1-100 v][n1 100] is sent.
n2 will store k as above and log will contain only n1-100 => k
syncfin will update n2 node clock to [n1 1000]

If asked for any key other than k it'll return the same response as n1.
That is a void with a causal context [n1 1000]

Problem:
What if n3 asks to sync dots n1-101 n1-102 ... with n2? It'll get nothing but it'll get its n1 clock bumped to 1000
If key y was deleted as dot n1-101 the delete will never get propagated to n3.

Fix: on SyncFin sync only the remote (n2) part of the clock, like: n3LocalClock[n2].merge(n2RemoteClock[n2])
It was in the updated paper all the time, I just didn't see it.

*/

/*
AAE based bootstrap any better?

Given n1 with lots of data churn and the only kv left is k => [n1-100 v][n1 100]
node clock says [n1 1000] and logs have all the expected entries

n2 comes up (or it was partitioned the entire time) and wants to sync with n1.
It needs all dots of n1, it'll get k => [n1-100 v][n1 100] and voids for all other keys.
// u32(le) payload len + bincode payload
struct FramedBincodeCodec;

impl codec::Decoder for FramedBincodeCodec {
    type Item = FabricMsg;
    type Error = io::Error;

    /// Tries to decode one length-prefixed bincode frame from `src`.
    /// Returns Ok(None) when the buffer does not yet hold a full frame.
    fn decode(&mut self, src: &mut BytesMut) -> io::Result<Option<Self::Item>> {
        // Compute (bytes consumed, outcome) on a borrowed view first, then
        // advance `src` exactly once below — keeps the borrow checker happy
        // and guarantees consumed == 0 on every partial-frame path.
        let (consumed, result) = {
            let mut bytes: &[u8] = &*src;
            if let Ok(msg_len) = bytes.read_u32::<LittleEndian>() {
                // `bytes` has been advanced past the 4-byte header here.
                if bytes.len() >= msg_len as usize {
                    match bincode::deserialize_from(&mut bytes) {
                        // Header (4) + payload are consumed on success.
                        Ok(v) => (4 + msg_len as usize, Ok(Some(v))),
                        Err(e) => (0, Err(into_io_error(e))),
                    }
                } else {
                    // Payload not fully buffered yet.
                    (0, Ok(None))
                }
            } else {
                // Fewer than 4 bytes buffered: header incomplete.
                (0, Ok(None))
            }
        };
        src.split_to(consumed);
        result
    }
}
bincode::serialized_size(&item).unwrap(); 58 | let mut dst = BytesMut::with_capacity(item_size as usize + 4); 59 | dst.put_u32_le(item_size as u32); 60 | bincode::serialize_into(&mut (&mut dst).writer(), &item).unwrap(); 61 | dst.into() 62 | } 63 | } 64 | 65 | impl codec::Encoder for FramedBincodeCodec { 66 | type Item = Bytes; 67 | type Error = io::Error; 68 | 69 | fn encode(&mut self, item: Self::Item, dst: &mut BytesMut) -> io::Result<()> { 70 | dst.reserve(item.len()); 71 | dst.put(&item); 72 | Ok(()) 73 | } 74 | } 75 | 76 | pub type FabricMsgFn = Box; 77 | pub type FabricConFn = Box; 78 | 79 | type SenderChan = fmpsc::UnboundedSender; 80 | type InitType = io::Result<(Arc, foneshot::Sender<()>)>; 81 | 82 | const FABRIC_KEEPALIVE_MS: u64 = 1000; 83 | const FABRIC_RECONNECT_INTERVAL_MS: u64 = 1000; 84 | 85 | /// The messaging network that encompasses all nodes of the cluster 86 | /// using the fabric you can send messages (best-effort delivery) 87 | /// to any registered node. 88 | /// Currently each node keeps a connection to every other node. Due to the 89 | /// full-duplex nature of tcp this gives 2 pipes to each server, both are 90 | /// used to make better use of the socket buffers (is this a good idea though?). 91 | /// This also helps parallelism as an eventual big message won't affect 92 | /// the latency as much. 
93 | pub struct Fabric { 94 | context: Arc, 95 | loop_thread: Option<( 96 | foneshot::Sender<()>, 97 | thread::JoinHandle>, 98 | )>, 99 | } 100 | 101 | struct ReaderContext { 102 | context: Arc, 103 | peer: NodeId, 104 | } 105 | 106 | struct WriterContext { 107 | context: Arc, 108 | peer: NodeId, 109 | connection_id: usize, 110 | } 111 | 112 | struct SharedContext { 113 | node: NodeId, 114 | addr: SocketAddr, 115 | loop_remote: tokio::reactor::Remote, 116 | msg_handlers: RwLock>, 117 | con_handlers: RwLock>, 118 | // TODO: unify nodes_addr and connections maps 119 | nodes_addr: RwLock>, 120 | connections: RwLock>>, 121 | connection_gen: AtomicUsize, 122 | } 123 | 124 | impl SharedContext { 125 | fn register_node(&self, peer: NodeId, peer_addr: SocketAddr) -> Option { 126 | self.nodes_addr.write().unwrap().insert(peer, peer_addr) 127 | } 128 | 129 | fn remove_node(&self, peer: NodeId) -> Option { 130 | self.nodes_addr.write().unwrap().remove(&peer) 131 | } 132 | 133 | fn register_connection(&self, peer: NodeId, sender: SenderChan) -> usize { 134 | let connection_id = self.connection_gen.fetch_add(1, Ordering::Relaxed); 135 | debug!( 136 | "register_connection peer: {}, id: {:?}", 137 | peer, connection_id 138 | ); 139 | let is_new = { 140 | let mut locked = self.connections.write().unwrap(); 141 | let entry = locked.entry(peer).or_insert_with(Default::default); 142 | let is_new = entry.is_empty(); 143 | entry.push((connection_id, sender)); 144 | is_new 145 | }; 146 | if is_new { 147 | for handler in &*self.con_handlers.read().unwrap() { 148 | handler(peer); 149 | } 150 | } 151 | connection_id 152 | } 153 | 154 | fn remove_connection(&self, peer: NodeId, connection_id: usize) { 155 | debug!("Remove_connection peer: {}, id: {:?}", peer, connection_id); 156 | let mut locked = self.connections.write().unwrap(); 157 | if let HMEntry::Occupied(mut o) = locked.entry(peer) { 158 | let p = o 159 | .get() 160 | .iter() 161 | .position(|x| x.0 == connection_id) 162 | 
.expect("connection_id not found"); 163 | o.get_mut().swap_remove(p); 164 | // cleanup entry if empty 165 | if o.get().is_empty() { 166 | o.remove(); 167 | } 168 | } else { 169 | panic!("Peer not found in connections"); 170 | } 171 | } 172 | } 173 | 174 | impl ReaderContext { 175 | fn new(context: Arc, peer: NodeId) -> Self { 176 | ReaderContext { 177 | context: context, 178 | peer: peer, 179 | } 180 | } 181 | 182 | fn dispatch(&self, msg: FabricMsg) { 183 | let msg_type = msg.get_type(); 184 | if let Some(handler) = self 185 | .context 186 | .msg_handlers 187 | .read() 188 | .unwrap() 189 | .get(&(msg_type as u8)) 190 | { 191 | trace!("recv from {:?} {:?}", self.peer, msg); 192 | handler(self.peer, msg); 193 | } else { 194 | error!("No handler for msg type {:?}", msg_type); 195 | } 196 | } 197 | } 198 | 199 | impl WriterContext { 200 | fn new(context: Arc, peer: NodeId, sender: SenderChan) -> Self { 201 | let connection_id = context.register_connection(peer, sender); 202 | WriterContext { 203 | context: context, 204 | peer: peer, 205 | connection_id: connection_id, 206 | } 207 | } 208 | } 209 | 210 | impl Drop for WriterContext { 211 | fn drop(&mut self) { 212 | self.context 213 | .remove_connection(self.peer, self.connection_id); 214 | } 215 | } 216 | 217 | impl Fabric { 218 | fn listen( 219 | listener: tokio::net::TcpListener, 220 | context: Arc, 221 | handle: tokio::reactor::Handle, 222 | ) -> Box> { 223 | debug!("Starting fabric listener"); 224 | let fut = listener 225 | .incoming() 226 | .for_each(move |(socket, addr)| { 227 | debug!("Accepting connection from {:?}", addr); 228 | let context_cloned = context.clone(); 229 | handle.spawn( 230 | Self::handshake(socket, context_cloned) 231 | .and_then(move |(s, peer_id, context)| { 232 | Self::steady_connection(s, peer_id, context) 233 | }).then(|_| Ok(())), 234 | ); 235 | Ok(()) 236 | }).map_err(|_| ()); 237 | Box::new(fut) 238 | } 239 | 240 | fn connect( 241 | expected_node: Option, 242 | addr: SocketAddr, 243 
| context: Arc, 244 | handle: tokio::reactor::Handle, 245 | ) -> Box> { 246 | debug!("Connecting to node {:?}: {:?}", expected_node, addr); 247 | let context1 = context.clone(); 248 | let handle1 = handle.clone(); 249 | let handle2 = handle.clone(); 250 | 251 | let fut = tokio::net::TcpStream::connect(&addr, &handle) 252 | .select2( 253 | tokio::reactor::Timeout::new( 254 | Duration::from_millis(FABRIC_RECONNECT_INTERVAL_MS), 255 | &handle, 256 | ).expect("Can't create connect timeout"), 257 | ).then(|r| match r { 258 | Ok(Either::A((s, _))) => Ok(s), 259 | Ok(Either::B(_)) => Err(io::ErrorKind::TimedOut.into()), 260 | Err(either) => Err(either.split().0), 261 | }).and_then(move |s| Self::handshake(s, context)) 262 | .and_then(move |(s, peer_id, context)| Self::steady_connection(s, peer_id, context)) 263 | .then(move |_| { 264 | tokio::reactor::Timeout::new( 265 | Duration::from_millis(FABRIC_RECONNECT_INTERVAL_MS), 266 | &handle1, 267 | ).expect("Can't create reconnect timeout") 268 | }).and_then(move |_| { 269 | let node = expected_node.ok_or(io::ErrorKind::NotFound)?; 270 | let addr_opt = { 271 | let locked = context1.nodes_addr.read().unwrap(); 272 | locked.get(&node).cloned() 273 | }; 274 | if let Some(addr) = addr_opt { 275 | debug!("Reconnecting fabric connection to {:?}", addr); 276 | handle2.spawn(Self::connect( 277 | expected_node, 278 | addr, 279 | context1, 280 | handle2.clone(), 281 | )); 282 | } 283 | Ok(()) 284 | }); 285 | Box::new(fut.map_err(|_| ())) 286 | } 287 | 288 | fn handshake( 289 | socket: tokio::net::TcpStream, 290 | context: Arc, 291 | ) -> Box), Error = io::Error>> 292 | { 293 | debug!("Stablished connection with {:?}", socket.peer_addr()); 294 | let _ = socket.set_nodelay(true); 295 | let _ = socket.set_keepalive(Some(Duration::from_millis(FABRIC_KEEPALIVE_MS))); 296 | let mut buffer = [0u8; 8]; 297 | (&mut buffer[..]) 298 | .write_u64::(context.node) 299 | .unwrap(); 300 | let fut = tokio_io::write_all(socket, buffer) 301 | 
.and_then(|(s, b)| tokio_io::read_exact(s, b)) 302 | .and_then(move |(s, b)| { 303 | let peer_id = (&b[..]).read_u64::().unwrap(); 304 | debug!("Identified connection to node {}", peer_id); 305 | Ok((s, peer_id, context)) 306 | }); 307 | 308 | Box::new(fut) 309 | } 310 | 311 | fn steady_connection( 312 | socket: tokio::net::TcpStream, 313 | peer: NodeId, 314 | context: Arc, 315 | ) -> Box> { 316 | let (socket_rx, socket_tx) = socket.split(); 317 | let socket_tx = codec::FramedWrite::new(socket_tx, FramedBincodeCodec); 318 | let socket_rx = codec::FramedRead::new(socket_rx, FramedBincodeCodec); 319 | let (chan_tx, chan_rx) = fmpsc::unbounded(); 320 | 321 | let ctx_rx = ReaderContext::new(context.clone(), peer); 322 | let fut_rx = socket_rx.for_each(move |msg| { 323 | ctx_rx.dispatch(msg); 324 | Ok(()) 325 | }); 326 | 327 | let ctx_tx = WriterContext::new(context, peer, chan_tx); 328 | let fut_tx = socket_tx 329 | .send_all(chan_rx.map_err(|_| io::Error::from(io::ErrorKind::Other))) 330 | .then(move |r| { 331 | // hold onto ctx_tx until the stream is done 332 | drop(ctx_tx); 333 | r.map(|_| ()) 334 | }); 335 | 336 | Box::new(fut_rx.select(fut_tx).map(|_| ()).map_err(|(e, _)| e)) 337 | } 338 | 339 | fn init( 340 | node: NodeId, 341 | config: Config, 342 | handle: tokio::reactor::Handle, 343 | ) -> Result, GenericError> { 344 | let context = Arc::new(SharedContext { 345 | node: node, 346 | addr: config.fabric_addr, 347 | loop_remote: handle.remote().clone(), 348 | nodes_addr: Default::default(), 349 | msg_handlers: Default::default(), 350 | con_handlers: Default::default(), 351 | connections: Default::default(), 352 | connection_gen: Default::default(), 353 | }); 354 | 355 | let listener = tokio::net::TcpListener::bind(&context.addr, &handle)?; 356 | handle.spawn(Self::listen(listener, context.clone(), handle.clone())); 357 | 358 | Ok(context) 359 | } 360 | 361 | pub fn node(&self) -> NodeId { 362 | self.context.node 363 | } 364 | 365 | pub fn addr(&self) -> SocketAddr 
{ 366 | self.context.addr 367 | } 368 | 369 | pub fn new(node: NodeId, config: &Config) -> Result { 370 | let config = config.clone(); 371 | let (init_tx, init_rx) = mpsc::channel(); 372 | let thread = thread::Builder::new() 373 | .name(format!("Fabric:{}", node)) 374 | .spawn(move || { 375 | let mut core = tokio::reactor::Core::new().unwrap(); 376 | let (completer_tx, completer_rx) = foneshot::channel(); 377 | init_tx.send(Self::init(node, config, core.handle()).map(|c| (c, completer_tx)))?; 378 | core.run(completer_rx).map_err(From::from) 379 | }).unwrap(); 380 | let (context, completer) = init_rx.recv()??; 381 | Ok(Fabric { 382 | context: context, 383 | loop_thread: Some((completer, thread)), 384 | }) 385 | } 386 | 387 | pub fn register_msg_handler(&self, msg_type: FabricMsgType, handler: FabricMsgFn) { 388 | self.context 389 | .msg_handlers 390 | .write() 391 | .unwrap() 392 | .insert(msg_type as u8, handler); 393 | } 394 | 395 | pub fn register_con_handler(&self, handler: FabricConFn) { 396 | self.context.con_handlers.write().unwrap().push(handler); 397 | } 398 | 399 | pub fn register_seed(&self, addr: SocketAddr) { 400 | self.start_connect(None, addr) 401 | } 402 | 403 | pub fn register_node(&self, node: NodeId, addr: SocketAddr) { 404 | let prev = self.context.register_node(node, addr); 405 | if prev != Some(addr) { 406 | self.start_connect(Some(node), addr); 407 | } 408 | } 409 | 410 | pub fn remove_node(&self, node: NodeId) { 411 | self.context.remove_node(node); 412 | } 413 | 414 | pub fn connections(&self) -> Vec { 415 | let writers = self.context.connections.read().unwrap(); 416 | writers 417 | .iter() 418 | .filter(|&(_, c)| !c.is_empty()) 419 | .map(|(&n, _)| n) 420 | .collect() 421 | } 422 | 423 | pub fn set_nodes(&self, it: I) 424 | where 425 | I: Iterator, 426 | { 427 | let mut nodes = self.context.nodes_addr.write().unwrap(); 428 | let mut x_nodes = nodes.clone(); 429 | for (node, addr) in it { 430 | if node != self.context.node { 431 | 
x_nodes.remove(&node); 432 | if nodes.insert(node, addr) != Some(addr) { 433 | self.start_connect(Some(node), addr); 434 | } 435 | } 436 | } 437 | for (node, _) in x_nodes { 438 | nodes.remove(&node); 439 | } 440 | } 441 | 442 | fn start_connect(&self, expected_node: Option, addr: SocketAddr) { 443 | let context = self.context.clone(); 444 | let context_cloned = context.clone(); 445 | context 446 | .loop_remote 447 | .spawn(move |h| Self::connect(expected_node, addr, context_cloned, h.clone())); 448 | } 449 | 450 | // TODO: take msgs as references and buffer serialized bytes instead 451 | pub fn send_msg<'a, T: Into>>( 452 | &'a self, 453 | node: NodeId, 454 | msg: T, 455 | ) -> Result<(), FabricError> { 456 | let msg = msg.into(); 457 | debug!("send_msg node:{} {:?}", node, msg); 458 | if node == self.context.node { 459 | panic!("Can't send message to self"); 460 | } 461 | if cfg!(test) { 462 | let droppable = match msg.get_type() { 463 | FabricMsgType::Crud => false, 464 | _ => true, 465 | }; 466 | if droppable { 467 | let fabric_drop = ::std::env::var("FABRIC_DROP") 468 | .ok() 469 | .map(|s| s.parse::().expect("Can't parse FABRIC_DROP")) 470 | .unwrap_or(0.0); 471 | if fabric_drop > 0.0 && thread_rng().gen::() < fabric_drop { 472 | warn!("Fabric msg droped due to FABRIC_DROP: {:?}", msg); 473 | return Ok(()); 474 | } 475 | } 476 | } 477 | 478 | let serialized_msg = FramedBincodeCodec::serialize(msg); 479 | let connections = self.context.connections.read().unwrap(); 480 | if let Some(o) = connections.get(&node) { 481 | if let Some(&(connection_id, ref chan)) = thread_rng().choose::<(_, _)>(o) { 482 | if let Err(_) = chan.unbounded_send(serialized_msg) { 483 | warn!("Can't send to fabric {}-{} chan", node, connection_id,); 484 | } else { 485 | return Ok(()); 486 | } 487 | } else { 488 | warn!("DROPING MSG - No channel available for {:?}", node); 489 | } 490 | } else { 491 | warn!("DROPING MSG - No entry for node {:?}", node); 492 | } 493 | 494 | 
Err(FabricError::NoRoute) 495 | } 496 | } 497 | 498 | impl Drop for Fabric { 499 | fn drop(&mut self) { 500 | warn!("droping fabric"); 501 | if let Some((c, t)) = self.loop_thread.take() { 502 | let _ = c.send(()); 503 | let _ = t.join(); 504 | } 505 | } 506 | } 507 | 508 | #[cfg(test)] 509 | mod tests { 510 | use super::*; 511 | use config::Config; 512 | use env_logger; 513 | use std::sync::{atomic, Arc}; 514 | use std::thread; 515 | use std::time::Duration; 516 | 517 | #[test] 518 | fn test() { 519 | let _ = env_logger::try_init(); 520 | let config1 = Config { 521 | fabric_addr: "127.0.0.1:6481".parse().unwrap(), 522 | ..Default::default() 523 | }; 524 | let config2 = Config { 525 | fabric_addr: "127.0.0.1:6482".parse().unwrap(), 526 | ..Default::default() 527 | }; 528 | let fabric1 = Fabric::new(1, &config1).unwrap(); 529 | let fabric2 = Fabric::new(2, &config2).unwrap(); 530 | fabric1.register_node(2, "127.0.0.1:6482".parse().unwrap()); 531 | fabric2.register_node(1, "127.0.0.1:6481".parse().unwrap()); 532 | thread::sleep(Duration::from_millis(10)); 533 | 534 | let counter = Arc::new(atomic::AtomicUsize::new(0)); 535 | let counter_ = counter.clone(); 536 | fabric2.register_msg_handler( 537 | FabricMsgType::Crud, 538 | Box::new(move |_, _| { 539 | counter_.fetch_add(1, atomic::Ordering::Relaxed); 540 | }), 541 | ); 542 | for _ in 0..3 { 543 | fabric1 544 | .send_msg( 545 | 2, 546 | &MsgRemoteSetAck { 547 | cookie: Default::default(), 548 | vnode: Default::default(), 549 | result: Ok(Vec::new()), 550 | }, 551 | ).unwrap(); 552 | } 553 | thread::sleep(Duration::from_millis(10)); 554 | assert_eq!(counter.load(atomic::Ordering::Relaxed), 3); 555 | } 556 | } 557 | -------------------------------------------------------------------------------- /src/fabric_msg.rs: -------------------------------------------------------------------------------- 1 | use bytes::Bytes; 2 | use cubes::Cube; 3 | use database::*; 4 | use version_vector::*; 5 | 6 | #[derive(Debug, Copy, 
Clone)] 7 | pub enum FabricMsgType { 8 | Crud, 9 | Synch, 10 | DHT, 11 | Unknown, 12 | } 13 | 14 | #[derive(Debug, Serialize, Deserialize, PartialEq, Eq)] 15 | pub enum FabricError { 16 | NoRoute, 17 | CookieNotFound, 18 | BadVNodeStatus, 19 | NotReady, 20 | SyncInterrupted, 21 | StorageError, 22 | } 23 | 24 | #[derive(Debug, Serialize, Deserialize)] 25 | pub enum FabricMsg { 26 | RemoteGet(MsgRemoteGet), 27 | RemoteGetAck(MsgRemoteGetAck), 28 | RemoteSet(MsgRemoteSet), 29 | RemoteSetAck(MsgRemoteSetAck), 30 | SyncStart(MsgSyncStart), 31 | SyncSend(MsgSyncSend), 32 | SyncAck(MsgSyncAck), 33 | SyncFin(MsgSyncFin), 34 | DHTAE(VersionVector), 35 | DHTSync(Bytes), 36 | Unknown, 37 | } 38 | 39 | #[derive(Debug, Serialize)] 40 | pub enum FabricMsgRef<'a> { 41 | RemoteGet(&'a MsgRemoteGet), 42 | RemoteGetAck(&'a MsgRemoteGetAck), 43 | RemoteSet(&'a MsgRemoteSet), 44 | RemoteSetAck(&'a MsgRemoteSetAck), 45 | SyncStart(&'a MsgSyncStart), 46 | SyncSend(&'a MsgSyncSend), 47 | SyncAck(&'a MsgSyncAck), 48 | SyncFin(&'a MsgSyncFin), 49 | DHTAE(&'a VersionVector), 50 | DHTSync(&'a Bytes), 51 | Unknown, 52 | } 53 | 54 | impl FabricMsg { 55 | pub fn get_type(&self) -> FabricMsgType { 56 | match *self { 57 | FabricMsg::RemoteGet(..) 58 | | FabricMsg::RemoteGetAck(..) 59 | | FabricMsg::RemoteSet(..) 60 | | FabricMsg::RemoteSetAck(..) => FabricMsgType::Crud, 61 | FabricMsg::SyncStart(..) 62 | | FabricMsg::SyncSend(..) 63 | | FabricMsg::SyncAck(..) 64 | | FabricMsg::SyncFin(..) => FabricMsgType::Synch, 65 | FabricMsg::DHTSync(..) | FabricMsg::DHTAE(..) => FabricMsgType::DHT, 66 | _ => unreachable!(), 67 | } 68 | } 69 | } 70 | 71 | impl<'a> FabricMsgRef<'a> { 72 | pub fn get_type(&self) -> FabricMsgType { 73 | match *self { 74 | FabricMsgRef::RemoteGet(..) 75 | | FabricMsgRef::RemoteGetAck(..) 76 | | FabricMsgRef::RemoteSet(..) 77 | | FabricMsgRef::RemoteSetAck(..) => FabricMsgType::Crud, 78 | FabricMsgRef::SyncStart(..) 79 | | FabricMsgRef::SyncSend(..) 80 | | FabricMsgRef::SyncAck(..) 
81 | | FabricMsgRef::SyncFin(..) => FabricMsgType::Synch, 82 | FabricMsgRef::DHTSync(..) | FabricMsgRef::DHTAE(..) => FabricMsgType::DHT, 83 | _ => unreachable!(), 84 | } 85 | } 86 | } 87 | 88 | #[derive(Debug, Serialize, Deserialize)] 89 | pub struct MsgRemoteGet { 90 | pub vnode: VNodeNo, 91 | pub cookie: Cookie, 92 | pub keys: Vec, 93 | } 94 | 95 | #[derive(Debug, Serialize, Deserialize)] 96 | pub struct MsgRemoteGetAck { 97 | pub vnode: VNodeNo, 98 | pub cookie: Cookie, 99 | pub result: Result, FabricError>, 100 | } 101 | 102 | #[derive(Debug, Serialize, Deserialize)] 103 | pub struct MsgRemoteSet { 104 | pub vnode: VNodeNo, 105 | pub cookie: Cookie, 106 | pub writes: Vec<(Bytes, Cube, bool)>, 107 | pub reply: bool, 108 | } 109 | 110 | #[derive(Debug, Serialize, Deserialize)] 111 | pub struct MsgRemoteSetAck { 112 | pub vnode: VNodeNo, 113 | pub cookie: Cookie, 114 | pub result: Result>, FabricError>, 115 | } 116 | 117 | #[derive(Debug, Serialize, Deserialize)] 118 | pub struct MsgSyncStart { 119 | pub vnode: VNodeNo, 120 | pub cookie: Cookie, 121 | pub clocks_in_peer: BitmappedVersionVector, 122 | pub target: Option, 123 | } 124 | 125 | #[derive(Debug, Serialize, Deserialize)] 126 | pub struct MsgSyncFin { 127 | pub vnode: VNodeNo, 128 | pub cookie: Cookie, 129 | pub result: Result, 130 | } 131 | 132 | #[derive(Debug, Serialize, Deserialize)] 133 | pub struct MsgSyncSend { 134 | pub vnode: VNodeNo, 135 | pub cookie: Cookie, 136 | pub seq: u64, 137 | pub key: Bytes, 138 | pub value: Cube, 139 | } 140 | 141 | #[derive(Debug, Serialize, Deserialize)] 142 | pub struct MsgSyncAck { 143 | pub vnode: VNodeNo, 144 | pub cookie: Cookie, 145 | pub seq: u64, 146 | } 147 | 148 | impl<'a> Into> for &'a FabricMsg { 149 | fn into(self) -> FabricMsgRef<'a> { 150 | match self { 151 | &FabricMsg::RemoteGet(ref a) => FabricMsgRef::RemoteGet(a), 152 | &FabricMsg::RemoteGetAck(ref a) => FabricMsgRef::RemoteGetAck(a), 153 | &FabricMsg::RemoteSet(ref a) => 
FabricMsgRef::RemoteSet(a), 154 | &FabricMsg::RemoteSetAck(ref a) => FabricMsgRef::RemoteSetAck(a), 155 | &FabricMsg::SyncStart(ref a) => FabricMsgRef::SyncStart(a), 156 | &FabricMsg::SyncSend(ref a) => FabricMsgRef::SyncSend(a), 157 | &FabricMsg::SyncAck(ref a) => FabricMsgRef::SyncAck(a), 158 | &FabricMsg::SyncFin(ref a) => FabricMsgRef::SyncFin(a), 159 | &FabricMsg::DHTSync(ref a) => FabricMsgRef::DHTSync(a), 160 | &FabricMsg::DHTAE(ref a) => FabricMsgRef::DHTAE(a), 161 | _ => unreachable!(), 162 | } 163 | } 164 | } 165 | 166 | macro_rules! impl_into { 167 | ($w:ident, $msg:ident) => { 168 | impl Into for $msg { 169 | fn into(self) -> FabricMsg { 170 | FabricMsg::$w(self) 171 | } 172 | } 173 | impl<'a> Into> for &'a $msg { 174 | fn into(self) -> FabricMsgRef<'a> { 175 | FabricMsgRef::$w(self) 176 | } 177 | } 178 | }; 179 | } 180 | 181 | impl_into!(RemoteGet, MsgRemoteGet); 182 | impl_into!(RemoteGetAck, MsgRemoteGetAck); 183 | impl_into!(RemoteSet, MsgRemoteSet); 184 | impl_into!(RemoteSetAck, MsgRemoteSetAck); 185 | impl_into!(SyncAck, MsgSyncAck); 186 | impl_into!(SyncSend, MsgSyncSend); 187 | impl_into!(SyncFin, MsgSyncFin); 188 | impl_into!(SyncStart, MsgSyncStart); 189 | -------------------------------------------------------------------------------- /src/gossip.rs: -------------------------------------------------------------------------------- 1 | use std::{cmp, thread, io, time, fmt}; 2 | use std::time::{Duration, Instant}; 3 | use std::net::SocketAddr; 4 | use std::marker::PhantomData; 5 | use std::sync::{mpsc, Arc, Mutex}; 6 | use std::collections::HashMap; 7 | 8 | use rand::{thread_rng, Rng}; 9 | use serde::Serialize; 10 | use serde::de::DeserializeOwned; 11 | use bincode; 12 | use futures::{Future, Stream, Sink}; 13 | use futures::sync::mpsc as fmpsc; 14 | use futures::sync::oneshot as foneshot; 15 | use tokio_core as tokio; 16 | 17 | use inflightmap::InFlightMap; 18 | use utils::into_io_error; 19 | 20 | const PACKET_SIZE: usize = 1400; 21 | const 
PING_PERIOD_MS: u64 = 500; 22 | const PING_TIMEOUT_MS: u64 = 1000; 23 | const SUSPECT_TIMEOUT_MS: u64 = 5 * PING_TIMEOUT_MS; 24 | const PING_SYNC_CHANCE: f32 = 0.05f32; 25 | const PING_CANDIDATES: usize = 3; 26 | const PINGREQ_CANDIDATES: usize = 3; 27 | const TIMER_RESOLUTION_MS: u64 = 150; 28 | 29 | // quick implementation of SWIM 30 | // has various limitations 31 | // TODO: piggyback 32 | pub struct Gossiper { 33 | context: Arc>>, 34 | loop_thread: Option<(foneshot::Sender<()>, thread::JoinHandle>)>, 35 | } 36 | 37 | pub enum GossiperMsg { 38 | New(SocketAddr, T), 39 | Alive(SocketAddr, T), 40 | Dead(SocketAddr), 41 | // Left(SocketAddr), 42 | } 43 | 44 | pub type GossiperCallback = Box) + Send>; 45 | 46 | #[derive(Debug, Eq, PartialEq, Copy, Clone, Serialize, Deserialize)] 47 | enum NodeStatus { 48 | Alive, 49 | Suspect, 50 | Dead, 51 | } 52 | 53 | type Seq = u32; 54 | 55 | pub trait Metadata 56 | : Serialize + DeserializeOwned + Clone + PartialEq + Send + fmt::Debug + 'static 57 | { 58 | } 59 | 60 | impl Metadata 61 | for T { 62 | } 63 | 64 | #[derive(Debug)] 65 | struct Node { 66 | incarnation: Seq, 67 | status_change: Instant, 68 | status: NodeStatus, 69 | meta: T, 70 | } 71 | 72 | struct Inner { 73 | addr: SocketAddr, 74 | seq: Seq, 75 | incarnation: Seq, 76 | meta: T, 77 | nodes: HashMap>, 78 | next_alive_probe: Instant, 79 | next_dead_probe: Instant, 80 | pingreq_inflight: InFlightMap, 81 | ping_inflight: InFlightMap, 82 | suspect_inflight: InFlightMap, 83 | send_queue: fmpsc::UnboundedSender<(SocketAddr, Message)>, 84 | broadcast_queue: Vec<(u32, Message)>, 85 | callback: GossiperCallback, 86 | leaving: bool, 87 | bootstraping: bool, 88 | } 89 | 90 | type State = (SocketAddr, Seq, NodeStatus, T); 91 | 92 | #[derive(Debug, Clone, Serialize, Deserialize)] 93 | #[serde(bound = "T: DeserializeOwned")] 94 | enum Message { 95 | Ping { seq: Seq }, 96 | PingReq { seq: Seq, node: SocketAddr }, 97 | PingAck { seq: Seq }, 98 | Suspect { 99 | from: SocketAddr, 100 
| node: SocketAddr, 101 | incarnation: Seq, 102 | }, 103 | Dead { 104 | // named Confirm in original paper 105 | from: SocketAddr, 106 | node: SocketAddr, 107 | incarnation: Seq, 108 | }, 109 | Alive { 110 | incarnation: Seq, 111 | node: SocketAddr, 112 | meta: T, 113 | }, 114 | Sync { state: Vec> }, 115 | SyncAck { state: Vec> }, 116 | } 117 | 118 | struct UdpCodec(PhantomData); 119 | 120 | impl tokio::net::UdpCodec for UdpCodec { 121 | type In = (SocketAddr, Message); 122 | type Out = (SocketAddr, Message); 123 | 124 | fn decode(&mut self, addr: &SocketAddr, buf: &[u8]) -> io::Result { 125 | trace!("decoding {:?}", buf); 126 | match bincode::deserialize(buf) { 127 | Ok(msg) => Ok((*addr, msg)), 128 | Err(err) => { 129 | warn!("decode err: {:?}", err); 130 | Err(into_io_error(err)) 131 | } 132 | } 133 | } 134 | 135 | fn encode(&mut self, addr_msg: Self::Out, buf: &mut Vec) -> SocketAddr { 136 | let (addr, msg) = addr_msg; 137 | trace!("encoding {:?}", msg); 138 | match bincode::serialize_into(buf, &msg, bincode::Infinite) { 139 | Ok(_) => addr, 140 | Err(err) => { 141 | panic!("encode err: {:?}", err); 142 | } 143 | } 144 | } 145 | } 146 | 147 | impl Node { 148 | fn new(status: NodeStatus, incarnation: Seq, meta: T) -> Node { 149 | Node { 150 | incarnation: incarnation, 151 | status_change: Instant::now(), 152 | status: status, 153 | meta: meta, 154 | } 155 | } 156 | 157 | fn set_status(&mut self, status: NodeStatus, incarnation: Seq) -> bool { 158 | if self.status != status { 159 | self.status = status; 160 | self.status_change = Instant::now(); 161 | self.incarnation = incarnation; 162 | true 163 | } else if self.incarnation != incarnation { 164 | self.incarnation = incarnation; 165 | true 166 | } else { 167 | false 168 | } 169 | } 170 | } 171 | 172 | type InitType = io::Result<(Arc>>, foneshot::Sender<()>)>; 173 | 174 | impl Inner { 175 | fn init( 176 | handle: tokio::reactor::Handle, 177 | addr: SocketAddr, 178 | meta: T, 179 | callback: GossiperCallback, 180 
| ) -> io::Result>>> { 181 | let (chan_tx, chan_rx) = fmpsc::unbounded::<(SocketAddr, Message)>(); 182 | 183 | let context = Arc::new(Mutex::new(Inner { 184 | addr: addr, 185 | nodes: Default::default(), 186 | incarnation: 0, 187 | seq: 0, 188 | meta: meta, 189 | next_alive_probe: Instant::now(), 190 | next_dead_probe: Instant::now(), 191 | ping_inflight: InFlightMap::new(), 192 | pingreq_inflight: InFlightMap::new(), 193 | suspect_inflight: InFlightMap::new(), 194 | send_queue: chan_tx, 195 | broadcast_queue: Default::default(), 196 | callback: callback, 197 | leaving: false, 198 | bootstraping: false, 199 | })); 200 | 201 | let socket = tokio::net::UdpSocket::bind(&addr, &handle)?; 202 | let (s_tx, s_rx) = socket.framed(UdpCodec::(PhantomData)).split(); 203 | 204 | let fut_tx = s_tx.send_all(chan_rx.map_err(|_| io::Error::from(io::ErrorKind::Other))) 205 | .map(|_| ()); 206 | 207 | let context2 = context.clone(); 208 | let interval = 209 | tokio::reactor::Interval::new(Duration::from_millis(TIMER_RESOLUTION_MS), &handle) 210 | .expect("Can't create Interval"); 211 | let fut_timer = interval 212 | .for_each(move |_| { 213 | context2.lock().unwrap().on_timer(); 214 | Ok(()) 215 | }) 216 | .then(|r| { 217 | info!("timer fut {:?}", r); 218 | Ok(()) 219 | }); 220 | 221 | let context3 = context.clone(); 222 | let fut_rx = s_rx.for_each(move |(a, m)| { 223 | context3.lock().unwrap().on_message(a, m); 224 | Ok(()) 225 | }); 226 | 227 | let fut_socket = fut_tx.select(fut_rx).map(|_| ()).map_err(|(e, _)| e).then( 228 | |r| { 229 | info!("socket fut: {:?}", r); 230 | Ok(()) 231 | }, 232 | ); 233 | 234 | handle.spawn(fut_timer); 235 | handle.spawn(fut_socket); 236 | 237 | Ok(context) 238 | } 239 | 240 | fn on_timer(self: &mut Inner) { 241 | let now = Instant::now(); 242 | 243 | // gossip to alive nodes 244 | self.maybe_gossip_alive(now); 245 | // gossip to dead nodes possibly resolving partitions, etc 246 | self.maybe_gossip_dead(now); 247 | 248 | // expire pings and fire 
indirect pings 249 | while let Some((seq, node)) = self.ping_inflight.pop_expired(now) { 250 | debug!("{:?} pingreq to {:?}", self.addr, node); 251 | if self.send_ping_reqs(seq, node) == 0 { 252 | // nobody to pingreq!? 253 | self.pingreq_inflight.insert(seq, (self.addr, node), now); 254 | } 255 | } 256 | // expire pingreqs and mark as suspect if we are the originating node 257 | while let Some((_, (from, node))) = self.pingreq_inflight.pop_expired(now) { 258 | debug!("pingreq expired {:?} {:?} - {:?}", from, node, self.addr); 259 | let msg = match self.nodes.get(&node) { 260 | Some(n) if from == self.addr => { 261 | Message::Suspect { 262 | node: node, 263 | incarnation: n.incarnation, 264 | from: self.addr, 265 | } 266 | } 267 | _ => continue, 268 | }; 269 | let addr = self.addr; 270 | self.on_message(addr, msg); 271 | } 272 | 273 | // expire suspicious and mark dead if status didnt change 274 | while let Some((node, status_change)) = self.suspect_inflight.pop_expired(now) { 275 | let msg = match self.nodes.get(&node) { 276 | Some(n) if n.status_change == status_change => { 277 | Message::Dead { 278 | node: node, 279 | incarnation: n.incarnation, 280 | from: self.addr, 281 | } 282 | } 283 | _ => continue, 284 | }; 285 | let addr = self.addr; 286 | self.on_message(addr, msg); 287 | } 288 | 289 | // drain broadcast queue 290 | if !self.broadcast_queue.is_empty() { 291 | let candidates = self.get_candidates(true, !0); 292 | let mut messages = Vec::new(); 293 | let mut counter = 0; 294 | for &mut (ref mut rem, ref msg) in &mut self.broadcast_queue { 295 | let n = cmp::min(candidates.len(), *rem as usize); 296 | *rem -= n as u32; 297 | for _ in 0..n { 298 | messages.push((candidates[counter % candidates.len()], msg.clone())); 299 | counter += 1; 300 | } 301 | } 302 | self.broadcast_queue.retain(|&(r, _)| r > 0); 303 | for (addr, msg) in messages { 304 | self.send(addr, msg); 305 | } 306 | } 307 | } 308 | 309 | fn refute(&mut self, incarnation: Seq) { 310 | 
self.incarnation = cmp::max(self.incarnation, incarnation) + 1; 311 | let msg = Message::Alive { 312 | incarnation: self.incarnation, 313 | node: self.addr, 314 | meta: self.meta.clone(), 315 | }; 316 | self.broadcast(msg); 317 | } 318 | 319 | fn get_candidates(&self, alive: bool, limit: usize) -> Vec { 320 | let mut candidates: Vec<_> = self.nodes 321 | .iter() 322 | .filter_map(|(&k, v)| if (alive && v.status != NodeStatus::Dead) || 323 | (!alive && v.status == NodeStatus::Dead) 324 | { 325 | Some(k) 326 | } else { 327 | None 328 | }) 329 | .collect(); 330 | if candidates.len() > limit { 331 | thread_rng().shuffle(&mut candidates); 332 | candidates.truncate(limit); 333 | } 334 | trace!( 335 | "{:?} nodes are {:?}, returning {} candidates", 336 | self.addr, 337 | self.nodes, 338 | candidates.len() 339 | ); 340 | candidates 341 | } 342 | 343 | fn send_ping_reqs(&mut self, seq: Seq, node: SocketAddr) -> usize { 344 | let now = Instant::now(); 345 | let candidates = self.get_candidates(true, PINGREQ_CANDIDATES); 346 | debug!( 347 | "{} sending indirect pings to {} through {} other nodes", 348 | self.addr, 349 | node, 350 | candidates.len() 351 | ); 352 | for &k in &candidates { 353 | self.pingreq_inflight.insert( 354 | seq, 355 | (self.addr, k), 356 | now + time::Duration::from_millis(PING_TIMEOUT_MS), 357 | ); 358 | self.send( 359 | k, 360 | Message::PingReq { 361 | seq: seq, 362 | node: node, 363 | }, 364 | ); 365 | } 366 | candidates.len() 367 | } 368 | 369 | fn maybe_gossip_alive(&mut self, now: Instant) -> usize { 370 | if now < self.next_alive_probe { 371 | return 0; 372 | } 373 | self.next_alive_probe = now + time::Duration::from_millis(PING_PERIOD_MS); 374 | let candidates = self.get_candidates(true, PING_CANDIDATES); 375 | if !candidates.is_empty() { 376 | debug!( 377 | "{} gossiping to {} alive nodes", 378 | self.addr, 379 | candidates.len() 380 | ); 381 | for &k in &candidates { 382 | // TODO: in case a node is suspect, 383 | // it'd be best to probe with 
a Suspect msg 384 | let (seq, msg) = self.generate_ping_msg(); 385 | self.ping_inflight.insert( 386 | seq, 387 | k, 388 | now + 389 | time::Duration::from_millis( 390 | PING_TIMEOUT_MS, 391 | ), 392 | ); 393 | self.send(k, msg); 394 | // chance to fire a sync message as well 395 | if thread_rng().gen::() < PING_SYNC_CHANCE { 396 | let sync_state = self.generate_sync_state(); 397 | self.send(k, Message::Sync { state: sync_state }); 398 | } 399 | } 400 | } 401 | candidates.len() 402 | } 403 | 404 | fn maybe_gossip_dead(&mut self, now: Instant) -> usize { 405 | // TODO: maybe sync instead 406 | if now < self.next_dead_probe { 407 | return 0; 408 | } 409 | self.next_dead_probe = now + time::Duration::from_secs(PING_PERIOD_MS); 410 | 411 | let candidates = self.get_candidates(false, PING_CANDIDATES); 412 | if candidates.len() != 0 { 413 | debug!("{} gossiping to {} dead nodes", self.addr, candidates.len()); 414 | for &k in &candidates { 415 | // probe with a dead msg so it does have a chance to refute 416 | let msg = Message::Dead { 417 | node: k, 418 | incarnation: self.nodes[&k].incarnation, 419 | from: self.addr, 420 | }; 421 | self.send(k, msg); 422 | } 423 | } 424 | candidates.len() 425 | } 426 | 427 | fn on_message(&mut self, sender: SocketAddr, msg: Message) { 428 | trace!("{} on_message: {:?}", self.addr, msg); 429 | match msg { 430 | Message::Ping { seq } => { 431 | self.send(sender, Message::PingAck { seq: seq }); 432 | } 433 | Message::PingReq { seq, node } => { 434 | self.pingreq_inflight.insert( 435 | seq, 436 | (sender, node), 437 | Instant::now() + time::Duration::from_millis(PING_TIMEOUT_MS), 438 | ); 439 | self.send(node, Message::Ping { seq: seq }); 440 | } 441 | Message::PingAck { seq } => { 442 | if let Some(_) = self.ping_inflight.remove(&seq) { 443 | // good 444 | } else if let Some((from, _)) = self.pingreq_inflight.remove(&seq) { 445 | // send to original sender 446 | self.send(from, msg); 447 | } else { 448 | // do nothing if we dont have it in 
state 449 | }; 450 | } 451 | Message::Alive { 452 | incarnation, 453 | node, 454 | meta, 455 | } => { 456 | if node == self.addr { 457 | if incarnation < self.incarnation || 458 | (incarnation == self.incarnation && meta == self.meta) 459 | { 460 | return; 461 | } 462 | if self.leaving { 463 | // TODO! 464 | return; 465 | } 466 | // refute 467 | debug!("node {:?} REFUTE ALIVE", node); 468 | self.refute(incarnation); 469 | return; 470 | } 471 | 472 | { 473 | let mut existing = true; 474 | let n = self.nodes.entry(node).or_insert_with(|| { 475 | existing = false; 476 | Node::new(NodeStatus::Dead, 0, meta.clone()) 477 | }); 478 | if existing && incarnation <= n.incarnation { 479 | return; 480 | } 481 | debug!("{:?} node {:?} IS ALIVE", self.addr, node); 482 | if existing { 483 | (self.callback)(GossiperMsg::Alive(node, meta.clone())); 484 | } else { 485 | (self.callback)(GossiperMsg::New(node, meta.clone())); 486 | } 487 | n.set_status(NodeStatus::Alive, incarnation); 488 | } 489 | 490 | // help broadcast 491 | self.broadcast(Message::Alive { 492 | incarnation: incarnation, 493 | node: node, 494 | meta: meta, 495 | }); 496 | } 497 | Message::Suspect { 498 | incarnation, 499 | from, 500 | node, 501 | } => { 502 | if node == self.addr { 503 | // ignore old info 504 | if incarnation < self.incarnation { 505 | return; 506 | } 507 | // refute & broadcast 508 | debug!("node {:?} REFUTE SUSPECT", node); 509 | self.refute(incarnation); 510 | return; 511 | } 512 | 513 | if let Some(n) = self.nodes.get_mut(&node) { 514 | // ignore old info or irrelevant 515 | if incarnation < n.incarnation || n.status != NodeStatus::Alive { 516 | return; 517 | } 518 | debug!("{:?} node {:?} IS SUSPECT", self.addr, node); 519 | n.set_status(NodeStatus::Suspect, incarnation); 520 | self.suspect_inflight.insert( 521 | node, 522 | n.status_change, 523 | Instant::now() + 524 | time::Duration::from_millis(SUSPECT_TIMEOUT_MS), 525 | ); 526 | } else { 527 | // about an unknown node!? 
528 | return; 529 | } 530 | 531 | // help broadcast 532 | self.broadcast(Message::Suspect { 533 | incarnation: incarnation, 534 | from: from, 535 | node: node, 536 | }); 537 | } 538 | Message::Dead { 539 | incarnation, 540 | from, 541 | node, 542 | } => { 543 | if node == self.addr { 544 | // ignore old info 545 | if incarnation < self.incarnation { 546 | return; 547 | } 548 | if self.leaving { 549 | // TODO! 550 | return; 551 | } 552 | // refute & broadcast 553 | debug!("node {:?} REFUTE DEAD", node); 554 | self.refute(incarnation); 555 | return; 556 | } 557 | 558 | if let Some(n) = self.nodes.get_mut(&node) { 559 | // ignore old info or irrelevant 560 | if incarnation < n.incarnation || n.status == NodeStatus::Dead { 561 | return; 562 | } 563 | debug!("{:?} node {:?} IS DEAD", self.addr, node); 564 | (self.callback)(GossiperMsg::Dead(node)); 565 | n.set_status(NodeStatus::Dead, incarnation); 566 | } else { 567 | // about an unknown node!? 568 | return; 569 | } 570 | 571 | // help broadcast 572 | self.broadcast(Message::Dead { 573 | incarnation: incarnation, 574 | from: from, 575 | node: node, 576 | }); 577 | } 578 | Message::Sync { state } => { 579 | self.do_sync(state); 580 | let ack_state = self.generate_sync_state(); 581 | self.send(sender, Message::SyncAck { state: ack_state }); 582 | } 583 | Message::SyncAck { state } => { 584 | self.do_sync(state); 585 | } 586 | } 587 | } 588 | 589 | fn generate_sync_state(&mut self) -> Vec> { 590 | let mut state: Vec<_> = self.nodes 591 | .iter() 592 | .map(|(&k, n)| (k, n.incarnation, n.status, n.meta.clone())) 593 | .collect(); 594 | state.push(( 595 | self.addr, 596 | self.incarnation, 597 | NodeStatus::Alive, 598 | self.meta.clone(), 599 | )); 600 | // TODO: worry about size 601 | if state.len() > 20 { 602 | thread_rng().shuffle(&mut state); 603 | state.truncate(20); 604 | } 605 | state 606 | } 607 | 608 | fn do_sync(&mut self, state: Vec>) { 609 | let sender = self.addr; 610 | for (addr, incarnation, status, meta) in 
state { 611 | let msg = match status { 612 | NodeStatus::Alive => { 613 | Message::Alive { 614 | node: addr, 615 | incarnation: incarnation, 616 | meta: meta, 617 | } 618 | } 619 | // threat suspect and dead the same 620 | NodeStatus::Suspect | NodeStatus::Dead => { 621 | Message::Suspect { 622 | from: sender, 623 | node: addr, 624 | incarnation: incarnation, 625 | } 626 | } 627 | }; 628 | self.on_message(sender, msg); 629 | } 630 | } 631 | 632 | fn send(&mut self, to: SocketAddr, msg: Message) { 633 | let _ = self.send_queue.unbounded_send((to, msg)); 634 | } 635 | 636 | fn broadcast(&mut self, msg: Message) { 637 | // self.nodes dont include self, so + 2 638 | let n = ((self.nodes.len() + 2) as f32).log10().ceil() as u32 * 4; 639 | self.broadcast_queue.push((n, msg)); 640 | } 641 | 642 | fn generate_ping_msg(&mut self) -> (Seq, Message) { 643 | let seq = self.seq; 644 | self.seq += 1; 645 | (seq, Message::Ping { seq: seq }) 646 | } 647 | 648 | pub fn update_meta(&mut self, meta: T) { 649 | self.incarnation += 1; 650 | self.meta = meta; 651 | let msg = Message::Alive { 652 | node: self.addr, 653 | incarnation: self.incarnation, 654 | meta: self.meta.clone(), 655 | }; 656 | self.broadcast(msg); 657 | } 658 | 659 | pub fn join(&mut self, seeds: &[SocketAddr]) { 660 | let state = self.generate_sync_state(); 661 | for &seed in seeds { 662 | self.send(seed, Message::Sync { state: state.clone() }); 663 | } 664 | } 665 | } 666 | 667 | impl Gossiper { 668 | pub fn new( 669 | listen_addr: SocketAddr, 670 | meta: T, 671 | callback: GossiperCallback, 672 | ) -> io::Result> { 673 | let (init_tx, init_rx) = mpsc::channel(); 674 | let thread = thread::Builder::new() 675 | .name(format!("Gossiper:{}", listen_addr)) 676 | .spawn(move || { 677 | let mut core = tokio::reactor::Core::new().unwrap(); 678 | let (completer_tx, completer_rx) = foneshot::channel(); 679 | init_tx 680 | .send( 681 | Inner::init(core.handle(), listen_addr, meta, callback) 682 | .map(|c| (c, completer_tx)), 
683 | ) 684 | .map_err(into_io_error)?; 685 | core.run(completer_rx).map_err(into_io_error) 686 | })?; 687 | 688 | let (context, completer) = init_rx.recv().map_err(into_io_error)??; 689 | Ok(Gossiper { 690 | context: context, 691 | loop_thread: Some((completer, thread)), 692 | }) 693 | } 694 | 695 | pub fn join(&self, seeds: &[SocketAddr]) { 696 | self.context.lock().unwrap().join(seeds) 697 | } 698 | 699 | pub fn node_count(&self) -> usize { 700 | self.context.lock().unwrap().nodes.len() + 1 701 | } 702 | 703 | pub fn alive_count(&self) -> usize { 704 | self.context 705 | .lock() 706 | .unwrap() 707 | .nodes 708 | .values() 709 | .filter(|n| n.status != NodeStatus::Dead) 710 | .count() + 1 711 | } 712 | } 713 | 714 | impl Drop for Gossiper { 715 | fn drop(&mut self) { 716 | if let Some((completer, thread)) = self.loop_thread.take() { 717 | let _ = completer.send(()); 718 | let _ = thread.join(); 719 | } 720 | } 721 | } 722 | 723 | #[cfg(test)] 724 | mod tests { 725 | use super::*; 726 | use env_logger; 727 | use std::{time, thread}; 728 | 729 | fn test_converge(n: usize) -> Vec> { 730 | let _ = env_logger::try_init(); 731 | let g: Vec<_> = (0..n) 732 | .map(|i| { 733 | Gossiper::new(format!("0.0.0.0:{}", 9000 + i).parse().unwrap(), ()).unwrap() 734 | }) 735 | .collect(); 736 | let start = Instant::now(); 737 | for (i, g0) in (&g[1..]).iter().enumerate() { 738 | g0.join(&[format!("0.0.0.0:{}", 9000 + i).parse().unwrap()]); 739 | } 740 | for _ in 0..(n * 1000) { 741 | if g.iter().all(|g| g.alive_count() == n) { 742 | break; 743 | } 744 | thread::sleep(time::Duration::from_millis(1)); 745 | } 746 | warn!("{:?} has passed", Instant::now() - start); 747 | assert!( 748 | g.iter().all(|g| g.alive_count() == n), 749 | "{} {:?}", 750 | n, 751 | g.iter().map(|g| g.alive_count()).collect::>() 752 | ); 753 | g 754 | } 755 | 756 | macro_rules! 
test_converge_n { 757 | ($fn_name: ident, $n: expr) => ( 758 | #[test] 759 | fn $fn_name() { 760 | test_converge($n); 761 | } 762 | ); 763 | } 764 | 765 | test_converge_n!(test_converge_1, 1); 766 | test_converge_n!(test_converge_2, 2); 767 | test_converge_n!(test_converge_3, 3); 768 | test_converge_n!(test_converge_5, 5); 769 | test_converge_n!(test_converge_10, 10); 770 | test_converge_n!(test_converge_20, 20); 771 | test_converge_n!(test_converge_30, 30); 772 | test_converge_n!(test_converge_50, 50); 773 | 774 | fn test_dead(n: usize) { 775 | let _ = env_logger::try_init(); 776 | let mut g = test_converge(n); 777 | g.pop(); 778 | let start = Instant::now(); 779 | for _ in 0..(n * 2000) { 780 | if g.iter().all(|g| g.alive_count() == n - 1) { 781 | break; 782 | } 783 | thread::sleep(time::Duration::from_millis(1)); 784 | } 785 | warn!("{:?} has passed", Instant::now() - start); 786 | assert!( 787 | g.iter().all(|g| g.alive_count() == n - 1), 788 | "{} {:?}", 789 | n - 1, 790 | g.iter().map(|g| g.alive_count()).collect::>() 791 | ); 792 | } 793 | 794 | macro_rules! 
test_dead_n { 795 | ($fn_name: ident, $n: expr) => ( 796 | #[test] 797 | fn $fn_name() { 798 | test_dead($n); 799 | } 800 | ); 801 | } 802 | 803 | test_dead_n!(test_dead_1, 1); 804 | test_dead_n!(test_dead_2, 2); 805 | test_dead_n!(test_dead_3, 3); 806 | test_dead_n!(test_dead_5, 5); 807 | test_dead_n!(test_dead_10, 10); 808 | test_dead_n!(test_dead_20, 20); 809 | test_dead_n!(test_dead_30, 30); 810 | test_dead_n!(test_dead_50, 50); 811 | } 812 | -------------------------------------------------------------------------------- /src/hash.rs: -------------------------------------------------------------------------------- 1 | use crc16; 2 | 3 | pub const HASH_SLOTS: u16 = 16384; 4 | 5 | /// RedisCluster style partitioning 6 | pub fn hash_slot(mut key: &[u8]) -> u16 { 7 | if let Some(open) = key.iter().position(|&x| x == b'{') { 8 | // note that close will be relative to open due to the skip() 9 | if let Some(close) = key[open + 1..].iter().position(|&x| x == b'}') { 10 | if close > 0 { 11 | // found { and } with something in between 12 | key = &key[open + 1..open + 1 + close]; 13 | } 14 | } 15 | } 16 | crc16::State::::calculate(key) % HASH_SLOTS 17 | } 18 | 19 | #[cfg(test)] 20 | mod tests { 21 | use super::*; 22 | 23 | fn raw_hash(key: &[u8]) -> u16 { 24 | crc16::State::::calculate(key) % HASH_SLOTS 25 | } 26 | 27 | #[test] 28 | fn test_hash_slot() { 29 | assert_eq!(hash_slot(b"{}"), raw_hash(b"{}")); 30 | assert_eq!(hash_slot(b"_{abc}"), raw_hash(b"abc")); 31 | assert_eq!(hash_slot(b"{abc}_"), raw_hash(b"abc")); 32 | assert_eq!(hash_slot(b"_{abc}_"), raw_hash(b"abc")); 33 | assert_eq!(hash_slot(b"{abc}{def}"), raw_hash(b"abc")); 34 | assert_eq!(hash_slot(b"{}{abc}"), raw_hash(b"{}{abc}")); 35 | assert_eq!(hash_slot(b"{abc}{}"), raw_hash(b"abc")); 36 | assert_eq!(hash_slot(b"{{abc}}"), raw_hash(b"{abc")); 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /src/inflightmap.rs: 
-------------------------------------------------------------------------------- 1 | use std::cmp::Ordering; 2 | use std::collections::hash_map::{Entry, RandomState}; 3 | use std::collections::{BinaryHeap, HashMap}; 4 | use std::fmt; 5 | use std::hash::{BuildHasher, Hash}; 6 | use std::ops::Deref; 7 | 8 | // TODO: need a more efficient implementation and possibly more flexibility 9 | 10 | #[derive(Debug)] 11 | pub struct InFlightMap { 12 | map: HashMap, 13 | heap: BinaryHeap>, 14 | } 15 | 16 | impl 17 | InFlightMap 18 | { 19 | pub fn new() -> Self { 20 | InFlightMap { 21 | map: Default::default(), 22 | heap: Default::default(), 23 | } 24 | } 25 | 26 | pub fn clear(&mut self) { 27 | self.map.clear(); 28 | self.heap.clear(); 29 | } 30 | 31 | pub fn remove(&mut self, key: &K) -> Option { 32 | self.map.remove(key) 33 | } 34 | 35 | pub fn entry_with_timeout(&mut self, key: K, expire: T) -> Entry { 36 | self.heap.push(Pair(expire, key)); 37 | self.map.entry(key) 38 | } 39 | 40 | pub fn entry(&mut self, key: K) -> Entry { 41 | self.map.entry(key) 42 | } 43 | 44 | pub fn insert(&mut self, key: K, value: V, expire: T) -> &mut V { 45 | self.heap.push(Pair(expire, key)); 46 | 47 | let mut inserted = false; 48 | let result = self.map.entry(key).or_insert_with(|| { 49 | inserted = true; 50 | value 51 | }); 52 | 53 | if !inserted { 54 | panic!("{:?} is already present in the map", key); 55 | } 56 | 57 | result 58 | } 59 | 60 | pub fn pop_expired(&mut self, now: T) -> Option<(K, V)> { 61 | loop { 62 | let key = match self.heap.peek() { 63 | Some(&Pair(e, k)) if now >= e => k, 64 | _ => return None, 65 | }; 66 | self.heap.pop(); 67 | if let Some(v) = self.map.remove(&key) { 68 | return Some((key, v)); 69 | } 70 | } 71 | } 72 | 73 | pub fn touch_expired(&mut self, now: T, expire: T) -> Option<(K, &V)> { 74 | loop { 75 | let key = match self.heap.peek() { 76 | Some(&Pair(e, k)) if now >= e => k, 77 | _ => return None, 78 | }; 79 | if let Some(v) = self.map.get(&key) { 80 | 
// Like a 2-tuple but comparison only considers the first item, and is
// REVERSED, so that a std BinaryHeap (a max-heap) of Pairs behaves as a
// min-heap on T — the smallest deadline surfaces first.
// (Generic parameters restored; they were stripped in transcription.)
#[derive(Debug)]
struct Pair<T: Ord, V>(T, V);

impl<T: Ord, V> PartialEq for Pair<T, V> {
    fn eq(&self, other: &Self) -> bool {
        other.0.eq(&self.0)
    }
}

impl<T: Ord, V> Eq for Pair<T, V> {}

impl<T: Ord, V> PartialOrd for Pair<T, V> {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        other.0.partial_cmp(&self.0)
    }
}

impl<T: Ord, V> Ord for Pair<T, V> {
    fn cmp(&self, other: &Self) -> Ordering {
        // Reversed on purpose: see type-level comment.
        other.0.cmp(&self.0)
    }
}
tokio_codec; 38 | extern crate tokio_core; 39 | extern crate tokio_io; 40 | 41 | #[cfg(test)] 42 | extern crate env_logger; 43 | 44 | #[macro_use] 45 | mod utils; 46 | mod types; 47 | mod version_vector; 48 | // mod gossip; 49 | mod cubes; 50 | mod dht; 51 | mod fabric; 52 | mod fabric_msg; 53 | mod hash; 54 | mod inflightmap; 55 | mod storage; 56 | #[macro_use] 57 | mod database; 58 | mod command; 59 | mod config; 60 | mod metrics; 61 | mod resp; 62 | mod server; 63 | mod vnode; 64 | mod vnode_sync; 65 | mod workers; 66 | 67 | fn configure() -> config::Config { 68 | use clap::{App, Arg, SubCommand}; 69 | use config::*; 70 | use std::path::Path; 71 | 72 | let matches = App::new("SucreDB") 73 | .version("0.1") 74 | .about("A database made of sugar cubes") 75 | .arg( 76 | Arg::with_name("config_file") 77 | .short("c") 78 | .long("config") 79 | .takes_value(true) 80 | .help(".yaml config file") 81 | .long_help( 82 | "Path to the .yaml config file. Note that configuration \ 83 | set through the command line will take precedence \ 84 | over the config file.", 85 | ).display_order(0), 86 | ).arg( 87 | Arg::with_name("data_dir") 88 | .short("d") 89 | .long("data") 90 | .takes_value(true) 91 | .help("Data directory"), 92 | ).arg( 93 | Arg::with_name("cluster_name") 94 | .short("n") 95 | .long("cluster") 96 | .help("The cluster name") 97 | .takes_value(true), 98 | ).arg( 99 | Arg::with_name("listen_addr") 100 | .short("l") 101 | .long("listen") 102 | .help("Listen addr") 103 | .takes_value(true), 104 | ).arg( 105 | Arg::with_name("fabric_addr") 106 | .short("f") 107 | .long("fabric") 108 | .help("Fabric listen addr") 109 | .takes_value(true), 110 | ).arg( 111 | Arg::with_name("seed_nodes") 112 | .short("s") 113 | .long("seeds") 114 | .multiple(true) 115 | .takes_value(true) 116 | .require_delimiter(true), 117 | ).subcommand( 118 | SubCommand::with_name("init") 119 | .about("Init and configure the cluster") 120 | .arg( 121 | Arg::with_name("replication_factor") 122 | 
.short("r") 123 | .help("Number of replicas") 124 | .default_value(DEFAULT_REPLICATION_FACTOR), 125 | ).arg( 126 | Arg::with_name("partitions") 127 | .short("p") 128 | .help("Number of partitions") 129 | .long_help( 130 | "Number of partitions, the recommended value is \ 131 | `expected node count * 10` rounded up to the next power of 2.", 132 | ).default_value(DEFAULT_PARTITIONS), 133 | ).display_order(0), 134 | ).get_matches(); 135 | 136 | let mut config = Default::default(); 137 | 138 | if let Some(v) = matches.value_of("config_file") { 139 | read_config_file(Path::new(v), &mut config); 140 | } else { 141 | setup_default_logging(); 142 | } 143 | 144 | if let Some(v) = matches.value_of("data_dir") { 145 | config.data_dir = v.into(); 146 | } 147 | 148 | if let Some(v) = matches.value_of("cluster_name") { 149 | config.cluster_name = v.into(); 150 | } 151 | 152 | if let Some(v) = matches.value_of("listen_addr") { 153 | config.listen_addr = v.parse().expect("Can't parse listen_addr"); 154 | } 155 | 156 | if let Some(v) = matches.values_of("seed_nodes") { 157 | config.seed_nodes = v 158 | .map(|v| v.parse().expect("Can't parse seed_nodes")) 159 | .collect(); 160 | } 161 | 162 | if let Some(v) = matches.value_of("fabric_addr") { 163 | config.fabric_addr = v.parse().expect("Can't parse fabric_addr"); 164 | } 165 | 166 | if let Some(sub) = matches.subcommand_matches("init") { 167 | config.cmd_init = Some(InitCommand { 168 | partitions: sub 169 | .value_of("partitions") 170 | .unwrap() 171 | .parse() 172 | .expect("Can't parse partitions"), 173 | replication_factor: sub 174 | .value_of("replication_factor") 175 | .unwrap() 176 | .parse() 177 | .expect("Can't parse replication_factor"), 178 | }); 179 | } 180 | 181 | config 182 | } 183 | 184 | #[cfg(not(test))] 185 | fn main() { 186 | let server = server::Server::new(configure()); 187 | server.run(); 188 | } 189 | -------------------------------------------------------------------------------- /src/metrics.rs: 
/// Error type for RESP parsing.
#[derive(Eq, PartialEq, Debug)]
pub enum RespError {
    /// More bytes are required to parse a complete value.
    Incomplete,
    /// The input is malformed; the message describes the first problem found.
    Invalid(&'static str),
}

impl From<&'static str> for RespError {
    fn from(from: &'static str) -> Self {
        RespError::Invalid(from)
    }
}

// Type parameter restored (stripped in transcription): every parser method
// in this file returns `RespResult<Something>` with RespError as the error.
pub type RespResult<T> = Result<T, RespError>;
40 | RespValue::Status(ref v) | RespValue::Error(ref v) => { 41 | "+".len() + v.len() + "\r\n".len() 42 | } 43 | } 44 | } 45 | 46 | pub fn serialize_into(self, f: &mut W) -> io::Result<()> { 47 | match self { 48 | RespValue::Nil => write!(f, "$-1\r\n"), 49 | RespValue::Int(v) => write!(f, ":{}\r\n", v), 50 | RespValue::Data(v) => { 51 | write!(f, "${}\r\n", v.len())?; 52 | f.write_all(v.as_ref())?; 53 | write!(f, "\r\n") 54 | } 55 | RespValue::Array(a) => { 56 | write!(f, "*{}\r\n", a.len())?; 57 | for v in a { 58 | v.serialize_into(f)?; 59 | } 60 | Ok(()) 61 | } 62 | RespValue::Status(v) => { 63 | write!(f, "+")?; 64 | f.write_all(v.as_ref())?; 65 | write!(f, "\r\n") 66 | } 67 | RespValue::Error(v) => { 68 | write!(f, "-")?; 69 | f.write_all(v.as_ref())?; 70 | write!(f, "\r\n") 71 | } 72 | } 73 | } 74 | } 75 | 76 | impl From for RespValue { 77 | fn from(from: T) -> Self { 78 | RespValue::Error(format!("{}", from).into()) 79 | } 80 | } 81 | 82 | impl fmt::Debug for RespValue { 83 | fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> { 84 | match *self { 85 | RespValue::Nil => write!(f, "Nil"), 86 | RespValue::Int(v) => write!(f, "Int({:?})", v), 87 | RespValue::Data(ref v) => write!(f, "Data({:?})", v), 88 | RespValue::Array(ref b) => { 89 | write!(f, "Array(")?; 90 | f.debug_list().entries(b).finish()?; 91 | write!(f, ")") 92 | } 93 | RespValue::Status(ref v) => write!(f, "Status({:?})", v), 94 | RespValue::Error(ref v) => write!(f, "Error({:?})", v), 95 | } 96 | } 97 | } 98 | 99 | /// The internal redis response parser. 100 | pub struct Parser { 101 | consumed: usize, 102 | body: Bytes, 103 | } 104 | 105 | impl Parser { 106 | pub fn new>(body: T) -> RespResult { 107 | let valid_to = Self::speculate_buffer(body.as_ref())?; 108 | Ok(Parser { 109 | consumed: 0, 110 | body: body.as_ref()[..valid_to].into(), 111 | }) 112 | } 113 | 114 | // Quickly speculate a buffer, checking whatever it has a complete resp objects or not. 
115 | // If succesfull returns the resp objects length in bytes. 116 | fn speculate_buffer(buf: &[u8]) -> RespResult { 117 | if buf.len() < 3 { 118 | return Err(RespError::Incomplete); 119 | } 120 | if &buf[buf.len() - 2..] == b"\r\n" { 121 | return Ok(buf.len()); 122 | } 123 | let mut valid = 0; 124 | let mut i = 0; 125 | let mut values_pending = 0; 126 | while i < buf.len() { 127 | match buf[i] { 128 | b'$' | b'*' => { 129 | let is_multi = buf[i] == b'*'; 130 | let mut len = 0i64; 131 | i += 1; 132 | while i < buf.len() { 133 | match buf[i] { 134 | b'0'...b'9' => len = len * 10 + (buf[i] - b'0') as i64, 135 | b'-' => { 136 | // only valid negative len is -1 137 | len = -1; 138 | i += 2; 139 | break; 140 | } 141 | b'\r' => break, 142 | _ => return Err(RespError::Invalid("Invalid digit")), 143 | } 144 | i += 1; 145 | } 146 | if len >= 0 { 147 | if is_multi { 148 | values_pending = len + 1; 149 | } else { 150 | i += 2 + len as usize; 151 | } 152 | } 153 | } 154 | b':' | b'+' | b'-' => { 155 | i += 1; 156 | while i < buf.len() && buf[i] != b'\r' { 157 | i += 1; 158 | } 159 | } 160 | b'\r' => { 161 | i += 2; 162 | continue; 163 | } 164 | _ => return Err(RespError::Invalid("Invalid prefix")), 165 | } 166 | // skip delimiter 167 | i += 2; 168 | if values_pending > 0 { 169 | values_pending -= 1; 170 | } 171 | if values_pending == 0 && i <= buf.len() { 172 | valid = i; 173 | } 174 | } 175 | if valid != 0 { 176 | Ok(valid) 177 | } else { 178 | Err(RespError::Incomplete) 179 | } 180 | } 181 | 182 | pub fn consumed(&self) -> usize { 183 | self.consumed 184 | } 185 | 186 | /// parses a single value out of the stream. If there are multiple 187 | /// values you can call this multiple times. 
188 | pub fn parse(&mut self) -> RespResult { 189 | let saved_len = self.body.len(); 190 | let value = self.parse_value(); 191 | if value.is_ok() { 192 | self.consumed += saved_len - self.body.len(); 193 | } 194 | value 195 | } 196 | 197 | fn parse_value(&mut self) -> RespResult { 198 | match self.read_byte()? { 199 | b'$' => self.parse_data(), 200 | b'*' => self.parse_array(), 201 | b':' => self.parse_int(), 202 | b'+' => self.parse_status(), 203 | b'-' => self.parse_error(), 204 | c => { 205 | if c == b'\r' && self.read_byte()? == b'\n' { 206 | return self.parse_value(); 207 | } 208 | debug!( 209 | "Invalid prefix {:?}{:?} when parsing value", 210 | c as char, 211 | String::from_utf8_lossy(self.body.as_ref()) 212 | ); 213 | Err("Invalid prefix when parsing value".into()) 214 | } 215 | } 216 | } 217 | 218 | #[inline] 219 | fn read_byte(&mut self) -> RespResult { 220 | if self.body.len() >= 1 { 221 | let byte = self.body[0]; 222 | self.body = self.body.slice_from(1); 223 | Ok(byte) 224 | } else { 225 | Err(RespError::Incomplete) 226 | } 227 | } 228 | 229 | #[inline] 230 | fn read(&mut self, len: usize) -> RespResult { 231 | if self.body.len() >= len { 232 | Ok(self.body.split_to(len)) 233 | } else { 234 | Err(RespError::Incomplete) 235 | } 236 | } 237 | 238 | fn read_with_separator(&mut self, len: usize) -> RespResult { 239 | let result = self.read(len + 2)?; 240 | if &result[len..] != b"\r\n" { 241 | Err("Invalid line separator".into()) 242 | } else { 243 | Ok(result.slice_to(len)) 244 | } 245 | } 246 | 247 | fn read_line(&mut self) -> RespResult { 248 | let nl_pos = match self.body.iter().position(|&b| b == b'\r') { 249 | Some(nl_pos) => nl_pos, 250 | None => return Err(RespError::Incomplete), 251 | }; 252 | Ok(self.read_with_separator(nl_pos)?) 
253 | } 254 | 255 | fn read_int_line(&mut self) -> RespResult { 256 | let line = self.read_line()?; 257 | match assume_str(line.as_ref()).parse::() { 258 | Err(_) => Err("Expected integer, got garbage".into()), 259 | Ok(value) => Ok(value), 260 | } 261 | } 262 | 263 | fn parse_status(&mut self) -> RespResult { 264 | Ok(RespValue::Status(self.read_line()?)) 265 | } 266 | 267 | fn parse_int(&mut self) -> RespResult { 268 | Ok(RespValue::Int(self.read_int_line()?)) 269 | } 270 | 271 | fn parse_data(&mut self) -> RespResult { 272 | let length = self.read_int_line()?; 273 | if length < 0 { 274 | Ok(RespValue::Nil) 275 | } else { 276 | let data = self.read_with_separator(length as usize)?; 277 | Ok(RespValue::Data(data)) 278 | } 279 | } 280 | 281 | fn parse_array(&mut self) -> RespResult { 282 | let length = self.read_int_line()?; 283 | if length < 0 { 284 | Ok(RespValue::Nil) 285 | } else { 286 | let mut rv = Vec::with_capacity(length as usize); 287 | for _ in 0..length { 288 | rv.push(self.parse_value()?); 289 | } 290 | Ok(RespValue::Array(rv)) 291 | } 292 | } 293 | 294 | fn parse_error(&mut self) -> RespResult { 295 | Ok(RespValue::Error(self.read_line()?)) 296 | } 297 | } 298 | 299 | #[cfg(test)] 300 | mod tests { 301 | use super::{Parser, RespError, RespResult, RespValue}; 302 | 303 | fn parse(slice: &[u8]) -> RespResult { 304 | Parser::new(slice)?.parse() 305 | } 306 | 307 | #[test] 308 | fn parse_incomplete() { 309 | let r = parse(b"*2\r\n$3\r\nfoo"); 310 | assert_eq_repr!(r.unwrap_err(), RespError::Incomplete); 311 | } 312 | 313 | #[test] 314 | fn parse_error() { 315 | let r = parse(b"-foo\r\n"); 316 | assert_eq_repr!(r.unwrap(), RespValue::Error("foo".into())); 317 | 318 | let r = parse(b"-invalid line sep\r\r"); 319 | assert!(if let RespError::Invalid(_) = r.unwrap_err() { 320 | true 321 | } else { 322 | false 323 | }); 324 | } 325 | 326 | #[test] 327 | fn parse_valid_array() { 328 | let r = parse(b"*2\r\n$3\r\nfoo\r\n$4\r\nbarz\r\n"); 329 | assert!(r.is_ok(), 
"{:?} not ok", r.unwrap_err()); 330 | assert_eq_repr!( 331 | r.unwrap(), 332 | RespValue::Array(vec![ 333 | RespValue::Data(b"foo".as_ref().into()), 334 | RespValue::Data(b"barz".as_ref().into()), 335 | ]) 336 | ); 337 | } 338 | 339 | #[test] 340 | fn parser_multiple2() { 341 | let mut parser = Parser::new( 342 | b"*2\r\n$3\r\nfoo\r\n$4\r\nbarz\r\n*2\r\n$3\r\nfoo\r\n$4\r\nbarz\r\n".as_ref(), 343 | ).unwrap(); 344 | for _ in 0..2 { 345 | let r = parser.parse(); 346 | assert!(r.is_ok(), "{:?} not ok", r.unwrap_err()); 347 | assert_eq_repr!( 348 | r.unwrap(), 349 | RespValue::Array(vec![ 350 | RespValue::Data(b"foo".as_ref().into()), 351 | RespValue::Data(b"barz".as_ref().into()), 352 | ]) 353 | ); 354 | } 355 | let r = parser.parse(); 356 | assert_eq_repr!(r.unwrap_err(), RespError::Incomplete); 357 | } 358 | 359 | #[test] 360 | fn message_response() { 361 | let mut parser = Parser::new( 362 | b"*2\r\n*2\r\n:7270781675605147315\r\n$25\r\nmessage 1 from producer 0\r\n*2\r\n:4590316895040267280\r\n$25\r\nmessage 2 from producer 0\r\n" 363 | .as_ref(), 364 | ).unwrap(); 365 | let r = parser.parse(); 366 | assert!(r.is_ok(), "{:?} not ok", r.unwrap_err()); 367 | assert_eq!(parser.body.len(), 0); 368 | } 369 | } 370 | -------------------------------------------------------------------------------- /src/server.rs: -------------------------------------------------------------------------------- 1 | use std::cell::RefCell; 2 | use std::collections::VecDeque; 3 | use std::io; 4 | use std::rc::Rc; 5 | use std::sync::{Arc, Mutex}; 6 | 7 | use bytes::{BufMut, BytesMut}; 8 | use database::{Context as DbContext, Database, Token, WorkerMsg}; 9 | use futures::sync::mpsc as fmpsc; 10 | use futures::{Future, Sink, Stream}; 11 | use tokio_codec as codec; 12 | use tokio_core as tokio; 13 | use tokio_io::AsyncRead; 14 | use workers::WorkerSender; 15 | 16 | use config::Config; 17 | use metrics::{self, Gauge}; 18 | use resp::{self, RespValue}; 19 | use utils::IdHashMap; 20 | 21 | struct 
RespCodec; 22 | 23 | impl codec::Decoder for RespCodec { 24 | type Item = RespValue; 25 | type Error = io::Error; 26 | 27 | fn decode(&mut self, src: &mut BytesMut) -> io::Result> { 28 | let (consumed, result) = resp::Parser::new(&*src) 29 | .and_then(|mut p| match p.parse() { 30 | Ok(v) => Ok((p.consumed(), Ok(Some(v)))), 31 | Err(e) => Err(e), 32 | }).unwrap_or_else(|e| match e { 33 | resp::RespError::Incomplete => (0, Ok(None)), 34 | _ => (0, Err(io::ErrorKind::InvalidData.into())), 35 | }); 36 | src.split_to(consumed); 37 | result 38 | } 39 | } 40 | 41 | impl codec::Encoder for RespCodec { 42 | type Item = RespValue; 43 | type Error = io::Error; 44 | 45 | fn encode(&mut self, item: Self::Item, dst: &mut BytesMut) -> io::Result<()> { 46 | dst.reserve(item.serialized_size()); 47 | item.serialize_into(&mut dst.writer()) 48 | .expect("Failed to serialize into reserved space"); 49 | Ok(()) 50 | } 51 | } 52 | 53 | struct Context { 54 | context: Rc, 55 | token: Token, 56 | requests: VecDeque, 57 | db_context: Option, 58 | } 59 | 60 | struct SharedContext { 61 | database: Arc, 62 | db_sender: RefCell>, 63 | token_chans: Arc>>>, 64 | } 65 | 66 | pub struct Server { 67 | config: Config, 68 | } 69 | 70 | impl Context { 71 | fn new( 72 | context: Rc, 73 | token: Token, 74 | chan_tx: fmpsc::UnboundedSender, 75 | ) -> Self { 76 | metrics::CLIENT_CONNECTION.inc(); 77 | context.token_chans.lock().unwrap().insert(token, chan_tx); 78 | Context { 79 | context: context, 80 | token: token, 81 | db_context: Some(DbContext::new(token)), 82 | requests: VecDeque::new(), 83 | } 84 | } 85 | 86 | fn dispatch(&mut self, req: RespValue) { 87 | if let Some(mut db_context) = self.db_context.take() { 88 | debug!("Dispatched request ({}) {:?}", self.token, req); 89 | db_context.commands.push(req); 90 | self.context 91 | .db_sender 92 | .borrow_mut() 93 | .send(WorkerMsg::Command(db_context)); 94 | } else { 95 | debug!("Enqueued request ({}) {:?}", self.token, req); 96 | 
self.requests.push_back(req); 97 | } 98 | } 99 | 100 | fn dispatch_next(&mut self, mut db_context: DbContext) { 101 | assert!( 102 | self.db_context.is_none(), 103 | "can't cycle if there's nothing inflight" 104 | ); 105 | if let Some(req) = self.requests.pop_front() { 106 | debug!("Dispatched request ({}) {:?}", self.token, req); 107 | db_context.commands.push(req); 108 | self.context 109 | .db_sender 110 | .borrow_mut() 111 | .send(WorkerMsg::Command(db_context)); 112 | } else { 113 | self.db_context = Some(db_context); 114 | } 115 | } 116 | } 117 | 118 | impl Drop for Context { 119 | fn drop(&mut self) { 120 | self.context.token_chans.lock().unwrap().remove(&self.token); 121 | metrics::CLIENT_CONNECTION.dec(); 122 | } 123 | } 124 | 125 | impl Server { 126 | pub fn new(config: Config) -> Server { 127 | Server { config: config } 128 | } 129 | 130 | fn connection( 131 | context: Rc, 132 | token: Token, 133 | socket: tokio::net::TcpStream, 134 | ) -> Box> { 135 | socket.set_nodelay(true).expect("Failed to set nodelay"); 136 | let (sock_rx, sock_tx) = socket.split(); 137 | let sock_tx = codec::FramedWrite::new(sock_tx, RespCodec); 138 | let sock_rx = codec::FramedRead::new(sock_rx, RespCodec); 139 | let (chan_tx, chan_rx) = fmpsc::unbounded(); 140 | let ctx_rx = Rc::new(RefCell::new(Context::new(context, token, chan_tx))); 141 | let ctx_tx = ctx_rx.clone(); 142 | 143 | let fut_rx = sock_rx.for_each(move |request| { 144 | ctx_rx.borrow_mut().dispatch(request); 145 | Ok(()) 146 | }); 147 | 148 | let fut_tx = sock_tx 149 | .send_all( 150 | chan_rx 151 | .map(move |mut context| { 152 | let response = context.take_response(); 153 | context.clear(); 154 | ctx_tx.borrow_mut().dispatch_next(context); 155 | response 156 | }).map_err(|_| io::Error::from(io::ErrorKind::Other)), 157 | ).map(|_| ()); 158 | 159 | Box::new(fut_rx.select(fut_tx).map(|_| ()).map_err(|(e, _)| e)) 160 | } 161 | 162 | pub fn run(self) { 163 | let mut core = tokio::reactor::Core::new().unwrap(); 164 | 
165 | let token_chans: Arc>>> = 166 | Default::default(); 167 | let token_chans_cloned = token_chans.clone(); 168 | let response_fn = Box::new(move |context: DbContext| { 169 | let token = context.token; 170 | if let Some(chan) = token_chans_cloned.lock().unwrap().get_mut(&token) { 171 | if let Err(e) = chan.unbounded_send(context) { 172 | warn!("Can't send to token {} chan: {:?}", token, e); 173 | } 174 | } else { 175 | debug!("Can't find response channel for token {:?}", token); 176 | } 177 | }); 178 | 179 | let database = Database::new(&self.config, response_fn); 180 | 181 | let context = Rc::new(SharedContext { 182 | db_sender: RefCell::new(database.sender()), 183 | database: database, 184 | token_chans: token_chans, 185 | }); 186 | 187 | let mut next_token = 0; 188 | let handle = core.handle(); 189 | let listener = 190 | tokio::net::TcpListener::bind(&self.config.listen_addr, &core.handle()).unwrap(); 191 | let listener_fut = listener.incoming().for_each(|(socket, addr)| { 192 | if context.token_chans.lock().unwrap().len() 193 | >= context.database.config.client_connection_max as usize 194 | { 195 | info!( 196 | "Refusing connection from {:?}, connection limit reached", 197 | addr 198 | ); 199 | return Ok(()); 200 | } 201 | info!("Token {} accepting connection from {:?}", next_token, addr); 202 | let conn_ctx = context.clone(); 203 | handle.spawn( 204 | Self::connection(conn_ctx, next_token, socket).then(move |r| { 205 | info!("Token {} disconnected {:?}", next_token, r); 206 | Ok(()) 207 | }), 208 | ); 209 | next_token = next_token.wrapping_add(1); 210 | Ok(()) 211 | }); 212 | 213 | core.run(listener_fut).unwrap(); 214 | } 215 | } 216 | -------------------------------------------------------------------------------- /src/storage.rs: -------------------------------------------------------------------------------- 1 | use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt}; 2 | use rocksdb::{self, Writable}; 3 | use std::io::Write; 4 | use std::path::Path; 5 | 
use std::sync::Arc; 6 | use std::{mem, str}; 7 | use utils::*; 8 | 9 | struct U16BeSuffixTransform; 10 | 11 | impl rocksdb::SliceTransform for U16BeSuffixTransform { 12 | fn transform<'a>(&mut self, key: &'a [u8]) -> &'a [u8] { 13 | &key[..2] 14 | } 15 | 16 | fn in_domain(&mut self, _key: &[u8]) -> bool { 17 | true 18 | } 19 | } 20 | 21 | pub struct StorageManager { 22 | db: Arc, 23 | } 24 | 25 | #[inline] 26 | fn build_key<'a>(buffer: &'a mut [u8], num: u16, key: &[u8]) -> &'a [u8] { 27 | (&mut buffer[..2]).write_u16::(num).unwrap(); 28 | (&mut buffer[2..]).write_all(key).unwrap(); 29 | &buffer[..2 + key.len()] 30 | } 31 | 32 | #[inline] 33 | fn build_log_key<'a>(buffer: &'a mut [u8], num: u16, log_key: (u64, u64)) -> &'a [u8] { 34 | (&mut buffer[..2]).write_u16::(num).unwrap(); 35 | (&mut buffer[2..2 + 8]) 36 | .write_u64::(log_key.0) 37 | .unwrap(); 38 | (&mut buffer[2 + 8..2 + 8 + 8]) 39 | .write_u64::(log_key.1) 40 | .unwrap(); 41 | &buffer[..2 + 8 + 8] 42 | } 43 | 44 | #[inline] 45 | fn build_log_prefix<'a>(buffer: &'a mut [u8], num: u16, prefix: u64) -> &'a [u8] { 46 | (&mut buffer[..2]).write_u16::(num).unwrap(); 47 | (&mut buffer[2..2 + 8]) 48 | .write_u64::(prefix) 49 | .unwrap(); 50 | &buffer[..2 + 8] 51 | } 52 | 53 | // TODO: support TTL 54 | // TODO: specific comparator for log cf 55 | // TODO: merge operator could be a big win 56 | pub struct Storage { 57 | db: Arc, 58 | cf: &'static rocksdb::CFHandle, 59 | log_cf: &'static rocksdb::CFHandle, 60 | num: u16, 61 | } 62 | 63 | unsafe impl Sync for Storage {} 64 | unsafe impl Send for Storage {} 65 | 66 | pub struct StorageBatch<'a> { 67 | storage: &'a Storage, 68 | wb: rocksdb::WriteBatch, 69 | } 70 | 71 | pub struct SendableStorageBatch(rocksdb::WriteBatch); 72 | 73 | impl<'a> From> for SendableStorageBatch { 74 | fn from(sb: StorageBatch<'a>) -> Self { 75 | SendableStorageBatch(sb.wb) 76 | } 77 | } 78 | 79 | struct GenericIterator { 80 | db: Arc, 81 | iterator: rocksdb::rocksdb::DBIterator>, 82 | 
first: bool, 83 | } 84 | 85 | pub struct StorageIterator(GenericIterator); 86 | 87 | pub struct LogStorageIterator(GenericIterator); 88 | 89 | unsafe impl Send for GenericIterator {} 90 | 91 | impl StorageManager { 92 | pub fn new>(path: P) -> Result { 93 | let mut opts = rocksdb::DBOptions::new(); 94 | opts.create_if_missing(true); 95 | opts.set_max_background_jobs(4); 96 | opts.enable_pipelined_write(true); 97 | let mut def_cf_opts = rocksdb::ColumnFamilyOptions::new(); 98 | def_cf_opts 99 | .set_prefix_extractor("U16BeSuffixTransform", Box::new(U16BeSuffixTransform)) 100 | .unwrap(); 101 | def_cf_opts.compression_per_level(&[ 102 | rocksdb::DBCompressionType::No, 103 | rocksdb::DBCompressionType::No, 104 | rocksdb::DBCompressionType::Lz4, 105 | rocksdb::DBCompressionType::Lz4, 106 | rocksdb::DBCompressionType::Lz4, 107 | rocksdb::DBCompressionType::Lz4, 108 | rocksdb::DBCompressionType::Lz4, 109 | ]); 110 | def_cf_opts.set_write_buffer_size(32 * 1024 * 1024); 111 | def_cf_opts.set_max_bytes_for_level_base(4 * 32 * 1024 * 1024); 112 | def_cf_opts.set_max_write_buffer_number(4); 113 | 114 | let mut block_opts = rocksdb::BlockBasedOptions::new(); 115 | block_opts.set_bloom_filter(10, false); 116 | block_opts.set_lru_cache(4 * 32 * 1024 * 1024, -1, 0, 0f64); 117 | def_cf_opts.set_block_based_table_factory(&block_opts); 118 | 119 | let mut log_cf_opts = rocksdb::ColumnFamilyOptions::new(); 120 | log_cf_opts.compression(rocksdb::DBCompressionType::No); 121 | let mut fifo_opts = rocksdb::FifoCompactionOptions::new(); 122 | fifo_opts.set_ttl(3600 * 72); // 72 hours 123 | log_cf_opts.set_fifo_compaction_options(fifo_opts); 124 | log_cf_opts.set_compaction_style(rocksdb::DBCompactionStyle::Fifo); 125 | log_cf_opts.set_write_buffer_size(32 * 1024 * 1024); 126 | log_cf_opts.set_max_write_buffer_number(4); 127 | 128 | let mut block_opts = rocksdb::BlockBasedOptions::new(); 129 | block_opts.set_bloom_filter(10, false); 130 | block_opts.set_lru_cache(2 * 32 * 1024 * 1024, -1, 
0, 0f64); 131 | log_cf_opts.set_block_based_table_factory(&block_opts); 132 | 133 | // TODO: Rocksdb is complicated, we might want to tune some more options 134 | 135 | let db = rocksdb::DB::open_cf( 136 | opts.clone(), 137 | path.as_ref().to_str().unwrap(), 138 | vec![ 139 | ("default", def_cf_opts.clone()), 140 | ("log", log_cf_opts.clone()), 141 | ], 142 | ).or_else(|_| -> Result<_, String> { 143 | let mut db = rocksdb::DB::open_cf( 144 | opts, 145 | path.as_ref().to_str().unwrap(), 146 | vec![("default", def_cf_opts)], 147 | )?; 148 | 149 | db.create_cf(("log", log_cf_opts))?; 150 | Ok(db) 151 | })?; 152 | 153 | Ok(StorageManager { db: Arc::new(db) }) 154 | } 155 | 156 | pub fn open(&self, db_num: u16) -> Result { 157 | Ok(Storage { 158 | db: self.db.clone(), 159 | cf: unsafe { mem::transmute(self.db.cf_handle("default").unwrap()) }, 160 | log_cf: unsafe { mem::transmute(self.db.cf_handle("log").unwrap()) }, 161 | num: db_num, 162 | }) 163 | } 164 | 165 | pub fn batch_write(&self, batch: SendableStorageBatch) -> Result<(), GenericError> { 166 | Ok(self.db.write(batch.0)?) 
167 | } 168 | } 169 | 170 | impl Drop for StorageManager { 171 | fn drop(&mut self) { 172 | let sc = Arc::strong_count(&self.db); 173 | let wc = Arc::weak_count(&self.db); 174 | assert_eq!(wc, 0); 175 | assert_eq!(sc, 1); 176 | } 177 | } 178 | 179 | impl Storage { 180 | pub fn iterator(&self) -> StorageIterator { 181 | let mut key_prefix = [0u8; 2]; 182 | build_key(&mut key_prefix, self.num, b""); 183 | let mut ro = rocksdb::ReadOptions::new(); 184 | ro.set_total_order_seek(false); 185 | ro.set_prefix_same_as_start(true); 186 | let mut iterator = rocksdb::DBIterator::new_cf(self.db.clone(), self.cf, ro); 187 | iterator.seek(rocksdb::SeekKey::Key(&key_prefix[..])); 188 | StorageIterator(GenericIterator { 189 | db: self.db.clone(), 190 | iterator: iterator, 191 | first: true, 192 | }) 193 | } 194 | 195 | pub fn log_iterator_all(&self) -> LogStorageIterator { 196 | let mut key_prefix = [0u8; 2]; 197 | build_key(&mut key_prefix, self.num, b""); 198 | let mut end_prefix = [0u8; 2]; 199 | build_key(&mut end_prefix, self.num + 1, b""); 200 | let mut ro = rocksdb::ReadOptions::new(); 201 | ro.set_iterate_upper_bound(&end_prefix[..]); 202 | let mut iterator = rocksdb::DBIterator::new_cf(self.db.clone(), self.log_cf, ro); 203 | iterator.seek(rocksdb::SeekKey::Key(&key_prefix[..])); 204 | LogStorageIterator(GenericIterator { 205 | db: self.db.clone(), 206 | iterator: iterator, 207 | first: true, 208 | }) 209 | } 210 | 211 | pub fn log_iterator(&self, prefix: u64, start: u64) -> LogStorageIterator { 212 | let mut end_prefix = [0u8; 2 + 8]; 213 | build_log_prefix(&mut end_prefix, self.num, prefix + 1); 214 | let mut start_key = [0u8; 2 + 8 + 8]; 215 | build_log_key(&mut start_key, self.num, (prefix, start)); 216 | let mut ro = rocksdb::ReadOptions::new(); 217 | ro.set_iterate_upper_bound(&end_prefix[..]); 218 | let mut iterator = rocksdb::DBIterator::new_cf(self.db.clone(), self.log_cf, ro); 219 | iterator.seek(rocksdb::SeekKey::Key(&start_key[..])); 220 | 
LogStorageIterator(GenericIterator { 221 | db: self.db.clone(), 222 | iterator: iterator, 223 | first: true, 224 | }) 225 | } 226 | 227 | pub fn get R>( 228 | &self, 229 | key: &[u8], 230 | callback: F, 231 | ) -> Result, GenericError> { 232 | let mut buffer = [0u8; 512]; 233 | let buffer = build_key(&mut buffer, self.num, key); 234 | let r = self.db.get_cf(self.cf, buffer)?; 235 | trace!( 236 | "get {:?} ({:?} bytes)", 237 | str::from_utf8(key), 238 | r.as_ref().map(|x| x.len()) 239 | ); 240 | Ok(r.map(|r| callback(&*r))) 241 | } 242 | 243 | pub fn log_get R>( 244 | &self, 245 | log_key: (u64, u64), 246 | callback: F, 247 | ) -> Result, GenericError> { 248 | let mut buffer = [0u8; 2 + 8 + 8]; 249 | let buffer = build_log_key(&mut buffer, self.num, log_key); 250 | let r = self.db.get_cf(self.log_cf, buffer)?; 251 | trace!( 252 | "log_get {:?} ({:?} bytes)", 253 | log_key, 254 | r.as_ref().map(|x| x.len()) 255 | ); 256 | Ok(r.map(|r| callback(&*r))) 257 | } 258 | 259 | pub fn get_vec(&self, key: &[u8]) -> Result>, GenericError> { 260 | self.get(key, |v| v.to_owned()) 261 | } 262 | 263 | pub fn log_get_vec(&self, log_key: (u64, u64)) -> Result>, GenericError> { 264 | self.log_get(log_key, |v| v.to_owned()) 265 | } 266 | 267 | pub fn set(&self, key: &[u8], value: &[u8]) -> Result<(), GenericError> { 268 | let mut b = self.batch_new(0); 269 | b.set(key, value); 270 | self.batch_write(b) 271 | } 272 | 273 | pub fn del(&self, key: &[u8]) -> Result<(), GenericError> { 274 | let mut b = self.batch_new(0); 275 | b.del(key); 276 | self.batch_write(b) 277 | } 278 | 279 | pub fn batch_new(&self, reserve: usize) -> StorageBatch { 280 | StorageBatch { 281 | storage: self, 282 | wb: rocksdb::WriteBatch::with_capacity(reserve), 283 | } 284 | } 285 | 286 | pub fn batch_write(&self, batch: StorageBatch) -> Result<(), GenericError> { 287 | Ok(self.db.write(batch.wb)?) 
288 | } 289 | 290 | pub fn clear(&self) { 291 | trace!("clear"); 292 | let mut from = [0u8; 2]; 293 | let mut to = [0u8; 2]; 294 | (&mut from[..]).write_u16::(self.num).unwrap(); 295 | (&mut to[..]).write_u16::(self.num + 1).unwrap(); 296 | 297 | for &cf in &[self.cf, self.log_cf] { 298 | self.db 299 | .delete_files_in_range_cf(cf, &from[..], &to[..], false) 300 | .unwrap(); 301 | let mut ro = rocksdb::ReadOptions::new(); 302 | ro.set_total_order_seek(false); 303 | ro.set_prefix_same_as_start(true); 304 | ro.set_iterate_upper_bound(&to[..]); 305 | let mut iter = self.db.iter_cf_opt(cf, ro); 306 | iter.seek(rocksdb::SeekKey::Key(&from[..])); 307 | while iter.valid() { 308 | self.db.delete_cf(cf, iter.key()).unwrap(); 309 | iter.next(); 310 | } 311 | } 312 | } 313 | 314 | pub fn sync(&self) -> Result<(), GenericError> { 315 | debug!("sync"); 316 | Ok(self.db.sync_wal()?) 317 | } 318 | } 319 | 320 | impl<'a> StorageBatch<'a> { 321 | pub fn is_empty(&self) -> bool { 322 | self.wb.is_empty() 323 | } 324 | 325 | pub fn set(&mut self, key: &[u8], value: &[u8]) { 326 | trace!("set {:?} ({} bytes)", str::from_utf8(key), value.len()); 327 | let mut buffer = [0u8; 512]; 328 | let buffer = build_key(&mut buffer, self.storage.num, key); 329 | self.wb.put_cf(self.storage.cf, buffer, value).unwrap(); 330 | } 331 | 332 | pub fn log_set(&mut self, key: (u64, u64), value: &[u8]) { 333 | trace!("log_set {:?} ({} bytes)", key, value.len()); 334 | let mut buffer = [0u8; 2 + 8 + 8]; 335 | let buffer = build_log_key(&mut buffer, self.storage.num, key); 336 | self.wb.put_cf(self.storage.log_cf, buffer, value).unwrap(); 337 | } 338 | 339 | pub fn del(&mut self, key: &[u8]) { 340 | trace!("del {:?}", str::from_utf8(key)); 341 | let mut buffer = [0u8; 512]; 342 | let buffer = build_key(&mut buffer, self.storage.num, key); 343 | self.wb.delete_cf(self.storage.cf, buffer).unwrap() 344 | } 345 | } 346 | 347 | impl GenericIterator { 348 | pub fn iter<'a>(&'a mut self) -> GenericIteratorIter<'a> 
{ 349 | GenericIteratorIter { it: self } 350 | } 351 | } 352 | 353 | pub struct GenericIteratorIter<'a> { 354 | it: &'a mut GenericIterator, 355 | } 356 | 357 | impl<'a> Iterator for GenericIteratorIter<'a> { 358 | type Item = (u16, &'a [u8], &'a [u8]); 359 | fn next(&mut self) -> Option { 360 | if self.it.first { 361 | self.it.first = false; 362 | } else { 363 | // this iterator isn't fused so we need to check for valid here too 364 | if !self.it.iterator.valid() { 365 | return None; 366 | } 367 | self.it.iterator.next(); 368 | } 369 | if self.it.iterator.valid() { 370 | unsafe { 371 | let key = self.it.iterator.key(); 372 | let value = self.it.iterator.value(); 373 | // FIXME: bogus lifetime as slices are only valid until the next call to next() 374 | Some(( 375 | (&key[..2]).read_u16::().unwrap(), 376 | mem::transmute(&key[2..]), 377 | mem::transmute(value), 378 | )) 379 | } 380 | } else { 381 | None 382 | } 383 | } 384 | } 385 | 386 | pub struct StorageIteratorIter<'a>(GenericIteratorIter<'a>); 387 | 388 | impl StorageIterator { 389 | pub fn iter<'a>(&'a mut self) -> StorageIteratorIter<'a> { 390 | StorageIteratorIter(self.0.iter()) 391 | } 392 | } 393 | 394 | impl<'a> Iterator for StorageIteratorIter<'a> { 395 | type Item = (&'a [u8], &'a [u8]); 396 | fn next(&mut self) -> Option { 397 | self.0.next().map(|(_, k, v)| (k, v)) 398 | } 399 | } 400 | 401 | pub struct LogStorageIteratorIter<'a>(GenericIteratorIter<'a>); 402 | 403 | impl<'a> Iterator for LogStorageIteratorIter<'a> { 404 | type Item = ((u64, u64), &'a [u8]); 405 | fn next(&mut self) -> Option { 406 | self.0.next().map(|(_, key, value)| { 407 | let first = (&key[..8]).read_u64::().unwrap(); 408 | let second = (&key[8..8 + 8]).read_u64::().unwrap(); 409 | ((first, second), value) 410 | }) 411 | } 412 | } 413 | 414 | impl LogStorageIterator { 415 | pub fn iter<'a>(&'a mut self) -> LogStorageIteratorIter<'a> { 416 | LogStorageIteratorIter(self.0.iter()) 417 | } 418 | } 419 | 420 | #[cfg(test)] 421 | mod 
tests { 422 | use super::*; 423 | use std::fs; 424 | 425 | #[test] 426 | fn test_simple() { 427 | let _ = fs::remove_dir_all("t/test_simple"); 428 | let sm = StorageManager::new("t/test_simple").unwrap(); 429 | let storage = sm.open(1).unwrap(); 430 | assert_eq!(storage.get_vec(b"sample").unwrap(), None); 431 | storage.set(b"sample", b"sample_value").unwrap(); 432 | assert_eq!( 433 | storage.get_vec(b"sample").unwrap().unwrap(), 434 | b"sample_value" 435 | ); 436 | storage.del(b"sample").unwrap(); 437 | assert_eq!(storage.get_vec(b"sample").unwrap(), None); 438 | } 439 | 440 | #[test] 441 | fn test_simple_log() { 442 | let _ = fs::remove_dir_all("t/test_simple_log"); 443 | let sm = StorageManager::new("t/test_simple_log").unwrap(); 444 | let storage = sm.open(1).unwrap(); 445 | assert_eq!(storage.get_vec(b"sample").unwrap(), None); 446 | let mut b = storage.batch_new(0); 447 | b.set(b"sample", b"sample_value"); 448 | b.log_set((1, 1), b"sample"); 449 | storage.batch_write(b).unwrap(); 450 | assert_eq!( 451 | storage.get_vec(b"sample").unwrap().unwrap(), 452 | b"sample_value" 453 | ); 454 | assert_eq!(storage.log_get_vec((1, 1)).unwrap().unwrap(), b"sample"); 455 | } 456 | 457 | #[test] 458 | fn test_iter() { 459 | let _ = fs::remove_dir_all("t/test_iter"); 460 | let sm = StorageManager::new("t/test_iter").unwrap(); 461 | for &i in &[0, 1, 2] { 462 | let storage = sm.open(i).unwrap(); 463 | storage.set(b"1", i.to_string().as_bytes()).unwrap(); 464 | storage.set(b"2", i.to_string().as_bytes()).unwrap(); 465 | storage.set(b"3", i.to_string().as_bytes()).unwrap(); 466 | } 467 | for &i in &[0, 1, 2] { 468 | let storage = sm.open(i).unwrap(); 469 | let results: Vec> = storage.iterator().iter().map(|(_, v)| v.into()).collect(); 470 | assert_eq!(results, vec![i.to_string().as_bytes(); 3]); 471 | } 472 | } 473 | 474 | #[test] 475 | fn test_iter_log() { 476 | let _ = fs::remove_dir_all("t/test_iter_log"); 477 | let sm = StorageManager::new("t/test_iter_log").unwrap(); 478 | 
for &i in &[0u64, 1, 2] { 479 | let storage = sm.open(i as u16).unwrap(); 480 | let mut b = storage.batch_new(0); 481 | b.log_set((i, i + 0), (i + 0).to_string().as_bytes()); 482 | b.log_set((i, i + 1), (i + 1).to_string().as_bytes()); 483 | b.log_set((i, i + 2), (i + 2).to_string().as_bytes()); 484 | b.log_set((i + 1, i), b""); 485 | storage.batch_write(b).unwrap(); 486 | } 487 | for &i in &[0u64, 1, 2] { 488 | let storage = sm.open(i as u16).unwrap(); 489 | let results: Vec<(_, Vec)> = storage 490 | .log_iterator(i, i + 1) 491 | .iter() 492 | .map(|(k, v)| (k, v.into())) 493 | .collect(); 494 | assert_eq!( 495 | results, 496 | vec![ 497 | ((i, i + 1), (i + 1).to_string().into_bytes()), 498 | ((i, i + 2), (i + 2).to_string().into_bytes()), 499 | ] 500 | ); 501 | assert_eq!(storage.log_iterator(i, i).iter().count(), 3); 502 | assert_eq!(storage.log_iterator_all().iter().count(), 4); 503 | } 504 | } 505 | 506 | #[test] 507 | fn test_clear() { 508 | let _ = fs::remove_dir_all("t/test_clear"); 509 | let sm = StorageManager::new("t/test_clear").unwrap(); 510 | for &i in &[0u64, 1, 2] { 511 | let storage = sm.open(i as u16).unwrap(); 512 | let mut b = storage.batch_new(0); 513 | b.set(i.to_string().as_bytes(), i.to_string().as_bytes()); 514 | b.log_set((i, i), i.to_string().as_bytes()); 515 | storage.batch_write(b).unwrap(); 516 | } 517 | for &i in &[0u64, 1, 2] { 518 | let storage = sm.open(i as u16).unwrap(); 519 | assert_eq!(storage.iterator().iter().count(), 1); 520 | assert_eq!(storage.log_iterator(i, i).iter().count(), 1); 521 | storage.clear(); 522 | assert_eq!(storage.iterator().iter().count(), 0); 523 | assert_eq!(storage.log_iterator(i, i).iter().count(), 0); 524 | } 525 | } 526 | 527 | #[test] 528 | fn test_open_all() { 529 | let _ = fs::remove_dir_all("t/test_open_all"); 530 | let sm = StorageManager::new("t/test_open_all").unwrap(); 531 | sm.open(1).unwrap(); 532 | sm.open(2).unwrap(); 533 | sm.open(3).unwrap(); 534 | sm.open(1).unwrap(); 535 | 
sm.open(2).unwrap(); 536 | sm.open(3).unwrap(); 537 | } 538 | 539 | } 540 | -------------------------------------------------------------------------------- /src/types.rs: -------------------------------------------------------------------------------- 1 | use std::convert::TryFrom; 2 | use std::fmt; 3 | use std::str::FromStr; 4 | 5 | /// Identifier for a Database instance 6 | /// node id should be a positive i64 to work nicelly with the RESP protocol 7 | pub type NodeId = u64; 8 | /// Identifier for physical node (high u32 of NodeId) 9 | pub type PhysicalNodeId = u32; 10 | /// Identifier for connection with client 11 | pub type Token = u64; 12 | /// Identifier for a vnode 13 | pub type VNodeNo = u16; 14 | 15 | /// Identifier for communication between nodes 16 | #[derive(PartialEq, Eq, Hash, Serialize, Deserialize, Default, Copy, Clone)] 17 | pub struct Cookie(u64, u64); 18 | 19 | impl Cookie { 20 | pub fn new(a: u64, b: u64) -> Self { 21 | Cookie(a, b) 22 | } 23 | } 24 | 25 | impl fmt::Debug for Cookie { 26 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 27 | write!(f, "{:016X}{:016X}", self.0, self.1) 28 | } 29 | } 30 | 31 | /// Consistency Level as in Dynamo/Riak/Cassandra style 32 | #[derive(Copy, Clone, Debug, PartialEq, Eq)] 33 | pub enum ConsistencyLevel { 34 | One, 35 | Quorum, 36 | All, 37 | } 38 | 39 | #[derive(Copy, Clone, Debug)] 40 | pub struct ConsistencyLevelParseError; 41 | 42 | impl<'a> TryFrom<&'a [u8]> for ConsistencyLevel { 43 | type Error = ConsistencyLevelParseError; 44 | fn try_from(bytes: &'a [u8]) -> Result { 45 | if bytes.len() > 0 { 46 | match bytes[0] { 47 | b'1' | b'o' | b'O' => return Ok(ConsistencyLevel::One), 48 | b'q' | b'Q' => return Ok(ConsistencyLevel::Quorum), 49 | b'a' | b'A' => return Ok(ConsistencyLevel::All), 50 | _ => (), 51 | } 52 | } 53 | Err(ConsistencyLevelParseError) 54 | } 55 | } 56 | 57 | impl FromStr for ConsistencyLevel { 58 | type Err = ConsistencyLevelParseError; 59 | fn from_str(s: &str) -> Result { 60 
| Self::try_from(s.as_bytes()) 61 | } 62 | } 63 | 64 | impl ConsistencyLevel { 65 | pub fn required(&self, replicas: u8) -> u8 { 66 | match *self { 67 | ConsistencyLevel::One => 1, 68 | ConsistencyLevel::Quorum => replicas / 2 + 1, 69 | ConsistencyLevel::All => replicas, 70 | } 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /src/utils.rs: -------------------------------------------------------------------------------- 1 | use std::collections::{HashMap, HashSet}; 2 | use std::error::Error; 3 | use std::hash::{BuildHasherDefault, Hasher}; 4 | use std::{fmt, fs, io, path}; 5 | 6 | pub type GenericError = Box; 7 | 8 | pub type IdHasherBuilder = BuildHasherDefault; 9 | pub type IdHashMap = HashMap; 10 | pub type IdHashSet = HashSet; 11 | pub struct IdHasher(u64); 12 | 13 | impl Default for IdHasher { 14 | #[inline] 15 | fn default() -> IdHasher { 16 | IdHasher(0) 17 | } 18 | } 19 | 20 | impl Hasher for IdHasher { 21 | #[inline] 22 | fn finish(&self) -> u64 { 23 | self.0 24 | } 25 | 26 | #[inline] 27 | fn write(&mut self, bytes: &[u8]) { 28 | #[inline] 29 | fn mix(mut x: u64) -> u64 { 30 | // Seahash diffuse method 31 | x = x.wrapping_mul(0x6eed0e9da4d94a4f); 32 | let a = x >> 32; 33 | let b = x >> 60; 34 | x ^= a >> b; 35 | x.wrapping_mul(0x6eed0e9da4d94a4f) 36 | } 37 | 38 | debug_assert!(bytes.len() <= 8); 39 | unsafe { 40 | let mut temp = 0u64; 41 | ::std::ptr::copy_nonoverlapping( 42 | bytes.as_ptr(), 43 | &mut temp as *mut _ as *mut u8, 44 | bytes.len(), 45 | ); 46 | self.0 ^= mix(temp); 47 | } 48 | } 49 | } 50 | 51 | pub fn replace_default(subject: &mut T) -> T { 52 | ::std::mem::replace(subject, Default::default()) 53 | } 54 | 55 | pub fn into_io_error(e: E) -> io::Error { 56 | io::Error::new(io::ErrorKind::Other, e) 57 | } 58 | 59 | pub fn split_u64(uint: u64) -> (u32, u32) { 60 | ((uint >> 32) as u32, uint as u32) 61 | } 62 | 63 | pub fn join_u64(hi: u32, lo: u32) -> u64 { 64 | ((hi as u64) << 32) | (lo as 
u64) 65 | } 66 | 67 | pub fn assume_str(bytes: &[u8]) -> &str { 68 | unsafe { ::std::str::from_utf8_unchecked(bytes) } 69 | } 70 | 71 | pub fn is_dir_empty_or_absent>(path: P) -> io::Result { 72 | match fs::read_dir(path.as_ref()) { 73 | Ok(dir) => Ok(dir.count() == 0), 74 | Err(ref err) if err.kind() == io::ErrorKind::NotFound => Ok(true), 75 | Err(err) => Err(err), 76 | } 77 | } 78 | 79 | #[cfg(test)] 80 | pub fn sleep_ms(ms: u64) { 81 | ::std::thread::sleep(::std::time::Duration::from_millis(ms)); 82 | } 83 | 84 | #[cfg(test)] 85 | macro_rules! assert_eq_repr { 86 | ($left:expr, $right:expr) => {{ 87 | match (format!("{:?}", &$left), format!("{:?}", &$right)) { 88 | (left_val, right_val) => { 89 | if !(left_val == right_val) { 90 | panic!( 91 | "repr assertion failed: `(debug(left) == debug(right))` \ 92 | (left: `{:?}`, right: `{:?}`)", 93 | left_val, right_val 94 | ) 95 | } 96 | } 97 | } 98 | }}; 99 | } 100 | 101 | pub trait LoggerExt { 102 | fn log_error(&self, msg: &str); 103 | fn log_warn(&self, msg: &str); 104 | } 105 | 106 | impl LoggerExt for Result { 107 | fn log_error(&self, msg: &str) { 108 | if let &Err(ref e) = self { 109 | error!("{}: {:?}", msg, e); 110 | } 111 | } 112 | fn log_warn(&self, msg: &str) { 113 | if let &Err(ref e) = self { 114 | warn!("{}: {:?}", msg, e); 115 | } 116 | } 117 | } 118 | -------------------------------------------------------------------------------- /src/vnode_sync.rs: -------------------------------------------------------------------------------- 1 | use bincode; 2 | use bytes::Bytes; 3 | use cubes::Cube; 4 | use database::*; 5 | use fabric::*; 6 | use inflightmap::InFlightMap; 7 | use metrics::{self, Meter}; 8 | use std::collections::{hash_set, HashSet}; 9 | use std::time::{Duration, Instant}; 10 | use utils::IdHasherBuilder; 11 | use version_vector::*; 12 | use vnode::VNodeState; 13 | 14 | #[derive(Debug, Copy, Clone, PartialEq)] 15 | #[must_use] 16 | pub enum SyncResult { 17 | Continue, 18 | Done, 19 | Error, 20 | 
} 21 | 22 | impl From> for SyncResult { 23 | fn from(result: Result) -> Self { 24 | if result.is_ok() { 25 | SyncResult::Continue 26 | } else { 27 | SyncResult::Error 28 | } 29 | } 30 | } 31 | 32 | macro_rules! stry { 33 | ($expr:expr) => {{ 34 | let conv = $expr.into(); 35 | if let SyncResult::Continue = conv { 36 | conv 37 | } else { 38 | return conv; 39 | } 40 | }}; 41 | } 42 | 43 | #[derive(Debug)] 44 | pub enum SyncDirection { 45 | Incomming, 46 | Outgoing, 47 | } 48 | 49 | type IteratorFn = Box Result, ()> + Send>; 50 | 51 | type InFlightSyncMsgMap = InFlightMap; 52 | 53 | struct SyncKeysIterator { 54 | dots_delta: BitmappedVersionVectorDelta, 55 | keys: hash_set::IntoIter, 56 | } 57 | 58 | // TODO: Refactor into trait objects 59 | // trait Synchronization { fn on_.., .. } 60 | // new_sync_sender -> Box 61 | pub enum Synchronization { 62 | SyncSender { 63 | // bvv in peer at the time of sync start 64 | clocks_in_peer: BitmappedVersionVector, 65 | // partial copy of the local bvv at the time of sync start 66 | clocks_snapshot: BitmappedVersionVector, 67 | iterator: IteratorFn, 68 | // TODO: only store keys as resends should be rare 69 | inflight: InFlightSyncMsgMap, 70 | cookie: Cookie, 71 | peer: NodeId, 72 | // count of sent keys (includes inflight) 73 | count: u64, 74 | last_recv: Instant, 75 | last_send: Instant, 76 | }, 77 | SyncReceiver { 78 | // local bvv at the time of sync start 79 | clocks_in_peer: BitmappedVersionVector, 80 | cookie: Cookie, 81 | peer: NodeId, 82 | // aprox count of received keys (includes dups) 83 | recv_count: u64, 84 | last_recv: Instant, 85 | last_send: Instant, 86 | }, 87 | BootstrapSender { 88 | clocks_snapshot: BitmappedVersionVector, 89 | iterator: IteratorFn, 90 | inflight: InFlightSyncMsgMap, 91 | cookie: Cookie, 92 | peer: NodeId, 93 | // count of sent keys (includes inflight) 94 | count: u64, 95 | last_recv: Instant, 96 | last_send: Instant, 97 | }, 98 | BootstrapReceiver { 99 | cookie: Cookie, 100 | peer: NodeId, 101 | 
// aprox count of received keys (includes dups) 102 | recv_count: u64, 103 | last_recv: Instant, 104 | last_send: Instant, 105 | }, 106 | } 107 | 108 | impl SyncKeysIterator { 109 | fn new(dots_delta: BitmappedVersionVectorDelta) -> Self { 110 | SyncKeysIterator { 111 | dots_delta: dots_delta, 112 | keys: HashSet::new().into_iter(), 113 | } 114 | } 115 | 116 | fn next(&mut self, state: &VNodeState) -> Result, ()> { 117 | loop { 118 | if let Some(key) = self.keys.next() { 119 | return Ok(Some(key)); 120 | } 121 | // fetch log in batches of ~1_000 keys 122 | let hint_size = self.dots_delta.size_hint().0.min(1_000); 123 | let mut keys = HashSet::with_capacity(hint_size); 124 | // consider up to 90% of the actual capacity as an alternative limit 125 | let limit = (keys.capacity() * 9 / 10).max(1_000); 126 | for (n, v) in self.dots_delta.by_ref() { 127 | let key = state 128 | .storage 129 | .log_get((n, v), |x| Bytes::from(x)) 130 | .map_err(|_| ())?; 131 | if let Some(key) = key { 132 | keys.insert(key); 133 | if keys.len() >= limit { 134 | break; 135 | } 136 | } else { 137 | warn!("Can't find log key for ({}, {})", n, v); 138 | } 139 | } 140 | if keys.is_empty() { 141 | return Ok(None); 142 | } 143 | debug!("Sync will send key batch with {:?} keys", keys.len()); 144 | self.keys = keys.into_iter(); 145 | } 146 | } 147 | } 148 | 149 | use self::Synchronization::*; 150 | 151 | impl Synchronization { 152 | pub fn new_bootstrap_receiver( 153 | _db: &Database, 154 | _state: &mut VNodeState, 155 | peer: NodeId, 156 | cookie: Cookie, 157 | ) -> Self { 158 | BootstrapReceiver { 159 | cookie: cookie, 160 | peer: peer, 161 | recv_count: 0, 162 | last_recv: Instant::now(), 163 | last_send: Instant::now(), 164 | } 165 | } 166 | 167 | pub fn new_bootstrap_sender( 168 | _db: &Database, 169 | state: &mut VNodeState, 170 | peer: NodeId, 171 | msg: MsgSyncStart, 172 | ) -> Self { 173 | let mut storage_iterator = state.storage.iterator(); 174 | let iterator_fn: IteratorFn = 
Box::new(move |_| { 175 | let next = storage_iterator 176 | .iter() 177 | .map(|(k, v)| { 178 | let cube = bincode::deserialize::(v).map_err(|_| ())?; 179 | Ok((Bytes::from(k), cube)) 180 | }).next(); 181 | 182 | match next { 183 | Some(Ok(r)) => Ok(Some(r)), 184 | None => Ok(None), 185 | Some(Err(e)) => Err(e), 186 | } 187 | }); 188 | 189 | BootstrapSender { 190 | cookie: msg.cookie, 191 | clocks_snapshot: state.clocks.clone(), 192 | iterator: iterator_fn, 193 | inflight: InFlightMap::new(), 194 | peer: peer, 195 | count: 0, 196 | last_recv: Instant::now(), 197 | last_send: Instant::now(), 198 | } 199 | } 200 | 201 | pub fn new_sync_receiver( 202 | _db: &Database, 203 | state: &mut VNodeState, 204 | peer: NodeId, 205 | cookie: Cookie, 206 | ) -> Self { 207 | assert!(state.sync_nodes.insert(peer)); 208 | SyncReceiver { 209 | clocks_in_peer: state.clocks.clone(), 210 | peer: peer, 211 | cookie: cookie, 212 | recv_count: 0, 213 | last_recv: Instant::now(), 214 | last_send: Instant::now(), 215 | } 216 | } 217 | 218 | pub fn new_sync_sender( 219 | db: &Database, 220 | state: &mut VNodeState, 221 | peer: NodeId, 222 | msg: MsgSyncStart, 223 | ) -> Self { 224 | let MsgSyncStart { 225 | target, 226 | cookie, 227 | clocks_in_peer, 228 | .. 229 | } = msg; 230 | assert_eq!(target, Some(db.dht.node())); 231 | 232 | let clocks_snapshot = state.log_clocks.clone(); 233 | let dots_delta = clocks_snapshot.delta(&clocks_in_peer); 234 | 235 | debug!( 236 | "Creating SyncSender {:?} from {:?} to {:?}", 237 | cookie, clocks_snapshot, clocks_in_peer 238 | ); 239 | 240 | let mut sync_keys = SyncKeysIterator::new(dots_delta); 241 | let iterator_fn: IteratorFn = Box::new(move |state| { 242 | if let Some(key) = sync_keys.next(state)? 
{ 243 | let cube = state.storage_get(&key)?; 244 | Ok(Some((key, cube))) 245 | } else { 246 | Ok(None) 247 | } 248 | }); 249 | 250 | SyncSender { 251 | clocks_in_peer: clocks_in_peer, 252 | clocks_snapshot: clocks_snapshot, 253 | iterator: iterator_fn, 254 | inflight: InFlightMap::new(), 255 | cookie: cookie, 256 | peer: peer, 257 | count: 0, 258 | last_recv: Instant::now(), 259 | last_send: Instant::now(), 260 | } 261 | } 262 | 263 | // send SyncStart message, only valid for Receivers 264 | fn send_start(&mut self, db: &Database, state: &mut VNodeState) -> SyncResult { 265 | let (peer, cookie, target, clocks_in_peer) = match *self { 266 | SyncReceiver { 267 | cookie, 268 | peer, 269 | ref mut last_send, 270 | ref clocks_in_peer, 271 | .. 272 | } => { 273 | *last_send = Instant::now(); 274 | (peer, cookie, Some(peer), clocks_in_peer.clone()) 275 | } 276 | BootstrapReceiver { 277 | peer, 278 | cookie, 279 | ref mut last_send, 280 | .. 281 | } => { 282 | *last_send = Instant::now(); 283 | (peer, cookie, None, BitmappedVersionVector::new()) 284 | } 285 | _ => unreachable!(), 286 | }; 287 | 288 | info!("Sending start for {:?}", cookie); 289 | db.fabric 290 | .send_msg( 291 | peer, 292 | &MsgSyncStart { 293 | cookie: cookie, 294 | vnode: state.num(), 295 | clocks_in_peer: clocks_in_peer, 296 | target: target, 297 | }, 298 | ).into() 299 | } 300 | 301 | // Sending Errors always result in Error 302 | fn send_error_fin( 303 | &mut self, 304 | db: &Database, 305 | state: &mut VNodeState, 306 | error: FabricError, 307 | ) -> SyncResult { 308 | match *self { 309 | SyncReceiver { 310 | peer, 311 | cookie, 312 | ref mut last_send, 313 | .. 314 | } 315 | | BootstrapReceiver { 316 | peer, 317 | cookie, 318 | ref mut last_send, 319 | .. 320 | } 321 | | SyncSender { 322 | peer, 323 | cookie, 324 | ref mut last_send, 325 | .. 326 | } 327 | | BootstrapSender { 328 | peer, 329 | cookie, 330 | ref mut last_send, 331 | .. 
332 | } => { 333 | *last_send = Instant::now(); 334 | let _ = db.fabric.send_msg( 335 | peer, 336 | &MsgSyncFin { 337 | cookie: cookie, 338 | vnode: state.num(), 339 | result: Err(error), 340 | }, 341 | ); 342 | SyncResult::Error 343 | } 344 | } 345 | } 346 | 347 | // Senders wait for the Receivers to reply => Continue 348 | // unless there's no route the peer => Error 349 | fn send_sender_success_fin(&mut self, db: &Database, state: &mut VNodeState) -> SyncResult { 350 | match *self { 351 | SyncSender { 352 | peer, 353 | cookie, 354 | ref clocks_snapshot, 355 | ref mut last_send, 356 | .. 357 | } 358 | | BootstrapSender { 359 | peer, 360 | cookie, 361 | ref clocks_snapshot, 362 | ref mut last_send, 363 | .. 364 | } => { 365 | *last_send = Instant::now(); 366 | db.fabric 367 | .send_msg( 368 | peer, 369 | &MsgSyncFin { 370 | cookie: cookie, 371 | vnode: state.num(), 372 | result: Ok(clocks_snapshot.clone()), 373 | }, 374 | ).into() 375 | } 376 | _ => unreachable!(), 377 | } 378 | } 379 | 380 | // send (possibly multiple) SyncSend messages and eventual SyncFin 381 | // (also takes care of expired SyncSend) 382 | fn send_next(&mut self, db: &Database, state: &mut VNodeState) -> SyncResult { 383 | let now = Instant::now(); 384 | let timeout = now + Duration::from_millis(db.config.sync_msg_timeout as _); 385 | let (error, inflight_empty) = match *self { 386 | SyncSender { 387 | peer, 388 | cookie, 389 | ref mut iterator, 390 | ref mut count, 391 | ref mut inflight, 392 | ref mut last_send, 393 | .. 394 | } 395 | | BootstrapSender { 396 | peer, 397 | cookie, 398 | ref mut iterator, 399 | ref mut count, 400 | ref mut inflight, 401 | ref mut last_send, 402 | .. 
403 | } => { 404 | while let Some((seq, msg)) = inflight.touch_expired(now, timeout) { 405 | debug!("resending seq {} for sync/bootstrap {:?}", seq, cookie); 406 | let _ = stry!(db.fabric.send_msg(peer, msg,)); 407 | metrics::SYNC_RESEND.mark(1); 408 | } 409 | let mut error = false; 410 | while inflight.len() < db.config.sync_msg_inflight as usize { 411 | match iterator(state) { 412 | Ok(Some((k, v))) => { 413 | let msg = MsgSyncSend { 414 | cookie: cookie, 415 | vnode: state.num(), 416 | seq: *count, 417 | key: k.clone(), 418 | value: v.clone(), 419 | }; 420 | let _ = stry!(db.fabric.send_msg(peer, &msg,)); 421 | inflight.insert(*count, msg, timeout); 422 | *count += 1; 423 | *last_send = now; 424 | metrics::SYNC_SEND.mark(1); 425 | continue; 426 | } 427 | Ok(None) => { 428 | break; 429 | } 430 | Err(_) => { 431 | error = true; 432 | break; 433 | } 434 | } 435 | } 436 | (error, inflight.is_empty()) 437 | } 438 | _ => unreachable!(), 439 | }; 440 | 441 | if error { 442 | self.send_error_fin(db, state, FabricError::SyncInterrupted) 443 | } else if inflight_empty { 444 | // do not trottle success fin as we don't know if last_send 445 | // was set by MsgSend or MsgFin 446 | self.send_sender_success_fin(db, state) 447 | } else { 448 | SyncResult::Continue 449 | } 450 | } 451 | 452 | // called by vnode when node is transition to an incompatible state 453 | // only valid for Receivers right now 454 | pub fn on_cancel(&mut self, db: &Database, state: &mut VNodeState) { 455 | match *self { 456 | BootstrapReceiver { .. } | SyncReceiver { .. } => { 457 | let _ = self.send_error_fin(db, state, FabricError::BadVNodeStatus); 458 | } 459 | _ => unreachable!(), 460 | } 461 | } 462 | 463 | // called by vnode as soon as the sync is unregistered 464 | pub fn on_remove(self, db: &Database, state: &mut VNodeState) { 465 | match self { 466 | SyncReceiver { peer, .. 
} => { 467 | assert!(state.sync_nodes.remove(&peer)); 468 | } 469 | _ => (), 470 | } 471 | 472 | db.signal_sync_end(self.direction()); 473 | } 474 | 475 | pub fn on_tick(&mut self, db: &Database, state: &mut VNodeState) -> SyncResult { 476 | match *self { 477 | SyncSender { 478 | last_recv, cookie, .. 479 | } 480 | | BootstrapSender { 481 | last_recv, cookie, .. 482 | } => if last_recv.elapsed() > Duration::from_millis(db.config.sync_timeout as _) { 483 | warn!("sync/boostrap sender timed out {:?}", cookie); 484 | SyncResult::Error 485 | } else { 486 | self.send_next(db, state) 487 | }, 488 | SyncReceiver { 489 | last_recv, 490 | recv_count, 491 | last_send, 492 | cookie, 493 | .. 494 | } 495 | | BootstrapReceiver { 496 | last_recv, 497 | recv_count, 498 | last_send, 499 | cookie, 500 | .. 501 | } => if last_recv.elapsed() > Duration::from_millis(db.config.sync_timeout as _) { 502 | warn!("sync/boostrap receiver timed out {:?}", cookie); 503 | SyncResult::Error 504 | } else if recv_count == 0 505 | && last_send.elapsed() > Duration::from_millis(db.config.sync_msg_timeout as _) 506 | { 507 | self.send_start(db, state) 508 | } else { 509 | SyncResult::Continue 510 | }, 511 | } 512 | } 513 | 514 | // called by vnode as soon as the sync is registered (after creation) 515 | pub fn on_start(&mut self, db: &Database, state: &mut VNodeState) { 516 | let _ = match *self { 517 | SyncReceiver { .. } | BootstrapReceiver { .. } => self.send_start(db, state), 518 | SyncSender { .. } | BootstrapSender { .. } => self.send_next(db, state), 519 | }; 520 | } 521 | 522 | pub fn on_msg_fin( 523 | &mut self, 524 | db: &Database, 525 | state: &mut VNodeState, 526 | msg: MsgSyncFin, 527 | ) -> SyncResult { 528 | match *self { 529 | SyncReceiver { peer, .. } | BootstrapReceiver { peer, .. 
} => { 530 | if msg.result.is_ok() { 531 | state.clocks.merge(msg.result.as_ref().unwrap()); 532 | state.save(db, false); 533 | // send it back as a form of ack-ack 534 | let _ = db.fabric.send_msg(peer, &msg); 535 | SyncResult::Done 536 | } else if msg.result.err() == Some(FabricError::NotReady) { 537 | SyncResult::Continue 538 | } else { 539 | SyncResult::Error 540 | } 541 | } 542 | SyncSender { .. } | BootstrapSender { .. } => { 543 | // Senders are always Done on SyncFin messages 544 | SyncResult::Done 545 | } 546 | } 547 | } 548 | 549 | pub fn on_msg_send(&mut self, db: &Database, state: &mut VNodeState, msg: MsgSyncSend) { 550 | match *self { 551 | SyncReceiver { 552 | peer, 553 | ref mut recv_count, 554 | ref mut last_recv, 555 | ref mut last_send, 556 | .. 557 | } 558 | | BootstrapReceiver { 559 | peer, 560 | ref mut recv_count, 561 | ref mut last_recv, 562 | ref mut last_send, 563 | .. 564 | } => { 565 | // TODO: what to do with errors here? 566 | state 567 | .storage_set_remote(db, vec![(msg.key, msg.value, false)]) 568 | .unwrap(); 569 | 570 | let _ = db.fabric.send_msg( 571 | peer, 572 | &MsgSyncAck { 573 | cookie: msg.cookie, 574 | vnode: state.num(), 575 | seq: msg.seq, 576 | }, 577 | ); 578 | 579 | *recv_count += 1; 580 | let now = Instant::now(); 581 | *last_recv = now; 582 | *last_send = now; 583 | metrics::SYNC_RECV.mark(1); 584 | } 585 | _ => unreachable!(), 586 | } 587 | } 588 | 589 | pub fn on_msg_ack(&mut self, db: &Database, state: &mut VNodeState, msg: MsgSyncAck) { 590 | match *self { 591 | SyncSender { 592 | ref mut inflight, 593 | ref mut last_recv, 594 | .. 595 | } 596 | | BootstrapSender { 597 | ref mut inflight, 598 | ref mut last_recv, 599 | .. 600 | } => { 601 | inflight.remove(&msg.seq); 602 | *last_recv = Instant::now(); 603 | } 604 | _ => unreachable!(), 605 | } 606 | let _ = self.send_next(db, state); 607 | } 608 | 609 | pub fn direction(&self) -> SyncDirection { 610 | match *self { 611 | BootstrapReceiver { .. 
} | SyncReceiver { .. } => SyncDirection::Incomming, 612 | BootstrapSender { .. } | SyncSender { .. } => SyncDirection::Outgoing, 613 | } 614 | } 615 | } 616 | -------------------------------------------------------------------------------- /src/workers.rs: -------------------------------------------------------------------------------- 1 | use crossbeam_channel as chan; 2 | use std::sync::atomic::{AtomicUsize, Ordering}; 3 | use std::sync::Arc; 4 | use std::{thread, time}; 5 | 6 | pub trait ExitMsg { 7 | fn exit_msg() -> Self; 8 | fn is_exit(&self) -> bool; 9 | } 10 | 11 | /// A Sender attached to a WorkerManager 12 | /// messages are distributed to threads in a Round-Robin manner. 13 | pub struct WorkerSender { 14 | cursor: AtomicUsize, 15 | alive_threads: Arc, 16 | channels: Vec>, 17 | } 18 | 19 | impl Clone for WorkerSender { 20 | fn clone(&self) -> Self { 21 | WorkerSender { 22 | cursor: Default::default(), 23 | channels: self.channels.clone(), 24 | alive_threads: self.alive_threads.clone(), 25 | } 26 | } 27 | } 28 | 29 | /// A thread pool containing threads prepared to receive WorkerMsg's 30 | pub struct WorkerManager { 31 | thread_count: usize, 32 | threads: Vec>, 33 | name: String, 34 | alive_threads: Arc, 35 | channels: Vec>, 36 | } 37 | 38 | impl WorkerManager { 39 | pub fn new(name: String, thread_count: usize) -> Self { 40 | assert!(thread_count > 0); 41 | WorkerManager { 42 | thread_count: thread_count, 43 | threads: Default::default(), 44 | name: name, 45 | alive_threads: Default::default(), 46 | channels: Default::default(), 47 | } 48 | } 49 | 50 | pub fn start(&mut self, mut worker_fn_gen: F) 51 | where 52 | F: FnMut() -> Box, 53 | { 54 | assert!(self.channels.is_empty()); 55 | for i in 0..self.thread_count { 56 | // since neither closure cloning or Box are stable use Box 57 | let mut worker_fn = worker_fn_gen(); 58 | let (tx, rx) = chan::unbounded(); 59 | let alive_handle = self.alive_threads.clone(); 60 | self.channels.push(tx); 61 | 
self.threads.push( 62 | thread::Builder::new() 63 | .name(format!("Worker:{}:{}", i, self.name)) 64 | .spawn(move || { 65 | alive_handle.fetch_add(1, Ordering::SeqCst); 66 | for m in rx { 67 | if m.is_exit() { 68 | break; 69 | } 70 | worker_fn(m); 71 | } 72 | alive_handle.fetch_sub(1, Ordering::SeqCst); 73 | info!("Exiting worker"); 74 | }).unwrap(), 75 | ); 76 | } 77 | } 78 | 79 | pub fn sender(&self) -> WorkerSender { 80 | assert!(!self.channels.is_empty()); 81 | WorkerSender { 82 | cursor: Default::default(), 83 | channels: self.channels.clone(), 84 | alive_threads: self.alive_threads.clone(), 85 | } 86 | } 87 | } 88 | 89 | impl WorkerSender { 90 | pub fn send(&self, msg: T) -> bool { 91 | let cursor = self.cursor.fetch_add(1, Ordering::Relaxed); 92 | self.send_to(cursor, msg) 93 | } 94 | 95 | pub fn send_to(&self, seed: usize, msg: T) -> bool { 96 | self.channels[seed % self.channels.len()].send(msg); 97 | self.alive_threads.load(Ordering::SeqCst) > 0 98 | } 99 | } 100 | 101 | impl Drop for WorkerManager { 102 | fn drop(&mut self) { 103 | for c in &*self.channels { 104 | let _ = c.send(T::exit_msg()); 105 | } 106 | for t in self.threads.drain(..) 
{ 107 |             let _ = t.join(); 108 |         } 109 |     } 110 | } 111 | 112 | pub fn timer_fn( 113 |     name: String, 114 |     interval: time::Duration, 115 |     mut callback: F, 116 | ) -> thread::JoinHandle<()> 117 | where 118 |     F: FnMut(time::Instant) -> bool + Send + 'static, 119 | { 120 |     thread::Builder::new() 121 |         .name(format!("Timer:{}", name)) 122 |         .spawn(move || loop { 123 |             thread::sleep(interval); 124 |             if !callback(time::Instant::now()) { 125 |                 break; 126 |             } 127 |         }).expect("Can't start timer") 128 | } 129 | -------------------------------------------------------------------------------- /sucredb.yaml: -------------------------------------------------------------------------------- 1 | # NOTE: 2 | # to specify "size" you need to use a suffix, like: 3 | # 4 | # 1b => 1 byte 5 | # 1k | 1kb => 1024 bytes 6 | # 1m | 1mb => 1024*1024 bytes 7 | # 1g | 1gb => 1024*1024*1024 bytes 8 | # 9 | # to specify "time" you need to use a suffix, like: 10 | # 11 | # 1ms => 1 millisecond 12 | # 1s => 1 second 13 | # 1m => 60 seconds 14 | # 1h => 60 minutes 15 | 16 | # ====== GENERAL CONFIGURATION ====== 17 | 18 | # Location of data directory in the file system 19 | data_dir: "./data" 20 | 21 | # Seed nodes when joining a cluster 22 | # seed_nodes: ["123.123.123:16379"] 23 | seed_nodes: [] 24 | 25 | # Cluster name, must be the same for nodes to "see" each other 26 | cluster_name: "default" 27 | 28 | # Ip and port to bind the socket for client connections 29 | listen_addr: "127.0.0.1:6379" 30 | 31 | # Ip and port to bind the socket for internal cluster connections 32 | fabric_addr: "127.0.0.1:16379" 33 | 34 | # Timeout for client requests 35 | # request_timeout: "1000ms" 36 | 37 | # Resolution for internal tasks timer 38 | # worker_timer: "500ms" 39 | 40 | # Number of worker threads 41 | # Defaults to max(4, 1 + cpucount * 2) 42 | # worker_count: 4 43 | 44 | # Maximum number of incoming syncs 45 | # sync_incomming_max: 10 46 | 47 | # Maximum number of outgoing syncs 48 | # sync_outgoing_max:
10 49 | 50 | # Maximum number of client connections 51 | # client_connection_max: 100 52 | 53 | # logging configuration, log4rs style 54 | logging: 55 |   appenders: 56 |     console: 57 |       kind: "console" 58 |       target: "stderr" 59 |     file: 60 |       kind: "file" 61 |       path: "./sucredb.log" 62 | 63 |   loggers: 64 |     sucredb: 65 |       level: "debug" 66 |       appenders: ["file", "console"] 67 | 68 | # ====== ADVANCED CONFIGURATION ====== 69 | 70 | # Amount of time to wait before aborting an unresponsive sync 71 | # sync_timeout: "10s" 72 | 73 | # Timeout for sync messages 74 | # sync_msg_timeout: "1000ms" 75 | 76 | # Maximum number of sync messages inflight (per sync) 77 | # sync_msg_inflight: 10 78 | 79 | # Maximum number of conflicting versions for a given value 80 | # value_version_max: 100 81 | --------------------------------------------------------------------------------