├── .gitignore ├── FunctionalTest.cc ├── LICENSE ├── Main.cc ├── Makefile ├── NutcrackerConsistentHashRing.cc ├── NutcrackerConsistentHashRing.hh ├── Protocol.cc ├── Protocol.hh ├── ProtocolTest.cc ├── Proxy.cc ├── Proxy.hh ├── README ├── redis-shatter.conf.json ├── redis.conf ├── run_multiple_redis.sh └── run_tests.sh /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | *.o 3 | gmon.out 4 | redis-shatter 5 | ProtocolTest 6 | FunctionalTest 7 | -------------------------------------------------------------------------------- /FunctionalTest.cc: -------------------------------------------------------------------------------- 1 | #define _STDC_FORMAT_MACROS 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | #include "Protocol.hh" 18 | 19 | using namespace std; 20 | 21 | 22 | shared_ptr parse_response(const char* contents) { 23 | unique_ptr buf(evbuffer_new(), 24 | evbuffer_free); 25 | evbuffer_add(buf.get(), contents, strlen(contents)); 26 | return ResponseParser().resume(buf.get()); 27 | } 28 | 29 | shared_ptr test_expect_response(const char* host, int port, 30 | const char* expected_response, ...) 
{ 31 | 32 | DataCommand cmd; 33 | 34 | va_list va; 35 | va_start(va, expected_response); 36 | const char* arg; 37 | while ((arg = va_arg(va, const char*))) { 38 | cmd.args.emplace_back(arg); 39 | } 40 | va_end(va); 41 | 42 | shared_ptr r; 43 | { 44 | scoped_fd fd = connect(host, port, false); // not nonblocking 45 | expect_ge(fd, 0); 46 | 47 | unique_ptr buf(evbuffer_new(), 48 | evbuffer_free); 49 | cmd.write(buf.get()); 50 | evbuffer_write(buf.get(), fd); 51 | evbuffer_drain(buf.get(), evbuffer_get_length(buf.get())); 52 | 53 | evbuffer_read(buf.get(), fd, 1024 * 128); 54 | r = ResponseParser().resume(buf.get()); 55 | } 56 | 57 | if (expected_response) { 58 | shared_ptr expected_r = parse_response(expected_response); 59 | expect(expected_r.get()); // if this fails, the test itself is broken 60 | 61 | if (!r.get()) { 62 | fprintf(stderr, "cmd = "); 63 | cmd.print(stderr); 64 | fprintf(stderr, "\nexpected = "); 65 | expected_r->print(stderr); 66 | fprintf(stderr, "\nactual = (not present)\n"); 67 | expect(false); 68 | } 69 | if (*r != *expected_r) { 70 | fprintf(stderr, "cmd = "); 71 | cmd.print(stderr); 72 | fprintf(stderr, "\nexpected = "); 73 | expected_r->print(stderr); 74 | fprintf(stderr, "\nactual = "); 75 | r->print(stderr); 76 | fprintf(stderr, "\n"); 77 | expect(false); 78 | } 79 | } 80 | 81 | return expected_response ? 
NULL : r; 82 | } 83 | 84 | int main(int argc, char* argv[]) { 85 | 86 | printf("functional tests\n"); 87 | printf("we expect redis-shatter to be running with all backends connected\n"); 88 | 89 | { 90 | printf("-- unimplemented commands return PROXYERROR\n"); 91 | 92 | const vector unimplemented_commands = { 93 | "AUTH", "BLPOP", "BRPOP", "BRPOPLPUSH", "DISCARD", "EXEC", "MONITOR", 94 | "MOVE", "MULTI", "PSUBSCRIBE", "PUBSUB", "PUBLISH", "PUNSUBSCRIBE", 95 | "SELECT", "SLAVEOF", "SUBSCRIBE", "SYNC", "UNSUBSCRIBE", "UNWATCH", 96 | "WATCH"}; 97 | 98 | for (const auto& cmd : unimplemented_commands) { 99 | test_expect_response("localhost", 6379, 100 | "-PROXYERROR command not supported\r\n", cmd.c_str(), NULL); 101 | } 102 | } 103 | 104 | { 105 | printf("-- PING\n"); 106 | test_expect_response("localhost", 6379, "+PONG\r\n", "PING", NULL); 107 | } 108 | 109 | { 110 | printf("-- ECHO\n"); 111 | test_expect_response("localhost", 6379, "$3\r\nLOL\r\n", "ECHO", "LOL", NULL); 112 | } 113 | 114 | { 115 | printf("-- FLUSHALL, DBSIZE\n"); 116 | test_expect_response("localhost", 6379, "+OK\r\n", "FLUSHALL", NULL); 117 | test_expect_response("localhost", 6379, ":0\r\n", "DBSIZE", NULL); 118 | } 119 | 120 | { 121 | printf("-- GET, SET, GETSET, MGET, MSET, DEL\n"); 122 | test_expect_response("localhost", 6379, "$-1\r\n", "GET", "x", NULL); 123 | test_expect_response("localhost", 6379, "+OK\r\n", "SET", "x", "23", NULL); 124 | test_expect_response("localhost", 6379, "$2\r\n23\r\n", "GET", "x", NULL); 125 | test_expect_response("localhost", 6379, "$2\r\n23\r\n", "GETSET", "x", "45", NULL); 126 | test_expect_response("localhost", 6379, "$2\r\n45\r\n", "GET", "x", NULL); 127 | test_expect_response("localhost", 6379, "*3\r\n$2\r\n45\r\n$-1\r\n$-1\r\n", "MGET", "x", "y", "z", NULL); 128 | test_expect_response("localhost", 6379, "+OK\r\n", "MSET", "x", "1", "y", "2", "z", "3", NULL); 129 | test_expect_response("localhost", 6379, "*3\r\n$1\r\n1\r\n$1\r\n2\r\n$1\r\n3\r\n", "MGET", "x", 
"y", "z", NULL); 130 | test_expect_response("localhost", 6379, ":2\r\n", "DEL", "x", "y", "w", NULL); 131 | } 132 | 133 | { 134 | printf("-- proxy commands: FORWARD, BACKENDS, BACKENDNUM\n"); 135 | test_expect_response("localhost", 6379, "+PONG\r\n", "FORWARD", "0", "PING", NULL); 136 | 137 | auto r = test_expect_response("localhost", 6379, NULL, "BACKENDS", NULL); 138 | expect_eq(r->type, Response::Type::Multi); 139 | size_t num_backends = r->fields.size(); 140 | printf("---- note: there are %zu backends\n", num_backends); 141 | 142 | r = test_expect_response("localhost", 6379, NULL, "BACKENDNUM", "z", NULL); 143 | expect_eq(r->type, Response::Type::Integer); 144 | int64_t z_backend = r->int_value; 145 | printf("---- note: \'z\' goes to backend %" PRId64 "\n", z_backend); 146 | 147 | string z_backend_str = string_printf("%" PRId64, z_backend); 148 | test_expect_response("localhost", 6379, "$1\r\n3\r\n", "GET", "z", NULL); 149 | test_expect_response("localhost", 6379, "$1\r\n3\r\n", "FORWARD", 150 | z_backend_str.c_str(), "GET", "z", NULL); 151 | } 152 | 153 | { 154 | printf("-- FLUSHDB, DBSIZE\n"); 155 | test_expect_response("localhost", 6379, "+OK\r\n", "FLUSHDB", NULL); 156 | test_expect_response("localhost", 6379, ":0\r\n", "DBSIZE", NULL); 157 | test_expect_response("localhost", 6379, "*3\r\n$-1\r\n$-1\r\n$-1\r\n", "MGET", "x", "y", "z", NULL); 158 | } 159 | 160 | { 161 | printf("-- MSETNX, RENAME\n"); 162 | test_expect_response("localhost", 6379, "-PROXYERROR keys are on different backends\r\n", "MSETNX", "x{abc}", "a", "y{abc}", "b", "z{bbc}", "b", NULL); 163 | test_expect_response("localhost", 6379, ":1\r\n", "MSETNX", "x{abc}", "a", "y{abc}", "b", NULL); 164 | test_expect_response("localhost", 6379, ":0\r\n", "MSETNX", "x{abc}", "a", "y{abc}", "b", "z{abc}", "c", NULL); 165 | test_expect_response("localhost", 6379, ":1\r\n", "MSETNX", "z{abd}", "b", NULL); 166 | 167 | // make sure the keys are on the same backend 168 | auto backend_x_resp = 
test_expect_response("localhost", 6379, NULL, "BACKENDNUM", "x{abc}", NULL); 169 | auto backend_y_resp = test_expect_response("localhost", 6379, NULL, "BACKENDNUM", "y{abc}", NULL); 170 | auto backend_z_resp = test_expect_response("localhost", 6379, NULL, "BACKENDNUM", "z{bbc}", NULL); 171 | expect_eq(*backend_x_resp, *backend_y_resp); 172 | expect_ne(*backend_x_resp, *backend_z_resp); 173 | 174 | test_expect_response("localhost", 6379, "-PROXYERROR keys are on different backends\r\n", "RENAME", "x{abc}", "x{bbc}", NULL); 175 | test_expect_response("localhost", 6379, "+OK\r\n", "RENAME", "x{abc}", "y{abc}", NULL); 176 | test_expect_response("localhost", 6379, "+OK\r\n", "RENAME", "y{abc}", "zxcvbnm{abc}", NULL); 177 | 178 | test_expect_response("localhost", 6379, "-PROXYERROR keys are on different backends\r\n", "RENAME", "z{bbc}", "z{abc}", NULL); 179 | test_expect_response("localhost", 6379, "+OK\r\n", "RENAME", "z{abd}", "y{abd}", NULL); 180 | test_expect_response("localhost", 6379, "+OK\r\n", "RENAME", "y{abd}", "zxcvbnm{abd}", NULL); 181 | } 182 | 183 | printf("all tests passed\n"); 184 | return 0; 185 | } 186 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2017 Martin Michelsen 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of 6 | this software and associated documentation files (the "Software"), to deal in 7 | the Software without restriction, including without limitation the rights to 8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 9 | the Software, and to permit persons to whom the Software is furnished to do so, 10 | subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 17 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 18 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 19 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 20 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 21 | -------------------------------------------------------------------------------- /Main.cc: -------------------------------------------------------------------------------- 1 | #define _STDC_FORMAT_MACROS 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | #ifdef __APPLE__ 17 | #include 18 | #include 19 | #endif 20 | 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | 30 | #include "NutcrackerConsistentHashRing.hh" 31 | #include "Proxy.hh" 32 | 33 | using namespace std; 34 | 35 | 36 | bool set_thread_affinity(pthread_t thread, int64_t cpu_id) { 37 | #ifdef __APPLE__ 38 | thread_affinity_policy_data_t pd; 39 | pd.affinity_tag = cpu_id + 1; 40 | return thread_policy_set(pthread_mach_thread_np(thread), 41 | THREAD_AFFINITY_POLICY, (thread_policy_t)&pd, 42 | THREAD_AFFINITY_POLICY_COUNT) == 0; 43 | 44 | #else // Linux 45 | cpu_set_t cpuset; 46 | CPU_ZERO(&cpuset); 47 | CPU_SET(cpu_id, &cpuset); 48 | return pthread_setaffinity_np(thread, sizeof(cpu_set_t), &cpuset) == 0; 49 | #endif 50 | } 51 | 52 | 53 | bool should_exit = false; 54 | 55 | void sigint_handler(int signum) { 56 | should_exit = true; 57 | } 58 | 59 | 60 | struct Options { 61 | struct ProxyOptions { 62 | size_t num_threads; 63 | int64_t affinity_cpus; 64 | 65 | string listen_addr; 66 | int port; 67 | int listen_fd; 68 | 69 
| vector backend_netlocs; 70 | unordered_set commands_to_disable; 71 | 72 | uint8_t hash_precision; 73 | int hash_begin_delimiter; 74 | int hash_end_delimiter; 75 | 76 | ProxyOptions() : num_threads(1), affinity_cpus(0), listen_addr(""), 77 | port(6379), listen_fd(-1), backend_netlocs(), commands_to_disable(), 78 | hash_precision(17), hash_begin_delimiter(-1), hash_end_delimiter(-1) { } 79 | 80 | void print(FILE* stream, const char* name) const { 81 | fprintf(stream, "[%s] %zu worker thread(s)\n", name, this->num_threads); 82 | if (this->affinity_cpus) { 83 | fprintf(stream, "[%s] set thread affinity for cores with mask %016" PRIX64 "\n", 84 | name, this->affinity_cpus); 85 | } else { 86 | fprintf(stream, "[%s] don\'t set thread affinity\n", name); 87 | } 88 | if (this->listen_fd >= 0) { 89 | fprintf(stream, "[%s] accept connections on fd %d\n", name, 90 | this->listen_fd); 91 | } else if (!this->listen_addr.empty()) { 92 | fprintf(stream, "[%s] listen on %s:%d\n", name, 93 | this->listen_addr.c_str(), this->port); 94 | } else { 95 | fprintf(stream, "[%s] listen on port %d on all interfaces\n", name, 96 | this->port); 97 | } 98 | 99 | for (const auto& backend_netloc : this->backend_netlocs) { 100 | fprintf(stream, "[%s] register backend %s\n", name, 101 | backend_netloc.c_str()); 102 | } 103 | 104 | for (const auto& command : this->commands_to_disable) { 105 | fprintf(stream, "[%s] disable command %s\n", name, command.c_str()); 106 | } 107 | 108 | if (this->hash_begin_delimiter >= 0) { 109 | fprintf(stream, "[%s] hash begin delimiter is 0x%02X\n", name, 110 | this->hash_begin_delimiter); 111 | } 112 | if (this->hash_end_delimiter >= 0) { 113 | fprintf(stream, "[%s] hash end delimiter is 0x%02X\n", name, 114 | this->hash_end_delimiter); 115 | } 116 | } 117 | 118 | void validate() const { 119 | if (this->backend_netlocs.empty()) { 120 | throw invalid_argument("no backends specified"); 121 | } 122 | } 123 | }; 124 | 125 | unordered_map name_to_proxy_options; 126 | 127 
| Options() = delete; 128 | Options(Options&&) = default; 129 | Options(const Options&) = default; 130 | Options(const char* filename) { 131 | string json; 132 | if (!strcmp(filename, "-")) { 133 | scoped_fd fd(0); 134 | json = read_all(fd); 135 | } else { 136 | scoped_fd fd(filename, O_RDONLY); 137 | json = read_all(fd); 138 | } 139 | shared_ptr config = JSONObject::parse(json); 140 | 141 | if (!config->is_dict()) { 142 | throw invalid_argument("configuration is not a dictionary"); 143 | } 144 | 145 | for (const auto& proxy_config_it : config->as_dict()) { 146 | const string& proxy_name = proxy_config_it.first; 147 | const auto& proxy_config = proxy_config_it.second->as_dict(); 148 | 149 | ProxyOptions& options = this->name_to_proxy_options.emplace( 150 | piecewise_construct, forward_as_tuple(proxy_name), forward_as_tuple()) 151 | .first->second; 152 | 153 | try { 154 | options.num_threads = proxy_config.at("num_threads")->as_int(); 155 | if (options.num_threads == 0) { 156 | options.num_threads = thread::hardware_concurrency(); 157 | } 158 | } catch (const out_of_range& e) { } 159 | 160 | try { 161 | options.affinity_cpus = proxy_config.at("affinity_cpus")->as_int(); 162 | } catch (const out_of_range& e) { } 163 | 164 | try { 165 | options.listen_addr = proxy_config.at("interface")->as_string(); 166 | } catch (const out_of_range& e) { } 167 | 168 | try { 169 | options.port = proxy_config.at("port")->as_int(); 170 | } catch (const out_of_range& e) { } 171 | 172 | try { 173 | options.hash_precision = proxy_config.at("hash_precision")->as_int(); 174 | } catch (const out_of_range& e) { } 175 | 176 | try { 177 | const auto& s = proxy_config.at("hash_field_begin")->as_string(); 178 | if (s.size() != 1) { 179 | throw invalid_argument("hash_field_begin is not a 1-char string"); 180 | } 181 | options.hash_begin_delimiter = s[0]; 182 | } catch (const out_of_range& e) { } 183 | 184 | try { 185 | const auto& s = proxy_config.at("hash_field_end")->as_string(); 186 | if 
(s.size() != 1) { 187 | throw invalid_argument("hash_field_end is not a 1-char string"); 188 | } 189 | options.hash_end_delimiter = s[0]; 190 | } catch (const out_of_range& e) { } 191 | 192 | try { 193 | for (const auto& command : proxy_config.at("disable_commands")->as_list()) { 194 | options.commands_to_disable.emplace(command->as_string()); 195 | } 196 | } catch (const out_of_range& e) { } 197 | 198 | try { 199 | for (const auto& backend_it : proxy_config.at("backends")->as_dict()) { 200 | const auto& backend_name = backend_it.first; 201 | const auto& backend_netloc = backend_it.second->as_string(); 202 | 203 | options.backend_netlocs.emplace_back(string_printf("%s@%s", 204 | backend_netloc.c_str(), backend_name.c_str())); 205 | } 206 | } catch (const out_of_range& e) { } 207 | } 208 | } 209 | 210 | void print(FILE* stream) const { 211 | fprintf(stream, "%zu proxy instance(s) defined\n", 212 | this->name_to_proxy_options.size()); 213 | for (const auto& it : this->name_to_proxy_options) { 214 | it.second.print(stream, it.first.c_str()); 215 | } 216 | } 217 | 218 | void validate() const { 219 | for (const auto& it : this->name_to_proxy_options) { 220 | it.second.validate(); 221 | } 222 | } 223 | }; 224 | 225 | 226 | 227 | int main(int argc, char** argv) { 228 | 229 | log(INFO, "> fuzziqer software redis-shatter"); 230 | 231 | // parse command-line args 232 | if (argc > 2) { 233 | log(ERROR, "usage: %s [config-filename]", argv[0]); 234 | return 1; 235 | } 236 | const char* config_filename = (argc == 2) ? 
argv[1] : "redis-shatter.conf.json"; 237 | Options opt(config_filename); 238 | opt.print(stderr); 239 | opt.validate(); 240 | 241 | srand(getpid() ^ time(NULL)); 242 | signal(SIGPIPE, SIG_IGN); 243 | signal(SIGINT, sigint_handler); 244 | 245 | vector threads; 246 | vector> proxies; 247 | 248 | // start all the proxies 249 | vector cpu_to_thread_count(thread::hardware_concurrency()); 250 | for (auto& proxy_options_it : opt.name_to_proxy_options) { 251 | const char* proxy_name = proxy_options_it.first.c_str(); 252 | auto& proxy_options = proxy_options_it.second; 253 | 254 | // if there's no listening socket from a parent process, open a new one 255 | if (proxy_options.listen_fd == -1) { 256 | proxy_options.listen_fd = listen(proxy_options.listen_addr, 257 | proxy_options.port, SOMAXCONN); 258 | if (!proxy_options.listen_addr.empty()) { 259 | log(INFO, "[%s] opened server socket %d on %s:%d", proxy_name, 260 | proxy_options.listen_fd, proxy_options.listen_addr.c_str(), 261 | proxy_options.port); 262 | } else { 263 | log(INFO, "[%s] opened server socket %d on port %d", proxy_name, 264 | proxy_options.listen_fd, proxy_options.port); 265 | } 266 | 267 | } else { 268 | fprintf(stderr, "[%s] using server socket %d from parent process\n", 269 | proxy_name, proxy_options.listen_fd); 270 | } 271 | 272 | evutil_make_socket_nonblocking(proxy_options.listen_fd); 273 | 274 | fprintf(stderr, "[%s] setting up configuration\n", proxy_name); 275 | auto hosts = ConsistentHashRing::Host::parse_netloc_list( 276 | proxy_options.backend_netlocs, 6379); 277 | shared_ptr ring; 278 | if (proxy_options.hash_precision) { 279 | ring.reset(new ConstantTimeConsistentHashRing( 280 | hosts, proxy_options.hash_precision)); 281 | } else { 282 | ring.reset(new NutcrackerConsistentHashRing(hosts)); 283 | } 284 | shared_ptr stats(new Proxy::Stats()); 285 | 286 | fprintf(stderr, "[%s] starting %zu proxy instances\n", proxy_name, 287 | proxy_options.num_threads); 288 | while (threads.size() < 
proxy_options.num_threads) { 289 | proxies.emplace_back(new Proxy(proxy_options.listen_fd, ring, 290 | proxy_options.hash_begin_delimiter, proxy_options.hash_end_delimiter, 291 | stats, proxies.size())); 292 | for (const auto& command : proxy_options.commands_to_disable) { 293 | proxies.back()->disable_command(command); 294 | } 295 | 296 | // run the thread on the least-loaded cpu 297 | int64_t min_load_cpu = -1; 298 | for (int64_t cpu_id = 0; cpu_id < static_cast(cpu_to_thread_count.size()); cpu_id++) { 299 | if ((proxy_options.affinity_cpus & (1 << cpu_id)) && 300 | ((min_load_cpu < 0) || 301 | (cpu_to_thread_count[cpu_id] < cpu_to_thread_count[min_load_cpu]))) { 302 | min_load_cpu = cpu_id; 303 | } 304 | } 305 | 306 | threads.emplace_back(&Proxy::serve, proxies.back().get()); 307 | if (min_load_cpu >= 0) { 308 | if (set_thread_affinity(threads.back().native_handle(), min_load_cpu)) { 309 | cpu_to_thread_count[min_load_cpu]++; 310 | fprintf(stderr, "[%s] created worker thread on core %" PRId64 "\n", 311 | proxy_name, min_load_cpu); 312 | } else { 313 | fprintf(stderr, "[%s] created worker thread, but failed to bind to core %" PRId64 "\n", 314 | proxy_name, min_load_cpu); 315 | } 316 | } else { 317 | fprintf(stderr, "[%s] created worker thread\n", proxy_name); 318 | } 319 | } 320 | } 321 | 322 | fprintf(stderr, "ready for connections\n"); 323 | sigset_t sigset; 324 | sigemptyset(&sigset); 325 | while (!should_exit) { 326 | sigsuspend(&sigset); 327 | } 328 | 329 | fprintf(stderr, "stopping proxy instances\n"); 330 | for (auto& p : proxies) { 331 | p->stop(); 332 | } 333 | 334 | fprintf(stderr, "waiting for proxy instances to terminate\n"); 335 | for (auto& t : threads) { 336 | t.join(); 337 | } 338 | 339 | return 0; 340 | } 341 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | CXX=g++ 2 | OBJECTS=NutcrackerConsistentHashRing.o Protocol.o 
Proxy.o Main.o 3 | CXXFLAGS=-g -Wall -Werror -std=c++14 -I/opt/local/include 4 | LDFLAGS=-levent -lphosg -lpthread -g -std=c++14 -L/opt/local/lib 5 | EXECUTABLE=redis-shatter 6 | 7 | TESTS=ProtocolTest FunctionalTest 8 | 9 | all: $(EXECUTABLE) $(TESTS) 10 | 11 | $(EXECUTABLE): $(OBJECTS) 12 | g++ -o $(EXECUTABLE) $^ $(LDFLAGS) 13 | 14 | test: all 15 | ./run_tests.sh 16 | 17 | ProtocolTest: ProtocolTest.o Protocol.o 18 | g++ -o ProtocolTest $^ $(LDFLAGS) 19 | 20 | FunctionalTest: FunctionalTest.o Protocol.o 21 | g++ -o FunctionalTest $^ $(LDFLAGS) 22 | 23 | clean: 24 | rm -rf *.dSYM *.o $(EXECUTABLE) $(TESTS) gmon.out 25 | 26 | .PHONY: clean 27 | -------------------------------------------------------------------------------- /NutcrackerConsistentHashRing.cc: -------------------------------------------------------------------------------- 1 | #include "NutcrackerConsistentHashRing.hh" 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | using namespace std; 15 | 16 | 17 | 18 | NutcrackerConsistentHashRing::Point::Point(uint32_t index, uint32_t value) : index(index), 19 | value(value) { } 20 | 21 | 22 | 23 | #define KETAMA_POINTS_PER_SERVER 160 24 | #define KETAMA_POINTS_PER_HASH 4 25 | #define KETAMA_MAX_HOSTLEN 256 26 | 27 | NutcrackerConsistentHashRing::NutcrackerConsistentHashRing( 28 | const vector& hosts) : ConsistentHashRing(hosts) { 29 | if (this->hosts.empty()) { 30 | throw invalid_argument("no hosts in continuum"); 31 | } 32 | 33 | uint64_t total_weight = this->hosts.size(); 34 | 35 | for (size_t host_index = 0; host_index < this->hosts.size(); host_index++) { 36 | const auto& host = this->hosts[host_index]; 37 | 38 | float pct = 1.0 / (float)total_weight; 39 | size_t points_per_host = (size_t)((floorf((float) (pct * KETAMA_POINTS_PER_SERVER / 4 * (float)this->hosts.size() + 0.0000000001))) * 4); 40 | 41 | for (size_t point_index = 0; point_index <= (points_per_host / 
KETAMA_POINTS_PER_HASH) - 1; point_index++) { 42 | char point_data[KETAMA_MAX_HOSTLEN]; 43 | size_t point_data_size = snprintf(point_data, KETAMA_MAX_HOSTLEN, 44 | "%s-%zu", host.name.c_str(), point_index); 45 | string hash = md5(point_data, point_data_size); 46 | 47 | for (size_t x = 0; x < KETAMA_POINTS_PER_HASH; x++) { 48 | uint32_t value = (static_cast(hash[3 + x * 4] & 0xFF) << 24) | 49 | (static_cast(hash[2 + x * 4] & 0xFF) << 16) | 50 | (static_cast(hash[1 + x * 4] & 0xFF) << 8) | 51 | (static_cast(hash[0 + x * 4] & 0xFF)); 52 | this->points.emplace_back(host_index, value); 53 | } 54 | } 55 | } 56 | 57 | qsort(this->points.data(), this->points.size(), sizeof(this->points[0]), 58 | [](const void* t1, const void* t2) -> int { 59 | const Point* ct1 = reinterpret_cast(t1); 60 | const Point* ct2 = reinterpret_cast(t2); 61 | if (ct1->value == ct2->value) { 62 | return 0; 63 | } else if (ct1->value > ct2->value) { 64 | return 1; 65 | } else { 66 | return -1; 67 | } 68 | }); 69 | } 70 | 71 | uint64_t NutcrackerConsistentHashRing::host_id_for_key(const void* key, 72 | int64_t size) const { 73 | // TODO: use std::lower_bound here instead of manual binary search 74 | 75 | uint32_t hash32 = fnv1a64(key, size); 76 | 77 | const Point* left = this->points.data(); 78 | const Point* right = left + this->points.size(); 79 | 80 | while (left < right) { 81 | const Point* middle = left + (right - left) / 2; 82 | if (middle->value < hash32) { 83 | left = middle + 1; 84 | } else { 85 | right = middle; 86 | } 87 | } 88 | 89 | if (right == this->points.data() + this->points.size()) { 90 | return this->points[0].index; 91 | } 92 | return right->index; 93 | } 94 | -------------------------------------------------------------------------------- /NutcrackerConsistentHashRing.hh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | 11 | // this isn't the same as 
phosg's ConsistentHashRing; this one is designed to 12 | // mirror the implementation in twemproxy/nutcracker so that redis-shatter can 13 | // be used alongside it 14 | 15 | class NutcrackerConsistentHashRing : public ConsistentHashRing { 16 | public: 17 | NutcrackerConsistentHashRing() = delete; 18 | NutcrackerConsistentHashRing(const std::vector& hosts); 19 | virtual ~NutcrackerConsistentHashRing() = default; 20 | 21 | virtual uint64_t host_id_for_key(const void* key, int64_t size) const; 22 | 23 | protected: 24 | struct Point { 25 | uint32_t index; 26 | uint32_t value; 27 | 28 | Point(uint32_t index, uint32_t hash); 29 | }; 30 | 31 | std::vector points; 32 | }; 33 | -------------------------------------------------------------------------------- /Protocol.cc: -------------------------------------------------------------------------------- 1 | #include "Protocol.hh" 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | #include 14 | 15 | using namespace std; 16 | 17 | 18 | 19 | static size_t evbuffer_readln_into(struct evbuffer* buf, char* buffer, 20 | size_t buffer_size, enum evbuffer_eol_style eol_style, bool drain = true) { 21 | 22 | size_t eol_len; 23 | struct evbuffer_ptr ptr = evbuffer_search_eol(buf, NULL, &eol_len, eol_style); 24 | if (ptr.pos == -1) { 25 | throw out_of_range("no line available"); 26 | } 27 | 28 | if (ptr.pos < static_cast(buffer_size)) { 29 | evbuffer_copyout(buf, buffer, ptr.pos); 30 | buffer[ptr.pos] = 0; 31 | if (drain) { 32 | evbuffer_drain(buf, ptr.pos + eol_len); 33 | } 34 | return ptr.pos; 35 | } 36 | throw runtime_error("line too long"); 37 | } 38 | 39 | 40 | 41 | DataCommand::DataCommand(size_t num_args) { 42 | this->args.reserve(num_args); 43 | } 44 | 45 | void DataCommand::print(FILE* stream, int indent_level) const { 46 | 47 | if (indent_level < 0) { 48 | indent_level = -indent_level; 49 | } else { 50 | print_indent(stream, indent_level); 51 | } 
52 | 53 | fprintf(stream, "DataCommand[\n"); 54 | for (const auto& arg : this->args) { 55 | print_indent(stream, indent_level + 1); 56 | for (char ch : arg) { 57 | if (ch < 0x20 || ch > 0x7F) { 58 | fprintf(stream, "\\x%02X", ch); 59 | } else { 60 | fputc(ch, stream); 61 | } 62 | } 63 | fprintf(stream, ",\n"); 64 | } 65 | fprintf(stream, "]]"); 66 | } 67 | 68 | string DataCommand::format() const { 69 | string ret = "["; 70 | 71 | for (const auto& arg : this->args) { 72 | if (ret.size() > 1) { 73 | ret += ','; 74 | } 75 | ret += '\''; 76 | for (char ch : arg) { 77 | if (ch < 0x20 || ch > 0x7F) { 78 | ret += string_printf("\\x%02X", ch); 79 | } else if (ch == '\'') { 80 | ret += "\\\'"; 81 | } else { 82 | ret += ch; 83 | } 84 | } 85 | ret += '\''; 86 | } 87 | ret += ']'; 88 | 89 | return ret; 90 | } 91 | 92 | void DataCommand::write(struct evbuffer* buf) const { 93 | if (!buf) { 94 | return; 95 | } 96 | 97 | evbuffer_add_printf(buf, "*%zu\r\n", this->args.size()); 98 | 99 | for (const auto& arg : this->args) { 100 | evbuffer_add_printf(buf, "$%zu\r\n", arg.size()); 101 | evbuffer_add(buf, arg.data(), arg.size()); 102 | evbuffer_add(buf, "\r\n", 2); 103 | } 104 | } 105 | 106 | 107 | 108 | ReferenceCommand::DataReference::DataReference() : data(NULL), size(0) { } 109 | 110 | ReferenceCommand::DataReference::DataReference(const void* data, size_t size) : 111 | data(data), size(size) { } 112 | 113 | ReferenceCommand::DataReference::DataReference(const string& data) : 114 | data(data.data()), size(data.size()) { } 115 | 116 | ReferenceCommand::ReferenceCommand(size_t num_args) { 117 | this->args.reserve(num_args); 118 | } 119 | 120 | void ReferenceCommand::print(FILE* stream, int indent_level) const { 121 | 122 | if (indent_level < 0) { 123 | indent_level = -indent_level; 124 | } else { 125 | print_indent(stream, indent_level); 126 | } 127 | 128 | fprintf(stream, "ReferenceCommand[\n"); 129 | for (const auto& arg : this->args) { 130 | print_indent(stream, indent_level + 
1); 131 | for (size_t x = 0; x < arg.size; x++) { 132 | char ch = ((const char*)arg.data)[x]; 133 | if (ch < 0x20 || ch > 0x7F) { 134 | fprintf(stream, "\\x%02X", ch); 135 | } else { 136 | fputc(ch, stream); 137 | } 138 | } 139 | fprintf(stream, ",\n"); 140 | } 141 | fprintf(stream, "]]"); 142 | } 143 | 144 | string ReferenceCommand::format() const { 145 | string ret = "["; 146 | 147 | for (const auto& arg : this->args) { 148 | if (ret.size() > 1) { 149 | ret += ','; 150 | } 151 | ret += '\''; 152 | for (size_t x = 0; x < arg.size; x++) { 153 | char ch = ((const char*)arg.data)[x]; 154 | if (ch < 0x20 || ch > 0x7F) { 155 | ret += string_printf("\\x%02X", ch); 156 | } else if (ch == '\'') { 157 | ret += "\\\'"; 158 | } else { 159 | ret += ch; 160 | } 161 | } 162 | ret += '\''; 163 | } 164 | ret += ']'; 165 | 166 | return ret; 167 | } 168 | 169 | void ReferenceCommand::write(struct evbuffer* buf) const { 170 | if (!buf) { 171 | return; 172 | } 173 | 174 | evbuffer_add_printf(buf, "*%zu\r\n", this->args.size()); 175 | 176 | for (const auto& arg : this->args) { 177 | evbuffer_add_printf(buf, "$%zu\r\n", arg.size); 178 | evbuffer_add(buf, arg.data, arg.size); 179 | evbuffer_add(buf, "\r\n", 2); 180 | } 181 | } 182 | 183 | 184 | 185 | Response::Response(Response::Type type, int64_t size) : type(type), 186 | int_value(size) { 187 | switch (this->type) { 188 | case Type::Status: 189 | case Type::Error: 190 | case Type::Data: 191 | if (size > 0) { 192 | this->fields.reserve(size); 193 | } 194 | break; 195 | 196 | case Type::Integer: 197 | this->int_value = 0; 198 | break; 199 | 200 | case Type::Multi: 201 | if (size > 0) { 202 | this->data.reserve(size); 203 | } 204 | } 205 | } 206 | 207 | Response::Response(Type type, const char* fmt, ...) 
: type(type), int_value(0) { 208 | va_list va; 209 | va_start(va, fmt); 210 | this->data = string_vprintf(fmt, va); 211 | va_end(va); 212 | } 213 | 214 | Response::Response(Type type, const void* data, size_t size) : type(type), 215 | data((const char*)data, size), int_value(0) { } 216 | 217 | Response::Response(Type type, const string& data) : type(type), data(data), 218 | int_value(0) { } 219 | 220 | bool Response::operator==(const Response& other) const { 221 | if (this->type != other.type) { 222 | return false; 223 | } 224 | 225 | // check for nulls 226 | if (this->type == Type::Data || this->type == Type::Multi) { 227 | if ((this->int_value < 0) && (other.int_value < 0)) { 228 | return true; // both are null 229 | } 230 | if ((this->int_value < 0) || (other.int_value < 0)) { 231 | return false; // one is null but the other isn't 232 | } 233 | } 234 | 235 | switch (this->type) { 236 | case Type::Status: 237 | case Type::Error: 238 | case Type::Data: 239 | return this->data == other.data; 240 | 241 | case Type::Integer: 242 | return this->int_value == other.int_value; 243 | 244 | case Type::Multi: 245 | if (this->fields.size() != other.fields.size()) { 246 | return false; 247 | } 248 | for (size_t x = 0; x < this->fields.size(); x++) { 249 | if (*this->fields[x] != *other.fields[x]) { 250 | return false; 251 | } 252 | } 253 | return true; 254 | 255 | default: 256 | return false; 257 | } 258 | } 259 | 260 | bool Response::operator!=(const Response& other) const { 261 | return !(this->operator==(other)); 262 | } 263 | 264 | void Response::print(FILE* stream, int indent_level) const { 265 | 266 | if (indent_level < 0) { 267 | indent_level = -indent_level; 268 | } else { 269 | print_indent(stream, indent_level); 270 | } 271 | 272 | switch (this->type) { 273 | case Type::Status: 274 | fprintf(stream, "Response[type=Status, data=%s]", this->data.c_str()); 275 | break; 276 | 277 | case Type::Error: 278 | fprintf(stream, "Response[type=Error, data=%s]", 
this->data.c_str()); 279 | break; 280 | 281 | case Type::Integer: 282 | fprintf(stream, "Response[type=Integer, int_value=%" PRId64 "]", 283 | this->int_value); 284 | break; 285 | 286 | case Type::Data: 287 | if (this->int_value < 0) { 288 | fprintf(stream, "Response[type=Data, null]\n"); 289 | } else { 290 | fprintf(stream, "Response[type=Data, data="); 291 | for (char ch : data) { 292 | if (ch < 0x20 || ch > 0x7F) { 293 | fprintf(stream, "\\x%02X", ch); 294 | } else { 295 | fputc(ch, stream); 296 | } 297 | } 298 | fputc(']', stream); 299 | } 300 | break; 301 | 302 | case Type::Multi: 303 | if (this->int_value < 0) { 304 | fprintf(stream, "Response[type=Multi, null]"); 305 | } else { 306 | fprintf(stream, "Response[type=MULTI, fields=[\n"); 307 | for (const auto& resp : this->fields) { 308 | resp->print(stream, indent_level + 1); 309 | fprintf(stream, ",\n"); 310 | } 311 | print_indent(stream, indent_level); 312 | fprintf(stream, "]"); 313 | } 314 | break; 315 | 316 | default: 317 | fprintf(stream, "Response[type=Unknown]\n"); 318 | } 319 | } 320 | 321 | string Response::format() const { 322 | 323 | switch (this->type) { 324 | case Type::Status: 325 | return "(Status) " + this->data; 326 | 327 | case Type::Error: 328 | return "(Error) " + this->data; 329 | 330 | case Type::Integer: 331 | return string_printf("%" PRId64, this->int_value); 332 | 333 | case Type::Data: 334 | if (this->int_value < 0) { 335 | return "(Null)"; 336 | } else { 337 | string ret = "\'"; 338 | for (char ch : this->data) { 339 | if (ch < 0x20 || ch > 0x7F) { 340 | ret += string_printf("\\x%02X", ch); 341 | } else if (ch == '\'') { 342 | ret += "\\\'"; 343 | } else { 344 | ret += ch; 345 | } 346 | } 347 | ret += '\''; 348 | return ret; 349 | } 350 | break; 351 | 352 | case Type::Multi: 353 | if (this->int_value < 0) { 354 | return "(Null)"; 355 | } else { 356 | string ret = "["; 357 | for (const auto& f : this->fields) { 358 | if (ret.size() > 1) { 359 | ret += ", "; 360 | } 361 | ret += 
f->format(); 362 | } 363 | ret += "]"; 364 | return ret; 365 | } 366 | break; 367 | 368 | default: 369 | return string_printf("(UnknownType:%02" PRIX8 ")", (uint8_t)this->type); 370 | } 371 | } 372 | 373 | void Response::write(struct evbuffer* buf) const { 374 | 375 | if (!buf) { 376 | return; 377 | } 378 | 379 | switch (this->type) { 380 | case Type::Status: 381 | case Type::Error: 382 | this->write_string(buf, this->data.data(), this->data.size(), 383 | (char)this->type); 384 | break; 385 | 386 | case Type::Integer: 387 | this->write_int(buf, this->int_value, (char)Type::Integer); 388 | break; 389 | 390 | case Type::Data: 391 | if (this->int_value >= 0) { 392 | this->write_int(buf, this->data.size(), (char)Type::Data); 393 | evbuffer_add(buf, this->data.data(), this->data.size()); 394 | evbuffer_add(buf, "\r\n", 2); 395 | } else { 396 | evbuffer_add(buf, "$-1\r\n", 5); 397 | } 398 | break; 399 | 400 | case Type::Multi: 401 | if (this->int_value >= 0) { 402 | this->write_int(buf, this->fields.size(), (char)Type::Multi); 403 | for (const auto& field : this->fields) { 404 | field->write(buf); 405 | } 406 | } else { 407 | evbuffer_add(buf, "*-1\r\n", 5); 408 | } 409 | break; 410 | 411 | default: 412 | throw runtime_error("invalid response type in write()"); 413 | } 414 | } 415 | 416 | void Response::write_string(struct evbuffer* buf, const char* string, 417 | char sentinel) { 418 | if (!buf) { 419 | return; 420 | } 421 | if (sentinel == Response::Type::Data) { 422 | evbuffer_add_printf(buf, "$%zu\r\n%s\r\n", strlen(string), string); 423 | } else { 424 | evbuffer_add_printf(buf, "%c%s\r\n", sentinel, string); 425 | } 426 | } 427 | 428 | void Response::write_string(struct evbuffer* buf, const void* string, 429 | size_t size, char sentinel) { 430 | if (!buf) { 431 | return; 432 | } 433 | if (sentinel == Response::Type::Data) { 434 | evbuffer_add_printf(buf, "$%zu\r\n", size); 435 | } else { 436 | evbuffer_add(buf, &sentinel, 1); 437 | } 438 | evbuffer_add(buf, string, 
size); 439 | evbuffer_add(buf, "\r\n", 2); 440 | } 441 | 442 | void Response::write_int(struct evbuffer* buf, int64_t value, 443 | char sentinel) { 444 | if (!buf) { 445 | return; 446 | } 447 | evbuffer_add_printf(buf, "%c%" PRId64 "\r\n", sentinel, value); 448 | } 449 | 450 | 451 | 452 | CommandParser::CommandParser() : state(State::Initial), error_str(NULL) { } 453 | 454 | const char* CommandParser::error() const { 455 | return this->error_str; 456 | } 457 | 458 | shared_ptr CommandParser::resume(struct evbuffer* buf) { 459 | char input_line[0x100]; 460 | for (;;) { 461 | switch (this->state) { 462 | case State::Initial: { 463 | // expect "*num_args\r\n", or inline command 464 | try { 465 | evbuffer_readln_into(buf, input_line, sizeof(input_line), 466 | EVBUFFER_EOL_CRLF); 467 | } catch (const out_of_range&) { 468 | return NULL; // complete line not yet available 469 | } catch (const runtime_error& e) { 470 | this->error_str = "line too long"; 471 | return NULL; 472 | } 473 | 474 | if (input_line[0] != '*') { 475 | // this is an inline command; split it on spaces 476 | shared_ptr cmd(new DataCommand()); 477 | auto& args = cmd->args; 478 | 479 | size_t arg_start_offset = 0; 480 | for (size_t x = 0; input_line[x];) { 481 | // find the end of the current token 482 | for (; input_line[x] && (input_line[x] != ' '); x++); 483 | 484 | args.emplace_back(&input_line[arg_start_offset], x - arg_start_offset); 485 | 486 | // find the start of the next argument 487 | for (; input_line[x] && (input_line[x] == ' '); x++); 488 | arg_start_offset = x; 489 | } 490 | 491 | // we're done. notice that this doesn't affect the parser state at all 492 | return cmd; 493 | 494 | } 495 | 496 | // not an inline command. 
move to reading-argument state 497 | this->arguments_remaining = strtoll(&input_line[1], NULL, 10); 498 | if (this->arguments_remaining <= 0) { 499 | throw runtime_error("command with zero or fewer arguments"); 500 | } 501 | this->command_in_progress.reset(new DataCommand(this->arguments_remaining)); 502 | this->state = State::ReadingArgumentSize; 503 | break; 504 | } 505 | 506 | case State::ReadingArgumentSize: { 507 | // expect "$arg_size\r\n" 508 | try { 509 | evbuffer_readln_into(buf, input_line, sizeof(input_line), 510 | EVBUFFER_EOL_CRLF); 511 | } catch (const out_of_range&) { 512 | return NULL; // complete line not yet available 513 | } catch (const runtime_error& e) { 514 | this->error_str = "line too long"; 515 | return NULL; 516 | } 517 | 518 | if (input_line[0] != '$') { 519 | throw runtime_error("didn\'t get command arg size where expected"); 520 | } else { 521 | this->data_bytes_remaining = strtoull(&input_line[1], NULL, 10); 522 | this->command_in_progress->args.emplace_back(); 523 | this->command_in_progress->args.back().reserve( 524 | this->data_bytes_remaining); 525 | this->state = State::ReadingArgumentData; 526 | } 527 | break; 528 | } 529 | 530 | case State::ReadingArgumentData: { 531 | // copy data into the last argument 532 | string& arg = this->command_in_progress->args.back(); 533 | ssize_t bytes_available = evbuffer_get_length(buf); 534 | if (bytes_available == 0) { 535 | return NULL; 536 | } 537 | if (bytes_available > this->data_bytes_remaining) { 538 | bytes_available = this->data_bytes_remaining; 539 | } 540 | 541 | size_t bytes_existing = arg.size(); 542 | arg.resize(bytes_existing + bytes_available); 543 | ssize_t bytes_copied = evbuffer_remove(buf, 544 | const_cast(arg.data()) + bytes_existing, 545 | bytes_available); 546 | if (bytes_copied < 0) { 547 | throw runtime_error("can\'t read from evbuffer"); 548 | } 549 | this->data_bytes_remaining -= bytes_copied; 550 | 551 | // TODO: do we need to handle the case where bytes_copied != 
bytes_available? 552 | 553 | if (this->data_bytes_remaining == 0) { 554 | this->arguments_remaining--; 555 | this->state = State::ReadingNewlineAfterArgumentData; 556 | } 557 | break; 558 | } 559 | 560 | case State::ReadingNewlineAfterArgumentData: 561 | if (evbuffer_get_length(buf) < 2) { 562 | return NULL; // not ready yet 563 | } 564 | char data[2]; 565 | if (2 != evbuffer_remove(buf, data, 2)) { 566 | throw runtime_error("can\'t read newline after argument data"); 567 | } 568 | if (data[0] != '\r' && data[1] != '\n') { 569 | throw runtime_error("\\r\\n did not follow argument data"); 570 | } 571 | 572 | // if we're expecting more arguments, move back to the appropriate 573 | // state. if not, return the command and return to the initial state. 574 | if (this->arguments_remaining) { 575 | this->state = State::ReadingArgumentSize; 576 | } else { 577 | this->state = State::Initial; 578 | return move(this->command_in_progress); 579 | } 580 | break; 581 | 582 | default: 583 | throw runtime_error("command parser got into unknown state"); 584 | } 585 | } 586 | 587 | return NULL; // complete line not yet available 588 | } 589 | 590 | 591 | 592 | ResponseParser::ResponseParser() : state(State::Initial), error_str(NULL) { } 593 | 594 | const char* ResponseParser::error() const { 595 | return this->error_str; 596 | } 597 | 598 | shared_ptr ResponseParser::resume(struct evbuffer* buf) { 599 | char input_line[0x100]; 600 | for (;;) { 601 | switch (this->state) { 602 | case State::Initial: { 603 | try { 604 | evbuffer_readln_into(buf, input_line, sizeof(input_line), 605 | EVBUFFER_EOL_CRLF); 606 | } catch (const out_of_range&) { 607 | return NULL; // complete line not yet available 608 | } catch (const runtime_error& e) { 609 | this->error_str = "line too long"; 610 | return NULL; 611 | } 612 | 613 | switch (input_line[0]) { 614 | case Response::Type::Status: 615 | case Response::Type::Error: { 616 | shared_ptr resp(new Response((Response::Type)input_line[0], (int64_t)0)); 
617 | resp->data.assign(&input_line[1]); 618 | return resp; 619 | } 620 | 621 | case Response::Type::Integer: { 622 | shared_ptr resp(new Response(Response::Type::Integer)); 623 | resp->int_value = strtoll(&input_line[1], NULL, 10); 624 | return resp; 625 | } 626 | 627 | case Response::Type::Data: { 628 | this->data_bytes_remaining = strtoll(&input_line[1], NULL, 0); 629 | if (this->data_bytes_remaining < 0) { 630 | return shared_ptr(new Response(Response::Type::Data, 631 | this->data_bytes_remaining)); 632 | } 633 | 634 | this->response_in_progress.reset(new Response(Response::Type::Data, 635 | this->data_bytes_remaining)); 636 | this->state = (this->data_bytes_remaining ? State::ReadingData : 637 | State::ReadingNewlineAfterData); 638 | break; 639 | } 640 | 641 | case Response::Type::Multi: { 642 | this->multi_fields_remaining = strtoll(&input_line[1], NULL, 0); 643 | if (this->multi_fields_remaining <= 0) { 644 | return shared_ptr(new Response(Response::Type::Multi, 645 | this->multi_fields_remaining)); 646 | } 647 | 648 | this->response_in_progress.reset(new Response(Response::Type::Multi, 649 | this->multi_fields_remaining)); 650 | this->multi_in_progress.reset(new ResponseParser()); 651 | this->state = State::MultiRecursive; 652 | break; } 653 | 654 | default: 655 | throw runtime_error(string_printf("incorrect sentinel: %c", input_line[0])); 656 | } 657 | break; // State::Initial 658 | } 659 | 660 | case State::MultiRecursive: { 661 | for (;;) { 662 | auto field = this->multi_in_progress->resume(buf); 663 | if (!field.get()) { 664 | return NULL; 665 | } 666 | 667 | this->response_in_progress->fields.emplace_back(field); 668 | this->multi_fields_remaining--; 669 | if (this->multi_fields_remaining == 0) { 670 | this->state = State::Initial; 671 | return move(this->response_in_progress); 672 | } 673 | } 674 | break; // State::MultiRecursive 675 | } 676 | 677 | case State::ReadingData: { 678 | // copy data into the data field 679 | ssize_t bytes_available = 
evbuffer_get_length(buf); 680 | if (bytes_available == 0) { 681 | return NULL; 682 | } 683 | if (bytes_available > this->data_bytes_remaining) { 684 | bytes_available = this->data_bytes_remaining; 685 | } 686 | 687 | size_t bytes_existing = this->response_in_progress->data.size(); 688 | this->response_in_progress->data.resize(bytes_existing + bytes_available); 689 | ssize_t bytes_copied = evbuffer_remove(buf, 690 | const_cast(this->response_in_progress->data.data()) + bytes_existing, 691 | bytes_available); 692 | if (bytes_copied < 0) { 693 | throw runtime_error("can\'t read from evbuffer"); 694 | } 695 | this->data_bytes_remaining -= bytes_copied; 696 | 697 | if (this->data_bytes_remaining == 0) { 698 | this->state = State::ReadingNewlineAfterData; 699 | } 700 | break; 701 | } 702 | 703 | case State::ReadingNewlineAfterData: 704 | if (evbuffer_get_length(buf) < 2) { 705 | return NULL; // not ready yet 706 | } 707 | char data[2]; 708 | if (2 != evbuffer_remove(buf, data, 2)) { 709 | throw runtime_error("can\'t read newline after argument data"); 710 | } 711 | if (data[0] != '\r' && data[1] != '\n') { 712 | throw runtime_error("\\r\\n did not follow argument data"); 713 | } 714 | 715 | this->state = State::Initial; 716 | return move(this->response_in_progress); 717 | 718 | default: 719 | throw runtime_error("response parser got into unknown state"); 720 | } 721 | } 722 | return NULL; 723 | } 724 | 725 | bool ResponseParser::forward(struct evbuffer* buf, 726 | struct evbuffer* output_buffer) { 727 | 728 | // output_buffer can be NULL if the client has already disconnected. in this 729 | // case, we just don't write to the output buffer (discard the response). 
730 | char input_line[0x100]; 731 | size_t input_line_size; 732 | for (;;) { 733 | switch (this->state) { 734 | case State::Initial: { 735 | try { 736 | input_line_size = evbuffer_readln_into(buf, input_line, 737 | sizeof(input_line), EVBUFFER_EOL_CRLF, false); 738 | } catch (const out_of_range&) { 739 | return false; // complete line not yet available 740 | } catch (const runtime_error& e) { 741 | this->error_str = "line too long"; 742 | return false; 743 | } 744 | 745 | // forward the line to the client immediately. unlike in resume(), we 746 | // didn't drain it from the input buffer, so hopefully we can just move 747 | // the data between buffers instead of copying. add 2 for the \r\n 748 | if (output_buffer) { 749 | evbuffer_remove_buffer(buf, output_buffer, input_line_size + 2); 750 | } 751 | 752 | switch (input_line[0]) { 753 | case Response::Type::Status: 754 | case Response::Type::Error: 755 | case Response::Type::Integer: 756 | return true; 757 | 758 | case Response::Type::Data: 759 | // we add 2 here for the trailing \r\n 760 | this->data_bytes_remaining = strtoll(&input_line[1], NULL, 0); 761 | if (this->data_bytes_remaining < 0) { 762 | return true; // null response 763 | } else { 764 | this->state = State::ReadingData; 765 | } 766 | break; 767 | 768 | case Response::Type::Multi: 769 | this->multi_fields_remaining = strtoll(&input_line[1], NULL, 0); 770 | if (this->multi_fields_remaining <= 0) { 771 | return true; // null response 772 | } else { 773 | this->multi_in_progress.reset(new ResponseParser()); 774 | this->state = State::MultiRecursive; 775 | } 776 | break; 777 | 778 | default: 779 | throw runtime_error(string_printf("incorrect sentinel: %c", input_line[0])); 780 | } 781 | break; // State::Initial 782 | } 783 | 784 | case State::MultiRecursive: { 785 | for (;;) { 786 | bool field_forwarded = this->multi_in_progress->forward(buf, 787 | output_buffer); 788 | if (!field_forwarded) { 789 | return false; 790 | } 791 | 792 | 
this->multi_fields_remaining--; 793 | if (this->multi_fields_remaining == 0) { 794 | this->state = State::Initial; 795 | return true; 796 | } 797 | } 798 | break; 799 | } 800 | 801 | case State::ReadingData: { 802 | ssize_t bytes_available = evbuffer_get_length(buf); 803 | if (bytes_available == 0) { 804 | return false; 805 | } 806 | if (bytes_available > this->data_bytes_remaining) { 807 | bytes_available = this->data_bytes_remaining; 808 | } 809 | 810 | if (output_buffer) { 811 | ssize_t bytes_copied = evbuffer_remove_buffer(buf, output_buffer, 812 | bytes_available); 813 | if (bytes_copied < 0) { 814 | throw runtime_error("can\'t read from evbuffer"); 815 | } 816 | this->data_bytes_remaining -= bytes_copied; 817 | } else { 818 | evbuffer_drain(buf, bytes_available); 819 | this->data_bytes_remaining -= bytes_available; 820 | } 821 | 822 | if (this->data_bytes_remaining == 0) { 823 | this->state = State::ReadingNewlineAfterData; 824 | } 825 | break; 826 | } 827 | 828 | case State::ReadingNewlineAfterData: { 829 | if (evbuffer_get_length(buf) < 2) { 830 | return false; // not ready yet 831 | } 832 | char data[2]; 833 | if (2 != evbuffer_remove(buf, data, 2)) { 834 | throw runtime_error("can\'t read newline after argument data"); 835 | } 836 | if (data[0] != '\r' && data[1] != '\n') { 837 | throw runtime_error("\\r\\n did not follow argument data"); 838 | } 839 | if (output_buffer) { 840 | evbuffer_add(output_buffer, "\r\n", 2); 841 | } 842 | 843 | this->state = State::Initial; 844 | return true; 845 | } 846 | 847 | default: 848 | throw runtime_error("response parser got into unknown state"); 849 | } 850 | } 851 | 852 | return false; 853 | } 854 | -------------------------------------------------------------------------------- /Protocol.hh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | #include 7 | #include 8 | #include 9 | 10 | 11 | // DataCommand and ReferenceCommand aren't 
subclasses of a parent Command class 12 | // because this incurs a significant performance penalty (up to 7% in some 13 | // cases). 14 | 15 | struct DataCommand { 16 | std::vector args; 17 | 18 | DataCommand() = default; 19 | explicit DataCommand(size_t num_args); 20 | ~DataCommand() = default; 21 | 22 | void print(FILE* stream, int indent_level = 0) const; 23 | std::string format() const; 24 | 25 | void write(struct evbuffer* buf) const; 26 | }; 27 | 28 | 29 | struct ReferenceCommand { 30 | struct DataReference { 31 | const void* data; 32 | size_t size; 33 | 34 | DataReference(); 35 | DataReference(const void* data, size_t size); 36 | DataReference(const std::string& data); 37 | }; 38 | 39 | std::vector args; 40 | 41 | ReferenceCommand() = default; 42 | explicit ReferenceCommand(size_t num_args); 43 | ~ReferenceCommand() = default; 44 | 45 | void print(FILE* stream, int indent_level = 0) const; 46 | std::string format() const; 47 | 48 | void write(struct evbuffer* buf) const; 49 | }; 50 | 51 | 52 | struct Response { 53 | enum Type { 54 | Status = '+', 55 | Error = '-', 56 | Integer = ':', 57 | Data = '$', 58 | Multi = '*', 59 | }; 60 | Type type; 61 | 62 | std::string data; // Status, Error and Data 63 | int64_t int_value; // Integer 64 | std::vector> fields; // Multi 65 | 66 | Response(Type type, int64_t size = 0); 67 | Response(Type type, const char* fmt, ...); 68 | Response(Type type, const void* data, size_t size); 69 | Response(Type type, const std::string& data); 70 | ~Response() = default; 71 | 72 | bool operator==(const Response& other) const; 73 | bool operator!=(const Response& other) const; 74 | 75 | void print(FILE* stream, int indent_level = 0) const; 76 | std::string format() const; 77 | 78 | void write(struct evbuffer* buf) const; 79 | static void write_string(struct evbuffer* buf, const char* s, char sentinel); 80 | static void write_string(struct evbuffer* buf, const void* s, size_t size, 81 | char sentinel); 82 | static void write_int(struct 
evbuffer* buf, int64_t value, char sentinel); 83 | }; 84 | 85 | 86 | struct CommandParser { 87 | enum State { 88 | Initial = 0, 89 | ReadingArgumentSize, 90 | ReadingArgumentData, 91 | ReadingNewlineAfterArgumentData, 92 | }; 93 | State state; 94 | const char* error_str; 95 | 96 | int64_t num_command_args; 97 | std::shared_ptr command_in_progress; 98 | int64_t arguments_remaining; 99 | int64_t data_bytes_remaining; 100 | 101 | CommandParser(); 102 | ~CommandParser() = default; 103 | 104 | std::shared_ptr resume(struct evbuffer* buffer); 105 | 106 | const char* error() const; 107 | }; 108 | 109 | struct ResponseParser { 110 | enum State { 111 | Initial = 0, 112 | MultiRecursive, 113 | ReadingData, 114 | ReadingNewlineAfterData, 115 | }; 116 | State state; 117 | const char* error_str; 118 | 119 | std::shared_ptr response_in_progress; 120 | int64_t data_bytes_remaining; 121 | 122 | std::shared_ptr multi_in_progress; 123 | int64_t multi_fields_remaining; 124 | 125 | ResponseParser(); 126 | ~ResponseParser() = default; 127 | 128 | std::shared_ptr resume(struct evbuffer* buffer); 129 | bool forward(struct evbuffer* buffer, struct evbuffer* output_buffer); 130 | 131 | const char* error() const; 132 | }; 133 | -------------------------------------------------------------------------------- /ProtocolTest.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include 8 | 9 | #include "Protocol.hh" 10 | 11 | using namespace std; 12 | 13 | 14 | template 15 | static void check_serialization(const T& obj, 16 | const char* expected_serialization) { 17 | unique_ptr out_buf( 18 | evbuffer_new(), evbuffer_free); 19 | obj.write(out_buf.get()); 20 | struct evbuffer_ptr pos = evbuffer_search(out_buf.get(), 21 | expected_serialization, strlen(expected_serialization), NULL); 22 | expect_eq(pos.pos, 0); 23 | } 24 | 25 | template 26 | static void check_serialization(shared_ptr obj, 27 | 
const char* expected_serialization) { 28 | unique_ptr out_buf( 29 | evbuffer_new(), evbuffer_free); 30 | obj->write(out_buf.get()); 31 | struct evbuffer_ptr pos = evbuffer_search(out_buf.get(), 32 | expected_serialization, strlen(expected_serialization), NULL); 33 | expect_eq(pos.pos, 0); 34 | } 35 | 36 | 37 | int main(int argc, char* argv[]) { 38 | 39 | { 40 | printf("-- parse a command & serialize it again\n"); 41 | 42 | const char* command_string = "*7\r\n$4\r\nMSET\r\n$1\r\nx\r\n$1\r\n1\r\n$1\r\ny\r\n$1\r\n2\r\n$1\r\nz\r\n$3\r\nlol\r\n"; 43 | 44 | unique_ptr in_buf( 45 | evbuffer_new(), evbuffer_free); 46 | evbuffer_add(in_buf.get(), command_string, strlen(command_string)); 47 | auto cmd = CommandParser().resume(in_buf.get()); 48 | 49 | // check that the args were parsed properly 50 | expect_eq(cmd->args.size(), 7); 51 | expect_eq(cmd->args[0], "MSET"); 52 | expect_eq(cmd->args[1], "x"); 53 | expect_eq(cmd->args[2], "1"); 54 | expect_eq(cmd->args[3], "y"); 55 | expect_eq(cmd->args[4], "2"); 56 | expect_eq(cmd->args[5], "z"); 57 | expect_eq(cmd->args[6], "lol"); 58 | 59 | check_serialization(cmd, command_string); 60 | } 61 | 62 | { 63 | printf("-- parse a command (inline) & serialize it again\n"); 64 | 65 | const char* command_string = "MSET x 1 y 2 z lol\r\n"; 66 | const char* expected_serialization = "*7\r\n$4\r\nMSET\r\n$1\r\nx\r\n$1\r\n1\r\n$1\r\ny\r\n$1\r\n2\r\n$1\r\nz\r\n$3\r\nlol\r\n"; 67 | 68 | unique_ptr in_buf( 69 | evbuffer_new(), evbuffer_free); 70 | evbuffer_add(in_buf.get(), command_string, strlen(command_string)); 71 | auto cmd = CommandParser().resume(in_buf.get()); 72 | 73 | // check that the args were parsed properly 74 | expect_eq(cmd->args.size(), 7); 75 | expect_eq(cmd->args[0], "MSET"); 76 | expect_eq(cmd->args[1], "x"); 77 | expect_eq(cmd->args[2], "1"); 78 | expect_eq(cmd->args[3], "y"); 79 | expect_eq(cmd->args[4], "2"); 80 | expect_eq(cmd->args[5], "z"); 81 | expect_eq(cmd->args[6], "lol"); 82 | 83 | check_serialization(cmd, 
expected_serialization); 84 | } 85 | 86 | { 87 | printf("-- parse a response & serialize it again\n"); 88 | 89 | const char* resp_string = "*6\r\n+omg\r\n-bbq\r\n:284713592\r\n$-1\r\n*-1\r\n*1\r\n$20\r\nTo be or not to be, \r\n"; 90 | 91 | unique_ptr in_buf( 92 | evbuffer_new(), evbuffer_free); 93 | evbuffer_add(in_buf.get(), resp_string, strlen(resp_string)); 94 | auto r = ResponseParser().resume(in_buf.get()); 95 | 96 | expect_eq(r->type, Response::Type::Multi); 97 | expect_eq(r->fields.size(), 6); 98 | 99 | expect_eq(r->fields[0]->type, Response::Type::Status); 100 | expect_eq(r->fields[0]->data, "omg"); 101 | 102 | expect_eq(r->fields[1]->type, Response::Type::Error); 103 | expect_eq(r->fields[1]->data, "bbq"); 104 | 105 | expect_eq(r->fields[2]->type, Response::Type::Integer); 106 | expect_eq(r->fields[2]->int_value, 284713592); 107 | 108 | expect_eq(r->fields[3]->type, Response::Type::Data); 109 | expect_eq(r->fields[3]->int_value, -1); 110 | 111 | expect_eq(r->fields[4]->type, Response::Type::Multi); 112 | expect_eq(r->fields[4]->int_value, -1); 113 | 114 | expect_eq(r->fields[5]->type, Response::Type::Multi); 115 | expect_eq(r->fields[5]->fields.size(), 1); 116 | 117 | expect_eq(r->fields[5]->fields[0]->type, Response::Type::Data); 118 | expect_eq(r->fields[5]->fields[0]->data, "To be or not to be, "); 119 | 120 | check_serialization(r, resp_string); 121 | } 122 | 123 | { 124 | printf("-- check Response printf-like constructor\n"); 125 | 126 | { 127 | Response r(Response::Type::Status, 128 | "This is response %d of %d; here\'s a string: %s.", 4, 10, "lol"); 129 | const char* expected = "+This is response 4 of 10; here\'s a string: lol.\r\n"; 130 | check_serialization(r, expected); 131 | } 132 | 133 | { 134 | Response r(Response::Type::Error, 135 | "This is response %d of %d; here\'s a string: %s.", 4, 10, "lol"); 136 | const char* expected = "-This is response 4 of 10; here\'s a string: lol.\r\n"; 137 | check_serialization(r, expected); 138 | } 139 | 140 | 
{ 141 | Response r(Response::Type::Data, 142 | "This is response %d of %d; here\'s a string: %s.", 4, 10, "lol"); 143 | const char* expected = "$47\r\nThis is response 4 of 10; here\'s a string: lol.\r\n"; 144 | check_serialization(r, expected); 145 | } 146 | } 147 | 148 | printf("all tests passed\n"); 149 | return 0; 150 | } 151 | -------------------------------------------------------------------------------- /Proxy.hh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | 15 | #include "Protocol.hh" 16 | 17 | 18 | struct ResponseLink; 19 | struct Backend; 20 | 21 | 22 | struct BackendConnection { 23 | Backend* backend; 24 | int64_t index; 25 | 26 | std::unique_ptr bev; 27 | ResponseParser parser; 28 | 29 | struct sockaddr_storage local_addr; 30 | struct sockaddr_storage remote_addr; 31 | 32 | size_t num_commands_sent; 33 | size_t num_responses_received; 34 | 35 | ResponseLink* head_link; 36 | ResponseLink* tail_link; 37 | 38 | BackendConnection(Backend* backend, int64_t index, 39 | std::unique_ptr&& bev); 40 | BackendConnection(const BackendConnection&) = delete; 41 | BackendConnection(BackendConnection&&) = delete; 42 | BackendConnection& operator=(const BackendConnection&) = delete; 43 | BackendConnection& operator=(BackendConnection&&) = delete; 44 | ~BackendConnection(); 45 | 46 | struct evbuffer* get_output_buffer(); 47 | 48 | void print(FILE* stream, int indent_level = 0) const; 49 | }; 50 | 51 | struct Backend { 52 | size_t index; 53 | 54 | std::string host; 55 | int port; 56 | 57 | std::string name; 58 | std::string debug_name; 59 | 60 | std::unordered_map index_to_connection; 61 | int64_t next_connection_index; 62 | 63 | size_t num_responses_received; 64 | size_t num_commands_sent; 65 | 66 | Backend(size_t index, const std::string& host, int port, const std::string& 
name); 67 | Backend(const Backend&) = delete; 68 | Backend(Backend&&) = delete; 69 | Backend& operator=(const Backend&) = delete; 70 | Backend& operator=(Backend&&) = delete; 71 | ~Backend() = default; 72 | 73 | BackendConnection& get_default_connection(); 74 | 75 | void print(FILE* stream, int indent_level = 0) const; 76 | }; 77 | 78 | 79 | struct Client { 80 | std::string name; 81 | std::string debug_name; 82 | bool should_disconnect; 83 | 84 | std::unique_ptr bev; 85 | CommandParser parser; 86 | 87 | struct sockaddr_storage local_addr; 88 | struct sockaddr_storage remote_addr; 89 | 90 | size_t num_commands_received; 91 | size_t num_responses_sent; 92 | 93 | ResponseLink* head_link; 94 | ResponseLink* tail_link; 95 | 96 | Client(std::unique_ptr bev); 97 | Client(const Client&) = delete; 98 | Client(Client&&) = delete; 99 | Client& operator=(const Client&) = delete; 100 | Client& operator=(Client&&) = delete; 101 | ~Client(); 102 | 103 | struct evbuffer* get_output_buffer(); 104 | 105 | void print(FILE* stream, int indent_level = 0) const; 106 | }; 107 | 108 | 109 | // a ResponseLink represents a response that a client is expecting to receive, 110 | // and also a promise that one or more backends will send a response that can be 111 | // used to generate the response for the waiting client. each ResponseLink is 112 | // linked in one or more lists representing its dependencies. 113 | // 114 | // each Client has a linked list of ResponseLinks representing the responses 115 | // that the client expects to receive (in order). this list is traversed by 116 | // following the next_client links in the ResponseLink. 117 | // 118 | // similarly, each BackendConnection has a linked list of ResponseLinks in the 119 | // order that responses should be routed. this isn't necessarily the same as the 120 | // next_client order. 
because a ResponseLink may represent an aggregation of 121 | // multiple backend responses, the ResponseLink may exist in multiple 122 | // BackendConnection lists. to traverse one of these lists, look up the given 123 | // BackendConnection in the ResponseLink's backend_conn_to_next_link map. 124 | // 125 | // a ResponseLink is "ready" when all the needed backend responses have been 126 | // received. this doesn't mean it can be sent to the client though - since the 127 | // order of responses must be preserved, it can only be sent if it's first in 128 | // the client's list. for example, if a client sends "GET x" and "GET y", and 129 | // the backend for y responds first, we can't send the result yet. in this case, 130 | // the ResponseLink stays ready until the backend for x responds. at that time, 131 | // both ResponseLinks are ready, and are sent in the correct order. 132 | // 133 | // ResponseLinks are owned by the Client they're linked to, and are destroyed 134 | // after the response is sent. if a client disconnects before receiving all of 135 | // its pending responses, the client is unlinked from its ResponseLinks, but the 136 | // ResponseLinks remain. this is necessary because there may be pipelined 137 | // commands on the linked backend connections, and we need to discard the 138 | // responses meant for this client. in this case, the ResponseLinks are owned by 139 | // the BackendConnections, and are destroyed when they're unlinked from the last 140 | // BackendConnection. 141 | // 142 | // if a BackendConnection disconnects early, then all of the ResponseLinks it's 143 | // linked to receive an error response, and they're unlinked from the 144 | // BackendConnection immediately. any ready ResponseLinks are processed (sent 145 | // to the client, if possible) at this time also. 
146 | 147 | struct ResponseLink { 148 | enum class CollectionType { 149 | ForwardResponse = 0, 150 | CollectStatusResponses, 151 | SumIntegerResponses, 152 | CombineMultiResponses, 153 | CollectResponses, 154 | CollectMultiResponsesByKey, 155 | CollectIdenticalResponses, 156 | ModifyScanResponse, 157 | ModifyScriptExistsResponse, 158 | ModifyMigrateResponse, 159 | }; 160 | CollectionType type; 161 | 162 | static const char* name_for_collection_type(CollectionType type); 163 | 164 | Client* client; 165 | ResponseLink* next_client; 166 | std::unordered_map backend_conn_to_next_link; 167 | 168 | std::shared_ptr error_response; 169 | 170 | // type-specific fields 171 | 172 | std::shared_ptr response_to_forward; 173 | 174 | int64_t response_integer_sum; 175 | 176 | Response::Type expected_response_type; 177 | std::vector> responses; 178 | 179 | std::vector recombination_queue; 180 | std::unordered_map> backend_index_to_response; 181 | 182 | int64_t scan_backend_index; 183 | 184 | ResponseLink(CollectionType type, Client* c); 185 | ResponseLink(const ResponseLink&) = delete; 186 | ResponseLink(ResponseLink&&) = delete; 187 | ResponseLink& operator=(const ResponseLink&) = delete; 188 | ResponseLink& operator=(ResponseLink&&) = delete; 189 | ~ResponseLink(); 190 | 191 | bool is_ready() const; 192 | 193 | void print(FILE* stream, int indent_level = 0) const; 194 | }; 195 | 196 | 197 | class Proxy { 198 | public: 199 | struct Stats { 200 | std::atomic num_commands_received; 201 | std::atomic num_commands_sent; 202 | std::atomic num_responses_received; 203 | std::atomic num_responses_sent; 204 | std::atomic num_connections_received; 205 | std::atomic num_clients; 206 | uint64_t start_time; 207 | 208 | Stats(); 209 | }; 210 | 211 | struct Netloc { 212 | std::string name; 213 | std::string host; 214 | int port; 215 | }; 216 | 217 | Proxy(int listen_fd, std::shared_ptr ring, 218 | int hash_begin_delimiter = -1, int hash_end_delimiter = -1, 219 | std::shared_ptr stats = NULL, 
size_t proxy_index = 0); 220 | Proxy(const Proxy&) = delete; 221 | Proxy(Proxy&&) = delete; 222 | Proxy& operator=(const Proxy&) = delete; 223 | Proxy& operator=(Proxy&&) = delete; 224 | ~Proxy() = default; 225 | 226 | bool disable_command(const std::string& command_name); 227 | 228 | void serve(); 229 | void stop(); 230 | 231 | void print(FILE* stream, int indent_level = 0) const; 232 | 233 | private: 234 | // network state 235 | int listen_fd; 236 | std::unique_ptr base; 237 | std::unique_ptr listener; 238 | bool should_exit; 239 | 240 | // connection indexing and lookup 241 | std::shared_ptr ring; 242 | std::vector backends; 243 | std::unordered_map name_to_backend; 244 | std::unordered_map bev_to_backend_conn; 245 | std::unordered_map bev_to_client; 246 | 247 | // stats 248 | size_t proxy_index; 249 | std::shared_ptr stats; 250 | 251 | // hash configuration 252 | int hash_begin_delimiter; 253 | int hash_end_delimiter; 254 | 255 | // backend lookups 256 | int64_t backend_index_for_key(const std::string& s) const; 257 | int64_t backend_index_for_argument(const std::string& arg) const; 258 | Backend& backend_for_index(size_t index); 259 | Backend& backend_for_key(const std::string& s); 260 | BackendConnection& backend_conn_for_index(size_t index); 261 | BackendConnection& backend_conn_for_key(const std::string& s); 262 | 263 | // connection management 264 | void disconnect_client(Client* c); 265 | void disconnect_backend(BackendConnection* b); 266 | 267 | // response linking 268 | ResponseLink* create_link(ResponseLink::CollectionType type, Client* c); 269 | ResponseLink* create_error_link(Client* c, std::shared_ptr r); 270 | struct evbuffer* can_send_command(BackendConnection* conn, ResponseLink* l); 271 | void link_connection(BackendConnection* conn, ResponseLink* l); 272 | void send_command_and_link(BackendConnection* conn, ResponseLink* l, 273 | const DataCommand* cmd); 274 | void send_command_and_link(BackendConnection* conn, ResponseLink* l, 275 | const 
std::shared_ptr& cmd); 276 | void send_command_and_link(BackendConnection* conn, ResponseLink* l, 277 | const ReferenceCommand* cmd); 278 | 279 | // high-level output handlers 280 | void send_client_response(Client* c, const Response* r); 281 | void send_client_response(Client* c, 282 | const std::shared_ptr& r); 283 | void send_client_string_response(Client* c, const char* s, 284 | Response::Type type); 285 | void send_client_string_response(Client* c, const std::string& s, 286 | Response::Type type); 287 | void send_client_string_response(Client* c, const void* data, size_t size, 288 | Response::Type type); 289 | void send_client_int_response(Client* c, int64_t int_value, 290 | Response::Type type); 291 | 292 | // high-level input handlers 293 | void send_ready_response(ResponseLink* l); 294 | void send_all_ready_responses(Client* c); 295 | void handle_backend_response(BackendConnection* conn, 296 | std::shared_ptr r); 297 | void handle_client_command(Client* c, std::shared_ptr cmd); 298 | 299 | // low-level input handlers 300 | static void dispatch_on_client_input(struct bufferevent *bev, void* ctx); 301 | void on_client_input(struct bufferevent *bev); 302 | static void dispatch_on_client_error(struct bufferevent *bev, short events, 303 | void* ctx); 304 | void on_client_error(struct bufferevent *bev, short events); 305 | static void dispatch_on_backend_input(struct bufferevent *bev, void* ctx); 306 | void on_backend_input(struct bufferevent *bev); 307 | static void dispatch_on_backend_error(struct bufferevent *bev, short events, 308 | void* ctx); 309 | void on_backend_error(struct bufferevent *bev, short events); 310 | static void dispatch_on_listen_error(struct evconnlistener *listener, 311 | void* ctx); 312 | void on_listen_error(struct evconnlistener *listener); 313 | static void dispatch_on_client_accept(struct evconnlistener *listener, 314 | evutil_socket_t fd, struct sockaddr *address, int socklen, void* ctx); 315 | void on_client_accept(struct 
evconnlistener *listener, evutil_socket_t fd, 316 | struct sockaddr *address, int socklen); 317 | 318 | // timer event handlers 319 | static void dispatch_check_for_thread_exit(evutil_socket_t fd, short what, 320 | void* ctx); 321 | void check_for_thread_exit(evutil_socket_t fd, short what); 322 | 323 | // generic command implementations 324 | void command_all_collect_responses(Client* c, 325 | std::shared_ptr cmd); 326 | void command_all_collect_status_responses(Client* c, 327 | std::shared_ptr cmd); 328 | void command_all_sum_int_responses(Client* c, 329 | std::shared_ptr cmd); 330 | void command_forward_all(Client* c, std::shared_ptr cmd, 331 | ResponseLink::CollectionType type); 332 | void command_forward_by_key_1(Client* c, std::shared_ptr cmd); 333 | void command_forward_by_key_index(Client* c, std::shared_ptr cmd, 334 | size_t key_index); 335 | void command_forward_by_keys(Client* c, std::shared_ptr cmd, 336 | ssize_t start_key_index, ssize_t end_key_index); 337 | void command_forward_by_keys_1_all(Client* c, 338 | std::shared_ptr cmd); 339 | void command_forward_by_keys_1_2(Client* c, std::shared_ptr cmd); 340 | void command_forward_by_keys_2_all(Client* c, 341 | std::shared_ptr cmd); 342 | void command_forward_random(Client* c, std::shared_ptr cmd); 343 | void command_partition_by_keys(Client* c, std::shared_ptr cmd, 344 | size_t start_arg_index, size_t args_per_key, bool interleaved, 345 | ResponseLink::CollectionType type); 346 | void command_partition_by_keys_1_integer(Client* c, 347 | std::shared_ptr cmd); 348 | void command_partition_by_keys_1_multi(Client* c, 349 | std::shared_ptr cmd); 350 | void command_partition_by_keys_2_status(Client* c, 351 | std::shared_ptr cmd); 352 | void command_unimplemented(Client* c, std::shared_ptr cmd); 353 | void command_default(Client* c, std::shared_ptr cmd); 354 | 355 | // specific command implementations 356 | void command_ACL(Client* c, std::shared_ptr cmd); 357 | void command_BACKEND(Client* c, std::shared_ptr 
cmd); 358 | void command_BACKENDNUM(Client* c, std::shared_ptr cmd); 359 | void command_BACKENDS(Client* c, std::shared_ptr cmd); 360 | void command_CLIENT(Client* c, std::shared_ptr cmd); 361 | void command_DBSIZE(Client* c, std::shared_ptr cmd); 362 | void command_DEBUG(Client* c, std::shared_ptr cmd); 363 | void command_ECHO(Client* c, std::shared_ptr cmd); 364 | void command_EVAL(Client* c, std::shared_ptr cmd); 365 | void command_FORWARD(Client* c, std::shared_ptr cmd); 366 | void command_GEORADIUS(Client* c, std::shared_ptr cmd); 367 | void command_INFO(Client* c, std::shared_ptr cmd); 368 | void command_KEYS(Client* c, std::shared_ptr cmd); 369 | void command_LATENCY(Client* c, std::shared_ptr cmd); 370 | void command_MEMORY(Client* c, std::shared_ptr cmd); 371 | void command_MIGRATE(Client* c, std::shared_ptr cmd); 372 | void command_MODULE(Client* c, std::shared_ptr cmd); 373 | void command_MSETNX(Client* c, std::shared_ptr cmd); 374 | void command_OBJECT(Client* c, std::shared_ptr cmd); 375 | void command_PING(Client* c, std::shared_ptr cmd); 376 | void command_PRINTSTATE(Client* c, std::shared_ptr cmd); 377 | void command_QUIT(Client* c, std::shared_ptr cmd); 378 | void command_ROLE(Client* c, std::shared_ptr cmd); 379 | void command_SCAN(Client* c, std::shared_ptr cmd); 380 | void command_SCRIPT(Client* c, std::shared_ptr cmd); 381 | void command_XGROUP(Client* c, std::shared_ptr cmd); 382 | void command_XINFO(Client* c, std::shared_ptr cmd); 383 | void command_XREAD(Client* c, std::shared_ptr cmd); 384 | void command_ZACTIONSTORE(Client* c, std::shared_ptr cmd); 385 | 386 | // helpers for command implementations 387 | uint8_t scan_cursor_backend_index_bits() const; 388 | 389 | // handler index 390 | typedef void (Proxy::*command_handler)(Client* c, 391 | std::shared_ptr cmd); 392 | std::unordered_map handlers; 393 | static const std::unordered_map 394 | default_handlers; 395 | }; 396 | 
-------------------------------------------------------------------------------- /README: -------------------------------------------------------------------------------- 1 | redis-shatter 2 | ------------- 3 | 4 | redis-shatter is a sharding proxy for the Redis protocol, inspired by twemproxy 5 | (https://github.com/twitter/twemproxy). Documentation on how to actually use 6 | this is in the example configuration file, redis-shatter.conf. 7 | 8 | In essence, this proxy appears like a standard Redis server to other hosts, but 9 | doesn't store any data locally - instead, keys are distributed between multiple 10 | backends (other hosts that speak the Redis protocol, usually Redis servers). 11 | This can give useful speed, reliability, and scalability benefits, but has some 12 | significant drawbacks as well: 13 | - Command execution requires another network hop, which comes with a performance 14 | penalty. Clients that pipeline commands are somewhat insulated from this 15 | issue. 16 | - Complex commands that affect multiple keys cannot be run efficiently unless the 17 | affected keys are all on the same backend. There are inefficient ways to run 18 | these commands, but these are (currently) not implemented by redis-shatter. 19 | 20 | 21 | Performance 22 | ----------- 23 | 24 | redis-shatter in front of 8 Redis server instances on the same machine was about 25 | 60% as fast as a single Redis server itself, according to side-by-side 26 | redis-benchmark tests. Very little performance optimization work has been done, 27 | so there's probably a lot of room for improvement here. 28 | 29 | Like most of my projects, this is only tested at a small scale (so far), so 30 | there may be unfound bugs or inefficiencies. Use at your own risk. 31 | 32 | 33 | Behavior 34 | -------- 35 | 36 | Unlike other similar projects, redis-shatter strives to implement as many 37 | commands as possible. 
For some of the less-common commands, redis-shatter 38 | deviates from the standard Redis protocol in order to implement reasonable 39 | behavior. These deviations are explained in the notes in the below table. 40 | 41 | Command -- Supported -- Notes (see end) 42 | ---------------------------------------------------- 43 | ACL CAT -- Yes -- *E 44 | ACL DELUSER -- Yes -- *6 45 | ACL GENPASS -- Yes -- *E 46 | ACL GETUSER -- Yes -- *5 47 | ACL HELP -- Yes -- *E 48 | ACL LIST -- Yes -- *5 49 | ACL LOAD -- Yes -- *6 50 | ACL LOG -- Yes -- *5 51 | ACL SAVE -- Yes -- *6 52 | ACL SETUSER -- Yes -- *6 53 | ACL USERS -- Yes -- *5 54 | ACL WHOAMI -- No -- 55 | APPEND -- Yes -- 56 | AUTH -- No -- 57 | BGREWRITEAOF -- Yes -- *6 58 | BGSAVE -- Yes -- *6 59 | BITCOUNT -- Yes -- 60 | BITFIELD -- Yes -- 61 | BITOP -- Yes -- *2 62 | BITPOS -- Yes -- 63 | BLPOP -- No -- 64 | BRPOP -- No -- 65 | BRPOPLPUSH -- No -- 66 | BZPOPMAX -- No -- 67 | BZPOPMIN -- No -- 68 | CLIENT CACHING -- No -- 69 | CLIENT GETNAME -- Yes -- *F 70 | CLIENT GETREDIR -- No -- 71 | CLIENT ID -- No -- 72 | CLIENT KILL -- No -- 73 | CLIENT LIST -- Yes -- *C 74 | CLIENT PAUSE -- No -- 75 | CLIENT REPLY -- No -- 76 | CLIENT SETNAME -- Yes -- *F 77 | CLIENT TRACKING -- No -- 78 | CLIENT UNBLOCK -- No -- 79 | CLUSTER -- No -- *H 80 | COMMAND -- Yes -- *E 81 | COMMAND COUNT -- Yes -- *E 82 | COMMAND GETKEYS -- Yes -- *E 83 | COMMAND INFO -- Yes -- *E 84 | CONFIG GET -- Yes -- *5 85 | CONFIG RESETSTAT -- Yes -- *6 86 | CONFIG REWRITE -- Yes -- *6 87 | CONFIG SET -- Yes -- *6 88 | DBSIZE -- Yes -- *A 89 | DEBUG OBJECT -- Yes -- 90 | DEBUG SEGFAULT -- No -- 91 | DECR -- Yes -- 92 | DECRBY -- Yes -- 93 | DEL -- Yes -- *4 94 | DISCARD -- No -- 95 | DUMP -- Yes -- 96 | ECHO -- Yes -- 97 | EVAL -- Yes -- *0 *2 98 | EVALSHA -- Yes -- *0 *2 *G 99 | EXEC -- No -- 100 | EXISTS -- Yes -- *4 101 | EXPIRE -- Yes -- 102 | EXPIREAT -- Yes -- 103 | FLUSHALL -- Yes -- *6 104 | FLUSHDB -- Yes -- *6 *7 105 | GEOADD -- Yes -- 106 | GEODIST 
-- Yes -- 107 | GEOHASH -- Yes -- 108 | GEOPOS -- Yes -- 109 | GEORADIUS -- Yes -- *2 110 | GEORADIUSBYMEMBER -- Yes -- *2 111 | GET -- Yes -- 112 | GETBIT -- Yes -- 113 | GETRANGE -- Yes -- 114 | GETSET -- Yes -- 115 | HDEL -- Yes -- 116 | HELLO -- No -- 117 | HEXISTS -- Yes -- 118 | HGET -- Yes -- 119 | HGETALL -- Yes -- 120 | HINCRBY -- Yes -- 121 | HINCRBYFLOAT -- Yes -- 122 | HKEYS -- Yes -- 123 | HLEN -- Yes -- 124 | HMGET -- Yes -- 125 | HMSET -- Yes -- 126 | HSCAN -- Yes -- 127 | HSET -- Yes -- 128 | HSETNX -- Yes -- 129 | HSTRLEN -- Yes -- 130 | HVALS -- Yes -- 131 | INCR -- Yes -- 132 | INCRBY -- Yes -- 133 | INCRBYFLOAT -- Yes -- 134 | INFO -- Yes -- *8 135 | KEYS -- Yes -- *9 *A 136 | LASTSAVE -- Yes -- *5 137 | LATENCY DOCTOR -- Yes -- *5 138 | LATENCY GRAPH -- Yes -- *5 139 | LATENCY RESET -- Yes -- *5 140 | LATENCY LATEST -- Yes -- *5 141 | LATENCY HISTORY -- Yes -- *5 142 | LATENCY HELP -- Yes -- *E 143 | LINDEX -- Yes -- 144 | LINSERT -- Yes -- 145 | LLEN -- Yes -- 146 | LOLWUT -- Yes -- *E 147 | LPOP -- Yes -- 148 | LPUSH -- Yes -- 149 | LPUSHX -- Yes -- 150 | LRANGE -- Yes -- 151 | LREM -- Yes -- 152 | LSET -- Yes -- 153 | LTRIM -- Yes -- 154 | MEMORY DOCTOR -- Yes -- *5 155 | MEMORY HELP -- Yes -- *E 156 | MEMORY MALLOC-STATS -- Yes -- *5 157 | MEMORY PURGE -- Yes -- *5 158 | MEMORY STATS -- Yes -- *5 159 | MEMORY USAGE -- Yes -- 160 | MGET -- Yes -- *4 161 | MIGRATE -- Yes -- *4 *J 162 | MODULE LIST -- Yes -- *5 163 | MODULE LOAD -- Yes -- *5 164 | MODULE UNLOAD -- Yes -- *5 165 | MONITOR -- No -- 166 | MOVE -- No -- 167 | MSET -- Yes -- *4 168 | MSETNX -- Yes -- *2 169 | MULTI -- No -- 170 | OBJECT ENCODING -- Yes -- 171 | OBJECT FREQ -- Yes -- 172 | OBJECT IDLETIME -- Yes -- 173 | OBJECT REFCOUNT -- Yes -- 174 | OBJECT HELP -- Yes -- *E 175 | PERSIST -- Yes -- 176 | PEXPIRE -- Yes -- 177 | PEXPIREAT -- Yes -- 178 | PFADD -- Yes -- 179 | PFCOUNT -- Yes -- *2 180 | PFMERGE -- Yes -- *2 181 | PING -- Yes -- 182 | PSETEX -- Yes -- 183 | 
PSUBSCRIBE -- No -- 184 | PTTL -- Yes -- 185 | PUBLISH -- No -- 186 | PUBSUB -- No -- 187 | PUNSUBSCRIBE -- No -- 188 | QUIT -- Yes -- 189 | RANDOMKEY -- Yes -- *1 190 | READONLY -- No -- 191 | READWRITE -- No -- 192 | RENAME -- Yes -- *2 193 | RENAMENX -- Yes -- *2 194 | REPLICAOF -- No -- *H 195 | RESTORE -- Yes -- 196 | ROLE -- Yes -- *K 197 | RPOP -- Yes -- 198 | RPOPLPUSH -- Yes -- *2 199 | RPUSH -- Yes -- 200 | RPUSHX -- Yes -- 201 | SADD -- Yes -- 202 | SAVE -- Yes -- *6 203 | SCAN -- Yes -- *A *B 204 | SCARD -- Yes -- 205 | SCRIPT DEBUG -- No -- 206 | SCRIPT EXISTS -- Yes -- *D 207 | SCRIPT FLUSH -- Yes -- *6 208 | SCRIPT KILL -- No -- *H 209 | SCRIPT LOAD -- Yes -- *6 210 | SDIFF -- Yes -- *2 211 | SDIFFSTORE -- Yes -- *2 212 | SELECT -- No -- 213 | SET -- Yes -- 214 | SETBIT -- Yes -- 215 | SETEX -- Yes -- 216 | SETNX -- Yes -- 217 | SETRANGE -- Yes -- 218 | SHUTDOWN -- Yes -- *6 219 | SINTER -- Yes -- *2 220 | SINTERSTORE -- Yes -- *2 221 | SISMEMBER -- Yes -- 222 | STRALGO -- No -- 223 | SLAVEOF -- No -- *H 224 | SLOWLOG -- Yes -- *5 225 | SMEMBERS -- Yes -- 226 | SMOVE -- Yes -- *2 227 | SORT -- Yes -- *2 *3 228 | SPOP -- Yes -- 229 | SRANDMEMBER -- Yes -- 230 | SREM -- Yes -- 231 | SSCAN -- Yes -- 232 | STRLEN -- Yes -- 233 | SUBSCRIBE -- No -- 234 | SUNION -- Yes -- *2 235 | SUNIONSTORE -- Yes -- *2 236 | SWAPDB -- No -- 237 | SYNC -- No -- 238 | TIME -- Yes -- *6 239 | TOUCH -- Yes -- 240 | TTL -- Yes -- 241 | TYPE -- Yes -- 242 | UNLINK -- Yes -- *4 243 | UNSUBSCRIBE -- No -- 244 | UNWATCH -- No -- 245 | WAIT -- No -- 246 | WATCH -- No -- 247 | XACK -- Yes -- 248 | XADD -- Yes -- 249 | XCLAIM -- Yes -- 250 | XDEL -- Yes -- 251 | XGROUP -- Yes -- 252 | XINFO -- Yes -- 253 | XLEN -- Yes -- 254 | XPENDING -- Yes -- 255 | XRANGE -- Yes -- 256 | XREAD -- Yes -- *L 257 | XREADGROUP -- Yes -- *L 258 | XREVRANGE -- Yes -- 259 | XTRIM -- Yes -- 260 | ZADD -- Yes -- 261 | ZCARD -- Yes -- 262 | ZCOUNT -- Yes -- 263 | ZINCRBY -- Yes -- 264 | ZINTERSTORE -- Yes 
-- *2 265 | ZLEXCOUNT -- Yes -- 266 | ZPOPMAX -- Yes -- 267 | ZPOPMIN -- Yes -- 268 | ZRANGE -- Yes -- 269 | ZRANGEBYLEX -- Yes -- 270 | ZRANGEBYSCORE -- Yes -- 271 | ZRANK -- Yes -- 272 | ZREM -- Yes -- 273 | ZREMRANGEBYLEX -- Yes -- 274 | ZREMRANGEBYRANK -- Yes -- 275 | ZREMRANGEBYSCORE -- Yes -- 276 | ZREVRANGE -- Yes -- 277 | ZREVRANGEBYLEX -- Yes -- 278 | ZREVRANGEBYSCORE -- Yes -- 279 | ZREVRANK -- Yes -- 280 | ZSCAN -- Yes -- 281 | ZSCORE -- Yes -- 282 | ZUNIONSTORE -- Yes -- *2 283 | 284 | Notes: 285 | *0 -- Scripts that affect no keys will run on a random backend. 286 | *1 -- Distribution of random keys may not be exactly uniform. RANDOMKEY is 287 | implemented by choosing a random backend and sending RANDOMKEY to it, so 288 | if backend A has more keys than backend B, the probability of returning 289 | each key from backend B is higher. 290 | *2 -- The affected keys must all be on the same backend. If they aren't, the 291 | command fails with PROXYERROR. 292 | *3 -- The proxy does not check that all the affected keys are on the same 293 | backend; the application has to do this itself. (This is because complex 294 | patterns may be given in e.g. the GET clause.) 295 | *4 -- These commands are atomic only on each backend; they are not atomic across 296 | all backends. 297 | *5 -- The proxy's response format is different than that of Redis - the proxy 298 | returns a multi response with one field per backend. If this isn't what 299 | you want, you can use FORWARD to interact with a single backend at once. 300 | *6 -- These commands are forwarded to all backends. 301 | *7 -- Since the proxy doesn't support multiple redis DBs, FLUSHDB is essentially 302 | equivalent to FLUSHALL. 303 | *8 -- INFO syntax is different from what redis-server expects. 
These are the 304 | valid forms of the INFO command in redis-shatter: 305 | - INFO - return proxy information 306 | - INFO BACKEND <backend-name> - return proxy stats for a backend 307 | - INFO <backend-name> [section] - send INFO to a specific backend and 308 | return its response verbatim 309 | For the first two forms, the response format is different as well, but 310 | should be mostly self-explanatory. The third form is equivalent to the 311 | command FORWARD <backend-name> INFO [section]. 312 | *9 -- May be slow and consume lots of memory if run on large datasets. It's 313 | almost always better to use SCAN instead. 314 | *A -- Keys may be considered or returned multiple times or be inaccessible 315 | through the proxy if they exist on the wrong backend. For example, if 'x' 316 | belongs on backend 3 but exists on backends 5 and 3, then 'x' will appear 317 | twice in the KEYS or SCAN results, or will be double-counted when running 318 | DBSIZE. If 'x' exists on backend 5 and not 3, then 'x' will appear once in 319 | the KEYS/SCAN results but 'GET x' will return nil. 320 | *B -- SCAN over the entire keyspace is implemented by scanning on each backend 321 | in turn. The proxy uses the highest-order bits of the cursor to keep track 322 | of which backend is currently being scanned. If the backend returns a 323 | cursor that has any of these bits set, the scan will fail. Most practical 324 | setups shouldn't run into this limit. 325 | *C -- The returned fields are different from redis-server. They are: 326 | - name: the client's name (this includes the host:port string). 327 | - cmdrecv: number of commands received from this client. 328 | - rspsent: number of responses sent to this client. 329 | - rspchain: response chain length (the number of responses that haven't 330 | been sent to the client yet because there isn't enough information to 331 | generate the response - some backends haven't replied yet).
332 | *D -- 1 will be returned for a script only if it exists on all backends - if it 333 | is missing on one or more backends, 0 is returned. 334 | *E -- These commands are implemented by forwarding them to a random backend. If 335 | the backends are not configured identically (i.e. some have rename-command 336 | directives in their configs and some don't) then the results may differ 337 | between subsequent calls. 338 | *F -- Names pertain only to the connection between the client and the proxy. 339 | *G -- EVALSHA is more likely to fail in a sharded environment, since the script 340 | needs to be loaded into all the backends' script caches for it to work on 341 | arbitrary keys. Although EVAL implicitly loads the script into the script 342 | cache, EVAL is only forwarded to one backend at a time. If you plan to run 343 | a script many times on different keys, use SCRIPT LOAD first to load the 344 | script on all backends at once. 345 | *H -- Note that any unsupported commands can still be run on individual backends 346 | by using the FORWARD command, but be careful when doing this. See below. 347 | *I -- Which command referred you to this note? 348 | *J -- If any backend returns an error, then MIGRATE returns a multi response 349 | containing the responses from all backends. If no backend returns an 350 | error, then MIGRATE returns NOKEY if all backends returned NOKEY, and OK 351 | otherwise. 352 | *K -- This command returns a multi response with two fields. The first field is 353 | the string "proxy"; the second field is a multi response containing the 354 | names of all of the backends. 355 | *L -- Blocking reads are not supported (the BLOCK argument to these commands 356 | must not be given). 357 | 358 | 359 | Administration 360 | -------------- 361 | 362 | redis-shatter implements many administrative commands that are omitted in other 363 | similar proxies. See the commands table above for details. 
364 | 365 | redis-shatter also implements some administrative commands that aren't part of 366 | the official Redis protocol. These commands are: 367 | 368 | BACKEND key [key ...] 369 | Returns the name of the backend on which the given key belongs, as a data 370 | response. If multiple keys are given, returns a multi response with one data 371 | element per key. 372 | 373 | BACKENDNUM key [key ...] 374 | Returns the number of the backend on which the given key belongs, as an 375 | integer response. If multiple keys are given, returns a multi response with 376 | one integer element per key. 377 | 378 | BACKENDS 379 | Returns a list of all backends. The items are formatted as "host:port@name". 380 | 381 | FORWARD backend-name command [args...] 382 | Forwards the given command directly to the given backend and return its 383 | response verbatim. You can use this in lieu of connecting directly to the 384 | backend. This command shares the backend connections with all other clients, 385 | so forwarding commands that affect connection state like MULTI or SELECT will 386 | cause misbehavior. 387 | 388 | FORWARD "" command [args...] 389 | Forwards the given command to all backends and returns a multi response 390 | containing the backends' responses. As noted above, don't forward commands 391 | that affect connection state. 392 | 393 | PRINTSTATE 394 | Prints the proxy's internal state to stderr. 395 | -------------------------------------------------------------------------------- /redis-shatter.conf.json: -------------------------------------------------------------------------------- 1 | { 2 | // Example configuration file for redis-shatter. 3 | // This is standard JSON, but (obviously) supports comments. 4 | 5 | // The configuration is a dictionary of proxy names to proxy instances. One 6 | // redis-shatter process can support many proxy instances, each of which can 7 | // be served by multiple threads. 
8 | 9 | "default": { 10 | // Number of threads to run for this proxy instance. Must be at least 1. 11 | // Incoming connections are pseudorandomly assigned to one of the instances. 12 | "num_threads": 4, 13 | 14 | // Which CPUs this proxy instance should use. 15 | // - If nonzero, each thread will run on exactly one of the CPUs given in 16 | // this mask. Setting this to -1 allows all CPUs to be used, but each 17 | // thread still runs on exactly one CPU. 18 | // - If zero, threads will not be assigned to any CPU. 19 | "affinity_cpus": -1, 20 | 21 | // Port and interface on which to listen. If omitted, the defaults are to 22 | // listen on all interfaces on port 6379. 23 | "interface": "0.0.0.0", 24 | "port": 6379, 25 | 26 | // List of backends. Order doesn't matter here. Keys are distributed over 27 | // these backends using a consistent hash ring with the fnv1a64 hash 28 | // function. (The ring's behavior can be changed with the hash_precision 29 | // setting below.) Backends have names that are independent of their network 30 | // location; this is used to relocate backends while keeping the same key 31 | // distribution. 32 | "backends": { 33 | "shard1": "localhost:6381", 34 | "shard2": "localhost:6382", 35 | "shard3": "localhost:6383", 36 | "shard4": "localhost:6384", 37 | "shard5": "localhost:6385", 38 | "shard6": "localhost:6386", 39 | "shard7": "localhost:6387", 40 | "shard8": "localhost:6388", 41 | }, 42 | 43 | // You can optionally disable some commands if you don't want redis-shatter 44 | // to forward them to backends. By default, we disable a few dangerous 45 | // commands. 46 | "disable_commands": ["FLUSHDB", "FLUSHALL", "KEYS"], 47 | 48 | // Hash precision and distribution scheme. 49 | // - If set to zero, redis-shatter uses the same log-time distribution 50 | // scheme as twemproxy (nutcracker), so it can be used with the same 51 | // backends as an existing twemproxy/nutcracker instance. 
52 | // - If set to a positive number, redis-shatter uses a constant-time 53 | // distribution scheme. The precision value determines the number of hash 54 | // bits to use in the ring lookup table. A higher value means more 55 | // uniformity but also more memory usage - the table uses (2^precision) 56 | // bytes in memory. 57 | // Changing this value for a non-empty cluster will cause some keys to be 58 | // "left behind" on the wrong backend and inaccessible through the proxy. 59 | "hash_precision": 17, 60 | 61 | // Hash field delimiters. These can be used to make sure keys hash to the 62 | // same backend. 63 | // 64 | // How it works: 65 | // - If a key contains both delimiters, then only the portion of the key 66 | // between the delimiters is hashed to determine which server to send the 67 | // key to. 68 | // - If a key contains only the begin delimiter, then the portion of the key 69 | // after the first occurrence of the begin delimiter is used. 70 | // - If a key contains only the end delimiter, then the portion of the key 71 | // before the last occurrence of the end delimiter is used. 72 | // - If the delimiters are the same and a key contains only one delimiter, 73 | // then it is treated as an end delimiter. (What happens is the same as 74 | // case 3.) 75 | // - If the end delimiter comes before the begin delimiter, then only 76 | // the end delimiter is used. (What happens is the same as case 3.) 
77 | // 78 | // Example: hash_field_begin="{", hash_field_end="}" 79 | // xy{z}, xy{z, z, x{z}y, z}xy, z}x{y all hash to the same server 80 | // 81 | // Example: hash_field_begin=":", hash_field_end=":" 82 | // xy:z:, z, z:xy all hash to the same server 83 | // xy:z, xy hash to the same server, which may not be the same as above 84 | // 85 | // Example: hash_field_begin missing, hash_field_end=":" 86 | // xy:z::, xy, xy: all hash to the same server 87 | // xyz, xyz:w hash to the same server, which may not be the same as above 88 | // 89 | // Example: hash_field_begin=":", hash_field_end missing 90 | // xy:z, z, x:y:z all hash to the same server 91 | // xyz, w:xyz hash to the same server, which may not be the same as above 92 | "hash_field_begin": "{", 93 | "hash_field_end": "}", 94 | }, 95 | } -------------------------------------------------------------------------------- /redis.conf: -------------------------------------------------------------------------------- 1 | # Redis configuration file example 2 | 3 | # Note on units: when memory size is needed, it is possible to specify 4 | # it in the usual form of 1k 5GB 4M and so forth: 5 | # 6 | # 1k => 1000 bytes 7 | # 1kb => 1024 bytes 8 | # 1m => 1000000 bytes 9 | # 1mb => 1024*1024 bytes 10 | # 1g => 1000000000 bytes 11 | # 1gb => 1024*1024*1024 bytes 12 | # 13 | # units are case insensitive so 1GB 1Gb 1gB are all the same. 14 | 15 | # By default Redis does not run as a daemon. Use 'yes' if you need it. 16 | # Note that Redis will write a pid file in /var/run/redis.pid when daemonized. 17 | daemonize no 18 | 19 | # When running daemonized, Redis writes a pid file in /var/run/redis.pid by 20 | # default. You can specify a custom pid file location here. 21 | pidfile /var/run/redis.pid 22 | 23 | # Accept connections on the specified port, default is 6379. 24 | # If port 0 is specified Redis will not listen on a TCP socket. 
25 | port 638@@REDIS_NUM@@ 26 | 27 | # If you want you can bind a single interface, if the bind option is not 28 | # specified all the interfaces will listen for incoming connections. 29 | # 30 | bind 127.0.0.1 31 | 32 | # Specify the path for the unix socket that will be used to listen for 33 | # incoming connections. There is no default, so Redis will not listen 34 | # on a unix socket when not specified. 35 | # 36 | # unixsocket /tmp/redis.sock 37 | # unixsocketperm 755 38 | 39 | # Close the connection after a client is idle for N seconds (0 to disable) 40 | timeout 0 41 | 42 | # TCP keepalive. 43 | # 44 | # If non-zero, use SO_KEEPALIVE to send TCP ACKs to clients in absence 45 | # of communication. This is useful for two reasons: 46 | # 47 | # 1) Detect dead peers. 48 | # 2) Take the connection alive from the point of view of network 49 | # equipment in the middle. 50 | # 51 | # On Linux, the specified value (in seconds) is the period used to send ACKs. 52 | # Note that to close the connection the double of the time is needed. 53 | # On other kernels the period depends on the kernel configuration. 54 | # 55 | # A reasonable value for this option is 60 seconds. 56 | tcp-keepalive 0 57 | 58 | # Specify the server verbosity level. 59 | # This can be one of: 60 | # debug (a lot of information, useful for development/testing) 61 | # verbose (many rarely useful info, but not a mess like the debug level) 62 | # notice (moderately verbose, what you want in production probably) 63 | # warning (only very important / critical messages are logged) 64 | loglevel notice 65 | 66 | # Specify the log file name. Also 'stdout' can be used to force 67 | # Redis to log on the standard output. Note that if you use standard 68 | # output for logging but daemonize, logs will be sent to /dev/null 69 | logfile stdout 70 | 71 | # To enable logging to the system logger, just set 'syslog-enabled' to yes, 72 | # and optionally update the other syslog parameters to suit your needs. 
73 | # syslog-enabled no 74 | 75 | # Specify the syslog identity. 76 | # syslog-ident redis 77 | 78 | # Specify the syslog facility. Must be USER or between LOCAL0-LOCAL7. 79 | # syslog-facility local0 80 | 81 | # Set the number of databases. The default database is DB 0, you can select 82 | # a different one on a per-connection basis using SELECT <dbid> where 83 | # dbid is a number between 0 and 'databases'-1 84 | databases 16 85 | 86 | ################################ SNAPSHOTTING ################################# 87 | # 88 | # Save the DB on disk: 89 | # 90 | # save <seconds> <changes> 91 | # 92 | # Will save the DB if both the given number of seconds and the given 93 | # number of write operations against the DB occurred. 94 | # 95 | # In the example below the behaviour will be to save: 96 | # after 900 sec (15 min) if at least 1 key changed 97 | # after 300 sec (5 min) if at least 10 keys changed 98 | # after 60 sec if at least 10000 keys changed 99 | # 100 | # Note: you can disable saving at all commenting all the "save" lines. 101 | # 102 | # It is also possible to remove all the previously configured save 103 | # points by adding a save directive with a single empty string argument 104 | # like in the following example: 105 | # 106 | # save "" 107 | 108 | #save 900 1 109 | #save 300 10 110 | #save 60 10000 111 | 112 | # By default Redis will stop accepting writes if RDB snapshots are enabled 113 | # (at least one save point) and the latest background save failed. 114 | # This will make the user aware (in an hard way) that data is not persisting 115 | # on disk properly, otherwise chances are that no one will notice and some 116 | # disaster will happen. 117 | # 118 | # If the background saving process will start working again Redis will 119 | # automatically allow writes again.
120 | # 121 | # However if you have setup your proper monitoring of the Redis server 122 | # and persistence, you may want to disable this feature so that Redis will 123 | # continue to work as usually even if there are problems with disk, 124 | # permissions, and so forth. 125 | stop-writes-on-bgsave-error yes 126 | 127 | # Compress string objects using LZF when dump .rdb databases? 128 | # For default that's set to 'yes' as it's almost always a win. 129 | # If you want to save some CPU in the saving child set it to 'no' but 130 | # the dataset will likely be bigger if you have compressible values or keys. 131 | rdbcompression yes 132 | 133 | # Since version 5 of RDB a CRC64 checksum is placed at the end of the file. 134 | # This makes the format more resistant to corruption but there is a performance 135 | # hit to pay (around 10%) when saving and loading RDB files, so you can disable it 136 | # for maximum performances. 137 | # 138 | # RDB files created with checksum disabled have a checksum of zero that will 139 | # tell the loading code to skip the check. 140 | rdbchecksum yes 141 | 142 | # The filename where to dump the DB 143 | dbfilename dump.rdb 144 | 145 | # The working directory. 146 | # 147 | # The DB will be written inside this directory, with the filename specified 148 | # above using the 'dbfilename' configuration directive. 149 | # 150 | # The Append Only File will also be created inside this directory. 151 | # 152 | # Note that you must specify a directory here, not a file name. 153 | dir ./ 154 | 155 | ################################# REPLICATION ################################# 156 | 157 | # Master-Slave replication. Use slaveof to make a Redis instance a copy of 158 | # another Redis server. Note that the configuration is local to the slave 159 | # so for example it is possible to configure the slave to save the DB with a 160 | # different interval, or to listen to another port, and so on. 
161 | # 162 | # slaveof 163 | 164 | # If the master is password protected (using the "requirepass" configuration 165 | # directive below) it is possible to tell the slave to authenticate before 166 | # starting the replication synchronization process, otherwise the master will 167 | # refuse the slave request. 168 | # 169 | # masterauth 170 | 171 | # When a slave loses its connection with the master, or when the replication 172 | # is still in progress, the slave can act in two different ways: 173 | # 174 | # 1) if slave-serve-stale-data is set to 'yes' (the default) the slave will 175 | # still reply to client requests, possibly with out of date data, or the 176 | # data set may just be empty if this is the first synchronization. 177 | # 178 | # 2) if slave-serve-stale-data is set to 'no' the slave will reply with 179 | # an error "SYNC with master in progress" to all the kind of commands 180 | # but to INFO and SLAVEOF. 181 | # 182 | slave-serve-stale-data yes 183 | 184 | # You can configure a slave instance to accept writes or not. Writing against 185 | # a slave instance may be useful to store some ephemeral data (because data 186 | # written on a slave will be easily deleted after resync with the master) but 187 | # may also cause problems if clients are writing to it because of a 188 | # misconfiguration. 189 | # 190 | # Since Redis 2.6 by default slaves are read-only. 191 | # 192 | # Note: read only slaves are not designed to be exposed to untrusted clients 193 | # on the internet. It's just a protection layer against misuse of the instance. 194 | # Still a read only slave exports by default all the administrative commands 195 | # such as CONFIG, DEBUG, and so forth. To a limited extend you can improve 196 | # security of read only slaves using 'rename-command' to shadow all the 197 | # administrative / dangerous commands. 198 | slave-read-only yes 199 | 200 | # Slaves send PINGs to server in a predefined interval. 
It's possible to change 201 | # this interval with the repl_ping_slave_period option. The default value is 10 202 | # seconds. 203 | # 204 | # repl-ping-slave-period 10 205 | 206 | # The following option sets a timeout for both Bulk transfer I/O timeout and 207 | # master data or ping response timeout. The default value is 60 seconds. 208 | # 209 | # It is important to make sure that this value is greater than the value 210 | # specified for repl-ping-slave-period otherwise a timeout will be detected 211 | # every time there is low traffic between the master and the slave. 212 | # 213 | # repl-timeout 60 214 | 215 | # Disable TCP_NODELAY on the slave socket after SYNC? 216 | # 217 | # If you select "yes" Redis will use a smaller number of TCP packets and 218 | # less bandwidth to send data to slaves. But this can add a delay for 219 | # the data to appear on the slave side, up to 40 milliseconds with 220 | # Linux kernels using a default configuration. 221 | # 222 | # If you select "no" the delay for data to appear on the slave side will 223 | # be reduced but more bandwidth will be used for replication. 224 | # 225 | # By default we optimize for low latency, but in very high traffic conditions 226 | # or when the master and slaves are many hops away, turning this to "yes" may 227 | # be a good idea. 228 | repl-disable-tcp-nodelay no 229 | 230 | # The slave priority is an integer number published by Redis in the INFO output. 231 | # It is used by Redis Sentinel in order to select a slave to promote into a 232 | # master if the master is no longer working correctly. 233 | # 234 | # A slave with a low priority number is considered better for promotion, so 235 | # for instance if there are three slaves with priority 10, 100, 25 Sentinel will 236 | # pick the one with priority 10, that is the lowest. 
237 | # 238 | # However a special priority of 0 marks the slave as not able to perform the 239 | # role of master, so a slave with priority of 0 will never be selected by 240 | # Redis Sentinel for promotion. 241 | # 242 | # By default the priority is 100. 243 | slave-priority 100 244 | 245 | ################################## SECURITY ################################### 246 | 247 | # Require clients to issue AUTH before processing any other 248 | # commands. This might be useful in environments in which you do not trust 249 | # others with access to the host running redis-server. 250 | # 251 | # This should stay commented out for backward compatibility and because most 252 | # people do not need auth (e.g. they run their own servers). 253 | # 254 | # Warning: since Redis is pretty fast an outside user can try up to 255 | # 150k passwords per second against a good box. This means that you should 256 | # use a very strong password otherwise it will be very easy to break. 257 | # 258 | # requirepass foobared 259 | 260 | # Command renaming. 261 | # 262 | # It is possible to change the name of dangerous commands in a shared 263 | # environment. For instance the CONFIG command may be renamed into something 264 | # hard to guess so that it will still be available for internal-use tools 265 | # but not available for general clients. 266 | # 267 | # Example: 268 | # 269 | # rename-command CONFIG b840fc02d524045429941cc15f59e41cb7be6c52 270 | # 271 | # It is also possible to completely kill a command by renaming it into 272 | # an empty string: 273 | # 274 | # rename-command CONFIG "" 275 | # 276 | # Please note that changing the name of commands that are logged into the 277 | # AOF file or transmitted to slaves may cause problems. 278 | 279 | ################################### LIMITS #################################### 280 | 281 | # Set the max number of connected clients at the same time. 
By default 282 | # this limit is set to 10000 clients, however if the Redis server is not 283 | # able to configure the process file limit to allow for the specified limit 284 | # the max number of allowed clients is set to the current file limit 285 | # minus 32 (as Redis reserves a few file descriptors for internal uses). 286 | # 287 | # Once the limit is reached Redis will close all the new connections sending 288 | # an error 'max number of clients reached'. 289 | # 290 | # maxclients 10000 291 | 292 | # Don't use more memory than the specified amount of bytes. 293 | # When the memory limit is reached Redis will try to remove keys 294 | # accordingly to the eviction policy selected (see maxmemory-policy). 295 | # 296 | # If Redis can't remove keys according to the policy, or if the policy is 297 | # set to 'noeviction', Redis will start to reply with errors to commands 298 | # that would use more memory, like SET, LPUSH, and so on, and will continue 299 | # to reply to read-only commands like GET. 300 | # 301 | # This option is usually useful when using Redis as an LRU cache, or to set 302 | # an hard memory limit for an instance (using the 'noeviction' policy). 303 | # 304 | # WARNING: If you have slaves attached to an instance with maxmemory on, 305 | # the size of the output buffers needed to feed the slaves are subtracted 306 | # from the used memory count, so that network problems / resyncs will 307 | # not trigger a loop where keys are evicted, and in turn the output 308 | # buffer of slaves is full with DELs of keys evicted triggering the deletion 309 | # of more keys, and so forth until the database is completely emptied. 310 | # 311 | # In short... if you have slaves attached it is suggested that you set a lower 312 | # limit for maxmemory so that there is some free RAM on the system for slave 313 | # output buffers (but this is not needed if the policy is 'noeviction'). 
314 | # 315 | # maxmemory 316 | 317 | # MAXMEMORY POLICY: how Redis will select what to remove when maxmemory 318 | # is reached. You can select among five behaviors: 319 | # 320 | # volatile-lru -> remove the key with an expire set using an LRU algorithm 321 | # allkeys-lru -> remove any key accordingly to the LRU algorithm 322 | # volatile-random -> remove a random key with an expire set 323 | # allkeys-random -> remove a random key, any key 324 | # volatile-ttl -> remove the key with the nearest expire time (minor TTL) 325 | # noeviction -> don't expire at all, just return an error on write operations 326 | # 327 | # Note: with any of the above policies, Redis will return an error on write 328 | # operations, when there are not suitable keys for eviction. 329 | # 330 | # At the date of writing this commands are: set setnx setex append 331 | # incr decr rpush lpush rpushx lpushx linsert lset rpoplpush sadd 332 | # sinter sinterstore sunion sunionstore sdiff sdiffstore zadd zincrby 333 | # zunionstore zinterstore hset hsetnx hmset hincrby incrby decrby 334 | # getset mset msetnx exec sort 335 | # 336 | # The default is: 337 | # 338 | # maxmemory-policy volatile-lru 339 | 340 | # LRU and minimal TTL algorithms are not precise algorithms but approximated 341 | # algorithms (in order to save memory), so you can select as well the sample 342 | # size to check. For instance for default Redis will check three keys and 343 | # pick the one that was used less recently, you can change the sample size 344 | # using the following configuration directive. 345 | # 346 | # maxmemory-samples 3 347 | 348 | ############################## APPEND ONLY MODE ############################### 349 | 350 | # By default Redis asynchronously dumps the dataset on disk. This mode is 351 | # good enough in many applications, but an issue with the Redis process or 352 | # a power outage may result into a few minutes of writes lost (depending on 353 | # the configured save points). 
354 | # 355 | # The Append Only File is an alternative persistence mode that provides 356 | # much better durability. For instance using the default data fsync policy 357 | # (see later in the config file) Redis can lose just one second of writes in a 358 | # dramatic event like a server power outage, or a single write if something 359 | # wrong with the Redis process itself happens, but the operating system is 360 | # still running correctly. 361 | # 362 | # AOF and RDB persistence can be enabled at the same time without problems. 363 | # If the AOF is enabled on startup Redis will load the AOF, that is the file 364 | # with the better durability guarantees. 365 | # 366 | # Please check http://redis.io/topics/persistence for more information. 367 | 368 | appendonly no 369 | 370 | # The name of the append only file (default: "appendonly.aof") 371 | # appendfilename appendonly.aof 372 | 373 | # The fsync() call tells the Operating System to actually write data on disk 374 | # instead to wait for more data in the output buffer. Some OS will really flush 375 | # data on disk, some other OS will just try to do it ASAP. 376 | # 377 | # Redis supports three different modes: 378 | # 379 | # no: don't fsync, just let the OS flush the data when it wants. Faster. 380 | # always: fsync after every write to the append only log . Slow, Safest. 381 | # everysec: fsync only one time every second. Compromise. 382 | # 383 | # The default is "everysec", as that's usually the right compromise between 384 | # speed and data safety. It's up to you to understand if you can relax this to 385 | # "no" that will let the operating system flush the output buffer when 386 | # it wants, for better performances (but if you can live with the idea of 387 | # some data loss consider the default persistence mode that's snapshotting), 388 | # or on the contrary, use "always" that's very slow but a bit safer than 389 | # everysec. 
390 | # 391 | # More details please check the following article: 392 | # http://antirez.com/post/redis-persistence-demystified.html 393 | # 394 | # If unsure, use "everysec". 395 | 396 | # appendfsync always 397 | appendfsync everysec 398 | # appendfsync no 399 | 400 | # When the AOF fsync policy is set to always or everysec, and a background 401 | # saving process (a background save or AOF log background rewriting) is 402 | # performing a lot of I/O against the disk, in some Linux configurations 403 | # Redis may block too long on the fsync() call. Note that there is no fix for 404 | # this currently, as even performing fsync in a different thread will block 405 | # our synchronous write(2) call. 406 | # 407 | # In order to mitigate this problem it's possible to use the following option 408 | # that will prevent fsync() from being called in the main process while a 409 | # BGSAVE or BGREWRITEAOF is in progress. 410 | # 411 | # This means that while another child is saving, the durability of Redis is 412 | # the same as "appendfsync none". In practical terms, this means that it is 413 | # possible to lose up to 30 seconds of log in the worst scenario (with the 414 | # default Linux settings). 415 | # 416 | # If you have latency problems turn this to "yes". Otherwise leave it as 417 | # "no" that is the safest pick from the point of view of durability. 418 | no-appendfsync-on-rewrite no 419 | 420 | # Automatic rewrite of the append only file. 421 | # Redis is able to automatically rewrite the log file implicitly calling 422 | # BGREWRITEAOF when the AOF log size grows by the specified percentage. 423 | # 424 | # This is how it works: Redis remembers the size of the AOF file after the 425 | # latest rewrite (if no rewrite has happened since the restart, the size of 426 | # the AOF at startup is used). 427 | # 428 | # This base size is compared to the current size. If the current size is 429 | # bigger than the specified percentage, the rewrite is triggered. 
Also 430 | # you need to specify a minimal size for the AOF file to be rewritten, this 431 | # is useful to avoid rewriting the AOF file even if the percentage increase 432 | # is reached but it is still pretty small. 433 | # 434 | # Specify a percentage of zero in order to disable the automatic AOF 435 | # rewrite feature. 436 | 437 | auto-aof-rewrite-percentage 100 438 | auto-aof-rewrite-min-size 64mb 439 | 440 | ################################ LUA SCRIPTING ############################### 441 | 442 | # Max execution time of a Lua script in milliseconds. 443 | # 444 | # If the maximum execution time is reached Redis will log that a script is 445 | # still in execution after the maximum allowed time and will start to 446 | # reply to queries with an error. 447 | # 448 | # When a long running script exceed the maximum execution time only the 449 | # SCRIPT KILL and SHUTDOWN NOSAVE commands are available. The first can be 450 | # used to stop a script that did not yet called write commands. The second 451 | # is the only way to shut down the server in the case a write commands was 452 | # already issue by the script but the user don't want to wait for the natural 453 | # termination of the script. 454 | # 455 | # Set it to 0 or a negative value for unlimited execution without warnings. 456 | lua-time-limit 5000 457 | 458 | ################################## SLOW LOG ################################### 459 | 460 | # The Redis Slow Log is a system to log queries that exceeded a specified 461 | # execution time. The execution time does not include the I/O operations 462 | # like talking with the client, sending the reply and so forth, 463 | # but just the time needed to actually execute the command (this is the only 464 | # stage of command execution where the thread is blocked and can not serve 465 | # other requests in the meantime). 
466 | # 467 | # You can configure the slow log with two parameters: one tells Redis 468 | # what is the execution time, in microseconds, to exceed in order for the 469 | # command to get logged, and the other parameter is the length of the 470 | # slow log. When a new command is logged the oldest one is removed from the 471 | # queue of logged commands. 472 | 473 | # The following time is expressed in microseconds, so 1000000 is equivalent 474 | # to one second. Note that a negative number disables the slow log, while 475 | # a value of zero forces the logging of every command. 476 | slowlog-log-slower-than 10000 477 | 478 | # There is no limit to this length. Just be aware that it will consume memory. 479 | # You can reclaim memory used by the slow log with SLOWLOG RESET. 480 | slowlog-max-len 128 481 | 482 | ############################### ADVANCED CONFIG ############################### 483 | 484 | # Hashes are encoded using a memory efficient data structure when they have a 485 | # small number of entries, and the biggest entry does not exceed a given 486 | # threshold. These thresholds can be configured using the following directives. 487 | hash-max-ziplist-entries 512 488 | hash-max-ziplist-value 64 489 | 490 | # Similarly to hashes, small lists are also encoded in a special way in order 491 | # to save a lot of space. The special representation is only used when 492 | # you are under the following limits: 493 | list-max-ziplist-entries 512 494 | list-max-ziplist-value 64 495 | 496 | # Sets have a special encoding in just one case: when a set is composed 497 | # of just strings that happens to be integers in radix 10 in the range 498 | # of 64 bit signed integers. 499 | # The following configuration setting sets the limit in the size of the 500 | # set in order to use this special memory saving encoding. 501 | set-max-intset-entries 512 502 | 503 | # Similarly to hashes and lists, sorted sets are also specially encoded in 504 | # order to save a lot of space. 
This encoding is only used when the length and 505 | # elements of a sorted set are below the following limits: 506 | zset-max-ziplist-entries 128 507 | zset-max-ziplist-value 64 508 | 509 | # Active rehashing uses 1 millisecond every 100 milliseconds of CPU time in 510 | # order to help rehashing the main Redis hash table (the one mapping top-level 511 | # keys to values). The hash table implementation Redis uses (see dict.c) 512 | # performs a lazy rehashing: the more operation you run into an hash table 513 | # that is rehashing, the more rehashing "steps" are performed, so if the 514 | # server is idle the rehashing is never complete and some more memory is used 515 | # by the hash table. 516 | # 517 | # The default is to use this millisecond 10 times every second in order to 518 | # active rehashing the main dictionaries, freeing memory when possible. 519 | # 520 | # If unsure: 521 | # use "activerehashing no" if you have hard latency requirements and it is 522 | # not a good thing in your environment that Redis can reply from time to time 523 | # to queries with 2 milliseconds delay. 524 | # 525 | # use "activerehashing yes" if you don't have such hard requirements but 526 | # want to free memory asap when possible. 527 | activerehashing yes 528 | 529 | # The client output buffer limits can be used to force disconnection of clients 530 | # that are not reading data from the server fast enough for some reason (a 531 | # common reason is that a Pub/Sub client can't consume messages as fast as the 532 | # publisher can produce them). 
533 | # 534 | # The limit can be set differently for the three different classes of clients: 535 | # 536 | # normal -> normal clients 537 | # slave -> slave clients and MONITOR clients 538 | # pubsub -> clients subscribed to at least one pubsub channel or pattern 539 | # 540 | # The syntax of every client-output-buffer-limit directive is the following: 541 | # 542 | # client-output-buffer-limit 543 | # 544 | # A client is immediately disconnected once the hard limit is reached, or if 545 | # the soft limit is reached and remains reached for the specified number of 546 | # seconds (continuously). 547 | # So for instance if the hard limit is 32 megabytes and the soft limit is 548 | # 16 megabytes / 10 seconds, the client will get disconnected immediately 549 | # if the size of the output buffers reach 32 megabytes, but will also get 550 | # disconnected if the client reaches 16 megabytes and continuously overcomes 551 | # the limit for 10 seconds. 552 | # 553 | # By default normal clients are not limited because they don't receive data 554 | # without asking (in a push way), but just after a request, so only 555 | # asynchronous clients may create a scenario where data is requested faster 556 | # than it can read. 557 | # 558 | # Instead there is a default limit for pubsub and slave clients, since 559 | # subscribers and slaves receive data in a push fashion. 560 | # 561 | # Both the hard or the soft limit can be disabled by setting them to zero. 562 | client-output-buffer-limit normal 0 0 0 563 | client-output-buffer-limit slave 256mb 64mb 60 564 | client-output-buffer-limit pubsub 32mb 8mb 60 565 | 566 | # Redis calls an internal function to perform many background tasks, like 567 | # closing connections of clients in timeout, purging expired keys that are 568 | # never requested, and so forth. 569 | # 570 | # Not all tasks are performed with the same frequency, but Redis checks for 571 | # tasks to perform accordingly to the specified "hz" value. 
572 | # 573 | # By default "hz" is set to 10. Raising the value will use more CPU when 574 | # Redis is idle, but at the same time will make Redis more responsive when 575 | # there are many keys expiring at the same time, and timeouts may be 576 | # handled with more precision. 577 | # 578 | # The range is between 1 and 500, however a value over 100 is usually not 579 | # a good idea. Most users should use the default of 10 and raise this up to 580 | # 100 only in environments where very low latency is required. 581 | hz 10 582 | 583 | # When a child rewrites the AOF file, if the following option is enabled 584 | # the file will be fsync-ed every 32 MB of data generated. This is useful 585 | # in order to commit the file to the disk more incrementally and avoid 586 | # big latency spikes. 587 | aof-rewrite-incremental-fsync yes 588 | 589 | ################################## INCLUDES ################################### 590 | 591 | # Include one or more other config files here. This is useful if you 592 | # have a standard template that goes to all Redis server but also need 593 | # to customize a few per-server settings. Include files can include 594 | # other files, so use this wisely. 595 | # 596 | # include /path/to/local.conf 597 | # include /path/to/other.conf 598 | -------------------------------------------------------------------------------- /run_multiple_redis.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | seq 1 8 | xargs -I {} -P 8 bash -c "cat redis.conf | sed s/@@REDIS_NUM@@/{}/g | redis-server -" 4 | -------------------------------------------------------------------------------- /run_tests.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ./run_multiple_redis.sh & 4 | RUN_MULTI_PID=$! 5 | sleep 1 6 | 7 | ./redis-shatter redis-shatter-test.conf.json & 8 | SHATTER_PID=$! 
9 | sleep 1 10 | 11 | for TEST in *Test 12 | do 13 | ./$TEST 14 | if [ "$?" != "0" ] 15 | then 16 | FAILURES_FOUND=1 17 | fi 18 | done 19 | 20 | kill -TERM $SHATTER_PID 21 | # TODO: we should kill run_multiple_redis.sh directly too, and make it forward 22 | # the signal to the redis procs 23 | ps aux | grep redis-server | grep -v grep | grep -v xargs | awk '{print $2;}' | xargs kill -TERM 24 | 25 | echo -e "\n\n\n" 26 | if [ "$FAILURES_FOUND" == "1" ] 27 | then 28 | echo "Some tests failed!" 29 | exit 1 30 | else 31 | echo "All tests passed" 32 | exit 0 33 | fi 34 | --------------------------------------------------------------------------------