├── .gitignore ├── FunctionalTest.cc ├── LICENSE ├── Main.cc ├── Makefile ├── NutcrackerConsistentHashRing.cc ├── NutcrackerConsistentHashRing.hh ├── Protocol.cc ├── Protocol.hh ├── ProtocolTest.cc ├── Proxy.cc ├── Proxy.hh ├── README ├── redis-shatter.conf.json ├── redis.conf ├── run_multiple_redis.sh └── run_tests.sh /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | *.o 3 | gmon.out 4 | redis-shatter 5 | ProtocolTest 6 | FunctionalTest 7 | -------------------------------------------------------------------------------- /FunctionalTest.cc: -------------------------------------------------------------------------------- 1 | #define _STDC_FORMAT_MACROS 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | #include "Protocol.hh" 18 | 19 | using namespace std; 20 | 21 | 22 | shared_ptr parse_response(const char* contents) { 23 | unique_ptr buf(evbuffer_new(), 24 | evbuffer_free); 25 | evbuffer_add(buf.get(), contents, strlen(contents)); 26 | return ResponseParser().resume(buf.get()); 27 | } 28 | 29 | shared_ptr test_expect_response(const char* host, int port, 30 | const char* expected_response, ...) 
{ 31 | 32 | DataCommand cmd; 33 | 34 | va_list va; 35 | va_start(va, expected_response); 36 | const char* arg; 37 | while ((arg = va_arg(va, const char*))) { 38 | cmd.args.emplace_back(arg); 39 | } 40 | va_end(va); 41 | 42 | shared_ptr r; 43 | { 44 | scoped_fd fd = connect(host, port, false); // not nonblocking 45 | expect_ge(fd, 0); 46 | 47 | unique_ptr buf(evbuffer_new(), 48 | evbuffer_free); 49 | cmd.write(buf.get()); 50 | evbuffer_write(buf.get(), fd); 51 | evbuffer_drain(buf.get(), evbuffer_get_length(buf.get())); 52 | 53 | evbuffer_read(buf.get(), fd, 1024 * 128); 54 | r = ResponseParser().resume(buf.get()); 55 | } 56 | 57 | if (expected_response) { 58 | shared_ptr expected_r = parse_response(expected_response); 59 | expect(expected_r.get()); // if this fails, the test itself is broken 60 | 61 | if (!r.get()) { 62 | fprintf(stderr, "cmd = "); 63 | cmd.print(stderr); 64 | fprintf(stderr, "\nexpected = "); 65 | expected_r->print(stderr); 66 | fprintf(stderr, "\nactual = (not present)\n"); 67 | expect(false); 68 | } 69 | if (*r != *expected_r) { 70 | fprintf(stderr, "cmd = "); 71 | cmd.print(stderr); 72 | fprintf(stderr, "\nexpected = "); 73 | expected_r->print(stderr); 74 | fprintf(stderr, "\nactual = "); 75 | r->print(stderr); 76 | fprintf(stderr, "\n"); 77 | expect(false); 78 | } 79 | } 80 | 81 | return expected_response ? 
NULL : r; 82 | } 83 | 84 | int main(int argc, char* argv[]) { 85 | 86 | printf("functional tests\n"); 87 | printf("we expect redis-shatter to be running with all backends connected\n"); 88 | 89 | { 90 | printf("-- unimplemented commands return PROXYERROR\n"); 91 | 92 | const vector unimplemented_commands = { 93 | "AUTH", "BLPOP", "BRPOP", "BRPOPLPUSH", "DISCARD", "EXEC", "MONITOR", 94 | "MOVE", "MULTI", "PSUBSCRIBE", "PUBSUB", "PUBLISH", "PUNSUBSCRIBE", 95 | "SELECT", "SLAVEOF", "SUBSCRIBE", "SYNC", "UNSUBSCRIBE", "UNWATCH", 96 | "WATCH"}; 97 | 98 | for (const auto& cmd : unimplemented_commands) { 99 | test_expect_response("localhost", 6379, 100 | "-PROXYERROR command not supported\r\n", cmd.c_str(), NULL); 101 | } 102 | } 103 | 104 | { 105 | printf("-- PING\n"); 106 | test_expect_response("localhost", 6379, "+PONG\r\n", "PING", NULL); 107 | } 108 | 109 | { 110 | printf("-- ECHO\n"); 111 | test_expect_response("localhost", 6379, "$3\r\nLOL\r\n", "ECHO", "LOL", NULL); 112 | } 113 | 114 | { 115 | printf("-- FLUSHALL, DBSIZE\n"); 116 | test_expect_response("localhost", 6379, "+OK\r\n", "FLUSHALL", NULL); 117 | test_expect_response("localhost", 6379, ":0\r\n", "DBSIZE", NULL); 118 | } 119 | 120 | { 121 | printf("-- GET, SET, GETSET, MGET, MSET, DEL\n"); 122 | test_expect_response("localhost", 6379, "$-1\r\n", "GET", "x", NULL); 123 | test_expect_response("localhost", 6379, "+OK\r\n", "SET", "x", "23", NULL); 124 | test_expect_response("localhost", 6379, "$2\r\n23\r\n", "GET", "x", NULL); 125 | test_expect_response("localhost", 6379, "$2\r\n23\r\n", "GETSET", "x", "45", NULL); 126 | test_expect_response("localhost", 6379, "$2\r\n45\r\n", "GET", "x", NULL); 127 | test_expect_response("localhost", 6379, "*3\r\n$2\r\n45\r\n$-1\r\n$-1\r\n", "MGET", "x", "y", "z", NULL); 128 | test_expect_response("localhost", 6379, "+OK\r\n", "MSET", "x", "1", "y", "2", "z", "3", NULL); 129 | test_expect_response("localhost", 6379, "*3\r\n$1\r\n1\r\n$1\r\n2\r\n$1\r\n3\r\n", "MGET", "x", 
"y", "z", NULL); 130 | test_expect_response("localhost", 6379, ":2\r\n", "DEL", "x", "y", "w", NULL); 131 | } 132 | 133 | { 134 | printf("-- proxy commands: FORWARD, BACKENDS, BACKENDNUM\n"); 135 | test_expect_response("localhost", 6379, "+PONG\r\n", "FORWARD", "0", "PING", NULL); 136 | 137 | auto r = test_expect_response("localhost", 6379, NULL, "BACKENDS", NULL); 138 | expect_eq(r->type, Response::Type::Multi); 139 | size_t num_backends = r->fields.size(); 140 | printf("---- note: there are %zu backends\n", num_backends); 141 | 142 | r = test_expect_response("localhost", 6379, NULL, "BACKENDNUM", "z", NULL); 143 | expect_eq(r->type, Response::Type::Integer); 144 | int64_t z_backend = r->int_value; 145 | printf("---- note: \'z\' goes to backend %" PRId64 "\n", z_backend); 146 | 147 | string z_backend_str = string_printf("%" PRId64, z_backend); 148 | test_expect_response("localhost", 6379, "$1\r\n3\r\n", "GET", "z", NULL); 149 | test_expect_response("localhost", 6379, "$1\r\n3\r\n", "FORWARD", 150 | z_backend_str.c_str(), "GET", "z", NULL); 151 | } 152 | 153 | { 154 | printf("-- FLUSHDB, DBSIZE\n"); 155 | test_expect_response("localhost", 6379, "+OK\r\n", "FLUSHDB", NULL); 156 | test_expect_response("localhost", 6379, ":0\r\n", "DBSIZE", NULL); 157 | test_expect_response("localhost", 6379, "*3\r\n$-1\r\n$-1\r\n$-1\r\n", "MGET", "x", "y", "z", NULL); 158 | } 159 | 160 | { 161 | printf("-- MSETNX, RENAME\n"); 162 | test_expect_response("localhost", 6379, "-PROXYERROR keys are on different backends\r\n", "MSETNX", "x{abc}", "a", "y{abc}", "b", "z{bbc}", "b", NULL); 163 | test_expect_response("localhost", 6379, ":1\r\n", "MSETNX", "x{abc}", "a", "y{abc}", "b", NULL); 164 | test_expect_response("localhost", 6379, ":0\r\n", "MSETNX", "x{abc}", "a", "y{abc}", "b", "z{abc}", "c", NULL); 165 | test_expect_response("localhost", 6379, ":1\r\n", "MSETNX", "z{abd}", "b", NULL); 166 | 167 | // make sure the keys are on the same backend 168 | auto backend_x_resp = 
test_expect_response("localhost", 6379, NULL, "BACKENDNUM", "x{abc}", NULL); 169 | auto backend_y_resp = test_expect_response("localhost", 6379, NULL, "BACKENDNUM", "y{abc}", NULL); 170 | auto backend_z_resp = test_expect_response("localhost", 6379, NULL, "BACKENDNUM", "z{bbc}", NULL); 171 | expect_eq(*backend_x_resp, *backend_y_resp); 172 | expect_ne(*backend_x_resp, *backend_z_resp); 173 | 174 | test_expect_response("localhost", 6379, "-PROXYERROR keys are on different backends\r\n", "RENAME", "x{abc}", "x{bbc}", NULL); 175 | test_expect_response("localhost", 6379, "+OK\r\n", "RENAME", "x{abc}", "y{abc}", NULL); 176 | test_expect_response("localhost", 6379, "+OK\r\n", "RENAME", "y{abc}", "zxcvbnm{abc}", NULL); 177 | 178 | test_expect_response("localhost", 6379, "-PROXYERROR keys are on different backends\r\n", "RENAME", "z{bbc}", "z{abc}", NULL); 179 | test_expect_response("localhost", 6379, "+OK\r\n", "RENAME", "z{abd}", "y{abd}", NULL); 180 | test_expect_response("localhost", 6379, "+OK\r\n", "RENAME", "y{abd}", "zxcvbnm{abd}", NULL); 181 | } 182 | 183 | printf("all tests passed\n"); 184 | return 0; 185 | } 186 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2017 Martin Michelsen 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of 6 | this software and associated documentation files (the "Software"), to deal in 7 | the Software without restriction, including without limitation the rights to 8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 9 | the Software, and to permit persons to whom the Software is furnished to do so, 10 | subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 17 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 18 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 19 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 20 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 21 | -------------------------------------------------------------------------------- /Main.cc: -------------------------------------------------------------------------------- 1 | #define _STDC_FORMAT_MACROS 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | #ifdef __APPLE__ 17 | #include 18 | #include 19 | #endif 20 | 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | 30 | #include "NutcrackerConsistentHashRing.hh" 31 | #include "Proxy.hh" 32 | 33 | using namespace std; 34 | 35 | 36 | bool set_thread_affinity(pthread_t thread, int64_t cpu_id) { 37 | #ifdef __APPLE__ 38 | thread_affinity_policy_data_t pd; 39 | pd.affinity_tag = cpu_id + 1; 40 | return thread_policy_set(pthread_mach_thread_np(thread), 41 | THREAD_AFFINITY_POLICY, (thread_policy_t)&pd, 42 | THREAD_AFFINITY_POLICY_COUNT) == 0; 43 | 44 | #else // Linux 45 | cpu_set_t cpuset; 46 | CPU_ZERO(&cpuset); 47 | CPU_SET(cpu_id, &cpuset); 48 | return pthread_setaffinity_np(thread, sizeof(cpu_set_t), &cpuset) == 0; 49 | #endif 50 | } 51 | 52 | 53 | bool should_exit = false; 54 | 55 | void sigint_handler(int signum) { 56 | should_exit = true; 57 | } 58 | 59 | 60 | struct Options { 61 | struct ProxyOptions { 62 | size_t num_threads; 63 | int64_t affinity_cpus; 64 | 65 | string listen_addr; 66 | int port; 67 | int listen_fd; 68 | 69 
| vector backend_netlocs; 70 | unordered_set commands_to_disable; 71 | 72 | uint8_t hash_precision; 73 | int hash_begin_delimiter; 74 | int hash_end_delimiter; 75 | 76 | ProxyOptions() : num_threads(1), affinity_cpus(0), listen_addr(""), 77 | port(6379), listen_fd(-1), backend_netlocs(), commands_to_disable(), 78 | hash_precision(17), hash_begin_delimiter(-1), hash_end_delimiter(-1) { } 79 | 80 | void print(FILE* stream, const char* name) const { 81 | fprintf(stream, "[%s] %zu worker thread(s)\n", name, this->num_threads); 82 | if (this->affinity_cpus) { 83 | fprintf(stream, "[%s] set thread affinity for cores with mask %016" PRIX64 "\n", 84 | name, this->affinity_cpus); 85 | } else { 86 | fprintf(stream, "[%s] don\'t set thread affinity\n", name); 87 | } 88 | if (this->listen_fd >= 0) { 89 | fprintf(stream, "[%s] accept connections on fd %d\n", name, 90 | this->listen_fd); 91 | } else if (!this->listen_addr.empty()) { 92 | fprintf(stream, "[%s] listen on %s:%d\n", name, 93 | this->listen_addr.c_str(), this->port); 94 | } else { 95 | fprintf(stream, "[%s] listen on port %d on all interfaces\n", name, 96 | this->port); 97 | } 98 | 99 | for (const auto& backend_netloc : this->backend_netlocs) { 100 | fprintf(stream, "[%s] register backend %s\n", name, 101 | backend_netloc.c_str()); 102 | } 103 | 104 | for (const auto& command : this->commands_to_disable) { 105 | fprintf(stream, "[%s] disable command %s\n", name, command.c_str()); 106 | } 107 | 108 | if (this->hash_begin_delimiter >= 0) { 109 | fprintf(stream, "[%s] hash begin delimiter is 0x%02X\n", name, 110 | this->hash_begin_delimiter); 111 | } 112 | if (this->hash_end_delimiter >= 0) { 113 | fprintf(stream, "[%s] hash end delimiter is 0x%02X\n", name, 114 | this->hash_end_delimiter); 115 | } 116 | } 117 | 118 | void validate() const { 119 | if (this->backend_netlocs.empty()) { 120 | throw invalid_argument("no backends specified"); 121 | } 122 | } 123 | }; 124 | 125 | unordered_map name_to_proxy_options; 126 | 127 
| Options() = delete; 128 | Options(Options&&) = default; 129 | Options(const Options&) = default; 130 | Options(const char* filename) { 131 | string json; 132 | if (!strcmp(filename, "-")) { 133 | scoped_fd fd(0); 134 | json = read_all(fd); 135 | } else { 136 | scoped_fd fd(filename, O_RDONLY); 137 | json = read_all(fd); 138 | } 139 | shared_ptr config = JSONObject::parse(json); 140 | 141 | if (!config->is_dict()) { 142 | throw invalid_argument("configuration is not a dictionary"); 143 | } 144 | 145 | for (const auto& proxy_config_it : config->as_dict()) { 146 | const string& proxy_name = proxy_config_it.first; 147 | const auto& proxy_config = proxy_config_it.second->as_dict(); 148 | 149 | ProxyOptions& options = this->name_to_proxy_options.emplace( 150 | piecewise_construct, forward_as_tuple(proxy_name), forward_as_tuple()) 151 | .first->second; 152 | 153 | try { 154 | options.num_threads = proxy_config.at("num_threads")->as_int(); 155 | if (options.num_threads == 0) { 156 | options.num_threads = thread::hardware_concurrency(); 157 | } 158 | } catch (const out_of_range& e) { } 159 | 160 | try { 161 | options.affinity_cpus = proxy_config.at("affinity_cpus")->as_int(); 162 | } catch (const out_of_range& e) { } 163 | 164 | try { 165 | options.listen_addr = proxy_config.at("interface")->as_string(); 166 | } catch (const out_of_range& e) { } 167 | 168 | try { 169 | options.port = proxy_config.at("port")->as_int(); 170 | } catch (const out_of_range& e) { } 171 | 172 | try { 173 | options.hash_precision = proxy_config.at("hash_precision")->as_int(); 174 | } catch (const out_of_range& e) { } 175 | 176 | try { 177 | const auto& s = proxy_config.at("hash_field_begin")->as_string(); 178 | if (s.size() != 1) { 179 | throw invalid_argument("hash_field_begin is not a 1-char string"); 180 | } 181 | options.hash_begin_delimiter = s[0]; 182 | } catch (const out_of_range& e) { } 183 | 184 | try { 185 | const auto& s = proxy_config.at("hash_field_end")->as_string(); 186 | if 
(s.size() != 1) { 187 | throw invalid_argument("hash_field_end is not a 1-char string"); 188 | } 189 | options.hash_end_delimiter = s[0]; 190 | } catch (const out_of_range& e) { } 191 | 192 | try { 193 | for (const auto& command : proxy_config.at("disable_commands")->as_list()) { 194 | options.commands_to_disable.emplace(command->as_string()); 195 | } 196 | } catch (const out_of_range& e) { } 197 | 198 | try { 199 | for (const auto& backend_it : proxy_config.at("backends")->as_dict()) { 200 | const auto& backend_name = backend_it.first; 201 | const auto& backend_netloc = backend_it.second->as_string(); 202 | 203 | options.backend_netlocs.emplace_back(string_printf("%s@%s", 204 | backend_netloc.c_str(), backend_name.c_str())); 205 | } 206 | } catch (const out_of_range& e) { } 207 | } 208 | } 209 | 210 | void print(FILE* stream) const { 211 | fprintf(stream, "%zu proxy instance(s) defined\n", 212 | this->name_to_proxy_options.size()); 213 | for (const auto& it : this->name_to_proxy_options) { 214 | it.second.print(stream, it.first.c_str()); 215 | } 216 | } 217 | 218 | void validate() const { 219 | for (const auto& it : this->name_to_proxy_options) { 220 | it.second.validate(); 221 | } 222 | } 223 | }; 224 | 225 | 226 | 227 | int main(int argc, char** argv) { 228 | 229 | log(INFO, "> fuzziqer software redis-shatter"); 230 | 231 | // parse command-line args 232 | if (argc > 2) { 233 | log(ERROR, "usage: %s [config-filename]", argv[0]); 234 | return 1; 235 | } 236 | const char* config_filename = (argc == 2) ? 
argv[1] : "redis-shatter.conf.json"; 237 | Options opt(config_filename); 238 | opt.print(stderr); 239 | opt.validate(); 240 | 241 | srand(getpid() ^ time(NULL)); 242 | signal(SIGPIPE, SIG_IGN); 243 | signal(SIGINT, sigint_handler); 244 | 245 | vector threads; 246 | vector> proxies; 247 | 248 | // start all the proxies 249 | vector cpu_to_thread_count(thread::hardware_concurrency()); 250 | for (auto& proxy_options_it : opt.name_to_proxy_options) { 251 | const char* proxy_name = proxy_options_it.first.c_str(); 252 | auto& proxy_options = proxy_options_it.second; 253 | 254 | // if there's no listening socket from a parent process, open a new one 255 | if (proxy_options.listen_fd == -1) { 256 | proxy_options.listen_fd = listen(proxy_options.listen_addr, 257 | proxy_options.port, SOMAXCONN); 258 | if (!proxy_options.listen_addr.empty()) { 259 | log(INFO, "[%s] opened server socket %d on %s:%d", proxy_name, 260 | proxy_options.listen_fd, proxy_options.listen_addr.c_str(), 261 | proxy_options.port); 262 | } else { 263 | log(INFO, "[%s] opened server socket %d on port %d", proxy_name, 264 | proxy_options.listen_fd, proxy_options.port); 265 | } 266 | 267 | } else { 268 | fprintf(stderr, "[%s] using server socket %d from parent process\n", 269 | proxy_name, proxy_options.listen_fd); 270 | } 271 | 272 | evutil_make_socket_nonblocking(proxy_options.listen_fd); 273 | 274 | fprintf(stderr, "[%s] setting up configuration\n", proxy_name); 275 | auto hosts = ConsistentHashRing::Host::parse_netloc_list( 276 | proxy_options.backend_netlocs, 6379); 277 | shared_ptr ring; 278 | if (proxy_options.hash_precision) { 279 | ring.reset(new ConstantTimeConsistentHashRing( 280 | hosts, proxy_options.hash_precision)); 281 | } else { 282 | ring.reset(new NutcrackerConsistentHashRing(hosts)); 283 | } 284 | shared_ptr stats(new Proxy::Stats()); 285 | 286 | fprintf(stderr, "[%s] starting %zu proxy instances\n", proxy_name, 287 | proxy_options.num_threads); 288 | while (threads.size() < 
proxy_options.num_threads) { 289 | proxies.emplace_back(new Proxy(proxy_options.listen_fd, ring, 290 | proxy_options.hash_begin_delimiter, proxy_options.hash_end_delimiter, 291 | stats, proxies.size())); 292 | for (const auto& command : proxy_options.commands_to_disable) { 293 | proxies.back()->disable_command(command); 294 | } 295 | 296 | // run the thread on the least-loaded cpu 297 | int64_t min_load_cpu = -1; 298 | for (int64_t cpu_id = 0; cpu_id < static_cast(cpu_to_thread_count.size()); cpu_id++) { 299 | if ((proxy_options.affinity_cpus & (1 << cpu_id)) && 300 | ((min_load_cpu < 0) || 301 | (cpu_to_thread_count[cpu_id] < cpu_to_thread_count[min_load_cpu]))) { 302 | min_load_cpu = cpu_id; 303 | } 304 | } 305 | 306 | threads.emplace_back(&Proxy::serve, proxies.back().get()); 307 | if (min_load_cpu >= 0) { 308 | if (set_thread_affinity(threads.back().native_handle(), min_load_cpu)) { 309 | cpu_to_thread_count[min_load_cpu]++; 310 | fprintf(stderr, "[%s] created worker thread on core %" PRId64 "\n", 311 | proxy_name, min_load_cpu); 312 | } else { 313 | fprintf(stderr, "[%s] created worker thread, but failed to bind to core %" PRId64 "\n", 314 | proxy_name, min_load_cpu); 315 | } 316 | } else { 317 | fprintf(stderr, "[%s] created worker thread\n", proxy_name); 318 | } 319 | } 320 | } 321 | 322 | fprintf(stderr, "ready for connections\n"); 323 | sigset_t sigset; 324 | sigemptyset(&sigset); 325 | while (!should_exit) { 326 | sigsuspend(&sigset); 327 | } 328 | 329 | fprintf(stderr, "stopping proxy instances\n"); 330 | for (auto& p : proxies) { 331 | p->stop(); 332 | } 333 | 334 | fprintf(stderr, "waiting for proxy instances to terminate\n"); 335 | for (auto& t : threads) { 336 | t.join(); 337 | } 338 | 339 | return 0; 340 | } 341 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | CXX=g++ 2 | OBJECTS=NutcrackerConsistentHashRing.o Protocol.o 
Proxy.o Main.o 3 | CXXFLAGS=-g -Wall -Werror -std=c++14 -I/opt/local/include 4 | LDFLAGS=-levent -lphosg -lpthread -g -std=c++14 -L/opt/local/lib 5 | EXECUTABLE=redis-shatter 6 | 7 | TESTS=ProtocolTest FunctionalTest 8 | 9 | all: $(EXECUTABLE) $(TESTS) 10 | 11 | $(EXECUTABLE): $(OBJECTS) 12 | g++ -o $(EXECUTABLE) $^ $(LDFLAGS) 13 | 14 | test: all 15 | ./run_tests.sh 16 | 17 | ProtocolTest: ProtocolTest.o Protocol.o 18 | g++ -o ProtocolTest $^ $(LDFLAGS) 19 | 20 | FunctionalTest: FunctionalTest.o Protocol.o 21 | g++ -o FunctionalTest $^ $(LDFLAGS) 22 | 23 | clean: 24 | rm -rf *.dSYM *.o $(EXECUTABLE) $(TESTS) gmon.out 25 | 26 | .PHONY: clean 27 | -------------------------------------------------------------------------------- /NutcrackerConsistentHashRing.cc: -------------------------------------------------------------------------------- 1 | #include "NutcrackerConsistentHashRing.hh" 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | using namespace std; 15 | 16 | 17 | 18 | NutcrackerConsistentHashRing::Point::Point(uint32_t index, uint32_t value) : index(index), 19 | value(value) { } 20 | 21 | 22 | 23 | #define KETAMA_POINTS_PER_SERVER 160 24 | #define KETAMA_POINTS_PER_HASH 4 25 | #define KETAMA_MAX_HOSTLEN 256 26 | 27 | NutcrackerConsistentHashRing::NutcrackerConsistentHashRing( 28 | const vector& hosts) : ConsistentHashRing(hosts) { 29 | if (this->hosts.empty()) { 30 | throw invalid_argument("no hosts in continuum"); 31 | } 32 | 33 | uint64_t total_weight = this->hosts.size(); 34 | 35 | for (size_t host_index = 0; host_index < this->hosts.size(); host_index++) { 36 | const auto& host = this->hosts[host_index]; 37 | 38 | float pct = 1.0 / (float)total_weight; 39 | size_t points_per_host = (size_t)((floorf((float) (pct * KETAMA_POINTS_PER_SERVER / 4 * (float)this->hosts.size() + 0.0000000001))) * 4); 40 | 41 | for (size_t point_index = 0; point_index <= (points_per_host / 
KETAMA_POINTS_PER_HASH) - 1; point_index++) { 42 | char point_data[KETAMA_MAX_HOSTLEN]; 43 | size_t point_data_size = snprintf(point_data, KETAMA_MAX_HOSTLEN, 44 | "%s-%zu", host.name.c_str(), point_index); 45 | string hash = md5(point_data, point_data_size); 46 | 47 | for (size_t x = 0; x < KETAMA_POINTS_PER_HASH; x++) { 48 | uint32_t value = (static_cast(hash[3 + x * 4] & 0xFF) << 24) | 49 | (static_cast(hash[2 + x * 4] & 0xFF) << 16) | 50 | (static_cast(hash[1 + x * 4] & 0xFF) << 8) | 51 | (static_cast(hash[0 + x * 4] & 0xFF)); 52 | this->points.emplace_back(host_index, value); 53 | } 54 | } 55 | } 56 | 57 | qsort(this->points.data(), this->points.size(), sizeof(this->points[0]), 58 | [](const void* t1, const void* t2) -> int { 59 | const Point* ct1 = reinterpret_cast(t1); 60 | const Point* ct2 = reinterpret_cast(t2); 61 | if (ct1->value == ct2->value) { 62 | return 0; 63 | } else if (ct1->value > ct2->value) { 64 | return 1; 65 | } else { 66 | return -1; 67 | } 68 | }); 69 | } 70 | 71 | uint64_t NutcrackerConsistentHashRing::host_id_for_key(const void* key, 72 | int64_t size) const { 73 | // TODO: use std::lower_bound here instead of manual binary search 74 | 75 | uint32_t hash32 = fnv1a64(key, size); 76 | 77 | const Point* left = this->points.data(); 78 | const Point* right = left + this->points.size(); 79 | 80 | while (left < right) { 81 | const Point* middle = left + (right - left) / 2; 82 | if (middle->value < hash32) { 83 | left = middle + 1; 84 | } else { 85 | right = middle; 86 | } 87 | } 88 | 89 | if (right == this->points.data() + this->points.size()) { 90 | return this->points[0].index; 91 | } 92 | return right->index; 93 | } 94 | -------------------------------------------------------------------------------- /NutcrackerConsistentHashRing.hh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | 11 | // this isn't the same as 
phosg's ConsistentHashRing; this one is designed to 12 | // mirror the implementation in twemproxy/nutcracker so that redis-shatter can 13 | // be used alongside it 14 | 15 | class NutcrackerConsistentHashRing : public ConsistentHashRing { 16 | public: 17 | NutcrackerConsistentHashRing() = delete; 18 | NutcrackerConsistentHashRing(const std::vector& hosts); 19 | virtual ~NutcrackerConsistentHashRing() = default; 20 | 21 | virtual uint64_t host_id_for_key(const void* key, int64_t size) const; 22 | 23 | protected: 24 | struct Point { 25 | uint32_t index; 26 | uint32_t value; 27 | 28 | Point(uint32_t index, uint32_t hash); 29 | }; 30 | 31 | std::vector points; 32 | }; 33 | -------------------------------------------------------------------------------- /Protocol.cc: -------------------------------------------------------------------------------- 1 | #include "Protocol.hh" 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | #include 14 | 15 | using namespace std; 16 | 17 | 18 | 19 | static size_t evbuffer_readln_into(struct evbuffer* buf, char* buffer, 20 | size_t buffer_size, enum evbuffer_eol_style eol_style, bool drain = true) { 21 | 22 | size_t eol_len; 23 | struct evbuffer_ptr ptr = evbuffer_search_eol(buf, NULL, &eol_len, eol_style); 24 | if (ptr.pos == -1) { 25 | throw out_of_range("no line available"); 26 | } 27 | 28 | if (ptr.pos < static_cast(buffer_size)) { 29 | evbuffer_copyout(buf, buffer, ptr.pos); 30 | buffer[ptr.pos] = 0; 31 | if (drain) { 32 | evbuffer_drain(buf, ptr.pos + eol_len); 33 | } 34 | return ptr.pos; 35 | } 36 | throw runtime_error("line too long"); 37 | } 38 | 39 | 40 | 41 | DataCommand::DataCommand(size_t num_args) { 42 | this->args.reserve(num_args); 43 | } 44 | 45 | void DataCommand::print(FILE* stream, int indent_level) const { 46 | 47 | if (indent_level < 0) { 48 | indent_level = -indent_level; 49 | } else { 50 | print_indent(stream, indent_level); 51 | } 
52 | 53 | fprintf(stream, "DataCommand[\n"); 54 | for (const auto& arg : this->args) { 55 | print_indent(stream, indent_level + 1); 56 | for (char ch : arg) { 57 | if (ch < 0x20 || ch > 0x7F) { 58 | fprintf(stream, "\\x%02X", ch); 59 | } else { 60 | fputc(ch, stream); 61 | } 62 | } 63 | fprintf(stream, ",\n"); 64 | } 65 | fprintf(stream, "]]"); 66 | } 67 | 68 | string DataCommand::format() const { 69 | string ret = "["; 70 | 71 | for (const auto& arg : this->args) { 72 | if (ret.size() > 1) { 73 | ret += ','; 74 | } 75 | ret += '\''; 76 | for (char ch : arg) { 77 | if (ch < 0x20 || ch > 0x7F) { 78 | ret += string_printf("\\x%02X", ch); 79 | } else if (ch == '\'') { 80 | ret += "\\\'"; 81 | } else { 82 | ret += ch; 83 | } 84 | } 85 | ret += '\''; 86 | } 87 | ret += ']'; 88 | 89 | return ret; 90 | } 91 | 92 | void DataCommand::write(struct evbuffer* buf) const { 93 | if (!buf) { 94 | return; 95 | } 96 | 97 | evbuffer_add_printf(buf, "*%zu\r\n", this->args.size()); 98 | 99 | for (const auto& arg : this->args) { 100 | evbuffer_add_printf(buf, "$%zu\r\n", arg.size()); 101 | evbuffer_add(buf, arg.data(), arg.size()); 102 | evbuffer_add(buf, "\r\n", 2); 103 | } 104 | } 105 | 106 | 107 | 108 | ReferenceCommand::DataReference::DataReference() : data(NULL), size(0) { } 109 | 110 | ReferenceCommand::DataReference::DataReference(const void* data, size_t size) : 111 | data(data), size(size) { } 112 | 113 | ReferenceCommand::DataReference::DataReference(const string& data) : 114 | data(data.data()), size(data.size()) { } 115 | 116 | ReferenceCommand::ReferenceCommand(size_t num_args) { 117 | this->args.reserve(num_args); 118 | } 119 | 120 | void ReferenceCommand::print(FILE* stream, int indent_level) const { 121 | 122 | if (indent_level < 0) { 123 | indent_level = -indent_level; 124 | } else { 125 | print_indent(stream, indent_level); 126 | } 127 | 128 | fprintf(stream, "ReferenceCommand[\n"); 129 | for (const auto& arg : this->args) { 130 | print_indent(stream, indent_level + 
1); 131 | for (size_t x = 0; x < arg.size; x++) { 132 | char ch = ((const char*)arg.data)[x]; 133 | if (ch < 0x20 || ch > 0x7F) { 134 | fprintf(stream, "\\x%02X", ch); 135 | } else { 136 | fputc(ch, stream); 137 | } 138 | } 139 | fprintf(stream, ",\n"); 140 | } 141 | fprintf(stream, "]]"); 142 | } 143 | 144 | string ReferenceCommand::format() const { 145 | string ret = "["; 146 | 147 | for (const auto& arg : this->args) { 148 | if (ret.size() > 1) { 149 | ret += ','; 150 | } 151 | ret += '\''; 152 | for (size_t x = 0; x < arg.size; x++) { 153 | char ch = ((const char*)arg.data)[x]; 154 | if (ch < 0x20 || ch > 0x7F) { 155 | ret += string_printf("\\x%02X", ch); 156 | } else if (ch == '\'') { 157 | ret += "\\\'"; 158 | } else { 159 | ret += ch; 160 | } 161 | } 162 | ret += '\''; 163 | } 164 | ret += ']'; 165 | 166 | return ret; 167 | } 168 | 169 | void ReferenceCommand::write(struct evbuffer* buf) const { 170 | if (!buf) { 171 | return; 172 | } 173 | 174 | evbuffer_add_printf(buf, "*%zu\r\n", this->args.size()); 175 | 176 | for (const auto& arg : this->args) { 177 | evbuffer_add_printf(buf, "$%zu\r\n", arg.size); 178 | evbuffer_add(buf, arg.data, arg.size); 179 | evbuffer_add(buf, "\r\n", 2); 180 | } 181 | } 182 | 183 | 184 | 185 | Response::Response(Response::Type type, int64_t size) : type(type), 186 | int_value(size) { 187 | switch (this->type) { 188 | case Type::Status: 189 | case Type::Error: 190 | case Type::Data: 191 | if (size > 0) { 192 | this->fields.reserve(size); 193 | } 194 | break; 195 | 196 | case Type::Integer: 197 | this->int_value = 0; 198 | break; 199 | 200 | case Type::Multi: 201 | if (size > 0) { 202 | this->data.reserve(size); 203 | } 204 | } 205 | } 206 | 207 | Response::Response(Type type, const char* fmt, ...) 
: type(type), int_value(0) { 208 | va_list va; 209 | va_start(va, fmt); 210 | this->data = string_vprintf(fmt, va); 211 | va_end(va); 212 | } 213 | 214 | Response::Response(Type type, const void* data, size_t size) : type(type), 215 | data((const char*)data, size), int_value(0) { } 216 | 217 | Response::Response(Type type, const string& data) : type(type), data(data), 218 | int_value(0) { } 219 | 220 | bool Response::operator==(const Response& other) const { 221 | if (this->type != other.type) { 222 | return false; 223 | } 224 | 225 | // check for nulls 226 | if (this->type == Type::Data || this->type == Type::Multi) { 227 | if ((this->int_value < 0) && (other.int_value < 0)) { 228 | return true; // both are null 229 | } 230 | if ((this->int_value < 0) || (other.int_value < 0)) { 231 | return false; // one is null but the other isn't 232 | } 233 | } 234 | 235 | switch (this->type) { 236 | case Type::Status: 237 | case Type::Error: 238 | case Type::Data: 239 | return this->data == other.data; 240 | 241 | case Type::Integer: 242 | return this->int_value == other.int_value; 243 | 244 | case Type::Multi: 245 | if (this->fields.size() != other.fields.size()) { 246 | return false; 247 | } 248 | for (size_t x = 0; x < this->fields.size(); x++) { 249 | if (*this->fields[x] != *other.fields[x]) { 250 | return false; 251 | } 252 | } 253 | return true; 254 | 255 | default: 256 | return false; 257 | } 258 | } 259 | 260 | bool Response::operator!=(const Response& other) const { 261 | return !(this->operator==(other)); 262 | } 263 | 264 | void Response::print(FILE* stream, int indent_level) const { 265 | 266 | if (indent_level < 0) { 267 | indent_level = -indent_level; 268 | } else { 269 | print_indent(stream, indent_level); 270 | } 271 | 272 | switch (this->type) { 273 | case Type::Status: 274 | fprintf(stream, "Response[type=Status, data=%s]", this->data.c_str()); 275 | break; 276 | 277 | case Type::Error: 278 | fprintf(stream, "Response[type=Error, data=%s]", 
this->data.c_str()); 279 | break; 280 | 281 | case Type::Integer: 282 | fprintf(stream, "Response[type=Integer, int_value=%" PRId64 "]", 283 | this->int_value); 284 | break; 285 | 286 | case Type::Data: 287 | if (this->int_value < 0) { 288 | fprintf(stream, "Response[type=Data, null]\n"); 289 | } else { 290 | fprintf(stream, "Response[type=Data, data="); 291 | for (char ch : data) { 292 | if (ch < 0x20 || ch > 0x7F) { 293 | fprintf(stream, "\\x%02X", ch); 294 | } else { 295 | fputc(ch, stream); 296 | } 297 | } 298 | fputc(']', stream); 299 | } 300 | break; 301 | 302 | case Type::Multi: 303 | if (this->int_value < 0) { 304 | fprintf(stream, "Response[type=Multi, null]"); 305 | } else { 306 | fprintf(stream, "Response[type=MULTI, fields=[\n"); 307 | for (const auto& resp : this->fields) { 308 | resp->print(stream, indent_level + 1); 309 | fprintf(stream, ",\n"); 310 | } 311 | print_indent(stream, indent_level); 312 | fprintf(stream, "]"); 313 | } 314 | break; 315 | 316 | default: 317 | fprintf(stream, "Response[type=Unknown]\n"); 318 | } 319 | } 320 | 321 | string Response::format() const { 322 | 323 | switch (this->type) { 324 | case Type::Status: 325 | return "(Status) " + this->data; 326 | 327 | case Type::Error: 328 | return "(Error) " + this->data; 329 | 330 | case Type::Integer: 331 | return string_printf("%" PRId64, this->int_value); 332 | 333 | case Type::Data: 334 | if (this->int_value < 0) { 335 | return "(Null)"; 336 | } else { 337 | string ret = "\'"; 338 | for (char ch : this->data) { 339 | if (ch < 0x20 || ch > 0x7F) { 340 | ret += string_printf("\\x%02X", ch); 341 | } else if (ch == '\'') { 342 | ret += "\\\'"; 343 | } else { 344 | ret += ch; 345 | } 346 | } 347 | ret += '\''; 348 | return ret; 349 | } 350 | break; 351 | 352 | case Type::Multi: 353 | if (this->int_value < 0) { 354 | return "(Null)"; 355 | } else { 356 | string ret = "["; 357 | for (const auto& f : this->fields) { 358 | if (ret.size() > 1) { 359 | ret += ", "; 360 | } 361 | ret += 
f->format(); 362 | } 363 | ret += "]"; 364 | return ret; 365 | } 366 | break; 367 | 368 | default: 369 | return string_printf("(UnknownType:%02" PRIX8 ")", (uint8_t)this->type); 370 | } 371 | } 372 | 373 | void Response::write(struct evbuffer* buf) const { 374 | 375 | if (!buf) { 376 | return; 377 | } 378 | 379 | switch (this->type) { 380 | case Type::Status: 381 | case Type::Error: 382 | this->write_string(buf, this->data.data(), this->data.size(), 383 | (char)this->type); 384 | break; 385 | 386 | case Type::Integer: 387 | this->write_int(buf, this->int_value, (char)Type::Integer); 388 | break; 389 | 390 | case Type::Data: 391 | if (this->int_value >= 0) { 392 | this->write_int(buf, this->data.size(), (char)Type::Data); 393 | evbuffer_add(buf, this->data.data(), this->data.size()); 394 | evbuffer_add(buf, "\r\n", 2); 395 | } else { 396 | evbuffer_add(buf, "$-1\r\n", 5); 397 | } 398 | break; 399 | 400 | case Type::Multi: 401 | if (this->int_value >= 0) { 402 | this->write_int(buf, this->fields.size(), (char)Type::Multi); 403 | for (const auto& field : this->fields) { 404 | field->write(buf); 405 | } 406 | } else { 407 | evbuffer_add(buf, "*-1\r\n", 5); 408 | } 409 | break; 410 | 411 | default: 412 | throw runtime_error("invalid response type in write()"); 413 | } 414 | } 415 | 416 | void Response::write_string(struct evbuffer* buf, const char* string, 417 | char sentinel) { 418 | if (!buf) { 419 | return; 420 | } 421 | if (sentinel == Response::Type::Data) { 422 | evbuffer_add_printf(buf, "$%zu\r\n%s\r\n", strlen(string), string); 423 | } else { 424 | evbuffer_add_printf(buf, "%c%s\r\n", sentinel, string); 425 | } 426 | } 427 | 428 | void Response::write_string(struct evbuffer* buf, const void* string, 429 | size_t size, char sentinel) { 430 | if (!buf) { 431 | return; 432 | } 433 | if (sentinel == Response::Type::Data) { 434 | evbuffer_add_printf(buf, "$%zu\r\n", size); 435 | } else { 436 | evbuffer_add(buf, &sentinel, 1); 437 | } 438 | evbuffer_add(buf, string, 
size); 439 | evbuffer_add(buf, "\r\n", 2); 440 | } 441 | 442 | void Response::write_int(struct evbuffer* buf, int64_t value, 443 | char sentinel) { 444 | if (!buf) { 445 | return; 446 | } 447 | evbuffer_add_printf(buf, "%c%" PRId64 "\r\n", sentinel, value); 448 | } 449 | 450 | 451 | 452 | CommandParser::CommandParser() : state(State::Initial), error_str(NULL) { } 453 | 454 | const char* CommandParser::error() const { 455 | return this->error_str; 456 | } 457 | 458 | shared_ptr CommandParser::resume(struct evbuffer* buf) { 459 | char input_line[0x100]; 460 | for (;;) { 461 | switch (this->state) { 462 | case State::Initial: { 463 | // expect "*num_args\r\n", or inline command 464 | try { 465 | evbuffer_readln_into(buf, input_line, sizeof(input_line), 466 | EVBUFFER_EOL_CRLF); 467 | } catch (const out_of_range&) { 468 | return NULL; // complete line not yet available 469 | } catch (const runtime_error& e) { 470 | this->error_str = "line too long"; 471 | return NULL; 472 | } 473 | 474 | if (input_line[0] != '*') { 475 | // this is an inline command; split it on spaces 476 | shared_ptr cmd(new DataCommand()); 477 | auto& args = cmd->args; 478 | 479 | size_t arg_start_offset = 0; 480 | for (size_t x = 0; input_line[x];) { 481 | // find the end of the current token 482 | for (; input_line[x] && (input_line[x] != ' '); x++); 483 | 484 | args.emplace_back(&input_line[arg_start_offset], x - arg_start_offset); 485 | 486 | // find the start of the next argument 487 | for (; input_line[x] && (input_line[x] == ' '); x++); 488 | arg_start_offset = x; 489 | } 490 | 491 | // we're done. notice that this doesn't affect the parser state at all 492 | return cmd; 493 | 494 | } 495 | 496 | // not an inline command. 
move to reading-argument state 497 | this->arguments_remaining = strtoll(&input_line[1], NULL, 10); 498 | if (this->arguments_remaining <= 0) { 499 | throw runtime_error("command with zero or fewer arguments"); 500 | } 501 | this->command_in_progress.reset(new DataCommand(this->arguments_remaining)); 502 | this->state = State::ReadingArgumentSize; 503 | break; 504 | } 505 | 506 | case State::ReadingArgumentSize: { 507 | // expect "$arg_size\r\n" 508 | try { 509 | evbuffer_readln_into(buf, input_line, sizeof(input_line), 510 | EVBUFFER_EOL_CRLF); 511 | } catch (const out_of_range&) { 512 | return NULL; // complete line not yet available 513 | } catch (const runtime_error& e) { 514 | this->error_str = "line too long"; 515 | return NULL; 516 | } 517 | 518 | if (input_line[0] != '$') { 519 | throw runtime_error("didn\'t get command arg size where expected"); 520 | } else { 521 | this->data_bytes_remaining = strtoull(&input_line[1], NULL, 10); 522 | this->command_in_progress->args.emplace_back(); 523 | this->command_in_progress->args.back().reserve( 524 | this->data_bytes_remaining); 525 | this->state = State::ReadingArgumentData; 526 | } 527 | break; 528 | } 529 | 530 | case State::ReadingArgumentData: { 531 | // copy data into the last argument 532 | string& arg = this->command_in_progress->args.back(); 533 | ssize_t bytes_available = evbuffer_get_length(buf); 534 | if (bytes_available == 0) { 535 | return NULL; 536 | } 537 | if (bytes_available > this->data_bytes_remaining) { 538 | bytes_available = this->data_bytes_remaining; 539 | } 540 | 541 | size_t bytes_existing = arg.size(); 542 | arg.resize(bytes_existing + bytes_available); 543 | ssize_t bytes_copied = evbuffer_remove(buf, 544 | const_cast(arg.data()) + bytes_existing, 545 | bytes_available); 546 | if (bytes_copied < 0) { 547 | throw runtime_error("can\'t read from evbuffer"); 548 | } 549 | this->data_bytes_remaining -= bytes_copied; 550 | 551 | // TODO: do we need to handle the case where bytes_copied != 
bytes_available? 552 | 553 | if (this->data_bytes_remaining == 0) { 554 | this->arguments_remaining--; 555 | this->state = State::ReadingNewlineAfterArgumentData; 556 | } 557 | break; 558 | } 559 | 560 | case State::ReadingNewlineAfterArgumentData: 561 | if (evbuffer_get_length(buf) < 2) { 562 | return NULL; // not ready yet 563 | } 564 | char data[2]; 565 | if (2 != evbuffer_remove(buf, data, 2)) { 566 | throw runtime_error("can\'t read newline after argument data"); 567 | } 568 | if (data[0] != '\r' && data[1] != '\n') { 569 | throw runtime_error("\\r\\n did not follow argument data"); 570 | } 571 | 572 | // if we're expecting more arguments, move back to the appropriate 573 | // state. if not, return the command and return to the initial state. 574 | if (this->arguments_remaining) { 575 | this->state = State::ReadingArgumentSize; 576 | } else { 577 | this->state = State::Initial; 578 | return move(this->command_in_progress); 579 | } 580 | break; 581 | 582 | default: 583 | throw runtime_error("command parser got into unknown state"); 584 | } 585 | } 586 | 587 | return NULL; // complete line not yet available 588 | } 589 | 590 | 591 | 592 | ResponseParser::ResponseParser() : state(State::Initial), error_str(NULL) { } 593 | 594 | const char* ResponseParser::error() const { 595 | return this->error_str; 596 | } 597 | 598 | shared_ptr ResponseParser::resume(struct evbuffer* buf) { 599 | char input_line[0x100]; 600 | for (;;) { 601 | switch (this->state) { 602 | case State::Initial: { 603 | try { 604 | evbuffer_readln_into(buf, input_line, sizeof(input_line), 605 | EVBUFFER_EOL_CRLF); 606 | } catch (const out_of_range&) { 607 | return NULL; // complete line not yet available 608 | } catch (const runtime_error& e) { 609 | this->error_str = "line too long"; 610 | return NULL; 611 | } 612 | 613 | switch (input_line[0]) { 614 | case Response::Type::Status: 615 | case Response::Type::Error: { 616 | shared_ptr resp(new Response((Response::Type)input_line[0], (int64_t)0)); 
617 | resp->data.assign(&input_line[1]); 618 | return resp; 619 | } 620 | 621 | case Response::Type::Integer: { 622 | shared_ptr resp(new Response(Response::Type::Integer)); 623 | resp->int_value = strtoll(&input_line[1], NULL, 10); 624 | return resp; 625 | } 626 | 627 | case Response::Type::Data: { 628 | this->data_bytes_remaining = strtoll(&input_line[1], NULL, 0); 629 | if (this->data_bytes_remaining < 0) { 630 | return shared_ptr(new Response(Response::Type::Data, 631 | this->data_bytes_remaining)); 632 | } 633 | 634 | this->response_in_progress.reset(new Response(Response::Type::Data, 635 | this->data_bytes_remaining)); 636 | this->state = (this->data_bytes_remaining ? State::ReadingData : 637 | State::ReadingNewlineAfterData); 638 | break; 639 | } 640 | 641 | case Response::Type::Multi: { 642 | this->multi_fields_remaining = strtoll(&input_line[1], NULL, 0); 643 | if (this->multi_fields_remaining <= 0) { 644 | return shared_ptr(new Response(Response::Type::Multi, 645 | this->multi_fields_remaining)); 646 | } 647 | 648 | this->response_in_progress.reset(new Response(Response::Type::Multi, 649 | this->multi_fields_remaining)); 650 | this->multi_in_progress.reset(new ResponseParser()); 651 | this->state = State::MultiRecursive; 652 | break; } 653 | 654 | default: 655 | throw runtime_error(string_printf("incorrect sentinel: %c", input_line[0])); 656 | } 657 | break; // State::Initial 658 | } 659 | 660 | case State::MultiRecursive: { 661 | for (;;) { 662 | auto field = this->multi_in_progress->resume(buf); 663 | if (!field.get()) { 664 | return NULL; 665 | } 666 | 667 | this->response_in_progress->fields.emplace_back(field); 668 | this->multi_fields_remaining--; 669 | if (this->multi_fields_remaining == 0) { 670 | this->state = State::Initial; 671 | return move(this->response_in_progress); 672 | } 673 | } 674 | break; // State::MultiRecursive 675 | } 676 | 677 | case State::ReadingData: { 678 | // copy data into the data field 679 | ssize_t bytes_available = 
evbuffer_get_length(buf); 680 | if (bytes_available == 0) { 681 | return NULL; 682 | } 683 | if (bytes_available > this->data_bytes_remaining) { 684 | bytes_available = this->data_bytes_remaining; 685 | } 686 | 687 | size_t bytes_existing = this->response_in_progress->data.size(); 688 | this->response_in_progress->data.resize(bytes_existing + bytes_available); 689 | ssize_t bytes_copied = evbuffer_remove(buf, 690 | const_cast(this->response_in_progress->data.data()) + bytes_existing, 691 | bytes_available); 692 | if (bytes_copied < 0) { 693 | throw runtime_error("can\'t read from evbuffer"); 694 | } 695 | this->data_bytes_remaining -= bytes_copied; 696 | 697 | if (this->data_bytes_remaining == 0) { 698 | this->state = State::ReadingNewlineAfterData; 699 | } 700 | break; 701 | } 702 | 703 | case State::ReadingNewlineAfterData: 704 | if (evbuffer_get_length(buf) < 2) { 705 | return NULL; // not ready yet 706 | } 707 | char data[2]; 708 | if (2 != evbuffer_remove(buf, data, 2)) { 709 | throw runtime_error("can\'t read newline after argument data"); 710 | } 711 | if (data[0] != '\r' && data[1] != '\n') { 712 | throw runtime_error("\\r\\n did not follow argument data"); 713 | } 714 | 715 | this->state = State::Initial; 716 | return move(this->response_in_progress); 717 | 718 | default: 719 | throw runtime_error("response parser got into unknown state"); 720 | } 721 | } 722 | return NULL; 723 | } 724 | 725 | bool ResponseParser::forward(struct evbuffer* buf, 726 | struct evbuffer* output_buffer) { 727 | 728 | // output_buffer can be NULL if the client has already disconnected. in this 729 | // case, we just don't write to the output buffer (discard the response). 
730 | char input_line[0x100]; 731 | size_t input_line_size; 732 | for (;;) { 733 | switch (this->state) { 734 | case State::Initial: { 735 | try { 736 | input_line_size = evbuffer_readln_into(buf, input_line, 737 | sizeof(input_line), EVBUFFER_EOL_CRLF, false); 738 | } catch (const out_of_range&) { 739 | return false; // complete line not yet available 740 | } catch (const runtime_error& e) { 741 | this->error_str = "line too long"; 742 | return false; 743 | } 744 | 745 | // forward the line to the client immediately. unlike in resume(), we 746 | // didn't drain it from the input buffer, so hopefully we can just move 747 | // the data between buffers instead of copying. add 2 for the \r\n 748 | if (output_buffer) { 749 | evbuffer_remove_buffer(buf, output_buffer, input_line_size + 2); 750 | } 751 | 752 | switch (input_line[0]) { 753 | case Response::Type::Status: 754 | case Response::Type::Error: 755 | case Response::Type::Integer: 756 | return true; 757 | 758 | case Response::Type::Data: 759 | // we add 2 here for the trailing \r\n 760 | this->data_bytes_remaining = strtoll(&input_line[1], NULL, 0); 761 | if (this->data_bytes_remaining < 0) { 762 | return true; // null response 763 | } else { 764 | this->state = State::ReadingData; 765 | } 766 | break; 767 | 768 | case Response::Type::Multi: 769 | this->multi_fields_remaining = strtoll(&input_line[1], NULL, 0); 770 | if (this->multi_fields_remaining <= 0) { 771 | return true; // null response 772 | } else { 773 | this->multi_in_progress.reset(new ResponseParser()); 774 | this->state = State::MultiRecursive; 775 | } 776 | break; 777 | 778 | default: 779 | throw runtime_error(string_printf("incorrect sentinel: %c", input_line[0])); 780 | } 781 | break; // State::Initial 782 | } 783 | 784 | case State::MultiRecursive: { 785 | for (;;) { 786 | bool field_forwarded = this->multi_in_progress->forward(buf, 787 | output_buffer); 788 | if (!field_forwarded) { 789 | return false; 790 | } 791 | 792 | 
this->multi_fields_remaining--; 793 | if (this->multi_fields_remaining == 0) { 794 | this->state = State::Initial; 795 | return true; 796 | } 797 | } 798 | break; 799 | } 800 | 801 | case State::ReadingData: { 802 | ssize_t bytes_available = evbuffer_get_length(buf); 803 | if (bytes_available == 0) { 804 | return false; 805 | } 806 | if (bytes_available > this->data_bytes_remaining) { 807 | bytes_available = this->data_bytes_remaining; 808 | } 809 | 810 | if (output_buffer) { 811 | ssize_t bytes_copied = evbuffer_remove_buffer(buf, output_buffer, 812 | bytes_available); 813 | if (bytes_copied < 0) { 814 | throw runtime_error("can\'t read from evbuffer"); 815 | } 816 | this->data_bytes_remaining -= bytes_copied; 817 | } else { 818 | evbuffer_drain(buf, bytes_available); 819 | this->data_bytes_remaining -= bytes_available; 820 | } 821 | 822 | if (this->data_bytes_remaining == 0) { 823 | this->state = State::ReadingNewlineAfterData; 824 | } 825 | break; 826 | } 827 | 828 | case State::ReadingNewlineAfterData: { 829 | if (evbuffer_get_length(buf) < 2) { 830 | return false; // not ready yet 831 | } 832 | char data[2]; 833 | if (2 != evbuffer_remove(buf, data, 2)) { 834 | throw runtime_error("can\'t read newline after argument data"); 835 | } 836 | if (data[0] != '\r' && data[1] != '\n') { 837 | throw runtime_error("\\r\\n did not follow argument data"); 838 | } 839 | if (output_buffer) { 840 | evbuffer_add(output_buffer, "\r\n", 2); 841 | } 842 | 843 | this->state = State::Initial; 844 | return true; 845 | } 846 | 847 | default: 848 | throw runtime_error("response parser got into unknown state"); 849 | } 850 | } 851 | 852 | return false; 853 | } 854 | -------------------------------------------------------------------------------- /Protocol.hh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | #include 7 | #include 8 | #include 9 | 10 | 11 | // DataCommand and ReferenceCommand aren't 
subclasses of a parent Command class 12 | // because this incurs a significant performance penalty (up to 7% in some 13 | // cases). 14 | 15 | struct DataCommand { 16 | std::vector args; 17 | 18 | DataCommand() = default; 19 | explicit DataCommand(size_t num_args); 20 | ~DataCommand() = default; 21 | 22 | void print(FILE* stream, int indent_level = 0) const; 23 | std::string format() const; 24 | 25 | void write(struct evbuffer* buf) const; 26 | }; 27 | 28 | 29 | struct ReferenceCommand { 30 | struct DataReference { 31 | const void* data; 32 | size_t size; 33 | 34 | DataReference(); 35 | DataReference(const void* data, size_t size); 36 | DataReference(const std::string& data); 37 | }; 38 | 39 | std::vector args; 40 | 41 | ReferenceCommand() = default; 42 | explicit ReferenceCommand(size_t num_args); 43 | ~ReferenceCommand() = default; 44 | 45 | void print(FILE* stream, int indent_level = 0) const; 46 | std::string format() const; 47 | 48 | void write(struct evbuffer* buf) const; 49 | }; 50 | 51 | 52 | struct Response { 53 | enum Type { 54 | Status = '+', 55 | Error = '-', 56 | Integer = ':', 57 | Data = '$', 58 | Multi = '*', 59 | }; 60 | Type type; 61 | 62 | std::string data; // Status, Error and Data 63 | int64_t int_value; // Integer 64 | std::vector> fields; // Multi 65 | 66 | Response(Type type, int64_t size = 0); 67 | Response(Type type, const char* fmt, ...); 68 | Response(Type type, const void* data, size_t size); 69 | Response(Type type, const std::string& data); 70 | ~Response() = default; 71 | 72 | bool operator==(const Response& other) const; 73 | bool operator!=(const Response& other) const; 74 | 75 | void print(FILE* stream, int indent_level = 0) const; 76 | std::string format() const; 77 | 78 | void write(struct evbuffer* buf) const; 79 | static void write_string(struct evbuffer* buf, const char* s, char sentinel); 80 | static void write_string(struct evbuffer* buf, const void* s, size_t size, 81 | char sentinel); 82 | static void write_int(struct 
evbuffer* buf, int64_t value, char sentinel); 83 | }; 84 | 85 | 86 | struct CommandParser { 87 | enum State { 88 | Initial = 0, 89 | ReadingArgumentSize, 90 | ReadingArgumentData, 91 | ReadingNewlineAfterArgumentData, 92 | }; 93 | State state; 94 | const char* error_str; 95 | 96 | int64_t num_command_args; 97 | std::shared_ptr command_in_progress; 98 | int64_t arguments_remaining; 99 | int64_t data_bytes_remaining; 100 | 101 | CommandParser(); 102 | ~CommandParser() = default; 103 | 104 | std::shared_ptr resume(struct evbuffer* buffer); 105 | 106 | const char* error() const; 107 | }; 108 | 109 | struct ResponseParser { 110 | enum State { 111 | Initial = 0, 112 | MultiRecursive, 113 | ReadingData, 114 | ReadingNewlineAfterData, 115 | }; 116 | State state; 117 | const char* error_str; 118 | 119 | std::shared_ptr response_in_progress; 120 | int64_t data_bytes_remaining; 121 | 122 | std::shared_ptr multi_in_progress; 123 | int64_t multi_fields_remaining; 124 | 125 | ResponseParser(); 126 | ~ResponseParser() = default; 127 | 128 | std::shared_ptr resume(struct evbuffer* buffer); 129 | bool forward(struct evbuffer* buffer, struct evbuffer* output_buffer); 130 | 131 | const char* error() const; 132 | }; 133 | -------------------------------------------------------------------------------- /ProtocolTest.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include 8 | 9 | #include "Protocol.hh" 10 | 11 | using namespace std; 12 | 13 | 14 | template 15 | static void check_serialization(const T& obj, 16 | const char* expected_serialization) { 17 | unique_ptr out_buf( 18 | evbuffer_new(), evbuffer_free); 19 | obj.write(out_buf.get()); 20 | struct evbuffer_ptr pos = evbuffer_search(out_buf.get(), 21 | expected_serialization, strlen(expected_serialization), NULL); 22 | expect_eq(pos.pos, 0); 23 | } 24 | 25 | template 26 | static void check_serialization(shared_ptr obj, 27 | 
const char* expected_serialization) { 28 | unique_ptr out_buf( 29 | evbuffer_new(), evbuffer_free); 30 | obj->write(out_buf.get()); 31 | struct evbuffer_ptr pos = evbuffer_search(out_buf.get(), 32 | expected_serialization, strlen(expected_serialization), NULL); 33 | expect_eq(pos.pos, 0); 34 | } 35 | 36 | 37 | int main(int argc, char* argv[]) { 38 | 39 | { 40 | printf("-- parse a command & serialize it again\n"); 41 | 42 | const char* command_string = "*7\r\n$4\r\nMSET\r\n$1\r\nx\r\n$1\r\n1\r\n$1\r\ny\r\n$1\r\n2\r\n$1\r\nz\r\n$3\r\nlol\r\n"; 43 | 44 | unique_ptr in_buf( 45 | evbuffer_new(), evbuffer_free); 46 | evbuffer_add(in_buf.get(), command_string, strlen(command_string)); 47 | auto cmd = CommandParser().resume(in_buf.get()); 48 | 49 | // check that the args were parsed properly 50 | expect_eq(cmd->args.size(), 7); 51 | expect_eq(cmd->args[0], "MSET"); 52 | expect_eq(cmd->args[1], "x"); 53 | expect_eq(cmd->args[2], "1"); 54 | expect_eq(cmd->args[3], "y"); 55 | expect_eq(cmd->args[4], "2"); 56 | expect_eq(cmd->args[5], "z"); 57 | expect_eq(cmd->args[6], "lol"); 58 | 59 | check_serialization(cmd, command_string); 60 | } 61 | 62 | { 63 | printf("-- parse a command (inline) & serialize it again\n"); 64 | 65 | const char* command_string = "MSET x 1 y 2 z lol\r\n"; 66 | const char* expected_serialization = "*7\r\n$4\r\nMSET\r\n$1\r\nx\r\n$1\r\n1\r\n$1\r\ny\r\n$1\r\n2\r\n$1\r\nz\r\n$3\r\nlol\r\n"; 67 | 68 | unique_ptr in_buf( 69 | evbuffer_new(), evbuffer_free); 70 | evbuffer_add(in_buf.get(), command_string, strlen(command_string)); 71 | auto cmd = CommandParser().resume(in_buf.get()); 72 | 73 | // check that the args were parsed properly 74 | expect_eq(cmd->args.size(), 7); 75 | expect_eq(cmd->args[0], "MSET"); 76 | expect_eq(cmd->args[1], "x"); 77 | expect_eq(cmd->args[2], "1"); 78 | expect_eq(cmd->args[3], "y"); 79 | expect_eq(cmd->args[4], "2"); 80 | expect_eq(cmd->args[5], "z"); 81 | expect_eq(cmd->args[6], "lol"); 82 | 83 | check_serialization(cmd, 
expected_serialization); 84 | } 85 | 86 | { 87 | printf("-- parse a response & serialize it again\n"); 88 | 89 | const char* resp_string = "*6\r\n+omg\r\n-bbq\r\n:284713592\r\n$-1\r\n*-1\r\n*1\r\n$20\r\nTo be or not to be, \r\n"; 90 | 91 | unique_ptr in_buf( 92 | evbuffer_new(), evbuffer_free); 93 | evbuffer_add(in_buf.get(), resp_string, strlen(resp_string)); 94 | auto r = ResponseParser().resume(in_buf.get()); 95 | 96 | expect_eq(r->type, Response::Type::Multi); 97 | expect_eq(r->fields.size(), 6); 98 | 99 | expect_eq(r->fields[0]->type, Response::Type::Status); 100 | expect_eq(r->fields[0]->data, "omg"); 101 | 102 | expect_eq(r->fields[1]->type, Response::Type::Error); 103 | expect_eq(r->fields[1]->data, "bbq"); 104 | 105 | expect_eq(r->fields[2]->type, Response::Type::Integer); 106 | expect_eq(r->fields[2]->int_value, 284713592); 107 | 108 | expect_eq(r->fields[3]->type, Response::Type::Data); 109 | expect_eq(r->fields[3]->int_value, -1); 110 | 111 | expect_eq(r->fields[4]->type, Response::Type::Multi); 112 | expect_eq(r->fields[4]->int_value, -1); 113 | 114 | expect_eq(r->fields[5]->type, Response::Type::Multi); 115 | expect_eq(r->fields[5]->fields.size(), 1); 116 | 117 | expect_eq(r->fields[5]->fields[0]->type, Response::Type::Data); 118 | expect_eq(r->fields[5]->fields[0]->data, "To be or not to be, "); 119 | 120 | check_serialization(r, resp_string); 121 | } 122 | 123 | { 124 | printf("-- check Response printf-like constructor\n"); 125 | 126 | { 127 | Response r(Response::Type::Status, 128 | "This is response %d of %d; here\'s a string: %s.", 4, 10, "lol"); 129 | const char* expected = "+This is response 4 of 10; here\'s a string: lol.\r\n"; 130 | check_serialization(r, expected); 131 | } 132 | 133 | { 134 | Response r(Response::Type::Error, 135 | "This is response %d of %d; here\'s a string: %s.", 4, 10, "lol"); 136 | const char* expected = "-This is response 4 of 10; here\'s a string: lol.\r\n"; 137 | check_serialization(r, expected); 138 | } 139 | 140 | 
{ 141 | Response r(Response::Type::Data, 142 | "This is response %d of %d; here\'s a string: %s.", 4, 10, "lol"); 143 | const char* expected = "$47\r\nThis is response 4 of 10; here\'s a string: lol.\r\n"; 144 | check_serialization(r, expected); 145 | } 146 | } 147 | 148 | printf("all tests passed\n"); 149 | return 0; 150 | } 151 | -------------------------------------------------------------------------------- /Proxy.hh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | 15 | #include "Protocol.hh" 16 | 17 | 18 | struct ResponseLink; 19 | struct Backend; 20 | 21 | 22 | struct BackendConnection { 23 | Backend* backend; 24 | int64_t index; 25 | 26 | std::unique_ptr bev; 27 | ResponseParser parser; 28 | 29 | struct sockaddr_storage local_addr; 30 | struct sockaddr_storage remote_addr; 31 | 32 | size_t num_commands_sent; 33 | size_t num_responses_received; 34 | 35 | ResponseLink* head_link; 36 | ResponseLink* tail_link; 37 | 38 | BackendConnection(Backend* backend, int64_t index, 39 | std::unique_ptr&& bev); 40 | BackendConnection(const BackendConnection&) = delete; 41 | BackendConnection(BackendConnection&&) = delete; 42 | BackendConnection& operator=(const BackendConnection&) = delete; 43 | BackendConnection& operator=(BackendConnection&&) = delete; 44 | ~BackendConnection(); 45 | 46 | struct evbuffer* get_output_buffer(); 47 | 48 | void print(FILE* stream, int indent_level = 0) const; 49 | }; 50 | 51 | struct Backend { 52 | size_t index; 53 | 54 | std::string host; 55 | int port; 56 | 57 | std::string name; 58 | std::string debug_name; 59 | 60 | std::unordered_map index_to_connection; 61 | int64_t next_connection_index; 62 | 63 | size_t num_responses_received; 64 | size_t num_commands_sent; 65 | 66 | Backend(size_t index, const std::string& host, int port, const std::string& 
name); 67 | Backend(const Backend&) = delete; 68 | Backend(Backend&&) = delete; 69 | Backend& operator=(const Backend&) = delete; 70 | Backend& operator=(Backend&&) = delete; 71 | ~Backend() = default; 72 | 73 | BackendConnection& get_default_connection(); 74 | 75 | void print(FILE* stream, int indent_level = 0) const; 76 | }; 77 | 78 | 79 | struct Client { 80 | std::string name; 81 | std::string debug_name; 82 | bool should_disconnect; 83 | 84 | std::unique_ptr bev; 85 | CommandParser parser; 86 | 87 | struct sockaddr_storage local_addr; 88 | struct sockaddr_storage remote_addr; 89 | 90 | size_t num_commands_received; 91 | size_t num_responses_sent; 92 | 93 | ResponseLink* head_link; 94 | ResponseLink* tail_link; 95 | 96 | Client(std::unique_ptr bev); 97 | Client(const Client&) = delete; 98 | Client(Client&&) = delete; 99 | Client& operator=(const Client&) = delete; 100 | Client& operator=(Client&&) = delete; 101 | ~Client(); 102 | 103 | struct evbuffer* get_output_buffer(); 104 | 105 | void print(FILE* stream, int indent_level = 0) const; 106 | }; 107 | 108 | 109 | // a ResponseLink represents a response that a client is expecting to receive, 110 | // and also a promise that one or more backends will send a response that can be 111 | // used to generate the response for the waiting client. each ResponseLink is 112 | // linked in one or more lists representing its dependencies. 113 | // 114 | // each Client has a linked list of ResponseLinks representing the responses 115 | // that the client expects to receive (in order). this list is traversed by 116 | // following the next_client links in the ResponseLink. 117 | // 118 | // similarly, each BackendConnection has a linked list of ResponseLinks in the 119 | // order that responses should be routed. this isn't necessarily the same as the 120 | // next_client order. 
because a ResponseLink may represent an aggregation of 121 | // multiple backend responses, the ResponseLink may exist in multiple 122 | // BackendConnection lists. to traverse one of these lists, look up the given 123 | // BackendConnection in the ResponseLink's backend_conn_to_next_link map. 124 | // 125 | // a ResponseLink is "ready" when all the needed backend responses have been 126 | // received. this doesn't mean it can be sent to the client though - since the 127 | // order of responses must be preserved, it can only be sent if it's first in 128 | // the client's list. for example, if a client sends "GET x" and "GET y", and 129 | // the backend for y responds first, we can't send the result yet. in this case, 130 | // the ResponseLink stays ready until the backend for x responds. at that time, 131 | // both ResponseLinks are ready, and are sent in the correct order. 132 | // 133 | // ResponseLinks are owned by the Client they're linked to, and are destroyed 134 | // after the response is sent. if a client disconnects before receiving all of 135 | // its pending responses, the client is unlinked from its ResponseLinks, but the 136 | // ResponseLinks remain. this is necessary because there may be pipelined 137 | // commands on the linked backend connections, and we need to discard the 138 | // responses meant for this client. in this case, the ResponseLinks are owned by 139 | // the BackendConnections, and are destroyed when they're unlinked from the last 140 | // BackendConnection. 141 | // 142 | // if a BackendConnection disconnects early, then all of the ResponseLinks it's 143 | // linked to receive an error response, and they're unlinked from the 144 | // BackendConnection immediately. any ready ResponseLinks are processed (sent 145 | // to the client, if possible) at this time also. 
146 | 147 | struct ResponseLink { 148 | enum class CollectionType { 149 | ForwardResponse = 0, 150 | CollectStatusResponses, 151 | SumIntegerResponses, 152 | CombineMultiResponses, 153 | CollectResponses, 154 | CollectMultiResponsesByKey, 155 | CollectIdenticalResponses, 156 | ModifyScanResponse, 157 | ModifyScriptExistsResponse, 158 | ModifyMigrateResponse, 159 | }; 160 | CollectionType type; 161 | 162 | static const char* name_for_collection_type(CollectionType type); 163 | 164 | Client* client; 165 | ResponseLink* next_client; 166 | std::unordered_map backend_conn_to_next_link; 167 | 168 | std::shared_ptr error_response; 169 | 170 | // type-specific fields 171 | 172 | std::shared_ptr response_to_forward; 173 | 174 | int64_t response_integer_sum; 175 | 176 | Response::Type expected_response_type; 177 | std::vector> responses; 178 | 179 | std::vector recombination_queue; 180 | std::unordered_map> backend_index_to_response; 181 | 182 | int64_t scan_backend_index; 183 | 184 | ResponseLink(CollectionType type, Client* c); 185 | ResponseLink(const ResponseLink&) = delete; 186 | ResponseLink(ResponseLink&&) = delete; 187 | ResponseLink& operator=(const ResponseLink&) = delete; 188 | ResponseLink& operator=(ResponseLink&&) = delete; 189 | ~ResponseLink(); 190 | 191 | bool is_ready() const; 192 | 193 | void print(FILE* stream, int indent_level = 0) const; 194 | }; 195 | 196 | 197 | class Proxy { 198 | public: 199 | struct Stats { 200 | std::atomic num_commands_received; 201 | std::atomic num_commands_sent; 202 | std::atomic num_responses_received; 203 | std::atomic num_responses_sent; 204 | std::atomic num_connections_received; 205 | std::atomic num_clients; 206 | uint64_t start_time; 207 | 208 | Stats(); 209 | }; 210 | 211 | struct Netloc { 212 | std::string name; 213 | std::string host; 214 | int port; 215 | }; 216 | 217 | Proxy(int listen_fd, std::shared_ptr ring, 218 | int hash_begin_delimiter = -1, int hash_end_delimiter = -1, 219 | std::shared_ptr stats = NULL, 
size_t proxy_index = 0); 220 | Proxy(const Proxy&) = delete; 221 | Proxy(Proxy&&) = delete; 222 | Proxy& operator=(const Proxy&) = delete; 223 | Proxy& operator=(Proxy&&) = delete; 224 | ~Proxy() = default; 225 | 226 | bool disable_command(const std::string& command_name); 227 | 228 | void serve(); 229 | void stop(); 230 | 231 | void print(FILE* stream, int indent_level = 0) const; 232 | 233 | private: 234 | // network state 235 | int listen_fd; 236 | std::unique_ptr base; 237 | std::unique_ptr listener; 238 | bool should_exit; 239 | 240 | // connection indexing and lookup 241 | std::shared_ptr ring; 242 | std::vector backends; 243 | std::unordered_map name_to_backend; 244 | std::unordered_map bev_to_backend_conn; 245 | std::unordered_map bev_to_client; 246 | 247 | // stats 248 | size_t proxy_index; 249 | std::shared_ptr stats; 250 | 251 | // hash configuration 252 | int hash_begin_delimiter; 253 | int hash_end_delimiter; 254 | 255 | // backend lookups 256 | int64_t backend_index_for_key(const std::string& s) const; 257 | int64_t backend_index_for_argument(const std::string& arg) const; 258 | Backend& backend_for_index(size_t index); 259 | Backend& backend_for_key(const std::string& s); 260 | BackendConnection& backend_conn_for_index(size_t index); 261 | BackendConnection& backend_conn_for_key(const std::string& s); 262 | 263 | // connection management 264 | void disconnect_client(Client* c); 265 | void disconnect_backend(BackendConnection* b); 266 | 267 | // response linking 268 | ResponseLink* create_link(ResponseLink::CollectionType type, Client* c); 269 | ResponseLink* create_error_link(Client* c, std::shared_ptr r); 270 | struct evbuffer* can_send_command(BackendConnection* conn, ResponseLink* l); 271 | void link_connection(BackendConnection* conn, ResponseLink* l); 272 | void send_command_and_link(BackendConnection* conn, ResponseLink* l, 273 | const DataCommand* cmd); 274 | void send_command_and_link(BackendConnection* conn, ResponseLink* l, 275 | const 
std::shared_ptr& cmd); 276 | void send_command_and_link(BackendConnection* conn, ResponseLink* l, 277 | const ReferenceCommand* cmd); 278 | 279 | // high-level output handlers 280 | void send_client_response(Client* c, const Response* r); 281 | void send_client_response(Client* c, 282 | const std::shared_ptr& r); 283 | void send_client_string_response(Client* c, const char* s, 284 | Response::Type type); 285 | void send_client_string_response(Client* c, const std::string& s, 286 | Response::Type type); 287 | void send_client_string_response(Client* c, const void* data, size_t size, 288 | Response::Type type); 289 | void send_client_int_response(Client* c, int64_t int_value, 290 | Response::Type type); 291 | 292 | // high-level input handlers 293 | void send_ready_response(ResponseLink* l); 294 | void send_all_ready_responses(Client* c); 295 | void handle_backend_response(BackendConnection* conn, 296 | std::shared_ptr r); 297 | void handle_client_command(Client* c, std::shared_ptr cmd); 298 | 299 | // low-level input handlers 300 | static void dispatch_on_client_input(struct bufferevent *bev, void* ctx); 301 | void on_client_input(struct bufferevent *bev); 302 | static void dispatch_on_client_error(struct bufferevent *bev, short events, 303 | void* ctx); 304 | void on_client_error(struct bufferevent *bev, short events); 305 | static void dispatch_on_backend_input(struct bufferevent *bev, void* ctx); 306 | void on_backend_input(struct bufferevent *bev); 307 | static void dispatch_on_backend_error(struct bufferevent *bev, short events, 308 | void* ctx); 309 | void on_backend_error(struct bufferevent *bev, short events); 310 | static void dispatch_on_listen_error(struct evconnlistener *listener, 311 | void* ctx); 312 | void on_listen_error(struct evconnlistener *listener); 313 | static void dispatch_on_client_accept(struct evconnlistener *listener, 314 | evutil_socket_t fd, struct sockaddr *address, int socklen, void* ctx); 315 | void on_client_accept(struct 
evconnlistener *listener, evutil_socket_t fd, 316 | struct sockaddr *address, int socklen); 317 | 318 | // timer event handlers 319 | static void dispatch_check_for_thread_exit(evutil_socket_t fd, short what, 320 | void* ctx); 321 | void check_for_thread_exit(evutil_socket_t fd, short what); 322 | 323 | // generic command implementations 324 | void command_all_collect_responses(Client* c, 325 | std::shared_ptr cmd); 326 | void command_all_collect_status_responses(Client* c, 327 | std::shared_ptr cmd); 328 | void command_all_sum_int_responses(Client* c, 329 | std::shared_ptr cmd); 330 | void command_forward_all(Client* c, std::shared_ptr cmd, 331 | ResponseLink::CollectionType type); 332 | void command_forward_by_key_1(Client* c, std::shared_ptr cmd); 333 | void command_forward_by_key_index(Client* c, std::shared_ptr cmd, 334 | size_t key_index); 335 | void command_forward_by_keys(Client* c, std::shared_ptr cmd, 336 | ssize_t start_key_index, ssize_t end_key_index); 337 | void command_forward_by_keys_1_all(Client* c, 338 | std::shared_ptr cmd); 339 | void command_forward_by_keys_1_2(Client* c, std::shared_ptr cmd); 340 | void command_forward_by_keys_2_all(Client* c, 341 | std::shared_ptr cmd); 342 | void command_forward_random(Client* c, std::shared_ptr cmd); 343 | void command_partition_by_keys(Client* c, std::shared_ptr cmd, 344 | size_t start_arg_index, size_t args_per_key, bool interleaved, 345 | ResponseLink::CollectionType type); 346 | void command_partition_by_keys_1_integer(Client* c, 347 | std::shared_ptr cmd); 348 | void command_partition_by_keys_1_multi(Client* c, 349 | std::shared_ptr cmd); 350 | void command_partition_by_keys_2_status(Client* c, 351 | std::shared_ptr cmd); 352 | void command_unimplemented(Client* c, std::shared_ptr cmd); 353 | void command_default(Client* c, std::shared_ptr cmd); 354 | 355 | // specific command implementations 356 | void command_ACL(Client* c, std::shared_ptr cmd); 357 | void command_BACKEND(Client* c, std::shared_ptr 
cmd); 358 | void command_BACKENDNUM(Client* c, std::shared_ptr cmd); 359 | void command_BACKENDS(Client* c, std::shared_ptr cmd); 360 | void command_CLIENT(Client* c, std::shared_ptr cmd); 361 | void command_DBSIZE(Client* c, std::shared_ptr cmd); 362 | void command_DEBUG(Client* c, std::shared_ptr cmd); 363 | void command_ECHO(Client* c, std::shared_ptr cmd); 364 | void command_EVAL(Client* c, std::shared_ptr cmd); 365 | void command_FORWARD(Client* c, std::shared_ptr cmd); 366 | void command_GEORADIUS(Client* c, std::shared_ptr cmd); 367 | void command_INFO(Client* c, std::shared_ptr cmd); 368 | void command_KEYS(Client* c, std::shared_ptr cmd); 369 | void command_LATENCY(Client* c, std::shared_ptr cmd); 370 | void command_MEMORY(Client* c, std::shared_ptr cmd); 371 | void command_MIGRATE(Client* c, std::shared_ptr cmd); 372 | void command_MODULE(Client* c, std::shared_ptr cmd); 373 | void command_MSETNX(Client* c, std::shared_ptr cmd); 374 | void command_OBJECT(Client* c, std::shared_ptr cmd); 375 | void command_PING(Client* c, std::shared_ptr cmd); 376 | void command_PRINTSTATE(Client* c, std::shared_ptr cmd); 377 | void command_QUIT(Client* c, std::shared_ptr cmd); 378 | void command_ROLE(Client* c, std::shared_ptr cmd); 379 | void command_SCAN(Client* c, std::shared_ptr cmd); 380 | void command_SCRIPT(Client* c, std::shared_ptr cmd); 381 | void command_XGROUP(Client* c, std::shared_ptr cmd); 382 | void command_XINFO(Client* c, std::shared_ptr cmd); 383 | void command_XREAD(Client* c, std::shared_ptr cmd); 384 | void command_ZACTIONSTORE(Client* c, std::shared_ptr cmd); 385 | 386 | // helpers for command implementations 387 | uint8_t scan_cursor_backend_index_bits() const; 388 | 389 | // handler index 390 | typedef void (Proxy::*command_handler)(Client* c, 391 | std::shared_ptr cmd); 392 | std::unordered_map handlers; 393 | static const std::unordered_map 394 | default_handlers; 395 | }; 396 | 
-------------------------------------------------------------------------------- /README: -------------------------------------------------------------------------------- 1 | redis-shatter 2 | ------------- 3 | 4 | redis-shatter is a sharding proxy for the Redis protocol, inspired by twemproxy 5 | (https://github.com/twitter/twemproxy). Documentation on how to actually use 6 | this is in the example configuration file, redis-shatter.conf. 7 | 8 | In essence, this proxy appears like a standard Redis server to other hosts, but 9 | doesn't store any data locally - instead, keys are distributed between multiple 10 | backends (other hosts that speak the Redis protocol, usually Redis servers). 11 | This can give useful speed, reliability, and scalability benefits, but has some 12 | significant drawbacks as well: 13 | - Command execution requires another network hop, which comes with a performance 14 | penalty. Clients that pipeline commands are somewhat insulated from this 15 | issue. 16 | - Complex commands that affect multiple keys cannot be run efficiently unless the 17 | affected keys are all on the same backend. There are inefficient ways to run 18 | these commands, but these are (currently) not implemented by redis-shatter. 19 | 20 | 21 | Performance 22 | ----------- 23 | 24 | redis-shatter in front of 8 Redis server instances on the same machine was about 25 | 60% as fast as a single Redis server itself, according to side-by-side 26 | redis-benchmark tests. Very little performance optimization work has been done, 27 | so there's probably a lot of room for improvement here. 28 | 29 | Like most of my projects, this is only tested at a small scale (so far), so 30 | there may be unfound bugs or inefficiencies. Use at your own risk. 31 | 32 | 33 | Behavior 34 | -------- 35 | 36 | Unlike other similar projects, redis-shatter strives to implement as many 37 | commands as possible. 
For some of the less-common commands, redis-shatter 38 | deviates from the standard Redis protocol in order to implement reasonable 39 | behavior. These deviations are explained in the notes in the below table. 40 | 41 | Command -- Supported -- Notes (see end) 42 | ---------------------------------------------------- 43 | ACL CAT -- Yes -- *E 44 | ACL DELUSER -- Yes -- *6 45 | ACL GENPASS -- Yes -- *E 46 | ACL GETUSER -- Yes -- *5 47 | ACL HELP -- Yes -- *E 48 | ACL LIST -- Yes -- *5 49 | ACL LOAD -- Yes -- *6 50 | ACL LOG -- Yes -- *5 51 | ACL SAVE -- Yes -- *6 52 | ACL SETUSER -- Yes -- *6 53 | ACL USERS -- Yes -- *5 54 | ACL WHOAMI -- No -- 55 | APPEND -- Yes -- 56 | AUTH -- No -- 57 | BGREWRITEAOF -- Yes -- *6 58 | BGSAVE -- Yes -- *6 59 | BITCOUNT -- Yes -- 60 | BITFIELD -- Yes -- 61 | BITOP -- Yes -- *2 62 | BITPOS -- Yes -- 63 | BLPOP -- No -- 64 | BRPOP -- No -- 65 | BRPOPLPUSH -- No -- 66 | BZPOPMAX -- No -- 67 | BZPOPMIN -- No -- 68 | CLIENT CACHING -- No -- 69 | CLIENT GETNAME -- Yes -- *F 70 | CLIENT GETREDIR -- No -- 71 | CLIENT ID -- No -- 72 | CLIENT KILL -- No -- 73 | CLIENT LIST -- Yes -- *C 74 | CLIENT PAUSE -- No -- 75 | CLIENT REPLY -- No -- 76 | CLIENT SETNAME -- Yes -- *F 77 | CLIENT TRACKING -- No -- 78 | CLIENT UNBLOCK -- No -- 79 | CLUSTER -- No -- *H 80 | COMMAND -- Yes -- *E 81 | COMMAND COUNT -- Yes -- *E 82 | COMMAND GETKEYS -- Yes -- *E 83 | COMMAND INFO -- Yes -- *E 84 | CONFIG GET -- Yes -- *5 85 | CONFIG RESETSTAT -- Yes -- *6 86 | CONFIG REWRITE -- Yes -- *6 87 | CONFIG SET -- Yes -- *6 88 | DBSIZE -- Yes -- *A 89 | DEBUG OBJECT -- Yes -- 90 | DEBUG SEGFAULT -- No -- 91 | DECR -- Yes -- 92 | DECRBY -- Yes -- 93 | DEL -- Yes -- *4 94 | DISCARD -- No -- 95 | DUMP -- Yes -- 96 | ECHO -- Yes -- 97 | EVAL -- Yes -- *0 *2 98 | EVALSHA -- Yes -- *0 *2 *G 99 | EXEC -- No -- 100 | EXISTS -- Yes -- *4 101 | EXPIRE -- Yes -- 102 | EXPIREAT -- Yes -- 103 | FLUSHALL -- Yes -- *6 104 | FLUSHDB -- Yes -- *6 *7 105 | GEOADD -- Yes -- 106 | GEODIST 
-- Yes -- 107 | GEOHASH -- Yes -- 108 | GEOPOS -- Yes -- 109 | GEORADIUS -- Yes -- *2 110 | GEORADIUSBYMEMBER -- Yes -- *2 111 | GET -- Yes -- 112 | GETBIT -- Yes -- 113 | GETRANGE -- Yes -- 114 | GETSET -- Yes -- 115 | HDEL -- Yes -- 116 | HELLO -- No -- 117 | HEXISTS -- Yes -- 118 | HGET -- Yes -- 119 | HGETALL -- Yes -- 120 | HINCRBY -- Yes -- 121 | HINCRBYFLOAT -- Yes -- 122 | HKEYS -- Yes -- 123 | HLEN -- Yes -- 124 | HMGET -- Yes -- 125 | HMSET -- Yes -- 126 | HSCAN -- Yes -- 127 | HSET -- Yes -- 128 | HSETNX -- Yes -- 129 | HSTRLEN -- Yes -- 130 | HVALS -- Yes -- 131 | INCR -- Yes -- 132 | INCRBY -- Yes -- 133 | INCRBYFLOAT -- Yes -- 134 | INFO -- Yes -- *8 135 | KEYS -- Yes -- *9 *A 136 | LASTSAVE -- Yes -- *5 137 | LATENCY DOCTOR -- Yes -- *5 138 | LATENCY GRAPH -- Yes -- *5 139 | LATENCY RESET -- Yes -- *5 140 | LATENCY LATEST -- Yes -- *5 141 | LATENCY HISTORY -- Yes -- *5 142 | LATENCY HELP -- Yes -- *E 143 | LINDEX -- Yes -- 144 | LINSERT -- Yes -- 145 | LLEN -- Yes -- 146 | LOLWUT -- Yes -- *E 147 | LPOP -- Yes -- 148 | LPUSH -- Yes -- 149 | LPUSHX -- Yes -- 150 | LRANGE -- Yes -- 151 | LREM -- Yes -- 152 | LSET -- Yes -- 153 | LTRIM -- Yes -- 154 | MEMORY DOCTOR -- Yes -- *5 155 | MEMORY HELP -- Yes -- *E 156 | MEMORY MALLOC-STATS -- Yes -- *5 157 | MEMORY PURGE -- Yes -- *5 158 | MEMORY STATS -- Yes -- *5 159 | MEMORY USAGE -- Yes -- 160 | MGET -- Yes -- *4 161 | MIGRATE -- Yes -- *4 *J 162 | MODULE LIST -- Yes -- *5 163 | MODULE LOAD -- Yes -- *5 164 | MODULE UNLOAD -- Yes -- *5 165 | MONITOR -- No -- 166 | MOVE -- No -- 167 | MSET -- Yes -- *4 168 | MSETNX -- Yes -- *2 169 | MULTI -- No -- 170 | OBJECT ENCODING -- Yes -- 171 | OBJECT FREQ -- Yes -- 172 | OBJECT IDLETIME -- Yes -- 173 | OBJECT REFCOUNT -- Yes -- 174 | OBJECT HELP -- Yes -- *E 175 | PERSIST -- Yes -- 176 | PEXPIRE -- Yes -- 177 | PEXPIREAT -- Yes -- 178 | PFADD -- Yes -- 179 | PFCOUNT -- Yes -- *2 180 | PFMERGE -- Yes -- *2 181 | PING -- Yes -- 182 | PSETEX -- Yes -- 183 | 
PSUBSCRIBE -- No -- 184 | PTTL -- Yes -- 185 | PUBLISH -- No -- 186 | PUBSUB -- No -- 187 | PUNSUBSCRIBE -- No -- 188 | QUIT -- Yes -- 189 | RANDOMKEY -- Yes -- *1 190 | READONLY -- No -- 191 | READWRITE -- No -- 192 | RENAME -- Yes -- *2 193 | RENAMENX -- Yes -- *2 194 | REPLICAOF -- No -- *H 195 | RESTORE -- Yes -- 196 | ROLE -- Yes -- *K 197 | RPOP -- Yes -- 198 | RPOPLPUSH -- Yes -- *2 199 | RPUSH -- Yes -- 200 | RPUSHX -- Yes -- 201 | SADD -- Yes -- 202 | SAVE -- Yes -- *6 203 | SCAN -- Yes -- *A *B 204 | SCARD -- Yes -- 205 | SCRIPT DEBUG -- No -- 206 | SCRIPT EXISTS -- Yes -- *D 207 | SCRIPT FLUSH -- Yes -- *6 208 | SCRIPT KILL -- No -- *H 209 | SCRIPT LOAD -- Yes -- *6 210 | SDIFF -- Yes -- *2 211 | SDIFFSTORE -- Yes -- *2 212 | SELECT -- No -- 213 | SET -- Yes -- 214 | SETBIT -- Yes -- 215 | SETEX -- Yes -- 216 | SETNX -- Yes -- 217 | SETRANGE -- Yes -- 218 | SHUTDOWN -- Yes -- *6 219 | SINTER -- Yes -- *2 220 | SINTERSTORE -- Yes -- *2 221 | SISMEMBER -- Yes -- 222 | STRALGO -- No -- 223 | SLAVEOF -- No -- *H 224 | SLOWLOG -- Yes -- *5 225 | SMEMBERS -- Yes -- 226 | SMOVE -- Yes -- *2 227 | SORT -- Yes -- *2 *3 228 | SPOP -- Yes -- 229 | SRANDMEMBER -- Yes -- 230 | SREM -- Yes -- 231 | SSCAN -- Yes -- 232 | STRLEN -- Yes -- 233 | SUBSCRIBE -- No -- 234 | SUNION -- Yes -- *2 235 | SUNIONSTORE -- Yes -- *2 236 | SWAPDB -- No -- 237 | SYNC -- No -- 238 | TIME -- Yes -- *6 239 | TOUCH -- Yes -- 240 | TTL -- Yes -- 241 | TYPE -- Yes -- 242 | UNLINK -- Yes -- *4 243 | UNSUBSCRIBE -- No -- 244 | UNWATCH -- No -- 245 | WAIT -- No -- 246 | WATCH -- No -- 247 | XACK -- Yes -- 248 | XADD -- Yes -- 249 | XCLAIM -- Yes -- 250 | XDEL -- Yes -- 251 | XGROUP -- Yes -- 252 | XINFO -- Yes -- 253 | XLEN -- Yes -- 254 | XPENDING -- Yes -- 255 | XRANGE -- Yes -- 256 | XREAD -- Yes -- *L 257 | XREADGROUP -- Yes -- *L 258 | XREVRANGE -- Yes -- 259 | XTRIM -- Yes -- 260 | ZADD -- Yes -- 261 | ZCARD -- Yes -- 262 | ZCOUNT -- Yes -- 263 | ZINCRBY -- Yes -- 264 | ZINTERSTORE -- Yes 
-- *2 265 | ZLEXCOUNT -- Yes -- 266 | ZPOPMAX -- Yes -- 267 | ZPOPMIN -- Yes -- 268 | ZRANGE -- Yes -- 269 | ZRANGEBYLEX -- Yes -- 270 | ZRANGEBYSCORE -- Yes -- 271 | ZRANK -- Yes -- 272 | ZREM -- Yes -- 273 | ZREMRANGEBYLEX -- Yes -- 274 | ZREMRANGEBYRANK -- Yes -- 275 | ZREMRANGEBYSCORE -- Yes -- 276 | ZREVRANGE -- Yes -- 277 | ZREVRANGEBYLEX -- Yes -- 278 | ZREVRANGEBYSCORE -- Yes -- 279 | ZREVRANK -- Yes -- 280 | ZSCAN -- Yes -- 281 | ZSCORE -- Yes -- 282 | ZUNIONSTORE -- Yes -- *2 283 | 284 | Notes: 285 | *0 -- Scripts that affect no keys will run on a random backend. 286 | *1 -- Distribution of random keys may not be exactly uniform. RANDOMKEY is 287 | implemented by choosing a random backend and sending RANDOMKEY to it, so 288 | if backend A has more keys than backend B, the probability of returning 289 | each key from backend B is higher. 290 | *2 -- The affected keys must all be on the same backend. If they aren't, the 291 | command fails with PROXYERROR. 292 | *3 -- The proxy does not check that all the affected keys are on the same 293 | backend; the application has to do this itself. (This is because complex 294 | patterns may be given in e.g. the GET clause.) 295 | *4 -- These commands are atomic only on each backend; they are not atomic across 296 | all backends. 297 | *5 -- The proxy's response format is different than that of Redis - the proxy 298 | returns a multi response with one field per backend. If this isn't what 299 | you want, you can use FORWARD to interact with a single backend at once. 300 | *6 -- These commands are forwarded to all backends. 301 | *7 -- Since the proxy doesn't support multiple redis DBs, FLUSHDB is essentially 302 | equivalent to FLUSHALL. 303 | *8 -- INFO syntax is different from what redis-server expects. 
These are the 304 | valid forms of the INFO command in redis-shatter: 305 | - INFO - return proxy information 306 | - INFO BACKEND <backend-name> - return proxy stats for a backend 307 | - INFO <backend-name> [section] - send INFO to a specific backend and 308 | return its response verbatim 309 | For the first two forms, the response format is different as well, but 310 | should be mostly self-explanatory. The third form is equivalent to the 311 | command FORWARD <backend-name> INFO [section]. 312 | *9 -- May be slow and consume lots of memory if run on large datasets. It's 313 | almost always better to use SCAN instead. 314 | *A -- Keys may be considered or returned multiple times or be inaccessible 315 | through the proxy if they exist on the wrong backend. For example, if 'x' 316 | belongs on backend 3 but exists on backends 5 and 3, then 'x' will appear 317 | twice in the KEYS or SCAN results, or will be double-counted when running 318 | DBSIZE. If 'x' exists on backend 5 and not 3, then 'x' will appear once in 319 | the KEYS/SCAN results but 'GET x' will return nil. 320 | *B -- SCAN over the entire keyspace is implemented by scanning on each backend 321 | in turn. The proxy uses the highest-order bits of the cursor to keep track 322 | of which backend is currently being scanned. If the backend returns a 323 | cursor that has any of these bits set, the scan will fail. Most practical 324 | setups shouldn't run into this limit. 325 | *C -- The returned fields are different from redis-server. They are: 326 | - name: the client's name (this includes the host:port string). 327 | - cmdrecv: number of commands received from this client. 328 | - rspsent: number of responses sent to this client. 329 | - rspchain: response chain length (the number of responses that haven't 330 | been sent to the client yet because there isn't enough information to 331 | generate the response - some backends haven't replied yet).
332 | *D -- 1 will be returned for a script only if it exists on all backends - if it 333 | is missing on one or more backends, 0 is returned. 334 | *E -- These commands are implemented by forwarding them to a random backend. If 335 | the backends are not configured identically (i.e. some have rename-command 336 | directives in their configs and some don't) then the results may differ 337 | between subsequent calls. 338 | *F -- Names pertain only to the connection between the client and the proxy. 339 | *G -- EVALSHA is more likely to fail in a sharded environment, since the script 340 | needs to be loaded into all the backends' script caches for it to work on 341 | arbitrary keys. Although EVAL implicitly loads the script into the script 342 | cache, EVAL is only forwarded to one backend at a time. If you plan to run 343 | a script many times on different keys, use SCRIPT LOAD first to load the 344 | script on all backends at once. 345 | *H -- Note that any unsupported commands can still be run on individual backends 346 | by using the FORWARD command, but be careful when doing this. See below. 347 | *I -- Which command referred you to this note? 348 | *J -- If any backend returns an error, then MIGRATE returns a multi response 349 | containing the responses from all backends. If no backend returns an 350 | error, then MIGRATE returns NOKEY if all backends returned NOKEY, and OK 351 | otherwise. 352 | *K -- This command returns a multi response with two fields. The first field is 353 | the string "proxy"; the second field is a multi response containing the 354 | names of all of the backends. 355 | *L -- Blocking reads are not supported (the BLOCK argument to these commands 356 | must not be given). 357 | 358 | 359 | Administration 360 | -------------- 361 | 362 | redis-shatter implements many administrative commands that are omitted in other 363 | similar proxies. See the commands table above for details. 
364 | 365 | redis-shatter also implements some administrative commands that aren't part of 366 | the official Redis protocol. These commands are: 367 | 368 | BACKEND key [key ...] 369 | Returns the name of the backend on which the given key belongs, as a data 370 | response. If multiple keys are given, returns a multi response with one data 371 | element per key. 372 | 373 | BACKENDNUM key [key ...] 374 | Returns the number of the backend on which the given key belongs, as an 375 | integer response. If multiple keys are given, returns a multi response with 376 | one integer element per key. 377 | 378 | BACKENDS 379 | Returns a list of all backends. The items are formatted as "host:port@name". 380 | 381 | FORWARD backend-name command [args...] 382 | Forwards the given command directly to the given backend and return its 383 | response verbatim. You can use this in lieu of connecting directly to the 384 | backend. This command shares the backend connections with all other clients, 385 | so forwarding commands that affect connection state like MULTI or SELECT will 386 | cause misbehavior. 387 | 388 | FORWARD "" command [args...] 389 | Forwards the given command to all backends and returns a multi response 390 | containing the backends' responses. As noted above, don't forward commands 391 | that affect connection state. 392 | 393 | PRINTSTATE 394 | Prints the proxy's internal state to stderr. 395 | -------------------------------------------------------------------------------- /redis-shatter.conf.json: -------------------------------------------------------------------------------- 1 | { 2 | // Example configuration file for redis-shatter. 3 | // This is standard JSON, but (obviously) supports comments. 4 | 5 | // The configuration is a dictionary of proxy names to proxy instances. One 6 | // redis-shatter process can support many proxy instances, each of which can 7 | // be served by multiple threads. 
8 | 9 | "default": { 10 | // Number of threads to run for this proxy instance. Must be at least 1. 11 | // Incoming connections are pseudorandomly assigned to one of the instances. 12 | "num_threads": 4, 13 | 14 | // Which CPUs this proxy instance should use. 15 | // - If nonzero, each thread will run on exactly one of the CPUs given in 16 | // this mask. Setting this to -1 allows all CPUs to be used, but each 17 | // thread still runs on exactly one CPU. 18 | // - If zero, threads will not be assigned to any CPU. 19 | "affinity_cpus": -1, 20 | 21 | // Port and interface on which to listen. If omitted, the defaults are to 22 | // listen on all interfaces on port 6379. 23 | "interface": "0.0.0.0", 24 | "port": 6379, 25 | 26 | // List of backends. Order doesn't matter here. Keys are distributed over 27 | // these backends using a consistent hash ring with the fnv1a64 hash 28 | // function. (The ring's behavior can be changed with the hash_precision 29 | // setting below.) Backends have names that are independent of their network 30 | // location; this is used to relocate backends while keeping the same key 31 | // distribution. 32 | "backends": { 33 | "shard1": "localhost:6381", 34 | "shard2": "localhost:6382", 35 | "shard3": "localhost:6383", 36 | "shard4": "localhost:6384", 37 | "shard5": "localhost:6385", 38 | "shard6": "localhost:6386", 39 | "shard7": "localhost:6387", 40 | "shard8": "localhost:6388", 41 | }, 42 | 43 | // You can optionally disable some commands if you don't want redis-shatter 44 | // to forward them to backends. By default, we disable a few dangerous 45 | // commands. 46 | "disable_commands": ["FLUSHDB", "FLUSHALL", "KEYS"], 47 | 48 | // Hash precision and distribution scheme. 49 | // - If set to zero, redis-shatter uses the same log-time distribution 50 | // scheme as twemproxy (nutcracker), so it can be used with the same 51 | // backends as an existing twemproxy/nutcracker instance. 
52 | // - If set to a positive number, redis-shatter uses a constant-time 53 | // distribution scheme. The precision value determines the number of hash 54 | // bits to use in the ring lookup table. A higher value means more 55 | // uniformity but also more memory usage - the table uses (2^precision) 56 | // bytes in memory. 57 | // Changing this value for a non-empty cluster will cause some keys to be 58 | // "left behind" on the wrong backend and inaccessible through the proxy. 59 | "hash_precision": 17, 60 | 61 | // Hash field delimiters. These can be used to make sure keys hash to the 62 | // same backend. 63 | // 64 | // How it works: 65 | // - If a key contains both delimiters, then only the portion of the key 66 | // between the delimiters is hashed to determine which server to send the 67 | // key to. 68 | // - If a key contains only the begin delimiter, then the portion of the key 69 | // after the first occurrence of the begin delimiter is used. 70 | // - If a key contains only the end delimiter, then the portion of the key 71 | // before the last occurrence of the end delimiter is used. 72 | // - If the delimiters are the same and a key contains only one delimiter, 73 | // then it is treated as an end delimiter. (What happens is the same as 74 | // case 3.) 75 | // - If the end delimiter comes before the begin delimiter, then only 76 | // the end delimiter is used. (What happens is the same as case 3.) 
77 | // 78 | // Example: hash_field_begin="{", hash_field_end="}" 79 | // xy{z}, xy{z, z, x{z}y, z}xy, z}x{y all hash to the same server 80 | // 81 | // Example: hash_field_begin=":", hash_field_end=":" 82 | // xy:z:, z, z:xy all hash to the same server 83 | // xy:z, xy hash to the same server, which may not be the same as above 84 | // 85 | // Example: hash_field_begin missing, hash_field_end=":" 86 | // xy:z::, xy, xy: all hash to the same server 87 | // xyz, xyz:w hash to the same server, which may not be the same as above 88 | // 89 | // Example: hash_field_begin=":", hash_field_end missing 90 | // xy:z, z, x:y:z all hash to the same server 91 | // xyz, w:xyz hash to the same server, which may not be the same as above 92 | "hash_field_begin": "{", 93 | "hash_field_end": "}", 94 | }, 95 | } -------------------------------------------------------------------------------- /redis.conf: -------------------------------------------------------------------------------- 1 | # Redis configuration file example 2 | 3 | # Note on units: when memory size is needed, it is possible to specify 4 | # it in the usual form of 1k 5GB 4M and so forth: 5 | # 6 | # 1k => 1000 bytes 7 | # 1kb => 1024 bytes 8 | # 1m => 1000000 bytes 9 | # 1mb => 1024*1024 bytes 10 | # 1g => 1000000000 bytes 11 | # 1gb => 1024*1024*1024 bytes 12 | # 13 | # units are case insensitive so 1GB 1Gb 1gB are all the same. 14 | 15 | # By default Redis does not run as a daemon. Use 'yes' if you need it. 16 | # Note that Redis will write a pid file in /var/run/redis.pid when daemonized. 17 | daemonize no 18 | 19 | # When running daemonized, Redis writes a pid file in /var/run/redis.pid by 20 | # default. You can specify a custom pid file location here. 21 | pidfile /var/run/redis.pid 22 | 23 | # Accept connections on the specified port, default is 6379. 24 | # If port 0 is specified Redis will not listen on a TCP socket. 
25 | port 638@@REDIS_NUM@@ 26 | 27 | # If you want you can bind a single interface, if the bind option is not 28 | # specified all the interfaces will listen for incoming connections. 29 | # 30 | bind 127.0.0.1 31 | 32 | # Specify the path for the unix socket that will be used to listen for 33 | # incoming connections. There is no default, so Redis will not listen 34 | # on a unix socket when not specified. 35 | # 36 | # unixsocket /tmp/redis.sock 37 | # unixsocketperm 755 38 | 39 | # Close the connection after a client is idle for N seconds (0 to disable) 40 | timeout 0 41 | 42 | # TCP keepalive. 43 | # 44 | # If non-zero, use SO_KEEPALIVE to send TCP ACKs to clients in absence 45 | # of communication. This is useful for two reasons: 46 | # 47 | # 1) Detect dead peers. 48 | # 2) Take the connection alive from the point of view of network 49 | # equipment in the middle. 50 | # 51 | # On Linux, the specified value (in seconds) is the period used to send ACKs. 52 | # Note that to close the connection the double of the time is needed. 53 | # On other kernels the period depends on the kernel configuration. 54 | # 55 | # A reasonable value for this option is 60 seconds. 56 | tcp-keepalive 0 57 | 58 | # Specify the server verbosity level. 59 | # This can be one of: 60 | # debug (a lot of information, useful for development/testing) 61 | # verbose (many rarely useful info, but not a mess like the debug level) 62 | # notice (moderately verbose, what you want in production probably) 63 | # warning (only very important / critical messages are logged) 64 | loglevel notice 65 | 66 | # Specify the log file name. Also 'stdout' can be used to force 67 | # Redis to log on the standard output. Note that if you use standard 68 | # output for logging but daemonize, logs will be sent to /dev/null 69 | logfile stdout 70 | 71 | # To enable logging to the system logger, just set 'syslog-enabled' to yes, 72 | # and optionally update the other syslog parameters to suit your needs. 
73 | # syslog-enabled no 74 | 75 | # Specify the syslog identity. 76 | # syslog-ident redis 77 | 78 | # Specify the syslog facility. Must be USER or between LOCAL0-LOCAL7. 79 | # syslog-facility local0 80 | 81 | # Set the number of databases. The default database is DB 0, you can select 82 | # a different one on a per-connection basis using SELECT <dbid> where 83 | # dbid is a number between 0 and 'databases'-1 84 | databases 16 85 | 86 | ################################ SNAPSHOTTING ################################# 87 | # 88 | # Save the DB on disk: 89 | # 90 | # save <seconds> <changes> 91 | # 92 | # Will save the DB if both the given number of seconds and the given 93 | # number of write operations against the DB occurred. 94 | # 95 | # In the example below the behaviour will be to save: 96 | # after 900 sec (15 min) if at least 1 key changed 97 | # after 300 sec (5 min) if at least 10 keys changed 98 | # after 60 sec if at least 10000 keys changed 99 | # 100 | # Note: you can disable saving at all commenting all the "save" lines. 101 | # 102 | # It is also possible to remove all the previously configured save 103 | # points by adding a save directive with a single empty string argument 104 | # like in the following example: 105 | # 106 | # save "" 107 | 108 | #save 900 1 109 | #save 300 10 110 | #save 60 10000 111 | 112 | # By default Redis will stop accepting writes if RDB snapshots are enabled 113 | # (at least one save point) and the latest background save failed. 114 | # This will make the user aware (in an hard way) that data is not persisting 115 | # on disk properly, otherwise chances are that no one will notice and some 116 | # disaster will happen. 117 | # 118 | # If the background saving process will start working again Redis will 119 | # automatically allow writes again.
120 | # 121 | # However if you have setup your proper monitoring of the Redis server 122 | # and persistence, you may want to disable this feature so that Redis will 123 | # continue to work as usually even if there are problems with disk, 124 | # permissions, and so forth. 125 | stop-writes-on-bgsave-error yes 126 | 127 | # Compress string objects using LZF when dump .rdb databases? 128 | # For default that's set to 'yes' as it's almost always a win. 129 | # If you want to save some CPU in the saving child set it to 'no' but 130 | # the dataset will likely be bigger if you have compressible values or keys. 131 | rdbcompression yes 132 | 133 | # Since version 5 of RDB a CRC64 checksum is placed at the end of the file. 134 | # This makes the format more resistant to corruption but there is a performance 135 | # hit to pay (around 10%) when saving and loading RDB files, so you can disable it 136 | # for maximum performances. 137 | # 138 | # RDB files created with checksum disabled have a checksum of zero that will 139 | # tell the loading code to skip the check. 140 | rdbchecksum yes 141 | 142 | # The filename where to dump the DB 143 | dbfilename dump.rdb 144 | 145 | # The working directory. 146 | # 147 | # The DB will be written inside this directory, with the filename specified 148 | # above using the 'dbfilename' configuration directive. 149 | # 150 | # The Append Only File will also be created inside this directory. 151 | # 152 | # Note that you must specify a directory here, not a file name. 153 | dir ./ 154 | 155 | ################################# REPLICATION ################################# 156 | 157 | # Master-Slave replication. Use slaveof to make a Redis instance a copy of 158 | # another Redis server. Note that the configuration is local to the slave 159 | # so for example it is possible to configure the slave to save the DB with a 160 | # different interval, or to listen to another port, and so on. 
161 | # 162 | # slaveof 163 | 164 | # If the master is password protected (using the "requirepass" configuration 165 | # directive below) it is possible to tell the slave to authenticate before 166 | # starting the replication synchronization process, otherwise the master will 167 | # refuse the slave request. 168 | # 169 | # masterauth 170 | 171 | # When a slave loses its connection with the master, or when the replication 172 | # is still in progress, the slave can act in two different ways: 173 | # 174 | # 1) if slave-serve-stale-data is set to 'yes' (the default) the slave will 175 | # still reply to client requests, possibly with out of date data, or the 176 | # data set may just be empty if this is the first synchronization. 177 | # 178 | # 2) if slave-serve-stale-data is set to 'no' the slave will reply with 179 | # an error "SYNC with master in progress" to all the kind of commands 180 | # but to INFO and SLAVEOF. 181 | # 182 | slave-serve-stale-data yes 183 | 184 | # You can configure a slave instance to accept writes or not. Writing against 185 | # a slave instance may be useful to store some ephemeral data (because data 186 | # written on a slave will be easily deleted after resync with the master) but 187 | # may also cause problems if clients are writing to it because of a 188 | # misconfiguration. 189 | # 190 | # Since Redis 2.6 by default slaves are read-only. 191 | # 192 | # Note: read only slaves are not designed to be exposed to untrusted clients 193 | # on the internet. It's just a protection layer against misuse of the instance. 194 | # Still a read only slave exports by default all the administrative commands 195 | # such as CONFIG, DEBUG, and so forth. To a limited extend you can improve 196 | # security of read only slaves using 'rename-command' to shadow all the 197 | # administrative / dangerous commands. 198 | slave-read-only yes 199 | 200 | # Slaves send PINGs to server in a predefined interval. 
It's possible to change 201 | # this interval with the repl_ping_slave_period option. The default value is 10 202 | # seconds. 203 | # 204 | # repl-ping-slave-period 10 205 | 206 | # The following option sets a timeout for both Bulk transfer I/O timeout and 207 | # master data or ping response timeout. The default value is 60 seconds. 208 | # 209 | # It is important to make sure that this value is greater than the value 210 | # specified for repl-ping-slave-period otherwise a timeout will be detected 211 | # every time there is low traffic between the master and the slave. 212 | # 213 | # repl-timeout 60 214 | 215 | # Disable TCP_NODELAY on the slave socket after SYNC? 216 | # 217 | # If you select "yes" Redis will use a smaller number of TCP packets and 218 | # less bandwidth to send data to slaves. But this can add a delay for 219 | # the data to appear on the slave side, up to 40 milliseconds with 220 | # Linux kernels using a default configuration. 221 | # 222 | # If you select "no" the delay for data to appear on the slave side will 223 | # be reduced but more bandwidth will be used for replication. 224 | # 225 | # By default we optimize for low latency, but in very high traffic conditions 226 | # or when the master and slaves are many hops away, turning this to "yes" may 227 | # be a good idea. 228 | repl-disable-tcp-nodelay no 229 | 230 | # The slave priority is an integer number published by Redis in the INFO output. 231 | # It is used by Redis Sentinel in order to select a slave to promote into a 232 | # master if the master is no longer working correctly. 233 | # 234 | # A slave with a low priority number is considered better for promotion, so 235 | # for instance if there are three slaves with priority 10, 100, 25 Sentinel will 236 | # pick the one with priority 10, that is the lowest. 
237 | # 238 | # However a special priority of 0 marks the slave as not able to perform the 239 | # role of master, so a slave with priority of 0 will never be selected by 240 | # Redis Sentinel for promotion. 241 | # 242 | # By default the priority is 100. 243 | slave-priority 100 244 | 245 | ################################## SECURITY ################################### 246 | 247 | # Require clients to issue AUTH before processing any other 248 | # commands. This might be useful in environments in which you do not trust 249 | # others with access to the host running redis-server. 250 | # 251 | # This should stay commented out for backward compatibility and because most 252 | # people do not need auth (e.g. they run their own servers). 253 | # 254 | # Warning: since Redis is pretty fast an outside user can try up to 255 | # 150k passwords per second against a good box. This means that you should 256 | # use a very strong password otherwise it will be very easy to break. 257 | # 258 | # requirepass foobared 259 | 260 | # Command renaming. 261 | # 262 | # It is possible to change the name of dangerous commands in a shared 263 | # environment. For instance the CONFIG command may be renamed into something 264 | # hard to guess so that it will still be available for internal-use tools 265 | # but not available for general clients. 266 | # 267 | # Example: 268 | # 269 | # rename-command CONFIG b840fc02d524045429941cc15f59e41cb7be6c52 270 | # 271 | # It is also possible to completely kill a command by renaming it into 272 | # an empty string: 273 | # 274 | # rename-command CONFIG "" 275 | # 276 | # Please note that changing the name of commands that are logged into the 277 | # AOF file or transmitted to slaves may cause problems. 278 | 279 | ################################### LIMITS #################################### 280 | 281 | # Set the max number of connected clients at the same time. 
By default 282 | # this limit is set to 10000 clients, however if the Redis server is not 283 | # able to configure the process file limit to allow for the specified limit 284 | # the max number of allowed clients is set to the current file limit 285 | # minus 32 (as Redis reserves a few file descriptors for internal uses). 286 | # 287 | # Once the limit is reached Redis will close all the new connections sending 288 | # an error 'max number of clients reached'. 289 | # 290 | # maxclients 10000 291 | 292 | # Don't use more memory than the specified amount of bytes. 293 | # When the memory limit is reached Redis will try to remove keys 294 | # accordingly to the eviction policy selected (see maxmemory-policy). 295 | # 296 | # If Redis can't remove keys according to the policy, or if the policy is 297 | # set to 'noeviction', Redis will start to reply with errors to commands 298 | # that would use more memory, like SET, LPUSH, and so on, and will continue 299 | # to reply to read-only commands like GET. 300 | # 301 | # This option is usually useful when using Redis as an LRU cache, or to set 302 | # an hard memory limit for an instance (using the 'noeviction' policy). 303 | # 304 | # WARNING: If you have slaves attached to an instance with maxmemory on, 305 | # the size of the output buffers needed to feed the slaves are subtracted 306 | # from the used memory count, so that network problems / resyncs will 307 | # not trigger a loop where keys are evicted, and in turn the output 308 | # buffer of slaves is full with DELs of keys evicted triggering the deletion 309 | # of more keys, and so forth until the database is completely emptied. 310 | # 311 | # In short... if you have slaves attached it is suggested that you set a lower 312 | # limit for maxmemory so that there is some free RAM on the system for slave 313 | # output buffers (but this is not needed if the policy is 'noeviction'). 
314 | # 315 | # maxmemory 316 | 317 | # MAXMEMORY POLICY: how Redis will select what to remove when maxmemory 318 | # is reached. You can select among five behaviors: 319 | # 320 | # volatile-lru -> remove the key with an expire set using an LRU algorithm 321 | # allkeys-lru -> remove any key accordingly to the LRU algorithm 322 | # volatile-random -> remove a random key with an expire set 323 | # allkeys-random -> remove a random key, any key 324 | # volatile-ttl -> remove the key with the nearest expire time (minor TTL) 325 | # noeviction -> don't expire at all, just return an error on write operations 326 | # 327 | # Note: with any of the above policies, Redis will return an error on write 328 | # operations, when there are not suitable keys for eviction. 329 | # 330 | # At the date of writing this commands are: set setnx setex append 331 | # incr decr rpush lpush rpushx lpushx linsert lset rpoplpush sadd 332 | # sinter sinterstore sunion sunionstore sdiff sdiffstore zadd zincrby 333 | # zunionstore zinterstore hset hsetnx hmset hincrby incrby decrby 334 | # getset mset msetnx exec sort 335 | # 336 | # The default is: 337 | # 338 | # maxmemory-policy volatile-lru 339 | 340 | # LRU and minimal TTL algorithms are not precise algorithms but approximated 341 | # algorithms (in order to save memory), so you can select as well the sample 342 | # size to check. For instance for default Redis will check three keys and 343 | # pick the one that was used less recently, you can change the sample size 344 | # using the following configuration directive. 345 | # 346 | # maxmemory-samples 3 347 | 348 | ############################## APPEND ONLY MODE ############################### 349 | 350 | # By default Redis asynchronously dumps the dataset on disk. This mode is 351 | # good enough in many applications, but an issue with the Redis process or 352 | # a power outage may result into a few minutes of writes lost (depending on 353 | # the configured save points). 
354 | # 355 | # The Append Only File is an alternative persistence mode that provides 356 | # much better durability. For instance using the default data fsync policy 357 | # (see later in the config file) Redis can lose just one second of writes in a 358 | # dramatic event like a server power outage, or a single write if something 359 | # wrong with the Redis process itself happens, but the operating system is 360 | # still running correctly. 361 | # 362 | # AOF and RDB persistence can be enabled at the same time without problems. 363 | # If the AOF is enabled on startup Redis will load the AOF, that is the file 364 | # with the better durability guarantees. 365 | # 366 | # Please check http://redis.io/topics/persistence for more information. 367 | 368 | appendonly no 369 | 370 | # The name of the append only file (default: "appendonly.aof") 371 | # appendfilename appendonly.aof 372 | 373 | # The fsync() call tells the Operating System to actually write data on disk 374 | # instead to wait for more data in the output buffer. Some OS will really flush 375 | # data on disk, some other OS will just try to do it ASAP. 376 | # 377 | # Redis supports three different modes: 378 | # 379 | # no: don't fsync, just let the OS flush the data when it wants. Faster. 380 | # always: fsync after every write to the append only log . Slow, Safest. 381 | # everysec: fsync only one time every second. Compromise. 382 | # 383 | # The default is "everysec", as that's usually the right compromise between 384 | # speed and data safety. It's up to you to understand if you can relax this to 385 | # "no" that will let the operating system flush the output buffer when 386 | # it wants, for better performances (but if you can live with the idea of 387 | # some data loss consider the default persistence mode that's snapshotting), 388 | # or on the contrary, use "always" that's very slow but a bit safer than 389 | # everysec. 
390 | # 391 | # More details please check the following article: 392 | # http://antirez.com/post/redis-persistence-demystified.html 393 | # 394 | # If unsure, use "everysec". 395 | 396 | # appendfsync always 397 | appendfsync everysec 398 | # appendfsync no 399 | 400 | # When the AOF fsync policy is set to always or everysec, and a background 401 | # saving process (a background save or AOF log background rewriting) is 402 | # performing a lot of I/O against the disk, in some Linux configurations 403 | # Redis may block too long on the fsync() call. Note that there is no fix for 404 | # this currently, as even performing fsync in a different thread will block 405 | # our synchronous write(2) call. 406 | # 407 | # In order to mitigate this problem it's possible to use the following option 408 | # that will prevent fsync() from being called in the main process while a 409 | # BGSAVE or BGREWRITEAOF is in progress. 410 | # 411 | # This means that while another child is saving, the durability of Redis is 412 | # the same as "appendfsync none". In practical terms, this means that it is 413 | # possible to lose up to 30 seconds of log in the worst scenario (with the 414 | # default Linux settings). 415 | # 416 | # If you have latency problems turn this to "yes". Otherwise leave it as 417 | # "no" that is the safest pick from the point of view of durability. 418 | no-appendfsync-on-rewrite no 419 | 420 | # Automatic rewrite of the append only file. 421 | # Redis is able to automatically rewrite the log file implicitly calling 422 | # BGREWRITEAOF when the AOF log size grows by the specified percentage. 423 | # 424 | # This is how it works: Redis remembers the size of the AOF file after the 425 | # latest rewrite (if no rewrite has happened since the restart, the size of 426 | # the AOF at startup is used). 427 | # 428 | # This base size is compared to the current size. If the current size is 429 | # bigger than the specified percentage, the rewrite is triggered. 
Also 430 | # you need to specify a minimal size for the AOF file to be rewritten, this 431 | # is useful to avoid rewriting the AOF file even if the percentage increase 432 | # is reached but it is still pretty small. 433 | # 434 | # Specify a percentage of zero in order to disable the automatic AOF 435 | # rewrite feature. 436 | 437 | auto-aof-rewrite-percentage 100 438 | auto-aof-rewrite-min-size 64mb 439 | 440 | ################################ LUA SCRIPTING ############################### 441 | 442 | # Max execution time of a Lua script in milliseconds. 443 | # 444 | # If the maximum execution time is reached Redis will log that a script is 445 | # still in execution after the maximum allowed time and will start to 446 | # reply to queries with an error. 447 | # 448 | # When a long running script exceed the maximum execution time only the 449 | # SCRIPT KILL and SHUTDOWN NOSAVE commands are available. The first can be 450 | # used to stop a script that did not yet called write commands. The second 451 | # is the only way to shut down the server in the case a write commands was 452 | # already issue by the script but the user don't want to wait for the natural 453 | # termination of the script. 454 | # 455 | # Set it to 0 or a negative value for unlimited execution without warnings. 456 | lua-time-limit 5000 457 | 458 | ################################## SLOW LOG ################################### 459 | 460 | # The Redis Slow Log is a system to log queries that exceeded a specified 461 | # execution time. The execution time does not include the I/O operations 462 | # like talking with the client, sending the reply and so forth, 463 | # but just the time needed to actually execute the command (this is the only 464 | # stage of command execution where the thread is blocked and can not serve 465 | # other requests in the meantime). 
466 | # 467 | # You can configure the slow log with two parameters: one tells Redis 468 | # what is the execution time, in microseconds, to exceed in order for the 469 | # command to get logged, and the other parameter is the length of the 470 | # slow log. When a new command is logged the oldest one is removed from the 471 | # queue of logged commands. 472 | 473 | # The following time is expressed in microseconds, so 1000000 is equivalent 474 | # to one second. Note that a negative number disables the slow log, while 475 | # a value of zero forces the logging of every command. 476 | slowlog-log-slower-than 10000 477 | 478 | # There is no limit to this length. Just be aware that it will consume memory. 479 | # You can reclaim memory used by the slow log with SLOWLOG RESET. 480 | slowlog-max-len 128 481 | 482 | ############################### ADVANCED CONFIG ############################### 483 | 484 | # Hashes are encoded using a memory efficient data structure when they have a 485 | # small number of entries, and the biggest entry does not exceed a given 486 | # threshold. These thresholds can be configured using the following directives. 487 | hash-max-ziplist-entries 512 488 | hash-max-ziplist-value 64 489 | 490 | # Similarly to hashes, small lists are also encoded in a special way in order 491 | # to save a lot of space. The special representation is only used when 492 | # you are under the following limits: 493 | list-max-ziplist-entries 512 494 | list-max-ziplist-value 64 495 | 496 | # Sets have a special encoding in just one case: when a set is composed 497 | # of just strings that happens to be integers in radix 10 in the range 498 | # of 64 bit signed integers. 499 | # The following configuration setting sets the limit in the size of the 500 | # set in order to use this special memory saving encoding. 501 | set-max-intset-entries 512 502 | 503 | # Similarly to hashes and lists, sorted sets are also specially encoded in 504 | # order to save a lot of space. 
This encoding is only used when the length and 505 | # elements of a sorted set are below the following limits: 506 | zset-max-ziplist-entries 128 507 | zset-max-ziplist-value 64 508 | 509 | # Active rehashing uses 1 millisecond every 100 milliseconds of CPU time in 510 | # order to help rehashing the main Redis hash table (the one mapping top-level 511 | # keys to values). The hash table implementation Redis uses (see dict.c) 512 | # performs a lazy rehashing: the more operation you run into an hash table 513 | # that is rehashing, the more rehashing "steps" are performed, so if the 514 | # server is idle the rehashing is never complete and some more memory is used 515 | # by the hash table. 516 | # 517 | # The default is to use this millisecond 10 times every second in order to 518 | # active rehashing the main dictionaries, freeing memory when possible. 519 | # 520 | # If unsure: 521 | # use "activerehashing no" if you have hard latency requirements and it is 522 | # not a good thing in your environment that Redis can reply from time to time 523 | # to queries with 2 milliseconds delay. 524 | # 525 | # use "activerehashing yes" if you don't have such hard requirements but 526 | # want to free memory asap when possible. 527 | activerehashing yes 528 | 529 | # The client output buffer limits can be used to force disconnection of clients 530 | # that are not reading data from the server fast enough for some reason (a 531 | # common reason is that a Pub/Sub client can't consume messages as fast as the 532 | # publisher can produce them). 
533 | # 534 | # The limit can be set differently for the three different classes of clients: 535 | # 536 | # normal -> normal clients 537 | # slave -> slave clients and MONITOR clients 538 | # pubsub -> clients subscribed to at least one pubsub channel or pattern 539 | # 540 | # The syntax of every client-output-buffer-limit directive is the following: 541 | # 542 | # client-output-buffer-limit 543 | # 544 | # A client is immediately disconnected once the hard limit is reached, or if 545 | # the soft limit is reached and remains reached for the specified number of 546 | # seconds (continuously). 547 | # So for instance if the hard limit is 32 megabytes and the soft limit is 548 | # 16 megabytes / 10 seconds, the client will get disconnected immediately 549 | # if the size of the output buffers reach 32 megabytes, but will also get 550 | # disconnected if the client reaches 16 megabytes and continuously overcomes 551 | # the limit for 10 seconds. 552 | # 553 | # By default normal clients are not limited because they don't receive data 554 | # without asking (in a push way), but just after a request, so only 555 | # asynchronous clients may create a scenario where data is requested faster 556 | # than it can read. 557 | # 558 | # Instead there is a default limit for pubsub and slave clients, since 559 | # subscribers and slaves receive data in a push fashion. 560 | # 561 | # Both the hard or the soft limit can be disabled by setting them to zero. 562 | client-output-buffer-limit normal 0 0 0 563 | client-output-buffer-limit slave 256mb 64mb 60 564 | client-output-buffer-limit pubsub 32mb 8mb 60 565 | 566 | # Redis calls an internal function to perform many background tasks, like 567 | # closing connections of clients in timeout, purging expired keys that are 568 | # never requested, and so forth. 569 | # 570 | # Not all tasks are performed with the same frequency, but Redis checks for 571 | # tasks to perform accordingly to the specified "hz" value. 
572 | # 573 | # By default "hz" is set to 10. Raising the value will use more CPU when 574 | # Redis is idle, but at the same time will make Redis more responsive when 575 | # there are many keys expiring at the same time, and timeouts may be 576 | # handled with more precision. 577 | # 578 | # The range is between 1 and 500, however a value over 100 is usually not 579 | # a good idea. Most users should use the default of 10 and raise this up to 580 | # 100 only in environments where very low latency is required. 581 | hz 10 582 | 583 | # When a child rewrites the AOF file, if the following option is enabled 584 | # the file will be fsync-ed every 32 MB of data generated. This is useful 585 | # in order to commit the file to the disk more incrementally and avoid 586 | # big latency spikes. 587 | aof-rewrite-incremental-fsync yes 588 | 589 | ################################## INCLUDES ################################### 590 | 591 | # Include one or more other config files here. This is useful if you 592 | # have a standard template that goes to all Redis server but also need 593 | # to customize a few per-server settings. Include files can include 594 | # other files, so use this wisely. 595 | # 596 | # include /path/to/local.conf 597 | # include /path/to/other.conf 598 | -------------------------------------------------------------------------------- /run_multiple_redis.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | seq 1 8 | xargs -I {} -P 8 bash -c "cat redis.conf | sed s/@@REDIS_NUM@@/{}/g | redis-server -" 4 | -------------------------------------------------------------------------------- /run_tests.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ./run_multiple_redis.sh & 4 | RUN_MULTI_PID=$! 5 | sleep 1 6 | 7 | ./redis-shatter redis-shatter-test.conf.json & 8 | SHATTER_PID=$! 
9 | sleep 1 10 | 11 | for TEST in *Test 12 | do 13 | ./$TEST 14 | if [ "$?" != "0" ] 15 | then 16 | FAILURES_FOUND=1 17 | fi 18 | done 19 | 20 | kill -TERM $SHATTER_PID 21 | # TODO: we should kill run_multiple_redis.sh directly too, and make it forward 22 | # the signal to the redis procs 23 | ps aux | grep redis-server | grep -v grep | grep -v xargs | awk '{print $2;}' | xargs kill -TERM 24 | 25 | echo -e "\n\n\n" 26 | if [ "$FAILURES_FOUND" == "1" ] 27 | then 28 | echo "Some tests failed!" 29 | exit 1 30 | else 31 | echo "All tests passed" 32 | exit 0 33 | fi 34 | --------------------------------------------------------------------------------