├── .gitignore ├── .gitmodules ├── README.md ├── apps ├── memcached │ ├── libmemcached-1.0.18.tar.gz │ ├── mcperf-0.1.1.tar.gz │ ├── mcperf.md │ ├── memcached-1.4.21.tar.gz │ ├── mk │ └── run ├── redis │ ├── mk │ ├── pipeline.md │ ├── redis-2.8.17.tar.gz │ ├── run │ └── sentinel.md └── ssdb │ ├── master.tar.gz │ ├── mk │ └── run ├── benchmarks ├── README ├── mckey.c ├── reconf_bench.sh └── run.sh ├── eval ├── eval.py ├── mongoose_aget.cfg └── readme.txt ├── makefile.init ├── src ├── config-comp │ ├── config-dare.c │ └── config-proxy.c ├── dare │ ├── dare_ep_db.c │ ├── dare_ibv.c │ ├── dare_ibv_rc.c │ ├── dare_ibv_ud.c │ ├── dare_kvs_sm.c │ └── dare_server.c ├── db │ └── db-interface.c ├── include │ ├── config-comp │ │ ├── config-dare.h │ │ └── config-proxy.h │ ├── dare │ │ ├── dare.h │ │ ├── dare_client.h │ │ ├── dare_config.h │ │ ├── dare_ep_db.h │ │ ├── dare_ibv.h │ │ ├── dare_ibv_rc.h │ │ ├── dare_ibv_ud.h │ │ ├── dare_kvs_sm.h │ │ ├── dare_log.h │ │ ├── dare_server.h │ │ ├── dare_sm.h │ │ ├── debug.h │ │ ├── message.h │ │ └── timer.h │ ├── db │ │ └── db-interface.h │ ├── proxy │ │ └── proxy.h │ ├── rsm-interface.h │ └── util │ │ ├── common-header.h │ │ └── debug.h ├── proxy │ └── proxy.c └── spec_hooks.cpp ├── target ├── makefile ├── nodes.local.cfg ├── objects.mk ├── sources.mk └── src │ ├── config-comp │ └── subdir.mk │ ├── dare │ └── subdir.mk │ ├── db │ └── subdir.mk │ ├── proxy │ └── subdir.mk │ └── subdir.mk └── utils ├── dep-lib ├── db-5.1.29.tar.gz ├── libconfig-1.4.9.tar.gz └── libev-4.15.tar.gz ├── mk ├── queue ├── queue.h └── tailq.c ├── rbtree ├── include │ ├── compiler.h │ ├── export.h │ ├── rbtree.h │ └── rbtree_augmented.h ├── rbtree.txt └── src │ └── rbtree.c └── uthash └── uthash.h /.gitignore: -------------------------------------------------------------------------------- 1 | # Prerequisites 2 | *.d 3 | 4 | # Compiled Object files 5 | *.slo 6 | *.lo 7 | *.o 8 | *.obj 9 | obj 10 | 11 | # Precompiled Headers 12 | *.gch 13 | *.pch 14 | 15 | # 
Compiled Dynamic libraries 16 | *.so 17 | *.dylib 18 | *.dll 19 | 20 | # Fortran module files 21 | *.mod 22 | *.smod 23 | 24 | # Compiled Static libraries 25 | *.lai 26 | *.la 27 | *.a 28 | *.lib 29 | 30 | # Executables 31 | *.exe 32 | *.out 33 | *.app 34 | bin 35 | 36 | **/dep-lib/* 37 | !**/dep-lib/*.tar.gz 38 | **/.local 39 | *.dat 40 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "APUS/apps/test"] 2 | path = APUS/apps/test 3 | url = https://github.com/LaytonW/SSCCPP.git 4 | branch = master 5 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # APUS: fast and scalable Paxos on RDMA 2 | 3 | Build (Ubuntu Linux 15.04) 4 | ---- 5 | 6 | The source code of APUS is based on DARE [HPDC'15]. 7 | ### Dependencies 8 | Install libev, libconfig, libdb, libibverbs: 9 | ``` 10 | sudo apt-get install libev-dev libconfig-dev libdb-dev libibverbs-dev 11 | ``` 12 | ### Build APUS 13 | Set env vars (PAXOS_ROOT must point to the root of this repository; the scripts under apps/ resolve their paths from it): 14 | ``` 15 | export PAXOS_ROOT=/absolute/path/to/apus 16 | ``` 17 | To perform a default build, execute the following: 18 | ``` 19 | cd target 20 | make clean; make 21 | ``` 22 | Run examples 23 | ---- 24 | 25 | ### Run APUS with Redis 26 | 27 | Install Redis: 28 | ``` 29 | cd apps/redis 30 | ./mk 31 | ``` 32 | Run APUS with Redis: 33 | ``` 34 | cd benchmarks 35 | ./run.sh --app=redis 36 | ``` 37 | 38 | Contact 39 | ---- 40 | 41 | Please email Wang Cheng (wangch.will@gmail.com) if you have any problems with APUS. 
42 | -------------------------------------------------------------------------------- /apps/memcached/libmemcached-1.0.18.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hku-systems/apus/896959f59b33fc695df753e4b65b4d564d67443d/apps/memcached/libmemcached-1.0.18.tar.gz -------------------------------------------------------------------------------- /apps/memcached/mcperf-0.1.1.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hku-systems/apus/896959f59b33fc695df753e4b65b4d564d67443d/apps/memcached/mcperf-0.1.1.tar.gz -------------------------------------------------------------------------------- /apps/memcached/mcperf.md: -------------------------------------------------------------------------------- 1 | # twemperf (mcperf) 2 | 3 | ## Building mcperf ## 4 | 5 | To build mcperf from distribution tarball: 6 | 7 | $ ./configure 8 | $ make 9 | $ sudo make install 10 | 11 | To build mcperf from distribution tarball in _debug mode_: 12 | 13 | $ CFLAGS="-ggdb3 -O0" ./configure --enable-debug 14 | $ make 15 | $ sudo make install 16 | 17 | ## Help ## 18 | 19 | Usage: mcperf [-v verbosity level] [-o output file] 20 | [-s server] [-p port] 21 | [-n num-conns] [-N num-calls] 22 | [-r conn-rate] [-R call-rate] 23 | 24 | Options: 25 | -v, --verbosity=N : set logging level (default: 5, min: 0, max: 11) 26 | -o, --output=S : set logging file (default: stderr) 27 | -s, --server=S : set the hostname of the server (default: localhost) 28 | -p, --port=N : set the port number of the server (default: 11211) 29 | -n, --num-conns=N : set the number of connections to create (default: 1) 30 | -N, --num-calls=N : set the number of calls to create on each connection (default: 1) 31 | -r, --conn-rate=R : set the connection creation rate (default: 0 conns/sec) 32 | -R, --call-rate=R : set the call creation rate (default: 0 calls/sec) 33 | 34 | -q, 
--use-noreply : set noreply for generated requests 35 | ... 36 | 37 | ## Design ## 38 | 39 | 1. Single threaded. 40 | 2. Asynchronous I/O through non-blocking sockets and Linux epoll(7) syscall. 41 | 42 | ## Examples ## 43 | 44 | The following example creates **1000 connections** to a memcached server 45 | running on **localhost:11211**. The connections are created at the rate of 46 | **1000 conns/sec** and on every connection it sends **10 'set' requests** at 47 | the rate of **1000 reqs/sec** with the item sizes derived from a uniform 48 | distribution in the interval of [1,16) bytes. 49 | 50 | $ mcperf --linger=0 --timeout=5 --conn-rate=1000 --call-rate=1000 --num-calls=10 --num-conns=1000 --sizes=u1,16 51 | 52 | The following example creates **100 connections** to a memcached server 53 | running on **localhost:11211**. Every connection is created after the previous 54 | connection is closed. On every connection we send **100 'set' requests** and 55 | every request is created after we have received the response for the previous 56 | request. All the set requests generated have a fixed item size of 1 byte. 57 | 58 | $ mcperf --linger=0 --conn-rate=0 --call-rate=0 --num-calls=100 --num-conns=100 --sizes=d1 59 | 60 | The following example gives you all the details of what mcperf is doing. 61 | 62 | $ mcperf --call-rate=0 --num-calls=100 --num-conns=1 --verbosity=11 63 | 64 | ## Protocol ## 65 | 66 | ### Storage commands ### 67 | First, the client sends a command line which looks like this: 68 | 69 | <command name> <key> <flags> <exptime> <bytes> [noreply]\r\n 70 | 71 | - `<command name>` is "set", "add", "replace", "append" or "prepend" 72 | 73 | - `noreply` optional parameter instructs the server to not send the reply. 74 | 75 | ### Error strings ### 76 | 77 | Each command sent by a client may be answered with an error string from the server. These error strings come in three types: 78 | 79 | - `SERVER_ERROR <error>\r\n` 80 | 81 | means some sort of server error prevents the server from carrying out the command. 
`` is a human-readable error string. In cases of severe server errors, which make it impossible to continue serving the client (this shouldn't normally happen), the server will close the connection after sending the error line. This is the only case in which the server closes a connection to a client. -------------------------------------------------------------------------------- /apps/memcached/memcached-1.4.21.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hku-systems/apus/896959f59b33fc695df753e4b65b4d564d67443d/apps/memcached/memcached-1.4.21.tar.gz -------------------------------------------------------------------------------- /apps/memcached/mk: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # benchmark use memslap from libmemcached, source code in libmemcached-1.0.18/clients/memslap.cc 4 | 5 | # variables 6 | APP_VER=1.4.21 7 | BEN_VER=1.0.18 8 | APP_DIR=$PAXOS_ROOT/apps/memcached 9 | 10 | # working folder 11 | cd $APP_DIR 12 | 13 | # remove folders 14 | rm -rf memcached-$APP_VER 15 | rm -rf install 16 | rm -rf libmemcached-$BEN_VER 17 | rm -rf benchmark 18 | 19 | # download and extract 20 | if [ ! -f memcached-$APP_VER.tar.gz ]; then 21 | wget http://www.memcached.org/files/memcached-$APP_VER.tar.gz 22 | fi 23 | tar zxvf memcached-$APP_VER.tar.gz 24 | 25 | # build 26 | cd memcached-$APP_VER 27 | mkdir ../install 28 | ./configure --prefix=$APP_DIR/install 29 | make -j `nproc` 30 | make install 31 | 32 | 33 | # download benchmark 34 | cd ../ 35 | if [ ! 
-f libmemcached-$BEN_VER.tar.gz ]; then 36 | wget https://launchpad.net/libmemcached/1.0/1.0.18/+download/libmemcached-$BEN_VER.tar.gz 37 | fi 38 | tar zxvf libmemcached-$BEN_VER.tar.gz 39 | 40 | 41 | # build benchmark 42 | cd libmemcached-$BEN_VER 43 | mkdir ../benchmark 44 | ./configure --prefix=$APP_DIR/benchmark 45 | make -j `nproc` 46 | make install 47 | 48 | -------------------------------------------------------------------------------- /apps/memcached/run: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # variables 4 | APP_DIR=$PAXOS_ROOT/apps/memcached 5 | 6 | # start server 7 | # -p TCP port number to listen on (default: 11211) 8 | # -d run as a daemon 9 | # -P save PID in , only used with -d option 10 | # -m max memory to use for items in megabytes (default: 64 MB) 11 | # -M return error on memory exhausted (rather than removing items) 12 | # -u assume identity of (only when run as root) 13 | cd $APP_DIR/install 14 | bin/memcached -p 11222 -P $APP_DIR/install/memcached.pid & 15 | sleep 1 16 | 17 | # benchmack 18 | # Generates a load against a memcached custer of servers. 19 | # --concurrency= 20 | # Number of users to simulate with load. 21 | # --execute-number= 22 | # Number of times to execute the given test. 23 | # --servers= 24 | # List which servers you wish to connect to. 25 | cd ../benchmark 26 | bin/memslap -s 127.0.0.1:11222 --concurrency=10 --execute-number=5000 27 | 28 | # stop server 29 | cd ../install 30 | kill $(cat memcached.pid) 31 | rm memcached.pid 32 | 33 | -------------------------------------------------------------------------------- /apps/redis/mk: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # benchmark redis-benchmark in redis-2.8.17/src/redis-benchmark.c 4 | 5 | APP_VER=2.8.17 6 | APP_DIR=$PAXOS_ROOT/apps/redis 7 | 8 | # download 9 | cd $APP_DIR 10 | rm -rf redis-$APP_VER 11 | rm -rf install 12 | if [ ! 
-f redis-$APP_VER.tar.gz ]; then 13 | wget http://download.redis.io/releases/redis-$APP_VER.tar.gz 14 | fi 15 | tar zxvf redis-$APP_VER.tar.gz 16 | 17 | # build 18 | cd redis-$APP_VER 19 | make 20 | make install PREFIX=$APP_DIR/install 21 | -------------------------------------------------------------------------------- /apps/redis/pipeline.md: -------------------------------------------------------------------------------- 1 | ## Request/Response protocols and RTT 2 | By default every client sends the next command only when the reply of the previous command is received, this means that the server will likely need a read call in order to read each command from every client. Also RTT is paid as well. 3 | 4 | So for instance a four commands sequence is something like this: 5 | - *Client*: INCR X 6 | - *Server*: 1 7 | - *Client*: INCR X 8 | - *Server*: 2 9 | - *Client*: INCR X 10 | - *Server*: 3 11 | - *Client*: INCR X 12 | - *Server*: 4 13 | 14 | ## Redis Pipelining 15 | A Request/Response server can be implemented so that it is able to process new requests even if the client didn't already read the old responses. This way it is possible to send *multiple commands* to the server without waiting for the replies at all, and finally read the replies in a single step. 16 | 17 | This is an example using the raw netcat utility: 18 | ``` 19 | $ (printf "PING\r\nPING\r\nPING\r\n"; sleep 1) | nc localhost 6379 20 | +PONG 21 | +PONG 22 | +PONG 23 | ``` 24 | This time we are not paying the cost of RTT for every call, but just one time for the three commands. 
25 | 26 | To be very explicit, with pipelining the order of operations of our very first example will be the following: 27 | - *Client*: INCR X 28 | - *Client*: INCR X 29 | - *Client*: INCR X 30 | - *Client*: INCR X 31 | - *Server*: 1 32 | - *Server*: 2 33 | - *Server*: 3 34 | - *Server*: 4 35 | 36 | **IMPORTANT NOTE**: While the client sends commands using pipelining, the server will be forced to queue the replies, using memory. 37 | 38 | ## It's not just a matter of RTT 39 | Pipelining is not just a way to reduce the latency cost due to the round trip time; it also greatly increases the total number of operations you can perform per second on a given Redis server. This is because, without pipelining, serving each command is very cheap from the point of view of accessing the data structures and producing the reply, but very costly from the point of view of doing the socket I/O. This involves calling the `read()` and `write()` syscalls, which means going from user land to kernel land. The context switch is a huge speed penalty. 40 | 41 | When pipelining is used, many commands are usually read with a single `read()` system call, and multiple replies are delivered with a single `write()` system call. 
Because of this, the number of total queries performed per second initially increases almost linearly with longer pipelines, and eventually reaches 10 times the baseline obtained not using pipelining, as you can see from the following graph: 42 | -------------------------------------------------------------------------------- /apps/redis/redis-2.8.17.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hku-systems/apus/896959f59b33fc695df753e4b65b4d564d67443d/apps/redis/redis-2.8.17.tar.gz -------------------------------------------------------------------------------- /apps/redis/run: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # MONITOR is a debugging command that streams back every command processed by the Redis server. 4 | # It can help in understanding what is happening to the database. 5 | # $ redis-cli monitor 6 | # 1339518083.107412 [0 127.0.0.1:60866] "keys" "*" 7 | # 1339518087.877697 [0 127.0.0.1:60866] "dbsize" 8 | # 1339518090.420270 [0 127.0.0.1:60866] "set" "x" "6" 9 | # 1339518096.506257 [0 127.0.0.1:60866] "get" "x" 10 | # 1339518099.363765 [0 127.0.0.1:60866] "del" "x" 11 | # 1339518100.544926 [0 127.0.0.1:60866] "get" "x" 12 | 13 | 14 | # Measuring latency 15 | # showLatencyReport in redis-benchmark.c 16 | # for (i = 0; i < config.requests; i++) { 17 | # # print config.latency 18 | # } 19 | 20 | 21 | ################################ SNAPSHOTTING ################################ 22 | # 23 | # Save the DB on disk: 24 | # 25 | # save 26 | # 27 | # Will save the DB if both the given number of seconds and the given 28 | # number of write operations against the DB occurred. 
29 | # 30 | # In the example below the behaviour will be to save: 31 | # after 900 sec (15 min) if at least 1 key changed 32 | # after 300 sec (5 min) if at least 10 keys changed 33 | # after 60 sec if at least 10000 keys changed 34 | # 35 | # Note: you can disable saving at all commenting all the "save" lines. 36 | 37 | # save 900 1 38 | # save 300 10 39 | # save 60 10000 40 | -------------------------------------------------------------------------------- /apps/redis/sentinel.md: -------------------------------------------------------------------------------- 1 | # Redis Sentinel Documentation 2 | Redis Sentinel provides high availability for Redis. In practical terms this means that using Sentinel you can create a Redis deployment that resists without human intervention to certain kind of failures. 3 | 4 | Redis Sentinel also provides other collateral tasks such as monitoring, notifications and acts as a configuration provider for clients. 5 | 6 | This is the full list of Sentinel capabilities at a macroscopical level (i.e. the *big picture*): 7 | 8 | - **Automatic failover**. If a master is not working as expected, Sentinel can start a failover process where a slave is promoted to master, the other additional slaves are reconfigured to use the new master, and the applications using the Redis server informed about the new address to use when connecting. 9 | 10 | ## Distributed nature of Sentinel 11 | 12 | Redis Sentinel is a distributed system: 13 | 14 | Sentinel itself is designed to run in a configuration where there are multiple Sentinel processes cooperating together. The advantage of having multiple Sentinel processes cooperating are the following: 15 | 16 | 1. Failure detection is performed when multiple Sentinels agree about the fact a given master is no longer available. This lowers the probability of false positives. 
17 | 18 | ## Example 2: basic setup with three boxes 19 | 20 | This is a very simple setup, which has the advantage of being simple to tune for additional safety. It is based on three boxes, each box running both a Redis process and a Sentinel process. 21 | 22 | ``` 23 | +----+ 24 | | M1 | 25 | | S1 | 26 | +----+ 27 | | 28 | +----+ | +----+ 29 | | R2 |----+----| R3 | 30 | | S2 | | S3 | 31 | +----+ +----+ 32 | 33 | Configuration: quorum = 2 34 | ``` 35 | If the master M1 fails, S2 and S3 will agree about the failure and will be able to authorize a failover, making clients able to continue. 36 | 37 | In every Sentinel setup, because Redis is asynchronously replicated, there is always the risk of losing some writes, since a given acknowledged write may not be able to reach the slave which is promoted to master. However, in the above setup there is a higher risk due to clients partitioned away with an old master, like in the following picture: 38 | 39 | ``` 40 | +----+ 41 | | M1 | 42 | | S1 | <- C1 (writes will be lost) 43 | +----+ 44 | | 45 | / 46 | / 47 | +------+ | +----+ 48 | | [M2] |----+----| R3 | 49 | | S2 | | S3 | 50 | +------+ +----+ 51 | ``` 52 | 53 | In this case a network partition isolated the old master M1, so the slave R2 is promoted to master. However, clients like C1 that are in the same partition as the old master may continue to write data to the old master. This data will be lost forever, because when the partition heals, the old master will be reconfigured as a slave of the new master, discarding its data set. 54 | 55 | This problem can be mitigated using the following Redis replication feature, which lets a master stop accepting writes if it detects that it is no longer able to transfer its writes to the specified number of slaves. 
56 | 57 | ``` 58 | min-slaves-to-write 1 59 | min-slaves-max-lag 10 60 | ``` 61 | -------------------------------------------------------------------------------- /apps/ssdb/master.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hku-systems/apus/896959f59b33fc695df753e4b65b4d564d67443d/apps/ssdb/master.tar.gz -------------------------------------------------------------------------------- /apps/ssdb/mk: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # benchmark source in ssdb/master/tools/ssdb-bench.cpp 4 | 5 | # variables 6 | APP_DIR=$PAXOS_ROOT/apps/ssdb 7 | 8 | # working folder (abort if it is missing, e.g. PAXOS_ROOT unset, so the rm -rf below never runs in the caller's directory) 9 | cd "$APP_DIR" || exit 1 10 | 11 | # remove folders 12 | rm -rf ssdb-master 13 | 14 | # download and extract 15 | if [ ! -f master.tar.gz ]; then 16 | wget https://github.com/ideawu/ssdb/archive/master.tar.gz 17 | fi 18 | tar zxvf master.tar.gz 19 | 20 | # build (stop if the tarball did not extract, instead of running make and sed in the wrong place) 21 | cd ssdb-master || exit 1 22 | CFLAGS="-g -O0" CXXFLAGS="-g -O0" make 23 | 24 | # config 25 | sed -i 's/ip: 127.0.0.1/#ip: 127.0.0.1/' ./ssdb.conf 26 | sed -i 's/#ip: 0.0.0.0/ip: 0.0.0.0/' ./ssdb.conf 27 | -------------------------------------------------------------------------------- /apps/ssdb/run: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # variables 4 | APP_DIR=$PAXOS_ROOT/apps/ssdb 5 | 6 | # start server (as a daemon with -d, so the script can go on to run the benchmark and then stop the server through its pid file) 7 | # ./ssdb-server [-d] /path/to/ssdb.conf [-s start|stop|restart] 8 | # Options: 9 | # -d run as daemon 10 | # -s option to start|stop|restart the server 11 | cd "$APP_DIR/ssdb-master" || exit 1 12 | ./ssdb-server -d ssdb.conf 13 | sleep 1 14 | 15 | # benchmark 16 | # ./ssdb-bench [ip] [port] [requests] [clients] 17 | # Options: 18 | # ip server ip (default 127.0.0.1) 19 | # port server port (default 8888) 20 | # requests Total number of requests (default 10000) 21 | # clients Number of parallel connections (default 50) 22 | ./tools/ssdb-bench 127.0.0.1 8888 
10000 50 23 | # SSDB supports Redis network protocol, you can use Redis clients to connect to a SSDB server and operate on it. 24 | 25 | # stop server 26 | kill -SIGINT $(cat ./var/ssdb.pid) -------------------------------------------------------------------------------- /benchmarks/README: -------------------------------------------------------------------------------- 1 | 2 | 3 | mckey program is used to test RDMA CM multicast setup and simple data transfer. 4 | usage : mckey [options] 5 | options: -m # multicast_address 6 | -s # sender 7 | -b # bind_address 8 | 9 | Server: $ mckey -m 225.1.1.1 -b 10.22.1.1 10 | Client: $ mckey -m 225.1.1.1 -b 10.22.1.2 -s 11 | 12 | 13 | -------------------------------------------------------------------------------- /benchmarks/mckey.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2005-2007 Intel Corporation. All rights reserved. 3 | * 4 | * This software is available to you under a choice of one of two 5 | * licenses. You may choose to be licensed under the terms of the GNU 6 | * General Public License (GPL) Version 2, available from the file 7 | * COPYING in the main directory of this source tree, or the 8 | * OpenIB.org BSD license below: 9 | * 10 | * Redistribution and use in source and binary forms, with or 11 | * without modification, are permitted provided that the following 12 | * conditions are met: 13 | * 14 | * - Redistributions of source code must retain the above 15 | * copyright notice, this list of conditions and the following 16 | * disclaimer. 17 | * 18 | * - Redistributions in binary form must reproduce the above 19 | * copyright notice, this list of conditions and the following 20 | * disclaimer in the documentation and/or other materials 21 | * provided with the distribution. 
22 | * 23 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 24 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 25 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 26 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 27 | * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 28 | * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 29 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 30 | * SOFTWARE. 31 | * 32 | * BUILD COMMAND: 33 | * gcc -g -Wall -D_GNU_SOURCE -g -O2 -o examples/mckey examples/mckey.c -libverbs -lrdmacm 34 | * 35 | * $Id$ 36 | */ 37 | 38 | #include 39 | #include 40 | #include 41 | #include 42 | #include 43 | #include 44 | #include 45 | #include 46 | #include 47 | #include 48 | #include 49 | #include 50 | 51 | #include 52 | 53 | struct cmatest_node { 54 | int id; 55 | struct rdma_cm_id *cma_id; 56 | int connected; 57 | struct ibv_pd *pd; 58 | struct ibv_cq *cq; 59 | struct ibv_mr *mr; 60 | struct ibv_ah *ah; 61 | uint32_t remote_qpn; 62 | uint32_t remote_qkey; 63 | void *mem; 64 | }; 65 | 66 | struct cmatest { 67 | struct rdma_event_channel *channel; 68 | pthread_t cmathread; 69 | struct cmatest_node *nodes; 70 | int conn_index; 71 | int connects_left; 72 | 73 | struct sockaddr_in6 dst_in; 74 | struct sockaddr *dst_addr; 75 | struct sockaddr_in6 src_in; 76 | struct sockaddr *src_addr; 77 | }; 78 | 79 | static struct cmatest test; 80 | static int connections = 1; 81 | static int message_size = 100; 82 | static int message_count = 10; 83 | static int is_sender; 84 | static int unmapped_addr; 85 | static char *dst_addr; 86 | static char *src_addr; 87 | static enum rdma_port_space port_space = RDMA_PS_UDP; 88 | 89 | static int create_message(struct cmatest_node *node) 90 | { 91 | if (!message_size) 92 | message_count = 0; 93 | 94 | if (!message_count) 95 | return 0; 96 | 97 | node->mem = malloc(message_size + sizeof(struct ibv_grh)); 
98 | if (!node->mem) { 99 | printf("failed message allocation\n"); 100 | return -1; 101 | } 102 | node->mr = ibv_reg_mr(node->pd, node->mem, 103 | message_size + sizeof(struct ibv_grh), 104 | IBV_ACCESS_LOCAL_WRITE); 105 | if (!node->mr) { 106 | printf("failed to reg MR\n"); 107 | goto err; 108 | } 109 | return 0; 110 | err: 111 | free(node->mem); node->mem = NULL; /* destroy_node() deregisters node->mr and frees node->mem whenever node->mem is non-NULL; clear it so cleanup does not dereg a NULL MR or free the buffer twice */ 112 | return -1; 113 | } 114 | 115 | static int verify_test_params(struct cmatest_node *node) 116 | { 117 | struct ibv_port_attr port_attr; 118 | int ret; 119 | 120 | ret = ibv_query_port(node->cma_id->verbs, node->cma_id->port_num, 121 | &port_attr); 122 | if (ret) 123 | return ret; 124 | 125 | if (message_count && message_size > (1 << (port_attr.active_mtu + 7))) { 126 | printf("mckey: message_size %d is larger than active mtu %d\n", 127 | message_size, 1 << (port_attr.active_mtu + 7)); 128 | return -EINVAL; 129 | } 130 | 131 | return 0; 132 | } 133 | 134 | static int init_node(struct cmatest_node *node) 135 | { 136 | struct ibv_qp_init_attr init_qp_attr; 137 | int cqe, ret; 138 | 139 | node->pd = ibv_alloc_pd(node->cma_id->verbs); 140 | if (!node->pd) { 141 | ret = -ENOMEM; 142 | printf("mckey: unable to allocate PD\n"); 143 | goto out; 144 | } 145 | 146 | cqe = message_count ? message_count * 2 : 2; 147 | node->cq = ibv_create_cq(node->cma_id->verbs, cqe, node, 0, 0); 148 | if (!node->cq) { 149 | ret = -ENOMEM; 150 | printf("mckey: unable to create CQ\n"); 151 | goto out; 152 | } 153 | 154 | memset(&init_qp_attr, 0, sizeof init_qp_attr); 155 | init_qp_attr.cap.max_send_wr = message_count ? message_count : 1; 156 | init_qp_attr.cap.max_recv_wr = message_count ? 
message_count : 1; 157 | init_qp_attr.cap.max_send_sge = 1; 158 | init_qp_attr.cap.max_recv_sge = 1; 159 | init_qp_attr.qp_context = node; 160 | init_qp_attr.sq_sig_all = 0; 161 | init_qp_attr.qp_type = IBV_QPT_UD; 162 | init_qp_attr.send_cq = node->cq; 163 | init_qp_attr.recv_cq = node->cq; 164 | ret = rdma_create_qp(node->cma_id, node->pd, &init_qp_attr); 165 | if (ret) { 166 | perror("mckey: unable to create QP"); 167 | goto out; 168 | } 169 | 170 | ret = create_message(node); 171 | if (ret) { 172 | printf("mckey: failed to create messages: %d\n", ret); 173 | goto out; 174 | } 175 | out: 176 | return ret; 177 | } 178 | 179 | static int post_recvs(struct cmatest_node *node) 180 | { 181 | struct ibv_recv_wr recv_wr, *recv_failure; 182 | struct ibv_sge sge; 183 | int i, ret = 0; 184 | 185 | if (!message_count) 186 | return 0; 187 | 188 | recv_wr.next = NULL; 189 | recv_wr.sg_list = &sge; 190 | recv_wr.num_sge = 1; 191 | recv_wr.wr_id = (uintptr_t) node; 192 | 193 | sge.length = message_size + sizeof(struct ibv_grh); 194 | sge.lkey = node->mr->lkey; 195 | sge.addr = (uintptr_t) node->mem; 196 | 197 | for (i = 0; i < message_count && !ret; i++ ) { 198 | ret = ibv_post_recv(node->cma_id->qp, &recv_wr, &recv_failure); 199 | if (ret) { 200 | printf("failed to post receives: %d\n", ret); 201 | break; 202 | } 203 | } 204 | return ret; 205 | } 206 | 207 | static int post_sends(struct cmatest_node *node, int signal_flag) 208 | { 209 | struct ibv_send_wr send_wr, *bad_send_wr; 210 | struct ibv_sge sge; 211 | int i, ret = 0; 212 | 213 | if (!node->connected || !message_count) 214 | return 0; 215 | 216 | send_wr.next = NULL; 217 | send_wr.sg_list = &sge; 218 | send_wr.num_sge = 1; 219 | send_wr.opcode = IBV_WR_SEND_WITH_IMM; 220 | send_wr.send_flags = signal_flag; 221 | send_wr.wr_id = (unsigned long)node; 222 | send_wr.imm_data = htonl(node->cma_id->qp->qp_num); 223 | 224 | send_wr.wr.ud.ah = node->ah; 225 | send_wr.wr.ud.remote_qpn = node->remote_qpn; 226 | 
send_wr.wr.ud.remote_qkey = node->remote_qkey; 227 | 228 | sge.length = message_size; 229 | sge.lkey = node->mr->lkey; 230 | sge.addr = (uintptr_t) node->mem; 231 | 232 | for (i = 0; i < message_count && !ret; i++) { 233 | ret = ibv_post_send(node->cma_id->qp, &send_wr, &bad_send_wr); 234 | if (ret) 235 | printf("failed to post sends: %d\n", ret); 236 | } 237 | return ret; 238 | } 239 | 240 | static void connect_error(void) 241 | { 242 | test.connects_left--; 243 | } 244 | 245 | static int addr_handler(struct cmatest_node *node) 246 | { 247 | int ret; 248 | 249 | ret = verify_test_params(node); 250 | if (ret) 251 | goto err; 252 | 253 | ret = init_node(node); 254 | if (ret) 255 | goto err; 256 | 257 | if (!is_sender) { 258 | ret = post_recvs(node); 259 | if (ret) 260 | goto err; 261 | } 262 | 263 | ret = rdma_join_multicast(node->cma_id, test.dst_addr, node); 264 | if (ret) { 265 | perror("mckey: failure joining"); 266 | goto err; 267 | } 268 | return 0; 269 | err: 270 | connect_error(); 271 | return ret; 272 | } 273 | 274 | /* Multicast-join completion: print the joined group, record its QPN/QKey and create the address handle that post_sends() uses. Returns 0 on success, -1 if the address handle cannot be created; either way connects_left is decremented. */ static int join_handler(struct cmatest_node *node, 275 | struct rdma_ud_param *param) 276 | { 277 | char buf[40]; 278 | 279 | inet_ntop(AF_INET6, param->ah_attr.grh.dgid.raw, buf, sizeof buf); 280 | printf("mckey: joined dgid: %s mlid 0x%x sl %d\n", buf, 281 | param->ah_attr.dlid, param->ah_attr.sl); 282 | 283 | node->remote_qpn = param->qp_num; 284 | node->remote_qkey = param->qkey; 285 | node->ah = ibv_create_ah(node->pd, &param->ah_attr); /* pass the address of the AH attributes delivered with the join event */ 286 | if (!node->ah) { 287 | printf("mckey: failure creating address handle\n"); 288 | goto err; 289 | } 290 | 291 | node->connected = 1; 292 | test.connects_left--; 293 | return 0; 294 | err: 295 | connect_error(); 296 | return -1; 297 | } 298 | 299 | static int cma_handler(struct rdma_cm_id *cma_id, struct rdma_cm_event *event) 300 | { 301 | int ret = 0; 302 | 303 | switch (event->event) { 304 | case RDMA_CM_EVENT_ADDR_RESOLVED: 305 | ret = addr_handler(cma_id->context); 306 | break; 307 | case 
RDMA_CM_EVENT_MULTICAST_JOIN: 308 | ret = join_handler(cma_id->context, &event->param.ud); 309 | break; 310 | case RDMA_CM_EVENT_ADDR_ERROR: 311 | case RDMA_CM_EVENT_ROUTE_ERROR: 312 | case RDMA_CM_EVENT_MULTICAST_ERROR: 313 | printf("mckey: event: %s, error: %d\n", 314 | rdma_event_str(event->event), event->status); 315 | connect_error(); 316 | ret = event->status; 317 | break; 318 | case RDMA_CM_EVENT_DEVICE_REMOVAL: 319 | /* Cleanup will occur after test completes. */ 320 | break; 321 | default: 322 | break; 323 | } 324 | return ret; 325 | } 326 | 327 | static void *cma_thread(void *arg) 328 | { 329 | struct rdma_cm_event *event; 330 | int ret; 331 | 332 | while (1) { 333 | ret = rdma_get_cm_event(test.channel, &event); 334 | if (ret) { 335 | perror("rdma_get_cm_event"); 336 | break; 337 | } 338 | 339 | switch (event->event) { 340 | case RDMA_CM_EVENT_MULTICAST_ERROR: 341 | case RDMA_CM_EVENT_ADDR_CHANGE: 342 | printf("mckey: event: %s, status: %d\n", 343 | rdma_event_str(event->event), event->status); 344 | break; 345 | default: 346 | break; 347 | } 348 | 349 | rdma_ack_cm_event(event); 350 | } 351 | return NULL; 352 | } 353 | 354 | static void destroy_node(struct cmatest_node *node) 355 | { 356 | if (!node->cma_id) 357 | return; 358 | 359 | if (node->ah) 360 | ibv_destroy_ah(node->ah); 361 | 362 | if (node->cma_id->qp) 363 | rdma_destroy_qp(node->cma_id); 364 | 365 | if (node->cq) 366 | ibv_destroy_cq(node->cq); 367 | 368 | if (node->mem) { 369 | ibv_dereg_mr(node->mr); 370 | free(node->mem); 371 | } 372 | 373 | if (node->pd) 374 | ibv_dealloc_pd(node->pd); 375 | 376 | /* Destroy the RDMA ID after all device resources */ 377 | rdma_destroy_id(node->cma_id); 378 | } 379 | 380 | static int alloc_nodes(void) 381 | { 382 | int ret, i; 383 | 384 | test.nodes = malloc(sizeof *test.nodes * connections); 385 | if (!test.nodes) { 386 | printf("mckey: unable to allocate memory for test nodes\n"); 387 | return -ENOMEM; 388 | } 389 | memset(test.nodes, 0, sizeof 
*test.nodes * connections); 390 | 391 | for (i = 0; i < connections; i++) { 392 | test.nodes[i].id = i; 393 | ret = rdma_create_id(test.channel, &test.nodes[i].cma_id, 394 | &test.nodes[i], port_space); 395 | if (ret) 396 | goto err; 397 | } 398 | return 0; 399 | err: 400 | while (--i >= 0) 401 | rdma_destroy_id(test.nodes[i].cma_id); 402 | free(test.nodes); 403 | return ret; 404 | } 405 | 406 | static void destroy_nodes(void) 407 | { 408 | int i; 409 | 410 | for (i = 0; i < connections; i++) 411 | destroy_node(&test.nodes[i]); 412 | free(test.nodes); 413 | } 414 | 415 | static int poll_cqs(void) 416 | { 417 | struct ibv_wc wc[8]; 418 | int done, i, ret; 419 | 420 | for (i = 0; i < connections; i++) { 421 | if (!test.nodes[i].connected) 422 | continue; 423 | 424 | for (done = 0; done < message_count; done += ret) { 425 | ret = ibv_poll_cq(test.nodes[i].cq, 8, wc); 426 | if (ret < 0) { 427 | printf("mckey: failed polling CQ: %d\n", ret); 428 | return ret; 429 | } 430 | } 431 | } 432 | return 0; 433 | } 434 | 435 | static int connect_events(void) 436 | { 437 | struct rdma_cm_event *event; 438 | int ret = 0; 439 | 440 | while (test.connects_left && !ret) { 441 | ret = rdma_get_cm_event(test.channel, &event); 442 | if (!ret) { 443 | ret = cma_handler(event->id, event); 444 | rdma_ack_cm_event(event); 445 | } 446 | } 447 | return ret; 448 | } 449 | 450 | static int get_addr(char *dst, struct sockaddr *addr) 451 | { 452 | struct addrinfo *res; 453 | int ret; 454 | 455 | ret = getaddrinfo(dst, NULL, NULL, &res); 456 | if (ret) { 457 | printf("getaddrinfo failed - invalid hostname or IP address\n"); 458 | return ret; 459 | } 460 | 461 | memcpy(addr, res->ai_addr, res->ai_addrlen); 462 | freeaddrinfo(res); 463 | return ret; 464 | } 465 | 466 | static int run(void) 467 | { 468 | int i, ret; 469 | 470 | printf("mckey: starting %s\n", is_sender ? 
"client" : "server"); 471 | if (src_addr) { 472 | ret = get_addr(src_addr, (struct sockaddr *) &test.src_in); 473 | if (ret) 474 | return ret; 475 | } 476 | 477 | ret = get_addr(dst_addr, (struct sockaddr *) &test.dst_in); 478 | if (ret) 479 | return ret; 480 | 481 | printf("mckey: joining\n"); 482 | for (i = 0; i < connections; i++) { 483 | if (src_addr) { 484 | ret = rdma_bind_addr(test.nodes[i].cma_id, 485 | test.src_addr); 486 | if (ret) { 487 | perror("mckey: addr bind failure"); 488 | connect_error(); 489 | return ret; 490 | } 491 | } 492 | 493 | if (unmapped_addr) 494 | ret = addr_handler(&test.nodes[i]); 495 | else 496 | ret = rdma_resolve_addr(test.nodes[i].cma_id, 497 | test.src_addr, test.dst_addr, 498 | 2000); 499 | if (ret) { 500 | perror("mckey: resolve addr failure"); 501 | connect_error(); 502 | return ret; 503 | } 504 | } 505 | 506 | ret = connect_events(); 507 | if (ret) 508 | goto out; 509 | 510 | pthread_create(&test.cmathread, NULL, cma_thread, NULL); 511 | 512 | /* 513 | * Pause to give SM chance to configure switches. We don't want to 514 | * handle reliability issue in this simple test program. 
515 | */ 516 | sleep(3); 517 | 518 | if (message_count) { 519 | if (is_sender) { 520 | printf("initiating data transfers\n"); 521 | for (i = 0; i < connections; i++) { 522 | ret = post_sends(&test.nodes[i], 0); 523 | if (ret) 524 | goto out; 525 | } 526 | } else { 527 | printf("receiving data transfers\n"); 528 | ret = poll_cqs(); 529 | if (ret) 530 | goto out; 531 | } 532 | printf("data transfers complete\n"); 533 | } 534 | out: 535 | for (i = 0; i < connections; i++) { 536 | ret = rdma_leave_multicast(test.nodes[i].cma_id, 537 | test.dst_addr); 538 | if (ret) 539 | perror("mckey: failure leaving"); 540 | } 541 | return ret; 542 | } 543 | 544 | int main(int argc, char **argv) 545 | { 546 | int op, ret; 547 | 548 | 549 | while ((op = getopt(argc, argv, "m:M:sb:c:C:S:p:")) != -1) { 550 | switch (op) { 551 | case 'm': 552 | dst_addr = optarg; 553 | break; 554 | case 'M': 555 | unmapped_addr = 1; 556 | dst_addr = optarg; 557 | break; 558 | case 's': 559 | is_sender = 1; 560 | break; 561 | case 'b': 562 | src_addr = optarg; 563 | test.src_addr = (struct sockaddr *) &test.src_in; 564 | break; 565 | case 'c': 566 | connections = atoi(optarg); 567 | break; 568 | case 'C': 569 | message_count = atoi(optarg); 570 | break; 571 | case 'S': 572 | message_size = atoi(optarg); 573 | break; 574 | case 'p': 575 | port_space = strtol(optarg, NULL, 0); 576 | break; 577 | default: 578 | printf("usage: %s\n", argv[0]); 579 | printf("\t-m multicast_address\n"); 580 | printf("\t[-M unmapped_multicast_address]\n" 581 | "\t replaces -m and requires -b\n"); 582 | printf("\t[-s(ender)]\n"); 583 | printf("\t[-b bind_address]\n"); 584 | printf("\t[-c connections]\n"); 585 | printf("\t[-C message_count]\n"); 586 | printf("\t[-S message_size]\n"); 587 | printf("\t[-p port_space - %#x for UDP (default), " 588 | "%#x for IPOIB]\n", RDMA_PS_UDP, RDMA_PS_IPOIB); 589 | exit(1); 590 | } 591 | } 592 | 593 | if (unmapped_addr && !src_addr) { 594 | printf("unmapped multicast address requires binding " 
595 | "to source address\n"); 596 | exit(1); 597 | } 598 | 599 | test.dst_addr = (struct sockaddr *) &test.dst_in; 600 | test.connects_left = connections; 601 | 602 | test.channel = rdma_create_event_channel(); 603 | if (!test.channel) { 604 | perror("failed to create event channel"); 605 | exit(1); 606 | } 607 | 608 | if (alloc_nodes()) 609 | exit(1); 610 | 611 | ret = run(); 612 | 613 | printf("test complete\n"); 614 | destroy_nodes(); 615 | rdma_destroy_event_channel(test.channel); 616 | 617 | printf("return status %d\n", ret); 618 | return ret; 619 | } 620 | -------------------------------------------------------------------------------- /benchmarks/reconf_bench.sh: -------------------------------------------------------------------------------- 1 | define(){ IFS='\n' read -r -d '' ${1} || true; } 2 | declare -A pids 3 | declare -A rounds 4 | redirection=( "> out" "2> err" "< /dev/null" ) 5 | 6 | define HELP <<'EOF' 7 | Script for starting DARE 8 | usage : $0 [options] 9 | options: --app # app to run 10 | EOF 11 | 12 | usage () { 13 | echo -e "$HELP" 14 | } 15 | 16 | timer_start () { 17 | echo "$1" 18 | t1=$(date +%s%N) 19 | } 20 | 21 | timer_stop () { 22 | t2=$(date +%s%N) 23 | echo "done ($(expr $t2 - $t1) nanoseconds)" 24 | } 25 | 26 | ErrorAndExit () { 27 | echo "ERROR: $1" 28 | exit 1 29 | } 30 | 31 | ForceAbsolutePath () { 32 | case "$2" in 33 | /* ) 34 | ;; 35 | *) 36 | ErrorAndExit "Expected an absolute path for $1" 37 | ;; 38 | esac 39 | } 40 | 41 | StartDare() { 42 | for ((i=0; i<${group_size}; ++i)); do 43 | srv=${servers[$i]} 44 | config_dare=( "server_type=start" "server_idx=$i" "group_size=$group_size" "config_path=${DAREDIR}/target/nodes.local.cfg" "dare_log_file=$PWD/srv${i}_1.log" "mgid=$DGID" "LD_PRELOAD=${DAREDIR}/target/interpose.so" ) 45 | cmd=( "ssh" "$USER@${servers[$i]}" "${config_dare[@]}" "nohup" "${run_dare}" "${redirection[@]}" "&" "echo \$!" 
) 46 | pids[$srv]=$("${cmd[@]}") 47 | rounds[$srv]=2 48 | #echo "StartDare COMMAND: "${cmd[@]} 49 | echo -e "\tp$i ($srv) -- pid=${pids[$srv]}" 50 | #echo -e enable interpretation of backslash escapes 51 | done 52 | #echo -e "\n\tinitial servers: ${!servers[]}${!pids[@]}" 53 | #echo -e "\t...and their PIDs: ${pids[@]}" 54 | } 55 | 56 | StopDare() { 57 | for srv in "${!pids[@]}"; do 58 | #${!pids[@]}: expand to the list of array indices (keys) assigned in pids 59 | cmd=( "ssh" "$USER@$srv" "kill -2" "${pids[$srv]}" ) 60 | echo "Executing: ${cmd[@]}" 61 | $("${cmd[@]}") 62 | done 63 | } 64 | 65 | FindLeader() { 66 | leader="" 67 | max_idx=-1 68 | max_term="" 69 | 70 | for ((i=0; i<${group_size}; ++i)); do 71 | srv=${servers[$i]} 72 | # look for the latest [T] LEADER 73 | cmd=( "ssh" "$USER@$srv" "grep -r \"] LEADER\"" "$PWD/srv${i}_$((rounds[$srv]-1)).log" ) 74 | #echo ${cmd[@]} 75 | grep_out=$("${cmd[@]}") 76 | if [[ -z $grep_out ]]; then 77 | continue 78 | fi 79 | terms=($(echo $grep_out | awk '{print $2}')) 80 | for j in "${terms[@]}"; do 81 | term=`echo $j | awk -F'T' '{print $2}' | awk -F']' '{print $1}'` 82 | if [[ $term -gt $max_term ]]; then 83 | max_term=$term 84 | leader=$srv 85 | leader_idx=$i 86 | fi 87 | done 88 | done 89 | echo "Leader: p${leader_idx} ($leader)" 90 | } 91 | 92 | RemoveLeader() { 93 | FindLeader 94 | if [[ -z $leader ]]; then 95 | echo -e "\n\tNo leader [$leader]" 96 | return 1 97 | fi 98 | #echo ${!pids[@]} 99 | #echo ${pids[@]} 100 | if [[ -z ${pids[$leader]} ]]; then 101 | echo -e "\n\tNo PID for the leader $leader" 102 | return 1 103 | fi 104 | cmd=( "ssh" "$USER@$leader" "kill -2" "${pids[$leader]}" ) 105 | $("${cmd[@]}") 106 | unset pids[$leader] 107 | echo -e "\tremoved p${leader_idx} ($leader)" 108 | #echo -e "\n\tservers after removing the leader p${leader_idx} ($leader): ${!pids[@]}" 109 | #echo -e "\t...and their PIDs: ${pids[@]}" 110 | #echo ${cmd[@]} 111 | maj=$(bc -l <<< "${group_size}/2.") # bc - An arbitrary precision 
calculator language 112 | if [[ ${#pids[@]} < $maj ]]; then 113 | ErrorAndExit "...not enough servers!" 114 | fi 115 | return 0 116 | } 117 | 118 | # Stop a server that is not the leader 119 | RemoveServer() { 120 | FindLeader 121 | for ((i=0; i<${group_size}; ++i)); do 122 | srv=${servers[$i]} 123 | if [[ "x$srv" == "x$leader" ]]; then 124 | continue 125 | fi 126 | if [[ "x${pids[$srv]}" == "x" ]]; then 127 | continue 128 | fi 129 | cmd=( "ssh" "$USER@$srv" "kill -2" "${pids[$srv]}" ) 130 | #echo -e "\tcmd: ${cmd[@]}" 131 | $("${cmd[@]}") 132 | unset pids[$srv] 133 | echo -e "\tremoved p$i ($srv) -- p$leader_idx is the leader" 134 | #echo -e "\tservers after removing p$i ($srv): ${!pids[@]}" 135 | #echo -e "\t...and their PIDs: ${pids[@]}" 136 | #echo ${cmd[@]} 137 | break 138 | done 139 | maj=$(bc -l <<< "${group_size}/2.") 140 | if [[ ${#pids[@]} < $maj ]]; then 141 | ErrorAndExit "...not enough servers!" 142 | fi 143 | } 144 | 145 | AddServer() { 146 | if [[ ${#pids[@]} == $group_size ]]; then 147 | # the group is full 148 | group_size=$((group_size+2)) 149 | fi 150 | for ((i=0; i<${group_size}; ++i)); do 151 | srv=${servers[$i]} 152 | next=0 153 | for j in "${!pids[@]}"; do 154 | if [[ "x$srv" == "x$j" ]]; then 155 | next=1 156 | break 157 | fi 158 | done 159 | if [[ $next == 1 ]]; then 160 | continue 161 | fi 162 | break 163 | done 164 | if [[ "x${rounds[$srv]}" == "x" ]]; then 165 | rounds[$srv]=1 166 | fi 167 | config_dare=( "server_type=join" "config_path=${DAREDIR}/target/nodes.local.cfg" "dare_log_file=$PWD/srv${i}_${rounds[$srv]}.log" "mgid=$DGID" "LD_PRELOAD=${DAREDIR}/target/interpose.so" ) 168 | cmd=( "ssh" "$USER@$srv" "${config_dare[@]}" "nohup" "${run_dare}" "${redirection[@]}" "&" "echo \$!" 
) 169 | pids[$srv]=$("${cmd[@]}") 170 | rounds[$srv]=$((rounds[$srv] + 1)) 171 | #echo "COMMAND: "${cmd[@]} 172 | echo -e "\tadded p$i ($srv)" 173 | #echo -e "\n\tservers after adding p$i ($srv): ${!pids[@]}" 174 | #echo -e "\t...and their PIDs: ${pids[@]}" 175 | } 176 | 177 | port=8888 178 | StartBenchmark() { 179 | if [[ "$APP" == "ssdb" ]]; then 180 | run_loop=( "${DAREDIR}/apps/ssdb/ssdb-master/tools/ssdb-bench" "$leader" "$port" "$request_count" "$client_count") 181 | elif [[ "$APP" == "redis" ]]; then 182 | run_loop=( "${DAREDIR}/apps/redis/install/bin/redis-benchmark" "-t set,get" "-h $leader" "-p $port" "-n $request_count" "-c $client_count") 183 | fi 184 | rounds[$client]=$((rounds[$client] + 1)) 185 | cmd=( "ssh" "$USER@${client}" "${run_loop[@]}" ">" "clt_${rounds[$client]}.log") 186 | $("${cmd[@]}") 187 | } 188 | 189 | DAREDIR=$PWD/.. 190 | APP="" 191 | client_count=1 192 | request_count=10000 193 | for arg in "$@" 194 | do 195 | case ${arg} in 196 | --help|-help|-h) 197 | usage 198 | exit 1 199 | ;; 200 | --op=*) 201 | OPCODE=`echo $arg | sed -e 's/--op=//'` 202 | OPCODE=`eval echo ${OPCODE}` # tilde and variable expansion 203 | ;; 204 | --app=*) 205 | APP=`echo $arg | sed -e 's/--app=//'` 206 | APP=`eval echo ${APP}` # tilde and variable expansion 207 | ;; 208 | esac 209 | done 210 | 211 | if [[ "x$APP" == "x" ]]; then 212 | ErrorAndExit "No app defined: --app" 213 | elif [[ "$APP" == "ssdb" ]]; then 214 | run_dare="${DAREDIR}/apps/ssdb/ssdb-master/ssdb-server ${DAREDIR}/apps/ssdb/ssdb-master/ssdb.conf" 215 | elif [[ "$APP" == "redis" ]]; then 216 | run_dare="${DAREDIR}/apps/redis/install/bin/redis-server --port $port" 217 | fi 218 | 219 | # list of allocated nodes, e.g., nodes=(n112002 n112001 n111902) 220 | nodes=(10.22.1.3 10.22.1.4 10.22.1.5 10.22.1.6 10.22.1.7 10.22.1.8 10.22.1.9 202.45.128.159) 221 | node_count=${#nodes[@]} 222 | 223 | echo "Allocated ${node_count} nodes:" > nodes 224 | for ((i=0; i<${node_count}; ++i)); do 225 | echo 
"$i:${nodes[$i]}" >> nodes 226 | done 227 | group_size=5 228 | 229 | client=${nodes[-2]} 230 | echo ">>> client: ${client}" 231 | 232 | for ((i=0; i<$node_count; ++i)); do 233 | servers[${i}]=${nodes[$i]} 234 | done 235 | echo ">>> $(($node_count)) servers: ${servers[@]}" 236 | 237 | DGID="ff0e::ffff:e101:101" 238 | 239 | rm -f *.log 240 | 241 | ######################################################################## 242 | 243 | Stop() { 244 | sleep 0.2 245 | StopDare 246 | exit 1 247 | } 248 | 249 | Start() { 250 | echo -e "Starting $group_size servers..." 251 | StartDare 252 | echo "done" 253 | 254 | sleep 2 255 | 256 | sleep 0.5 257 | FindLeader 258 | StartBenchmark 259 | 260 | if [[ "x$1" == "xstop" ]]; then 261 | Stop 262 | fi 263 | } 264 | 265 | FailLeader() { 266 | echo -e "Removing the leader..." 267 | while true; do 268 | RemoveLeader 269 | ret=$? 270 | #echo "ret=$ret" 271 | if [ $ret -eq 0 ]; then 272 | break; 273 | fi 274 | sleep 0.05 275 | done 276 | echo "done" 277 | 278 | sleep 1 279 | timer_start "Finding the leader..." 280 | FindLeader 281 | echo -e "\tp$leader_idx ($leader) is the leader" 282 | timer_stop 283 | 284 | StartBenchmark 285 | 286 | if [[ "x$1" == "xstop" ]]; then 287 | Stop 288 | fi 289 | } 290 | 291 | RecoverServer() { 292 | echo -e "Adding a server..." 293 | AddServer 294 | echo "done" 295 | 296 | sleep 0.5 297 | StartBenchmark 298 | 299 | if [[ "x$1" == "xstop" ]]; then 300 | Stop 301 | fi 302 | } 303 | 304 | Upsize() { 305 | echo -e "Adding a server (upsize)..." 306 | AddServer 307 | echo "done" 308 | 309 | sleep 0.3 310 | StartBenchmark 311 | 312 | if [[ "x$1" == "xstop" ]]; then 313 | Stop 314 | fi 315 | } 316 | 317 | FailServer() { 318 | echo -e "Removing a server (non-leader)..." 
319 | RemoveServer 320 | echo "done" 321 | 322 | sleep 0.7 323 | StartBenchmark 324 | 325 | if [[ "x$1" == "xstop" ]]; then 326 | Stop 327 | fi 328 | } 329 | 330 | ######################################################################## 331 | 332 | # Start DARE 333 | Start 334 | 335 | # Upsize 336 | 337 | # Upsize 338 | 339 | # Remove the leader 340 | FailLeader 341 | 342 | # Remove a server that is not the leader 343 | FailServer stop 344 | -------------------------------------------------------------------------------- /benchmarks/run.sh: -------------------------------------------------------------------------------- 1 | define(){ IFS='\n' read -r -d '' ${1} || true; } 2 | declare -A pids 3 | redirection=( "> out" "2> err" "< /dev/null" ) 4 | 5 | define HELP <<'EOF' 6 | Script for starting DARE 7 | usage : $0 [options] 8 | options: --app # app to run 9 | [--scount=INT] # server count [default 3] 10 | [--ccount=INT] # client count [default 1] 11 | [--rcount=INT] # request count [default 10000] 12 | EOF 13 | 14 | usage () { 15 | echo -e "$HELP" 16 | } 17 | 18 | ErrorAndExit () { 19 | echo "ERROR: $1" 20 | exit 1 21 | } 22 | 23 | StartDare() { 24 | for ((i=0; i<$1; ++i)); 25 | do 26 | config_dare=( "server_type=start" "server_idx=$i" "group_size=$1" "config_path=${DAREDIR}/target/nodes.local.cfg" "dare_log_file=$PWD/srv${i}.log" "mgid=$DGID" "LD_PRELOAD=${DAREDIR}/target/interpose.so" ) 27 | cmd=( "ssh" "$USER@${servers[$i]}" "${config_dare[@]}" "nohup" "${run_dare}" "${redirection[@]}" "&" "echo \$!" 
) 28 | pids[${servers[$i]}]=$("${cmd[@]}") 29 | echo "StartDare COMMAND: "${cmd[@]} 30 | done 31 | echo -e "\n\tinitial servers: ${!pids[@]}" 32 | echo -e "\t...and their PIDs: ${pids[@]}" 33 | } 34 | 35 | StopDare() { 36 | for i in "${!pids[@]}" 37 | do 38 | cmd=( "ssh" "$USER@$i" "kill -2" "${pids[$i]}" ) 39 | echo "Executing: ${cmd[@]}" 40 | $("${cmd[@]}") 41 | done 42 | } 43 | 44 | FindLeader() { 45 | leader="" 46 | max_idx=-1 47 | max_term="" 48 | 49 | for ((i=0; i<${server_count}; ++i)); do 50 | srv=${servers[$i]} 51 | # look for the latest [T] LEADER 52 | cmd=( "ssh" "$USER@$srv" "grep -r \"] LEADER\"" "$PWD/srv${i}.log" ) 53 | #echo ${cmd[@]} 54 | grep_out=$("${cmd[@]}") 55 | if [[ -z $grep_out ]]; then 56 | continue 57 | fi 58 | terms=($(echo $grep_out | awk '{print $2}')) 59 | for j in "${terms[@]}"; do 60 | term=`echo $j | awk -F'T' '{print $2}' | awk -F']' '{print $1}'` 61 | if [[ $term -gt $max_term ]]; then 62 | max_term=$term 63 | leader=$srv 64 | leader_idx=$i 65 | fi 66 | done 67 | done 68 | echo "Leader: p${leader_idx} ($leader)" 69 | } 70 | 71 | port=8888 72 | StartBenchmark() { 73 | if [[ "$APP" == "ssdb" ]]; then 74 | run_loop=( "${DAREDIR}/apps/ssdb/ssdb-master/tools/ssdb-bench" "$leader" "$port" "$request_count" "$client_count") 75 | elif [[ "$APP" == "redis" ]]; then 76 | run_loop=( "${DAREDIR}/apps/redis/install/bin/redis-benchmark" "-t set,get" "-h $leader" "-p $port" "-n $request_count" "-c $client_count") 77 | fi 78 | 79 | cmd=( "ssh" "$USER@${client}" "${run_loop[@]}" ">" "clt.log") 80 | $("${cmd[@]}") 81 | } 82 | 83 | DAREDIR=$PWD/.. 
84 | run_dare="" 85 | server_count=3 86 | APP="" 87 | client_count=1 88 | request_count=10000 89 | for arg in "$@" 90 | do 91 | case ${arg} in 92 | --help|-help|-h) 93 | usage 94 | exit 1 95 | ;; 96 | --scount=*) 97 | server_count=`echo $arg | sed -e 's/--scount=//'` 98 | server_count=`eval echo ${server_count}` # tilde and variable expansion 99 | ;; 100 | --app=*) 101 | APP=`echo $arg | sed -e 's/--app=//'` 102 | APP=`eval echo ${APP}` # tilde and variable expansion 103 | ;; 104 | --ccount=*) 105 | client_count=`echo $arg | sed -e 's/--ccount=//'` 106 | client_count=`eval echo ${client_count}` # tilde and variable expansion 107 | ;; 108 | --rcount=*) 109 | request_count=`echo $arg | sed -e 's/--rcount=//'` 110 | request_count=`eval echo ${request_count}` # tilde and variable expansion 111 | ;; 112 | esac 113 | done 114 | 115 | if [[ "x$APP" == "x" ]]; then 116 | ErrorAndExit "No app defined: --app" 117 | elif [[ "$APP" == "ssdb" ]]; then 118 | run_dare="${DAREDIR}/apps/ssdb/ssdb-master/ssdb-server ${DAREDIR}/apps/ssdb/ssdb-master/ssdb.conf" 119 | elif [[ "$APP" == "redis" ]]; then 120 | run_dare="${DAREDIR}/apps/redis/install/bin/redis-server --port $port" 121 | elif [[ "$APP" == "memcached" ]]; then 122 | run_dare="${DAREDIR}/apps/memcached/install/bin/memcached -p $port" 123 | fi 124 | 125 | 126 | # list of allocated nodes, e.g., nodes=(n112002 n112001 n111902) 127 | nodes=(10.22.1.3 10.22.1.4 10.22.1.5 10.22.1.6 10.22.1.7 10.22.1.8 10.22.1.9 202.45.128.159) 128 | node_count=${#nodes[@]} 129 | echo "Allocated ${node_count} nodes:" > nodes 130 | for ((i=0; i<${node_count}; ++i)); do 131 | echo "$i:${nodes[$i]}" >> nodes 132 | done 133 | 134 | if [ $server_count -le 0 ]; then 135 | ErrorAndExit "0 < #servers; --scount" 136 | fi 137 | 138 | client=${nodes[-2]} 139 | echo ">>> client: ${client}" 140 | 141 | for ((i=0; i<${server_count}; ++i)); do 142 | servers[${i}]=${nodes[$i]} 143 | done 144 | echo ">>> ${server_count} servers: ${servers[@]}" 145 | 146 | 
DGID="ff0e::ffff:e101:101" 147 | 148 | ######################################################################## 149 | 150 | echo -ne "Starting $server_count servers...\n" 151 | StartDare $server_count 152 | echo "done" 153 | 154 | sleep 10 155 | #note: wait for leader election 156 | FindLeader 157 | StartBenchmark 158 | 159 | sleep 0.2 160 | StopDare 161 | 162 | ######################################################################## 163 | -------------------------------------------------------------------------------- /eval/mongoose_aget.cfg: -------------------------------------------------------------------------------- 1 | [mongoose /mongoose] 2 | PROXY_MODE=WITH_PROXY 3 | DEBUG_MODE=WITH_DEBUG 4 | SERVER_COUNT=3 5 | SERVER_INPUT=-p -document_root $MSMR_ROOT/eval/current/server 6 | SERVER_KILL=killall mongoose 7 | CLIENT_COUNT=100 8 | CLIENT_PROGRAM=$MSMR_ROOT/libevent_paxos/client-ld-preload/Mongoose_Aget/aget 9 | CLIENT_INPUT=-f -n2 -p 9000 http://localhost/README.md 10 | TEST_FILE=$MSMR_ROOT/README.md 11 | -------------------------------------------------------------------------------- /eval/readme.txt: -------------------------------------------------------------------------------- 1 | This is evaluation framework of m-smr system! 
2 | Run: 3 | ./eval.py *.cfg 4 | -------------------------------------------------------------------------------- /makefile.init: -------------------------------------------------------------------------------- 1 | ROOT_DIR := $(shell pwd) 2 | DEBUGOPT := 0 3 | -------------------------------------------------------------------------------- /src/config-comp/config-dare.c: -------------------------------------------------------------------------------- 1 | #include "../include/util/common-header.h" 2 | #include "../include/dare/dare_server.h" 3 | #include 4 | 5 | double hb_period; 6 | uint64_t elec_timeout_low; 7 | uint64_t elec_timeout_high; 8 | double rc_info_period; 9 | double retransmit_period; 10 | double log_pruning_period; 11 | 12 | int dare_read_config(const char* config_path){ 13 | config_t config_file; 14 | config_init(&config_file); 15 | 16 | if(!config_read_file(&config_file,config_path)){ 17 | goto goto_config_error; 18 | } 19 | 20 | config_setting_t *dare_global_config = NULL; 21 | dare_global_config = config_lookup(&config_file,"dare_global_config"); 22 | 23 | if(NULL!=dare_global_config){ 24 | double temp_float; 25 | if(config_setting_lookup_float(dare_global_config,"hb_period",&temp_float)){ 26 | hb_period = temp_float; 27 | } 28 | if(config_setting_lookup_float(dare_global_config,"rc_info_period",&temp_float)){ 29 | rc_info_period = temp_float; 30 | } 31 | if(config_setting_lookup_float(dare_global_config,"retransmit_period",&temp_float)){ 32 | retransmit_period = temp_float; 33 | } 34 | if(config_setting_lookup_float(dare_global_config,"log_pruning_period",&temp_float)){ 35 | log_pruning_period = temp_float; 36 | } 37 | long long temp_int64; 38 | if(config_setting_lookup_int64(dare_global_config,"elec_timeout_low",&temp_int64)){ 39 | elec_timeout_low = temp_int64; 40 | } 41 | if(config_setting_lookup_int64(dare_global_config,"elec_timeout_high",&temp_int64)){ 42 | elec_timeout_high = temp_int64; 43 | } 44 | } 45 | 46 | 
config_destroy(&config_file); 47 | return 0; 48 | 49 | goto_config_error: 50 | err_log("%s:%d - %s\n", config_error_file(&config_file), 51 | config_error_line(&config_file), config_error_text(&config_file)); 52 | config_destroy(&config_file); 53 | return -1; 54 | } 55 | -------------------------------------------------------------------------------- /src/config-comp/config-proxy.c: -------------------------------------------------------------------------------- 1 | #include "../include/util/common-header.h" 2 | #include "../include/proxy/proxy.h" 3 | #include 4 | 5 | 6 | int proxy_read_config(struct proxy_node_t* cur_node,const char* config_path){ 7 | config_t config_file; 8 | config_init(&config_file); 9 | 10 | if(!config_read_file(&config_file,config_path)){ 11 | goto goto_config_error; 12 | } 13 | 14 | config_lookup_int(&config_file,"req_log",&cur_node->req_log); 15 | 16 | const char* db_name; 17 | if(!config_lookup_string(&config_file,"db_name",&db_name)){ 18 | goto goto_config_error; 19 | } 20 | size_t db_name_len = strlen(db_name); 21 | cur_node->db_name = (char*)malloc(sizeof(char)*(db_name_len+1)); 22 | if(cur_node->db_name==NULL){ 23 | goto goto_config_error; 24 | } 25 | if(NULL==strncpy(cur_node->db_name,db_name,db_name_len)){ 26 | free(cur_node->db_name); 27 | goto goto_config_error; 28 | } 29 | cur_node->db_name[db_name_len] = '\0'; 30 | 31 | 32 | const char* peer_ipaddr=NULL; 33 | int peer_port=-1; 34 | if(!config_lookup_string(&config_file,"ip_address",&peer_ipaddr)){ 35 | goto goto_config_error; 36 | } 37 | if(!config_lookup_int(&config_file,"port",&peer_port)){ 38 | goto goto_config_error; 39 | } 40 | 41 | cur_node->sys_addr.s_addr.sin_port = htons(peer_port); 42 | cur_node->sys_addr.s_addr.sin_family = AF_INET; 43 | inet_pton(AF_INET,peer_ipaddr,&cur_node->sys_addr.s_addr.sin_addr); 44 | 45 | cur_node->sys_addr.s_sock_len = sizeof(cur_node->sys_addr.s_addr); 46 | 47 | 48 | config_destroy(&config_file); 49 | return 0; 50 | 51 | goto_config_error: 52 
| err_log("%s:%d - %s\n", config_error_file(&config_file), 53 | config_error_line(&config_file), config_error_text(&config_file)); 54 | config_destroy(&config_file); 55 | return -1; 56 | } 57 | -------------------------------------------------------------------------------- /src/dare/dare_ep_db.c: -------------------------------------------------------------------------------- 1 | /** 2 | * DARE (Direct Access REplication) 3 | * 4 | * Endpoint database 5 | * 6 | * Copyright (c) 2014-2015 ETH-Zurich. All rights reserved. 7 | * 8 | * Author(s): Marius Poke 9 | * 10 | */ 11 | 12 | #include 13 | 14 | #include "../include/dare/debug.h" 15 | #include "../include/dare/dare_ibv_ud.h" 16 | #include "../include/dare/dare_ibv_rc.h" 17 | 18 | #include "../include/dare/dare_ep_db.h" 19 | 20 | /* ================================================================== */ 21 | 22 | static void 23 | free_ep(dare_ep_t *ep); 24 | 25 | /* ================================================================== */ 26 | 27 | dare_ep_t* ep_search( struct rb_root *root, const uint16_t lid ) 28 | { 29 | struct rb_node *node = root->rb_node; 30 | 31 | while (node) 32 | { 33 | dare_ep_t *ep = container_of(node, dare_ep_t, node); 34 | 35 | if (lid < ep->ud_ep.lid) 36 | node = node->rb_left; 37 | else if (lid > ep->ud_ep.lid) 38 | node = node->rb_right; 39 | else 40 | return ep; 41 | } 42 | return NULL; 43 | } 44 | 45 | dare_ep_t* ep_insert( struct rb_root *root, const uint16_t lid, const union ibv_gid dest_gid ) 46 | { 47 | dare_ep_t *ep; 48 | struct rb_node **new = &(root->rb_node), *parent = NULL; 49 | 50 | while (*new) 51 | { 52 | dare_ep_t *this = container_of(*new, dare_ep_t, node); 53 | 54 | parent = *new; 55 | if (lid < this->ud_ep.lid) 56 | new = &((*new)->rb_left); 57 | else if (lid > this->ud_ep.lid) 58 | new = &((*new)->rb_right); 59 | else 60 | return NULL; 61 | } 62 | 63 | /* Create new rr */ 64 | ep = (dare_ep_t*)malloc(sizeof(dare_ep_t)); 65 | ep->ud_ep.lid = lid; 66 | ep->ud_ep.gid = 
dest_gid; 67 | ep->last_req_id = 0; 68 | ep->cid_idx = 0; 69 | ep->committed = 0; 70 | ep->wait_for_idx = 0; 71 | 72 | /* Create AH */ 73 | ep->ud_ep.ah = ud_ah_create(lid, dest_gid); 74 | 75 | 76 | /* Add new node and rebalance tree. */ 77 | rb_link_node(&ep->node, parent, new); 78 | rb_insert_color(&ep->node, root); 79 | 80 | return ep; 81 | } 82 | 83 | void ep_erase( struct rb_root *root, const uint16_t lid ) 84 | { 85 | dare_ep_t *ep = ep_search(root, lid); 86 | 87 | if (ep) 88 | { 89 | rb_erase(&ep->node, root); 90 | free_ep(ep); 91 | } 92 | } 93 | 94 | void ep_db_print( struct rb_root *root ) 95 | { 96 | struct rb_node *node; 97 | dare_ep_t *ep; 98 | 99 | for (node = rb_first(root); node; node = rb_next(node)) 100 | { 101 | ep = rb_entry(node, dare_ep_t, node); 102 | info(log_fp, "[%"PRIu16": qpn=%"PRIu32"] ", 103 | ep->ud_ep.lid, ep->ud_ep.qpn); 104 | } 105 | } 106 | 107 | void ep_db_free( struct rb_root *root ) 108 | { 109 | struct rb_node *node; 110 | dare_ep_t *ep; 111 | 112 | for (node = rb_first_postorder(root); node;) 113 | { 114 | ep = rb_entry(node, dare_ep_t, node); 115 | node = rb_next_postorder(node); 116 | free_ep(ep); 117 | } 118 | } 119 | 120 | void ep_dp_reset_wait_idx( struct rb_root *root ) 121 | { 122 | struct rb_node *node; 123 | dare_ep_t *ep; 124 | 125 | for (node = rb_first(root); node; node = rb_next(node)) 126 | { 127 | ep = rb_entry(node, dare_ep_t, node); 128 | ep->wait_for_idx = 0; 129 | } 130 | } 131 | 132 | void ep_dp_reply_read_req( struct rb_root *root, uint64_t idx ) 133 | { 134 | int rc; 135 | struct rb_node *node; 136 | dare_ep_t *ep; 137 | int verify_leadership = 0; 138 | int leader = 0; 139 | 140 | for (node = rb_first(root); node; node = rb_next(node)) 141 | { 142 | ep = rb_entry(node, dare_ep_t, node); 143 | if (!ep->wait_for_idx) continue; 144 | if (!verify_leadership) { 145 | /* Verify leadership */ 146 | rc = rc_verify_leadership(&leader); 147 | if (0 != rc) { 148 | error(log_fp, "Cannot verify leadership\n"); 149 | } 
150 | if (0 == leader) { 151 | /* No longer the leader; reset the wait idx */ 152 | ep_dp_reset_wait_idx(root); 153 | return; 154 | } 155 | verify_leadership = 1; 156 | } 157 | if (ep->wait_for_idx < idx) { 158 | ud_clt_answer_read_request(ep); 159 | } 160 | } 161 | } 162 | 163 | /* ================================================================== */ 164 | 165 | static void 166 | free_ep(dare_ep_t *ep) 167 | { 168 | ud_ah_destroy(ep->ud_ep.ah); 169 | free(ep); 170 | } 171 | -------------------------------------------------------------------------------- /src/dare/dare_kvs_sm.c: -------------------------------------------------------------------------------- 1 | /** 2 | * DARE (Direct Access REplication) 3 | * 4 | * State machine implementation (KVS) 5 | * 6 | * Copyright (c) 2014-2015 ETH-Zurich. All rights reserved. 7 | * 8 | * Author(s): Marius Poke 9 | * 10 | */ 11 | 12 | #include 13 | #include 14 | 15 | #include "../include/dare/dare_sm.h" 16 | #include "../include/dare/dare_kvs_sm.h" 17 | #include "../include/dare/dare.h" 18 | 19 | uint32_t kvs_size; // kvs size in bytes 20 | 21 | struct kvs_list_t { 22 | kvs_entry_t entry; 23 | struct kvs_list_t *next; 24 | }; 25 | typedef struct kvs_list_t kvs_list_t; 26 | 27 | struct kvs_table_t { 28 | uint32_t size; 29 | kvs_list_t **table; 30 | }; 31 | typedef struct kvs_table_t kvs_table_t; 32 | 33 | struct dare_kvs_sm_t { 34 | dare_sm_t sm; 35 | kvs_table_t kvs_table; 36 | }; 37 | typedef struct dare_kvs_sm_t dare_kvs_sm_t; 38 | 39 | struct kvs_snapshot_entry_t { 40 | uint16_t len; 41 | char key[KEY_SIZE]; 42 | uint8_t value[0]; 43 | }; 44 | typedef struct kvs_snapshot_entry_t kvs_snapshot_entry_t; 45 | 46 | /* ================================================================== */ 47 | 48 | static int 49 | create_kvs_table( kvs_table_t* kvs_table ); 50 | static void 51 | destroy_kvs_sm( dare_sm_t* sm ); 52 | static int 53 | apply_kvs_cmd( dare_sm_t *sm, sm_cmd_t *cmd, sm_data_t *data ); 54 | 55 | static uint32_t 56 | 
hash( kvs_table_t *kvs_table, char *key ); 57 | static kvs_list_t* 58 | lookup_key( kvs_table_t *kvs_table, char *key ); 59 | static void 60 | remove_key( kvs_table_t *kvs_table, char *key ); 61 | static int 62 | write_key( kvs_table_t *kvs_table, char *key, kvs_blob_t *blob ); 63 | 64 | /* ================================================================== */ 65 | /* Create KVS */ 66 | 67 | dare_sm_t* create_kvs_sm( uint32_t size ) 68 | { 69 | 70 | int rc; 71 | dare_kvs_sm_t *kvs_sm; 72 | 73 | if (0 == size) { 74 | size = DEFAULT_KVS_SIZE; 75 | } 76 | 77 | /* Allocate new KVS SM */ 78 | kvs_sm = (dare_kvs_sm_t*)malloc(sizeof(dare_kvs_sm_t)); 79 | if (NULL == kvs_sm) { 80 | error(log_fp, "Cannot allocate KVS SM\n"); 81 | return NULL; 82 | } 83 | 84 | /* Initiate KVS table */ 85 | kvs_sm->kvs_table.size = size; 86 | rc = create_kvs_table(&kvs_sm->kvs_table); 87 | if (0 != rc) { 88 | free(kvs_sm); 89 | kvs_sm = NULL; 90 | error(log_fp, "Cannot allocate KVS SM\n"); 91 | return NULL; 92 | } 93 | 94 | dare_sm_t sm = { 95 | .destroy = destroy_kvs_sm, 96 | .apply_cmd = apply_kvs_cmd, 97 | }; 98 | 99 | memcpy(&kvs_sm->sm, &sm, sizeof(dare_sm_t)); 100 | 101 | // kvs_sm->sm.destroy = destroy_kvs_sm; 102 | // kvs_sm->sm.apply_cmd = apply_kvs_cmd; 103 | 104 | return &(kvs_sm->sm); 105 | } 106 | 107 | static int 108 | create_kvs_table( kvs_table_t* kvs_table ) 109 | { 110 | kvs_table->table = (kvs_list_t**) 111 | malloc(sizeof(kvs_list_t*) * kvs_table->size); 112 | if (NULL == kvs_table->table) { 113 | error_return(1, log_fp, "Cannot allocate KVS table\n"); 114 | } 115 | memset(kvs_table->table, 0, sizeof(kvs_list_t*) * kvs_table->size); 116 | 117 | return 0; 118 | } 119 | 120 | /* ================================================================== */ 121 | /* SM methods */ 122 | 123 | static void 124 | destroy_kvs_sm( dare_sm_t* sm ) 125 | { 126 | uint32_t i; 127 | dare_kvs_sm_t *kvs_sm = (dare_kvs_sm_t*)sm; 128 | kvs_list_t *list, *tmp; 129 | 130 | if (NULL == kvs_sm) { 131 | 
return; 132 | } 133 | if (NULL == kvs_sm->kvs_table.table) { 134 | free(kvs_sm); 135 | kvs_sm = NULL; 136 | return; 137 | } 138 | for (i = 0; i < kvs_sm->kvs_table.size; i++) { 139 | list = kvs_sm->kvs_table.table[i]; 140 | while (NULL != list) { 141 | tmp = list; 142 | list = list->next; 143 | if (NULL != tmp->entry.blob.data) { 144 | free(tmp->entry.blob.data); 145 | tmp->entry.blob.data = NULL; 146 | } 147 | free(tmp); 148 | } 149 | } 150 | 151 | free(kvs_sm->kvs_table.table); 152 | kvs_sm->kvs_table.table = NULL; 153 | free(kvs_sm); 154 | kvs_sm = NULL; 155 | } 156 | 157 | static int 158 | apply_kvs_cmd( dare_sm_t* sm, sm_cmd_t *cmd, sm_data_t *data ) 159 | { 160 | int rc; 161 | kvs_blob_t blob; 162 | kvs_list_t* list; 163 | dare_kvs_sm_t *kvs_sm = (dare_kvs_sm_t*)sm; 164 | if (NULL == kvs_sm) { 165 | error_return(1, log_fp, "SM is NULL\n"); 166 | } 167 | 168 | kvs_cmd_t *kvs_cmd = (kvs_cmd_t*)cmd->cmd; 169 | if (NULL == kvs_cmd) { 170 | error_return(1, log_fp, "Command is NULL\n"); 171 | } 172 | //debug(log_fp, "KVS type %"PRIu8"\n", kvs_cmd->type); 173 | switch (kvs_cmd->type) { 174 | case KVS_PUT: 175 | //debug(log_fp, "PUT key = %s\n", kvs_cmd->key); 176 | blob.len = kvs_cmd->len; 177 | blob.data = kvs_cmd->data; 178 | rc = write_key(&kvs_sm->kvs_table, kvs_cmd->key, &blob); 179 | if (0 != rc) { 180 | error_return(1, log_fp, "Cannot apply PUT operation\n"); 181 | } 182 | break; 183 | case KVS_GET: 184 | //debug(log_fp, "GET key = %s\n", kvs_cmd->key); 185 | list = lookup_key(&kvs_sm->kvs_table, kvs_cmd->key); 186 | if (NULL == list) { 187 | data->len = 0; 188 | } 189 | else { 190 | data->len = list->entry.blob.len; 191 | memcpy(data->data, list->entry.blob.data, data->len); 192 | } 193 | break; 194 | case KVS_RM: 195 | remove_key(&kvs_sm->kvs_table, kvs_cmd->key); 196 | break; 197 | default: 198 | error_return(1, log_fp, "Unknown KVS command\n"); 199 | } 200 | 201 | return 0; 202 | } 203 | 204 | /* 
================================================================== */ 205 | 206 | /** 207 | * Simple hash function 208 | */ 209 | static uint32_t 210 | hash( kvs_table_t *kvs_table, char *key ) 211 | { 212 | uint32_t hashval; 213 | 214 | hashval = 0; 215 | for(; *key != '\0'; key++) { 216 | hashval = *key + (hashval << 5) - hashval; 217 | } 218 | return hashval % kvs_table->size; 219 | } 220 | 221 | static kvs_list_t* 222 | lookup_key( kvs_table_t *kvs_table, char *key ) 223 | { 224 | kvs_list_t *list; 225 | uint32_t hashval = hash(kvs_table, key); 226 | 227 | for(list = kvs_table->table[hashval]; list != NULL; list = list->next) { 228 | if (strcmp(key, list->entry.key) == 0) { 229 | return list; 230 | } 231 | } 232 | return NULL; 233 | } 234 | 235 | static void 236 | remove_key( kvs_table_t *kvs_table, char *key ) 237 | { 238 | kvs_list_t *list, *prev; 239 | uint32_t hashval = hash(kvs_table, key); 240 | 241 | list = kvs_table->table[hashval]; 242 | if (list == NULL) return; 243 | if (strcmp(key, list->entry.key) == 0) { 244 | kvs_table->table[hashval] = list->next; 245 | /* Update KVS size */ 246 | kvs_size = kvs_size - sizeof(kvs_snapshot_entry_t) 247 | - list->entry.blob.len; 248 | if (NULL != list->entry.blob.data) { 249 | free(list->entry.blob.data); 250 | list->entry.blob.data = NULL; 251 | } 252 | free(list); 253 | return; 254 | } 255 | prev = list; 256 | for(list = list->next; list != NULL; list = list->next) { 257 | if (strcmp(key, list->entry.key) == 0) { 258 | prev->next = list->next; 259 | /* Update KVS size */ 260 | kvs_size = kvs_size - sizeof(kvs_snapshot_entry_t) 261 | - list->entry.blob.len; 262 | if (NULL != list->entry.blob.data) { 263 | free(list->entry.blob.data); 264 | list->entry.blob.data = NULL; 265 | } 266 | free(list); 267 | return; 268 | } 269 | prev = list; 270 | } 271 | } 272 | 273 | static int 274 | write_key( kvs_table_t *kvs_table, char *key, kvs_blob_t *blob ) 275 | { 276 | /* Search for list entry with this key */ 277 | 
kvs_list_t *list = lookup_key(kvs_table, key); 278 | if (NULL != list) { 279 | /* Key already exists - overwrite */ 280 | if (list->entry.blob.len != blob->len) { 281 | /* Update KVS size */ 282 | kvs_size += blob->len - list->entry.blob.len; 283 | /* Resize blob */ 284 | list->entry.blob.len = blob->len; 285 | /* Reallocate memory for the value */ 286 | list->entry.blob.data = realloc(list->entry.blob.data, blob->len); 287 | if (NULL == list->entry.blob.data) { 288 | error_return(1, log_fp, "Cannot allocate new KVS blob\n"); 289 | } 290 | } 291 | memcpy(list->entry.blob.data, blob->data, blob->len); 292 | return 0; 293 | } 294 | 295 | /* Insert new key */ 296 | unsigned int hashval = hash(kvs_table, key); 297 | list = (kvs_list_t*)malloc(sizeof(kvs_list_t)); 298 | if (NULL == list) { 299 | error_return(1, log_fp, "Cannot allocate new KVS list\n"); 300 | } 301 | memcpy(&list->entry.key, key, KEY_SIZE); 302 | list->entry.blob.len = blob->len; 303 | /* Update KVS size */ 304 | kvs_size += sizeof(kvs_snapshot_entry_t) + blob->len; 305 | /* Allocate memory for the value */ 306 | list->entry.blob.data = malloc(blob->len); 307 | if (NULL == list->entry.blob.data) { 308 | error_return(1, log_fp, "Cannot allocate new KVS blob\n"); 309 | } 310 | memcpy(list->entry.blob.data, blob->data, blob->len); 311 | list->next = kvs_table->table[hashval]; 312 | kvs_table->table[hashval] = list; 313 | 314 | return 0; 315 | } 316 | -------------------------------------------------------------------------------- /src/db/db-interface.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include "../include/db/db-interface.h" 8 | #include "../include/util/debug.h" 9 | 10 | const char* db_dir="./.db"; 11 | 12 | u_int32_t pagesize = 32 * 1024; 13 | u_int cachesize = 32 * 1024 * 1024; 14 | 15 | struct db_t{ 16 | DB* bdb_ptr; 17 | }; 18 | 19 | uint32_t records_len; 20 | 21 | db* 
initialize_db(const char* db_name,uint32_t flag){ 22 | db* db_ptr=NULL; 23 | DB* b_db; 24 | int ret; 25 | /* Initialize the DB handle */ 26 | if((ret = db_create(&b_db,NULL,flag))!=0){ 27 | err_log("DB : %s.\n",db_strerror(ret)); 28 | goto db_init_return; 29 | } 30 | 31 | if((ret = b_db->set_pagesize(b_db,pagesize))!=0){ 32 | goto db_init_return; 33 | } 34 | if((ret = b_db->set_cachesize(b_db, 0, cachesize, 1))!=0){ 35 | goto db_init_return; 36 | } 37 | 38 | if((ret = b_db->open(b_db,NULL,db_name,NULL,DB_RECNO,DB_THREAD|DB_CREATE,0))!=0){ 39 | //b_db->err(b_db,ret,"%s","test.db"); 40 | goto db_init_return; 41 | } 42 | db_ptr = (db*)(malloc(sizeof(db))); 43 | db_ptr->bdb_ptr = b_db; 44 | 45 | db_init_return: 46 | if(db_ptr!=NULL){ 47 | //debug_log("DB Initialization Finished\n"); 48 | ; 49 | } 50 | return db_ptr; 51 | } 52 | 53 | void close_db(db* db_p,uint32_t mode){ 54 | if(db_p!=NULL){ 55 | if(db_p->bdb_ptr!=NULL){ 56 | db_p->bdb_ptr->close(db_p->bdb_ptr,mode); 57 | db_p->bdb_ptr=NULL; 58 | } 59 | free(db_p); 60 | db_p = NULL; 61 | } 62 | return; 63 | } 64 | 65 | int store_record(db* db_p,size_t data_size,void* data){ 66 | int ret = 1; 67 | if((NULL==db_p)||(NULL==db_p->bdb_ptr)){ 68 | if(db_p == NULL){ 69 | err_log("DB store_record : db_p is null.\n"); 70 | } else{ 71 | err_log("DB store_recor : db_p->bdb_ptr is null.\n"); 72 | } 73 | goto db_store_return; 74 | } 75 | DB* b_db = db_p->bdb_ptr; 76 | DBT key,db_data; 77 | memset(&db_data,0,sizeof(db_data)); 78 | db_data.data = data; 79 | db_data.size = data_size; 80 | 81 | records_len += data_size; 82 | 83 | memset(&key,0,sizeof(key)); 84 | key.flags = DB_DBT_MALLOC; 85 | if ((ret=b_db->put(b_db,NULL,&key,&db_data,DB_AUTO_COMMIT|DB_APPEND))==0){ 86 | //debug_log("db : %ld record stored. 
\n",*(uint64_t*)key_data); 87 | //b_db->sync(b_db,0); 88 | } 89 | else{ 90 | err_log("DB : %s.\n",db_strerror(ret)); 91 | //debug_log("db : can not save record %ld from database.\n",*(uint64_t*)key_data); 92 | //b_db->err(b_db,ret,"DB->Put"); 93 | } 94 | db_store_return: 95 | return ret; 96 | } 97 | 98 | void dump_records(db* db_p, void* buf){ 99 | DB* b_db = db_p->bdb_ptr; 100 | DBT key, data; 101 | DBC *dbcp; 102 | int ret; 103 | 104 | uint32_t len = 0; 105 | 106 | /* Acquire a cursor for the database. */ 107 | if ((ret = b_db->cursor(b_db, NULL, &dbcp, 0)) != 0) { 108 | b_db->err(b_db, ret, "DB->cursor"); 109 | } 110 | 111 | /* Re-initialize the key/data pair. */ 112 | memset(&key, 0, sizeof(key)); 113 | memset(&data, 0, sizeof(data)); 114 | 115 | /* Walk through the database and print out the key/data pairs. */ 116 | while ((ret = dbcp->c_get(dbcp, &key, &data, DB_NEXT)) == 0) { 117 | //debug_log("%lu : %.*s\n", *(u_long *)key.data, (int)data.size, (char *)data.data); 118 | memcpy((char*)buf+len, data.data, data.size); 119 | len += data.size; 120 | } 121 | if (ret != DB_NOTFOUND) 122 | b_db->err(b_db, ret, "DBcursor->get"); 123 | 124 | /* Close the cursor. 
*/ 125 | if ((ret = dbcp->c_close(dbcp)) != 0) { 126 | b_db->err(b_db, ret, "DBcursor->close"); 127 | } 128 | } 129 | 130 | 131 | uint32_t get_records_len() 132 | { 133 | return records_len; 134 | } -------------------------------------------------------------------------------- /src/include/config-comp/config-dare.h: -------------------------------------------------------------------------------- 1 | #ifndef CONFIG_DARE_H 2 | #define CONFIG_DARE_H 3 | 4 | int dare_read_config(const char* config_path); 5 | 6 | #endif 7 | -------------------------------------------------------------------------------- /src/include/config-comp/config-proxy.h: -------------------------------------------------------------------------------- 1 | #ifndef CONFIG_PROXY_H 2 | #define CONFIG_PROXY_H 3 | 4 | struct proxy_node_t; 5 | 6 | int proxy_read_config(struct proxy_node_t* cur_node,const char* config_path); 7 | 8 | #endif 9 | -------------------------------------------------------------------------------- /src/include/dare/dare.h: -------------------------------------------------------------------------------- 1 | /** 2 | * DARE (Direct Access REplication) 3 | * 4 | * General header file 5 | * 6 | * Copyright (c) 2014-2015 ETH-Zurich. All rights reserved. 
7 | * 8 | * Author(s): Marius Poke 9 | * 10 | */ 11 | 12 | #include "./debug.h" 13 | 14 | #ifndef DARE_H_ 15 | #define DARE_H_ 16 | 17 | /* SM types */ 18 | #define CLT_NULL 1 19 | #define CLT_KVS 2 20 | #define CLT_FS 3 21 | 22 | /* For immediate event scheduling */ 23 | #define NOW 0.000000001 24 | 25 | #define MAX_CLIENT_COUNT 64 26 | #define MAX_SERVER_COUNT 13 27 | 28 | #define PAGE_SIZE 4096 29 | 30 | 31 | /** 32 | * UD message types 33 | */ 34 | #define MSG_NONE 0 35 | #define MSG_ERROR 13 36 | /* Initialization messages */ 37 | #define RC_SYN 1 38 | #define RC_SYNACK 2 39 | #define RC_ACK 3 40 | /* Client SM messages */ 41 | #define CSM_READ 201 42 | #define CSM_WRITE 202 43 | #define CSM_REPLY 203 44 | /* Config messages */ 45 | #define JOIN 211 46 | #define DOWNSIZE 213 47 | #define CFG_REPLY 214 48 | /* LOGGP messages */ 49 | #define LOGGP_UD 55 50 | 51 | #endif /* DARE_H_ */ 52 | -------------------------------------------------------------------------------- /src/include/dare/dare_client.h: -------------------------------------------------------------------------------- 1 | /** 2 | * DARE (Direct Access REplication) 3 | * 4 | * Client implementation 5 | * 6 | * Copyright (c) 2014-2015 ETH-Zurich. All rights reserved. 
7 | * 8 | * Author(s): Marius Poke 9 | * 10 | */ 11 | 12 | #ifndef DARE_CLIENT_H 13 | #define DARE_CLIENT_H 14 | 15 | #include 16 | #include 17 | #include "../../../utils/rbtree/include/rbtree.h" 18 | #include "./dare.h" 19 | #include "./dare_sm.h" 20 | #include "./timer.h" 21 | 22 | /* Retransmission period in ms */ 23 | #ifdef DEBUG 24 | #define CLT_RETRANS_PERIOD 500 25 | #define CLT_OUTPUT_PERIOD 100 26 | #else 27 | #define CLT_RETRANS_PERIOD 20 28 | #define CLT_OUTPUT_PERIOD 10 29 | #endif 30 | 31 | /* Client types */ 32 | #define CLT_TYPE_RECONF 1 33 | #define CLT_TYPE_LOOP 2 34 | #define CLT_TYPE_TRACE 3 35 | #define CLT_TYPE_RTRACE 4 36 | 37 | #define MAX_LINE_LENGTH 128 38 | 39 | struct dare_client_input_t { 40 | FILE* log; 41 | char* trace; 42 | char* output; 43 | uint8_t clt_type; 44 | uint8_t sm_type; 45 | uint8_t first_op_perc; 46 | uint8_t group_size; 47 | }; 48 | typedef struct dare_client_input_t dare_client_input_t; 49 | 50 | struct dare_client_data_t { 51 | dare_client_input_t *input; 52 | struct ev_loop *loop; // loop for EV library 53 | void *leader_ep; 54 | FILE *trace_fp; 55 | FILE *output_fp; 56 | dare_sm_t *sm; // local state machine 57 | HRT_TIMESTAMP_T t1, t2; 58 | }; 59 | typedef struct dare_client_data_t dare_client_data_t; 60 | 61 | /* ================================================================== */ 62 | 63 | int dare_client_init( dare_client_input_t *input ); 64 | void dare_client_shutdown(); 65 | 66 | #endif /* DARE_CLIENT_H */ 67 | -------------------------------------------------------------------------------- /src/include/dare/dare_config.h: -------------------------------------------------------------------------------- 1 | /** 2 | * DARE (Direct Access REplication) 3 | * 4 | * Group configuration 5 | * 6 | * Copyright (c) 2014-2015 ETH-Zurich. All rights reserved. 
7 | * 8 | * Author(s): Marius Poke 9 | * 10 | */ 11 | 12 | #ifndef DARE_CONFIG_H 13 | #define DARE_CONFIG_H 14 | 15 | #include "./dare.h" 16 | 17 | /* Stable configuration: only one size specified */ 18 | #define CID_STABLE 0 19 | /* Transitional configuration: both old and new size are specified; 20 | * !!! both majority needed */ 21 | #define CID_TRANSIT 1 22 | /* Extended configuration: both old and new size are specified; 23 | * !!! only old majority needed */ 24 | #define CID_EXTENDED 2 25 | 26 | #define CID_IS_SERVER_ON(cid, idx) ((cid).bitmask & (1 << (idx))) 27 | #define CID_SERVER_ADD(cid, idx) (cid).bitmask |= 1 << (idx) 28 | #define CID_SERVER_RM(cid, idx) (cid).bitmask &= ~(1 << (idx)) 29 | 30 | /** 31 | * Configuration ID: A configuration is given by a 32 | * [N, N', STATE, BITMASK] tuple, where: 33 | * N - is the current group size 34 | * N' - is the new size in a transitional configuration 35 | * STATE - is the configuration state: stable, transitional, extended 36 | * BITMASK - is a bitmask with a bit set for every on servers 37 | */ 38 | struct dare_cid_t { 39 | uint64_t epoch; 40 | uint8_t size[2]; 41 | uint8_t state; 42 | uint8_t pad[1]; 43 | uint32_t bitmask; 44 | }; 45 | typedef struct dare_cid_t dare_cid_t; 46 | 47 | static int 48 | equal_cid( dare_cid_t left_cid, dare_cid_t right_cid ) 49 | { 50 | if (left_cid.epoch != right_cid.epoch) return 0; 51 | if (left_cid.state != right_cid.state) return 0; 52 | if (left_cid.size[0] != right_cid.size[0]) return 0; 53 | if (left_cid.size[1] != right_cid.size[1]) return 0; 54 | if (left_cid.bitmask != right_cid.bitmask) return 0; 55 | return 1; 56 | } 57 | 58 | typedef struct server_t server_t; 59 | struct server_config_t { 60 | dare_cid_t cid; /* configuration identifier */ 61 | uint64_t cid_offset; /* the offset of the next entry from where 62 | to start looking for CONFIG entries; 63 | note that it cannot be larger than WRITE */ 64 | uint64_t cid_idx; /* the index of the last CONFIG entry before 65 | 
joining the cluster; a server considers only 66 | CONFIG entries with a larger index */ 67 | uint64_t req_id; /* Request ID of the endpoint that owns 68 | this configuration change */ 69 | server_t *servers; /* array with info for each server */ 70 | uint16_t clt_id; /* LID of the endpoint that owns 71 | this configuration change */ 72 | uint8_t idx; /* own index in configuration */ 73 | uint8_t len; /* fixed length of configuration array */ 74 | }; 75 | typedef struct server_config_t server_config_t; 76 | 77 | /* Get the maximum size including the extra added servers */ 78 | static uint8_t 79 | get_extended_group_size( server_config_t config ) 80 | { 81 | if (CID_STABLE == config.cid.state) 82 | return config.cid.size[0]; 83 | if (config.cid.size[0] < config.cid.size[1]) 84 | return config.cid.size[1]; 85 | return config.cid.size[0]; 86 | } 87 | 88 | /* Get the maximum size ignoring the extra added servers */ 89 | static uint8_t 90 | get_group_size( server_config_t config ) 91 | { 92 | if (CID_TRANSIT != config.cid.state) 93 | return config.cid.size[0]; 94 | if (config.cid.size[0] < config.cid.size[1]) 95 | return config.cid.size[1]; 96 | return config.cid.size[0]; 97 | } 98 | 99 | #define PRINT_CID(cid) text(log_fp, \ 100 | " [E%"PRIu64":%02"PRIu8"|%02"PRIu8"|%d|%03"PRIu32"] ", \ 101 | (cid).epoch, (cid).size[0], (cid).size[1], (cid).state, (cid).bitmask) 102 | #define PRINT_CID_(cid) PRINT_CID(cid); text(log_fp, "\n"); 103 | 104 | #define PRINT_CONF_TRANSIT(old_cid, new_cid) \ 105 | info_wtime(log_fp, "(%s:%d) Configuration transition: " \ 106 | "[E%"PRIu64":%02"PRIu8"|%02"PRIu8"|%d|%03"PRIu32"] -> " \ 107 | "[E%"PRIu64":%02"PRIu8"|%02"PRIu8"|%d|%03"PRIu32"]\n", \ 108 | __func__, __LINE__, \ 109 | (old_cid).epoch, (old_cid).size[0], (old_cid).size[1], \ 110 | (old_cid).state, (old_cid).bitmask, \ 111 | (new_cid).epoch, (new_cid).size[0], (new_cid).size[1], \ 112 | (new_cid).state, (new_cid).bitmask) 113 | 114 | #endif /* DARE_CONFIG_H */ 115 | 
-------------------------------------------------------------------------------- /src/include/dare/dare_ep_db.h: -------------------------------------------------------------------------------- 1 | /** 2 | * DARE (Direct Access REplication) 3 | * 4 | * Endpoint database 5 | * 6 | * Copyright (c) 2014-2015 ETH-Zurich. All rights reserved. 7 | * 8 | * Author(s): Marius Poke 9 | * 10 | */ 11 | 12 | #ifndef DARE_EP_DB_H 13 | #define DARE_EP_DB_H 14 | 15 | #include "../../../utils/rbtree/include/rbtree.h" 16 | #include "./dare_ibv.h" 17 | 18 | /* ================================================================== */ 19 | 20 | struct dare_ep_t { 21 | struct rb_node node; 22 | ud_ep_t ud_ep; 23 | uint8_t last_read_request[128]; 24 | uint64_t wait_for_idx; 25 | uint64_t last_req_id; /* this is the ID of the last request from 26 | this endpoint that I answer; ignore requests 27 | with lower IDs */ 28 | uint64_t cid_idx; 29 | int committed; 30 | }; 31 | typedef struct dare_ep_t dare_ep_t; 32 | 33 | /* ================================================================== */ 34 | 35 | dare_ep_t* ep_search( struct rb_root *root, const uint16_t lid ); 36 | dare_ep_t* ep_insert( struct rb_root *root, const uint16_t lid, const union ibv_gid dest_gid ); 37 | void ep_erase( struct rb_root *root, const uint16_t lid ); 38 | void ep_db_print( struct rb_root *root ); 39 | void ep_db_free( struct rb_root *root ); 40 | void ep_dp_reset_wait_idx( struct rb_root *root ); 41 | void ep_dp_reply_read_req( struct rb_root *root, uint64_t idx ); 42 | 43 | 44 | #endif /* DARE_EP_DB_H */ 45 | 46 | 47 | -------------------------------------------------------------------------------- /src/include/dare/dare_ibv.h: -------------------------------------------------------------------------------- 1 | /** 2 | * DARE (Direct Access REplication) 3 | * 4 | * Network module for the DARE consensus algorithm (IB verbs) 5 | * 6 | * Copyright (c) 2014-2015 ETH-Zurich. All rights reserved. 
7 | * 8 | * Author(s): Marius Poke 9 | * 10 | */ 11 | 12 | #include /* OFED IB verbs */ 13 | #include "./dare.h" 14 | 15 | #ifndef DARE_IBV_H 16 | #define DARE_IBV_H 17 | 18 | #define DARE_WR_COUNT 32 19 | #define IB_PKEY_MASK 0x7fff 20 | 21 | #define IBV_SERVER 1 22 | #define IBV_CLIENT 2 23 | 24 | 25 | #define mtu_value(mtu) \ 26 | ((mtu == IBV_MTU_256) ? 256 : \ 27 | (mtu == IBV_MTU_512) ? 512 : \ 28 | (mtu == IBV_MTU_1024) ? 1024 : \ 29 | (mtu == IBV_MTU_2048) ? 2048 : \ 30 | (mtu == IBV_MTU_4096) ? 4096 : 0) 31 | 32 | #define qp_state_to_str(state) \ 33 | ((state == IBV_QPS_RESET) ? "RESET" : \ 34 | (state == IBV_QPS_INIT) ? "INIT" : \ 35 | (state == IBV_QPS_RTR) ? "RTR" : \ 36 | (state == IBV_QPS_RTS) ? "RTS" : \ 37 | (state == IBV_QPS_ERR) ? "ERR" : "X") 38 | 39 | #define CTRL_PSN 13 40 | #define LOG_PSN 55 41 | #define LOG_QP 1 42 | #define CTRL_QP 0 43 | 44 | #define HB_CNT_DELAY 3 45 | 46 | /* Endpoint UD info */ 47 | struct ud_ep_t { 48 | uint16_t lid; 49 | union ibv_gid gid; 50 | uint32_t qpn; 51 | struct ibv_ah *ah; 52 | }; 53 | typedef struct ud_ep_t ud_ep_t; 54 | 55 | struct rem_mem_t { 56 | uint64_t raddr; 57 | uint32_t rkey; 58 | }; 59 | typedef struct rem_mem_t rem_mem_t; 60 | 61 | #define RC_QP_ACTIVE 0 62 | #define RC_QP_BLOCKED 1 63 | #define RC_QP_ERROR 2 64 | 65 | struct rc_qp_t { 66 | struct ibv_qp *qp; // RC QP 67 | uint64_t signaled_wr_id; // ID of signaled WR (to avoid overflow) 68 | uint32_t qpn; // remote QP number 69 | uint32_t send_count; // number of posted sends 70 | uint8_t state; // QP's state 71 | }; 72 | typedef struct rc_qp_t rc_qp_t; 73 | 74 | /* Endpoint RC info */ 75 | struct rc_ep_t { 76 | rem_mem_t rmt_mr[2]; // remote memory regions 77 | rc_qp_t rc_qp[2]; // RC QPs (LOG & CTRL) 78 | }; 79 | typedef struct rc_ep_t rc_ep_t; 80 | 81 | struct dare_ib_ep_t { 82 | ud_ep_t ud_ep; // UD info 83 | rc_ep_t rc_ep; // RC info 84 | uint32_t mtu; 85 | int rc_connected; 86 | int log_access; 87 | }; 88 | typedef struct dare_ib_ep_t 
dare_ib_ep_t; 89 | 90 | struct dare_ib_device_t { 91 | /* General fields */ 92 | struct ibv_device *ib_dev; 93 | struct ibv_context *ib_dev_context; 94 | struct ibv_device_attr ib_dev_attr; 95 | uint16_t pkey_index; 96 | int gid_index; 97 | union ibv_gid gid; 98 | uint8_t port_num; // port number 99 | enum ibv_mtu mtu; // MTU for this device 100 | uint16_t lid; // local ID for this device 101 | 102 | /* QP for listening for clients requests - UD */ 103 | struct ibv_pd *ud_pd; 104 | struct ibv_qp *ud_qp; 105 | struct ibv_cq *ud_rcq; 106 | struct ibv_cq *ud_scq; 107 | int ud_rcqe; 108 | void **ud_recv_bufs; 109 | struct ibv_mr **ud_recv_mrs; 110 | void *ud_send_buf; 111 | struct ibv_mr *ud_send_mr; 112 | uint32_t ud_max_inline_data; 113 | uint64_t request_id; 114 | 115 | /* Multicast */ 116 | struct ibv_ah *ib_mcast_ah; 117 | union ibv_gid mgid; 118 | uint16_t mlid; 119 | 120 | /* QPs for inter-server communication - RC */ 121 | struct ibv_pd *rc_pd; 122 | struct ibv_cq *rc_cq[2]; 123 | int rc_cqe; 124 | struct ibv_wc *rc_wc_array; 125 | struct ibv_mr *lcl_mr[2]; 126 | uint32_t rc_max_inline_data; 127 | uint32_t rc_max_send_wr; 128 | 129 | /* Snapshot */ 130 | struct ibv_mr *prereg_snapshot_mr; 131 | struct ibv_mr *snapshot_mr; 132 | 133 | int ulp_type; 134 | void *udata; 135 | }; 136 | typedef struct dare_ib_device_t dare_ib_device_t; 137 | 138 | /* ================================================================== */ 139 | 140 | /* Init and cleaning up */ 141 | int dare_init_ib_device(); 142 | int dare_start_ib_ud(); 143 | int dare_init_ib_srv_data( void *data ); 144 | int dare_init_ib_clt_data( void *data ); 145 | int dare_init_ib_rc(); 146 | void dare_ib_srv_shutdown(); 147 | void dare_ib_clt_shutdown(); 148 | void dare_ib_destroy_ep( uint8_t idx ); 149 | 150 | /* Starting a server */ 151 | void dare_ib_poll_tailq(); 152 | uint8_t dare_ib_poll_ud_queue(); 153 | int dare_ib_join_cluster(); 154 | int dare_ib_exchange_rc_info(); 155 | int dare_ib_update_rc_info(); 
156 | int dare_ib_get_replicated_vote(); 157 | int dare_ib_send_sm_request(); 158 | int dare_ib_send_sm_reply( uint8_t idx, void *s, int reg_mem ); 159 | int dare_ib_recover_sm( uint8_t idx ); 160 | int dare_ib_recover_log(); 161 | 162 | /* HB mechanism */ 163 | int dare_ib_send_hb(); 164 | int dare_ib_send_hb_reply( uint8_t idx ); 165 | 166 | /* Leader election */ 167 | int dare_ib_send_vote_request(); 168 | int dare_ib_replicate_vote(); 169 | int dare_ib_send_vote_ack(); 170 | 171 | /* Normal operation */ 172 | int dare_ib_establish_leadership(); 173 | int dare_ib_write_remote_logs( int wait_for_commit ); 174 | int dare_ib_send_entries_reply( uint8_t idx ); 175 | int dare_ib_get_remote_apply_offsets(); 176 | 177 | /* Handle client requests */ 178 | int dare_ib_apply_cmd_locally(); 179 | int dare_ib_create_clt_request(); 180 | int dare_ib_create_clt_downsize_request(); 181 | int dare_ib_resend_clt_request(); 182 | int dare_ib_send_clt_reply( uint16_t lid, uint64_t req_id, uint8_t type ); 183 | 184 | /* Handle QPs state */ 185 | void dare_ib_disconnect_server( uint8_t idx ); 186 | int dare_ib_revoke_log_access(); 187 | int dare_ib_restore_log_access(); 188 | 189 | /* LogGP */ 190 | double dare_ib_get_loggp_params( uint32_t size, int type, int *poll_count, int write, int inline_flag ); 191 | double dare_ib_loggp_prtt( int n, double delay, uint32_t size, int inline_flag ); 192 | int dare_ib_loggp_exit(); 193 | 194 | void print_rc_info(); 195 | int print_qp_state( void *qp ); 196 | int dare_ib_print_ud_qp(); 197 | 198 | void dare_ib_send_msg(); 199 | int find_max_inline( struct ibv_context *context, 200 | struct ibv_pd *pd, 201 | uint32_t *max_inline_arg ); 202 | 203 | #endif /* DARE_IBV_H */ 204 | -------------------------------------------------------------------------------- /src/include/dare/dare_ibv_rc.h: -------------------------------------------------------------------------------- 1 | /** 2 | * DARE (Direct Access REplication) 3 | * 4 | * Reliable Connection 
(RC) over InfiniBand 5 | * 6 | * Copyright (c) 2014-2015 ETH-Zurich. All rights reserved. 7 | * 8 | * Author(s): Marius Poke 9 | * 10 | */ 11 | 12 | #ifndef DARE_IBV_RC_H 13 | #define DARE_IBV_RC_H 14 | 15 | #include /* OFED stuff */ 16 | #include "./dare_ibv.h" 17 | 18 | #define SIGNALED 1 19 | #define NOTSIGNALED 0 20 | //#define NOTSIGNALED 1 21 | 22 | /** 23 | * The WR Identifier (WRID) 24 | * the WRID is a 64-bit value [SSN|WA|TAG|CONN], where 25 | * SSN is the Send Sequence Number 26 | * WA is the Wrap-Around flag, set for log update WRs 27 | * TAG is a flag set for special signaled WRs (to avoid QPs overflow) 28 | * CONN is a 8-bit index that identifies the connection (the remote server) 29 | */ 30 | /* The CONN consists of the 8 least significant bits (lsbs) */ 31 | #define WRID_GET_CONN(wrid) (uint8_t)((wrid) & (0xFF)) 32 | #define WRID_SET_CONN(wrid, conn) (wrid) = (conn | ((wrid >> 8) << 8)) 33 | /* The TAG flag is the 9th lsb */ 34 | #define WRID_GET_TAG(wrid) ((wrid) & (1 << 8)) 35 | #define WRID_SET_TAG(wrid) (wrid) |= 1 << 8 36 | #define WRID_UNSET_TAG(wrid) (wrid) &= ~(1 << 8) 37 | /* The WA flag is the 10th lsb */ 38 | #define WRID_GET_WA(wrid) ((wrid) & (1 << 9)) 39 | #define WRID_SET_WA(wrid) (wrid) |= 1 << 9 40 | #define WRID_UNSET_WA(wrid) (wrid) &= ~(1 << 9) 41 | /* The SSN consists of the most significant 54 bits */ 42 | #define WRID_GET_SSN(wrid) ((wrid) >> 10) 43 | #define WRID_SET_SSN(wrid, ssn) (wrid) = (((ssn) << 10) | ((wrid) & 0x3FF)) 44 | 45 | #define PRINT_WRID(wrid) info(log_fp, \ 46 | " [%010"PRIu64"|%d|%d|%03"PRIu8"] ", \ 47 | WRID_GET_SSN(wrid), \ 48 | (WRID_GET_WA(wrid) ? 1 : 0), \ 49 | (WRID_GET_TAG(wrid) ? 
1 : 0), \ 50 | WRID_GET_CONN(wrid)) 51 | #define PRINT_WRID_(wrid) PRINT_WRID(wrid); info(log_fp, "\n"); 52 | 53 | int rc_init(); 54 | void rc_free(); 55 | 56 | /* Start up */ 57 | int rc_get_replicated_vote(); 58 | int rc_send_sm_request(); 59 | int rc_send_sm_reply( uint8_t idx, void *s, int reg_mem ); 60 | int rc_recover_sm( uint8_t idx ); 61 | int rc_recover_log(); 62 | 63 | /* HB mechanism */ 64 | int rc_send_hb(); 65 | int rc_send_hb_reply( uint8_t idx ); 66 | 67 | /* Leader election */ 68 | int rc_send_vote_request(); 69 | int rc_replicate_vote(); 70 | int rc_send_vote_ack(); 71 | 72 | /* Normal operation */ 73 | int rc_verify_leadership( int *leader ); 74 | int rc_write_remote_logs( int wait_for_commit ); 75 | int rc_send_entries_reply( uint8_t idx ); 76 | int rc_get_remote_apply_offsets(); 77 | 78 | /* QP interface */ 79 | int rc_disconnect_server( uint8_t idx ); 80 | int rc_connect_server( uint8_t idx, int qp_id ); 81 | int rc_revoke_log_access(); 82 | int rc_restore_log_access(); 83 | 84 | /* LogGP */ 85 | double rc_get_loggp_params( uint32_t size, int type, int *poll_count, int write, int inline_flag ); 86 | double rc_loggp_prtt( int n, double delay, uint32_t size ); 87 | int rc_loggp_exit(); 88 | 89 | int rc_print_qp_state( void *data ); 90 | void rc_ib_send_msg(); 91 | #endif /* DARE_IBV_RC_H */ 92 | -------------------------------------------------------------------------------- /src/include/dare/dare_ibv_ud.h: -------------------------------------------------------------------------------- 1 | /** 2 | * DARE (Direct Access REplication) 3 | * 4 | * Unreliable Datagrams (UD) over InfiniBand 5 | * 6 | * Copyright (c) 2016 HLRS, University of Stuttgart. All rights reserved. 7 | * 8 | * Copyright (c) 2014-2015 ETH-Zurich. All rights reserved. 
9 | * 10 | * Author(s): Marius Poke 11 | * Nakul Vyas 12 | * 13 | */ 14 | 15 | #ifndef DARE_IBV_UD_H 16 | #define DARE_IBV_UD_H 17 | 18 | #include /* OFED stuff */ 19 | #include "./dare_sm.h" 20 | #include "./dare_ibv.h" 21 | #include "./dare_config.h" 22 | #include "./dare_ep_db.h" 23 | 24 | #define REQ_MAJORITY 13 25 | #define MCG_GID {255,1,0,0,0,2,201,133,0,0,0,0,0,0,0,0} 26 | 27 | /* ================================================================== */ 28 | /* UD messages */ 29 | struct ud_hdr_t { 30 | uint64_t id; 31 | uint8_t type; 32 | union ibv_gid gid; 33 | //uint8_t pad[7]; 34 | uint16_t slid; 35 | }; 36 | typedef struct ud_hdr_t ud_hdr_t; 37 | 38 | struct client_req_t { 39 | ud_hdr_t hdr; 40 | sm_cmd_t cmd; 41 | }; 42 | typedef struct client_req_t client_req_t; 43 | 44 | struct client_rep_t { 45 | ud_hdr_t hdr; 46 | sm_data_t data; 47 | }; 48 | typedef struct client_rep_t client_rep_t; 49 | 50 | struct reconf_req_t { 51 | ud_hdr_t hdr; 52 | uint8_t idx_size; 53 | }; 54 | typedef struct reconf_req_t reconf_req_t; 55 | 56 | struct reconf_rep_t { 57 | ud_hdr_t hdr; 58 | uint8_t idx; 59 | dare_cid_t cid; 60 | uint64_t cid_idx; 61 | uint64_t head; 62 | }; 63 | typedef struct reconf_rep_t reconf_rep_t; 64 | 65 | struct rc_syn_t { 66 | ud_hdr_t hdr; 67 | rem_mem_t log_rm; 68 | rem_mem_t ctrl_rm; 69 | enum ibv_mtu mtu; 70 | //union ibv_gid gid; 71 | uint8_t idx; 72 | uint8_t size; 73 | uint8_t data[0]; // log & ctrl QPNs 74 | }; 75 | typedef struct rc_syn_t rc_syn_t; 76 | 77 | struct rc_ack_t { 78 | ud_hdr_t hdr; 79 | uint8_t idx; 80 | }; 81 | typedef struct rc_ack_t rc_ack_t; 82 | 83 | extern char* global_mgid; 84 | 85 | /* ================================================================== */ 86 | 87 | int ud_init( uint32_t receive_count ); 88 | int ud_start(); 89 | void ud_shutdown(); 90 | 91 | struct ibv_ah* ud_ah_create( uint16_t dlid, union ibv_gid dgid ); 92 | void ud_ah_destroy( struct ibv_ah* ah ); 93 | 94 | void get_tailq_message(); 95 | uint8_t 
ud_get_message(); 96 | int ud_join_cluster(); 97 | int ud_exchange_rc_info(); 98 | int ud_update_rc_info(); 99 | int ud_discover_servers(); 100 | int ud_establish_rc(); 101 | 102 | /* Client stuff */ 103 | int ud_send_clt_reply( uint16_t lid, uint64_t req_id, uint8_t type ); 104 | void ud_clt_answer_read_request(dare_ep_t *ep); 105 | 106 | #endif /* DARE_IBV_UD_H */ 107 | -------------------------------------------------------------------------------- /src/include/dare/dare_kvs_sm.h: -------------------------------------------------------------------------------- 1 | /** 2 | * DARE (Direct Access REplication) 3 | * 4 | * State machine implementation (KVS) 5 | * 6 | * Copyright (c) 2014-2015 ETH-Zurich. All rights reserved. 7 | * 8 | * Author(s): Marius Poke 9 | * 10 | */ 11 | 12 | #ifndef DARE_KVS_SM_H 13 | #define DARE_KVS_SM_H 14 | 15 | #define __STDC_FORMAT_MACROS 16 | #include 17 | 18 | #include "./dare_sm.h" 19 | 20 | #define DEFAULT_KVS_SIZE 1024 21 | #define KEY_SIZE 64 22 | 23 | /* KVS commands */ 24 | #define KVS_PUT 1 25 | #define KVS_GET 2 26 | #define KVS_RM 3 27 | 28 | /* KVS command */ 29 | struct kvs_cmd_t { 30 | uint8_t type; // read, write, delete 31 | char key[KEY_SIZE]; 32 | uint16_t len; 33 | uint8_t data[0]; 34 | }; 35 | typedef struct kvs_cmd_t kvs_cmd_t; 36 | 37 | struct kvs_blob_t { 38 | uint16_t len; 39 | void *data; 40 | }; 41 | typedef struct kvs_blob_t kvs_blob_t; 42 | 43 | struct kvs_entry_t { 44 | char key[KEY_SIZE]; 45 | kvs_blob_t blob; 46 | }; 47 | typedef struct kvs_entry_t kvs_entry_t; 48 | 49 | #endif /* DARE_KVS_SM_H */ 50 | -------------------------------------------------------------------------------- /src/include/dare/dare_server.h: -------------------------------------------------------------------------------- 1 | /** 2 | * DARE (Direct Access REplication) 3 | * 4 | * Implementation of a DARE server 5 | * 6 | * Copyright (c) 2014-2015 ETH-Zurich. All rights reserved. 
/**
 * The state identifier (SID)
 * the SID is a 64-bit value [TERM|L|IDX], where
 *   TERM is the current term
 *   L    is the leader flag, set when there is a leader
 *   IDX  is the index of the server that caused the last SID update
 *
 * Every macro below parenthesizes its arguments and its whole expansion, so
 * it can be used with arbitrary expressions (e.g. "cond ? a : b") and inside
 * larger expressions. The setters still modify their first argument in place,
 * which must therefore be a 64-bit lvalue.
 */
/* The IDX consists of the 8 least significant bits (lsbs) */
#define SID_GET_IDX(sid) ((uint8_t)((sid) & 0xFF))
#define SID_SET_IDX(sid, idx) ((sid) = ((uint64_t)(idx) | (((sid) >> 8) << 8)))
/* The L flag is the 9th lsb */
#define SID_GET_L(sid) ((sid) & ((uint64_t)1 << 8))
#define SID_SET_L(sid) ((sid) |= ((uint64_t)1 << 8))
#define SID_UNSET_L(sid) ((sid) &= ~((uint64_t)1 << 8))
/* The TERM consists of the most significant 55 bits */
#define SID_GET_TERM(sid) ((sid) >> 9)
/* The term is widened before shifting so a narrower argument keeps all its bits */
#define SID_SET_TERM(sid, term) ((sid) = (((uint64_t)(term) << 9) | ((sid) & 0x1FF)))
1 : 0), \ 67 | SID_GET_IDX(sid)) 68 | #define PRINT_SID_(sid) PRINT_SID(sid); text(log_fp, "\n"); 69 | 70 | #define IS_SID_NEW(sid) (!SID_GET_L(sid) && (SID_GET_TERM(sid) == 0)) 71 | #define SID_NULL 0xFF 72 | #define SID_DEAD 0xFFFFFFFFFFFFFFFF 73 | 74 | /* Number of fail communication attempts before considering a remote 75 | server as permanently failed */ 76 | #define PERMANENT_FAILURE 2 77 | 78 | /* Normal operation (log replication) steps */ 79 | #define LR_GET_WRITE 1 80 | #define LR_GET_NCE_LEN 2 81 | #define LR_GET_NCE 3 82 | #define LR_SET_END 4 83 | #define LR_UPDATE_LOG 5 84 | #define LR_UPDATE_END 6 85 | 86 | struct server_t { 87 | uint64_t next_wr_id; // next WR ID to wait for 88 | uint64_t cached_end_offset; // the new end offset if the log update succeeds 89 | uint64_t last_get_read_ssn; // ssn of the last get read operation 90 | void *ep; // endpoint data (network related) 91 | uint8_t fail_count; // number of failures detected 92 | uint8_t next_lr_step; // next log replication step 93 | uint8_t send_flag; // flag set for posting send for this EP 94 | uint8_t send_count; // number of sends poster for current step 95 | }; 96 | 97 | //typedef struct server_t server_t; 98 | 99 | struct vote_req_t { 100 | uint64_t sid; 101 | uint64_t index; 102 | uint64_t term; 103 | dare_cid_t cid; 104 | }; 105 | typedef struct vote_req_t vote_req_t; 106 | 107 | struct prv_data_t { 108 | uint64_t vote_sid; // SID of last vote given 109 | // on recovery need to retrieve this from a 110 | // remote server and update own SID to at least 111 | // this SID 112 | }; 113 | typedef struct prv_data_t prv_data_t; 114 | 115 | struct sm_rep_t { 116 | uint64_t sid; 117 | uint64_t raddr; 118 | uint32_t rkey; 119 | uint32_t len; 120 | }; 121 | typedef struct sm_rep_t sm_rep_t; 122 | 123 | struct ctrl_data_t { 124 | /* State identified (SID) */ 125 | uint64_t sid; 126 | 127 | /* DARE arrays */ 128 | vote_req_t vote_req[MAX_SERVER_COUNT]; /* vote requests */ 129 | log_offsets_t 
log_offsets[MAX_SERVER_COUNT]; /* log offsets */ 130 | sm_rep_t sm_rep[MAX_SERVER_COUNT]; 131 | uint64_t sm_req[MAX_SERVER_COUNT]; 132 | uint64_t hb[MAX_SERVER_COUNT]; /* heartbeat array */ 133 | uint64_t vote_ack[MAX_SERVER_COUNT]; 134 | uint64_t rsid[MAX_SERVER_COUNT]; /* for remote terms & indexes */ 135 | uint64_t apply_offsets[MAX_SERVER_COUNT]; /* apply offsets */ 136 | 137 | /* Remote private data */ 138 | prv_data_t prv_data[MAX_SERVER_COUNT]; // private data 139 | }; 140 | typedef struct ctrl_data_t ctrl_data_t; 141 | 142 | struct dare_server_input_t { 143 | FILE* log; 144 | char* name; 145 | char* output; 146 | uint8_t srv_type; 147 | uint8_t sm_type; 148 | uint8_t group_size; 149 | uint8_t server_idx; 150 | 151 | proxy_do_action_cb_t do_action; 152 | proxy_store_cmd_cb_t store_cmd; 153 | proxy_create_db_snapshot_cb_t create_db_snapshot; 154 | proxy_get_db_size_cb_t get_db_size; 155 | proxy_apply_db_snapshot_cb_t apply_db_snapshot; 156 | proxy_update_state_cb_t update_state; 157 | char config_path[128]; 158 | void* up_para; 159 | }; 160 | typedef struct dare_server_input_t dare_server_input_t; 161 | 162 | struct dare_loggp_t { 163 | double o[2], 164 | o_ninline, 165 | o_poll, 166 | o_poll_x, 167 | L[2], 168 | G[3]; 169 | }; 170 | typedef struct dare_loggp_t dare_loggp_t; 171 | 172 | struct dare_server_data_t { 173 | dare_server_input_t *input; 174 | 175 | server_config_t config; // configuration 176 | 177 | ctrl_data_t *ctrl_data; // control data (state & private data) 178 | dare_log_t *log; // local log (remotely accessible) 179 | dare_sm_t *sm; // local state machine 180 | snapshot_t *prereg_snapshot; 181 | snapshot_t *snapshot; 182 | 183 | struct rb_root endpoints; // RB-tree with remote endpoints 184 | uint64_t last_write_csm_idx; 185 | uint64_t last_cmt_write_csm_idx; 186 | 187 | struct ev_loop *loop; // loop for EV library 188 | 189 | FILE* output_fp; 190 | dare_loggp_t loggp; 191 | 192 | HRT_TIMESTAMP_T t1, t2; 193 | }; 194 | typedef struct 
dare_server_data_t dare_server_data_t; 195 | /* ================================================================== */ 196 | 197 | void *dare_server_init( void *arg ); 198 | void dare_server_shutdown(); 199 | 200 | void server_to_follower(); 201 | int server_update_sid( uint64_t new_sid, uint64_t old_sid ); 202 | int is_leader(); 203 | uint8_t get_node_id(); 204 | 205 | #endif /* DARE_SERVER_H */ 206 | -------------------------------------------------------------------------------- /src/include/dare/dare_sm.h: -------------------------------------------------------------------------------- 1 | /** 2 | * DARE (Direct Access REplication) 3 | * 4 | * State machine abstraction 5 | * 6 | * Copyright (c) 2014-2015 ETH-Zurich. All rights reserved. 7 | * 8 | * Author(s): Marius Poke 9 | * 10 | */ 11 | 12 | #ifndef DARE_SM_H 13 | #define DARE_SM_H 14 | 15 | #include "./dare_kvs_sm.h" 16 | 17 | /* SM types */ 18 | #define SM_NULL 1 19 | #define SM_KVS 2 20 | #define SM_FS 3 21 | 22 | /* SM command - can be interpreted only by the SM */ 23 | struct sm_cmd_t { 24 | uint16_t len; 25 | uint8_t cmd[0]; 26 | }; 27 | typedef struct sm_cmd_t sm_cmd_t; 28 | 29 | /* SM data - as answer to a command */ 30 | struct sm_data_t { 31 | uint16_t len; 32 | uint8_t data[0]; 33 | }; 34 | typedef struct sm_data_t sm_data_t; 35 | typedef struct dare_sm_t dare_sm_t; 36 | 37 | /* Destroy the state machine */ 38 | typedef void (*destroy_cb_t)(dare_sm_t *sm); 39 | /* Apply a command to the state machine */ 40 | typedef int (*apply_cmd_cb_t)(dare_sm_t *sm, sm_cmd_t *cmd, sm_data_t *data); 41 | 42 | typedef void (*proxy_store_cmd_cb_t)(void* data,void *arg); 43 | typedef void (*proxy_do_action_cb_t)(uint16_t clt_id,uint8_t type,size_t data_size,void* data,void *arg); 44 | typedef void (*proxy_create_db_snapshot_cb_t)(void *snapshot,void *arg); 45 | typedef uint32_t (*proxy_get_db_size_cb_t)(void *arg); 46 | typedef int (*proxy_apply_db_snapshot_cb_t)(void *snapshot,uint32_t size,void *arg); 47 | typedef 
void (*proxy_update_state_cb_t)(void *arg); 48 | 49 | struct dare_sm_t { 50 | destroy_cb_t destroy; 51 | apply_cmd_cb_t apply_cmd; 52 | 53 | proxy_store_cmd_cb_t proxy_store_cmd; 54 | proxy_do_action_cb_t proxy_do_action; 55 | proxy_get_db_size_cb_t proxy_get_db_size; 56 | proxy_create_db_snapshot_cb_t proxy_create_db_snapshot; 57 | proxy_apply_db_snapshot_cb_t proxy_apply_db_snapshot; 58 | proxy_update_state_cb_t proxy_update_state; 59 | void* up_para; 60 | }; 61 | 62 | /* ================================================================== */ 63 | 64 | dare_sm_t* create_kvs_sm( uint32_t size ); 65 | 66 | 67 | #endif /* DARE_SM_H */ 68 | -------------------------------------------------------------------------------- /src/include/dare/debug.h: -------------------------------------------------------------------------------- 1 | /** 2 | * DARE (Direct Access REplication) 3 | * 4 | * Debugging and logging utilities 5 | * 6 | * Copyright (c) 2014-2015 ETH-Zurich. All rights reserved. 7 | * 8 | * Author(s): Marius Poke 9 | * 10 | */ 11 | 12 | #ifndef DEBUG_H_ 13 | #define DEBUG_H_ 14 | 15 | #include 16 | #include 17 | #define __STDC_FORMAT_MACROS 18 | #include 19 | #include 20 | 21 | //extern struct timeval prev_tv; 22 | //extern uint64_t jump_cnt; 23 | 24 | #define info(stream, fmt, ...) do {\ 25 | fprintf(stream, fmt, ##__VA_ARGS__); \ 26 | fflush(stream); \ 27 | } while(0) 28 | #define info_wtime(stream, fmt, ...) do {\ 29 | struct timeval _debug_tv;\ 30 | gettimeofday(&_debug_tv,NULL);\ 31 | /* if (prev_tv.tv_sec != 0) { \ 32 | double __tmp = (_debug_tv.tv_sec - prev_tv.tv_sec) * 1000 + (_debug_tv.tv_usec - prev_tv.tv_usec)/1000;\ 33 | if (__tmp > 15) {\ 34 | jump_cnt++;\ 35 | fprintf(stream, "Time jump (%lf) ms %"PRIu64"\n", __tmp, jump_cnt);\ 36 | }\ 37 | }*/\ 38 | fprintf(stream, "[%lu:%06lu] " fmt, _debug_tv.tv_sec, _debug_tv.tv_usec, ##__VA_ARGS__); \ 39 | fflush(stream); \ 40 | } while(0) 41 | 42 | #ifdef DEBUG 43 | #define debug(stream, fmt, ...) 
do {\ 44 | struct timeval _debug_tv;\ 45 | gettimeofday(&_debug_tv,NULL);\ 46 | fprintf(stream, "[DEBUG %lu:%lu] %s/%d/%s() " fmt, _debug_tv.tv_sec, _debug_tv.tv_usec, __FILE__, __LINE__, __func__, ##__VA_ARGS__); \ 47 | fflush(stream); \ 48 | } while(0) 49 | #define text(stream, fmt, ...) do {\ 50 | fprintf(stream, fmt, ##__VA_ARGS__); \ 51 | fflush(stream); \ 52 | } while(0) 53 | #define text_wtime(stream, fmt, ...) do {\ 54 | struct timeval _debug_tv;\ 55 | gettimeofday(&_debug_tv,NULL);\ 56 | fprintf(stream, "[%lu:%lu] " fmt, _debug_tv.tv_sec, _debug_tv.tv_usec, ##__VA_ARGS__); \ 57 | fflush(stream); \ 58 | } while(0) 59 | #else 60 | #define debug(stream, fmt, ...) 61 | #define text(stream, fmt, ...) 62 | #define text_wtime(stream, fmt, ...) 63 | #endif 64 | 65 | //#ifdef DEBUG 66 | #define error(stream, fmt, ...) do { \ 67 | fprintf(stream, "[ERROR] %s/%d/%s() " fmt, __FILE__, __LINE__, __func__, ##__VA_ARGS__); \ 68 | fflush(stream); \ 69 | } while(0) 70 | //#else 71 | //#define error(stream, fmt, ...) 72 | //#endif 73 | 74 | //#ifdef DEBUG 75 | #define error_return(rc, stream, fmt, ...) do { \ 76 | fprintf(stream, "[ERROR] %s/%d/%s() " fmt, __FILE__, __LINE__, __func__, ##__VA_ARGS__); \ 77 | fflush(stream); \ 78 | return (rc); \ 79 | } while(0) 80 | //#else 81 | //#define error_return(rc, stream, fmt, ...) return (rc) 82 | //#endif 83 | 84 | //#ifdef DEBUG 85 | #define error_exit(rc, stream, fmt, ...) do { \ 86 | fprintf(stream, "[ERROR] %s/%d/%s() " fmt, __FILE__, __LINE__, __func__, ##__VA_ARGS__); \ 87 | fflush(stream); \ 88 | exit(rc); \ 89 | } while(0) 90 | //#else 91 | //#define error_exit(rc, stream, fmt, ...) 
exit(rc) 92 | //#endif 93 | 94 | #ifndef DEBUG 95 | #define dump_bytes(stream, addr, len, header) do { \ 96 | uint32_t _i; \ 97 | uint8_t *bytes = (uint8_t*)addr; \ 98 | info(stream, "### %s: [" , header); \ 99 | for (_i = 0; _i < (uint32_t)(len); _i++) { \ 100 | info(stream, "%"PRIu8", ", bytes[_i]); \ 101 | } \ 102 | info(stream, "]\n"); \ 103 | } while(0) 104 | #else 105 | #define dump_bytes(stream, addr, len, header) 106 | #endif 107 | 108 | extern FILE *log_fp; 109 | 110 | #endif /* DEBUG_H_ */ 111 | 112 | -------------------------------------------------------------------------------- /src/include/dare/message.h: -------------------------------------------------------------------------------- 1 | #ifndef MESSAGE_H 2 | #define MESSAGE_H 3 | #include 4 | 5 | struct tailq_cmd_t { 6 | uint16_t len; 7 | uint8_t cmd[87380]; 8 | }; 9 | typedef struct tailq_cmd_t tailq_cmd_t; 10 | 11 | struct tailq_entry_t { 12 | uint8_t type; 13 | uint16_t connection_id; 14 | uint64_t req_id; 15 | tailq_cmd_t cmd; 16 | TAILQ_ENTRY(tailq_entry_t) entries; 17 | }; 18 | typedef struct tailq_entry_t tailq_entry_t; 19 | 20 | TAILQ_HEAD(, tailq_entry_t) tailhead; 21 | 22 | pthread_spinlock_t tailq_lock; 23 | 24 | #endif -------------------------------------------------------------------------------- /src/include/dare/timer.h: -------------------------------------------------------------------------------- 1 | /** 2 | * DARE (Direct Access REplication) 3 | * 4 | * Timer implementation 5 | * 6 | * Copyright (c) 2014-2015 ETH-Zurich. All rights reserved. 7 | * 8 | * Copyright (c) 2009 The Trustees of Indiana University and Indiana 9 | * University Research and Technology 10 | * Corporation. All rights reserved. 
11 | * 12 | * Author(s): Torsten Hoefler 13 | */ 14 | 15 | #ifndef TIMER_H_ 16 | #define TIMER_H_ 17 | 18 | #include "./debug.h" 19 | 20 | #define UINT32_T uint32_t 21 | #define UINT64_T uint64_t 22 | 23 | #define HRT_CALIBRATE(freq) do { \ 24 | static volatile HRT_TIMESTAMP_T t1, t2; \ 25 | static volatile UINT64_T elapsed_ticks, min = (UINT64_T)(~0x1); \ 26 | int notsmaller=0; \ 27 | while(notsmaller<3) { \ 28 | HRT_GET_TIMESTAMP(t1); \ 29 | sleep(1); \ 30 | /* nanosleep((struct timespec[]){{0, 10000000}}, NULL); */ \ 31 | HRT_GET_TIMESTAMP(t2); \ 32 | HRT_GET_ELAPSED_TICKS(t1, t2, &elapsed_ticks); \ 33 | notsmaller++; \ 34 | if(elapsed_ticks 4 | #include 5 | 6 | typedef struct db_t db; 7 | 8 | db* initialize_db(const char* db_name,uint32_t flag); 9 | 10 | void close_db(db*,uint32_t); 11 | 12 | int store_record(db*,size_t,void*); 13 | 14 | // the caller is responsible to release the memory 15 | 16 | void dump_records(db*,void*); 17 | uint32_t get_records_len(); 18 | #endif 19 | -------------------------------------------------------------------------------- /src/include/proxy/proxy.h: -------------------------------------------------------------------------------- 1 | #ifndef PROXY_H 2 | #define PROXY_H 3 | 4 | #include "../util/common-header.h" 5 | #include "../rsm-interface.h" 6 | #include "../../../utils/uthash/uthash.h" 7 | #include "../db/db-interface.h" 8 | #include 9 | 10 | #define CONNECT 4 11 | #define SEND 5 12 | #define CLOSE 6 13 | 14 | typedef uint16_t hk_t; 15 | typedef uint8_t nc_t; 16 | typedef uint8_t nid_t; 17 | 18 | struct list_entry_t { 19 | pthread_t tid; 20 | LIST_ENTRY(list_entry_t) entries; 21 | }; 22 | typedef struct list_entry_t list_entry_t; 23 | 24 | LIST_HEAD(, list_entry_t) listhead; 25 | 26 | typedef struct proxy_address_t{ 27 | struct sockaddr_in s_addr; 28 | size_t s_sock_len; 29 | }proxy_address; 30 | 31 | typedef struct socket_pair_t{ 32 | int clt_id; 33 | uint64_t req_id; 34 | uint16_t connection_id; 35 | int p_s; 36 | 37 | 
/*
 * Header that starts every proxy record. `action` selects the record kind
 * (CONNECT, SEND or CLOSE); `connection_id` names the connection it belongs to.
 */
typedef struct proxy_msg_header_t{
    uint16_t connection_id;
    uint8_t action;
}proxy_msg_header;
#define PROXY_MSG_HEADER_SIZE (sizeof(proxy_msg_header))

/* CONNECT record: header only. */
typedef struct proxy_connect_msg_t{
    proxy_msg_header header;
}proxy_connect_msg;
#define PROXY_CONNECT_MSG_SIZE (sizeof(proxy_connect_msg))

/* NOTE(review): presumably mirrors the layout of dare_cid_t - confirm against dare.h */
struct fake_dare_cid_t {
    uint64_t epoch;
    uint8_t size[2];
    uint8_t state;
    uint8_t pad[1];
    uint32_t bitmask;
};
typedef struct fake_dare_cid_t fake_dare_cid_t;

/* Same field layout as sm_cmd_t: a 16-bit length followed by the payload bytes. */
struct fake_sm_cmd_t {
    uint16_t len;
    uint8_t cmd[0];
};
typedef struct fake_sm_cmd_t fake_sm_cmd_t;

/* SEND record: header followed by a union; the payload trails data.cmd. */
typedef struct proxy_send_msg_t{
    proxy_msg_header header;
    union {
        fake_sm_cmd_t cmd;
        fake_dare_cid_t cid;
        uint64_t head;
    } data;
}proxy_send_msg;
/*
 * Total size of a SEND record: the fixed struct plus data.cmd.len payload
 * bytes. M is a pointer to proxy_send_msg; it is parenthesized so that
 * expressions such as `p + 1` or `&array[i]` expand correctly.
 */
#define PROXY_SEND_MSG_SIZE(M) ((M)->data.cmd.len+sizeof(proxy_send_msg))

/* CLOSE record: header only. */
typedef struct proxy_close_msg_t{
    proxy_msg_header header;
}proxy_close_msg;
#define PROXY_CLOSE_MSG_SIZE (sizeof(proxy_close_msg))
/*
 * Timestamped logging helpers.
 *
 * debug_log / err_log print "<sec>.<usec>:" followed by the printf-style
 * arguments to stderr; rec_log does the same to an arbitrary stream and
 * flushes it. Each expands to a single `do { } while(0)` statement WITHOUT a
 * trailing semicolon, so the caller's own `;` terminates it and the macros
 * are safe in unbraced if/else bodies.
 *
 * The timestamp fields are cast to unsigned long to match the "%lu"
 * conversions, and the macro-local variable has a name unlikely to collide
 * with an identifier used in the caller's arguments.
 */
#define debug_log(args...) do { \
    struct timeval log_tv_; \
    gettimeofday(&log_tv_, NULL); \
    fprintf(stderr, "%lu.%06lu:", (unsigned long)log_tv_.tv_sec, (unsigned long)log_tv_.tv_usec); \
    fprintf(stderr, args); \
} while(0)

#define err_log(args...) do { \
    struct timeval log_tv_; \
    gettimeofday(&log_tv_, NULL); \
    fprintf(stderr, "%lu.%06lu:", (unsigned long)log_tv_.tv_sec, (unsigned long)log_tv_.tv_usec); \
    fprintf(stderr, args); \
} while(0)

#define rec_log(out,args...) do { \
    struct timeval log_tv_; \
    gettimeofday(&log_tv_, NULL); \
    fprintf((out), "%lu.%06lu:", (unsigned long)log_tv_.tv_sec, (unsigned long)log_tv_.tv_usec); \
    fprintf((out), args); \
    fflush((out)); \
} while(0)

/*
 * rec_log guarded by a NULL check on the stream. This deliberately stays a
 * brace-enclosed compound statement: SYS_LOG invokes it without a trailing
 * semicolon, which a do/while(0) form would turn into a syntax error.
 * Note that x is evaluated more than once.
 */
#define safe_rec_log(x,args...) {if(NULL!=(x)){rec_log((x),args);}}
{if((x)->sys_log){safe_rec_log(((x)->sys_log_file),args)}} 31 | 32 | #endif 33 | -------------------------------------------------------------------------------- /src/proxy/proxy.c: -------------------------------------------------------------------------------- 1 | #include "../include/proxy/proxy.h" 2 | #include "../include/config-comp/config-proxy.h" 3 | #include 4 | #include 5 | #include "../include/dare/dare_server.h" 6 | #include "../include/dare/message.h" 7 | #define __STDC_FORMAT_MACROS 8 | 9 | static void stablestorage_save_request(void* data,void*arg); 10 | static void stablestorage_dump_records(void*buf,void*arg); 11 | static uint32_t stablestorage_get_records_len(void*arg); 12 | static int stablestorage_load_records(void*buf,uint32_t size,void*arg); 13 | static void update_highest_rec(void*arg); 14 | static void do_action_to_server(uint16_t clt_id,uint8_t type,size_t data_size,void* data,void *arg); 15 | static void do_action_send(uint16_t clt_id,size_t data_size,void* data,void* arg); 16 | static void do_action_connect(uint16_t clt_id,void* arg); 17 | static void do_action_close(uint16_t clt_id,void* arg); 18 | static int set_socket_blocking(int fd, int blocking); 19 | 20 | FILE *log_fp; 21 | 22 | int dare_main(proxy_node* proxy, const char* config_path) 23 | { 24 | int rc; 25 | dare_server_input_t *input = (dare_server_input_t*)malloc(sizeof(dare_server_input_t)); 26 | memset(input, 0, sizeof(dare_server_input_t)); 27 | input->log = stdout; 28 | input->name = ""; 29 | input->output = "dare_servers.out"; 30 | input->srv_type = SRV_TYPE_START; 31 | input->sm_type = CLT_KVS; 32 | input->server_idx = 0xFF; 33 | char *server_idx = getenv("server_idx"); 34 | if (server_idx != NULL) 35 | input->server_idx = (uint8_t)atoi(server_idx); 36 | input->group_size = 3; 37 | char *group_size = getenv("group_size"); 38 | if (group_size != NULL) 39 | input->group_size = (uint8_t)atoi(group_size); 40 | 41 | input->do_action = do_action_to_server; 42 | input->store_cmd 
= stablestorage_save_request; 43 | input->get_db_size = stablestorage_get_records_len; 44 | input->create_db_snapshot = stablestorage_dump_records; 45 | input->apply_db_snapshot = stablestorage_load_records; 46 | input->update_state = update_highest_rec; 47 | memcpy(input->config_path, config_path, strlen(config_path)); 48 | input->up_para = proxy; 49 | static int srv_type = SRV_TYPE_START; 50 | 51 | const char *server_type = getenv("server_type"); 52 | if (server_type != NULL) { 53 | if (strcmp(server_type, "join") == 0) { 54 | srv_type = SRV_TYPE_JOIN; 55 | } 56 | } 57 | char *dare_log_file = getenv("dare_log_file"); 58 | if (dare_log_file == NULL) 59 | dare_log_file = ""; 60 | 61 | input->srv_type = srv_type; 62 | 63 | if (strcmp(dare_log_file, "") != 0) { 64 | input->log = fopen(dare_log_file, "w+"); 65 | if (input->log==NULL) { 66 | printf("Cannot open log file\n"); 67 | exit(1); 68 | } 69 | } 70 | if (SRV_TYPE_START == input->srv_type) { 71 | if (0xFF == input->server_idx) { 72 | printf("A server cannot start without an index\n"); 73 | exit(1); 74 | } 75 | } 76 | pthread_t dare_thread; 77 | rc = pthread_create(&dare_thread, NULL, &dare_server_init, input); 78 | if (0 != rc) { 79 | fprintf(log_fp, "Cannot init dare_thread\n"); 80 | return 1; 81 | } 82 | 83 | list_entry_t *n1 = malloc(sizeof(list_entry_t)); 84 | n1->tid = dare_thread; 85 | LIST_INSERT_HEAD(&listhead, n1, entries); 86 | //fclose(log_fp); 87 | 88 | return 0; 89 | } 90 | 91 | static int is_inner(pthread_t tid) 92 | { 93 | list_entry_t *np; 94 | LIST_FOREACH(np, &listhead, entries) { 95 | if (np->tid == tid) 96 | return 1; 97 | } 98 | return 0; 99 | } 100 | 101 | static hk_t gen_key(nid_t node_id,nc_t node_count){ 102 | hk_t key = 0; 103 | key |= ((hk_t)node_id<<8); 104 | key |= (hk_t)node_count; 105 | return key; 106 | } 107 | 108 | static void leader_handle_submit_req(uint8_t type, ssize_t data_size, void* buf, int clt_id, proxy_node* proxy) 109 | { 110 | socket_pair* pair = NULL; 111 | uint64_t 
req_id; 112 | uint16_t connection_id; 113 | 114 | pthread_spin_lock(&tailq_lock); 115 | uint64_t cur_rec = ++proxy->cur_rec; 116 | switch(type) { 117 | case CONNECT: 118 | pair = (socket_pair*)malloc(sizeof(socket_pair)); 119 | memset(pair,0,sizeof(socket_pair)); 120 | pair->clt_id = clt_id; 121 | pair->req_id = 0; 122 | nid_t node_id = get_node_id(); 123 | pair->connection_id = gen_key(node_id, proxy->pair_count++); 124 | 125 | req_id = ++pair->req_id; 126 | connection_id = pair->connection_id; 127 | 128 | HASH_ADD_INT(proxy->leader_hash_map, clt_id, pair); 129 | break; 130 | case SEND: 131 | HASH_FIND_INT(proxy->leader_hash_map, &clt_id, pair); 132 | 133 | req_id = ++pair->req_id; 134 | connection_id = pair->connection_id; 135 | 136 | socket_pair* replaced_pair = NULL; 137 | HASH_REPLACE_INT(proxy->leader_hash_map, clt_id, pair, replaced_pair); 138 | break; 139 | case CLOSE: 140 | HASH_FIND_INT(proxy->leader_hash_map, &clt_id, pair); 141 | 142 | req_id = ++pair->req_id; 143 | connection_id = pair->connection_id; 144 | 145 | HASH_DEL(proxy->leader_hash_map, pair); 146 | break; 147 | } 148 | 149 | tailq_entry_t* n2 = (tailq_entry_t*)malloc(sizeof(tailq_entry_t)); 150 | n2->req_id = req_id; 151 | n2->connection_id = connection_id; 152 | n2->type = type; 153 | n2->cmd.len = data_size; 154 | if (data_size) 155 | memcpy(n2->cmd.cmd, buf, data_size); 156 | TAILQ_INSERT_TAIL(&tailhead, n2, entries); 157 | 158 | pthread_spin_unlock(&tailq_lock); 159 | 160 | while (cur_rec > proxy->highest_rec); 161 | } 162 | 163 | static void get_socket_buffer_size(int sockfd) 164 | { 165 | /* 166 | * TCP provides flow control. TCP always tells its peer exactly 167 | * how many bytes of data it is willing to accept from the peer 168 | * at any one time. This is called the advertised window. 169 | * At any time, the window is the amount of room currently available 170 | * in the receive buffer, guaranteeing that the sender cannot 171 | * overflow the receiver buffer. 
/**
 * Switch a descriptor between blocking and non-blocking mode.
 *
 * @param fd        descriptor to modify.
 * @param blocking  non-zero clears O_NONBLOCK, zero sets it.
 * @return 0 on success, -1 if either fcntl() call fails (a diagnostic is
 *         written to stderr and the descriptor flags are left untouched).
 */
static int set_socket_blocking(int fd, int blocking) {
    int flags = fcntl(fd, F_GETFL);

    if (flags == -1) {
        fprintf(stderr, "fcntl(F_GETFL): %s", strerror(errno));
        /* Do not feed the error value back into F_SETFL as a flag set */
        return -1;
    }

    if (blocking)
        flags &= ~O_NONBLOCK;
    else
        flags |= O_NONBLOCK;

    if (fcntl(fd, F_SETFL, flags) == -1) {
        fprintf(stderr, "fcntl(F_SETFL,O_NONBLOCK): %s", strerror(errno));
        return -1;
    }
    return 0;
}
223 | */ 224 | if (setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, timeout, sizeof(struct timeval)) < 0) { 225 | perror("set_socket_timeout"); 226 | } 227 | return 0; 228 | } 229 | 230 | void proxy_on_read(proxy_node* proxy, void* buf, ssize_t bytes_read, int fd) 231 | { 232 | if (is_inner(pthread_self())) 233 | return; 234 | 235 | if (is_leader()) 236 | leader_handle_submit_req(SEND, bytes_read, buf, fd, proxy); 237 | 238 | return; 239 | } 240 | 241 | void proxy_on_accept(proxy_node* proxy, int fd) 242 | { 243 | if (is_inner(pthread_self())) 244 | return; 245 | 246 | if (is_leader()) 247 | leader_handle_submit_req(CONNECT, 0, NULL, fd, proxy); 248 | 249 | return; 250 | } 251 | 252 | void proxy_on_close(proxy_node* proxy, int fd) 253 | { 254 | if (is_inner(pthread_self())) 255 | return; 256 | 257 | if (is_leader()) 258 | leader_handle_submit_req(CLOSE, 0, NULL, fd, proxy); 259 | 260 | return; 261 | } 262 | 263 | static void update_highest_rec(void*arg) 264 | { 265 | proxy_node* proxy = arg; 266 | proxy->highest_rec++; 267 | } 268 | 269 | static void stablestorage_save_request(void* data,void*arg) 270 | { 271 | proxy_node* proxy = arg; 272 | proxy_msg_header* header = (proxy_msg_header*)data; 273 | switch(header->action){ 274 | case CONNECT: 275 | { 276 | store_record(proxy->db_ptr,PROXY_CONNECT_MSG_SIZE,data); 277 | break; 278 | } 279 | case SEND: 280 | { 281 | proxy_send_msg* send_msg = (proxy_send_msg*)data; 282 | store_record(proxy->db_ptr,PROXY_SEND_MSG_SIZE(send_msg),data); 283 | break; 284 | } 285 | case CLOSE: 286 | { 287 | store_record(proxy->db_ptr,PROXY_CLOSE_MSG_SIZE,data); 288 | break; 289 | } 290 | } 291 | } 292 | 293 | static uint32_t stablestorage_get_records_len(void*arg) 294 | { 295 | proxy_node* proxy = arg; 296 | uint32_t records_len = get_records_len(proxy->db_ptr); 297 | return records_len; 298 | } 299 | 300 | static void stablestorage_dump_records(void*buf,void*arg) 301 | { 302 | proxy_node* proxy = arg; 303 | dump_records(proxy->db_ptr,buf); 304 | } 
305 | 306 | static int stablestorage_load_records(void*buf,uint32_t size,void*arg) 307 | { 308 | proxy_node* proxy = arg; 309 | proxy_msg_header* header; 310 | uint32_t len = 0; 311 | while(len < size) { 312 | header = (proxy_msg_header*)((char*)buf + len); 313 | switch(header->action){ 314 | case SEND: 315 | { 316 | proxy_send_msg* send_msg = (proxy_send_msg*)header; 317 | len += PROXY_SEND_MSG_SIZE(send_msg); 318 | store_record(proxy->db_ptr,PROXY_SEND_MSG_SIZE(send_msg),header); 319 | do_action_send(header->connection_id, send_msg->data.cmd.len, send_msg->data.cmd.cmd, arg); 320 | break; 321 | } 322 | case CONNECT: 323 | { 324 | len += PROXY_CONNECT_MSG_SIZE; 325 | store_record(proxy->db_ptr,PROXY_CONNECT_MSG_SIZE,header); 326 | do_action_connect(header->connection_id, arg); 327 | break; 328 | } 329 | case CLOSE: 330 | { 331 | len += PROXY_CLOSE_MSG_SIZE; 332 | store_record(proxy->db_ptr,PROXY_CLOSE_MSG_SIZE,header); 333 | do_action_close(header->connection_id, arg); 334 | break; 335 | } 336 | } 337 | } 338 | return 0; 339 | } 340 | 341 | static void do_action_to_server(uint16_t clt_id,uint8_t type,size_t data_size,void* data,void*arg) 342 | { 343 | proxy_node* proxy = arg; 344 | FILE* output = NULL; 345 | if(proxy->req_log){ 346 | output = proxy->req_log_file; 347 | } 348 | switch(type){ 349 | case CONNECT: 350 | if(output!=NULL){ 351 | fprintf(output,"Operation: Connects.\n"); 352 | } 353 | do_action_connect(clt_id,arg); 354 | break; 355 | case SEND: 356 | if(output!=NULL){ 357 | fprintf(output,"Operation: Sends data.\n"); 358 | } 359 | do_action_send(clt_id,data_size,data,arg); 360 | break; 361 | case CLOSE: 362 | if(output!=NULL){ 363 | fprintf(output,"Operation: Closes.\n"); 364 | } 365 | do_action_close(clt_id,arg); 366 | break; 367 | default: 368 | break; 369 | } 370 | return; 371 | } 372 | 373 | static void do_action_connect(uint16_t clt_id,void* arg) 374 | { 375 | proxy_node* proxy = arg; 376 | 377 | socket_pair* ret; 378 | HASH_FIND(hh, 
proxy->follower_hash_map, &clt_id, sizeof(uint16_t), ret); 379 | if (NULL == ret) 380 | { 381 | ret = malloc(sizeof(socket_pair)); 382 | memset(ret,0,sizeof(socket_pair)); 383 | 384 | ret->connection_id = clt_id; 385 | int sockfd = socket(AF_INET, SOCK_STREAM, 0); 386 | if (sockfd < 0) 387 | { 388 | fprintf(stderr, "ERROR opening socket!\n"); 389 | goto do_action_connect_exit; 390 | } 391 | ret->p_s = sockfd; 392 | HASH_ADD(hh, proxy->follower_hash_map, connection_id, sizeof(uint16_t), ret); 393 | 394 | if (connect(ret->p_s, (struct sockaddr*)&proxy->sys_addr.s_addr, proxy->sys_addr.s_sock_len) < 0) 395 | fprintf(stderr, "ERROR connecting!\n"); 396 | 397 | set_socket_blocking(ret->p_s, 0); 398 | 399 | int enable = 1; 400 | if(setsockopt(ret->p_s, IPPROTO_TCP, TCP_NODELAY, (void*)&enable, sizeof(enable)) < 0) 401 | fprintf(stderr, "TCP_NODELAY SETTING ERROR!\n"); 402 | } 403 | 404 | do_action_connect_exit: 405 | return; 406 | } 407 | 408 | static void do_action_send(uint16_t clt_id,size_t data_size,void* data,void* arg) 409 | { 410 | proxy_node* proxy = arg; 411 | socket_pair* ret; 412 | HASH_FIND(hh, proxy->follower_hash_map, &clt_id, sizeof(uint16_t), ret); 413 | 414 | if(NULL==ret){ 415 | goto do_action_send_exit; 416 | }else{ 417 | int n = write(ret->p_s, data, data_size); 418 | if (n < 0) 419 | fprintf(stderr, "ERROR writing to socket!\n"); 420 | } 421 | do_action_send_exit: 422 | return; 423 | } 424 | 425 | static void do_action_close(uint16_t clt_id,void* arg) 426 | { 427 | proxy_node* proxy = arg; 428 | socket_pair* ret; 429 | HASH_FIND(hh, proxy->follower_hash_map, &clt_id, sizeof(uint16_t), ret); 430 | if(NULL==ret){ 431 | goto do_action_close_exit; 432 | }else{ 433 | if (close(ret->p_s)) 434 | fprintf(stderr, "ERROR closing socket!\n"); 435 | HASH_DEL(proxy->follower_hash_map, ret); 436 | } 437 | do_action_close_exit: 438 | return; 439 | } 440 | 441 | proxy_node* proxy_init(const char* config_path,const char* proxy_log_path) 442 | { 443 | proxy_node* 
proxy = (proxy_node*)malloc(sizeof(proxy_node)); 444 | 445 | if(NULL==proxy){ 446 | err_log("PROXY : Cannot Malloc Memory For The Proxy.\n"); 447 | goto proxy_exit_error; 448 | } 449 | 450 | memset(proxy,0,sizeof(proxy_node)); 451 | 452 | if(proxy_read_config(proxy,config_path)){ 453 | err_log("PROXY : Configuration File Reading Error.\n"); 454 | goto proxy_exit_error; 455 | } 456 | 457 | int build_log_ret = 0; 458 | if(proxy_log_path==NULL){ 459 | proxy_log_path = "."; 460 | }else{ 461 | if((build_log_ret=mkdir(proxy_log_path,S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH))!=0){ 462 | if(errno!=EEXIST){ 463 | err_log("PROXY : Log Directory Creation Failed,No Log Will Be Recorded.\n"); 464 | }else{ 465 | build_log_ret = 0; 466 | } 467 | } 468 | } 469 | 470 | if(!build_log_ret){ 471 | //if(proxy->req_log){ 472 | char* req_log_path = (char*)malloc(sizeof(char)*strlen(proxy_log_path)+50); 473 | memset(req_log_path,0,sizeof(char)*strlen(proxy_log_path)+50); 474 | if(NULL!=req_log_path){ 475 | sprintf(req_log_path,"%s/node-proxy-req.log",proxy_log_path); 476 | //err_log("%s.\n",req_log_path); 477 | proxy->req_log_file = fopen(req_log_path,"w"); 478 | free(req_log_path); 479 | } 480 | if(NULL==proxy->req_log_file && proxy->req_log){ 481 | err_log("PROXY : Client Request Log File Cannot Be Created.\n"); 482 | } 483 | //} 484 | } 485 | 486 | TAILQ_INIT(&tailhead); 487 | LIST_INIT(&listhead); 488 | 489 | proxy->db_ptr = initialize_db(proxy->db_name,0); 490 | 491 | proxy->follower_hash_map = NULL; 492 | proxy->leader_hash_map = NULL; 493 | 494 | if(pthread_spin_init(&tailq_lock, PTHREAD_PROCESS_PRIVATE)){ 495 | err_log("PROXY: Cannot init the lock\n"); 496 | } 497 | 498 | dare_main(proxy, config_path); 499 | 500 | return proxy; 501 | 502 | proxy_exit_error: 503 | if(NULL!=proxy){ 504 | free(proxy); 505 | } 506 | return NULL; 507 | 508 | } 509 | -------------------------------------------------------------------------------- /src/spec_hooks.cpp: 
-------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include "include/rsm-interface.h" 8 | 9 | #define dprintf(fmt...) 10 | 11 | struct proxy_node_t* proxy = NULL; 12 | 13 | typedef int (*main_type)(int, char**, char**); 14 | 15 | struct arg_type 16 | { 17 | char **argv; 18 | int (*main_func) (int, char **, char **); 19 | }; 20 | 21 | main_type saved_init_func = NULL; 22 | void tern_init_func(int argc, char **argv, char **env) 23 | { 24 | dprintf("%04d: __tern_init_func() called.\n", (int) pthread_self()); 25 | if(saved_init_func) 26 | saved_init_func(argc, argv, env); 27 | 28 | printf("tern_init_func is called\n"); 29 | 30 | char* config_path = getenv("config_path"); 31 | 32 | char* proxy_log_dir = NULL; 33 | proxy = proxy_init(config_path, proxy_log_dir); 34 | } 35 | 36 | typedef void (*fini_type)(void*); 37 | fini_type saved_fini_func = NULL; 38 | 39 | extern "C" int my_main(int argc, char **pt, char **aa) 40 | { 41 | int ret; 42 | arg_type *args = (arg_type*)pt; 43 | dprintf("%04d: __libc_start_main() called.\n", (int) pthread_self()); 44 | ret = args->main_func(argc, args->argv, aa); 45 | return ret; 46 | } 47 | 48 | extern "C" int __libc_start_main( 49 | void *func_ptr, 50 | int argc, 51 | char* argv[], 52 | void (*init_func)(void), 53 | void (*fini_func)(void), 54 | void (*rtld_fini_func)(void), 55 | void *stack_end) 56 | { 57 | typedef void (*fnptr_type)(void); 58 | typedef int (*orig_func_type)(void *, int, char *[], fnptr_type, 59 | fnptr_type, fnptr_type, void*); 60 | orig_func_type orig_func; 61 | arg_type args; 62 | 63 | void * handle; 64 | int ret; 65 | 66 | // Get lib path. 
67 | Dl_info dli; 68 | dladdr((void *)dlsym, &dli); 69 | std::string libPath = dli.dli_fname; 70 | libPath = dli.dli_fname; 71 | size_t lastSlash = libPath.find_last_of("/"); 72 | libPath = libPath.substr(0, lastSlash); 73 | libPath += "/libc.so.6"; 74 | libPath = "/lib/x86_64-linux-gnu/libc.so.6"; 75 | if(!(handle=dlopen(libPath.c_str(), RTLD_LAZY))) { 76 | puts("dlopen error"); 77 | abort(); 78 | } 79 | 80 | orig_func = (orig_func_type) dlsym(handle, "__libc_start_main"); 81 | 82 | if(dlerror()) { 83 | puts("dlerror"); 84 | abort(); 85 | } 86 | 87 | dlclose(handle); 88 | 89 | dprintf("%04d: __libc_start_main is hooked.\n", (int) pthread_self()); 90 | 91 | args.argv = argv; 92 | args.main_func = (main_type)func_ptr; 93 | saved_init_func = (main_type)init_func; 94 | 95 | saved_fini_func = (fini_type)rtld_fini_func; 96 | 97 | ret = orig_func((void*)my_main, argc, (char**)(&args), (fnptr_type)tern_init_func, (fnptr_type)fini_func, rtld_fini_func, stack_end); 98 | 99 | return ret; 100 | } 101 | 102 | extern "C" int accept(int socket, struct sockaddr *address, socklen_t *address_len) 103 | { 104 | typedef int (*orig_accept_type)(int, sockaddr *, socklen_t *); 105 | static orig_accept_type orig_accept; 106 | if (!orig_accept) 107 | orig_accept = (orig_accept_type) dlsym(RTLD_NEXT, "accept"); 108 | 109 | int ret = orig_accept(socket, address, address_len); 110 | 111 | if (ret >= 0 && proxy != NULL) 112 | { 113 | struct stat sb; 114 | fstat(ret, &sb); 115 | if ((sb.st_mode & S_IFMT) == S_IFSOCK) 116 | proxy_on_accept(proxy, ret); 117 | } 118 | 119 | return ret; 120 | } 121 | 122 | // memcached 123 | extern "C" int accept4(int sockfd, struct sockaddr *addr, socklen_t *addrlen, int flags) 124 | { 125 | typedef int (*orig_accept4_type)(int, sockaddr *, socklen_t *, int); 126 | static orig_accept4_type orig_accept4; 127 | if (!orig_accept4) 128 | orig_accept4 = (orig_accept4_type) dlsym(RTLD_NEXT, "accept4"); 129 | 130 | int ret = orig_accept4(sockfd, addr, addrlen, flags); 
131 | 132 | if (ret >= 0 && proxy != NULL) 133 | { 134 | struct stat sb; 135 | fstat(ret, &sb); 136 | if ((sb.st_mode & S_IFMT) == S_IFSOCK) 137 | proxy_on_accept(proxy, ret); 138 | } 139 | 140 | return ret; 141 | } 142 | 143 | extern "C" int close(int fildes) 144 | { 145 | if (proxy != NULL) 146 | { 147 | struct stat sb; 148 | fstat(fildes, &sb); 149 | if ((sb.st_mode & S_IFMT) == S_IFSOCK) 150 | proxy_on_close(proxy, fildes); 151 | } 152 | 153 | typedef int (*orig_close_type)(int); 154 | static orig_close_type orig_close; 155 | if (!orig_close) 156 | orig_close = (orig_close_type) dlsym(RTLD_NEXT, "close"); 157 | int ret = orig_close(fildes); 158 | return ret; 159 | } 160 | 161 | extern "C" ssize_t read(int fd, void *buf, size_t count) 162 | { 163 | typedef ssize_t (*orig_read_type)(int, void *, size_t); 164 | static orig_read_type orig_read; 165 | if (!orig_read) 166 | orig_read = (orig_read_type) dlsym(RTLD_NEXT, "read"); 167 | ssize_t bytes_read = orig_read(fd, buf, count); 168 | 169 | if (bytes_read > 0 && proxy != NULL) 170 | { 171 | struct stat sb; 172 | fstat(fd, &sb); 173 | if ((sb.st_mode & S_IFMT) == S_IFSOCK) 174 | proxy_on_read(proxy, buf, bytes_read, fd); 175 | } 176 | 177 | return bytes_read; 178 | } 179 | -------------------------------------------------------------------------------- /target/makefile: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # Automatically-generated file. Do not edit! 
3 | ################################################################################ 4 | 5 | include ../makefile.init 6 | 7 | RM := rm -rf 8 | 9 | # All of the sources participating in the build are defined here 10 | -include sources.mk 11 | -include src/dare/subdir.mk 12 | -include src/proxy/subdir.mk 13 | -include src/db/subdir.mk 14 | -include src/config-comp/subdir.mk 15 | -include src/subdir.mk 16 | -include subdir.mk 17 | -include objects.mk 18 | 19 | LIBS += $(DARE) -lev -ldb -lconfig -libverbs -lm 20 | 21 | # Add inputs and outputs from these tool invocations to the build variables 22 | 23 | # All Target 24 | all: interpose.so 25 | 26 | # Tool invocations 27 | interpose.so: $(OBJS) 28 | @echo 'Building target: $@' 29 | @echo 'Invoking: GCC C Linker' 30 | gcc -shared -Wl,-soname,interpose.so $(OBJS) -Wall -o interpose.so $(LIBS) 31 | @echo 'Finished building target: $@' 32 | @echo ' ' 33 | 34 | # Other Targets 35 | clean: 36 | @echo "##### CLEAN-UP DARE#####" 37 | -$(RM) $(RBTREE_OBJS) 38 | -$(RM) $(DARE_OBJS) 39 | -$(RM) $(DARE) $(RBTREE) 40 | @echo "########################" 41 | -@echo ' ' 42 | -$(RM) $(OBJS)$(C_DEPS) interpose.so 43 | -@echo ' ' 44 | 45 | .PHONY: all clean dependents 46 | -------------------------------------------------------------------------------- /target/nodes.local.cfg: -------------------------------------------------------------------------------- 1 | #configuration files for the replicated state machine node group 2 | 3 | #proxy configuration part 4 | 5 | db_name = "node_test"; 6 | req_log = 1; 7 | 8 | #real server configuration 9 | 10 | ip_address = "127.0.0.1"; 11 | port = 8888; 12 | 13 | #dare component configuration part 14 | 15 | #HB period (seconds) 16 | #election timeout range (microseconds) 17 | #retransmission period (seconds) 18 | #period of checking for new connections (seconds) 19 | #log pruning period (seconds) 20 | dare_global_config = { 21 | #hb_period = 0.001; 22 | #elec_timeout_low = 10000; 23 | 
#elec_timeout_high = 30000; 24 | #rc_info_period = 0.01; 25 | #retransmit_period = 0.02; 26 | #log_pruning_period = 0.03; 27 | 28 | #DEBUG 29 | hb_period = 0.01; 30 | elec_timeout_low = 100000; 31 | elec_timeout_high = 300000; 32 | retransmit_period = 0.04; 33 | rc_info_period = 0.05; 34 | log_pruning_period = 0.05; 35 | }; 36 | -------------------------------------------------------------------------------- /target/objects.mk: -------------------------------------------------------------------------------- 1 | LIBS := -lsupc++ -lpthread -lstdc++ -lrt -------------------------------------------------------------------------------- /target/sources.mk: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # Automatically-generated file. Do not edit! 3 | ################################################################################ 4 | 5 | O_SRCS := 6 | C_SRCS := 7 | S_UPPER_SRCS := 8 | OBJ_SRCS := 9 | ASM_SRCS := 10 | OBJS := 11 | C_DEPS := 12 | EXECUTABLES := 13 | 14 | RBTREE_OBJS := 15 | DARE_OBJS := 16 | DARE := 17 | RBTREE := 18 | DARE_LIBPATH := 19 | 20 | # Every subdirectory with source files must be described here 21 | SUBDIRS := \ 22 | src/util \ 23 | src \ 24 | src/dare \ 25 | src/db \ 26 | src/config-comp \ 27 | src/proxy \ 28 | -------------------------------------------------------------------------------- /target/src/config-comp/subdir.mk: -------------------------------------------------------------------------------- 1 | # Add inputs and outputs from these tool invocations to the build variables 2 | C_SRCS += \ 3 | ../src/config-comp/config-dare.c \ 4 | ../src/config-comp/config-proxy.c 5 | 6 | 7 | OBJS += \ 8 | ./src/config-comp/config-dare.o \ 9 | ./src/config-comp/config-proxy.o 10 | 11 | 12 | # Each subdirectory must supply rules for building sources it contributes 13 | src/config-comp/%.o: ../src/config-comp/%.c 14 | @echo 
'Building file: $<' 15 | @echo 'Invoking: GCC C Compiler' 16 | gcc -fPIC -rdynamic -std=gnu99 -DDEBUG=$(DEBUGOPT) -O0 -g3 -Wall -c -o "$@" "$<" 17 | @echo 'Finished building: $<' 18 | @echo ' ' 19 | 20 | 21 | -------------------------------------------------------------------------------- /target/src/dare/subdir.mk: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # Automatically-generated file. Do not edit! 3 | ################################################################################ 4 | 5 | CC = gcc 6 | 7 | ifeq ($(DEBUGOPT),1) 8 | FLAGS = -fPIC -rdynamic -std=gnu99 -DDEBUG -I"$(ROOT_DIR)/../src/include/dare" -I"$(ROOT_DIR)/../utils/rbtree/include" -I/usr/include 9 | else 10 | FLAGS = -fPIC -rdynamic -std=gnu99 -I"$(ROOT_DIR)/../src/include/dare" -I"$(ROOT_DIR)/../utils/rbtree/include" -I/usr/include 11 | endif 12 | CFLAGS = #-Wall -Wunused-function #-Wextra 13 | LDFLAGS = -L/usr/lib -libverbs 14 | 15 | PREFIX = $(ROOT_DIR)/src/dare 16 | DARE_LIBPATH = $(PREFIX)/lib 17 | 18 | DARE_HEADERS = $(shell echo $(ROOT_DIR)/../src/include/dare/*.h) 19 | DARE_SRCS = $(shell echo $(ROOT_DIR)/../src/dare/*.c) 20 | DARE_OBJS = $(DARE_SRCS:.c=.o) 21 | DARE = $(DARE_LIBPATH)/libdare.a 22 | 23 | RBTREE_HEADERS = $(shell echo $(ROOT_DIR)/../utils/rbtree/include/*.h) 24 | RBTREE_SRCS = $(shell echo $(ROOT_DIR)/../utils/rbtree/src/*.c) 25 | RBTREE_OBJS = $(RBTREE_SRCS:.c=.o) 26 | RBTREE = $(DARE_LIBPATH)/librbtree.a 27 | 28 | all: dare 29 | 30 | $(RBTREE): rbtree_print $(RBTREE_OBJS) $(RBTREE_HEADERS) 31 | mkdir -pm 755 $(DARE_LIBPATH) 32 | ar -rcs $@ $(RBTREE_OBJS) 33 | @echo "##############################" 34 | @echo 35 | rbtree_print: 36 | @echo "##### BUILDING Red-Black Tree #####" 37 | 38 | dare: FLAGS += -I/usr/local/include 39 | dare: LDFLAGS += /usr/local/lib/libev.a 40 | dare: $(DARE) 41 | $(DARE): $(RBTREE) dare_print $(DARE_OBJS) 
$(DARE_HEADERS) 42 | mkdir -pm 755 $(DARE_LIBPATH) 43 | ar -rcs $@ $(DARE_OBJS) $(RBTREE_OBJS) 44 | @echo "##############################" 45 | @echo 46 | dare_print: 47 | @echo "##### BUILDING DARE #####" 48 | 49 | %.o: %.c $(HEADERS) 50 | $(CC) $(FLAGS) $(CFLAGS) -c -o $@ $< 51 | 52 | .PHONY : all 53 | -------------------------------------------------------------------------------- /target/src/db/subdir.mk: -------------------------------------------------------------------------------- 1 | # Add inputs and outputs from these tool invocations to the build variables 2 | C_SRCS += \ 3 | ../src/db/db-interface.c 4 | 5 | OBJS += \ 6 | ./src/db/db-interface.o 7 | 8 | 9 | # Each subdirectory must supply rules for building sources it contributes 10 | src/db/%.o: ../src/db/%.c 11 | @echo 'Building file: $<' 12 | @echo 'Invoking: GCC C Compiler' 13 | gcc -fPIC -rdynamic -std=gnu99 -DDEBUG=$(DEBUGOPT) -O0 -g3 -Wall -c -o "$@" "$<" 14 | @echo 'Finished building: $<' 15 | @echo ' ' 16 | 17 | 18 | -------------------------------------------------------------------------------- /target/src/proxy/subdir.mk: -------------------------------------------------------------------------------- 1 | # Add inputs and outputs from these tool invocations to the build variables 2 | C_SRCS += \ 3 | ../src/proxy/proxy.c 4 | 5 | OBJS += \ 6 | ./src/proxy/proxy.o 7 | 8 | 9 | # Each subdirectory must supply rules for building sources it contributes 10 | src/proxy/%.o: ../src/proxy/%.c 11 | @echo 'Building file: $<' 12 | @echo 'Invoking: GCC C Compiler' 13 | gcc -fPIC -rdynamic -std=gnu99 -DDEBUG=$(DEBUGOPT) -O0 -g3 -Wall -c -o "$@" "$<" 14 | @echo 'Finished building: $<' 15 | @echo ' ' 16 | 17 | 18 | -------------------------------------------------------------------------------- /target/src/subdir.mk: -------------------------------------------------------------------------------- 1 | # Add inputs and outputs from these tool invocations to the build variables 2 | C_SRCS += \ 3 | 
../src/spec_hooks.cpp 4 | 5 | OBJS += \ 6 | ./src/spec_hooks.o 7 | 8 | 9 | # Each subdirectory must supply rules for building sources it contributes 10 | src/%.o: ../src/%.cpp 11 | @echo 'Building file: $<' 12 | @echo 'Invoking: GCC C Compiler' 13 | gcc -fPIC -rdynamic -std=gnu99 -DDEBUG=$(DEBUGOPT) -O0 -g3 -Wall -c -o "$@" "$<" 14 | @echo 'Finished building: $<' 15 | @echo ' ' 16 | 17 | 18 | -------------------------------------------------------------------------------- /utils/dep-lib/db-5.1.29.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hku-systems/apus/896959f59b33fc695df753e4b65b4d564d67443d/utils/dep-lib/db-5.1.29.tar.gz -------------------------------------------------------------------------------- /utils/dep-lib/libconfig-1.4.9.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hku-systems/apus/896959f59b33fc695df753e4b65b4d564d67443d/utils/dep-lib/libconfig-1.4.9.tar.gz -------------------------------------------------------------------------------- /utils/dep-lib/libev-4.15.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hku-systems/apus/896959f59b33fc695df753e4b65b4d564d67443d/utils/dep-lib/libev-4.15.tar.gz -------------------------------------------------------------------------------- /utils/mk: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | #build dep for the program 3 | 4 | CUR_DIR=$(pwd) 5 | LIB_PREFIX=${CUR_DIR}/.local/ 6 | 7 | mkdir -p ${LIB_PREFIX} 8 | 9 | LIBCONFIG_VER=1.4.9 10 | LIBCONFIG_NAME=libconfig-${LIBCONFIG_VER} 11 | 12 | LIBEV_VER=4.15 13 | LIBEV_NAME=libev-${LIBEV_VER} 14 | 15 | BDB_VER=5.1.29 16 | BDB_NAME=db-${BDB_VER} 17 | 18 | if [ ! -d "dep-lib" ];then 19 | # True if dep-lib exists and is a directory. 
mkdir dep-lib
fi
cd dep-lib || exit 1

# --- libconfig -------------------------------------------------------------
if [ ! -f "${LIBCONFIG_NAME}.tar.gz" ];then
    # The tarball is not present locally: download it.
    wget "http://www.hyperrealm.com/libconfig/${LIBCONFIG_NAME}.tar.gz"
fi

if [ ! -d "${LIBCONFIG_NAME}" ];then
    # Not unpacked yet.
    tar -xvf "${LIBCONFIG_NAME}.tar.gz"
fi

# Stop here if the source tree is missing, so that configure/make never run
# in the wrong directory.
cd "${LIBCONFIG_NAME}" || exit 1
pwd
./configure --prefix="${LIB_PREFIX}"
make;
make install;
cd ..


# --- libev -----------------------------------------------------------------
if [ ! -f "${LIBEV_NAME}.tar.gz" ];then
    # The original URL used {LIBEV_NAME} without the leading '$', so the
    # variable was never expanded and the download could not succeed.
    wget "http://dist.schmorp.de/libev/Attic/${LIBEV_NAME}.tar.gz"
fi

if [ ! -d "${LIBEV_NAME}" ];then
    tar -xvf "${LIBEV_NAME}.tar.gz"
fi

cd "${LIBEV_NAME}" || exit 1
pwd
./configure --prefix="${LIB_PREFIX}"
make;
make install;
cd ..


# --- Berkeley DB -----------------------------------------------------------
if [ ! -f "${BDB_NAME}.tar.gz" ];then
    wget "http://download.oracle.com/berkeley-db/${BDB_NAME}.tar.gz"
fi

if [ ! -d "${BDB_NAME}" ];then
    tar -xvf "${BDB_NAME}.tar.gz"
fi

cd "${BDB_NAME}" || exit 1
pwd
# Berkeley DB is configured and built from its build_unix directory.
cd build_unix || exit 1
../dist/configure --prefix="${LIB_PREFIX}"
make;
make install;
cd ..
cd ..

cd ..
75 | -------------------------------------------------------------------------------- /utils/queue/tailq.c: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | struct entry { 4 | // element 5 | TAILQ_ENTRY(entry) entries; 6 | } *n1; 7 | 8 | TAILQ_HEAD(, entry) head; 9 | 10 | int main(int argc, char const *argv[]) 11 | { 12 | TAILQ_INIT(&head); 13 | 14 | n1 = malloc(sizeof(struct entry)); 15 | 16 | TAILQ_INSERT_TAIL(&head, n1, entries); 17 | 18 | while (!TAILQ_EMPTY(&head)) { 19 | n1 = TAILQ_FIRST(&head); 20 | TAILQ_REMOVE(&head, n1, entries); 21 | free(n1); 22 | } 23 | 24 | return 0; 25 | } 26 | 27 | -------------------------------------------------------------------------------- /utils/rbtree/include/compiler.h: -------------------------------------------------------------------------------- 1 | #ifndef __LINUX_COMPILER_H 2 | #define __LINUX_COMPILER_H 3 | 4 | #ifndef __ASSEMBLY__ 5 | 6 | #ifdef __CHECKER__ 7 | # define __user __attribute__((noderef, address_space(1))) 8 | # define __kernel __attribute__((address_space(0))) 9 | # define __safe __attribute__((safe)) 10 | # define __force __attribute__((force)) 11 | # define __nocast __attribute__((nocast)) 12 | # define __iomem __attribute__((noderef, address_space(2))) 13 | # define __must_hold(x) __attribute__((context(x,1,1))) 14 | # define __acquires(x) __attribute__((context(x,0,1))) 15 | # define __releases(x) __attribute__((context(x,1,0))) 16 | # define __acquire(x) __context__(x,1) 17 | # define __release(x) __context__(x,-1) 18 | # define __cond_lock(x,c) ((c) ? 
({ __acquire(x); 1; }) : 0) 19 | # define __percpu __attribute__((noderef, address_space(3))) 20 | #ifdef CONFIG_SPARSE_RCU_POINTER 21 | # define __rcu __attribute__((noderef, address_space(4))) 22 | #else 23 | # define __rcu 24 | #endif 25 | extern void __chk_user_ptr(const volatile void __user *); 26 | extern void __chk_io_ptr(const volatile void __iomem *); 27 | #else 28 | # define __user 29 | # define __kernel 30 | # define __safe 31 | # define __force 32 | # define __nocast 33 | # define __iomem 34 | # define __chk_user_ptr(x) (void)0 35 | # define __chk_io_ptr(x) (void)0 36 | //# define __builtin_warning(x, y...) (1) 37 | # define __must_hold(x) 38 | # define __acquires(x) 39 | # define __releases(x) 40 | # define __acquire(x) (void)0 41 | # define __release(x) (void)0 42 | # define __cond_lock(x,c) (c) 43 | # define __percpu 44 | # define __rcu 45 | #endif 46 | 47 | /* Indirect macros required for expanded argument pasting, eg. __LINE__. */ 48 | #define ___PASTE(a,b) a##b 49 | #define __PASTE(a,b) ___PASTE(a,b) 50 | 51 | #ifdef __KERNEL__ 52 | 53 | #ifdef __GNUC__ 54 | #include 55 | #endif 56 | 57 | #define notrace __attribute__((no_instrument_function)) 58 | 59 | /* Intel compiler defines __GNUC__. So we will overwrite implementations 60 | * coming from above header files here 61 | */ 62 | #ifdef __INTEL_COMPILER 63 | # include 64 | #endif 65 | 66 | /* 67 | * Generic compiler-dependent macros required for kernel 68 | * build go below this comment. 
Actual compiler/compiler version 69 | * specific implementations come from the above header files 70 | */ 71 | 72 | struct ftrace_branch_data { 73 | const char *func; 74 | const char *file; 75 | unsigned line; 76 | union { 77 | struct { 78 | unsigned long correct; 79 | unsigned long incorrect; 80 | }; 81 | struct { 82 | unsigned long miss; 83 | unsigned long hit; 84 | }; 85 | unsigned long miss_hit[2]; 86 | }; 87 | }; 88 | 89 | /* 90 | * Note: DISABLE_BRANCH_PROFILING can be used by special lowlevel code 91 | * to disable branch tracing on a per file basis. 92 | */ 93 | #if defined(CONFIG_TRACE_BRANCH_PROFILING) \ 94 | && !defined(DISABLE_BRANCH_PROFILING) && !defined(__CHECKER__) 95 | void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect); 96 | 97 | #define likely_notrace(x) __builtin_expect(!!(x), 1) 98 | #define unlikely_notrace(x) __builtin_expect(!!(x), 0) 99 | 100 | #define __branch_check__(x, expect) ({ \ 101 | int ______r; \ 102 | static struct ftrace_branch_data \ 103 | __attribute__((__aligned__(4))) \ 104 | __attribute__((section("_ftrace_annotated_branch"))) \ 105 | ______f = { \ 106 | .func = __func__, \ 107 | .file = __FILE__, \ 108 | .line = __LINE__, \ 109 | }; \ 110 | ______r = likely_notrace(x); \ 111 | ftrace_likely_update(&______f, ______r, expect); \ 112 | ______r; \ 113 | }) 114 | 115 | /* 116 | * Using __builtin_constant_p(x) to ignore cases where the return 117 | * value is always the same. This idea is taken from a similar patch 118 | * written by Daniel Walker. 119 | */ 120 | # ifndef likely 121 | # define likely(x) (__builtin_constant_p(x) ? !!(x) : __branch_check__(x, 1)) 122 | # endif 123 | # ifndef unlikely 124 | # define unlikely(x) (__builtin_constant_p(x) ? !!(x) : __branch_check__(x, 0)) 125 | # endif 126 | 127 | #ifdef CONFIG_PROFILE_ALL_BRANCHES 128 | /* 129 | * "Define 'is'", Bill Clinton 130 | * "Define 'if'", Steven Rostedt 131 | */ 132 | #define if(cond, ...) 
__trace_if( (cond , ## __VA_ARGS__) ) 133 | #define __trace_if(cond) \ 134 | if (__builtin_constant_p((cond)) ? !!(cond) : \ 135 | ({ \ 136 | int ______r; \ 137 | static struct ftrace_branch_data \ 138 | __attribute__((__aligned__(4))) \ 139 | __attribute__((section("_ftrace_branch"))) \ 140 | ______f = { \ 141 | .func = __func__, \ 142 | .file = __FILE__, \ 143 | .line = __LINE__, \ 144 | }; \ 145 | ______r = !!(cond); \ 146 | ______f.miss_hit[______r]++; \ 147 | ______r; \ 148 | })) 149 | #endif /* CONFIG_PROFILE_ALL_BRANCHES */ 150 | 151 | #else 152 | # define likely(x) __builtin_expect(!!(x), 1) 153 | # define unlikely(x) __builtin_expect(!!(x), 0) 154 | #endif 155 | 156 | /* Optimization barrier */ 157 | #ifndef barrier 158 | # define barrier() __memory_barrier() 159 | #endif 160 | 161 | /* Unreachable code */ 162 | #ifndef unreachable 163 | # define unreachable() do { } while (1) 164 | #endif 165 | 166 | #ifndef RELOC_HIDE 167 | # define RELOC_HIDE(ptr, off) \ 168 | ({ unsigned long __ptr; \ 169 | __ptr = (unsigned long) (ptr); \ 170 | (typeof(ptr)) (__ptr + (off)); }) 171 | #endif 172 | 173 | /* Not-quite-unique ID. */ 174 | #ifndef __UNIQUE_ID 175 | # define __UNIQUE_ID(prefix) __PASTE(__PASTE(__UNIQUE_ID_, prefix), __LINE__) 176 | #endif 177 | 178 | #endif /* __KERNEL__ */ 179 | 180 | #endif /* __ASSEMBLY__ */ 181 | 182 | #ifdef __KERNEL__ 183 | /* 184 | * Allow us to mark functions as 'deprecated' and have gcc emit a nice 185 | * warning for each use, in hopes of speeding the functions removal. 
186 | * Usage is: 187 | * int __deprecated foo(void) 188 | */ 189 | #ifndef __deprecated 190 | # define __deprecated /* unimplemented */ 191 | #endif 192 | 193 | #ifdef MODULE 194 | #define __deprecated_for_modules __deprecated 195 | #else 196 | #define __deprecated_for_modules 197 | #endif 198 | 199 | #ifndef __must_check 200 | #define __must_check 201 | #endif 202 | 203 | #ifndef CONFIG_ENABLE_MUST_CHECK 204 | #undef __must_check 205 | #define __must_check 206 | #endif 207 | #ifndef CONFIG_ENABLE_WARN_DEPRECATED 208 | #undef __deprecated 209 | #undef __deprecated_for_modules 210 | #define __deprecated 211 | #define __deprecated_for_modules 212 | #endif 213 | 214 | /* 215 | * Allow us to avoid 'defined but not used' warnings on functions and data, 216 | * as well as force them to be emitted to the assembly file. 217 | * 218 | * As of gcc 3.4, static functions that are not marked with attribute((used)) 219 | * may be elided from the assembly file. As of gcc 3.4, static data not so 220 | * marked will not be elided, but this may change in a future gcc version. 221 | * 222 | * NOTE: Because distributions shipped with a backported unit-at-a-time 223 | * compiler in gcc 3.3, we must define __used to be __attribute__((used)) 224 | * for gcc >=3.3 instead of 3.4. 225 | * 226 | * In prior versions of gcc, such functions and data would be emitted, but 227 | * would be warned about except with attribute((unused)). 228 | * 229 | * Mark functions that are referenced only in inline assembly as __used so 230 | * the code is emitted even though it appears to be unreferenced. 
231 | */ 232 | #ifndef __used 233 | # define __used /* unimplemented */ 234 | #endif 235 | 236 | #ifndef __maybe_unused 237 | # define __maybe_unused /* unimplemented */ 238 | #endif 239 | 240 | #ifndef __always_unused 241 | # define __always_unused /* unimplemented */ 242 | #endif 243 | 244 | #ifndef noinline 245 | #define noinline 246 | #endif 247 | 248 | /* 249 | * Rather then using noinline to prevent stack consumption, use 250 | * noinline_for_stack instead. For documentation reasons. 251 | */ 252 | #define noinline_for_stack noinline 253 | 254 | #ifndef __always_inline 255 | #define __always_inline inline 256 | #endif 257 | 258 | #endif /* __KERNEL__ */ 259 | 260 | /* 261 | * From the GCC manual: 262 | * 263 | * Many functions do not examine any values except their arguments, 264 | * and have no effects except the return value. Basically this is 265 | * just slightly more strict class than the `pure' attribute above, 266 | * since function is not allowed to read global memory. 267 | * 268 | * Note that a function that has pointer arguments and examines the 269 | * data pointed to must _not_ be declared `const'. Likewise, a 270 | * function that calls a non-`const' function usually must not be 271 | * `const'. It does not make sense for a `const' function to return 272 | * `void'. 273 | */ 274 | #ifndef __attribute_const__ 275 | # define __attribute_const__ /* unimplemented */ 276 | #endif 277 | 278 | /* 279 | * Tell gcc if a function is cold. The compiler will assume any path 280 | * directly leading to the call is unlikely. 281 | */ 282 | 283 | #ifndef __cold 284 | #define __cold 285 | #endif 286 | 287 | /* Simple shorthand for a section definition */ 288 | #ifndef __section 289 | # define __section(S) __attribute__ ((__section__(#S))) 290 | #endif 291 | 292 | #ifndef __visible 293 | #define __visible 294 | #endif 295 | 296 | /* Are two types/vars the same type (ignoring qualifiers)? 
*/ 297 | #ifndef __same_type 298 | # define __same_type(a, b) __builtin_types_compatible_p(typeof(a), typeof(b)) 299 | #endif 300 | 301 | /* Compile time object size, -1 for unknown */ 302 | #ifndef __compiletime_object_size 303 | # define __compiletime_object_size(obj) -1 304 | #endif 305 | #ifndef __compiletime_warning 306 | # define __compiletime_warning(message) 307 | #endif 308 | #ifndef __compiletime_error 309 | # define __compiletime_error(message) 310 | # define __compiletime_error_fallback(condition) \ 311 | do { ((void)sizeof(char[1 - 2 * condition])); } while (0) 312 | #else 313 | # define __compiletime_error_fallback(condition) do { } while (0) 314 | #endif 315 | 316 | #define __compiletime_assert(condition, msg, prefix, suffix) \ 317 | do { \ 318 | bool __cond = !(condition); \ 319 | extern void prefix ## suffix(void) __compiletime_error(msg); \ 320 | if (__cond) \ 321 | prefix ## suffix(); \ 322 | __compiletime_error_fallback(__cond); \ 323 | } while (0) 324 | 325 | #define _compiletime_assert(condition, msg, prefix, suffix) \ 326 | __compiletime_assert(condition, msg, prefix, suffix) 327 | 328 | /** 329 | * compiletime_assert - break build and emit msg if condition is false 330 | * @condition: a compile-time constant condition to check 331 | * @msg: a message to emit if condition is false 332 | * 333 | * In tradition of POSIX assert, this macro will break the build if the 334 | * supplied condition is *false*, emitting the supplied error message if the 335 | * compiler has support to do so. 336 | */ 337 | #define compiletime_assert(condition, msg) \ 338 | _compiletime_assert(condition, msg, __compiletime_assert_, __LINE__) 339 | 340 | /* 341 | * Prevent the compiler from merging or refetching accesses. The compiler 342 | * is also forbidden from reordering successive instances of ACCESS_ONCE(), 343 | * but only when the compiler is aware of some particular ordering. 
One way 344 | * to make the compiler aware of ordering is to put the two invocations of 345 | * ACCESS_ONCE() in different C statements. 346 | * 347 | * This macro does absolutely -nothing- to prevent the CPU from reordering, 348 | * merging, or refetching absolutely anything at any time. Its main intended 349 | * use is to mediate communication between process-level code and irq/NMI 350 | * handlers, all running on the same CPU. 351 | */ 352 | #define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x)) 353 | 354 | /* Ignore/forbid kprobes attach on very low level functions marked by this attribute: */ 355 | #ifdef CONFIG_KPROBES 356 | # define __kprobes __attribute__((__section__(".kprobes.text"))) 357 | #else 358 | # define __kprobes 359 | #endif 360 | #endif /* __LINUX_COMPILER_H */ 361 | -------------------------------------------------------------------------------- /utils/rbtree/include/export.h: -------------------------------------------------------------------------------- 1 | #ifndef _LINUX_EXPORT_H 2 | #define _LINUX_EXPORT_H 3 | /* 4 | * Export symbols from the kernel to modules. Forked from module.h 5 | * to reduce the amount of pointless cruft we feed to gcc when only 6 | * exporting a simple symbol or two. 7 | * 8 | * Try not to add #includes here. It slows compilation and makes kernel 9 | * hackers place grumpy comments in header files. 10 | */ 11 | 12 | /* Some toolchains use a `_' prefix for all user symbols. */ 13 | #ifdef CONFIG_HAVE_UNDERSCORE_SYMBOL_PREFIX 14 | #define __VMLINUX_SYMBOL(x) _##x 15 | #define __VMLINUX_SYMBOL_STR(x) "_" #x 16 | #else 17 | #define __VMLINUX_SYMBOL(x) x 18 | #define __VMLINUX_SYMBOL_STR(x) #x 19 | #endif 20 | 21 | /* Indirect, so macros are expanded before pasting. 
*/ 22 | #define VMLINUX_SYMBOL(x) __VMLINUX_SYMBOL(x) 23 | #define VMLINUX_SYMBOL_STR(x) __VMLINUX_SYMBOL_STR(x) 24 | 25 | #ifndef __ASSEMBLY__ 26 | struct kernel_symbol 27 | { 28 | unsigned long value; 29 | const char *name; 30 | }; 31 | 32 | #ifdef MODULE 33 | extern struct module __this_module; 34 | #define THIS_MODULE (&__this_module) 35 | #else 36 | #define THIS_MODULE ((struct module *)0) 37 | #endif 38 | 39 | #ifdef CONFIG_MODULES 40 | 41 | #ifndef __GENKSYMS__ 42 | #ifdef CONFIG_MODVERSIONS 43 | /* Mark the CRC weak since genksyms apparently decides not to 44 | * generate a checksums for some symbols */ 45 | #define __CRC_SYMBOL(sym, sec) \ 46 | extern void *__crc_##sym __attribute__((weak)); \ 47 | static const unsigned long __kcrctab_##sym \ 48 | __used \ 49 | __attribute__((section("___kcrctab" sec "+" #sym), unused)) \ 50 | = (unsigned long) &__crc_##sym; 51 | #else 52 | #define __CRC_SYMBOL(sym, sec) 53 | #endif 54 | 55 | /* For every exported symbol, place a struct in the __ksymtab section */ 56 | #define __EXPORT_SYMBOL(sym, sec) \ 57 | extern typeof(sym) sym; \ 58 | __CRC_SYMBOL(sym, sec) \ 59 | static const char __kstrtab_##sym[] \ 60 | __attribute__((section("__ksymtab_strings"), aligned(1))) \ 61 | = VMLINUX_SYMBOL_STR(sym); \ 62 | static const struct kernel_symbol __ksymtab_##sym \ 63 | __used \ 64 | __attribute__((section("___ksymtab" sec "+" #sym), unused)) \ 65 | = { (unsigned long)&sym, __kstrtab_##sym } 66 | 67 | #define EXPORT_SYMBOL(sym) \ 68 | __EXPORT_SYMBOL(sym, "") 69 | 70 | #define EXPORT_SYMBOL_GPL(sym) \ 71 | __EXPORT_SYMBOL(sym, "_gpl") 72 | 73 | #define EXPORT_SYMBOL_GPL_FUTURE(sym) \ 74 | __EXPORT_SYMBOL(sym, "_gpl_future") 75 | 76 | #ifdef CONFIG_UNUSED_SYMBOLS 77 | #define EXPORT_UNUSED_SYMBOL(sym) __EXPORT_SYMBOL(sym, "_unused") 78 | #define EXPORT_UNUSED_SYMBOL_GPL(sym) __EXPORT_SYMBOL(sym, "_unused_gpl") 79 | #else 80 | #define EXPORT_UNUSED_SYMBOL(sym) 81 | #define EXPORT_UNUSED_SYMBOL_GPL(sym) 82 | #endif 83 | 84 | #endif 
/* __GENKSYMS__ */ 85 | 86 | #else /* !CONFIG_MODULES... */ 87 | 88 | #define EXPORT_SYMBOL(sym) 89 | #define EXPORT_SYMBOL_GPL(sym) 90 | #define EXPORT_SYMBOL_GPL_FUTURE(sym) 91 | #define EXPORT_UNUSED_SYMBOL(sym) 92 | #define EXPORT_UNUSED_SYMBOL_GPL(sym) 93 | 94 | #endif /* CONFIG_MODULES */ 95 | #endif /* !__ASSEMBLY__ */ 96 | 97 | #endif /* _LINUX_EXPORT_H */ 98 | -------------------------------------------------------------------------------- /utils/rbtree/include/rbtree.h: -------------------------------------------------------------------------------- 1 | /* 2 | Red Black Trees 3 | (C) 1999 Andrea Arcangeli 4 | 5 | This program is free software; you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation; either version 2 of the License, or 8 | (at your option) any later version. 9 | 10 | This program is distributed in the hope that it will be useful, 11 | but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | GNU General Public License for more details. 14 | 15 | You should have received a copy of the GNU General Public License 16 | along with this program; if not, write to the Free Software 17 | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 18 | 19 | linux/include/linux/rbtree.h 20 | 21 | To use rbtrees you'll have to implement your own insert and search cores. 22 | This will avoid us to use callbacks and to drop drammatically performances. 23 | I know it's not the cleaner way, but in C (not in C++) to get 24 | performances and genericity... 25 | 26 | See Documentation/rbtree.txt for documentation and samples. 
27 | */ 28 | 29 | #ifndef _LINUX_RBTREE_H 30 | #define _LINUX_RBTREE_H 31 | 32 | //#include 33 | 34 | 35 | /* #include */ 36 | #undef NULL 37 | #define NULL ((void *)0) 38 | 39 | enum { 40 | false = 0, 41 | true = 1 42 | }; 43 | 44 | enum { 45 | FALSE = 0, 46 | TRUE = 1 47 | }; 48 | 49 | #ifndef container_of 50 | /** 51 | * container_of - cast a member of a structure out to the containing structure 52 | * @ptr: the pointer to the member. 53 | * @type: the type of the container struct this is embedded in. 54 | * @member: the name of the member within the struct. 55 | * 56 | */ 57 | #define container_of(ptr, type, member) ({ \ 58 | const typeof( ((type *)0)->member ) *__mptr = (ptr); \ 59 | (type *)( (char *)__mptr - offsetof(type,member) );}) 60 | #endif 61 | 62 | #undef offsetof 63 | #ifdef __compiler_offsetof 64 | #define offsetof(TYPE,MEMBER) __compiler_offsetof(TYPE,MEMBER) 65 | #else 66 | #define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER) 67 | #endif 68 | 69 | 70 | 71 | struct rb_node { 72 | unsigned long __rb_parent_color; 73 | struct rb_node *rb_right; 74 | struct rb_node *rb_left; 75 | } __attribute__((aligned(sizeof(long)))); 76 | /* The alignment might seem pointless, but allegedly CRIS needs it */ 77 | 78 | struct rb_root { 79 | struct rb_node *rb_node; 80 | }; 81 | 82 | 83 | #define rb_parent(r) ((struct rb_node *)((r)->__rb_parent_color & ~3)) 84 | 85 | #define RB_ROOT (struct rb_root) { NULL, } 86 | #define rb_entry(ptr, type, member) container_of(ptr, type, member) 87 | 88 | #define RB_EMPTY_ROOT(root) ((root)->rb_node == NULL) 89 | 90 | /* 'empty' nodes are nodes that are known not to be inserted in an rbree */ 91 | #define RB_EMPTY_NODE(node) \ 92 | ((node)->__rb_parent_color == (unsigned long)(node)) 93 | #define RB_CLEAR_NODE(node) \ 94 | ((node)->__rb_parent_color = (unsigned long)(node)) 95 | 96 | 97 | extern void rb_insert_color(struct rb_node *, struct rb_root *); 98 | extern void rb_erase(struct rb_node *, struct rb_root *); 99 | 
100 | 101 | /* Find logical next and previous nodes in a tree */ 102 | extern struct rb_node *rb_next(const struct rb_node *); 103 | extern struct rb_node *rb_prev(const struct rb_node *); 104 | extern struct rb_node *rb_first(const struct rb_root *); 105 | extern struct rb_node *rb_last(const struct rb_root *); 106 | 107 | /* Postorder iteration - always visit the parent after its children */ 108 | extern struct rb_node *rb_first_postorder(const struct rb_root *); 109 | extern struct rb_node *rb_next_postorder(const struct rb_node *); 110 | 111 | /* Fast replacement of a single node without remove/rebalance/add/rebalance */ 112 | extern void rb_replace_node(struct rb_node *victim, struct rb_node *new, 113 | struct rb_root *root); 114 | 115 | static inline void rb_link_node(struct rb_node * node, struct rb_node * parent, 116 | struct rb_node ** rb_link) 117 | { 118 | node->__rb_parent_color = (unsigned long)parent; 119 | node->rb_left = node->rb_right = NULL; 120 | 121 | *rb_link = node; 122 | } 123 | 124 | #define rb_entry_safe(ptr, type, member) \ 125 | ({ typeof(ptr) ____ptr = (ptr); \ 126 | ____ptr ? rb_entry(____ptr, type, member) : NULL; \ 127 | }) 128 | 129 | /** 130 | * rbtree_postorder_for_each_entry_safe - iterate over rb_root in post order of 131 | * given type safe against removal of rb_node entry 132 | * 133 | * @pos: the 'type *' to use as a loop cursor. 134 | * @n: another 'type *' to use as temporary storage 135 | * @root: 'rb_root *' of the rbtree. 136 | * @field: the name of the rb_node field within 'type'. 
137 | */ 138 | #define rbtree_postorder_for_each_entry_safe(pos, n, root, field) \ 139 | for (pos = rb_entry_safe(rb_first_postorder(root), typeof(*pos), field); \ 140 | pos && ({ n = rb_entry_safe(rb_next_postorder(&pos->field), \ 141 | typeof(*pos), field); 1; }); \ 142 | pos = n) 143 | 144 | #endif /* _LINUX_RBTREE_H */ 145 | -------------------------------------------------------------------------------- /utils/rbtree/include/rbtree_augmented.h: -------------------------------------------------------------------------------- 1 | /* 2 | Red Black Trees 3 | (C) 1999 Andrea Arcangeli 4 | (C) 2002 David Woodhouse 5 | (C) 2012 Michel Lespinasse 6 | 7 | This program is free software; you can redistribute it and/or modify 8 | it under the terms of the GNU General Public License as published by 9 | the Free Software Foundation; either version 2 of the License, or 10 | (at your option) any later version. 11 | 12 | This program is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU General Public License for more details. 16 | 17 | You should have received a copy of the GNU General Public License 18 | along with this program; if not, write to the Free Software 19 | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 20 | 21 | linux/include/linux/rbtree_augmented.h 22 | */ 23 | 24 | #ifndef _LINUX_RBTREE_AUGMENTED_H 25 | #define _LINUX_RBTREE_AUGMENTED_H 26 | 27 | #include 28 | #include 29 | 30 | /* 31 | * Please note - only struct rb_augment_callbacks and the prototypes for 32 | * rb_insert_augmented() and rb_erase_augmented() are intended to be public. 33 | * The rest are implementation details you are not expected to depend on. 34 | * 35 | * See Documentation/rbtree.txt for documentation and samples. 
36 | */ 37 | 38 | struct rb_augment_callbacks { 39 | void (*propagate)(struct rb_node *node, struct rb_node *stop); 40 | void (*copy)(struct rb_node *old, struct rb_node *new); 41 | void (*rotate)(struct rb_node *old, struct rb_node *new); 42 | }; 43 | 44 | extern void __rb_insert_augmented(struct rb_node *node, struct rb_root *root, 45 | void (*augment_rotate)(struct rb_node *old, struct rb_node *new)); 46 | static inline void 47 | rb_insert_augmented(struct rb_node *node, struct rb_root *root, 48 | const struct rb_augment_callbacks *augment) 49 | { 50 | __rb_insert_augmented(node, root, augment->rotate); 51 | } 52 | 53 | #define RB_DECLARE_CALLBACKS(rbstatic, rbname, rbstruct, rbfield, \ 54 | rbtype, rbaugmented, rbcompute) \ 55 | static inline void \ 56 | rbname ## _propagate(struct rb_node *rb, struct rb_node *stop) \ 57 | { \ 58 | while (rb != stop) { \ 59 | rbstruct *node = rb_entry(rb, rbstruct, rbfield); \ 60 | rbtype augmented = rbcompute(node); \ 61 | if (node->rbaugmented == augmented) \ 62 | break; \ 63 | node->rbaugmented = augmented; \ 64 | rb = rb_parent(&node->rbfield); \ 65 | } \ 66 | } \ 67 | static inline void \ 68 | rbname ## _copy(struct rb_node *rb_old, struct rb_node *rb_new) \ 69 | { \ 70 | rbstruct *old = rb_entry(rb_old, rbstruct, rbfield); \ 71 | rbstruct *new = rb_entry(rb_new, rbstruct, rbfield); \ 72 | new->rbaugmented = old->rbaugmented; \ 73 | } \ 74 | static void \ 75 | rbname ## _rotate(struct rb_node *rb_old, struct rb_node *rb_new) \ 76 | { \ 77 | rbstruct *old = rb_entry(rb_old, rbstruct, rbfield); \ 78 | rbstruct *new = rb_entry(rb_new, rbstruct, rbfield); \ 79 | new->rbaugmented = old->rbaugmented; \ 80 | old->rbaugmented = rbcompute(old); \ 81 | } \ 82 | rbstatic const struct rb_augment_callbacks rbname = { \ 83 | rbname ## _propagate, rbname ## _copy, rbname ## _rotate \ 84 | }; 85 | 86 | 87 | #define RB_RED 0 88 | #define RB_BLACK 1 89 | 90 | #define __rb_parent(pc) ((struct rb_node *)(pc & ~3)) 91 | 92 | #define 
__rb_color(pc) ((pc) & 1) 93 | #define __rb_is_black(pc) __rb_color(pc) 94 | #define __rb_is_red(pc) (!__rb_color(pc)) 95 | #define rb_color(rb) __rb_color((rb)->__rb_parent_color) 96 | #define rb_is_red(rb) __rb_is_red((rb)->__rb_parent_color) 97 | #define rb_is_black(rb) __rb_is_black((rb)->__rb_parent_color) 98 | 99 | static inline void rb_set_parent(struct rb_node *rb, struct rb_node *p) 100 | { 101 | rb->__rb_parent_color = rb_color(rb) | (unsigned long)p; 102 | } 103 | 104 | static inline void rb_set_parent_color(struct rb_node *rb, 105 | struct rb_node *p, int color) 106 | { 107 | rb->__rb_parent_color = (unsigned long)p | color; 108 | } 109 | 110 | static inline void 111 | __rb_change_child(struct rb_node *old, struct rb_node *new, 112 | struct rb_node *parent, struct rb_root *root) 113 | { 114 | if (parent) { 115 | if (parent->rb_left == old) 116 | parent->rb_left = new; 117 | else 118 | parent->rb_right = new; 119 | } else 120 | root->rb_node = new; 121 | } 122 | 123 | extern void __rb_erase_color(struct rb_node *parent, struct rb_root *root, 124 | void (*augment_rotate)(struct rb_node *old, struct rb_node *new)); 125 | 126 | static inline struct rb_node * 127 | __rb_erase_augmented(struct rb_node *node, struct rb_root *root, 128 | const struct rb_augment_callbacks *augment) 129 | { 130 | struct rb_node *child = node->rb_right, *tmp = node->rb_left; 131 | struct rb_node *parent, *rebalance; 132 | unsigned long pc; 133 | 134 | if (!tmp) { 135 | /* 136 | * Case 1: node to erase has no more than 1 child (easy!) 137 | * 138 | * Note that if there is one child it must be red due to 5) 139 | * and node must be black due to 4). We adjust colors locally 140 | * so as to bypass __rb_erase_color() later on. 141 | */ 142 | pc = node->__rb_parent_color; 143 | parent = __rb_parent(pc); 144 | __rb_change_child(node, child, parent, root); 145 | if (child) { 146 | child->__rb_parent_color = pc; 147 | rebalance = NULL; 148 | } else 149 | rebalance = __rb_is_black(pc) ? 
parent : NULL; 150 | tmp = parent; 151 | } else if (!child) { 152 | /* Still case 1, but this time the child is node->rb_left */ 153 | tmp->__rb_parent_color = pc = node->__rb_parent_color; 154 | parent = __rb_parent(pc); 155 | __rb_change_child(node, tmp, parent, root); 156 | rebalance = NULL; 157 | tmp = parent; 158 | } else { 159 | struct rb_node *successor = child, *child2; 160 | tmp = child->rb_left; 161 | if (!tmp) { 162 | /* 163 | * Case 2: node's successor is its right child 164 | * 165 | * (n) (s) 166 | * / \ / \ 167 | * (x) (s) -> (x) (c) 168 | * \ 169 | * (c) 170 | */ 171 | parent = successor; 172 | child2 = successor->rb_right; 173 | augment->copy(node, successor); 174 | } else { 175 | /* 176 | * Case 3: node's successor is leftmost under 177 | * node's right child subtree 178 | * 179 | * (n) (s) 180 | * / \ / \ 181 | * (x) (y) -> (x) (y) 182 | * / / 183 | * (p) (p) 184 | * / / 185 | * (s) (c) 186 | * \ 187 | * (c) 188 | */ 189 | do { 190 | parent = successor; 191 | successor = tmp; 192 | tmp = tmp->rb_left; 193 | } while (tmp); 194 | parent->rb_left = child2 = successor->rb_right; 195 | successor->rb_right = child; 196 | rb_set_parent(child, successor); 197 | augment->copy(node, successor); 198 | augment->propagate(parent, successor); 199 | } 200 | 201 | successor->rb_left = tmp = node->rb_left; 202 | rb_set_parent(tmp, successor); 203 | 204 | pc = node->__rb_parent_color; 205 | tmp = __rb_parent(pc); 206 | __rb_change_child(node, successor, tmp, root); 207 | if (child2) { 208 | successor->__rb_parent_color = pc; 209 | rb_set_parent_color(child2, parent, RB_BLACK); 210 | rebalance = NULL; 211 | } else { 212 | unsigned long pc2 = successor->__rb_parent_color; 213 | successor->__rb_parent_color = pc; 214 | rebalance = __rb_is_black(pc2) ? 
parent : NULL; 215 | } 216 | tmp = successor; 217 | } 218 | 219 | augment->propagate(tmp, NULL); 220 | return rebalance; 221 | } 222 | 223 | static inline void 224 | rb_erase_augmented(struct rb_node *node, struct rb_root *root, 225 | const struct rb_augment_callbacks *augment) 226 | { 227 | struct rb_node *rebalance = __rb_erase_augmented(node, root, augment); 228 | if (rebalance) 229 | __rb_erase_color(rebalance, root, augment->rotate); 230 | } 231 | 232 | #endif /* _LINUX_RBTREE_AUGMENTED_H */ 233 | -------------------------------------------------------------------------------- /utils/rbtree/rbtree.txt: -------------------------------------------------------------------------------- 1 | Red-black Trees (rbtree) in Linux 2 | January 18, 2007 3 | Rob Landley 4 | ============================= 5 | 6 | What are red-black trees, and what are they for? 7 | ------------------------------------------------ 8 | 9 | Red-black trees are a type of self-balancing binary search tree, used for 10 | storing sortable key/value data pairs. This differs from radix trees (which 11 | are used to efficiently store sparse arrays and thus use long integer indexes 12 | to insert/access/delete nodes) and hash tables (which are not kept sorted to 13 | be easily traversed in order, and must be tuned for a specific size and 14 | hash function where rbtrees scale gracefully storing arbitrary keys). 15 | 16 | Red-black trees are similar to AVL trees, but provide faster real-time bounded 17 | worst case performance for insertion and deletion (at most two rotations and 18 | three rotations, respectively, to balance the tree), with slightly slower 19 | (but still O(log n)) lookup time. 20 | 21 | To quote Linux Weekly News: 22 | 23 | There are a number of red-black trees in use in the kernel. 24 | The deadline and CFQ I/O schedulers employ rbtrees to 25 | track requests; the packet CD/DVD driver does the same. 
26 | The high-resolution timer code uses an rbtree to organize outstanding 27 | timer requests. The ext3 filesystem tracks directory entries in a 28 | red-black tree. Virtual memory areas (VMAs) are tracked with red-black 29 | trees, as are epoll file descriptors, cryptographic keys, and network 30 | packets in the "hierarchical token bucket" scheduler. 31 | 32 | This document covers use of the Linux rbtree implementation. For more 33 | information on the nature and implementation of Red Black Trees, see: 34 | 35 | Linux Weekly News article on red-black trees 36 | http://lwn.net/Articles/184495/ 37 | 38 | Wikipedia entry on red-black trees 39 | http://en.wikipedia.org/wiki/Red-black_tree 40 | 41 | Linux implementation of red-black trees 42 | --------------------------------------- 43 | 44 | Linux's rbtree implementation lives in the file "lib/rbtree.c". To use it, 45 | "#include <linux/rbtree.h>". 46 | 47 | The Linux rbtree implementation is optimized for speed, and thus has one 48 | less layer of indirection (and better cache locality) than more traditional 49 | tree implementations. Instead of using pointers to separate rb_node and data 50 | structures, each instance of struct rb_node is embedded in the data structure 51 | it organizes. And instead of using a comparison callback function pointer, 52 | users are expected to write their own tree search and insert functions 53 | which call the provided rbtree functions. Locking is also left up to the 54 | user of the rbtree code. 55 | 56 | Creating a new rbtree 57 | --------------------- 58 | 59 | Data nodes in an rbtree are structures containing a struct rb_node member: 60 | 61 | struct mytype { 62 | struct rb_node node; 63 | char *keystring; 64 | }; 65 | 66 | When dealing with a pointer to the embedded struct rb_node, the containing data 67 | structure may be accessed with the standard container_of() macro. In addition, 68 | individual members may be accessed directly via rb_entry(node, type, member).
69 | 70 | At the root of each rbtree is an rb_root structure, which is initialized to be 71 | empty via: 72 | 73 | struct rb_root mytree = RB_ROOT; 74 | 75 | Searching for a value in an rbtree 76 | ---------------------------------- 77 | 78 | Writing a search function for your tree is fairly straightforward: start at the 79 | root, compare each value, and follow the left or right branch as necessary. 80 | 81 | Example: 82 | 83 | struct mytype *my_search(struct rb_root *root, char *string) 84 | { 85 | struct rb_node *node = root->rb_node; 86 | 87 | while (node) { 88 | struct mytype *data = container_of(node, struct mytype, node); 89 | int result; 90 | 91 | result = strcmp(string, data->keystring); 92 | 93 | if (result < 0) 94 | node = node->rb_left; 95 | else if (result > 0) 96 | node = node->rb_right; 97 | else 98 | return data; 99 | } 100 | return NULL; 101 | } 102 | 103 | Inserting data into an rbtree 104 | ----------------------------- 105 | 106 | Inserting data in the tree involves first searching for the place to insert the 107 | new node, then inserting the node and rebalancing ("recoloring") the tree. 108 | 109 | The search for insertion differs from the previous search by finding the 110 | location of the pointer on which to graft the new node. The new node also 111 | needs a link to its parent node for rebalancing purposes. 112 | 113 | Example: 114 | 115 | int my_insert(struct rb_root *root, struct mytype *data) 116 | { 117 | struct rb_node **new = &(root->rb_node), *parent = NULL; 118 | 119 | /* Figure out where to put new node */ 120 | while (*new) { 121 | struct mytype *this = container_of(*new, struct mytype, node); 122 | int result = strcmp(data->keystring, this->keystring); 123 | 124 | parent = *new; 125 | if (result < 0) 126 | new = &((*new)->rb_left); 127 | else if (result > 0) 128 | new = &((*new)->rb_right); 129 | else 130 | return FALSE; 131 | } 132 | 133 | /* Add new node and rebalance tree. 
*/ 134 | rb_link_node(&data->node, parent, new); 135 | rb_insert_color(&data->node, root); 136 | 137 | return TRUE; 138 | } 139 | 140 | Removing or replacing existing data in an rbtree 141 | ------------------------------------------------ 142 | 143 | To remove an existing node from a tree, call: 144 | 145 | void rb_erase(struct rb_node *victim, struct rb_root *tree); 146 | 147 | Example: 148 | 149 | struct mytype *data = my_search(&mytree, "walrus"); 150 | 151 | if (data) { 152 | rb_erase(&data->node, &mytree); 153 | myfree(data); 154 | } 155 | 156 | To replace an existing node in a tree with a new one with the same key, call: 157 | 158 | void rb_replace_node(struct rb_node *old, struct rb_node *new, 159 | struct rb_root *tree); 160 | 161 | Replacing a node this way does not re-sort the tree: If the new node doesn't 162 | have the same key as the old node, the rbtree will probably become corrupted. 163 | 164 | Iterating through the elements stored in an rbtree (in sort order) 165 | ------------------------------------------------------------------ 166 | 167 | Four functions are provided for iterating through an rbtree's contents in 168 | sorted order. These work on arbitrary trees, and should not need to be 169 | modified or wrapped (except for locking purposes): 170 | 171 | struct rb_node *rb_first(struct rb_root *tree); 172 | struct rb_node *rb_last(struct rb_root *tree); 173 | struct rb_node *rb_next(struct rb_node *node); 174 | struct rb_node *rb_prev(struct rb_node *node); 175 | 176 | To start iterating, call rb_first() or rb_last() with a pointer to the root 177 | of the tree, which will return a pointer to the node structure contained in 178 | the first or last element in the tree. To continue, fetch the next or previous 179 | node by calling rb_next() or rb_prev() on the current node. This will return 180 | NULL when there are no more nodes left.
181 | 182 | The iterator functions return a pointer to the embedded struct rb_node, from 183 | which the containing data structure may be accessed with the container_of() 184 | macro, and individual members may be accessed directly via 185 | rb_entry(node, type, member). 186 | 187 | Example: 188 | 189 | struct rb_node *node; 190 | for (node = rb_first(&mytree); node; node = rb_next(node)) 191 | printk("key=%s\n", rb_entry(node, struct mytype, node)->keystring); 192 | 193 | Support for Augmented rbtrees 194 | ----------------------------- 195 | 196 | Augmented rbtree is an rbtree with "some" additional data stored in 197 | each node, where the additional data for node N must be a function of 198 | the contents of all nodes in the subtree rooted at N. This data can 199 | be used to augment some new functionality to rbtree. Augmented rbtree 200 | is an optional feature built on top of basic rbtree infrastructure. 201 | An rbtree user who wants this feature will have to call the augmentation 202 | functions with the user provided augmentation callback when inserting 203 | and erasing nodes. 204 | 205 | C files implementing augmented rbtree manipulation must include 206 | <linux/rbtree_augmented.h> instead of <linux/rbtree.h>. Note that 207 | linux/rbtree_augmented.h exposes some rbtree implementation details 208 | you are not expected to rely on; please stick to the documented APIs 209 | there and do not include <linux/rbtree_augmented.h> from header files 210 | either so as to minimize chances of your users accidentally relying on 211 | such implementation details. 212 | 213 | On insertion, the user must update the augmented information on the path 214 | leading to the inserted node, then call rb_link_node() as usual and 215 | rb_insert_augmented() instead of the usual rb_insert_color() call. 216 | If rb_insert_augmented() rebalances the rbtree, it will call back into 217 | a user provided function to update the augmented information on the 218 | affected subtrees.
219 | 220 | When erasing a node, the user must call rb_erase_augmented() instead of 221 | rb_erase(). rb_erase_augmented() calls back into user provided functions 222 | to update the augmented information on affected subtrees. 223 | 224 | In both cases, the callbacks are provided through struct rb_augment_callbacks. 225 | 3 callbacks must be defined: 226 | 227 | - A propagation callback, which updates the augmented value for a given 228 | node and its ancestors, up to a given stop point (or NULL to update 229 | all the way to the root). 230 | 231 | - A copy callback, which copies the augmented value for a given subtree 232 | to a newly assigned subtree root. 233 | 234 | - A tree rotation callback, which copies the augmented value for a given 235 | subtree to a newly assigned subtree root AND recomputes the augmented 236 | information for the former subtree root. 237 | 238 | The compiled code for rb_erase_augmented() may inline the propagation and 239 | copy callbacks, which results in a large function, so each augmented rbtree 240 | user should have a single rb_erase_augmented() call site in order to limit 241 | compiled code size. 242 | 243 | 244 | Sample usage: 245 | 246 | Interval tree is an example of augmented rb tree. Reference - 247 | "Introduction to Algorithms" by Cormen, Leiserson, Rivest and Stein. 248 | More details about interval trees: 249 | 250 | Classical rbtree has a single key and it cannot be directly used to store 251 | interval ranges like [lo:hi] and do a quick lookup for any overlap with a new 252 | lo:hi or to find whether there is an exact match for a new lo:hi. 253 | 254 | However, rbtree can be augmented to store such interval ranges in a structured 255 | way making it possible to do efficient lookup and exact match. 256 | 257 | This "extra information" stored in each node is the maximum hi 258 | (max_hi) value among all the nodes that are its descendants.
This 259 | information can be maintained at each node just by looking at the node 260 | and its immediate children. And this will be used in O(log n) lookup 261 | for lowest match (lowest start address among all possible matches) 262 | with something like: 263 | 264 | struct interval_tree_node * 265 | interval_tree_first_match(struct rb_root *root, 266 | unsigned long start, unsigned long last) 267 | { 268 | struct interval_tree_node *node; 269 | 270 | if (!root->rb_node) 271 | return NULL; 272 | node = rb_entry(root->rb_node, struct interval_tree_node, rb); 273 | 274 | while (true) { 275 | if (node->rb.rb_left) { 276 | struct interval_tree_node *left = 277 | rb_entry(node->rb.rb_left, 278 | struct interval_tree_node, rb); 279 | if (left->__subtree_last >= start) { 280 | /* 281 | * Some nodes in left subtree satisfy Cond2. 282 | * Iterate to find the leftmost such node N. 283 | * If it also satisfies Cond1, that's the match 284 | * we are looking for. Otherwise, there is no 285 | * matching interval as nodes to the right of N 286 | * can't satisfy Cond1 either.
287 | */ 288 | node = left; 289 | continue; 290 | } 291 | } 292 | if (node->start <= last) { /* Cond1 */ 293 | if (node->last >= start) /* Cond2 */ 294 | return node; /* node is leftmost match */ 295 | if (node->rb.rb_right) { 296 | node = rb_entry(node->rb.rb_right, 297 | struct interval_tree_node, rb); 298 | if (node->__subtree_last >= start) 299 | continue; 300 | } 301 | } 302 | return NULL; /* No match */ 303 | } 304 | } 305 | 306 | Insertion/removal are defined using the following augmented callbacks: 307 | 308 | static inline unsigned long 309 | compute_subtree_last(struct interval_tree_node *node) 310 | { 311 | unsigned long max = node->last, subtree_last; 312 | if (node->rb.rb_left) { 313 | subtree_last = rb_entry(node->rb.rb_left, 314 | struct interval_tree_node, rb)->__subtree_last; 315 | if (max < subtree_last) 316 | max = subtree_last; 317 | } 318 | if (node->rb.rb_right) { 319 | subtree_last = rb_entry(node->rb.rb_right, 320 | struct interval_tree_node, rb)->__subtree_last; 321 | if (max < subtree_last) 322 | max = subtree_last; 323 | } 324 | return max; 325 | } 326 | 327 | static void augment_propagate(struct rb_node *rb, struct rb_node *stop) 328 | { 329 | while (rb != stop) { 330 | struct interval_tree_node *node = 331 | rb_entry(rb, struct interval_tree_node, rb); 332 | unsigned long subtree_last = compute_subtree_last(node); 333 | if (node->__subtree_last == subtree_last) 334 | break; 335 | node->__subtree_last = subtree_last; 336 | rb = rb_parent(&node->rb); 337 | } 338 | } 339 | 340 | static void augment_copy(struct rb_node *rb_old, struct rb_node *rb_new) 341 | { 342 | struct interval_tree_node *old = 343 | rb_entry(rb_old, struct interval_tree_node, rb); 344 | struct interval_tree_node *new = 345 | rb_entry(rb_new, struct interval_tree_node, rb); 346 | 347 | new->__subtree_last = old->__subtree_last; 348 | } 349 | 350 | static void augment_rotate(struct rb_node *rb_old, struct rb_node *rb_new) 351 | { 352 | struct interval_tree_node *old = 353 
| rb_entry(rb_old, struct interval_tree_node, rb); 354 | struct interval_tree_node *new = 355 | rb_entry(rb_new, struct interval_tree_node, rb); 356 | 357 | new->__subtree_last = old->__subtree_last; 358 | old->__subtree_last = compute_subtree_last(old); 359 | } 360 | 361 | static const struct rb_augment_callbacks augment_callbacks = { 362 | augment_propagate, augment_copy, augment_rotate 363 | }; 364 | 365 | void interval_tree_insert(struct interval_tree_node *node, 366 | struct rb_root *root) 367 | { 368 | struct rb_node **link = &root->rb_node, *rb_parent = NULL; 369 | unsigned long start = node->start, last = node->last; 370 | struct interval_tree_node *parent; 371 | 372 | while (*link) { 373 | rb_parent = *link; 374 | parent = rb_entry(rb_parent, struct interval_tree_node, rb); 375 | if (parent->__subtree_last < last) 376 | parent->__subtree_last = last; 377 | if (start < parent->start) 378 | link = &parent->rb.rb_left; 379 | else 380 | link = &parent->rb.rb_right; 381 | } 382 | 383 | node->__subtree_last = last; 384 | rb_link_node(&node->rb, rb_parent, link); 385 | rb_insert_augmented(&node->rb, root, &augment_callbacks); 386 | } 387 | 388 | void interval_tree_remove(struct interval_tree_node *node, 389 | struct rb_root *root) 390 | { 391 | rb_erase_augmented(&node->rb, root, &augment_callbacks); 392 | } 393 | -------------------------------------------------------------------------------- /utils/rbtree/src/rbtree.c: -------------------------------------------------------------------------------- 1 | /* 2 | Red Black Trees 3 | (C) 1999 Andrea Arcangeli 4 | (C) 2002 David Woodhouse 5 | (C) 2012 Michel Lespinasse 6 | 7 | This program is free software; you can redistribute it and/or modify 8 | it under the terms of the GNU General Public License as published by 9 | the Free Software Foundation; either version 2 of the License, or 10 | (at your option) any later version. 
11 | 12 | This program is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU General Public License for more details. 16 | 17 | You should have received a copy of the GNU General Public License 18 | along with this program; if not, write to the Free Software 19 | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 20 | 21 | linux/lib/rbtree.c 22 | */ 23 | 24 | #include 25 | #include 26 | 27 | /* 28 | * red-black trees properties: http://en.wikipedia.org/wiki/Rbtree 29 | * 30 | * 1) A node is either red or black 31 | * 2) The root is black 32 | * 3) All leaves (NULL) are black 33 | * 4) Both children of every red node are black 34 | * 5) Every simple path from root to leaves contains the same number 35 | * of black nodes. 36 | * 37 | * 4 and 5 give the O(log n) guarantee, since 4 implies you cannot have two 38 | * consecutive red nodes in a path and every red node is therefore followed by 39 | * a black. So if B is the number of black nodes on every simple path (as per 40 | * 5), then the longest possible path due to 4 is 2B. 41 | * 42 | * We shall indicate color with case, where black nodes are uppercase and red 43 | * nodes will be lowercase. Unknown color nodes shall be drawn as red within 44 | * parentheses and have some accompanying text comment. 45 | */ 46 | 47 | static inline void rb_set_black(struct rb_node *rb) 48 | { 49 | rb->__rb_parent_color |= RB_BLACK; 50 | } 51 | 52 | static inline struct rb_node *rb_red_parent(struct rb_node *red) 53 | { 54 | return (struct rb_node *)red->__rb_parent_color; 55 | } 56 | 57 | /* 58 | * Helper function for rotations: 59 | * - old's parent and color get assigned to new 60 | * - old gets assigned new as a parent and 'color' as a color. 
61 | */ 62 | static inline void 63 | __rb_rotate_set_parents(struct rb_node *old, struct rb_node *new, 64 | struct rb_root *root, int color) 65 | { 66 | struct rb_node *parent = rb_parent(old); 67 | new->__rb_parent_color = old->__rb_parent_color; 68 | rb_set_parent_color(old, new, color); 69 | __rb_change_child(old, new, parent, root); 70 | } 71 | 72 | static inline void 73 | __rb_insert(struct rb_node *node, struct rb_root *root, 74 | void (*augment_rotate)(struct rb_node *old, struct rb_node *new)) 75 | { 76 | struct rb_node *parent = rb_red_parent(node), *gparent, *tmp; 77 | 78 | while (true) { 79 | /* 80 | * Loop invariant: node is red 81 | * 82 | * If there is a black parent, we are done. 83 | * Otherwise, take some corrective action as we don't 84 | * want a red root or two consecutive red nodes. 85 | */ 86 | if (!parent) { 87 | rb_set_parent_color(node, NULL, RB_BLACK); 88 | break; 89 | } else if (rb_is_black(parent)) 90 | break; 91 | 92 | gparent = rb_red_parent(parent); 93 | 94 | tmp = gparent->rb_right; 95 | if (parent != tmp) { /* parent == gparent->rb_left */ 96 | if (tmp && rb_is_red(tmp)) { 97 | /* 98 | * Case 1 - color flips 99 | * 100 | * G g 101 | * / \ / \ 102 | * p u --> P U 103 | * / / 104 | * n N 105 | * 106 | * However, since g's parent might be red, and 107 | * 4) does not allow this, we need to recurse 108 | * at g. 109 | */ 110 | rb_set_parent_color(tmp, gparent, RB_BLACK); 111 | rb_set_parent_color(parent, gparent, RB_BLACK); 112 | node = gparent; 113 | parent = rb_parent(node); 114 | rb_set_parent_color(node, parent, RB_RED); 115 | continue; 116 | } 117 | 118 | tmp = parent->rb_right; 119 | if (node == tmp) { 120 | /* 121 | * Case 2 - left rotate at parent 122 | * 123 | * G G 124 | * / \ / \ 125 | * p U --> n U 126 | * \ / 127 | * n p 128 | * 129 | * This still leaves us in violation of 4), the 130 | * continuation into Case 3 will fix that. 
131 | */ 132 | parent->rb_right = tmp = node->rb_left; 133 | node->rb_left = parent; 134 | if (tmp) 135 | rb_set_parent_color(tmp, parent, 136 | RB_BLACK); 137 | rb_set_parent_color(parent, node, RB_RED); 138 | augment_rotate(parent, node); 139 | parent = node; 140 | tmp = node->rb_right; 141 | } 142 | 143 | /* 144 | * Case 3 - right rotate at gparent 145 | * 146 | * G P 147 | * / \ / \ 148 | * p U --> n g 149 | * / \ 150 | * n U 151 | */ 152 | gparent->rb_left = tmp; /* == parent->rb_right */ 153 | parent->rb_right = gparent; 154 | if (tmp) 155 | rb_set_parent_color(tmp, gparent, RB_BLACK); 156 | __rb_rotate_set_parents(gparent, parent, root, RB_RED); 157 | augment_rotate(gparent, parent); 158 | break; 159 | } else { 160 | tmp = gparent->rb_left; 161 | if (tmp && rb_is_red(tmp)) { 162 | /* Case 1 - color flips */ 163 | rb_set_parent_color(tmp, gparent, RB_BLACK); 164 | rb_set_parent_color(parent, gparent, RB_BLACK); 165 | node = gparent; 166 | parent = rb_parent(node); 167 | rb_set_parent_color(node, parent, RB_RED); 168 | continue; 169 | } 170 | 171 | tmp = parent->rb_left; 172 | if (node == tmp) { 173 | /* Case 2 - right rotate at parent */ 174 | parent->rb_left = tmp = node->rb_right; 175 | node->rb_right = parent; 176 | if (tmp) 177 | rb_set_parent_color(tmp, parent, 178 | RB_BLACK); 179 | rb_set_parent_color(parent, node, RB_RED); 180 | augment_rotate(parent, node); 181 | parent = node; 182 | tmp = node->rb_left; 183 | } 184 | 185 | /* Case 3 - left rotate at gparent */ 186 | gparent->rb_right = tmp; /* == parent->rb_left */ 187 | parent->rb_left = gparent; 188 | if (tmp) 189 | rb_set_parent_color(tmp, gparent, RB_BLACK); 190 | __rb_rotate_set_parents(gparent, parent, root, RB_RED); 191 | augment_rotate(gparent, parent); 192 | break; 193 | } 194 | } 195 | } 196 | 197 | /* 198 | * Inline version for rb_erase() use - we want to be able to inline 199 | * and eliminate the dummy_rotate callback there 200 | */ 201 | static inline void 202 | 
____rb_erase_color(struct rb_node *parent, struct rb_root *root, 203 | void (*augment_rotate)(struct rb_node *old, struct rb_node *new)) 204 | { 205 | struct rb_node *node = NULL, *sibling, *tmp1, *tmp2; 206 | 207 | while (true) { 208 | /* 209 | * Loop invariants: 210 | * - node is black (or NULL on first iteration) 211 | * - node is not the root (parent is not NULL) 212 | * - All leaf paths going through parent and node have a 213 | * black node count that is 1 lower than other leaf paths. 214 | */ 215 | sibling = parent->rb_right; 216 | if (node != sibling) { /* node == parent->rb_left */ 217 | if (rb_is_red(sibling)) { 218 | /* 219 | * Case 1 - left rotate at parent 220 | * 221 | * P S 222 | * / \ / \ 223 | * N s --> p Sr 224 | * / \ / \ 225 | * Sl Sr N Sl 226 | */ 227 | parent->rb_right = tmp1 = sibling->rb_left; 228 | sibling->rb_left = parent; 229 | rb_set_parent_color(tmp1, parent, RB_BLACK); 230 | __rb_rotate_set_parents(parent, sibling, root, 231 | RB_RED); 232 | augment_rotate(parent, sibling); 233 | sibling = tmp1; 234 | } 235 | tmp1 = sibling->rb_right; 236 | if (!tmp1 || rb_is_black(tmp1)) { 237 | tmp2 = sibling->rb_left; 238 | if (!tmp2 || rb_is_black(tmp2)) { 239 | /* 240 | * Case 2 - sibling color flip 241 | * (p could be either color here) 242 | * 243 | * (p) (p) 244 | * / \ / \ 245 | * N S --> N s 246 | * / \ / \ 247 | * Sl Sr Sl Sr 248 | * 249 | * This leaves us violating 5) which 250 | * can be fixed by flipping p to black 251 | * if it was red, or by recursing at p. 252 | * p is red when coming from Case 1. 
253 | */ 254 | rb_set_parent_color(sibling, parent, 255 | RB_RED); 256 | if (rb_is_red(parent)) 257 | rb_set_black(parent); 258 | else { 259 | node = parent; 260 | parent = rb_parent(node); 261 | if (parent) 262 | continue; 263 | } 264 | break; 265 | } 266 | /* 267 | * Case 3 - right rotate at sibling 268 | * (p could be either color here) 269 | * 270 | * (p) (p) 271 | * / \ / \ 272 | * N S --> N Sl 273 | * / \ \ 274 | * sl Sr s 275 | * \ 276 | * Sr 277 | */ 278 | sibling->rb_left = tmp1 = tmp2->rb_right; 279 | tmp2->rb_right = sibling; 280 | parent->rb_right = tmp2; 281 | if (tmp1) 282 | rb_set_parent_color(tmp1, sibling, 283 | RB_BLACK); 284 | augment_rotate(sibling, tmp2); 285 | tmp1 = sibling; 286 | sibling = tmp2; 287 | } 288 | /* 289 | * Case 4 - left rotate at parent + color flips 290 | * (p and sl could be either color here. 291 | * After rotation, p becomes black, s acquires 292 | * p's color, and sl keeps its color) 293 | * 294 | * (p) (s) 295 | * / \ / \ 296 | * N S --> P Sr 297 | * / \ / \ 298 | * (sl) sr N (sl) 299 | */ 300 | parent->rb_right = tmp2 = sibling->rb_left; 301 | sibling->rb_left = parent; 302 | rb_set_parent_color(tmp1, sibling, RB_BLACK); 303 | if (tmp2) 304 | rb_set_parent(tmp2, parent); 305 | __rb_rotate_set_parents(parent, sibling, root, 306 | RB_BLACK); 307 | augment_rotate(parent, sibling); 308 | break; 309 | } else { 310 | sibling = parent->rb_left; 311 | if (rb_is_red(sibling)) { 312 | /* Case 1 - right rotate at parent */ 313 | parent->rb_left = tmp1 = sibling->rb_right; 314 | sibling->rb_right = parent; 315 | rb_set_parent_color(tmp1, parent, RB_BLACK); 316 | __rb_rotate_set_parents(parent, sibling, root, 317 | RB_RED); 318 | augment_rotate(parent, sibling); 319 | sibling = tmp1; 320 | } 321 | tmp1 = sibling->rb_left; 322 | if (!tmp1 || rb_is_black(tmp1)) { 323 | tmp2 = sibling->rb_right; 324 | if (!tmp2 || rb_is_black(tmp2)) { 325 | /* Case 2 - sibling color flip */ 326 | rb_set_parent_color(sibling, parent, 327 | RB_RED); 328 | 
if (rb_is_red(parent)) 329 | rb_set_black(parent); 330 | else { 331 | node = parent; 332 | parent = rb_parent(node); 333 | if (parent) 334 | continue; 335 | } 336 | break; 337 | } 338 | /* Case 3 - right rotate at sibling */ 339 | sibling->rb_right = tmp1 = tmp2->rb_left; 340 | tmp2->rb_left = sibling; 341 | parent->rb_left = tmp2; 342 | if (tmp1) 343 | rb_set_parent_color(tmp1, sibling, 344 | RB_BLACK); 345 | augment_rotate(sibling, tmp2); 346 | tmp1 = sibling; 347 | sibling = tmp2; 348 | } 349 | /* Case 4 - left rotate at parent + color flips */ 350 | parent->rb_left = tmp2 = sibling->rb_right; 351 | sibling->rb_right = parent; 352 | rb_set_parent_color(tmp1, sibling, RB_BLACK); 353 | if (tmp2) 354 | rb_set_parent(tmp2, parent); 355 | __rb_rotate_set_parents(parent, sibling, root, 356 | RB_BLACK); 357 | augment_rotate(parent, sibling); 358 | break; 359 | } 360 | } 361 | } 362 | 363 | /* Non-inline version for rb_erase_augmented() use */ 364 | void __rb_erase_color(struct rb_node *parent, struct rb_root *root, 365 | void (*augment_rotate)(struct rb_node *old, struct rb_node *new)) 366 | { 367 | ____rb_erase_color(parent, root, augment_rotate); 368 | } 369 | EXPORT_SYMBOL(__rb_erase_color); 370 | 371 | /* 372 | * Non-augmented rbtree manipulation functions. 373 | * 374 | * We use dummy augmented callbacks here, and have the compiler optimize them 375 | * out of the rb_insert_color() and rb_erase() function definitions. 
376 | */ 377 | 378 | static inline void dummy_propagate(struct rb_node *node, struct rb_node *stop) {} 379 | static inline void dummy_copy(struct rb_node *old, struct rb_node *new) {} 380 | static inline void dummy_rotate(struct rb_node *old, struct rb_node *new) {} 381 | 382 | static const struct rb_augment_callbacks dummy_callbacks = { 383 | dummy_propagate, dummy_copy, dummy_rotate 384 | }; 385 | 386 | void rb_insert_color(struct rb_node *node, struct rb_root *root) 387 | { 388 | __rb_insert(node, root, dummy_rotate); 389 | } 390 | EXPORT_SYMBOL(rb_insert_color); 391 | 392 | void rb_erase(struct rb_node *node, struct rb_root *root) 393 | { 394 | struct rb_node *rebalance; 395 | rebalance = __rb_erase_augmented(node, root, &dummy_callbacks); 396 | if (rebalance) 397 | ____rb_erase_color(rebalance, root, dummy_rotate); 398 | } 399 | EXPORT_SYMBOL(rb_erase); 400 | 401 | /* 402 | * Augmented rbtree manipulation functions. 403 | * 404 | * This instantiates the same __always_inline functions as in the non-augmented 405 | * case, but this time with user-defined callbacks. 406 | */ 407 | 408 | void __rb_insert_augmented(struct rb_node *node, struct rb_root *root, 409 | void (*augment_rotate)(struct rb_node *old, struct rb_node *new)) 410 | { 411 | __rb_insert(node, root, augment_rotate); 412 | } 413 | EXPORT_SYMBOL(__rb_insert_augmented); 414 | 415 | /* 416 | * This function returns the first node (in sort order) of the tree. 
417 | */ 418 | struct rb_node *rb_first(const struct rb_root *root) 419 | { 420 | struct rb_node *n; 421 | 422 | n = root->rb_node; 423 | if (!n) 424 | return NULL; 425 | while (n->rb_left) 426 | n = n->rb_left; 427 | return n; 428 | } 429 | EXPORT_SYMBOL(rb_first); 430 | 431 | struct rb_node *rb_last(const struct rb_root *root) 432 | { 433 | struct rb_node *n; 434 | 435 | n = root->rb_node; 436 | if (!n) 437 | return NULL; 438 | while (n->rb_right) 439 | n = n->rb_right; 440 | return n; 441 | } 442 | EXPORT_SYMBOL(rb_last); 443 | 444 | struct rb_node *rb_next(const struct rb_node *node) 445 | { 446 | struct rb_node *parent; 447 | 448 | if (RB_EMPTY_NODE(node)) 449 | return NULL; 450 | 451 | /* 452 | * If we have a right-hand child, go down and then left as far 453 | * as we can. 454 | */ 455 | if (node->rb_right) { 456 | node = node->rb_right; 457 | while (node->rb_left) 458 | node=node->rb_left; 459 | return (struct rb_node *)node; 460 | } 461 | 462 | /* 463 | * No right-hand children. Everything down and left is smaller than us, 464 | * so any 'next' node must be in the general direction of our parent. 465 | * Go up the tree; any time the ancestor is a right-hand child of its 466 | * parent, keep going up. First time it's a left-hand child of its 467 | * parent, said parent is our 'next' node. 468 | */ 469 | while ((parent = rb_parent(node)) && node == parent->rb_right) 470 | node = parent; 471 | 472 | return parent; 473 | } 474 | EXPORT_SYMBOL(rb_next); 475 | 476 | struct rb_node *rb_prev(const struct rb_node *node) 477 | { 478 | struct rb_node *parent; 479 | 480 | if (RB_EMPTY_NODE(node)) 481 | return NULL; 482 | 483 | /* 484 | * If we have a left-hand child, go down and then right as far 485 | * as we can. 486 | */ 487 | if (node->rb_left) { 488 | node = node->rb_left; 489 | while (node->rb_right) 490 | node=node->rb_right; 491 | return (struct rb_node *)node; 492 | } 493 | 494 | /* 495 | * No left-hand children. 
Go up till we find an ancestor which 496 | * is a right-hand child of its parent. 497 | */ 498 | while ((parent = rb_parent(node)) && node == parent->rb_left) 499 | node = parent; 500 | 501 | return parent; 502 | } 503 | EXPORT_SYMBOL(rb_prev); 504 | 505 | void rb_replace_node(struct rb_node *victim, struct rb_node *new, 506 | struct rb_root *root) 507 | { 508 | struct rb_node *parent = rb_parent(victim); 509 | 510 | /* Set the surrounding nodes to point to the replacement */ 511 | __rb_change_child(victim, new, parent, root); 512 | if (victim->rb_left) 513 | rb_set_parent(victim->rb_left, new); 514 | if (victim->rb_right) 515 | rb_set_parent(victim->rb_right, new); 516 | 517 | /* Copy the pointers/colour from the victim to the replacement */ 518 | *new = *victim; 519 | } 520 | EXPORT_SYMBOL(rb_replace_node); 521 | 522 | static struct rb_node *rb_left_deepest_node(const struct rb_node *node) 523 | { 524 | for (;;) { 525 | if (node->rb_left) 526 | node = node->rb_left; 527 | else if (node->rb_right) 528 | node = node->rb_right; 529 | else 530 | return (struct rb_node *)node; 531 | } 532 | } 533 | 534 | struct rb_node *rb_next_postorder(const struct rb_node *node) 535 | { 536 | const struct rb_node *parent; 537 | if (!node) 538 | return NULL; 539 | parent = rb_parent(node); 540 | 541 | /* If we're sitting on node, we've already seen our children */ 542 | if (parent && node == parent->rb_left && parent->rb_right) { 543 | /* If we are the parent's left node, go to the parent's right 544 | * node then all the way down to the left */ 545 | return rb_left_deepest_node(parent->rb_right); 546 | } else 547 | /* Otherwise we are the parent's right node, and the parent 548 | * should be next */ 549 | return (struct rb_node *)parent; 550 | } 551 | EXPORT_SYMBOL(rb_next_postorder); 552 | 553 | struct rb_node *rb_first_postorder(const struct rb_root *root) 554 | { 555 | if (!root->rb_node) 556 | return NULL; 557 | 558 | return rb_left_deepest_node(root->rb_node); 559 | } 560 | 
EXPORT_SYMBOL(rb_first_postorder); 561 | --------------------------------------------------------------------------------