├── .gitignore
├── .gitmodules
├── README.md
├── apps
    ├── memcached
    │   ├── libmemcached-1.0.18.tar.gz
    │   ├── mcperf-0.1.1.tar.gz
    │   ├── mcperf.md
    │   ├── memcached-1.4.21.tar.gz
    │   ├── mk
    │   └── run
    ├── redis
    │   ├── mk
    │   ├── pipeline.md
    │   ├── redis-2.8.17.tar.gz
    │   ├── run
    │   └── sentinel.md
    └── ssdb
    │   ├── master.tar.gz
    │   ├── mk
    │   └── run
├── benchmarks
    ├── README
    ├── mckey.c
    ├── reconf_bench.sh
    └── run.sh
├── eval
    ├── eval.py
    ├── mongoose_aget.cfg
    └── readme.txt
├── makefile.init
├── src
    ├── config-comp
    │   ├── config-dare.c
    │   └── config-proxy.c
    ├── dare
    │   ├── dare_ep_db.c
    │   ├── dare_ibv.c
    │   ├── dare_ibv_rc.c
    │   ├── dare_ibv_ud.c
    │   ├── dare_kvs_sm.c
    │   └── dare_server.c
    ├── db
    │   └── db-interface.c
    ├── include
    │   ├── config-comp
    │   │   ├── config-dare.h
    │   │   └── config-proxy.h
    │   ├── dare
    │   │   ├── dare.h
    │   │   ├── dare_client.h
    │   │   ├── dare_config.h
    │   │   ├── dare_ep_db.h
    │   │   ├── dare_ibv.h
    │   │   ├── dare_ibv_rc.h
    │   │   ├── dare_ibv_ud.h
    │   │   ├── dare_kvs_sm.h
    │   │   ├── dare_log.h
    │   │   ├── dare_server.h
    │   │   ├── dare_sm.h
    │   │   ├── debug.h
    │   │   ├── message.h
    │   │   └── timer.h
    │   ├── db
    │   │   └── db-interface.h
    │   ├── proxy
    │   │   └── proxy.h
    │   ├── rsm-interface.h
    │   └── util
    │   │   ├── common-header.h
    │   │   └── debug.h
    ├── proxy
    │   └── proxy.c
    └── spec_hooks.cpp
├── target
    ├── makefile
    ├── nodes.local.cfg
    ├── objects.mk
    ├── sources.mk
    └── src
    │   ├── config-comp
    │       └── subdir.mk
    │   ├── dare
    │       └── subdir.mk
    │   ├── db
    │       └── subdir.mk
    │   ├── proxy
    │       └── subdir.mk
    │   └── subdir.mk
└── utils
    ├── dep-lib
        ├── db-5.1.29.tar.gz
        ├── libconfig-1.4.9.tar.gz
        └── libev-4.15.tar.gz
    ├── mk
    ├── queue
        ├── queue.h
        └── tailq.c
    ├── rbtree
        ├── include
        │   ├── compiler.h
        │   ├── export.h
        │   ├── rbtree.h
        │   └── rbtree_augmented.h
        ├── rbtree.txt
        └── src
        │   └── rbtree.c
    └── uthash
        └── uthash.h


/.gitignore:
--------------------------------------------------------------------------------
 1 | # Prerequisites
 2 | *.d
 3 | 
 4 | # Compiled Object files
 5 | *.slo
 6 | *.lo
 7 | *.o
 8 | *.obj
 9 | obj
10 | 
11 | # Precompiled Headers
12 | *.gch
13 | *.pch
14 | 
15 | # Compiled Dynamic libraries
16 | *.so
17 | *.dylib
18 | *.dll
19 | 
20 | # Fortran module files
21 | *.mod
22 | *.smod
23 | 
24 | # Compiled Static libraries
25 | *.lai
26 | *.la
27 | *.a
28 | *.lib
29 | 
30 | # Executables
31 | *.exe
32 | *.out
33 | *.app
34 | bin
35 | 
36 | **/dep-lib/*
37 | !**/dep-lib/*.tar.gz
38 | **/.local
39 | *.dat
40 | 


--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "APUS/apps/test"]
2 | 	path = APUS/apps/test
3 | 	url = https://github.com/LaytonW/SSCCPP.git
4 | 	branch = master
5 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # APUS: fast and scalable paxos on RDMA
 2 | 
 3 | Build (Ubuntu Linux 15.04)
 4 | ----
 5 | 
 6 | The source code of APUS is based on DARE [HPDC'15]
 7 | ### Dependencies
 8 | Install libev, libconfig, libdb, libibverbs:
 9 | ```
10 | sudo apt-get install libev-dev libconfig-dev libdb-dev libibverbs-dev
11 | ```
12 | ### Build APUS
13 | Set env vars:
14 | ```
15 | export PAXOS_ROOT=<absolute path of RDMA-PAXOS>
16 | ```
17 | To perform a default build execute the following:
18 | ```
19 | cd target
20 | make clean; make
21 | ```
22 | Run examples
23 | ----
24 | 
25 | ### Run APUS with Redis
26 | 
27 | Install Redis:
28 | ```
29 | cd apps/redis
30 | ./mk
31 | ```
32 | Run APUS with Redis:
33 | ```
34 | cd benchmarks
35 | ./run.sh --app=redis
36 | ```
37 | 
38 | Contact
39 | ----
40 | 
41 | Please email Wang Cheng (wangch.will@gmail.com) if you have any problems with APUS.
42 | 


--------------------------------------------------------------------------------
/apps/memcached/libmemcached-1.0.18.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hku-systems/apus/896959f59b33fc695df753e4b65b4d564d67443d/apps/memcached/libmemcached-1.0.18.tar.gz


--------------------------------------------------------------------------------
/apps/memcached/mcperf-0.1.1.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hku-systems/apus/896959f59b33fc695df753e4b65b4d564d67443d/apps/memcached/mcperf-0.1.1.tar.gz


--------------------------------------------------------------------------------
/apps/memcached/mcperf.md:
--------------------------------------------------------------------------------
 1 | # twemperf (mcperf)
 2 | 
 3 | ## Building mcperf ##
 4 | 
 5 | To build mcperf from distribution tarball:
 6 | 
 7 |     $ ./configure
 8 |     $ make
 9 |     $ sudo make install
10 | 
11 | To build mcperf from distribution tarball in _debug mode_:
12 | 
13 |     $ CFLAGS="-ggdb3 -O0" ./configure --enable-debug
14 |     $ make
15 |     $ sudo make install
16 | 
17 | ## Help ##
18 | 
19 |     Usage: mcperf [-v verbosity level] [-o output file]
20 |                   [-s server] [-p port]
21 |                   [-n num-conns] [-N num-calls]
22 |                   [-r conn-rate] [-R call-rate]
23 | 
24 |     Options:
25 |       -v, --verbosity=N     : set logging level (default: 5, min: 0, max: 11)
26 |       -o, --output=S        : set logging file (default: stderr)
27 |       -s, --server=S        : set the hostname of the server (default: localhost)
28 |       -p, --port=N          : set the port number of the server (default: 11211)
29 |       -n, --num-conns=N     : set the number of connections to create (default: 1)
30 |       -N, --num-calls=N     : set the number of calls to create on each connection (default: 1)
31 |       -r, --conn-rate=R     : set the connection creation rate (default: 0 conns/sec)
32 |       -R, --call-rate=R     : set the call creation rate (default: 0 calls/sec)
33 | 
34 |       -q, --use-noreply     : set noreply for generated requests
35 |       ...
36 | 
37 | ## Design ##
38 | 
39 | 1. Single threaded.
40 | 2. Asynchronous I/O through non-blocking sockets and Linux epoll(7) syscall.
41 | 
42 | ## Examples ##
43 | 
44 | The following example creates **1000 connections** to a memcached server
45 | running on **localhost:11211**. The connections are created at the rate of
46 | **1000 conns/sec** and on every connection it sends **10 'set' requests** at
47 | the rate of **1000 reqs/sec** with the item sizes derived from a uniform
48 | distribution in the interval of [1,16) bytes.
49 | 
50 |     $ mcperf --linger=0 --timeout=5 --conn-rate=1000 --call-rate=1000 --num-calls=10 --num-conns=1000 --sizes=u1,16
51 | 
52 | The following example creates **100 connections** to a memcached server
53 | running on **localhost:11211**. Every connection is created after the previous
54 | connection is closed. On every connection we send **100 'set' requests** and
55 | every request is created after we have received the response for the previous
56 | request. All the set requests generated have a fixed item size of 1 byte.
57 | 
58 |     $ mcperf --linger=0 --conn-rate=0 --call-rate=0 --num-calls=100 --num-conns=100 --sizes=d1
59 | 
60 | The following example gives you all the details of what mcperf is doing.
61 | 
62 |     $ mcperf --call-rate=0 --num-calls=100 --num-conns=1 --verbosity=11
63 | 
64 | ## Protocol ##
65 | 
66 | ### Storage commands ###
67 | First, the client sends a command line which looks like this:
68 | 
69 |     <command name> <key> <flags> <exptime> <bytes> [noreply]\r\n
70 | 
71 | - `<command name>` is "set", "add", "replace", "append" or "prepend"
72 | 
73 | - `noreply` optional parameter instructs the server to not send the reply.
74 | 
75 | ### Error strings ###
76 | 
77 | Each command sent by a client may be answered with an error string from the server. These error strings come in three types:
78 | 
79 | - `SERVER_ERROR <error>\r\n`
80 | 
81 |   means some sort of server error prevents the server from carrying out the command. `<error>` is a human-readable error string. In cases of severe server errors, which make it impossible to continue serving the client (this shouldn't normally happen), the server will close the connection after sending the error line. This is the only case in which the server closes a connection to a client.


--------------------------------------------------------------------------------
/apps/memcached/memcached-1.4.21.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hku-systems/apus/896959f59b33fc695df753e4b65b4d564d67443d/apps/memcached/memcached-1.4.21.tar.gz


--------------------------------------------------------------------------------
/apps/memcached/mk:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # benchmark use memslap from libmemcached, source code in libmemcached-1.0.18/clients/memslap.cc
 4 | 
 5 | # variables
 6 | APP_VER=1.4.21
 7 | BEN_VER=1.0.18
 8 | APP_DIR=$PAXOS_ROOT/apps/memcached
 9 | 
10 | # working folder
11 | cd $APP_DIR
12 | 
13 | # remove folders
14 | rm -rf memcached-$APP_VER
15 | rm -rf install
16 | rm -rf libmemcached-$BEN_VER
17 | rm -rf benchmark
18 | 
19 | # download and extract
20 | if [ ! -f memcached-$APP_VER.tar.gz ]; then
21 |     wget http://www.memcached.org/files/memcached-$APP_VER.tar.gz
22 | fi
23 | tar zxvf memcached-$APP_VER.tar.gz
24 | 
25 | # build
26 | cd memcached-$APP_VER
27 | mkdir ../install
28 | ./configure --prefix=$APP_DIR/install
29 | make -j `nproc`
30 | make install
31 | 
32 | 
33 | # download benchmark
34 | cd ../
35 | if [ ! -f libmemcached-$BEN_VER.tar.gz ]; then
36 |     wget https://launchpad.net/libmemcached/1.0/1.0.18/+download/libmemcached-$BEN_VER.tar.gz
37 | fi
38 | tar zxvf libmemcached-$BEN_VER.tar.gz
39 | 
40 | 
41 | # build benchmark
42 | cd libmemcached-$BEN_VER
43 | mkdir ../benchmark
44 | ./configure --prefix=$APP_DIR/benchmark
45 | make -j `nproc`
46 | make install
47 | 
48 | 


--------------------------------------------------------------------------------
/apps/memcached/run:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # variables
 4 | APP_DIR=$PAXOS_ROOT/apps/memcached
 5 |  
 6 | # start server
 7 | # -p <num>      TCP port number to listen on (default: 11211)
 8 | # -d            run as a daemon
 9 | # -P <file>     save PID in <file>, only used with -d option
10 | # -m <num>      max memory to use for items in megabytes (default: 64 MB)
11 | # -M            return error on memory exhausted (rather than removing items)
12 | # -u <username> assume identity of <username> (only when run as root)
13 | cd $APP_DIR/install
14 | bin/memcached -p 11222 -P $APP_DIR/install/memcached.pid &
15 | sleep 1
16 | 
17 | # benchmack
18 | # Generates a load against a memcached custer of servers.
19 | # --concurrency=
20 | #        Number of users to simulate with load.
21 | # --execute-number=
22 | #        Number of times to execute the given test.
23 | # --servers=
24 | #        List which servers you wish to connect to.
25 | cd ../benchmark
26 | bin/memslap -s 127.0.0.1:11222 --concurrency=10 --execute-number=5000
27 | 
28 | # stop server
29 | cd ../install
30 | kill $(cat memcached.pid)
31 | rm memcached.pid
32 | 
33 | 


--------------------------------------------------------------------------------
/apps/redis/mk:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # benchmark redis-benchmark in redis-2.8.17/src/redis-benchmark.c
 4 | 
 5 | APP_VER=2.8.17
 6 | APP_DIR=$PAXOS_ROOT/apps/redis
 7 | 
 8 | # download
 9 | cd $APP_DIR
10 | rm -rf redis-$APP_VER
11 | rm -rf install
12 | if [ ! -f redis-$APP_VER.tar.gz ]; then
13 |     wget http://download.redis.io/releases/redis-$APP_VER.tar.gz
14 | fi
15 | tar zxvf redis-$APP_VER.tar.gz
16 | 
17 | # build
18 | cd redis-$APP_VER
19 | make
20 | make install PREFIX=$APP_DIR/install
21 | 


--------------------------------------------------------------------------------
/apps/redis/pipeline.md:
--------------------------------------------------------------------------------
 1 | ## Request/Response protocols and RTT
 2 | By default, every client sends the next command only after the reply to the previous command has been received. This means that the server will likely need a read call in order to read each command from every client, and the round-trip time (RTT) is paid for every command as well.
 3 | 
 4 | So for instance a four commands sequence is something like this:
 5 | - *Client*: INCR X
 6 | - *Server*: 1
 7 | - *Client*: INCR X
 8 | - *Server*: 2
 9 | - *Client*: INCR X
10 | - *Server*: 3
11 | - *Client*: INCR X
12 | - *Server*: 4
13 | 
14 | ## Redis Pipelining
15 | A Request/Response server can be implemented so that it is able to process new requests even if the client didn't already read the old responses. This way it is possible to send *multiple commands* to the server without waiting for the replies at all, and finally read the replies in a single step.
16 | 
17 | This is an example using the raw netcat utility:
18 | ```
19 | $ (printf "PING\r\nPING\r\nPING\r\n"; sleep 1) | nc localhost 6379
20 | +PONG
21 | +PONG
22 | +PONG
23 | ```
24 | This time we are not paying the cost of RTT for every call, but just one time for the three commands.
25 | 
26 | To be very explicit, with pipelining the order of operations of our very first example will be the following:
27 | - *Client*: INCR X
28 | - *Client*: INCR X
29 | - *Client*: INCR X
30 | - *Client*: INCR X
31 | - *Server*: 1
32 | - *Server*: 2
33 | - *Server*: 3
34 | - *Server*: 4
35 | 
36 | **IMPORTANT NOTE**: While the client sends commands using pipelining, the server will be forced to queue the replies, using memory. 
37 | 
38 | ## It's not just a matter of RTT
39 | Pipelining is not just a way to reduce the latency cost due to the round trip time; it also hugely improves the total number of operations you can perform per second on a given Redis server. This is because, without pipelining, serving each command is very cheap from the point of view of accessing the data structures and producing the reply, but very costly from the point of view of doing the socket I/O. This involves calling the `read()` and `write()` syscalls, which means going from user land to kernel land. The context switch is a huge speed penalty.
40 | 
41 | When pipelining is used, many commands are usually read with a single `read()` system call, and multiple replies are delivered with a single `write()` system call. Because of this, the number of total queries performed per second initially increases almost linearly with longer pipelines, and eventually reaches 10 times the baseline obtained not using pipelining, as you can see from the following graph:
42 | 


--------------------------------------------------------------------------------
/apps/redis/redis-2.8.17.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hku-systems/apus/896959f59b33fc695df753e4b65b4d564d67443d/apps/redis/redis-2.8.17.tar.gz


--------------------------------------------------------------------------------
/apps/redis/run:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # MONITOR is a debugging command that streams back every command processed by the Redis server.
 4 | # It can help in understanding what is happening to the database.
 5 | # $ redis-cli monitor
 6 | # 1339518083.107412 [0 127.0.0.1:60866] "keys" "*"
 7 | # 1339518087.877697 [0 127.0.0.1:60866] "dbsize"
 8 | # 1339518090.420270 [0 127.0.0.1:60866] "set" "x" "6"
 9 | # 1339518096.506257 [0 127.0.0.1:60866] "get" "x"
10 | # 1339518099.363765 [0 127.0.0.1:60866] "del" "x"
11 | # 1339518100.544926 [0 127.0.0.1:60866] "get" "x"
12 | 
13 | 
14 | # Measuring latency
15 | # showLatencyReport in redis-benchmark.c
16 | # for (i = 0; i < config.requests; i++) {
17 | #     # print config.latency
18 | # }
19 | 
20 | 
21 | ################################ SNAPSHOTTING  ################################
22 | #
23 | # Save the DB on disk:
24 | #
25 | #   save <seconds> <changes>
26 | #
27 | #   Will save the DB if both the given number of seconds and the given
28 | #   number of write operations against the DB occurred.
29 | #
30 | #   In the example below the behaviour will be to save:
31 | #   after 900 sec (15 min) if at least 1 key changed
32 | #   after 300 sec (5 min) if at least 10 keys changed
33 | #   after 60 sec if at least 10000 keys changed
34 | #
35 | #   Note: you can disable saving at all commenting all the "save" lines.
36 | 
37 | # save 900 1
38 | # save 300 10
39 | # save 60 10000
40 | 


--------------------------------------------------------------------------------
/apps/redis/sentinel.md:
--------------------------------------------------------------------------------
 1 | # Redis Sentinel Documentation
 2 | Redis Sentinel provides high availability for Redis. In practical terms this means that using Sentinel you can create a Redis deployment that withstands certain kinds of failures without human intervention.
 3 | 
 4 | Redis Sentinel also provides other collateral tasks such as monitoring, notifications and acts as a configuration provider for clients.
 5 | 
 6 | This is the full list of Sentinel capabilities at a macroscopical level (i.e. the *big picture*):
 7 | 
 8 | - **Automatic failover**. If a master is not working as expected, Sentinel can start a failover process where a slave is promoted to master, the other additional slaves are reconfigured to use the new master, and the applications using the Redis server informed about the new address to use when connecting.
 9 | 
10 | ## Distributed nature of Sentinel
11 | 
12 | Redis Sentinel is a distributed system:
13 | 
14 | Sentinel itself is designed to run in a configuration where there are multiple Sentinel processes cooperating together. The advantage of having multiple Sentinel processes cooperating are the following:
15 | 
16 | 1. Failure detection is performed when multiple Sentinels agree about the fact a given master is no longer available. This lowers the probability of false positives.
17 | 
18 | ## Example 2: basic setup with three boxes
19 | 
20 | This is a very simple setup, that has the advantage to be simple to tune for additional safety. It is based on three boxes, each box running both a Redis process and a Sentinel process.
21 | 
22 | ```
23 |        +----+
24 |        | M1 |
25 |        | S1 |
26 |        +----+
27 |           |
28 | +----+    |    +----+
29 | | R2 |----+----| R3 |
30 | | S2 |         | S3 |
31 | +----+         +----+
32 | 
33 | Configuration: quorum = 2
34 | ```
35 | If the master M1 fails, S2 and S3 will agree about the failure and will be able to authorize a failover, making clients able to continue.
36 | 
37 | In every Sentinel setup, because Redis is asynchronously replicated, there is always the risk of losing some writes, since a given acknowledged write may not be able to reach the slave which is promoted to master. However, in the above setup there is a higher risk due to clients partitioned away with an old master, like in the following picture:
38 | 
39 | ```
40 |          +----+
41 |          | M1 |
42 |          | S1 | <- C1 (writes will be lost)
43 |          +----+
44 |             |
45 |             /
46 |             /
47 | +------+    |    +----+
48 | | [M2] |----+----| R3 |
49 | | S2   |         | S3 |
50 | +------+         +----+
51 | ```
52 | 
53 | In this case a network partition isolated the old master M1, so the slave R2 is promoted to master. However clients, like C1, that are in the same partition as the old master, may continue to write data to the old master. This data will be lost forever since when the partition will heal, the master will be reconfigured as a slave of the new master, discarding its data set.
54 | 
55 | This problem can be mitigated using the following Redis replication feature, which allows a master to stop accepting writes if it detects that it is no longer able to transfer its writes to the specified number of slaves.
56 | 
57 | ```
58 | min-slaves-to-write 1
59 | min-slaves-max-lag 10
60 | ```
61 | 


--------------------------------------------------------------------------------
/apps/ssdb/master.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hku-systems/apus/896959f59b33fc695df753e4b65b4d564d67443d/apps/ssdb/master.tar.gz


--------------------------------------------------------------------------------
/apps/ssdb/mk:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # benchmark source in ssdb/master/tools/ssdb-bench.cpp
 4 | 
 5 | # variables
 6 | APP_DIR=$PAXOS_ROOT/apps/ssdb
 7 | 
 8 | # working folder
 9 | cd $APP_DIR
10 | 
11 | # remove folders
12 | rm -rf ssdb-master
13 | 
14 | # download and extract
15 | if [ ! -f master.tar.gz ]; then
16 |     wget https://github.com/ideawu/ssdb/archive/master.tar.gz
17 | fi
18 | tar zxvf master.tar.gz
19 | 
20 | # build
21 | cd ssdb-master
22 | CFLAGS="-g -O0" CXXFLAGS="-g -O0" make
23 | 
24 | # config
25 | sed -i 's/ip: 127.0.0.1/#ip: 127.0.0.1/' ./ssdb.conf
26 | sed -i 's/#ip: 0.0.0.0/ip: 0.0.0.0/' ./ssdb.conf
27 | 


--------------------------------------------------------------------------------
/apps/ssdb/run:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # variables
 4 | APP_DIR=$PAXOS_ROOT/apps/ssdb
 5 | 
 6 | # start servere
 7 | # ./ssdb-server [-d] /path/to/ssdb.conf [-s start|stop|restart]
 8 | # Options:
 9 | #     -d    run as daemon
10 | #     -s    option to start|stop|restart the server
11 | cd $APP_DIR/ssdb-master
12 | ./ssdb-server ssdb.conf
13 | sleep 1
14 | 
15 | # benchmark
16 | # ./ssdb-bench [ip] [port] [requests] [clients]
17 | # Options:
18 | #     ip          server ip (default 127.0.0.1)
19 | #     port        server port (default 8888)
20 | #     requests    Total number of requests (default 10000)
21 | #     clients     Number of parallel connections (default 50)
22 | ./tools/ssdb-bench 127.0.0.1 8888 10000 50
23 | # SSDB supports Redis network protocol, you can use Redis clients to connect to a SSDB server and operate on it.
24 | 
25 | # stop server
26 | kill -SIGINT $(cat ./var/ssdb.pid)


--------------------------------------------------------------------------------
/benchmarks/README:
--------------------------------------------------------------------------------
 1 | 
 2 | 
 3 | mckey program is used to test RDMA CM multicast setup and simple data transfer.
 4 | usage  : mckey [options]
 5 | options: -m       # multicast_address
 6 |          -s       # sender
 7 |          -b       # bind_address
 8 |          
 9 | Server: $ mckey -m 225.1.1.1 -b 10.22.1.1
10 | Client: $ mckey -m 225.1.1.1 -b 10.22.1.2 -s
11 | 
12 | 
13 | 


--------------------------------------------------------------------------------
/benchmarks/mckey.c:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright (c) 2005-2007 Intel Corporation.  All rights reserved.
  3 |  *
  4 |  * This software is available to you under a choice of one of two
  5 |  * licenses.  You may choose to be licensed under the terms of the GNU
  6 |  * General Public License (GPL) Version 2, available from the file
  7 |  * COPYING in the main directory of this source tree, or the
  8 |  * OpenIB.org BSD license below:
  9 |  *
 10 |  *     Redistribution and use in source and binary forms, with or
 11 |  *     without modification, are permitted provided that the following
 12 |  *     conditions are met:
 13 |  *
 14 |  *      - Redistributions of source code must retain the above
 15 |  *        copyright notice, this list of conditions and the following
 16 |  *        disclaimer.
 17 |  *
 18 |  *      - Redistributions in binary form must reproduce the above
 19 |  *        copyright notice, this list of conditions and the following
 20 |  *        disclaimer in the documentation and/or other materials
 21 |  *        provided with the distribution.
 22 |  *
 23 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 24 |  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 25 |  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 26 |  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 27 |  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 28 |  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 29 |  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 30 |  * SOFTWARE.
 31 |  *
 32 |  * BUILD COMMAND:
 33 |  * gcc -g -Wall -D_GNU_SOURCE -g -O2 -o examples/mckey examples/mckey.c -libverbs -lrdmacm
 34 |  *
 35 |  * $Id$
 36 |  */
 37 | 
 38 | #include <stdlib.h>
 39 | #include <string.h>
 40 | #include <stdio.h>
 41 | #include <errno.h>
 42 | #include <sys/types.h>
 43 | #include <netinet/in.h>
 44 | #include <arpa/inet.h>
 45 | #include <sys/socket.h>
 46 | #include <netdb.h>
 47 | #include <byteswap.h>
 48 | #include <unistd.h>
 49 | #include <getopt.h>
 50 | 
 51 | #include <rdma/rdma_cma.h>
 52 | 
 53 | struct cmatest_node {
 54 | 	int			id;
 55 | 	struct rdma_cm_id	*cma_id;
 56 | 	int			connected;
 57 | 	struct ibv_pd		*pd;
 58 | 	struct ibv_cq		*cq;
 59 | 	struct ibv_mr		*mr;
 60 | 	struct ibv_ah		*ah;
 61 | 	uint32_t		remote_qpn;
 62 | 	uint32_t		remote_qkey;
 63 | 	void			*mem;
 64 | };
 65 | 
 66 | struct cmatest {
 67 | 	struct rdma_event_channel *channel;
 68 | 	pthread_t 		cmathread;
 69 | 	struct cmatest_node	*nodes;
 70 | 	int			conn_index;
 71 | 	int			connects_left;
 72 | 
 73 | 	struct sockaddr_in6	dst_in;
 74 | 	struct sockaddr		*dst_addr;
 75 | 	struct sockaddr_in6	src_in;
 76 | 	struct sockaddr		*src_addr;
 77 | };
 78 | 
 79 | static struct cmatest test;
 80 | static int connections = 1;
 81 | static int message_size = 100;
 82 | static int message_count = 10;
 83 | static int is_sender;
 84 | static int unmapped_addr;
 85 | static char *dst_addr;
 86 | static char *src_addr;
 87 | static enum rdma_port_space port_space = RDMA_PS_UDP;
 88 | 
 89 | static int create_message(struct cmatest_node *node)
 90 | {
 91 | 	if (!message_size)
 92 | 		message_count = 0;
 93 | 
 94 | 	if (!message_count)
 95 | 		return 0;
 96 | 
 97 | 	node->mem = malloc(message_size + sizeof(struct ibv_grh));
 98 | 	if (!node->mem) {
 99 | 		printf("failed message allocation\n");
100 | 		return -1;
101 | 	}
102 | 	node->mr = ibv_reg_mr(node->pd, node->mem,
103 | 			      message_size + sizeof(struct ibv_grh),
104 | 			      IBV_ACCESS_LOCAL_WRITE);
105 | 	if (!node->mr) {
106 | 		printf("failed to reg MR\n");
107 | 		goto err;
108 | 	}
109 | 	return 0;
110 | err:
111 | 	free(node->mem);
112 | 	return -1;
113 | }
114 | 
115 | static int verify_test_params(struct cmatest_node *node)
116 | {
117 | 	struct ibv_port_attr port_attr;
118 | 	int ret;
119 | 
120 | 	ret = ibv_query_port(node->cma_id->verbs, node->cma_id->port_num,
121 | 			     &port_attr);
122 | 	if (ret)
123 | 		return ret;
124 | 
125 | 	if (message_count && message_size > (1 << (port_attr.active_mtu + 7))) {
126 | 		printf("mckey: message_size %d is larger than active mtu %d\n",
127 | 		       message_size, 1 << (port_attr.active_mtu + 7));
128 | 		return -EINVAL;
129 | 	}
130 | 
131 | 	return 0;
132 | }
133 | 
134 | static int init_node(struct cmatest_node *node)
135 | {
136 | 	struct ibv_qp_init_attr init_qp_attr;
137 | 	int cqe, ret;
138 | 
139 | 	node->pd = ibv_alloc_pd(node->cma_id->verbs);
140 | 	if (!node->pd) {
141 | 		ret = -ENOMEM;
142 | 		printf("mckey: unable to allocate PD\n");
143 | 		goto out;
144 | 	}
145 | 
146 | 	cqe = message_count ? message_count * 2 : 2;
147 | 	node->cq = ibv_create_cq(node->cma_id->verbs, cqe, node, 0, 0);
148 | 	if (!node->cq) {
149 | 		ret = -ENOMEM;
150 | 		printf("mckey: unable to create CQ\n");
151 | 		goto out;
152 | 	}
153 | 
154 | 	memset(&init_qp_attr, 0, sizeof init_qp_attr);
155 | 	init_qp_attr.cap.max_send_wr = message_count ? message_count : 1;
156 | 	init_qp_attr.cap.max_recv_wr = message_count ? message_count : 1;
157 | 	init_qp_attr.cap.max_send_sge = 1;
158 | 	init_qp_attr.cap.max_recv_sge = 1;
159 | 	init_qp_attr.qp_context = node;
160 | 	init_qp_attr.sq_sig_all = 0;
161 | 	init_qp_attr.qp_type = IBV_QPT_UD;
162 | 	init_qp_attr.send_cq = node->cq;
163 | 	init_qp_attr.recv_cq = node->cq;
164 | 	ret = rdma_create_qp(node->cma_id, node->pd, &init_qp_attr);
165 | 	if (ret) {
166 | 		perror("mckey: unable to create QP");
167 | 		goto out;
168 | 	}
169 | 
170 | 	ret = create_message(node);
171 | 	if (ret) {
172 | 		printf("mckey: failed to create messages: %d\n", ret);
173 | 		goto out;
174 | 	}
175 | out:
176 | 	return ret;
177 | }
178 | 
179 | static int post_recvs(struct cmatest_node *node)
180 | {
181 | 	struct ibv_recv_wr recv_wr, *recv_failure;
182 | 	struct ibv_sge sge;
183 | 	int i, ret = 0;
184 | 
185 | 	if (!message_count)
186 | 		return 0;
187 | 
188 | 	recv_wr.next = NULL;
189 | 	recv_wr.sg_list = &sge;
190 | 	recv_wr.num_sge = 1;
191 | 	recv_wr.wr_id = (uintptr_t) node;
192 | 
193 | 	sge.length = message_size + sizeof(struct ibv_grh);
194 | 	sge.lkey = node->mr->lkey;
195 | 	sge.addr = (uintptr_t) node->mem;
196 | 
197 | 	for (i = 0; i < message_count && !ret; i++ ) {
198 | 		ret = ibv_post_recv(node->cma_id->qp, &recv_wr, &recv_failure);
199 | 		if (ret) {
200 | 			printf("failed to post receives: %d\n", ret);
201 | 			break;
202 | 		}
203 | 	}
204 | 	return ret;
205 | }
206 | 
207 | static int post_sends(struct cmatest_node *node, int signal_flag)
208 | {
209 | 	struct ibv_send_wr send_wr, *bad_send_wr;
210 | 	struct ibv_sge sge;
211 | 	int i, ret = 0;
212 | 
213 | 	if (!node->connected || !message_count)
214 | 		return 0;
215 | 
216 | 	send_wr.next = NULL;
217 | 	send_wr.sg_list = &sge;
218 | 	send_wr.num_sge = 1;
219 | 	send_wr.opcode = IBV_WR_SEND_WITH_IMM;
220 | 	send_wr.send_flags = signal_flag;
221 | 	send_wr.wr_id = (unsigned long)node;
222 | 	send_wr.imm_data = htonl(node->cma_id->qp->qp_num);
223 | 
224 | 	send_wr.wr.ud.ah = node->ah;
225 | 	send_wr.wr.ud.remote_qpn = node->remote_qpn;
226 | 	send_wr.wr.ud.remote_qkey = node->remote_qkey;
227 | 
228 | 	sge.length = message_size;
229 | 	sge.lkey = node->mr->lkey;
230 | 	sge.addr = (uintptr_t) node->mem;
231 | 
232 | 	for (i = 0; i < message_count && !ret; i++) {
233 | 		ret = ibv_post_send(node->cma_id->qp, &send_wr, &bad_send_wr);
234 | 		if (ret)
235 | 			printf("failed to post sends: %d\n", ret);
236 | 	}
237 | 	return ret;
238 | }
239 | 
240 | static void connect_error(void)
241 | {
242 | 	test.connects_left--;
243 | }
244 | 
245 | static int addr_handler(struct cmatest_node *node)
246 | {
247 | 	int ret;
248 | 
249 | 	ret = verify_test_params(node);
250 | 	if (ret)
251 | 		goto err;
252 | 
253 | 	ret = init_node(node);
254 | 	if (ret)
255 | 		goto err;
256 | 
257 | 	if (!is_sender) {
258 | 		ret = post_recvs(node);
259 | 		if (ret)
260 | 			goto err;
261 | 	}
262 | 
263 | 	ret = rdma_join_multicast(node->cma_id, test.dst_addr, node);
264 | 	if (ret) {
265 | 		perror("mckey: failure joining");
266 | 		goto err;
267 | 	}
268 | 	return 0;
269 | err:
270 | 	connect_error();
271 | 	return ret;
272 | }
273 | 
274 | static int join_handler(struct cmatest_node *node,
275 | 			struct rdma_ud_param *param)
276 | {
277 | 	char buf[40];
278 | 
279 | 	inet_ntop(AF_INET6, param->ah_attr.grh.dgid.raw, buf, 40);
280 | 	printf("mckey: joined dgid: %s mlid 0x%x sl %d\n", buf,
281 | 		param->ah_attr.dlid, param->ah_attr.sl);
282 | 
283 | 	node->remote_qpn = param->qp_num;
284 | 	node->remote_qkey = param->qkey;
285 | 	node->ah = ibv_create_ah(node->pd, &param->ah_attr);
286 | 	if (!node->ah) {
287 | 		printf("mckey: failure creating address handle\n");
288 | 		goto err;
289 | 	}
290 | 
291 | 	node->connected = 1;
292 | 	test.connects_left--;
293 | 	return 0;
294 | err:
295 | 	connect_error();
296 | 	return -1;
297 | }
298 | 
299 | static int cma_handler(struct rdma_cm_id *cma_id, struct rdma_cm_event *event)
300 | {
301 | 	int ret = 0;
302 | 
303 | 	switch (event->event) {
304 | 	case RDMA_CM_EVENT_ADDR_RESOLVED:
305 | 		ret = addr_handler(cma_id->context);
306 | 		break;
307 | 	case RDMA_CM_EVENT_MULTICAST_JOIN:
308 | 		ret = join_handler(cma_id->context, &event->param.ud);
309 | 		break;
310 | 	case RDMA_CM_EVENT_ADDR_ERROR:
311 | 	case RDMA_CM_EVENT_ROUTE_ERROR:
312 | 	case RDMA_CM_EVENT_MULTICAST_ERROR:
313 | 		printf("mckey: event: %s, error: %d\n",
314 | 		       rdma_event_str(event->event), event->status);
315 | 		connect_error();
316 | 		ret = event->status;
317 | 		break;
318 | 	case RDMA_CM_EVENT_DEVICE_REMOVAL:
319 | 		/* Cleanup will occur after test completes. */
320 | 		break;
321 | 	default:
322 | 		break;
323 | 	}
324 | 	return ret;
325 | }
326 | 
327 | static void *cma_thread(void *arg)
328 | {
329 | 	struct rdma_cm_event *event;
330 | 	int ret;
331 | 
332 | 	while (1) {
333 | 		ret = rdma_get_cm_event(test.channel, &event);
334 | 		if (ret) {
335 | 			perror("rdma_get_cm_event");
336 | 			break;
337 | 		}
338 | 
339 | 		switch (event->event) {
340 | 		case RDMA_CM_EVENT_MULTICAST_ERROR:
341 | 		case RDMA_CM_EVENT_ADDR_CHANGE:
342 | 			printf("mckey: event: %s, status: %d\n",
343 | 			       rdma_event_str(event->event), event->status);
344 | 			break;
345 | 		default:
346 | 			break;
347 | 		}
348 | 
349 | 		rdma_ack_cm_event(event);
350 | 	}
351 | 	return NULL;
352 | }
353 | 
354 | static void destroy_node(struct cmatest_node *node)
355 | {
356 | 	if (!node->cma_id)
357 | 		return;
358 | 
359 | 	if (node->ah)
360 | 		ibv_destroy_ah(node->ah);
361 | 
362 | 	if (node->cma_id->qp)
363 | 		rdma_destroy_qp(node->cma_id);
364 | 
365 | 	if (node->cq)
366 | 		ibv_destroy_cq(node->cq);
367 | 
368 | 	if (node->mem) {
369 | 		ibv_dereg_mr(node->mr);
370 | 		free(node->mem);
371 | 	}
372 | 
373 | 	if (node->pd)
374 | 		ibv_dealloc_pd(node->pd);
375 | 
376 | 	/* Destroy the RDMA ID after all device resources */
377 | 	rdma_destroy_id(node->cma_id);
378 | }
379 | 
380 | static int alloc_nodes(void)
381 | {
382 | 	int ret, i;
383 | 
384 | 	test.nodes = malloc(sizeof *test.nodes * connections);
385 | 	if (!test.nodes) {
386 | 		printf("mckey: unable to allocate memory for test nodes\n");
387 | 		return -ENOMEM;
388 | 	}
389 | 	memset(test.nodes, 0, sizeof *test.nodes * connections);
390 | 
391 | 	for (i = 0; i < connections; i++) {
392 | 		test.nodes[i].id = i;
393 | 		ret = rdma_create_id(test.channel, &test.nodes[i].cma_id,
394 | 				     &test.nodes[i], port_space);
395 | 		if (ret)
396 | 			goto err;
397 | 	}
398 | 	return 0;
399 | err:
400 | 	while (--i >= 0)
401 | 		rdma_destroy_id(test.nodes[i].cma_id);
402 | 	free(test.nodes);
403 | 	return ret;
404 | }
405 | 
406 | static void destroy_nodes(void)
407 | {
408 | 	int i;
409 | 
410 | 	for (i = 0; i < connections; i++)
411 | 		destroy_node(&test.nodes[i]);
412 | 	free(test.nodes);
413 | }
414 | 
415 | static int poll_cqs(void)
416 | {
417 | 	struct ibv_wc wc[8];
418 | 	int done, i, ret;
419 | 
420 | 	for (i = 0; i < connections; i++) {
421 | 		if (!test.nodes[i].connected)
422 | 			continue;
423 | 
424 | 		for (done = 0; done < message_count; done += ret) {
425 | 			ret = ibv_poll_cq(test.nodes[i].cq, 8, wc);
426 | 			if (ret < 0) {
427 | 				printf("mckey: failed polling CQ: %d\n", ret);
428 | 				return ret;
429 | 			}
430 | 		}
431 | 	}
432 | 	return 0;
433 | }
434 | 
435 | static int connect_events(void)
436 | {
437 | 	struct rdma_cm_event *event;
438 | 	int ret = 0;
439 | 
440 | 	while (test.connects_left && !ret) {
441 | 		ret = rdma_get_cm_event(test.channel, &event);
442 | 		if (!ret) {
443 | 			ret = cma_handler(event->id, event);
444 | 			rdma_ack_cm_event(event);
445 | 		}
446 | 	}
447 | 	return ret;
448 | }
449 | 
/*
 * Resolve a host name or address string and copy the first result into
 * *addr.  Returns 0 on success or the getaddrinfo() error code.
 */
static int get_addr(char *dst, struct sockaddr *addr)
{
	struct addrinfo *info;
	int rc = getaddrinfo(dst, NULL, NULL, &info);

	if (rc != 0) {
		printf("getaddrinfo failed - invalid hostname or IP address\n");
		return rc;
	}

	/* Only the first entry of the result list is used. */
	memcpy(addr, info->ai_addr, info->ai_addrlen);
	freeaddrinfo(info);
	return 0;
}
465 | 
466 | static int run(void)
467 | {
468 | 	int i, ret;
469 | 
470 | 	printf("mckey: starting %s\n", is_sender ? "client" : "server");
471 | 	if (src_addr) {
472 | 		ret = get_addr(src_addr, (struct sockaddr *) &test.src_in);
473 | 		if (ret)
474 | 			return ret;
475 | 	}
476 | 
477 | 	ret = get_addr(dst_addr, (struct sockaddr *) &test.dst_in);
478 | 	if (ret)
479 | 		return ret;
480 | 
481 | 	printf("mckey: joining\n");
482 | 	for (i = 0; i < connections; i++) {
483 | 		if (src_addr) {
484 | 			ret = rdma_bind_addr(test.nodes[i].cma_id,
485 | 					     test.src_addr);
486 | 			if (ret) {
487 | 				perror("mckey: addr bind failure");
488 | 				connect_error();
489 | 				return ret;
490 | 			}
491 | 		}
492 | 
493 | 		if (unmapped_addr)
494 | 			ret = addr_handler(&test.nodes[i]);
495 | 		else
496 | 			ret = rdma_resolve_addr(test.nodes[i].cma_id,
497 | 						test.src_addr, test.dst_addr,
498 | 						2000);
499 | 		if (ret) {
500 | 			perror("mckey: resolve addr failure");
501 | 			connect_error();
502 | 			return ret;
503 | 		}
504 | 	}
505 | 
506 | 	ret = connect_events();
507 | 	if (ret)
508 | 		goto out;
509 | 
510 | 	pthread_create(&test.cmathread, NULL, cma_thread, NULL);
511 | 
512 | 	/*
513 | 	 * Pause to give SM chance to configure switches.  We don't want to
514 | 	 * handle reliability issue in this simple test program.
515 | 	 */
516 | 	sleep(3);
517 | 
518 | 	if (message_count) {
519 | 		if (is_sender) {
520 | 			printf("initiating data transfers\n");
521 | 			for (i = 0; i < connections; i++) {
522 | 				ret = post_sends(&test.nodes[i], 0);
523 | 				if (ret)
524 | 					goto out;
525 | 			}
526 | 		} else {
527 | 			printf("receiving data transfers\n");
528 | 			ret = poll_cqs();
529 | 			if (ret)
530 | 				goto out;
531 | 		}
532 | 		printf("data transfers complete\n");
533 | 	}
534 | out:
535 | 	for (i = 0; i < connections; i++) {
536 | 		ret = rdma_leave_multicast(test.nodes[i].cma_id,
537 | 					   test.dst_addr);
538 | 		if (ret)
539 | 			perror("mckey: failure leaving");
540 | 	}
541 | 	return ret;
542 | }
543 | 
544 | int main(int argc, char **argv)
545 | {
546 | 	int op, ret;
547 | 
548 | 
549 | 	while ((op = getopt(argc, argv, "m:M:sb:c:C:S:p:")) != -1) {
550 | 		switch (op) {
551 | 		case 'm':
552 | 			dst_addr = optarg;
553 | 			break;
554 | 		case 'M':
555 | 			unmapped_addr = 1;
556 | 			dst_addr = optarg;
557 | 			break;
558 | 		case 's':
559 | 			is_sender = 1;
560 | 			break;
561 | 		case 'b':
562 | 			src_addr = optarg;
563 | 			test.src_addr = (struct sockaddr *) &test.src_in;
564 | 			break;
565 | 		case 'c':
566 | 			connections = atoi(optarg);
567 | 			break;
568 | 		case 'C':
569 | 			message_count = atoi(optarg);
570 | 			break;
571 | 		case 'S':
572 | 			message_size = atoi(optarg);
573 | 			break;
574 | 		case 'p':
575 | 			port_space = strtol(optarg, NULL, 0);
576 | 			break;
577 | 		default:
578 | 			printf("usage: %s\n", argv[0]);
579 | 			printf("\t-m multicast_address\n");
580 | 			printf("\t[-M unmapped_multicast_address]\n"
581 | 			       "\t replaces -m and requires -b\n");
582 | 			printf("\t[-s(ender)]\n");
583 | 			printf("\t[-b bind_address]\n");
584 | 			printf("\t[-c connections]\n");
585 | 			printf("\t[-C message_count]\n");
586 | 			printf("\t[-S message_size]\n");
587 | 			printf("\t[-p port_space - %#x for UDP (default), "
588 | 			       "%#x for IPOIB]\n", RDMA_PS_UDP, RDMA_PS_IPOIB);
589 | 			exit(1);
590 | 		}
591 | 	}
592 | 
593 | 	if (unmapped_addr && !src_addr) {
594 | 		printf("unmapped multicast address requires binding "
595 | 			"to source address\n");
596 | 		exit(1);
597 | 	}
598 | 
599 | 	test.dst_addr = (struct sockaddr *) &test.dst_in;
600 | 	test.connects_left = connections;
601 | 
602 | 	test.channel = rdma_create_event_channel();
603 | 	if (!test.channel) {
604 | 		perror("failed to create event channel");
605 | 		exit(1);
606 | 	}
607 | 
608 | 	if (alloc_nodes())
609 | 		exit(1);
610 | 
611 | 	ret = run();
612 | 
613 | 	printf("test complete\n");
614 | 	destroy_nodes();
615 | 	rdma_destroy_event_channel(test.channel);
616 | 
617 | 	printf("return status %d\n", ret);
618 | 	return ret;
619 | }
620 | 


--------------------------------------------------------------------------------
/benchmarks/reconf_bench.sh:
--------------------------------------------------------------------------------
  1 | define(){ IFS='\n' read -r -d '' ${1} || true; }
  2 | declare -A pids
  3 | declare -A rounds
  4 | redirection=( "> out" "2> err" "< /dev/null" )
  5 | 
  6 | define HELP <<'EOF'
  7 | Script for starting DARE
  8 | usage  : $0 [options]
  9 | options: --app                # app to run
 10 | EOF
 11 | 
 12 | usage () {
 13 |     echo -e "$HELP"
 14 | }
 15 | 
 16 | timer_start () {
 17 | 	echo "$1"
 18 | 	t1=$(date +%s%N)
 19 | }
 20 | 
 21 | timer_stop () {
 22 | 	t2=$(date +%s%N)
 23 | 	echo "done ($(expr $t2 - $t1) nanoseconds)"
 24 | }
 25 | 
 26 | ErrorAndExit () {
 27 |   echo "ERROR: $1"
 28 |   exit 1
 29 | }
 30 | 
 31 | ForceAbsolutePath () {
 32 |   case "$2" in
 33 |     /* )
 34 |       ;;
 35 |     *)
 36 |       ErrorAndExit "Expected an absolute path for $1"
 37 |       ;;
 38 |   esac
 39 | }
 40 | 
 41 | StartDare() {
 42 |     for ((i=0; i<${group_size}; ++i)); do
 43 |         srv=${servers[$i]}
 44 |         config_dare=( "server_type=start" "server_idx=$i" "group_size=$group_size" "config_path=${DAREDIR}/target/nodes.local.cfg" "dare_log_file=$PWD/srv${i}_1.log" "mgid=$DGID" "LD_PRELOAD=${DAREDIR}/target/interpose.so" )
 45 |         cmd=( "ssh" "$USER@${servers[$i]}" "${config_dare[@]}" "nohup" "${run_dare}" "${redirection[@]}" "&" "echo \$!" )
 46 |         pids[$srv]=$("${cmd[@]}")
 47 |         rounds[$srv]=2
 48 |         #echo "StartDare COMMAND: "${cmd[@]}
 49 |         echo -e "\tp$i ($srv) -- pid=${pids[$srv]}"
 50 |         #echo -e enable interpretation of backslash escapes
 51 |     done
 52 |     #echo -e "\n\tinitial servers: ${!servers[]}${!pids[@]}"
 53 |     #echo -e "\t...and their PIDs: ${pids[@]}"
 54 | }
 55 | 
 56 | StopDare() {
 57 |     for srv in "${!pids[@]}"; do 
 58 |         #${!pids[@]}: expand to the list of array indices (keys) assigned in pids
 59 |         cmd=( "ssh" "$USER@$srv" "kill -2" "${pids[$srv]}" )
 60 |         echo "Executing: ${cmd[@]}"
 61 |         $("${cmd[@]}")
 62 |     done
 63 | }
 64 | 
 65 | FindLeader() {
 66 |     leader=""
 67 |     max_idx=-1
 68 |     max_term=""
 69 |  
 70 |     for ((i=0; i<${group_size}; ++i)); do
 71 |         srv=${servers[$i]}
 72 |         # look for the latest [T<term>] LEADER 
 73 |         cmd=( "ssh" "$USER@$srv" "grep -r \"] LEADER\"" "$PWD/srv${i}_$((rounds[$srv]-1)).log" )
 74 |         #echo ${cmd[@]}
 75 |         grep_out=$("${cmd[@]}")
 76 |         if [[ -z $grep_out ]]; then
 77 |             continue
 78 |         fi
 79 |         terms=($(echo $grep_out | awk '{print $2}'))
 80 |         for j in "${terms[@]}"; do
 81 |            term=`echo $j | awk -F'T' '{print $2}' | awk -F']' '{print $1}'`
 82 |            if [[ $term -gt $max_term ]]; then 
 83 |                 max_term=$term
 84 |                 leader=$srv
 85 |                 leader_idx=$i
 86 |            fi
 87 |         done
 88 |     done
 89 |     echo "Leader: p${leader_idx} ($leader)"
 90 | }
 91 | 
 92 | RemoveLeader() {
 93 |     FindLeader
 94 |     if [[ -z $leader ]]; then
 95 |         echo -e "\n\tNo leader [$leader]"
 96 |         return 1
 97 |     fi
 98 |     #echo ${!pids[@]}
 99 |     #echo ${pids[@]}
100 |     if [[ -z ${pids[$leader]} ]]; then
101 |         echo -e "\n\tNo PID for the leader $leader"
102 |         return 1
103 |     fi
104 |     cmd=( "ssh" "$USER@$leader" "kill -2" "${pids[$leader]}" )
105 |     $("${cmd[@]}")
106 |     unset pids[$leader]
107 |     echo -e "\tremoved p${leader_idx} ($leader)"
108 |     #echo -e "\n\tservers after removing the leader p${leader_idx} ($leader): ${!pids[@]}"
109 |     #echo -e "\t...and their PIDs: ${pids[@]}"
110 |     #echo ${cmd[@]}
111 |     maj=$(bc -l <<< "${group_size}/2.") # bc - An arbitrary precision calculator language
112 |     if [[ ${#pids[@]} < $maj ]]; then
113 |         ErrorAndExit "...not enough servers!"
114 |     fi
115 |     return 0
116 | }
117 | 
# Stop a server that is not the leader.
# Picks the first running server in the group other than the leader, sends
# it SIGINT and drops it from pids.  Aborts the script when fewer than half
# of group_size servers remain.
RemoveServer() {
    FindLeader
    for ((i=0; i<group_size; ++i)); do
        srv=${servers[$i]}
        if [[ "$srv" == "$leader" ]]; then
            continue
        fi
        if [[ -z "${pids[$srv]}" ]]; then
            continue
        fi
        cmd=( "ssh" "$USER@$srv" "kill -2" "${pids[$srv]}" )
        "${cmd[@]}"
        unset "pids[$srv]"
        echo -e "\tremoved p$i ($srv) -- p$leader_idx is the leader"
        break
    done
    # count < group_size/2, done in integer arithmetic: inside [[ ]] the
    # "<" operator compares strings, so "10" would sort before "2.5".
    if (( 2 * ${#pids[@]} < group_size )); then
        ErrorAndExit "...not enough servers!"
    fi
}
144 | 
# Start one more server with server_type=join.  When every slot of the
# group is already running, group_size is first increased by two.  The
# server chosen is the first one in the group without an entry in pids.
AddServer() {
    # the group is full
    [[ ${#pids[@]} == $group_size ]] && group_size=$((group_size+2))

    for ((i=0; i<group_size; ++i)); do
        srv=${servers[$i]}
        next=0
        # An entry in pids (even an empty one) marks the host as running.
        if [[ -n $srv && -n ${pids[$srv]+set} ]]; then
            next=1
        fi
        [[ $next == 1 ]] || break
    done

    # First start on this host: its log round begins at 1.
    if [[ -z "${rounds[$srv]}" ]]; then
        rounds[$srv]=1
    fi

    config_dare=(
        "server_type=join"
        "config_path=${DAREDIR}/target/nodes.local.cfg"
        "dare_log_file=$PWD/srv${i}_${rounds[$srv]}.log"
        "mgid=$DGID"
        "LD_PRELOAD=${DAREDIR}/target/interpose.so"
    )
    cmd=( ssh "$USER@$srv" "${config_dare[@]}" nohup "${run_dare}" "${redirection[@]}" "&" 'echo $!' )
    pids[$srv]="$("${cmd[@]}")"
    rounds[$srv]=$((rounds[$srv] + 1))
    echo -e "\tadded p$i ($srv)"
}
176 | 
# Port the replicated application listens on.
port=8888
# Run the benchmark client for $APP on the client node against the current
# leader; the output goes to clt_<round>.log on the client.
StartBenchmark() {
    if [[ "$APP" == "ssdb" ]]; then
        run_loop=( "${DAREDIR}/apps/ssdb/ssdb-master/tools/ssdb-bench" "$leader" "$port" "$request_count" "$client_count")
    elif [[ "$APP" == "redis" ]]; then
        run_loop=( "${DAREDIR}/apps/redis/install/bin/redis-benchmark" "-t set,get" "-h $leader" "-p $port" "-n $request_count" "-c $client_count")
    fi
    rounds[$client]=$((rounds[$client] + 1))
    cmd=( "ssh" "$USER@${client}" "${run_loop[@]}" ">" "clt_${rounds[$client]}.log")
    # Run the command directly; $(...) would execute its output as a command.
    "${cmd[@]}"
}
188 | 
# Defaults; DAREDIR assumes the script is run from the benchmarks directory.
DAREDIR=$PWD/..
APP=""
client_count=1
request_count=10000
# Parse --key=value options.  Parameter expansion strips the option prefix
# without the word-splitting and globbing of an unquoted "echo $arg | sed".
for arg in "$@"
do
    case ${arg} in
    --help|-help|-h)
        usage
        exit 1
        ;;
    --op=*)
        OPCODE=${arg#--op=}
        # NOTE(review): eval executes shell syntax embedded in the argument;
        # kept because it is used deliberately for tilde/variable expansion.
        OPCODE=$(eval echo ${OPCODE})    # tilde and variable expansion
        ;;
    --app=*)
        APP=${arg#--app=}
        APP=$(eval echo ${APP})    # tilde and variable expansion
        ;;
    esac
done
210 | 
# Map the application name to the server command line.  An unknown name is
# rejected here; otherwise run_dare would stay unset and the servers would
# be "started" with an empty command.
if [[ -z "$APP" ]]; then
    ErrorAndExit "No app defined: --app"
elif [[ "$APP" == "ssdb" ]]; then
    run_dare="${DAREDIR}/apps/ssdb/ssdb-master/ssdb-server ${DAREDIR}/apps/ssdb/ssdb-master/ssdb.conf"
elif [[ "$APP" == "redis" ]]; then
    run_dare="${DAREDIR}/apps/redis/install/bin/redis-server --port $port"
else
    ErrorAndExit "Unsupported app: $APP"
fi
218 | 
# list of allocated nodes, e.g., nodes=(n112002 n112001 n111902)
nodes=(10.22.1.3 10.22.1.4 10.22.1.5 10.22.1.6 10.22.1.7 10.22.1.8 10.22.1.9 202.45.128.159)
node_count=${#nodes[@]}

# Record the allocation in the file "nodes", one "index:host" per line.
{
    echo "Allocated ${node_count} nodes:"
    for ((i=0; i<node_count; ++i)); do
        echo "$i:${nodes[$i]}"
    done
} > nodes
group_size=5

# The second-to-last node runs the benchmark client.
client=${nodes[-2]}
echo ">>> client: ${client}"

# Every allocated node is a candidate server.
for ((i=0; i<node_count; ++i)); do
    servers[i]=${nodes[i]}
done
echo ">>> ${node_count} servers: ${servers[@]}"

DGID="ff0e::ffff:e101:101"

# Remove logs left over from a previous run.
rm -f *.log
240 | 
241 | ########################################################################
242 | 
# Stop all servers after a short pause and end the script (status 1).
Stop() {
    sleep 0.2
    StopDare
    exit 1
}
248 | 
# Start the initial group, wait, locate the leader and run one benchmark.
# Pass "stop" as $1 to shut everything down afterwards.
Start() {
    echo -e "Starting $group_size servers..."
    StartDare
    echo "done"

    sleep 2

    sleep 0.5
    FindLeader
    StartBenchmark

    if [[ $1 == "stop" ]]; then
        Stop
    fi
}
264 | 
# Kill the leader (retrying every 50 ms until one is found), time the
# lookup of the new leader and run one benchmark.
# Pass "stop" as $1 to shut everything down afterwards.
FailLeader() {
    echo -e "Removing the leader..."
    until RemoveLeader; do
        sleep 0.05
    done
    ret=0
    echo "done"

    sleep 1
    timer_start "Finding the leader..."
    FindLeader
    echo -e "\tp$leader_idx ($leader) is the leader"
    timer_stop

    StartBenchmark

    if [[ $1 == "stop" ]]; then
        Stop
    fi
}
290 | 
# Add a server back to the group and run one benchmark.
# Pass "stop" as $1 to shut everything down afterwards.
RecoverServer() {
    echo -e "Adding a server..."
    AddServer
    echo "done"

    sleep 0.5
    StartBenchmark

    if [[ $1 == "stop" ]]; then
        Stop
    fi
}
303 | 
# Add a server to grow the group and run one benchmark.
# Pass "stop" as $1 to shut everything down afterwards.
Upsize() {
    echo -e "Adding a server (upsize)..."
    AddServer
    echo "done"

    sleep 0.3
    StartBenchmark

    if [[ $1 == "stop" ]]; then
        Stop
    fi
}
316 | 
# Kill one non-leader server and run one benchmark.
# Pass "stop" as $1 to shut everything down afterwards.
FailServer() {
    echo -e "Removing a server (non-leader)..."
    RemoveServer
    echo "done"

    sleep 0.7
    StartBenchmark

    if [[ $1 == "stop" ]]; then
        Stop
    fi
}
329 | 
330 | ########################################################################
331 | 
332 | # Start DARE
333 | Start
334 | 
335 | # Upsize
336 | 
337 | # Upsize
338 | 
339 | # Remove the leader
340 | FailLeader
341 | 
342 | # Remove a server that is not the leader
343 | FailServer stop
344 | 


--------------------------------------------------------------------------------
/benchmarks/run.sh:
--------------------------------------------------------------------------------
  1 | define(){ IFS='\n' read -r -d '' ${1} || true; }
  2 | declare -A pids
  3 | redirection=( "> out" "2> err" "< /dev/null" )
  4 | 
  5 | define HELP <<'EOF'
  6 | Script for starting DARE
  7 | usage  : $0 [options]
  8 | options: --app                # app to run
  9 |          [--scount=INT]       # server count [default 3]
 10 |          [--ccount=INT]       # client count [default 1]
 11 |          [--rcount=INT]       # request count [default 10000]
 12 | EOF
 13 | 
 14 | usage () {
 15 |     echo -e "$HELP"
 16 | }
 17 | 
 18 | ErrorAndExit () {
 19 |   echo "ERROR: $1"
 20 |   exit 1
 21 | }
 22 | 
 23 | StartDare() {
 24 |     for ((i=0; i<$1; ++i));
 25 |     do
 26 |         config_dare=( "server_type=start" "server_idx=$i" "group_size=$1" "config_path=${DAREDIR}/target/nodes.local.cfg" "dare_log_file=$PWD/srv${i}.log" "mgid=$DGID" "LD_PRELOAD=${DAREDIR}/target/interpose.so" )
 27 |         cmd=( "ssh" "$USER@${servers[$i]}" "${config_dare[@]}" "nohup" "${run_dare}" "${redirection[@]}" "&" "echo \$!" )
 28 |         pids[${servers[$i]}]=$("${cmd[@]}")
 29 |         echo "StartDare COMMAND: "${cmd[@]}
 30 |     done
 31 |     echo -e "\n\tinitial servers: ${!pids[@]}"
 32 |     echo -e "\t...and their PIDs: ${pids[@]}"
 33 | }
 34 | 
 35 | StopDare() {
 36 |     for i in "${!pids[@]}"
 37 |     do
 38 |         cmd=( "ssh" "$USER@$i" "kill -2" "${pids[$i]}" )
 39 |         echo "Executing: ${cmd[@]}"
 40 |         $("${cmd[@]}")
 41 |     done
 42 | }
 43 | 
 44 | FindLeader() {
 45 |     leader=""
 46 |     max_idx=-1
 47 |     max_term=""
 48 |  
 49 |     for ((i=0; i<${server_count}; ++i)); do
 50 |         srv=${servers[$i]}
 51 |         # look for the latest [T<term>] LEADER 
 52 |         cmd=( "ssh" "$USER@$srv" "grep -r \"] LEADER\"" "$PWD/srv${i}.log" )
 53 |         #echo ${cmd[@]}
 54 |         grep_out=$("${cmd[@]}")
 55 |         if [[ -z $grep_out ]]; then
 56 |             continue
 57 |         fi
 58 |         terms=($(echo $grep_out | awk '{print $2}'))
 59 |         for j in "${terms[@]}"; do
 60 |            term=`echo $j | awk -F'T' '{print $2}' | awk -F']' '{print $1}'`
 61 |            if [[ $term -gt $max_term ]]; then 
 62 |                 max_term=$term
 63 |                 leader=$srv
 64 |                 leader_idx=$i
 65 |            fi
 66 |         done
 67 |     done
 68 |     echo "Leader: p${leader_idx} ($leader)"
 69 | }
 70 | 
 71 | port=8888
 72 | StartBenchmark() {
 73 |     if [[ "$APP" == "ssdb" ]]; then
 74 |         run_loop=( "${DAREDIR}/apps/ssdb/ssdb-master/tools/ssdb-bench" "$leader" "$port" "$request_count" "$client_count")
 75 |     elif [[ "$APP" == "redis" ]]; then
 76 |         run_loop=( "${DAREDIR}/apps/redis/install/bin/redis-benchmark" "-t set,get" "-h $leader" "-p $port" "-n $request_count" "-c $client_count")
 77 |     fi
 78 |     
 79 |     cmd=( "ssh" "$USER@${client}" "${run_loop[@]}" ">" "clt.log")
 80 |     $("${cmd[@]}")
 81 | }
 82 | 
 83 | DAREDIR=$PWD/..
 84 | run_dare=""
 85 | server_count=3
 86 | APP=""
 87 | client_count=1
 88 | request_count=10000
 89 | for arg in "$@"
 90 | do
 91 |     case ${arg} in
 92 |     --help|-help|-h)
 93 |         usage
 94 |         exit 1
 95 |         ;;
 96 |     --scount=*)
 97 |         server_count=`echo $arg | sed -e 's/--scount=//'`
 98 |         server_count=`eval echo ${server_count}`    # tilde and variable expansion
 99 |         ;;
100 |     --app=*)
101 |         APP=`echo $arg | sed -e 's/--app=//'`
102 |         APP=`eval echo ${APP}`    # tilde and variable expansion
103 |         ;;
104 |     --ccount=*)
105 |         client_count=`echo $arg | sed -e 's/--ccount=//'`
106 |         client_count=`eval echo ${client_count}`    # tilde and variable expansion
107 |         ;;
108 |     --rcount=*)
109 |         request_count=`echo $arg | sed -e 's/--rcount=//'`
110 |         request_count=`eval echo ${request_count}`    # tilde and variable expansion
111 |         ;;
112 |     esac
113 | done
114 | 
# Map the application name to the server command line.  An unknown name is
# rejected here; otherwise run_dare would stay empty and the servers would
# be "started" with an empty command.
if [[ -z "$APP" ]]; then
    ErrorAndExit "No app defined: --app"
elif [[ "$APP" == "ssdb" ]]; then
    run_dare="${DAREDIR}/apps/ssdb/ssdb-master/ssdb-server ${DAREDIR}/apps/ssdb/ssdb-master/ssdb.conf"
elif [[ "$APP" == "redis" ]]; then
    run_dare="${DAREDIR}/apps/redis/install/bin/redis-server --port $port"
elif [[ "$APP" == "memcached" ]]; then
    run_dare="${DAREDIR}/apps/memcached/install/bin/memcached -p $port"
else
    ErrorAndExit "Unsupported app: $APP"
fi
124 | 
125 | 
# list of allocated nodes, e.g., nodes=(n112002 n112001 n111902)
nodes=(10.22.1.3 10.22.1.4 10.22.1.5 10.22.1.6 10.22.1.7 10.22.1.8 10.22.1.9 202.45.128.159)
node_count=${#nodes[@]}
# Record the allocation in the file "nodes", one "index:host" per line.
echo "Allocated ${node_count} nodes:" > nodes
for ((i=0; i<${node_count}; ++i)); do
    echo "$i:${nodes[$i]}" >> nodes
done

# server_count comes from the command line: it must be a positive integer
# and cannot exceed the number of allocated nodes, otherwise servers[] would
# contain empty host names.
if ! [[ "$server_count" =~ ^[0-9]+$ ]] || [ "$server_count" -le 0 ]; then
    ErrorAndExit "0 < #servers; --scount"
fi
if [ "$server_count" -gt "$node_count" ]; then
    ErrorAndExit "#servers <= ${node_count}; --scount"
fi

# The second-to-last node runs the benchmark client.
client=${nodes[-2]}
echo ">>> client: ${client}"

for ((i=0; i<${server_count}; ++i)); do
    servers[${i}]=${nodes[$i]}
done
echo ">>> ${server_count} servers: ${servers[@]}"

DGID="ff0e::ffff:e101:101"
147 | 
148 | ########################################################################
149 | 
# Start the servers, give them time to elect a leader, run one benchmark
# against the leader and stop the servers again.
echo -e "Starting $server_count servers..."
StartDare "$server_count"
echo "done"

#note: wait for leader election
sleep 10
FindLeader
StartBenchmark

sleep 0.2
StopDare
161 | 
162 | ########################################################################
163 | 


--------------------------------------------------------------------------------
/eval/mongoose_aget.cfg:
--------------------------------------------------------------------------------
 1 | [mongoose /mongoose]
 2 | PROXY_MODE=WITH_PROXY
 3 | DEBUG_MODE=WITH_DEBUG
 4 | SERVER_COUNT=3
 5 | SERVER_INPUT=-p <port> -document_root $MSMR_ROOT/eval/current/server<port>
 6 | SERVER_KILL=killall mongoose
 7 | CLIENT_COUNT=100
 8 | CLIENT_PROGRAM=$MSMR_ROOT/libevent_paxos/client-ld-preload/Mongoose_Aget/aget
 9 | CLIENT_INPUT=-f -n2 -p 9000 http://localhost/README.md
10 | TEST_FILE=$MSMR_ROOT/README.md
11 | 


--------------------------------------------------------------------------------
/eval/readme.txt:
--------------------------------------------------------------------------------
This is the evaluation framework of the m-smr system.
2 | Run:
3 | 	./eval.py *.cfg
4 | 


--------------------------------------------------------------------------------
/makefile.init:
--------------------------------------------------------------------------------
1 | ROOT_DIR := $(shell pwd)
2 | DEBUGOPT := 0
3 | 


--------------------------------------------------------------------------------
/src/config-comp/config-dare.c:
--------------------------------------------------------------------------------
 1 | #include "../include/util/common-header.h"
 2 | #include "../include/dare/dare_server.h"
 3 | #include <libconfig.h>
 4 | 
 5 | double hb_period;
 6 | uint64_t elec_timeout_low;
 7 | uint64_t elec_timeout_high;
 8 | double rc_info_period;
 9 | double retransmit_period;
10 | double log_pruning_period;
11 | 
12 | int dare_read_config(const char* config_path){
13 |     config_t config_file;
14 |     config_init(&config_file);
15 | 
16 |     if(!config_read_file(&config_file,config_path)){
17 |         goto goto_config_error;
18 |     }
19 | 
20 |     config_setting_t *dare_global_config = NULL;
21 |     dare_global_config = config_lookup(&config_file,"dare_global_config");
22 | 
23 |     if(NULL!=dare_global_config){
24 |         double temp_float;
25 |         if(config_setting_lookup_float(dare_global_config,"hb_period",&temp_float)){
26 |             hb_period = temp_float;
27 |         }
28 |         if(config_setting_lookup_float(dare_global_config,"rc_info_period",&temp_float)){
29 |             rc_info_period = temp_float;
30 |         }
31 |         if(config_setting_lookup_float(dare_global_config,"retransmit_period",&temp_float)){
32 |             retransmit_period = temp_float;
33 |         }
34 |         if(config_setting_lookup_float(dare_global_config,"log_pruning_period",&temp_float)){
35 |             log_pruning_period = temp_float;
36 |         }
37 |         long long temp_int64;
38 |         if(config_setting_lookup_int64(dare_global_config,"elec_timeout_low",&temp_int64)){
39 |             elec_timeout_low = temp_int64;
40 |         }
41 |         if(config_setting_lookup_int64(dare_global_config,"elec_timeout_high",&temp_int64)){
42 |             elec_timeout_high = temp_int64;
43 |         }
44 |     }
45 | 
46 |     config_destroy(&config_file);
47 |     return 0;
48 | 
49 | goto_config_error:
50 |     err_log("%s:%d - %s\n", config_error_file(&config_file),
51 |             config_error_line(&config_file), config_error_text(&config_file));
52 |     config_destroy(&config_file);
53 |     return -1;
54 | }
55 | 


--------------------------------------------------------------------------------
/src/config-comp/config-proxy.c:
--------------------------------------------------------------------------------
 1 | #include "../include/util/common-header.h"
 2 | #include "../include/proxy/proxy.h"
 3 | #include <libconfig.h>
 4 | 
 5 | 
 6 | int proxy_read_config(struct proxy_node_t* cur_node,const char* config_path){
 7 |     config_t config_file;
 8 |     config_init(&config_file);
 9 | 
10 |     if(!config_read_file(&config_file,config_path)){
11 |         goto goto_config_error;
12 |     }
13 | 
14 |     config_lookup_int(&config_file,"req_log",&cur_node->req_log);
15 | 
16 |     const char* db_name;
17 |     if(!config_lookup_string(&config_file,"db_name",&db_name)){
18 |         goto goto_config_error;
19 |     }
20 |     size_t db_name_len = strlen(db_name);
21 |     cur_node->db_name = (char*)malloc(sizeof(char)*(db_name_len+1));
22 |     if(cur_node->db_name==NULL){
23 |         goto goto_config_error;
24 |     }
25 |     if(NULL==strncpy(cur_node->db_name,db_name,db_name_len)){
26 |         free(cur_node->db_name);
27 |         goto goto_config_error;
28 |     }
29 |     cur_node->db_name[db_name_len] = '\0';
30 | 
31 | 
32 |     const char* peer_ipaddr=NULL;
33 |     int peer_port=-1;
34 |     if(!config_lookup_string(&config_file,"ip_address",&peer_ipaddr)){
35 |         goto goto_config_error;
36 |     }
37 |     if(!config_lookup_int(&config_file,"port",&peer_port)){
38 |         goto goto_config_error;
39 |     }
40 | 
41 |     cur_node->sys_addr.s_addr.sin_port = htons(peer_port);
42 |     cur_node->sys_addr.s_addr.sin_family = AF_INET;
43 |     inet_pton(AF_INET,peer_ipaddr,&cur_node->sys_addr.s_addr.sin_addr);
44 | 
45 |     cur_node->sys_addr.s_sock_len = sizeof(cur_node->sys_addr.s_addr);
46 | 
47 | 
48 |     config_destroy(&config_file);
49 |     return 0;
50 | 
51 | goto_config_error:
52 |     err_log("%s:%d - %s\n", config_error_file(&config_file),
53 |             config_error_line(&config_file), config_error_text(&config_file));
54 |     config_destroy(&config_file);
55 |     return -1;
56 | }
57 | 


--------------------------------------------------------------------------------
/src/dare/dare_ep_db.c:
--------------------------------------------------------------------------------
  1 | /**                                                                                                      
  2 |  * DARE (Direct Access REplication)
  3 |  * 
  4 |  * Endpoint database
  5 |  *
  6 |  * Copyright (c) 2014-2015 ETH-Zurich. All rights reserved.
  7 |  * 
  8 |  * Author(s): Marius Poke <marius.poke@inf.ethz.ch>
  9 |  * 
 10 |  */
 11 |  
 12 | #include <stdlib.h>
 13 | 
 14 | #include "../include/dare/debug.h"
 15 | #include "../include/dare/dare_ibv_ud.h"
 16 | #include "../include/dare/dare_ibv_rc.h"
 17 | 
 18 | #include "../include/dare/dare_ep_db.h"
 19 | 
 20 | /* ================================================================== */
 21 | 
/* Releases one endpoint: destroys its address handle and frees the
 * structure itself (defined at the end of this file). */
static void 
free_ep(dare_ep_t *ep);
 24 | 
 25 | /* ================================================================== */
 26 | 
 27 | dare_ep_t* ep_search( struct rb_root *root, const uint16_t lid )
 28 | {
 29 |     struct rb_node *node = root->rb_node;
 30 | 
 31 |     while (node) 
 32 |     {
 33 |         dare_ep_t *ep = container_of(node, dare_ep_t, node);
 34 | 
 35 |         if (lid < ep->ud_ep.lid)
 36 |             node = node->rb_left;
 37 |         else if (lid > ep->ud_ep.lid)
 38 |             node = node->rb_right;
 39 |         else
 40 |             return ep;
 41 |     }
 42 |     return NULL;
 43 | }
 44 | 
 45 | dare_ep_t* ep_insert( struct rb_root *root, const uint16_t lid, const union ibv_gid dest_gid )
 46 | {
 47 |     dare_ep_t *ep;
 48 |     struct rb_node **new = &(root->rb_node), *parent = NULL;
 49 |     
 50 |     while (*new) 
 51 |     {
 52 |         dare_ep_t *this = container_of(*new, dare_ep_t, node);
 53 |         
 54 |         parent = *new;
 55 |         if (lid < this->ud_ep.lid)
 56 |             new = &((*new)->rb_left);
 57 |         else if (lid > this->ud_ep.lid)
 58 |             new = &((*new)->rb_right);
 59 |         else
 60 |             return NULL;
 61 |     }
 62 |     
 63 |     /* Create new rr */
 64 |     ep = (dare_ep_t*)malloc(sizeof(dare_ep_t));
 65 |     ep->ud_ep.lid = lid;
 66 |     ep->ud_ep.gid = dest_gid;
 67 |     ep->last_req_id = 0;
 68 |     ep->cid_idx = 0;
 69 |     ep->committed = 0;
 70 |     ep->wait_for_idx = 0;
 71 |     
 72 |     /* Create AH */
 73 |     ep->ud_ep.ah = ud_ah_create(lid, dest_gid);
 74 |     
 75 | 
 76 |     /* Add new node and rebalance tree. */
 77 |     rb_link_node(&ep->node, parent, new);
 78 |     rb_insert_color(&ep->node, root);
 79 | 
 80 |     return ep;
 81 | }
 82 | 
 83 | void ep_erase( struct rb_root *root, const uint16_t lid )
 84 | {
 85 |     dare_ep_t *ep = ep_search(root, lid);
 86 | 
 87 |     if (ep) 
 88 |     {
 89 |         rb_erase(&ep->node, root);
 90 |         free_ep(ep);
 91 |     }
 92 | }
 93 | 
 94 | void ep_db_print( struct rb_root *root )
 95 | {
 96 |     struct rb_node *node;
 97 |     dare_ep_t *ep;
 98 |     
 99 |     for (node = rb_first(root); node; node = rb_next(node)) 
100 |     {
101 |         ep = rb_entry(node, dare_ep_t, node);
102 |         info(log_fp, "[%"PRIu16": qpn=%"PRIu32"] ", 
103 |             ep->ud_ep.lid, ep->ud_ep.qpn);
104 |     }
105 | }
106 | 
107 | void ep_db_free( struct rb_root *root )
108 | {
109 |     struct rb_node *node;
110 |     dare_ep_t *ep;
111 |     
112 |     for (node = rb_first_postorder(root); node;) 
113 |     {
114 |         ep = rb_entry(node, dare_ep_t, node);
115 |         node = rb_next_postorder(node);
116 |         free_ep(ep);
117 |     }
118 | }
119 | 
120 | void ep_dp_reset_wait_idx( struct rb_root *root )
121 | {
122 |     struct rb_node *node;
123 |     dare_ep_t *ep;
124 |     
125 |     for (node = rb_first(root); node; node = rb_next(node)) 
126 |     {
127 |         ep = rb_entry(node, dare_ep_t, node);
128 |         ep->wait_for_idx = 0;
129 |     }
130 | }
131 | 
132 | void ep_dp_reply_read_req( struct rb_root *root, uint64_t idx )
133 | {
134 |     int rc;
135 |     struct rb_node *node;
136 |     dare_ep_t *ep;
137 |     int verify_leadership = 0;
138 |     int leader = 0;
139 |     
140 |     for (node = rb_first(root); node; node = rb_next(node)) 
141 |     {
142 |         ep = rb_entry(node, dare_ep_t, node);
143 |         if (!ep->wait_for_idx) continue;
144 |         if (!verify_leadership) {
145 |             /* Verify leadership */
146 |             rc = rc_verify_leadership(&leader);
147 |             if (0 != rc) {
148 |                 error(log_fp, "Cannot verify leadership\n");
149 |             }
150 |             if (0 == leader) {
151 |                 /* No longer the leader; reset the wait idx */
152 |                 ep_dp_reset_wait_idx(root);
153 |                 return;
154 |             }
155 |             verify_leadership = 1;
156 |         }
157 |         if (ep->wait_for_idx < idx) {
158 |             ud_clt_answer_read_request(ep);
159 |         }
160 |     }
161 | }
162 | 
163 | /* ================================================================== */
164 | 
165 | static void 
166 | free_ep(dare_ep_t *ep)
167 | {
168 |     ud_ah_destroy(ep->ud_ep.ah);
169 |     free(ep);
170 | }
171 | 


--------------------------------------------------------------------------------
/src/dare/dare_kvs_sm.c:
--------------------------------------------------------------------------------
  1 | /**                                                                                                      
  2 |  * DARE (Direct Access REplication)
  3 |  * 
  4 |  * State machine implementation (KVS)
  5 |  *
  6 |  * Copyright (c) 2014-2015 ETH-Zurich. All rights reserved.
  7 |  * 
  8 |  * Author(s): Marius Poke <marius.poke@inf.ethz.ch>
  9 |  * 
 10 |  */
 11 | 
 12 | #include <stdlib.h>
 13 | #include <string.h>
 14 | 
 15 | #include "../include/dare/dare_sm.h"
 16 | #include "../include/dare/dare_kvs_sm.h"
 17 | #include "../include/dare/dare.h"
 18 | 
/* KVS size in bytes: for every stored key it accounts for
 * sizeof(kvs_snapshot_entry_t) plus the length of the value;
 * maintained by write_key() and remove_key(). */
uint32_t kvs_size; // kvs size in bytes
 20 | 
 21 | struct kvs_list_t {
 22 |     kvs_entry_t entry;
 23 |     struct kvs_list_t *next;
 24 | };
 25 | typedef struct kvs_list_t kvs_list_t;
 26 | 
 27 | struct kvs_table_t {
 28 |     uint32_t size;
 29 |     kvs_list_t **table;
 30 | };
 31 | typedef struct kvs_table_t kvs_table_t;
 32 | 
 33 | struct dare_kvs_sm_t {
 34 |     dare_sm_t   sm;
 35 |     kvs_table_t kvs_table;
 36 | };
 37 | typedef struct dare_kvs_sm_t dare_kvs_sm_t;
 38 | 
 39 | struct kvs_snapshot_entry_t {
 40 |     uint16_t   len;
 41 |     char       key[KEY_SIZE];
 42 |     uint8_t    value[0];
 43 | };
 44 | typedef struct kvs_snapshot_entry_t kvs_snapshot_entry_t;
 45 | 
 46 | /* ================================================================== */
 47 | 
/* Allocates and zeroes the bucket array of a table whose size is already set */
static int
create_kvs_table( kvs_table_t* kvs_table );
/* SM method: frees all entries, the bucket array and the SM itself */
static void 
destroy_kvs_sm( dare_sm_t* sm );
/* SM method: applies a PUT / GET / RM command to the KVS */
static int 
apply_kvs_cmd( dare_sm_t *sm, sm_cmd_t *cmd, sm_data_t *data );

/* Maps a NUL-terminated key to a bucket index in [0, kvs_table->size) */
static uint32_t 
hash( kvs_table_t *kvs_table, char *key );
/* Returns the list node holding the key, or NULL when the key is absent */
static kvs_list_t* 
lookup_key( kvs_table_t *kvs_table, char *key );
/* Unlinks and frees the node holding the key, if any */
static void 
remove_key( kvs_table_t *kvs_table, char *key );
/* Inserts the key or overwrites its value; returns 0 on success */
static int 
write_key( kvs_table_t *kvs_table, char *key, kvs_blob_t *blob );
 63 | 
 64 | /* ================================================================== */
 65 | /* Create KVS */
 66 | 
 67 | dare_sm_t* create_kvs_sm( uint32_t size )
 68 | {
 69 | 
 70 |     int rc;
 71 |     dare_kvs_sm_t *kvs_sm;
 72 |     
 73 |     if (0 == size) {
 74 |         size = DEFAULT_KVS_SIZE;
 75 |     }
 76 |     
 77 |     /* Allocate new KVS SM */
 78 |     kvs_sm = (dare_kvs_sm_t*)malloc(sizeof(dare_kvs_sm_t));
 79 |     if (NULL == kvs_sm) {
 80 |         error(log_fp, "Cannot allocate KVS SM\n");
 81 |         return NULL;
 82 |     }
 83 |     
 84 |     /* Initiate KVS table */
 85 |     kvs_sm->kvs_table.size = size;
 86 |     rc = create_kvs_table(&kvs_sm->kvs_table);
 87 |     if (0 != rc) {
 88 |         free(kvs_sm);
 89 |         kvs_sm = NULL;
 90 |         error(log_fp, "Cannot allocate KVS SM\n");
 91 |         return NULL;
 92 |     }
 93 | 
 94 |     dare_sm_t sm = {
 95 |         .destroy   = destroy_kvs_sm,
 96 |         .apply_cmd = apply_kvs_cmd,
 97 |     };
 98 | 
 99 |     memcpy(&kvs_sm->sm, &sm, sizeof(dare_sm_t));
100 |     
101 |    // kvs_sm->sm.destroy = destroy_kvs_sm;
102 |    // kvs_sm->sm.apply_cmd = apply_kvs_cmd;
103 |     
104 |     return &(kvs_sm->sm);
105 | }
106 | 
107 | static int
108 | create_kvs_table( kvs_table_t* kvs_table )
109 | {    
110 |     kvs_table->table = (kvs_list_t**)
111 |         malloc(sizeof(kvs_list_t*) * kvs_table->size);
112 |     if (NULL == kvs_table->table) {
113 |         error_return(1, log_fp, "Cannot allocate KVS table\n");
114 |     }
115 |     memset(kvs_table->table, 0, sizeof(kvs_list_t*) * kvs_table->size);
116 |     
117 |     return 0;
118 | }
119 | 
120 | /* ================================================================== */
121 | /* SM methods */
122 | 
123 | static void 
124 | destroy_kvs_sm( dare_sm_t* sm )
125 | {
126 |     uint32_t i;
127 |     dare_kvs_sm_t *kvs_sm = (dare_kvs_sm_t*)sm;
128 |     kvs_list_t *list, *tmp;
129 | 
130 |     if (NULL == kvs_sm) {
131 |         return;
132 |     }
133 |     if (NULL == kvs_sm->kvs_table.table) {
134 |         free(kvs_sm);
135 |         kvs_sm = NULL;
136 |         return;
137 |     }
138 |     for (i = 0; i < kvs_sm->kvs_table.size; i++) {
139 |         list = kvs_sm->kvs_table.table[i];
140 |         while (NULL != list) {
141 |             tmp = list;
142 |             list = list->next;
143 |             if (NULL != tmp->entry.blob.data) {
144 |                 free(tmp->entry.blob.data);
145 |                 tmp->entry.blob.data = NULL;
146 |             }
147 |             free(tmp);
148 |         }
149 |     }
150 | 
151 |     free(kvs_sm->kvs_table.table);
152 |     kvs_sm->kvs_table.table = NULL;
153 |     free(kvs_sm);
154 |     kvs_sm = NULL;
155 | }
156 | 
157 | static int 
158 | apply_kvs_cmd( dare_sm_t* sm, sm_cmd_t *cmd, sm_data_t *data )
159 | {
160 |     int rc;
161 |     kvs_blob_t blob;
162 |     kvs_list_t* list;
163 |     dare_kvs_sm_t *kvs_sm = (dare_kvs_sm_t*)sm;
164 |     if (NULL == kvs_sm) {
165 |         error_return(1, log_fp, "SM is NULL\n");
166 |     }
167 | 
168 |     kvs_cmd_t *kvs_cmd = (kvs_cmd_t*)cmd->cmd;
169 |     if (NULL == kvs_cmd) {
170 |         error_return(1, log_fp, "Command is NULL\n");
171 |     }
172 |     //debug(log_fp, "KVS type %"PRIu8"\n", kvs_cmd->type);
173 |     switch (kvs_cmd->type) {
174 |         case KVS_PUT:
175 |             //debug(log_fp, "PUT key = %s\n", kvs_cmd->key);
176 |             blob.len = kvs_cmd->len;
177 |             blob.data = kvs_cmd->data;
178 |             rc = write_key(&kvs_sm->kvs_table, kvs_cmd->key, &blob);
179 |             if (0 != rc) {               
180 |                 error_return(1, log_fp, "Cannot apply PUT operation\n");
181 |             }
182 |             break;
183 |         case KVS_GET:
184 |             //debug(log_fp, "GET key = %s\n", kvs_cmd->key);
185 |             list = lookup_key(&kvs_sm->kvs_table, kvs_cmd->key);
186 |             if (NULL == list) {
187 |                 data->len = 0;
188 |             }
189 |             else {
190 |                 data->len = list->entry.blob.len;
191 |                 memcpy(data->data, list->entry.blob.data, data->len);
192 |             }
193 |             break;
194 |         case KVS_RM:
195 |             remove_key(&kvs_sm->kvs_table, kvs_cmd->key);
196 |             break;
197 |         default:
198 |             error_return(1, log_fp, "Unknown KVS command\n");
199 |     }
200 |     
201 |     return 0;
202 | }
203 | 
204 | /* ================================================================== */
205 | 
206 | /**
207 |  * Simple hash function
208 |  */
209 | static uint32_t 
210 | hash( kvs_table_t *kvs_table, char *key )
211 | {
212 |     uint32_t hashval;
213 |     
214 |     hashval = 0;
215 |     for(; *key != '\0'; key++) {
216 |         hashval = *key + (hashval << 5) - hashval;
217 |     }
218 |     return hashval % kvs_table->size;
219 | }
220 | 
221 | static kvs_list_t* 
222 | lookup_key( kvs_table_t *kvs_table, char *key )
223 | {
224 |     kvs_list_t *list;
225 |     uint32_t hashval = hash(kvs_table, key);
226 | 
227 |     for(list = kvs_table->table[hashval]; list != NULL; list = list->next) {
228 |         if (strcmp(key, list->entry.key) == 0) {
229 |             return list;
230 |         }
231 |     }
232 |     return NULL;
233 | }
234 | 
235 | static void 
236 | remove_key( kvs_table_t *kvs_table, char *key )
237 | {
238 |     kvs_list_t *list, *prev;
239 |     uint32_t hashval = hash(kvs_table, key);
240 |     
241 |     list = kvs_table->table[hashval];
242 |     if (list == NULL) return;
243 |     if (strcmp(key, list->entry.key) == 0) {
244 |         kvs_table->table[hashval] = list->next;
245 |         /* Update KVS size */
246 |         kvs_size = kvs_size - sizeof(kvs_snapshot_entry_t) 
247 |                     - list->entry.blob.len;
248 |         if (NULL != list->entry.blob.data) {
249 |             free(list->entry.blob.data);
250 |             list->entry.blob.data = NULL;
251 |         }
252 |         free(list);
253 |         return;
254 |     }
255 |     prev = list;
256 |     for(list = list->next; list != NULL; list = list->next) {
257 |         if (strcmp(key, list->entry.key) == 0) {
258 |             prev->next = list->next;
259 |             /* Update KVS size */
260 |             kvs_size = kvs_size - sizeof(kvs_snapshot_entry_t) 
261 |                         - list->entry.blob.len;
262 |             if (NULL != list->entry.blob.data) {
263 |                 free(list->entry.blob.data);
264 |                 list->entry.blob.data = NULL;
265 |             }
266 |             free(list);
267 |             return;
268 |         }
269 |         prev = list;
270 |     }
271 | }
272 | 
273 | static int 
274 | write_key( kvs_table_t *kvs_table, char *key, kvs_blob_t *blob )
275 | {
276 |     /* Search for list entry with this key */
277 |     kvs_list_t *list = lookup_key(kvs_table, key);
278 |     if (NULL != list) {
279 |         /* Key already exists - overwrite */
280 |         if (list->entry.blob.len != blob->len) {
281 |             /* Update KVS size */
282 |             kvs_size += blob->len - list->entry.blob.len;
283 |             /* Resize blob */
284 |             list->entry.blob.len = blob->len;
285 |             /* Reallocate memory for the value */
286 |             list->entry.blob.data = realloc(list->entry.blob.data, blob->len);
287 |             if (NULL == list->entry.blob.data) {
288 |                 error_return(1, log_fp, "Cannot allocate new KVS blob\n");
289 |             }
290 |         }
291 |         memcpy(list->entry.blob.data, blob->data, blob->len);
292 |         return 0;
293 |     }
294 |     
295 |     /* Insert new key */
296 |     unsigned int hashval = hash(kvs_table, key);
297 |     list = (kvs_list_t*)malloc(sizeof(kvs_list_t));
298 |     if (NULL == list) {
299 |         error_return(1, log_fp, "Cannot allocate new KVS list\n");
300 |     }
301 |     memcpy(&list->entry.key, key, KEY_SIZE);
302 |     list->entry.blob.len = blob->len;
303 |     /* Update KVS size */
304 |     kvs_size += sizeof(kvs_snapshot_entry_t) + blob->len;
305 |     /* Allocate memory for the value */
306 |     list->entry.blob.data = malloc(blob->len);
307 |     if (NULL == list->entry.blob.data) {
308 |         error_return(1, log_fp, "Cannot allocate new KVS blob\n");
309 |     }
310 |     memcpy(list->entry.blob.data, blob->data, blob->len);
311 |     list->next = kvs_table->table[hashval];
312 |     kvs_table->table[hashval] = list;
313 | 
314 |     return 0;
315 | }
316 | 


--------------------------------------------------------------------------------
/src/db/db-interface.c:
--------------------------------------------------------------------------------
  1 | #include <stdlib.h>
  2 | #include <sys/time.h>
  3 | #include <sys/stat.h>
  4 | #include <string.h>
  5 | #include <errno.h>
  6 | #include <db.h>
  7 | #include "../include/db/db-interface.h"
  8 | #include "../include/util/debug.h"
  9 | 
/* Database directory name; not referenced in the functions of this file. */
const char* db_dir="./.db";

/* Page size (32 KiB) handed to DB->set_pagesize in initialize_db(). */
u_int32_t pagesize = 32 * 1024;
/* Cache size (32 MiB) handed to DB->set_cachesize in initialize_db(). */
u_int cachesize = 32 * 1024 * 1024;
 14 | 
/* Wrapper around a Berkeley DB handle; allocated by initialize_db() and
 * released by close_db(). */
struct db_t{
    DB* bdb_ptr;    /* underlying Berkeley DB handle */
};
 18 | 
/* Sum of the data_size values passed to store_record(); reported by
 * get_records_len(). */
uint32_t records_len;
 20 | 
 21 | db* initialize_db(const char* db_name,uint32_t flag){
 22 |     db* db_ptr=NULL;
 23 |     DB* b_db;
 24 |     int ret;
 25 |     /* Initialize the DB handle */
 26 |     if((ret = db_create(&b_db,NULL,flag))!=0){
 27 |         err_log("DB : %s.\n",db_strerror(ret));
 28 |         goto db_init_return;
 29 |     }
 30 |     
 31 |     if((ret = b_db->set_pagesize(b_db,pagesize))!=0){
 32 |         goto db_init_return;
 33 |     }
 34 |     if((ret = b_db->set_cachesize(b_db, 0, cachesize, 1))!=0){
 35 |         goto db_init_return;
 36 |     }
 37 | 
 38 |     if((ret = b_db->open(b_db,NULL,db_name,NULL,DB_RECNO,DB_THREAD|DB_CREATE,0))!=0){
 39 |         //b_db->err(b_db,ret,"%s","test.db");
 40 |         goto db_init_return;
 41 |     }
 42 |     db_ptr = (db*)(malloc(sizeof(db)));
 43 |     db_ptr->bdb_ptr = b_db;
 44 | 
 45 | db_init_return:
 46 |     if(db_ptr!=NULL){
 47 |         //debug_log("DB Initialization Finished\n");
 48 |         ;
 49 |     }
 50 |     return db_ptr;
 51 | }
 52 | 
 53 | void close_db(db* db_p,uint32_t mode){
 54 |     if(db_p!=NULL){
 55 |         if(db_p->bdb_ptr!=NULL){
 56 |             db_p->bdb_ptr->close(db_p->bdb_ptr,mode);
 57 |             db_p->bdb_ptr=NULL;
 58 |         }
 59 |         free(db_p);
 60 |         db_p = NULL;
 61 |     }
 62 |     return;
 63 | }
 64 | 
 65 | int store_record(db* db_p,size_t data_size,void* data){
 66 |     int ret = 1;
 67 |     if((NULL==db_p)||(NULL==db_p->bdb_ptr)){
 68 |         if(db_p == NULL){
 69 |           err_log("DB store_record : db_p is null.\n");
 70 |         } else{
 71 |           err_log("DB store_recor : db_p->bdb_ptr is null.\n");
 72 |         }
 73 |         goto db_store_return;
 74 |     }
 75 |     DB* b_db = db_p->bdb_ptr;
 76 |     DBT key,db_data;
 77 |     memset(&db_data,0,sizeof(db_data));
 78 |     db_data.data = data;
 79 |     db_data.size = data_size;
 80 | 
 81 |     records_len += data_size;
 82 | 
 83 |     memset(&key,0,sizeof(key));
 84 |     key.flags = DB_DBT_MALLOC;
 85 |     if ((ret=b_db->put(b_db,NULL,&key,&db_data,DB_AUTO_COMMIT|DB_APPEND))==0){
 86 |         //debug_log("db : %ld record stored. \n",*(uint64_t*)key_data);
 87 |         //b_db->sync(b_db,0);
 88 |     }
 89 |     else{
 90 |         err_log("DB : %s.\n",db_strerror(ret));
 91 |         //debug_log("db : can not save record %ld from database.\n",*(uint64_t*)key_data);
 92 |         //b_db->err(b_db,ret,"DB->Put");
 93 |     }
 94 | db_store_return:
 95 |     return ret;
 96 | }
 97 | 
 98 | void dump_records(db* db_p, void* buf){
 99 |     DB* b_db = db_p->bdb_ptr;
100 |     DBT key, data;
101 |     DBC *dbcp;
102 |     int ret;
103 | 
104 |     uint32_t len = 0;
105 | 
106 |     /* Acquire a cursor for the database. */
107 |     if ((ret = b_db->cursor(b_db, NULL, &dbcp, 0)) != 0) {
108 |         b_db->err(b_db, ret, "DB->cursor");
109 |     }
110 | 
111 |     /* Re-initialize the key/data pair. */
112 |     memset(&key, 0, sizeof(key));
113 |     memset(&data, 0, sizeof(data));
114 | 
115 |     /* Walk through the database and print out the key/data pairs. */
116 |     while ((ret = dbcp->c_get(dbcp, &key, &data, DB_NEXT)) == 0) {
117 |         //debug_log("%lu : %.*s\n", *(u_long *)key.data, (int)data.size, (char *)data.data);
118 |         memcpy((char*)buf+len, data.data, data.size);
119 |         len += data.size;
120 |     }
121 |     if (ret != DB_NOTFOUND)
122 |         b_db->err(b_db, ret, "DBcursor->get");
123 | 
124 |     /* Close the cursor. */
125 |     if ((ret = dbcp->c_close(dbcp)) != 0) {
126 |         b_db->err(b_db, ret, "DBcursor->close");
127 |     }
128 | }
129 | 
130 | 
131 | uint32_t get_records_len()
132 | {
133 |     return records_len;
134 | }


--------------------------------------------------------------------------------
/src/include/config-comp/config-dare.h:
--------------------------------------------------------------------------------
1 | #ifndef CONFIG_DARE_H
2 | #define CONFIG_DARE_H
3 | 
/* Reads the DARE configuration from the file at config_path.
 * NOTE(review): presumably returns 0 on success and non-zero on error --
 * confirm against config-dare.c. */
int dare_read_config(const char* config_path);
5 | 
6 | #endif
7 | 


--------------------------------------------------------------------------------
/src/include/config-comp/config-proxy.h:
--------------------------------------------------------------------------------
1 | #ifndef CONFIG_PROXY_H
2 | #define CONFIG_PROXY_H
3 | 
/* Forward declaration: only a pointer to the node is needed here. */
struct proxy_node_t;

/* Reads the proxy configuration from the file at config_path into cur_node
 * (among others the "db_name", "ip_address" and "port" settings).
 * Returns 0 on success and -1 after logging the error on failure. */
int proxy_read_config(struct proxy_node_t* cur_node,const char* config_path);
7 | 
8 | #endif
9 | 


--------------------------------------------------------------------------------
/src/include/dare/dare.h:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * DARE (Direct Access REplication)
 3 |  *
 4 |  * General header file
 5 |  *
 6 |  * Copyright (c) 2014-2015 ETH-Zurich. All rights reserved.
 7 |  * 
 8 |  * Author(s): Marius Poke <marius.poke@inf.ethz.ch>
 9 |  * 
10 |  */
11 |  
12 | #include "./debug.h"
13 | 
14 | #ifndef DARE_H_
15 | #define DARE_H_
16 | 
17 | /* SM types */
18 | #define CLT_NULL 1
19 | #define CLT_KVS  2
20 | #define CLT_FS   3
21 | 
22 | /* For immediate event scheduling */
23 | #define NOW 0.000000001
24 | 
25 | #define MAX_CLIENT_COUNT 64
26 | #define MAX_SERVER_COUNT 13
27 | 
28 | #define PAGE_SIZE 4096
29 | 
30 | 
31 | /**
32 |  *  UD message types 
33 |  */
34 | #define MSG_NONE 0
35 | #define MSG_ERROR 13
36 | /* Initialization messages */
37 | #define RC_SYN      1
38 | #define RC_SYNACK   2
39 | #define RC_ACK      3
40 | /* Client SM messages */
41 | #define CSM_READ    201
42 | #define CSM_WRITE   202
43 | #define CSM_REPLY   203
44 | /* Config messages */
45 | #define JOIN        211
46 | #define DOWNSIZE    213
47 | #define CFG_REPLY   214
48 | /* LOGGP messages */
49 | #define LOGGP_UD    55
50 | 
51 | #endif /* DARE_H_ */
52 | 


--------------------------------------------------------------------------------
/src/include/dare/dare_client.h:
--------------------------------------------------------------------------------
 1 | /**                                                                                                      
 2 |  * DARE (Direct Access REplication)
 3 |  * 
 4 |  * Client implementation
 5 |  *
 6 |  * Copyright (c) 2014-2015 ETH-Zurich. All rights reserved.
 7 |  * 
 8 |  * Author(s): Marius Poke <marius.poke@inf.ethz.ch>
 9 |  * 
10 |  */
11 |  
12 | #ifndef DARE_CLIENT_H
13 | #define DARE_CLIENT_H 
14 | 
15 | #include <stdio.h>
16 | #include <ev.h>
17 | #include "../../../utils/rbtree/include/rbtree.h"
18 | #include "./dare.h"
19 | #include "./dare_sm.h"
20 | #include "./timer.h"
21 | 
22 | /* Retransmission period in ms */ 
23 | #ifdef DEBUG
24 | #define CLT_RETRANS_PERIOD 500
25 | #define CLT_OUTPUT_PERIOD 100
26 | #else 
27 | #define CLT_RETRANS_PERIOD 20
28 | #define CLT_OUTPUT_PERIOD 10
29 | #endif 
30 | 
31 | /* Client types */
32 | #define CLT_TYPE_RECONF 1
33 | #define CLT_TYPE_LOOP   2
34 | #define CLT_TYPE_TRACE  3
35 | #define CLT_TYPE_RTRACE  4
36 | 
37 | #define MAX_LINE_LENGTH 128
38 | 
/* Input parameters handed to dare_client_init(). */
typedef struct dare_client_input_t {
    FILE* log;
    char* trace;
    char* output;
    uint8_t clt_type;       /* one of the CLT_TYPE_* values, presumably */
    uint8_t sm_type;        /* one of the CLT_* SM types, presumably */
    uint8_t first_op_perc;
    uint8_t group_size;
} dare_client_input_t;
49 | 
50 | struct dare_client_data_t {
51 |     dare_client_input_t *input;
52 |     struct ev_loop      *loop;   // loop for EV library
53 |     void                *leader_ep;
54 |     FILE                *trace_fp;
55 |     FILE                *output_fp;
56 |     dare_sm_t           *sm;        // local state machine
57 |     HRT_TIMESTAMP_T     t1, t2;
58 | };
59 | typedef struct dare_client_data_t dare_client_data_t;
60 | 
61 | /* ================================================================== */
62 | 
/* Initializes the client from the given input parameters.
 * NOTE(review): return convention is not visible here -- presumably 0 on
 * success; confirm against the implementation. */
int dare_client_init( dare_client_input_t *input );
/* Shuts the client down (implementation not visible from this header). */
void dare_client_shutdown();
65 | 
66 | #endif /* DARE_CLIENT_H */
67 | 


--------------------------------------------------------------------------------
/src/include/dare/dare_config.h:
--------------------------------------------------------------------------------
  1 | /**                                                                                                      
  2 |  * DARE (Direct Access REplication)
  3 |  * 
  4 |  * Group configuration
  5 |  *
  6 |  * Copyright (c) 2014-2015 ETH-Zurich. All rights reserved.
  7 |  * 
  8 |  * Author(s): Marius Poke <marius.poke@inf.ethz.ch>
  9 |  * 
 10 |  */
 11 |  
 12 | #ifndef DARE_CONFIG_H
 13 | #define DARE_CONFIG_H 
 14 | 
 15 | #include "./dare.h"
 16 | 
 17 | /* Stable configuration: only one size specified */
 18 | #define CID_STABLE  0
 19 | /* Transitional configuration: both old and new size are specified; 
 20 |  * !!! both majority needed */
 21 | #define CID_TRANSIT  1
 22 | /* Extended configuration: both old and new size are specified; 
 23 |  * !!! only old majority needed */
 24 | #define CID_EXTENDED  2
 25 | 
 26 | #define CID_IS_SERVER_ON(cid, idx) ((cid).bitmask & (1 << (idx)))
 27 | #define CID_SERVER_ADD(cid, idx) (cid).bitmask |= 1 << (idx)
 28 | #define CID_SERVER_RM(cid, idx) (cid).bitmask &= ~(1 << (idx))
 29 | 
 30 | /** 
 31 |  * Configuration ID: A configuration is given by a 
 32 |  * [N, N', STATE, BITMASK] tuple, where:
 33 |  * N - is the current group size
 34 |  * N' - is the new size in a transitional configuration
 35 |  * STATE - is the configuration state: stable, transitional, extended
 36 |  * BITMASK - is a bitmask with a bit set for every on servers
 37 |  */
 38 | struct dare_cid_t {
 39 |     uint64_t epoch;
 40 |     uint8_t size[2];
 41 |     uint8_t state;
 42 |     uint8_t pad[1];
 43 |     uint32_t bitmask;
 44 | };
 45 | typedef struct dare_cid_t dare_cid_t;
 46 | 
 47 | static int
 48 | equal_cid( dare_cid_t left_cid, dare_cid_t right_cid )
 49 | {
 50 |     if (left_cid.epoch != right_cid.epoch) return 0;
 51 |     if (left_cid.state != right_cid.state) return 0;
 52 |     if (left_cid.size[0] != right_cid.size[0]) return 0;
 53 |     if (left_cid.size[1] != right_cid.size[1]) return 0;
 54 |     if (left_cid.bitmask != right_cid.bitmask) return 0;
 55 |     return 1;
 56 | } 
 57 | 
 58 | typedef struct server_t server_t;
 59 | struct server_config_t {
 60 |     dare_cid_t cid;         /* configuration identifier */ 
 61 |     uint64_t cid_offset;    /* the offset of the next entry from where 
 62 |                             to start looking for CONFIG entries; 
 63 |                             note that it cannot be larger than WRITE */
 64 |     uint64_t cid_idx;       /* the index of the last CONFIG entry before 
 65 |                             joining the cluster; a server considers only
 66 |                             CONFIG entries with a larger index */
 67 |     uint64_t req_id;        /* Request ID of the endpoint that owns 
 68 |                             this configuration change */
 69 |     server_t *servers;      /* array with info for each server */
 70 |     uint16_t clt_id;        /* LID of the endpoint that owns 
 71 |                             this configuration change */
 72 |     uint8_t idx;            /* own index in configuration */
 73 |     uint8_t len;            /* fixed length of configuration array */
 74 | };
 75 | typedef struct server_config_t server_config_t;
 76 | 
 77 | /* Get the maximum size including the extra added servers */
 78 | static uint8_t
 79 | get_extended_group_size( server_config_t config )
 80 | {
 81 |     if (CID_STABLE == config.cid.state)
 82 |         return config.cid.size[0];
 83 |     if (config.cid.size[0] < config.cid.size[1])
 84 |         return config.cid.size[1];
 85 |     return config.cid.size[0];
 86 | }
 87 | 
 88 | /* Get the maximum size ignoring the extra added servers */
 89 | static uint8_t
 90 | get_group_size( server_config_t config )
 91 | {
 92 |     if (CID_TRANSIT != config.cid.state)
 93 |         return config.cid.size[0];
 94 |     if (config.cid.size[0] < config.cid.size[1])
 95 |         return config.cid.size[1];
 96 |     return config.cid.size[0];
 97 | }
 98 | 
/* Print a configuration ID as [E<epoch>:<size0>|<size1>|<state>|<bitmask>] */
#define PRINT_CID(cid) text(log_fp,     \
    " [E%"PRIu64":%02"PRIu8"|%02"PRIu8"|%d|%03"PRIu32"] ", \
    (cid).epoch, (cid).size[0], (cid).size[1], (cid).state, (cid).bitmask)
/* Same as PRINT_CID, followed by a newline.
 * NOTE(review): expands to two statements and already ends in ';', so it
 * must not be used as the unbraced body of an if/else/loop. */
#define PRINT_CID_(cid) PRINT_CID(cid); text(log_fp, "\n");

/* Log a transition from old_cid to new_cid, tagged with the calling
 * function and line. */
#define PRINT_CONF_TRANSIT(old_cid, new_cid) \
    info_wtime(log_fp, "(%s:%d) Configuration transition: " \
        "[E%"PRIu64":%02"PRIu8"|%02"PRIu8"|%d|%03"PRIu32"] -> " \
        "[E%"PRIu64":%02"PRIu8"|%02"PRIu8"|%d|%03"PRIu32"]\n", \
        __func__, __LINE__, \
        (old_cid).epoch, (old_cid).size[0], (old_cid).size[1], \
        (old_cid).state, (old_cid).bitmask, \
        (new_cid).epoch, (new_cid).size[0], (new_cid).size[1], \
        (new_cid).state, (new_cid).bitmask)
113 | 
114 | #endif /* DARE_CONFIG_H */
115 | 


--------------------------------------------------------------------------------
/src/include/dare/dare_ep_db.h:
--------------------------------------------------------------------------------
 1 | /**                                                                                                      
 2 |  * DARE (Direct Access REplication)
 3 |  *
 4 |  * Endpoint database
 5 |  *
 6 |  * Copyright (c) 2014-2015 ETH-Zurich. All rights reserved.
 7 |  * 
 8 |  * Author(s): Marius Poke <marius.poke@inf.ethz.ch>
 9 |  * 
10 |  */
11 |  
12 | #ifndef DARE_EP_DB_H
13 | #define DARE_EP_DB_H
14 | 
15 | #include "../../../utils/rbtree/include/rbtree.h"
16 | #include "./dare_ibv.h"
17 | 
18 | /* ================================================================== */
19 | 
20 | struct dare_ep_t {
21 |     struct rb_node node;
22 |     ud_ep_t ud_ep;
23 |     uint8_t last_read_request[128];
24 |     uint64_t wait_for_idx;
25 |     uint64_t last_req_id;   /* this is the ID of the last request from 
26 |                             this endpoint that I answer; ignore requests 
27 |                             with lower IDs */
28 |     uint64_t cid_idx;
29 |     int committed;
30 | };
31 | typedef struct dare_ep_t dare_ep_t;
32 | 
33 | /* ================================================================== */
34 | 
/* Returns the endpoint with the given LID, or NULL when there is none */
dare_ep_t* ep_search( struct rb_root *root, const uint16_t lid );
/* Creates and inserts an endpoint; returns NULL when the LID already exists */
dare_ep_t* ep_insert( struct rb_root *root, const uint16_t lid, const union ibv_gid dest_gid );
/* Removes and frees the endpoint with the given LID, if present */
void ep_erase( struct rb_root *root, const uint16_t lid );
/* Logs the LID and QPN of every endpoint */
void ep_db_print( struct rb_root *root );
/* Frees every endpoint in the tree */
void ep_db_free( struct rb_root *root );
/* Sets wait_for_idx of every endpoint to 0 */
void ep_dp_reset_wait_idx( struct rb_root *root );
/* Answers pending read requests of endpoints with 0 < wait_for_idx < idx,
 * after verifying leadership */
void ep_dp_reply_read_req( struct rb_root *root, uint64_t idx );
42 | 
43 | 
44 | #endif /* DARE_EP_DB_H */
45 | 
46 | 
47 | 


--------------------------------------------------------------------------------
/src/include/dare/dare_ibv.h:
--------------------------------------------------------------------------------
  1 | /**                                                                                                      
  2 |  * DARE (Direct Access REplication)
  3 |  *
  4 |  * Network module for the DARE consensus algorithm (IB verbs)
  5 |  *
  6 |  * Copyright (c) 2014-2015 ETH-Zurich. All rights reserved.
  7 |  * 
  8 |  * Author(s): Marius Poke <marius.poke@inf.ethz.ch>
  9 |  * 
 10 |  */
 11 |  
 12 | #include <infiniband/verbs.h> /* OFED IB verbs */
 13 | #include "./dare.h"
 14 |  
 15 | #ifndef DARE_IBV_H
 16 | #define DARE_IBV_H
 17 | 
 18 | #define DARE_WR_COUNT    32
 19 | #define IB_PKEY_MASK 0x7fff
 20 | 
 21 | #define IBV_SERVER  1
 22 | #define IBV_CLIENT  2
 23 | 
 24 | /* Map an enum ibv_mtu value to its size in bytes; yields 0 for any value not listed. The argument is evaluated several times, so pass a side-effect-free expression. */
 25 | #define mtu_value(mtu) \
 26 |     (((mtu) == IBV_MTU_256) ? 256 :    \
 27 |     ((mtu) == IBV_MTU_512) ? 512 :    \
 28 |     ((mtu) == IBV_MTU_1024) ? 1024 :  \
 29 |     ((mtu) == IBV_MTU_2048) ? 2048 :  \
 30 |     ((mtu) == IBV_MTU_4096) ? 4096 : 0)
 31 | /* Map an enum ibv_qp_state value to a short string; yields "X" for any state not listed. The argument is evaluated several times, so pass a side-effect-free expression. */
 32 | #define qp_state_to_str(state) \
 33 |    (((state) == IBV_QPS_RESET) ? "RESET" : \
 34 |    ((state) == IBV_QPS_INIT) ? "INIT" : \
 35 |    ((state) == IBV_QPS_RTR) ? "RTR" : \
 36 |    ((state) == IBV_QPS_RTS) ? "RTS" : \
 37 |    ((state) == IBV_QPS_ERR) ? "ERR" : "X")
 38 | 
 39 | #define CTRL_PSN 13
 40 | #define LOG_PSN 55
 41 | #define LOG_QP 1
 42 | #define CTRL_QP 0
 43 | 
 44 | #define HB_CNT_DELAY 3
 45 | 
 46 | /* Endpoint UD info */
 47 | struct ud_ep_t {
 48 |     uint16_t lid;           // LID of the endpoint
 49 |     union ibv_gid gid;      // GID of the endpoint
 50 |     uint32_t qpn;           // QP number (presumably the endpoint's UD QP -- confirm)
 51 |     struct ibv_ah *ah;      // address handle (presumably created via ud_ah_create() -- confirm)
 52 | };
 53 | typedef struct ud_ep_t ud_ep_t;
 54 | /* Remote memory descriptor: an address/key pair (used for the remote memory regions in rc_ep_t) */
 55 | struct rem_mem_t {
 56 |     uint64_t raddr;     // remote address
 57 |     uint32_t rkey;      // remote key
 58 | };
 59 | typedef struct rem_mem_t rem_mem_t;
 60 | /* RC QP states (presumably the values stored in rc_qp_t.state -- confirm) */
 61 | #define RC_QP_ACTIVE    0
 62 | #define RC_QP_BLOCKED   1
 63 | #define RC_QP_ERROR     2
 64 | 
 65 | struct rc_qp_t {
 66 |     struct ibv_qp *qp;          // RC QP
 67 |     uint64_t signaled_wr_id;    // ID of signaled WR (to avoid overflow)
 68 |     uint32_t qpn;               // remote QP number
 69 |     uint32_t send_count;        // number of posted sends
 70 |     uint8_t  state;             // QP's state (presumably one of RC_QP_* above)
 71 | }; 
 72 | typedef struct rc_qp_t rc_qp_t;
 73 | 
 74 | /* Endpoint RC info; both arrays have two slots (presumably indexed by CTRL_QP = 0 and LOG_QP = 1 -- confirm) */
 75 | struct rc_ep_t {
 76 |     rem_mem_t rmt_mr[2];    // remote memory regions
 77 |     rc_qp_t   rc_qp[2];     // RC QPs (LOG & CTRL)
 78 | };
 79 | typedef struct rc_ep_t rc_ep_t;
 80 | /* IB state kept per endpoint: UD info, RC info, and connection flags */
 81 | struct dare_ib_ep_t {
 82 |     ud_ep_t ud_ep;  // UD info
 83 |     rc_ep_t rc_ep;  // RC info
 84 |     uint32_t mtu;       // NOTE(review): stored as uint32_t, not enum ibv_mtu -- units/encoding not visible here
 85 |     int rc_connected;   // presumably non-zero once the RC QPs are connected -- confirm
 86 |     int log_access;     // presumably non-zero while log access is granted (see dare_ib_revoke_log_access) -- confirm
 87 | };
 88 | typedef struct dare_ib_ep_t dare_ib_ep_t;
 89 | /* IB device state: general device info, UD resources, multicast info, RC resources and snapshot MRs */
 90 | struct dare_ib_device_t {
 91 |     /* General fields */
 92 |     struct ibv_device *ib_dev;              // device handle
 93 |     struct ibv_context *ib_dev_context;     // device context
 94 |     struct ibv_device_attr ib_dev_attr;     // device attributes
 95 |     uint16_t pkey_index;    
 96 |     int gid_index;
 97 |     union ibv_gid gid;
 98 |     uint8_t port_num;       // port number 
 99 |     enum ibv_mtu mtu;       // MTU for this device
100 |     uint16_t lid;           // local ID for this device        
101 | 
102 |     /* QP for listening for clients requests - UD */
103 |     struct ibv_pd           *ud_pd;         // protection domain
104 |     struct ibv_qp           *ud_qp;         // the UD QP
105 |     struct ibv_cq           *ud_rcq;        // presumably the receive CQ -- confirm
106 |     struct ibv_cq           *ud_scq;        // presumably the send CQ -- confirm
107 |     int                     ud_rcqe;        // presumably the number of entries in ud_rcq -- confirm
108 |     void                    **ud_recv_bufs; // array of receive buffers
109 |     struct ibv_mr           **ud_recv_mrs;  // array of MRs (presumably one per receive buffer -- confirm)
110 |     void                    *ud_send_buf;   // send buffer
111 |     struct ibv_mr           *ud_send_mr;    // MR (presumably covering ud_send_buf -- confirm)
112 |     uint32_t                ud_max_inline_data;
113 |     uint64_t  request_id;
114 |     
115 |     /* Multicast */
116 |     struct ibv_ah *ib_mcast_ah;     // address handle for the multicast group
117 |     union ibv_gid mgid;             // multicast GID
118 |     uint16_t      mlid;             // multicast LID
119 |     
120 |     /* QPs for inter-server communication - RC */
121 |     struct ibv_pd *rc_pd;           // protection domain
122 |     struct ibv_cq *rc_cq[2];        // two CQs (presumably one per LOG/CTRL QP -- confirm)
123 |     int           rc_cqe;
124 |     struct ibv_wc *rc_wc_array;     // array of work completions
125 |     struct ibv_mr *lcl_mr[2];       // two local MRs (presumably log & ctrl data -- confirm)
126 |     uint32_t      rc_max_inline_data;
127 |     uint32_t      rc_max_send_wr;
128 |     
129 |     /* Snapshot */
130 |     struct ibv_mr *prereg_snapshot_mr;
131 |     struct ibv_mr *snapshot_mr;
132 |     
133 |     int ulp_type;       // presumably IBV_SERVER or IBV_CLIENT -- confirm
134 |     void *udata;        // opaque user data
135 | };
136 | typedef struct dare_ib_device_t dare_ib_device_t;
137 | 
138 | /* ================================================================== */
139 | /* NOTE(review): declarations written with empty parentheses, e.g. f(), do not specify a parameter list in C before C23, so calls to them are not argument-checked */
140 | /* Init and cleaning up */
141 | int dare_init_ib_device();
142 | int dare_start_ib_ud();
143 | int dare_init_ib_srv_data( void *data );
144 | int dare_init_ib_clt_data( void *data );
145 | int dare_init_ib_rc();
146 | void dare_ib_srv_shutdown();
147 | void dare_ib_clt_shutdown();
148 | void dare_ib_destroy_ep( uint8_t idx );
149 | 
150 | /* Starting a server */
151 | void dare_ib_poll_tailq();
152 | uint8_t dare_ib_poll_ud_queue();
153 | int dare_ib_join_cluster();
154 | int dare_ib_exchange_rc_info();
155 | int dare_ib_update_rc_info();
156 | int dare_ib_get_replicated_vote();
157 | int dare_ib_send_sm_request();
158 | int dare_ib_send_sm_reply( uint8_t idx, void *s, int reg_mem );
159 | int dare_ib_recover_sm( uint8_t idx );
160 | int dare_ib_recover_log();
161 | 
162 | /* HB mechanism */
163 | int dare_ib_send_hb();
164 | int dare_ib_send_hb_reply( uint8_t idx );
165 | 
166 | /* Leader election */
167 | int dare_ib_send_vote_request();
168 | int dare_ib_replicate_vote();
169 | int dare_ib_send_vote_ack();
170 | 
171 | /* Normal operation */
172 | int dare_ib_establish_leadership();
173 | int dare_ib_write_remote_logs( int wait_for_commit );
174 | int dare_ib_send_entries_reply( uint8_t idx );
175 | int dare_ib_get_remote_apply_offsets();
176 | 
177 | /* Handle client requests */
178 | int dare_ib_apply_cmd_locally();
179 | int dare_ib_create_clt_request();
180 | int dare_ib_create_clt_downsize_request();
181 | int dare_ib_resend_clt_request();
182 | int dare_ib_send_clt_reply( uint16_t lid, uint64_t req_id, uint8_t type );
183 | 
184 | /* Handle QPs state */
185 | void dare_ib_disconnect_server( uint8_t idx );
186 | int dare_ib_revoke_log_access();
187 | int dare_ib_restore_log_access();
188 | 
189 | /* LogGP */
190 | double dare_ib_get_loggp_params( uint32_t size, int type, int *poll_count, int write, int inline_flag );
191 | double dare_ib_loggp_prtt( int n, double delay, uint32_t size, int inline_flag );
192 | int dare_ib_loggp_exit();
193 | 
194 | void print_rc_info();
195 | int print_qp_state( void *qp );
196 | int dare_ib_print_ud_qp();
197 | 
198 | void dare_ib_send_msg();
199 | int find_max_inline( struct ibv_context *context, 
200 |                      struct ibv_pd *pd,
201 |                      uint32_t *max_inline_arg );
202 | 
203 | #endif /* DARE_IBV_H */
204 | 


--------------------------------------------------------------------------------
/src/include/dare/dare_ibv_rc.h:
--------------------------------------------------------------------------------
 1 | /**                                                                                                      
 2 |  * DARE (Direct Access REplication)
 3 |  * 
 4 |  * Reliable Connection (RC) over InfiniBand
 5 |  *
 6 |  * Copyright (c) 2014-2015 ETH-Zurich. All rights reserved.
 7 |  * 
 8 |  * Author(s): Marius Poke <marius.poke@inf.ethz.ch>
 9 |  * 
10 |  */
11 | 
12 | #ifndef DARE_IBV_RC_H
13 | #define DARE_IBV_RC_H
14 | 
15 | #include <infiniband/verbs.h> /* OFED stuff */
16 | #include "./dare_ibv.h"
17 | 
18 | #define SIGNALED    1
19 | #define NOTSIGNALED 0
20 | //#define NOTSIGNALED 1
21 | 
22 | /**
23 |  * The WR Identifier (WRID)
24 |  * the WRID is a 64-bit value [SSN|WA|TAG|CONN], where
25 |     * SSN is the Send Sequence Number
26 |     * WA is the Wrap-Around flag, set for log update WRs 
27 |     * TAG is a flag set for special signaled WRs (to avoid QPs overflow)
28 |     * CONN is an 8-bit index that identifies the connection (the remote server)
29 |  */
30 | /* The CONN consists of the 8 least significant bits (lsbs); the SET/UNSET macros evaluate wrid more than once */
31 | #define WRID_GET_CONN(wrid) ((uint8_t)((wrid) & (0xFF)))
32 | #define WRID_SET_CONN(wrid, conn) ((wrid) = (((uint64_t)(conn) & 0xFF) | (((wrid) >> 8) << 8)))
33 | /* The TAG flag is the 9th lsb */
34 | #define WRID_GET_TAG(wrid) ((wrid) & (1 << 8))
35 | #define WRID_SET_TAG(wrid) ((wrid) |= (1ULL << 8))
36 | #define WRID_UNSET_TAG(wrid) ((wrid) &= ~(1ULL << 8))
37 | /* The WA flag is the 10th lsb */
38 | #define WRID_GET_WA(wrid) ((wrid) & (1 << 9))
39 | #define WRID_SET_WA(wrid) ((wrid) |= (1ULL << 9))
40 | #define WRID_UNSET_WA(wrid) ((wrid) &= ~(1ULL << 9))
41 | /* The SSN consists of the most significant 54 bits; the cast keeps the shift 64-bit even for a narrower ssn */
42 | #define WRID_GET_SSN(wrid) ((wrid) >> 10)
43 | #define WRID_SET_SSN(wrid, ssn) ((wrid) = (((uint64_t)(ssn) << 10) | ((wrid) & 0x3FF)))
44 | /* Print a WRID to log_fp as " [SSN|WA|TAG|CONN] " via info(); PRINT_WRID_ also prints a newline. NOTE(review): PRINT_WRID_ expands to two statements plus a trailing ';', so it is unsafe as the body of an unbraced if/else */
45 | #define PRINT_WRID(wrid) info(log_fp,     \
46 |     " [%010"PRIu64"|%d|%d|%03"PRIu8"] ", \
47 |     WRID_GET_SSN(wrid),  \
48 |     (WRID_GET_WA(wrid) ? 1 : 0),   \
49 |     (WRID_GET_TAG(wrid) ? 1 : 0),   \
50 |     WRID_GET_CONN(wrid))
51 | #define PRINT_WRID_(wrid) PRINT_WRID(wrid); info(log_fp, "\n");
52 | 
53 | int rc_init();
54 | void rc_free();
55 | 
56 | /* Start up */
57 | int rc_get_replicated_vote();
58 | int rc_send_sm_request();
59 | int rc_send_sm_reply( uint8_t idx, void *s, int reg_mem );
60 | int rc_recover_sm( uint8_t idx );
61 | int rc_recover_log();
62 | 
63 | /* HB mechanism */
64 | int rc_send_hb();
65 | int rc_send_hb_reply( uint8_t idx );
66 | 
67 | /* Leader election */
68 | int rc_send_vote_request();
69 | int rc_replicate_vote();
70 | int rc_send_vote_ack();
71 | 
72 | /* Normal operation */
73 | int rc_verify_leadership( int *leader );
74 | int rc_write_remote_logs( int wait_for_commit );
75 | int rc_send_entries_reply( uint8_t idx );
76 | int rc_get_remote_apply_offsets();
77 | 
78 | /* QP interface */
79 | int rc_disconnect_server( uint8_t idx );
80 | int rc_connect_server( uint8_t idx, int qp_id );
81 | int rc_revoke_log_access();
82 | int rc_restore_log_access();
83 | 
84 | /* LogGP */
85 | double rc_get_loggp_params( uint32_t size, int type, int *poll_count, int write, int inline_flag );
86 | double rc_loggp_prtt( int n, double delay, uint32_t size );  /* NOTE(review): dare_ib_loggp_prtt() in dare_ibv.h takes an extra inline_flag argument that this prototype lacks */
87 | int rc_loggp_exit();
88 |  
89 | int rc_print_qp_state( void *data );
90 | void rc_ib_send_msg();
91 | #endif /* DARE_IBV_RC_H */
92 | 


--------------------------------------------------------------------------------
/src/include/dare/dare_ibv_ud.h:
--------------------------------------------------------------------------------
  1 | /**                                                                                                      
  2 |  * DARE (Direct Access REplication)
  3 |  * 
  4 |  * Unreliable Datagrams (UD) over InfiniBand
  5 |  *
  6 |  * Copyright (c) 2016 HLRS, University of Stuttgart. All rights reserved.
  7 |  * 
  8 |  * Copyright (c) 2014-2015 ETH-Zurich. All rights reserved.
  9 |  * 
 10 |  * Author(s): Marius Poke <marius.poke@inf.ethz.ch>
 11 |  *            Nakul Vyas <mailnakul@gmail.com>
 12 |  * 
 13 |  */
 14 |  
 15 | #ifndef DARE_IBV_UD_H
 16 | #define DARE_IBV_UD_H
 17 | 
 18 | #include <infiniband/verbs.h> /* OFED stuff */ 
 19 | #include "./dare_sm.h"
 20 | #include "./dare_ibv.h"
 21 | #include "./dare_config.h"
 22 | #include "./dare_ep_db.h"
 23 | 
 24 | #define REQ_MAJORITY 13
 25 | #define MCG_GID {255,1,0,0,0,2,201,133,0,0,0,0,0,0,0,0}
 26 | 
 27 | /* ================================================================== */
 28 | /* UD messages */
 29 | struct ud_hdr_t {       // header placed first in every UD message struct in this file
 30 |     uint64_t id;        // message ID (presumably the request ID -- confirm)
 31 |     uint8_t type;       // message type
 32 |     union ibv_gid gid;  // GID (presumably the sender's -- confirm)
 33 |     //uint8_t pad[7];
 34 |     uint16_t slid;      // presumably the sender's LID -- confirm
 35 | };
 36 | typedef struct ud_hdr_t ud_hdr_t;
 37 | 
 38 | struct client_req_t {   // client request: UD header followed by a state-machine command
 39 |     ud_hdr_t hdr;
 40 |     sm_cmd_t cmd;       // sm_cmd_t ends in a zero-length array, so the command bytes trail this struct
 41 | };
 42 | typedef struct client_req_t client_req_t;
 43 | 
 44 | struct client_rep_t {   // client reply: UD header followed by state-machine data
 45 |     ud_hdr_t hdr;
 46 |     sm_data_t data;     // sm_data_t ends in a zero-length array, so the data bytes trail this struct
 47 | };
 48 | typedef struct client_rep_t client_rep_t;
 49 | 
 50 | struct reconf_req_t {
 51 |     ud_hdr_t hdr;
 52 |     uint8_t  idx_size;
 53 | };
 54 | typedef struct reconf_req_t reconf_req_t;
 55 | 
 56 | struct reconf_rep_t {
 57 |     ud_hdr_t   hdr;
 58 |     uint8_t    idx;
 59 |     dare_cid_t cid;
 60 |     uint64_t cid_idx;
 61 |     uint64_t head;
 62 | };
 63 | typedef struct reconf_rep_t reconf_rep_t;
 64 | 
 65 | struct rc_syn_t {       // NOTE(review): presumably the message carrying RC connection info between servers -- confirm
 66 |     ud_hdr_t hdr;
 67 |     rem_mem_t log_rm;   // remote memory descriptor for the log
 68 |     rem_mem_t ctrl_rm;  // remote memory descriptor for the control data
 69 |     enum ibv_mtu mtu;
 70 |     //union ibv_gid gid;
 71 |     uint8_t idx;        // presumably the sender's server index -- confirm
 72 |     uint8_t size;       // NOTE(review): meaning not visible here (group size or length of data[]?) -- confirm
 73 |     uint8_t data[0];    // log & ctrl QPNs
 74 | };
 75 | typedef struct rc_syn_t rc_syn_t;
 76 | 
 77 | struct rc_ack_t {
 78 |     ud_hdr_t hdr;
 79 |     uint8_t idx;
 80 | };
 81 | typedef struct rc_ack_t rc_ack_t;
 82 | 
 83 | extern char* global_mgid; 
 84 | 
 85 | /* ================================================================== */ 
 86 | 
 87 | int ud_init( uint32_t receive_count );
 88 | int ud_start();
 89 | void ud_shutdown();
 90 | 
 91 | struct ibv_ah* ud_ah_create( uint16_t dlid, union ibv_gid dgid );
 92 | void ud_ah_destroy( struct ibv_ah* ah );
 93 | 
 94 | void get_tailq_message();
 95 | uint8_t ud_get_message();
 96 | int ud_join_cluster();
 97 | int ud_exchange_rc_info();
 98 | int ud_update_rc_info();
 99 | int ud_discover_servers();
100 | int ud_establish_rc();
101 | 
102 | /* Client stuff */
103 | int ud_send_clt_reply( uint16_t lid, uint64_t req_id, uint8_t type );
104 | void ud_clt_answer_read_request(dare_ep_t *ep);
105 | 
106 | #endif /* DARE_IBV_UD_H */
107 | 


--------------------------------------------------------------------------------
/src/include/dare/dare_kvs_sm.h:
--------------------------------------------------------------------------------
 1 | /**                                                                                                      
 2 |  * DARE (Direct Access REplication)
 3 |  * 
 4 |  * State machine implementation (KVS)
 5 |  *
 6 |  * Copyright (c) 2014-2015 ETH-Zurich. All rights reserved.
 7 |  * 
 8 |  * Author(s): Marius Poke <marius.poke@inf.ethz.ch>
 9 |  * 
10 |  */
11 |  
12 | #ifndef DARE_KVS_SM_H
13 | #define DARE_KVS_SM_H
14 | 
15 | #define __STDC_FORMAT_MACROS
16 | #include <inttypes.h>
17 | 
18 | #include "./dare_sm.h"
19 | 
20 | #define DEFAULT_KVS_SIZE 1024
21 | #define KEY_SIZE 64
22 | 
23 | /* KVS commands */
24 | #define KVS_PUT 1
25 | #define KVS_GET 2
26 | #define KVS_RM  3
27 | 
28 | /* KVS command */
29 | struct kvs_cmd_t {
30 |     uint8_t     type;   // read, write, delete (presumably KVS_GET / KVS_PUT / KVS_RM above)
31 |     char        key[KEY_SIZE];  // key bytes (NOTE(review): whether NUL-terminated is not visible here)
32 |     uint16_t    len;    // presumably the byte length of data[] -- confirm
33 |     uint8_t     data[0];    // trailing payload (zero-length array, a GNU extension)
34 | };
35 | typedef struct kvs_cmd_t kvs_cmd_t;
36 | 
37 | struct kvs_blob_t {
38 |     uint16_t len;
39 |     void *data;
40 | };
41 | typedef struct kvs_blob_t kvs_blob_t;
42 | 
43 | struct kvs_entry_t {
44 |     char       key[KEY_SIZE];
45 |     kvs_blob_t blob;
46 | };
47 | typedef struct kvs_entry_t kvs_entry_t;
48 | 
49 | #endif /* DARE_KVS_SM_H */
50 | 


--------------------------------------------------------------------------------
/src/include/dare/dare_server.h:
--------------------------------------------------------------------------------
  1 | /**                                                                                                      
  2 |  * DARE (Direct Access REplication)
  3 |  *
  4 |  * Implementation of a DARE server
  5 |  *
  6 |  * Copyright (c) 2014-2015 ETH-Zurich. All rights reserved.
  7 |  * 
  8 |  * Author(s): Marius Poke <marius.poke@inf.ethz.ch>
  9 |  * 
 10 |  */
 11 |  
 12 | #ifndef DARE_SERVER_H
 13 | #define DARE_SERVER_H 
 14 | 
 15 | #include <stdio.h>
 16 | 
 17 | #include <ev.h>
 18 | #include "../../../utils/rbtree/include/rbtree.h"
 19 | #include "./dare_log.h"
 20 | #include "./dare.h"
 21 | #include "./timer.h"
 22 | 
 23 | /* Server types */
 24 | #define SRV_TYPE_START  1
 25 | #define SRV_TYPE_JOIN   2
 26 | #define SRV_TYPE_LOGGP  3
 27 | 
 28 | /* LogGP param types */
 29 | #define LOGGP_PARAM_O   1
 30 | #define LOGGP_PARAM_OP  2
 31 | #define LOGGP_PARAM_L   3
 32 | #define LOGGP_PARAM_OPX 4
 33 | 
 34 | /* Retry period before failures in ms */
 35 | extern const double retry_exec_period;
 36 | 
 37 | /* Heartbeat period in ms */
 38 | extern double hb_period;
 39 | extern uint64_t elec_timeout_low;
 40 | extern uint64_t elec_timeout_high;
 41 | extern double rc_info_period;
 42 | extern double retransmit_period;
 43 | extern double log_pruning_period;
 44 | 
 45 | /**
 46 |  * The state identifier (SID)
 47 |  * the SID is a 64-bit value [TERM|L|IDX], where
 48 |     * TERM is the current term
 49 |     * L is the leader flag, set when there is a leader
 50 |     * IDX is the index of the server that caused the last SID update
 51 |  */
 52 | /* The IDX consists of the 8 least significant bits (lsbs); the SET/UNSET macros evaluate sid more than once */
 53 | #define SID_GET_IDX(sid) ((uint8_t)((sid) & (0xFF)))
 54 | #define SID_SET_IDX(sid, idx) ((sid) = (((uint64_t)(idx) & 0xFF) | (((sid) >> 8) << 8)))
 55 | /* The L flag is the 9th lsb */
 56 | #define SID_GET_L(sid) ((sid) & (1 << 8))
 57 | #define SID_SET_L(sid) ((sid) |= (1ULL << 8))
 58 | #define SID_UNSET_L(sid) ((sid) &= ~(1ULL << 8))
 59 | /* The TERM consists of the most significant 55 bits; the cast keeps the shift 64-bit even for a narrower term */
 60 | #define SID_GET_TERM(sid) ((sid) >> 9)
 61 | #define SID_SET_TERM(sid, term) ((sid) = (((uint64_t)(term) << 9) | ((sid) & 0x1FF)))
 62 | /* Print a SID as " [TERM|L|IDX] " via text(), which expands to nothing unless DEBUG is defined (see debug.h); PRINT_SID_ also prints a newline. NOTE(review): PRINT_SID_ expands to two statements plus a trailing ';' */
 63 | #define PRINT_SID(sid) text(log_fp,     \
 64 |     " [%010"PRIu64"|%d|%03"PRIu8"] ", \
 65 |     SID_GET_TERM(sid),  \
 66 |     (SID_GET_L(sid) ? 1 : 0),   \
 67 |     SID_GET_IDX(sid))
 68 | #define PRINT_SID_(sid) PRINT_SID(sid); text(log_fp, "\n");
 69 | 
 70 | #define IS_SID_NEW(sid) (!SID_GET_L(sid) && (SID_GET_TERM(sid) == 0))
 71 | #define SID_NULL 0xFF
 72 | #define SID_DEAD 0xFFFFFFFFFFFFFFFF
 73 | 
 74 | /* Number of failed communication attempts before considering a remote 
 75 | server as permanently failed */
 76 | #define PERMANENT_FAILURE   2
 77 | 
 78 | /* Normal operation (log replication) steps */
 79 | #define LR_GET_WRITE      1
 80 | #define LR_GET_NCE_LEN    2
 81 | #define LR_GET_NCE        3
 82 | #define LR_SET_END        4
 83 | #define LR_UPDATE_LOG     5
 84 | #define LR_UPDATE_END     6
 85 | 
 86 | struct server_t {
 87 |     uint64_t next_wr_id;    // next WR ID to wait for
 88 |     uint64_t cached_end_offset; // the new end offset if the log update succeeds
 89 |     uint64_t last_get_read_ssn; // ssn of the last get read operation
 90 |     void *ep;               // endpoint data (network related)
 91 |     uint8_t fail_count;     // number of failures detected
 92 |     uint8_t next_lr_step;   // next log replication step (see the LR_* constants above)
 93 |     uint8_t send_flag;      // flag set for posting send for this EP
 94 |     uint8_t send_count;     // number of sends posted for current step
 95 | };
 96 | 
 97 | //typedef struct server_t server_t;
 98 | 
 99 | struct vote_req_t {
100 |     uint64_t sid;
101 |     uint64_t index;
102 |     uint64_t term;
103 |     dare_cid_t cid;
104 | };
105 | typedef struct vote_req_t vote_req_t;
106 | 
107 | struct prv_data_t {
108 |     uint64_t vote_sid;  // SID of last vote given
109 |                         // on recovery need to retrieve this from a 
110 |                         // remote server and update own SID to at least 
111 |                         // this SID
112 | };
113 | typedef struct prv_data_t prv_data_t;
114 | 
115 | struct sm_rep_t {       // NOTE(review): presumably the reply to an SM request, describing a remote buffer -- confirm
116 |     uint64_t sid;
117 |     uint64_t raddr;     // remote address
118 |     uint32_t rkey;      // remote key
119 |     uint32_t len;       // presumably the length in bytes of the region at raddr -- confirm
120 | };
121 | typedef struct sm_rep_t sm_rep_t;
122 | 
123 | struct ctrl_data_t {
124 |     /* State identifier (SID) */
125 |     uint64_t    sid;
126 |     
127 |     /* DARE arrays, each with MAX_SERVER_COUNT slots (presumably one per server index -- confirm) */
128 |     vote_req_t    vote_req[MAX_SERVER_COUNT];       /* vote requests */
129 |     log_offsets_t log_offsets[MAX_SERVER_COUNT];	/* log offsets */
130 |     sm_rep_t      sm_rep[MAX_SERVER_COUNT];         /* SM replies */
131 |     uint64_t      sm_req[MAX_SERVER_COUNT];         /* SM requests */
132 |     uint64_t 	  hb[MAX_SERVER_COUNT];             /* heartbeat array */ 
133 |     uint64_t      vote_ack[MAX_SERVER_COUNT];       /* vote acknowledgements */
134 |     uint64_t      rsid[MAX_SERVER_COUNT];   /* for remote terms & indexes */
135 |     uint64_t      apply_offsets[MAX_SERVER_COUNT];   /* apply offsets */
136 |     
137 |     /* Remote private data */
138 |     prv_data_t  prv_data[MAX_SERVER_COUNT];    // private data
139 | };
140 | typedef struct ctrl_data_t ctrl_data_t;
141 | 
142 | struct dare_server_input_t {    // input handed to the server (dare_server_init() takes a void *arg; presumably this struct -- confirm)
143 |     FILE* log;
144 |     char* name;
145 |     char* output;
146 |     uint8_t srv_type;       // presumably one of SRV_TYPE_* above -- confirm
147 |     uint8_t sm_type;        // presumably one of SM_* from dare_sm.h -- confirm
148 |     uint8_t group_size;
149 |     uint8_t server_idx;
150 |     
151 |     proxy_do_action_cb_t do_action;     // proxy callbacks; the types are declared in dare_sm.h
152 |     proxy_store_cmd_cb_t store_cmd;
153 |     proxy_create_db_snapshot_cb_t create_db_snapshot;
154 |     proxy_get_db_size_cb_t get_db_size;
155 |     proxy_apply_db_snapshot_cb_t apply_db_snapshot;
156 |     proxy_update_state_cb_t update_state;
157 |     char config_path[128];  // fixed 128-byte buffer for the configuration path
158 |     void* up_para;          // opaque pointer (presumably passed back as the callbacks' arg -- confirm)
159 | };
160 | typedef struct dare_server_input_t dare_server_input_t;
161 | 
162 | struct dare_loggp_t {
163 |     double o[2],
164 |         o_ninline,
165 |         o_poll,
166 |         o_poll_x,
167 |         L[2],
168 |         G[3];
169 | };
170 | typedef struct dare_loggp_t dare_loggp_t;
171 | 
172 | struct dare_server_data_t {
173 |     dare_server_input_t *input;
174 |     
175 |     server_config_t config; // configuration 
176 |     
177 |     ctrl_data_t *ctrl_data;  // control data (state & private data)
178 |     dare_log_t  *log;       // local log (remotely accessible)
179 |     dare_sm_t   *sm;        // local state machine
180 |     snapshot_t  *prereg_snapshot;
181 |     snapshot_t  *snapshot;
182 |     
183 |     struct rb_root endpoints;   // RB-tree with remote endpoints
184 |     uint64_t last_write_csm_idx;
185 |     uint64_t last_cmt_write_csm_idx;
186 |     
187 |     struct ev_loop *loop;   // loop for EV library
188 | 
189 |     FILE* output_fp;
190 |     dare_loggp_t loggp;
191 |     
192 |     HRT_TIMESTAMP_T t1, t2;
193 | };
194 | typedef struct dare_server_data_t dare_server_data_t;
195 | /* ================================================================== */
196 | 
197 | void *dare_server_init( void *arg );
198 | void dare_server_shutdown();
199 | 
200 | void server_to_follower();
201 | int server_update_sid( uint64_t new_sid, uint64_t old_sid );
202 | int is_leader();
203 | uint8_t get_node_id();
204 | 
205 | #endif /* DARE_SERVER_H */
206 | 


--------------------------------------------------------------------------------
/src/include/dare/dare_sm.h:
--------------------------------------------------------------------------------
 1 | /**                                                                                                      
 2 |  * DARE (Direct Access REplication)
 3 |  * 
 4 |  * State machine abstraction
 5 |  *
 6 |  * Copyright (c) 2014-2015 ETH-Zurich. All rights reserved.
 7 |  * 
 8 |  * Author(s): Marius Poke <marius.poke@inf.ethz.ch>
 9 |  * 
10 |  */
11 |  
12 | #ifndef DARE_SM_H
13 | #define DARE_SM_H
14 | 
15 | #include "./dare_kvs_sm.h"
16 | 
17 | /* SM types */
18 | #define SM_NULL 1
19 | #define SM_KVS  2
20 | #define SM_FS   3
21 | 
22 | /* SM command - can be interpreted only by the SM */
23 | struct sm_cmd_t {
24 |     uint16_t    len;    // presumably the byte length of cmd[] -- confirm
25 |     uint8_t cmd[0];     // opaque command bytes (zero-length array, a GNU extension)
26 | };
27 | typedef struct sm_cmd_t sm_cmd_t;
28 | 
29 | /* SM data - as answer to a command */
30 | struct sm_data_t {
31 |     uint16_t    len;    // presumably the byte length of data[] -- confirm
32 |     uint8_t data[0];    // opaque reply bytes (zero-length array, a GNU extension)
33 | };
34 | typedef struct sm_data_t sm_data_t;
35 | typedef struct dare_sm_t dare_sm_t;
36 | 
37 | /* Destroy the state machine */
38 | typedef void (*destroy_cb_t)(dare_sm_t *sm);
39 | /* Apply a command to the state machine */
40 | typedef int (*apply_cmd_cb_t)(dare_sm_t *sm, sm_cmd_t *cmd, sm_data_t *data);
41 | /* Proxy callback types; each takes an opaque arg as its last parameter (presumably the up_para pointer stored in dare_sm_t -- confirm) */
42 | typedef void (*proxy_store_cmd_cb_t)(void* data,void *arg);
43 | typedef void (*proxy_do_action_cb_t)(uint16_t clt_id,uint8_t type,size_t data_size,void* data,void *arg);
44 | typedef void (*proxy_create_db_snapshot_cb_t)(void *snapshot,void *arg);
45 | typedef uint32_t (*proxy_get_db_size_cb_t)(void *arg);
46 | typedef int (*proxy_apply_db_snapshot_cb_t)(void *snapshot,uint32_t size,void *arg);
47 | typedef void (*proxy_update_state_cb_t)(void *arg);
48 | /* State machine object: a table of callbacks plus an opaque up_para pointer */
49 | struct dare_sm_t {
50 |     destroy_cb_t   destroy;         // destroy the state machine
51 |     apply_cmd_cb_t apply_cmd;       // apply a command to the state machine
52 | 
53 |     proxy_store_cmd_cb_t proxy_store_cmd;
54 |     proxy_do_action_cb_t proxy_do_action;
55 |     proxy_get_db_size_cb_t proxy_get_db_size;
56 |     proxy_create_db_snapshot_cb_t proxy_create_db_snapshot;
57 |     proxy_apply_db_snapshot_cb_t proxy_apply_db_snapshot;
58 |     proxy_update_state_cb_t proxy_update_state;
59 |     void* up_para;
60 | };
61 | 
62 | /* ================================================================== */
63 | 
64 | dare_sm_t* create_kvs_sm( uint32_t size );
65 | 
66 | 
67 | #endif /* DARE_SM_H */
68 | 


--------------------------------------------------------------------------------
/src/include/dare/debug.h:
--------------------------------------------------------------------------------
  1 | /**          
  2 |  * DARE (Direct Access REplication)
  3 |  *                                                                                             
  4 |  * Debugging and logging utilities
  5 |  *
  6 |  * Copyright (c) 2014-2015 ETH-Zurich. All rights reserved.
  7 |  * 
  8 |  * Author(s): Marius Poke <marius.poke@inf.ethz.ch>
  9 |  * 
 10 |  */
 11 | 
 12 | #ifndef DEBUG_H_
 13 | #define DEBUG_H_
 14 | 
 15 | #include <stdio.h>
 16 | #include <errno.h>
 17 | #define __STDC_FORMAT_MACROS
 18 | #include <inttypes.h>
 19 | #include <sys/time.h>
 20 | 
 21 | //extern struct timeval prev_tv;
 22 | //extern uint64_t jump_cnt;
 23 | /* info(): fprintf + fflush, always enabled. info_wtime(): same, prefixed with a "[sec:usec] " gettimeofday() timestamp; the casts make the arguments match the %lu conversions (tv_sec/tv_usec are signed types) */
 24 | #define info(stream, fmt, ...) do {\
 25 |     fprintf(stream, fmt, ##__VA_ARGS__); \
 26 |     fflush(stream); \
 27 | } while(0)
 28 | #define info_wtime(stream, fmt, ...) do {\
 29 |     struct timeval _debug_tv;\
 30 |     gettimeofday(&_debug_tv,NULL);\
 31 | /*    if (prev_tv.tv_sec != 0) { \
 32 |         double __tmp = (_debug_tv.tv_sec - prev_tv.tv_sec) * 1000 + (_debug_tv.tv_usec -  prev_tv.tv_usec)/1000;\
 33 |         if (__tmp > 15) {\
 34 |             jump_cnt++;\
 35 |             fprintf(stream, "Time jump (%lf) ms %"PRIu64"\n", __tmp, jump_cnt);\
 36 |         }\
 37 |     }*/\
 38 |     fprintf(stream, "[%lu:%06lu] " fmt, (unsigned long)_debug_tv.tv_sec, (unsigned long)_debug_tv.tv_usec, ##__VA_ARGS__); \
 39 |     fflush(stream); \
 40 | } while(0)
 41 | /* Debug-only logging: debug(), text() and text_wtime() expand to nothing unless DEBUG is defined. Casts make the timestamp arguments match %lu. NOTE(review): microseconds are not zero-padded here, unlike info_wtime() */
 42 | #ifdef DEBUG
 43 | #define debug(stream, fmt, ...) do {\
 44 |     struct timeval _debug_tv;\
 45 |     gettimeofday(&_debug_tv,NULL);\
 46 |     fprintf(stream, "[DEBUG %lu:%lu] %s/%d/%s() " fmt, (unsigned long)_debug_tv.tv_sec, (unsigned long)_debug_tv.tv_usec, __FILE__, __LINE__, __func__, ##__VA_ARGS__); \
 47 |     fflush(stream); \
 48 | } while(0)
 49 | #define text(stream, fmt, ...) do {\
 50 |     fprintf(stream, fmt, ##__VA_ARGS__); \
 51 |     fflush(stream); \
 52 | } while(0)
 53 | #define text_wtime(stream, fmt, ...) do {\
 54 |     struct timeval _debug_tv;\
 55 |     gettimeofday(&_debug_tv,NULL);\
 56 |     fprintf(stream, "[%lu:%lu] " fmt, (unsigned long)_debug_tv.tv_sec, (unsigned long)_debug_tv.tv_usec, ##__VA_ARGS__); \
 57 |     fflush(stream); \
 58 | } while(0)
 59 | #else
 60 | #define debug(stream, fmt, ...)
 61 | #define text(stream, fmt, ...)
 62 | #define text_wtime(stream, fmt, ...)
 63 | #endif
 64 | 
 65 | // error(): print "[ERROR] file/line/func() " followed by the message, then flush; always enabled (the #ifdef DEBUG guard is commented out)
 66 | #define error(stream, fmt, ...) do { \
 67 |     fprintf(stream, "[ERROR] %s/%d/%s() " fmt, __FILE__, __LINE__, __func__, ##__VA_ARGS__); \
 68 |     fflush(stream); \
 69 | } while(0)
 70 | //#else
 71 | //#define error(stream, fmt, ...)
 72 | //#endif
 73 | 
 74 | // error_return(): like error(), then executes `return (rc)` in the calling function; always enabled (the #ifdef DEBUG guard is commented out)
 75 | #define error_return(rc, stream, fmt, ...) do { \
 76 |     fprintf(stream, "[ERROR] %s/%d/%s() " fmt, __FILE__, __LINE__, __func__, ##__VA_ARGS__); \
 77 |     fflush(stream); \
 78 |     return (rc);  \
 79 | } while(0)
 80 | //#else
 81 | //#define error_return(rc, stream, fmt, ...) return (rc)
 82 | //#endif
 83 | 
 84 | // error_exit(): like error(), then calls exit(rc); always enabled. NOTE(review): this header does not include <stdlib.h>, so users must include it for exit()
 85 | #define error_exit(rc, stream, fmt, ...) do { \
 86 |     fprintf(stream, "[ERROR] %s/%d/%s() " fmt, __FILE__, __LINE__, __func__, ##__VA_ARGS__); \
 87 |     fflush(stream); \
 88 |     exit(rc); \
 89 | } while(0)
 90 | //#else
 91 | //#define error_exit(rc, stream, fmt, ...) exit(rc)
 92 | //#endif
 93 | /* dump_bytes(): print len bytes starting at addr as decimal values, prefixed by header. NOTE(review): guarded by #ifndef DEBUG (active only when DEBUG is NOT defined), the opposite of debug()/text() -- confirm intent */
 94 | #ifndef DEBUG
 95 | #define dump_bytes(stream, addr, len, header) do { \
 96 |     uint32_t _i; \
 97 |     uint8_t *_dump_ptr = (uint8_t*)(addr); \
 98 |     info(stream, "### %s: [" , header); \
 99 |     for (_i = 0; _i < (uint32_t)(len); _i++) { \
100 |         info(stream, "%"PRIu8", ", _dump_ptr[_i]); \
101 |     }   \
102 |     info(stream, "]\n"); \
103 | } while(0)
104 | #else
105 | #define dump_bytes(stream, addr, len, header)
106 | #endif
107 | 
108 | extern FILE *log_fp;
109 | 
110 | #endif /* DEBUG_H_ */
111 | 
112 | 


--------------------------------------------------------------------------------
/src/include/dare/message.h:
--------------------------------------------------------------------------------
 1 | #ifndef MESSAGE_H
 2 | #define MESSAGE_H
 3 | #include <sys/queue.h>
 4 | 
 5 | struct tailq_cmd_t {
 6 |     uint16_t    len;        // NOTE(review): uint16_t tops out at 65535, below the 87380-byte capacity of cmd[]
 7 |     uint8_t cmd[87380];     // fixed-size command buffer
 8 | };
 9 | typedef struct tailq_cmd_t tailq_cmd_t;
10 | 
11 | struct tailq_entry_t {	// one queue element; each element embeds a full tailq_cmd_t (87380-byte buffer)
12 | 	uint8_t type;
13 | 	uint16_t connection_id;
14 | 	uint64_t req_id;
15 | 	tailq_cmd_t cmd;
16 | 	TAILQ_ENTRY(tailq_entry_t) entries;	// <sys/queue.h> linkage fields
17 | };
18 | typedef struct tailq_entry_t tailq_entry_t;
19 | /* NOTE(review): the two objects below are defined (not just declared) in this header, so every translation unit that includes it gets its own tentative definition; the header also relies on <stdint.h> and <pthread.h> having been included first */
20 | TAILQ_HEAD(, tailq_entry_t) tailhead;
21 | 
22 | pthread_spinlock_t tailq_lock;
23 | 
24 | #endif


--------------------------------------------------------------------------------
/src/include/dare/timer.h:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * DARE (Direct Access REplication)
 3 |  * 
 4 |  * Timer implementation
 5 |  *
 6 |  * Copyright (c) 2014-2015 ETH-Zurich. All rights reserved.
 7 |  * 
 8 |  * Copyright (c) 2009 The Trustees of Indiana University and Indiana
 9 |  *                    University Research and Technology
10 |  *                    Corporation.  All rights reserved.
11 |  *
12 |  * Author(s): Torsten Hoefler <htor@cs.indiana.edu>
13 |  */
14 | 
15 | #ifndef TIMER_H_
16 | #define TIMER_H_
17 | 
18 | #include "./debug.h"
19 | 
20 | #define UINT32_T uint32_t
21 | #define UINT64_T uint64_t
22 | /* Calibrate the timer: measure sleep(1) in ticks repeatedly, keep the minimum, stop after 3 consecutive samples that do not lower it, then assign the minimum to freq. The locals are static, so min persists across expansions at the same site. NOTE(review): expansion ends in ';' and requires sleep() to be declared by the includer */
23 | #define HRT_CALIBRATE(freq) do {  \
24 |   static volatile HRT_TIMESTAMP_T t1, t2; \
25 |   static volatile UINT64_T elapsed_ticks, min = (UINT64_T)(~0x1); \
26 |   int notsmaller=0; \
27 |   while(notsmaller<3) { \
28 |     HRT_GET_TIMESTAMP(t1); \
29 |      sleep(1);  \
30 |     /* nanosleep((struct timespec[]){{0, 10000000}}, NULL); */ \
31 |     HRT_GET_TIMESTAMP(t2); \
32 |     HRT_GET_ELAPSED_TICKS(t1, t2, &elapsed_ticks); \
33 |     notsmaller++; \
34 |     if(elapsed_ticks<min) { \
35 |       min = elapsed_ticks; \
36 |       notsmaller = 0; \
37 |     } \
38 |   } \
39 |   freq = min; \
40 | } while(0);
41 | 
42 | #define HRT_INIT(freq) HRT_CALIBRATE(freq)
43 | /* Timestamp type used by the HRT_* macros: a 64-bit tick count split into two 32-bit halves */
44 | #define HRT_TIMESTAMP_T x86_64_timeval_t
45 | /* Read the time-stamp counter (rdtsc): EAX goes to t1.l, EDX to t1.h. Expansion already ends in ';' */
46 | #define HRT_GET_TIMESTAMP(t1)  __asm__ __volatile__ ("rdtsc" : "=a" ((t1).l), "=d" ((t1).h));
47 | /* Store the tick difference t2 - t1 through numptr. Expansion already ends in ';' */
48 | #define HRT_GET_ELAPSED_TICKS(t1, t2, numptr)   *(numptr) = (((( UINT64_T ) (t2).h) << 32) | (t2).l) - \
49 |                                                           (((( UINT64_T ) (t1).h) << 32) | (t1).l);
50 | /* Combine the two halves of t1 into a 64-bit tick count and assign it to time */
51 | #define HRT_GET_TIME(t1, time) (time) = (((( UINT64_T ) (t1).h) << 32) | (t1).l)
52 | /* Timestamp filled by HRT_GET_TIMESTAMP */
53 | typedef struct {
54 |     UINT32_T l;     /* low 32 bits of the tick count */
55 |     UINT32_T h;     /* high 32 bits of the tick count */
56 | } x86_64_timeval_t;
57 | 
58 | /* global timer frequency in Hz */
59 | extern unsigned long long g_timerfreq;
60 | /* Convert a tick count to microseconds using g_timerfreq (ticks per second); fully parenthesized so it is safe inside larger expressions */
61 | #define HRT_GET_USEC(ticks) (1e6 * (double)(ticks) / (double)g_timerfreq)
62 | /* Busy-wait for d microseconds by polling the timestamp counter until g_timerfreq/1e6*d ticks have elapsed. NOTE(review): the local named `time` hides time() inside the expansion, and the expansion ends in ';' */
63 | #define usecs_wait(d) do {   \
64 |   HRT_TIMESTAMP_T ts;   \
65 |   unsigned long long targettime, time;  \
66 |   HRT_GET_TIMESTAMP(ts);    \
67 |   HRT_GET_TIME(ts,targettime);  \
68 |   targettime += g_timerfreq/1e6*(d);    \
69 |   do {  \
70 |     HRT_GET_TIMESTAMP(ts);  \
71 |     HRT_GET_TIME(ts,time);  \
72 |   } while (time < targettime);  \
73 | } while(0);
74 | /* Debug-only timing helpers (empty unless DEBUG is defined): TIMER_INIT declares t1, t2, ticks and usecs in the current scope; TIMER_START logs a message and takes t1; TIMER_STOP takes t2 and prints the elapsed microseconds. These expand to several statements, so do not use them as the body of an unbraced if/else */
75 | #ifdef DEBUG
76 | #define TIMER_INIT HRT_TIMESTAMP_T t1, t2;  \
77 |     uint64_t ticks; \
78 |     double usecs; 
79 | #define TIMER_START(stream, fmt, ...) info_wtime(stream, fmt, ##__VA_ARGS__); \
80 |     HRT_GET_TIMESTAMP(t1);
81 | #define TIMER_STOP(stream) HRT_GET_TIMESTAMP(t2);   \
82 |     HRT_GET_ELAPSED_TICKS(t1, t2, &ticks);  \
83 |     usecs = HRT_GET_USEC(ticks);    \
84 |     info(stream, "done (%lf usecs)\n", usecs);
85 | #define TIMER_INFO(stream, fmt, ...) info(stream, fmt, ##__VA_ARGS__);
86 | #else
87 | #define TIMER_INIT
88 | #define TIMER_START(stream, fmt, ...)
89 | #define TIMER_STOP(stream)
90 | #define TIMER_INFO(stream, fmt, ...)
91 | #endif
92 | 
93 | 
94 | #endif /* TIMER_H_ */
95 | 
96 | 


--------------------------------------------------------------------------------
/src/include/db/db-interface.h:
--------------------------------------------------------------------------------
 1 | #ifndef DB_INTERFACE_H
 2 | #define DB_INTERFACE_H
 3 | #include <stdint.h>
 4 | #include <sys/types.h>
 5 | 
      /* Opaque record-store handle; struct db_t is left incomplete in this header. */
 6 | typedef struct db_t db;
 7 | 
      /* Returns a handle for the store called db_name. proxy.c passes flag = 0;
       * NOTE(review): the meaning of flag and the failure return value are not
       * visible from this header - confirm against db-interface.c. */
 8 | db* initialize_db(const char* db_name,uint32_t flag);
 9 | 
      /* Releases a handle. NOTE(review): meaning of the uint32_t argument is not
       * visible here - confirm. */
10 | void close_db(db*,uint32_t);
11 | 
      /* Stores one record: (handle, record size in bytes, pointer to the record),
       * as used by the stablestorage_* helpers in proxy.c. */
12 | int store_record(db*,size_t,void*);
13 | 
      /* NOTE(review): the remark below is detached from any declaration;
       * presumably it refers to the buffer handed to dump_records - confirm. */
14 | // the caller is responsible to release the memory
15 | 
      /* Copies the stored records into the caller-provided buffer. */
16 | void dump_records(db*,void*);
      /* NOTE(review): declared with an empty parameter list although proxy.c
       * calls it with a db* argument - check the definition before adding a
       * prototype. */
17 | uint32_t get_records_len();
18 | #endif
19 | 


--------------------------------------------------------------------------------
/src/include/proxy/proxy.h:
--------------------------------------------------------------------------------
 1 | #ifndef PROXY_H 
 2 | #define PROXY_H
 3 | 
 4 | #include "../util/common-header.h"
 5 | #include "../rsm-interface.h"
 6 | #include "../../../utils/uthash/uthash.h"
 7 | #include "../db/db-interface.h"
 8 | #include <sys/queue.h>
 9 | 
     /* Action codes carried in proxy_msg_header.action and passed as the
      * request type inside proxy.c. */
10 | #define CONNECT 4
11 | #define SEND    5
12 | #define CLOSE   6
13 | 
     /* hk_t: 16-bit connection key built by gen_key() in proxy.c
      * (node id in the high byte, per-node counter in the low byte).
      * nc_t: type of that counter; nid_t: type of the node id. */
14 | typedef uint16_t hk_t;
15 | typedef uint8_t nc_t;
16 | typedef uint8_t nid_t;
17 | 
     /* One entry per internal thread; proxy.c's is_inner() walks this list. */
18 | struct list_entry_t {
19 |     pthread_t tid;
20 |     LIST_ENTRY(list_entry_t) entries;
21 | };
22 | typedef struct list_entry_t list_entry_t;
23 | 
     /* NOTE(review): this line defines (not just declares) `listhead` in every
      * translation unit that includes this header; with -fno-common that is a
      * duplicate-definition link error. Consider `extern` here plus a single
      * definition in proxy.c. */
24 | LIST_HEAD(, list_entry_t) listhead;
25 | 
     /* Address that do_action_connect() in proxy.c passes to connect(). */
26 | typedef struct proxy_address_t{
27 |     struct sockaddr_in s_addr;
28 |     size_t s_sock_len;
29 | }proxy_address;
30 | 
     /* Hash-map entry describing one proxied connection.
      * In leader_hash_map the key is clt_id (the fd reported by the hooks);
      * in follower_hash_map the key is connection_id. */
31 | typedef struct socket_pair_t{
32 |     int clt_id;
     /* per-connection request counter, incremented for every submitted request */
33 |     uint64_t req_id;
34 |     uint16_t connection_id;
     /* socket opened by do_action_connect() towards sys_addr */
35 |     int p_s;
36 |     
37 |     UT_hash_handle hh;
38 | }socket_pair;
39 | 
     /* Per-process proxy state created by proxy_init(). */
40 | typedef struct proxy_node_t{
41 | 	proxy_address sys_addr;
42 | 
43 |     socket_pair* leader_hash_map;
44 |     socket_pair* follower_hash_map;
     /* highest_rec is incremented by update_highest_rec(); cur_rec is
      * incremented for every request submitted by the leader, which then waits
      * until highest_rec has caught up. */
45 |     uint64_t highest_rec;
46 |     uint64_t cur_rec;
     /* counter used as the low byte of generated connection ids (8 bits, so it
      * wraps after 256 connections) */
47 |     nc_t pair_count;
48 | 	
49 |     // log option
50 |     int req_log;
51 | 
52 | 	FILE* req_log_file;
53 | 	char* db_name;
54 | 	db* db_ptr;
55 | }proxy_node;
56 | 
/* Header shared by every proxy message: target connection plus action code. */
typedef struct proxy_msg_header_t{
    uint16_t connection_id;
    uint8_t action;
}proxy_msg_header;
#define PROXY_MSG_HEADER_SIZE (sizeof(proxy_msg_header))

/* CONNECT message: header only. */
typedef struct proxy_connect_msg_t{
    proxy_msg_header header;
}proxy_connect_msg;
#define PROXY_CONNECT_MSG_SIZE (sizeof(proxy_connect_msg))

/* NOTE(review): presumably mirrors the layout of DARE's configuration id so
 * that the union below has the right size - confirm against the dare headers. */
struct fake_dare_cid_t {
    uint64_t epoch;
    uint8_t size[2];
    uint8_t state;
    uint8_t pad[1];
    uint32_t bitmask;
};
typedef struct fake_dare_cid_t fake_dare_cid_t;

/*
 * Length-prefixed command. cmd is a zero-length trailing array (GNU
 * extension); it is kept in that form because the struct is embedded in the
 * union below, where a C99 flexible array member is not permitted.
 */
struct fake_sm_cmd_t {
    uint16_t    len;
    uint8_t cmd[0];
};
typedef struct fake_sm_cmd_t fake_sm_cmd_t;

/* SEND message: header followed by the command (payload bytes follow the struct). */
typedef struct proxy_send_msg_t{
    proxy_msg_header header;
    union {
        fake_sm_cmd_t   cmd;
        fake_dare_cid_t cid;
        uint64_t head;
    } data;
}proxy_send_msg;
/* Total size in bytes of the SEND message M points to. The argument is
 * parenthesized so pointer expressions such as `p + 1` expand correctly. */
#define PROXY_SEND_MSG_SIZE(M) ((M)->data.cmd.len+sizeof(proxy_send_msg))

/* CLOSE message: header only. */
typedef struct proxy_close_msg_t{
    proxy_msg_header header;
}proxy_close_msg;
#define PROXY_CLOSE_MSG_SIZE (sizeof(proxy_close_msg))
97 | 
98 | #endif


--------------------------------------------------------------------------------
/src/include/rsm-interface.h:
--------------------------------------------------------------------------------
 1 | #ifndef RSM_INTERFACE_H
 2 | #define RSM_INTERFACE_H
 3 | #include <unistd.h>
 4 | #include <stdint.h>
 5 | 
     /* Opaque to users of this interface; the full definition lives in proxy/proxy.h. */
 6 | struct proxy_node_t;
 7 | 
 8 | #ifdef __cplusplus
 9 | extern "C" {
10 | #endif
11 | 
     /* Creates the proxy from the configuration at config_path and returns it,
      * or NULL on failure. A NULL proxy_log_path makes proxy.c use "." as the
      * log directory. */
12 | 	struct proxy_node_t* proxy_init(const char* config_path, const char* proxy_log_path);
     /* Called by the read() hook after a successful socket read:
      * buf/ret are the data and byte count just read from fd. */
13 | 	void proxy_on_read(struct proxy_node_t* proxy, void* buf, ssize_t ret, int fd);
     /* Called by the accept()/accept4() hooks; ret is the accepted descriptor. */
14 | 	void proxy_on_accept(struct proxy_node_t* proxy, int ret);
     /* Called by the close() hook before a socket descriptor is closed. */
15 | 	void proxy_on_close(struct proxy_node_t* proxy, int fildes);
16 | 	
17 | #ifdef __cplusplus
18 | }
19 | #endif
20 | 
21 | #endif
22 | 


--------------------------------------------------------------------------------
/src/include/util/common-header.h:
--------------------------------------------------------------------------------
 1 | #ifndef COMMON_HEADER_H
 2 | #define COMMON_HEADER_H
 3 | #include <stdio.h>
 4 | #include <stdlib.h>
 5 | #include <signal.h>
 6 | #include <assert.h>
 7 | #include <getopt.h>
 8 | #include <unistd.h>
 9 | #include <pthread.h>
10 | #include <error.h>
11 | #include <errno.h>
12 | #include <ctype.h>
13 | #include <string.h>
14 | #include <pthread.h>
15 | #include <sys/types.h>
16 | #include <sys/wait.h>
17 | #include <sys/socket.h>
18 | #include <sys/time.h>
19 | #include <arpa/inet.h>
20 | #include <netinet/in.h>
21 | #include "debug.h"
22 | 
23 | #endif
24 | 
25 | #ifndef _POSIX_SOURCE
26 | #define _POSIX_SOURCE
27 | #endif
28 | 


--------------------------------------------------------------------------------
/src/include/util/debug.h:
--------------------------------------------------------------------------------
 1 | 
 2 | #ifndef DEBUG_H
 3 | #define DEBUG_H
 4 | 
 5 | #define debug_log(args...) do { \
 6 |     struct timeval tv; \
 7 |     gettimeofday(&tv,0); \
 8 |     fprintf(stderr,"%lu.%06lu:",tv.tv_sec,tv.tv_usec); \
 9 |     fprintf(stderr,args); \
10 | }while(0);
11 | 
12 | 
13 | #define err_log(args...) do { \
14 |     struct timeval tv; \
15 |     gettimeofday(&tv,0); \
16 |     fprintf(stderr,"%lu.%06lu:",tv.tv_sec,tv.tv_usec); \
17 |     fprintf(stderr,args); \
18 | }while(0);
19 | 
20 | #define rec_log(out,args...) do { \
21 |     struct timeval tv; \
22 |     gettimeofday(&tv,0); \
23 |     fprintf((out),"%lu.%06lu:",tv.tv_sec,tv.tv_usec); \
24 |     fprintf((out),args); \
25 |     fflush(out); \
26 | }while(0);
27 | 
28 | #define safe_rec_log(x,args...) {if(NULL!=(x)){rec_log((x),args);}}
29 | 
30 | #define SYS_LOG(x,args...) {if((x)->sys_log){safe_rec_log(((x)->sys_log_file),args)}}
31 | 
32 | #endif
33 | 


--------------------------------------------------------------------------------
/src/proxy/proxy.c:
--------------------------------------------------------------------------------
  1 | #include "../include/proxy/proxy.h"
  2 | #include "../include/config-comp/config-proxy.h"
  3 | #include <fcntl.h>
  4 | #include <netinet/tcp.h>
  5 | #include "../include/dare/dare_server.h"
  6 | #include "../include/dare/message.h"
  7 | #define __STDC_FORMAT_MACROS
  8 | 
      /* Callbacks installed into the DARE server input by dare_main(); in each
       * of them `arg` is the proxy_node passed as up_para. */
  9 | static void stablestorage_save_request(void* data,void*arg);
 10 | static void stablestorage_dump_records(void*buf,void*arg);
 11 | static uint32_t stablestorage_get_records_len(void*arg);
 12 | static int stablestorage_load_records(void*buf,uint32_t size,void*arg);
 13 | static void update_highest_rec(void*arg);
      /* Replay of a replicated action against the real server. */
 14 | static void do_action_to_server(uint16_t clt_id,uint8_t type,size_t data_size,void* data,void *arg);
 15 | static void do_action_send(uint16_t clt_id,size_t data_size,void* data,void* arg);
 16 | static void do_action_connect(uint16_t clt_id,void* arg);
 17 | static void do_action_close(uint16_t clt_id,void* arg);
 18 | static int set_socket_blocking(int fd, int blocking);
 19 | 
      /* NOTE(review): nothing in this file assigns log_fp, yet dare_main()
       * writes to it on its error path - presumably it is set by the dare
       * component; confirm. */
 20 | FILE *log_fp;
 21 | 
 22 | int dare_main(proxy_node* proxy, const char* config_path)
 23 | {
 24 |     int rc; 
 25 |     dare_server_input_t *input = (dare_server_input_t*)malloc(sizeof(dare_server_input_t));
 26 |     memset(input, 0, sizeof(dare_server_input_t));
 27 |     input->log = stdout;
 28 |     input->name = "";
 29 |     input->output = "dare_servers.out";
 30 |     input->srv_type = SRV_TYPE_START;
 31 |     input->sm_type = CLT_KVS;
 32 |     input->server_idx = 0xFF;
 33 |     char *server_idx = getenv("server_idx");
 34 |     if (server_idx != NULL)
 35 |         input->server_idx = (uint8_t)atoi(server_idx);
 36 |     input->group_size = 3;
 37 |     char *group_size = getenv("group_size");
 38 |     if (group_size != NULL)
 39 |         input->group_size = (uint8_t)atoi(group_size);
 40 | 
 41 |     input->do_action = do_action_to_server;
 42 |     input->store_cmd = stablestorage_save_request;
 43 |     input->get_db_size = stablestorage_get_records_len;
 44 |     input->create_db_snapshot = stablestorage_dump_records;
 45 |     input->apply_db_snapshot = stablestorage_load_records;
 46 |     input->update_state = update_highest_rec;
 47 |     memcpy(input->config_path, config_path, strlen(config_path));
 48 |     input->up_para = proxy;
 49 |     static int srv_type = SRV_TYPE_START;
 50 | 
 51 |     const char *server_type = getenv("server_type");
 52 |     if (server_type != NULL) {
 53 |         if (strcmp(server_type, "join") == 0) {
 54 |             srv_type = SRV_TYPE_JOIN;
 55 |         }
 56 |     }
 57 |     char *dare_log_file = getenv("dare_log_file");
 58 |     if (dare_log_file == NULL)
 59 |         dare_log_file = "";
 60 | 
 61 |     input->srv_type = srv_type;
 62 | 
 63 |     if (strcmp(dare_log_file, "") != 0) {
 64 |         input->log = fopen(dare_log_file, "w+");
 65 |         if (input->log==NULL) {
 66 |             printf("Cannot open log file\n");
 67 |             exit(1);
 68 |         }
 69 |     }
 70 |     if (SRV_TYPE_START == input->srv_type) {
 71 |         if (0xFF == input->server_idx) {
 72 |             printf("A server cannot start without an index\n");
 73 |             exit(1);
 74 |         }
 75 |     }
 76 |     pthread_t dare_thread;
 77 |     rc = pthread_create(&dare_thread, NULL, &dare_server_init, input);
 78 |     if (0 != rc) {
 79 |         fprintf(log_fp, "Cannot init dare_thread\n");
 80 |         return 1;
 81 |     }
 82 | 
 83 |     list_entry_t *n1 = malloc(sizeof(list_entry_t));
 84 |     n1->tid = dare_thread;
 85 |     LIST_INSERT_HEAD(&listhead, n1, entries);
 86 |     //fclose(log_fp);
 87 |     
 88 |     return 0;
 89 | }
 90 | 
 91 | static int is_inner(pthread_t tid)
 92 | {
 93 |     list_entry_t *np;
 94 |     LIST_FOREACH(np, &listhead, entries) {
 95 |         if (np->tid == tid)
 96 |             return 1;
 97 |     }
 98 |     return 0;
 99 | }
100 | 
101 | static hk_t gen_key(nid_t node_id,nc_t node_count){
102 |     hk_t key = 0;
103 |     key |= ((hk_t)node_id<<8);
104 |     key |= (hk_t)node_count;
105 |     return key;
106 | }
107 | 
108 | static void leader_handle_submit_req(uint8_t type, ssize_t data_size, void* buf, int clt_id, proxy_node* proxy)
109 | {
110 |     socket_pair* pair = NULL;
111 |     uint64_t req_id;
112 |     uint16_t connection_id;
113 | 
114 |     pthread_spin_lock(&tailq_lock);
115 |     uint64_t cur_rec = ++proxy->cur_rec;
116 |     switch(type) {
117 |         case CONNECT:
118 |             pair = (socket_pair*)malloc(sizeof(socket_pair));
119 |             memset(pair,0,sizeof(socket_pair));
120 |             pair->clt_id = clt_id;
121 |             pair->req_id = 0;
122 |             nid_t node_id = get_node_id();
123 |             pair->connection_id = gen_key(node_id, proxy->pair_count++);
124 |             
125 |             req_id = ++pair->req_id;
126 |             connection_id = pair->connection_id;
127 |             
128 |             HASH_ADD_INT(proxy->leader_hash_map, clt_id, pair);
129 |             break;
130 |         case SEND:
131 |             HASH_FIND_INT(proxy->leader_hash_map, &clt_id, pair);
132 |             
133 |             req_id = ++pair->req_id;
134 |             connection_id = pair->connection_id;
135 |             
136 |             socket_pair* replaced_pair = NULL;
137 |             HASH_REPLACE_INT(proxy->leader_hash_map, clt_id, pair, replaced_pair);
138 |             break;
139 |         case CLOSE:
140 |             HASH_FIND_INT(proxy->leader_hash_map, &clt_id, pair);
141 |             
142 |             req_id = ++pair->req_id;
143 |             connection_id = pair->connection_id;
144 |             
145 |             HASH_DEL(proxy->leader_hash_map, pair);
146 |             break;
147 |     }
148 | 
149 |     tailq_entry_t* n2 = (tailq_entry_t*)malloc(sizeof(tailq_entry_t));
150 |     n2->req_id = req_id;
151 |     n2->connection_id = connection_id;
152 |     n2->type = type;
153 |     n2->cmd.len = data_size;
154 |     if (data_size)
155 |         memcpy(n2->cmd.cmd, buf, data_size);
156 |     TAILQ_INSERT_TAIL(&tailhead, n2, entries);
157 | 
158 |     pthread_spin_unlock(&tailq_lock);
159 | 
160 |     while (cur_rec > proxy->highest_rec);
161 | }
162 | 
/*
 * Debug helper: prints the receive and send buffer sizes of sockfd.
 *
 * The receive buffer size bounds the TCP advertised window, i.e. how much
 * data the peer may send before the application reads it.
 *
 * SO_RCVBUF and SO_SNDBUF are `int` options, so the value is read into an int
 * (the previous size_t was only partially written and was printed with %d).
 * A size is printed only when the corresponding getsockopt call succeeded.
 */
static void get_socket_buffer_size(int sockfd)
{
    int size = 0;
    socklen_t optlen = sizeof(size);

    if (getsockopt(sockfd, SOL_SOCKET, SO_RCVBUF, &size, &optlen) < 0) {
        perror(": getsockopt");
    } else {
        printf("receive buffer size = %d\n", size);
    }

    /* optlen is a value-result argument: reset it before the second call. */
    optlen = sizeof(size);
    if (getsockopt(sockfd, SOL_SOCKET, SO_SNDBUF, &size, &optlen) < 0) {
        perror(": getsockopt");
    } else {
        printf("send buffer size = %d\n", size);
    }
}
192 | 
/*
 * Switches fd to blocking (blocking != 0) or non-blocking (blocking == 0)
 * mode by updating O_NONBLOCK in its file status flags.
 *
 * Returns 0 on success and -1 when either fcntl call fails. Previously a
 * failed F_GETFL left flags at -1, which was then written back with F_SETFL,
 * and the function reported success regardless.
 */
static int set_socket_blocking(int fd, int blocking) {
    int flags = fcntl(fd, F_GETFL);

    if (flags == -1) {
        fprintf(stderr, "fcntl(F_GETFL): %s", strerror(errno));
        return -1;
    }

    if (blocking)
        flags &= ~O_NONBLOCK;
    else
        flags |= O_NONBLOCK;

    if (fcntl(fd, F_SETFL, flags) == -1) {
        fprintf(stderr, "fcntl(F_SETFL,O_NONBLOCK): %s", strerror(errno));
        return -1;
    }
    return 0;
}
210 | 
/*
 * Sets the receive timeout (SO_RCVTIMEO) of fd.
 *
 * With a receive timeout, a blocking input call returns the amount of data
 * transferred so far once the period elapses, or -1 with errno set to
 * EAGAIN/EWOULDBLOCK when nothing was transferred. A zero timeout (the
 * default) means the call never times out.
 *
 * Returns 0 on success and -1 when setsockopt fails (it used to report
 * success unconditionally).
 */
static int set_socket_timeout(int fd, struct timeval *timeout) {
    if (setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, timeout, sizeof(struct timeval)) < 0) {
        perror("set_socket_timeout");
        return -1;
    }
    return 0;
}
229 | 
230 | void proxy_on_read(proxy_node* proxy, void* buf, ssize_t bytes_read, int fd)
231 | {
232 | 	if (is_inner(pthread_self()))
233 | 		return;
234 | 
235 | 	if (is_leader())
236 |         leader_handle_submit_req(SEND, bytes_read, buf, fd, proxy);
237 | 
238 | 	return;
239 | }
240 | 
241 | void proxy_on_accept(proxy_node* proxy, int fd)
242 | {
243 | 	if (is_inner(pthread_self()))
244 | 		return;
245 | 
246 | 	if (is_leader())
247 |         leader_handle_submit_req(CONNECT, 0, NULL, fd, proxy);
248 | 
249 | 	return;	
250 | }
251 | 
252 | void proxy_on_close(proxy_node* proxy, int fd)
253 | {
254 | 	if (is_inner(pthread_self()))
255 | 		return;
256 | 
257 | 	if (is_leader())
258 |         leader_handle_submit_req(CLOSE, 0, NULL, fd, proxy);
259 | 
260 | 	return;
261 | }
262 | 
263 | static void update_highest_rec(void*arg)
264 | {
265 |     proxy_node* proxy = arg;
266 |     proxy->highest_rec++;   
267 | }
268 | 
269 | static void stablestorage_save_request(void* data,void*arg)
270 | {
271 |     proxy_node* proxy = arg;
272 |     proxy_msg_header* header = (proxy_msg_header*)data;
273 |     switch(header->action){
274 |         case CONNECT:
275 |         {
276 |             store_record(proxy->db_ptr,PROXY_CONNECT_MSG_SIZE,data);
277 |             break;
278 |         }
279 |         case SEND:
280 |         {
281 |             proxy_send_msg* send_msg = (proxy_send_msg*)data;
282 |             store_record(proxy->db_ptr,PROXY_SEND_MSG_SIZE(send_msg),data);
283 |             break;
284 |         }
285 |         case CLOSE:
286 |         {
287 |             store_record(proxy->db_ptr,PROXY_CLOSE_MSG_SIZE,data);
288 |             break;
289 |         }
290 |     }
291 | }
292 | 
293 | static uint32_t stablestorage_get_records_len(void*arg)
294 | {
295 |     proxy_node* proxy = arg;
296 |     uint32_t records_len = get_records_len(proxy->db_ptr);
297 |     return records_len;
298 | }
299 | 
300 | static void stablestorage_dump_records(void*buf,void*arg)
301 | {
302 |     proxy_node* proxy = arg;
303 |     dump_records(proxy->db_ptr,buf);
304 | }
305 | 
306 | static int stablestorage_load_records(void*buf,uint32_t size,void*arg)
307 | {
308 |     proxy_node* proxy = arg;
309 |     proxy_msg_header* header;
310 |     uint32_t len = 0;
311 |     while(len < size) {
312 |         header = (proxy_msg_header*)((char*)buf + len);
313 |         switch(header->action){
314 |             case SEND:
315 |             {
316 |                 proxy_send_msg* send_msg = (proxy_send_msg*)header;
317 |                 len += PROXY_SEND_MSG_SIZE(send_msg);
318 |                 store_record(proxy->db_ptr,PROXY_SEND_MSG_SIZE(send_msg),header);
319 |                 do_action_send(header->connection_id, send_msg->data.cmd.len, send_msg->data.cmd.cmd, arg);
320 |                 break;
321 |             }
322 |             case CONNECT:
323 |             {
324 |                 len += PROXY_CONNECT_MSG_SIZE;
325 |                 store_record(proxy->db_ptr,PROXY_CONNECT_MSG_SIZE,header);
326 |                 do_action_connect(header->connection_id, arg);
327 |                 break;
328 |             }
329 |             case CLOSE:
330 |             {
331 |                 len += PROXY_CLOSE_MSG_SIZE;
332 |                 store_record(proxy->db_ptr,PROXY_CLOSE_MSG_SIZE,header);
333 |                 do_action_close(header->connection_id, arg);
334 |                 break;
335 |             }
336 |         }
337 |     }
338 |     return 0;
339 | }
340 | 
341 | static void do_action_to_server(uint16_t clt_id,uint8_t type,size_t data_size,void* data,void*arg)
342 | {
343 |     proxy_node* proxy = arg;
344 |     FILE* output = NULL;
345 |     if(proxy->req_log){
346 |         output = proxy->req_log_file;
347 |     }
348 |     switch(type){
349 |         case CONNECT:
350 |         	if(output!=NULL){
351 |         		fprintf(output,"Operation: Connects.\n");
352 |             }
353 |             do_action_connect(clt_id,arg);
354 |             break;
355 |         case SEND:
356 |         	if(output!=NULL){
357 |         		fprintf(output,"Operation: Sends data.\n");
358 |             }
359 |             do_action_send(clt_id,data_size,data,arg);
360 |             break;
361 |         case CLOSE:
362 |         	if(output!=NULL){
363 |         		fprintf(output,"Operation: Closes.\n");
364 |             }
365 |             do_action_close(clt_id,arg);
366 |             break;
367 |         default:
368 |             break;
369 |     }
370 |     return;
371 | }
372 | 
373 | static void do_action_connect(uint16_t clt_id,void* arg)
374 | {
375 |     proxy_node* proxy = arg;
376 | 
377 |     socket_pair* ret;
378 |     HASH_FIND(hh, proxy->follower_hash_map, &clt_id, sizeof(uint16_t), ret);
379 |     if (NULL == ret)
380 |     {
381 |         ret = malloc(sizeof(socket_pair));
382 |         memset(ret,0,sizeof(socket_pair));
383 | 
384 |         ret->connection_id = clt_id;
385 |         int sockfd = socket(AF_INET, SOCK_STREAM, 0);
386 |         if (sockfd < 0)
387 |         {
388 |             fprintf(stderr, "ERROR opening socket!\n");
389 |             goto do_action_connect_exit;
390 |         }
391 |         ret->p_s = sockfd;
392 |         HASH_ADD(hh, proxy->follower_hash_map, connection_id, sizeof(uint16_t), ret);
393 | 
394 |         if (connect(ret->p_s, (struct sockaddr*)&proxy->sys_addr.s_addr, proxy->sys_addr.s_sock_len) < 0)
395 |             fprintf(stderr, "ERROR connecting!\n");
396 | 
397 |         set_socket_blocking(ret->p_s, 0);
398 | 
399 |         int enable = 1;
400 |         if(setsockopt(ret->p_s, IPPROTO_TCP, TCP_NODELAY, (void*)&enable, sizeof(enable)) < 0)
401 |             fprintf(stderr, "TCP_NODELAY SETTING ERROR!\n");
402 |     }
403 | 
404 | do_action_connect_exit:
405 | 	return;
406 | }
407 | 
408 | static void do_action_send(uint16_t clt_id,size_t data_size,void* data,void* arg)
409 | {
410 | 	proxy_node* proxy = arg;
411 | 	socket_pair* ret;
412 | 	HASH_FIND(hh, proxy->follower_hash_map, &clt_id, sizeof(uint16_t), ret);
413 | 
414 | 	if(NULL==ret){
415 | 		goto do_action_send_exit;
416 | 	}else{
417 | 		int n = write(ret->p_s, data, data_size);
418 | 		if (n < 0)
419 | 			fprintf(stderr, "ERROR writing to socket!\n");
420 | 	}
421 | do_action_send_exit:
422 | 	return;
423 | }
424 | 
425 | static void do_action_close(uint16_t clt_id,void* arg)
426 | {
427 | 	proxy_node* proxy = arg;
428 | 	socket_pair* ret;
429 | 	HASH_FIND(hh, proxy->follower_hash_map, &clt_id, sizeof(uint16_t), ret);
430 | 	if(NULL==ret){
431 | 		goto do_action_close_exit;
432 | 	}else{
433 | 		if (close(ret->p_s))
434 | 			fprintf(stderr, "ERROR closing socket!\n");
435 | 		HASH_DEL(proxy->follower_hash_map, ret);
436 | 	}
437 | do_action_close_exit:
438 | 	return;
439 | }
440 | 
441 | proxy_node* proxy_init(const char* config_path,const char* proxy_log_path)
442 | {
443 |     proxy_node* proxy = (proxy_node*)malloc(sizeof(proxy_node));
444 | 
445 |     if(NULL==proxy){
446 |         err_log("PROXY : Cannot Malloc Memory For The Proxy.\n");
447 |         goto proxy_exit_error;
448 |     }
449 | 
450 |     memset(proxy,0,sizeof(proxy_node));
451 |     
452 |     if(proxy_read_config(proxy,config_path)){
453 |         err_log("PROXY : Configuration File Reading Error.\n");
454 |         goto proxy_exit_error;
455 |     }
456 | 
457 |     int build_log_ret = 0;
458 |     if(proxy_log_path==NULL){
459 |         proxy_log_path = ".";
460 |     }else{
461 |         if((build_log_ret=mkdir(proxy_log_path,S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH))!=0){
462 |             if(errno!=EEXIST){
463 |                 err_log("PROXY : Log Directory Creation Failed,No Log Will Be Recorded.\n");
464 |             }else{
465 |                 build_log_ret = 0;
466 |             }
467 |         }
468 |     }
469 | 
470 |     if(!build_log_ret){
471 |         //if(proxy->req_log){
472 |             char* req_log_path = (char*)malloc(sizeof(char)*strlen(proxy_log_path)+50);
473 |             memset(req_log_path,0,sizeof(char)*strlen(proxy_log_path)+50);
474 |             if(NULL!=req_log_path){
475 |                 sprintf(req_log_path,"%s/node-proxy-req.log",proxy_log_path);
476 |                 //err_log("%s.\n",req_log_path);
477 |                 proxy->req_log_file = fopen(req_log_path,"w");
478 |                 free(req_log_path);
479 |             }
480 |             if(NULL==proxy->req_log_file && proxy->req_log){
481 |                 err_log("PROXY : Client Request Log File Cannot Be Created.\n");
482 |             }
483 |         //}
484 |     }
485 | 
486 |     TAILQ_INIT(&tailhead);
487 |     LIST_INIT(&listhead);
488 | 
489 |     proxy->db_ptr = initialize_db(proxy->db_name,0);
490 | 
491 |     proxy->follower_hash_map = NULL;
492 |     proxy->leader_hash_map = NULL;
493 | 
494 |     if(pthread_spin_init(&tailq_lock, PTHREAD_PROCESS_PRIVATE)){
495 |         err_log("PROXY: Cannot init the lock\n");
496 |     }
497 | 
498 |     dare_main(proxy, config_path);
499 | 
500 |     return proxy;
501 | 
502 | proxy_exit_error:
503 |     if(NULL!=proxy){
504 |         free(proxy);
505 |     }
506 |     return NULL;
507 | 
508 | }
509 | 


--------------------------------------------------------------------------------
/src/spec_hooks.cpp:
--------------------------------------------------------------------------------
  1 | #include <string>
  2 | #include <stdio.h>
  3 | #include <dlfcn.h>
  4 | #include <stdlib.h>
  5 | #include <pthread.h>
  6 | #include <sys/stat.h>
  7 | #include "include/rsm-interface.h"
  8 | 
  9 | #define dprintf(fmt...)
 10 | 
 11 | struct proxy_node_t* proxy = NULL;
 12 | 
 13 | typedef int (*main_type)(int, char**, char**);
 14 | 
 15 | struct arg_type
 16 | {
 17 | 	char **argv;
 18 | 	int (*main_func) (int, char **, char **);
 19 | };
 20 | 
 21 | main_type saved_init_func = NULL;
 22 | void tern_init_func(int argc, char **argv, char **env)
 23 | {
 24 | 	dprintf("%04d: __tern_init_func() called.\n", (int) pthread_self());
 25 | 	if(saved_init_func)
 26 | 		saved_init_func(argc, argv, env);
 27 | 
 28 | 	printf("tern_init_func is called\n");
 29 | 
 30 | 	char* config_path = getenv("config_path");
 31 | 
 32 | 	char* proxy_log_dir = NULL;
 33 | 	proxy = proxy_init(config_path, proxy_log_dir);
 34 | }
 35 | 
 36 | typedef void (*fini_type)(void*);
 37 | fini_type saved_fini_func = NULL;
 38 | 
 39 | extern "C" int my_main(int argc, char **pt, char **aa)
 40 | {
 41 | 	int ret;
 42 | 	arg_type *args = (arg_type*)pt;
 43 | 	dprintf("%04d: __libc_start_main() called.\n", (int) pthread_self());
 44 | 	ret = args->main_func(argc, args->argv, aa);
 45 | 	return ret;
 46 | }
 47 | 
 48 | extern "C" int __libc_start_main(
 49 | 	void *func_ptr,
 50 | 	int argc,
 51 | 	char* argv[],
 52 | 	void (*init_func)(void),
 53 | 	void (*fini_func)(void),
 54 | 	void (*rtld_fini_func)(void),
 55 | 	void *stack_end)
 56 | {
 57 | 	typedef void (*fnptr_type)(void);
 58 | 	typedef int (*orig_func_type)(void *, int, char *[], fnptr_type,
 59 | 		fnptr_type, fnptr_type, void*);
 60 | 	orig_func_type orig_func;
 61 | 	arg_type args;
 62 | 
 63 | 	void * handle;
 64 | 	int ret;
 65 | 
 66 | 	// Get lib path.
 67 | 	Dl_info dli;
 68 | 	dladdr((void *)dlsym, &dli);
 69 | 	std::string libPath = dli.dli_fname;
 70 | 	libPath = dli.dli_fname;
 71 | 	size_t lastSlash = libPath.find_last_of("/");
 72 | 	libPath = libPath.substr(0, lastSlash);
 73 | 	libPath += "/libc.so.6";
 74 | 	libPath = "/lib/x86_64-linux-gnu/libc.so.6";
 75 | 	if(!(handle=dlopen(libPath.c_str(), RTLD_LAZY))) {
 76 | 		puts("dlopen error");
 77 | 		abort();
 78 | 	}
 79 | 
 80 | 	orig_func = (orig_func_type) dlsym(handle, "__libc_start_main");
 81 | 
 82 | 	if(dlerror()) {
 83 | 		puts("dlerror");
 84 | 		abort();
 85 | 	}
 86 | 
 87 | 	dlclose(handle);
 88 | 
 89 | 	dprintf("%04d: __libc_start_main is hooked.\n", (int) pthread_self());
 90 | 
 91 | 	args.argv = argv;
 92 | 	args.main_func = (main_type)func_ptr;
 93 | 	saved_init_func = (main_type)init_func;
 94 | 
 95 | 	saved_fini_func = (fini_type)rtld_fini_func;
 96 | 
 97 | 	ret = orig_func((void*)my_main, argc, (char**)(&args), (fnptr_type)tern_init_func, (fnptr_type)fini_func, rtld_fini_func, stack_end);
 98 | 
 99 | 	return ret;
100 | }
101 | 
102 | extern "C" int accept(int socket, struct sockaddr *address, socklen_t *address_len)
103 | {
104 | 	typedef int (*orig_accept_type)(int, sockaddr *, socklen_t *);
105 | 	static orig_accept_type orig_accept;
106 | 	if (!orig_accept)
107 | 		orig_accept = (orig_accept_type) dlsym(RTLD_NEXT, "accept");
108 | 
109 | 	int ret = orig_accept(socket, address, address_len);
110 | 
111 | 	if (ret >= 0 && proxy != NULL)
112 | 	{
113 | 		struct stat sb;
114 | 		fstat(ret, &sb);
115 | 		if ((sb.st_mode & S_IFMT) == S_IFSOCK)
116 | 			proxy_on_accept(proxy, ret);
117 | 	}
118 | 
119 | 	return ret;
120 | }
121 | 
122 | // memcached
123 | extern "C" int accept4(int sockfd, struct sockaddr *addr, socklen_t *addrlen, int flags)
124 | {
125 | 	typedef int (*orig_accept4_type)(int, sockaddr *, socklen_t *, int);
126 | 	static orig_accept4_type orig_accept4;
127 | 	if (!orig_accept4)
128 | 		orig_accept4 = (orig_accept4_type) dlsym(RTLD_NEXT, "accept4");
129 | 
130 | 	int ret = orig_accept4(sockfd, addr, addrlen, flags);
131 | 
132 | 	if (ret >= 0 && proxy != NULL)
133 | 	{
134 | 		struct stat sb;
135 | 		fstat(ret, &sb);
136 | 		if ((sb.st_mode & S_IFMT) == S_IFSOCK)
137 | 			proxy_on_accept(proxy, ret);
138 | 	}
139 | 
140 | 	return ret;
141 | }
142 | 
143 | extern "C" int close(int fildes)
144 | {
145 | 	if (proxy != NULL)
146 | 	{
147 | 		struct stat sb;
148 | 		fstat(fildes, &sb);
149 | 		if ((sb.st_mode & S_IFMT) == S_IFSOCK)
150 | 			proxy_on_close(proxy, fildes);
151 | 	}
152 | 
153 | 	typedef int (*orig_close_type)(int);
154 | 	static orig_close_type orig_close;
155 | 	if (!orig_close)
156 | 		orig_close = (orig_close_type) dlsym(RTLD_NEXT, "close");
157 | 	int ret = orig_close(fildes);
158 | 	return ret;
159 | }
160 | 
161 | extern "C" ssize_t read(int fd, void *buf, size_t count)
162 | {
163 | 	typedef ssize_t (*orig_read_type)(int, void *, size_t);
164 | 	static orig_read_type orig_read;
165 | 	if (!orig_read)
166 | 		orig_read = (orig_read_type) dlsym(RTLD_NEXT, "read");
167 | 	ssize_t bytes_read = orig_read(fd, buf, count);
168 | 
169 | 	if (bytes_read > 0 && proxy != NULL)
170 | 	{
171 | 		struct stat sb;
172 | 		fstat(fd, &sb);
173 | 		if ((sb.st_mode & S_IFMT) == S_IFSOCK)
174 | 			proxy_on_read(proxy, buf, bytes_read, fd);
175 | 	}
176 | 
177 | 	return bytes_read;
178 | }
179 | 


--------------------------------------------------------------------------------
/target/makefile:
--------------------------------------------------------------------------------
 1 | ################################################################################
 2 | # Automatically-generated file. Do not edit!
 3 | ################################################################################
 4 | 
 5 | include ../makefile.init
 6 | 
 7 | RM := rm -rf
 8 | 
 9 | # All of the sources participating in the build are defined here
10 | -include sources.mk
11 | -include src/dare/subdir.mk
12 | -include src/proxy/subdir.mk
13 | -include src/db/subdir.mk
14 | -include src/config-comp/subdir.mk
15 | -include src/subdir.mk
16 | -include subdir.mk
17 | -include objects.mk
18 | 
19 | LIBS += $(DARE) -lev -ldb -lconfig -libverbs -lm
20 | 
21 | # Add inputs and outputs from these tool invocations to the build variables 
22 | 
23 | # All Target
24 | all: interpose.so
25 | 
26 | # Tool invocations
27 | interpose.so: $(OBJS)
28 | 	@echo 'Building target: $@'
29 | 	@echo 'Invoking: GCC C Linker'
30 | 	gcc -shared -Wl,-soname,interpose.so $(OBJS) -Wall -o interpose.so $(LIBS)
31 | 	@echo 'Finished building target: $@'
32 | 	@echo ' '
33 | 
34 | # Other Targets
# Removes the DARE build products, the object and dependency files and the
# resulting shared library. $(OBJS) and $(C_DEPS) are separated by a space so
# that the last object name is not glued to the first dependency file name.
clean:
	@echo "##### CLEAN-UP DARE#####"
	-$(RM) $(RBTREE_OBJS)
	-$(RM) $(DARE_OBJS)
	-$(RM) $(DARE) $(RBTREE)
	@echo "########################"
	-@echo ' '
	-$(RM) $(OBJS) $(C_DEPS) interpose.so
	-@echo ' '
44 | 
45 | .PHONY: all clean dependents
46 | 


--------------------------------------------------------------------------------
/target/nodes.local.cfg:
--------------------------------------------------------------------------------
 1 | #configuration file for the replicated state machine node group
 2 | 
 3 | #proxy configuration part
 4 | 
 5 | db_name = "node_test";
 6 | req_log = 1;
 7 | 
 8 | #real server configuration
 9 | 
10 | ip_address = "127.0.0.1";
11 | port       = 8888;
12 | 
13 | #dare component configuration part
14 | 
15 | #HB period (seconds)
16 | #election timeout range (microseconds)
17 | #retransmission period (seconds)
18 | #period of checking for new connections (seconds)
19 | #log pruning period (seconds)
20 | dare_global_config = {
21 |     #hb_period = 0.001;
22 |     #elec_timeout_low = 10000;
23 |     #elec_timeout_high = 30000;
24 |     #rc_info_period = 0.01;
25 |     #retransmit_period = 0.02;
26 |     #log_pruning_period = 0.03;
27 | 
28 |     #DEBUG
29 |     hb_period = 0.01;
30 |     elec_timeout_low = 100000;
31 |     elec_timeout_high = 300000;
32 |     retransmit_period = 0.04;
33 |     rc_info_period = 0.05;
34 |     log_pruning_period = 0.05;
35 | };
36 | 


--------------------------------------------------------------------------------
/target/objects.mk:
--------------------------------------------------------------------------------
1 | LIBS := -lsupc++ -lpthread -lstdc++ -lrt


--------------------------------------------------------------------------------
/target/sources.mk:
--------------------------------------------------------------------------------
 1 | ################################################################################
 2 | # Automatically-generated file. Do not edit!
 3 | ################################################################################
 4 | 
 5 | O_SRCS := 
 6 | C_SRCS := 
 7 | S_UPPER_SRCS := 
 8 | OBJ_SRCS := 
 9 | ASM_SRCS := 
10 | OBJS := 
11 | C_DEPS := 
12 | EXECUTABLES := 
13 | 
14 | RBTREE_OBJS := 
15 | DARE_OBJS := 
16 | DARE := 
17 | RBTREE := 
18 | DARE_LIBPATH := 
19 | 
20 | # Every subdirectory with source files must be described here
21 | SUBDIRS := \
22 | src/util \
23 | src \
24 | src/dare \
25 | src/db \
26 | src/config-comp \
27 | src/proxy \
28 | 


--------------------------------------------------------------------------------
/target/src/config-comp/subdir.mk:
--------------------------------------------------------------------------------
 1 | # Add inputs and outputs from these tool invocations to the build variables 
 2 | C_SRCS += \
 3 | ../src/config-comp/config-dare.c \
 4 | ../src/config-comp/config-proxy.c 
 5 | 
 6 | 
 7 | OBJS += \
 8 | ./src/config-comp/config-dare.o \
 9 | ./src/config-comp/config-proxy.o 
10 | 
11 | 
12 | # Each subdirectory must supply rules for building sources it contributes
13 | src/config-comp/%.o: ../src/config-comp/%.c
14 | 	@echo 'Building file: $<'
15 | 	@echo 'Invoking: GCC C Compiler'
16 | 	gcc -fPIC -rdynamic -std=gnu99 -DDEBUG=$(DEBUGOPT) -O0 -g3 -Wall -c -o "$@" "$<"
17 | 	@echo 'Finished building: $<'
18 | 	@echo ' '
19 | 
20 | 
21 | 


--------------------------------------------------------------------------------
/target/src/dare/subdir.mk:
--------------------------------------------------------------------------------
 1 | ################################################################################
 2 | # Automatically-generated file. Do not edit!
 3 | ################################################################################
 4 | 
 5 | CC = gcc
 6 | 
 7 | ifeq ($(DEBUGOPT),1)
 8 | 	FLAGS        = -fPIC -rdynamic -std=gnu99 -DDEBUG -I"$(ROOT_DIR)/../src/include/dare" -I"$(ROOT_DIR)/../utils/rbtree/include" -I/usr/include
 9 | else
10 | 	FLAGS        = -fPIC -rdynamic -std=gnu99 -I"$(ROOT_DIR)/../src/include/dare" -I"$(ROOT_DIR)/../utils/rbtree/include" -I/usr/include
11 | endif
12 | CFLAGS       = #-Wall -Wunused-function #-Wextra
13 | LDFLAGS      = -L/usr/lib -libverbs
14 | 
15 | PREFIX = $(ROOT_DIR)/src/dare
16 | DARE_LIBPATH = $(PREFIX)/lib
17 | 
18 | DARE_HEADERS = $(shell echo $(ROOT_DIR)/../src/include/dare/*.h)
19 | DARE_SRCS = $(shell echo $(ROOT_DIR)/../src/dare/*.c)
20 | DARE_OBJS = $(DARE_SRCS:.c=.o)
21 | DARE = $(DARE_LIBPATH)/libdare.a
22 | 
23 | RBTREE_HEADERS = $(shell echo $(ROOT_DIR)/../utils/rbtree/include/*.h)
24 | RBTREE_SRCS = $(shell echo $(ROOT_DIR)/../utils/rbtree/src/*.c)
25 | RBTREE_OBJS = $(RBTREE_SRCS:.c=.o)
26 | RBTREE = $(DARE_LIBPATH)/librbtree.a
27 | 
28 | all: dare
29 | 
30 | $(RBTREE): rbtree_print $(RBTREE_OBJS) $(RBTREE_HEADERS)
31 | 	mkdir -pm 755 $(DARE_LIBPATH)
32 | 	ar -rcs $@ $(RBTREE_OBJS)
33 | 	@echo "##############################"
34 | 	@echo
35 | rbtree_print:
36 | 	@echo "##### BUILDING Red-Black Tree #####"
37 | 	
38 | dare: FLAGS += -I/usr/local/include
39 | dare: LDFLAGS += /usr/local/lib/libev.a
40 | dare: $(DARE) 
41 | $(DARE): $(RBTREE) dare_print $(DARE_OBJS) $(DARE_HEADERS) 
42 | 	mkdir -pm 755 $(DARE_LIBPATH)
43 | 	ar -rcs $@ $(DARE_OBJS) $(RBTREE_OBJS)
44 | 	@echo "##############################"
45 | 	@echo
46 | dare_print:
47 | 	@echo "##### BUILDING DARE #####"
48 | # Compile one object; listing the DARE and rbtree headers makes a header edit trigger a recompile
49 | %.o: %.c $(DARE_HEADERS) $(RBTREE_HEADERS)
50 | 	$(CC) $(FLAGS) $(CFLAGS) -c -o $@ $<
51 | 	 
52 | .PHONY : all
53 | 


--------------------------------------------------------------------------------
/target/src/db/subdir.mk:
--------------------------------------------------------------------------------
 1 | # Add inputs and outputs from these tool invocations to the build variables 
 2 | C_SRCS += \
 3 | ../src/db/db-interface.c
 4 | 
 5 | OBJS += \
 6 | ./src/db/db-interface.o
 7 | 
 8 | 
 9 | # Each subdirectory must supply rules for building sources it contributes
10 | src/db/%.o: ../src/db/%.c
11 | 	@echo 'Building file: $<'
12 | 	@echo 'Invoking: GCC C Compiler'
13 | 	gcc -fPIC -rdynamic -std=gnu99 -DDEBUG=$(DEBUGOPT) -O0 -g3 -Wall -c -o "$@" "$<"
14 | 	@echo 'Finished building: $<'
15 | 	@echo ' '
16 | 
17 | 
18 | 


--------------------------------------------------------------------------------
/target/src/proxy/subdir.mk:
--------------------------------------------------------------------------------
 1 | # Add inputs and outputs from these tool invocations to the build variables 
 2 | C_SRCS += \
 3 | ../src/proxy/proxy.c
 4 | 
 5 | OBJS += \
 6 | ./src/proxy/proxy.o
 7 | 
 8 | 
 9 | # Each subdirectory must supply rules for building sources it contributes
10 | src/proxy/%.o: ../src/proxy/%.c
11 | 	@echo 'Building file: $<'
12 | 	@echo 'Invoking: GCC C Compiler'
13 | 	gcc -fPIC -rdynamic -std=gnu99 -DDEBUG=$(DEBUGOPT) -O0 -g3 -Wall -c -o "$@" "$<"
14 | 	@echo 'Finished building: $<'
15 | 	@echo ' '
16 | 
17 | 
18 | 


--------------------------------------------------------------------------------
/target/src/subdir.mk:
--------------------------------------------------------------------------------
 1 | # Add inputs and outputs from these tool invocations to the build variables 
 2 | C_SRCS += \
 3 | ../src/spec_hooks.cpp 
 4 | 
 5 | OBJS += \
 6 | ./src/spec_hooks.o 
 7 | 
 8 | 
 9 | # Each subdirectory must supply rules for building sources it contributes
10 | src/%.o: ../src/%.cpp
11 | 	@echo 'Building file: $<'
12 | 	@echo 'Invoking: GCC C Compiler'
13 | 	gcc -fPIC -rdynamic -std=gnu99 -DDEBUG=$(DEBUGOPT) -O0 -g3 -Wall -c -o "$@" "$<"
14 | 	@echo 'Finished building: $<'
15 | 	@echo ' '
16 | 
17 | 
18 | 


--------------------------------------------------------------------------------
/utils/dep-lib/db-5.1.29.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hku-systems/apus/896959f59b33fc695df753e4b65b4d564d67443d/utils/dep-lib/db-5.1.29.tar.gz


--------------------------------------------------------------------------------
/utils/dep-lib/libconfig-1.4.9.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hku-systems/apus/896959f59b33fc695df753e4b65b4d564d67443d/utils/dep-lib/libconfig-1.4.9.tar.gz


--------------------------------------------------------------------------------
/utils/dep-lib/libev-4.15.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hku-systems/apus/896959f59b33fc695df753e4b65b4d564d67443d/utils/dep-lib/libev-4.15.tar.gz


--------------------------------------------------------------------------------
/utils/mk:
--------------------------------------------------------------------------------
 1 | #! /bin/bash
 2 | #build dep for the program
 3 | 
 4 | CUR_DIR=$(pwd)
 5 | LIB_PREFIX=${CUR_DIR}/.local/
 6 | 
 7 | mkdir -p ${LIB_PREFIX}
 8 | 
 9 | LIBCONFIG_VER=1.4.9
10 | LIBCONFIG_NAME=libconfig-${LIBCONFIG_VER}
11 | 
12 | LIBEV_VER=4.15
13 | LIBEV_NAME=libev-${LIBEV_VER}
14 | 
15 | BDB_VER=5.1.29
16 | BDB_NAME=db-${BDB_VER}
17 | 
18 | if [ ! -d "dep-lib" ];then
19 | # -d is true when dep-lib is an existing directory; negated here, so create it only when missing.
20 |     mkdir dep-lib
21 | fi
22 | cd dep-lib
23 | 
24 | if [ ! -f "${LIBCONFIG_NAME}.tar.gz" ];then
25 | # -f is true when the tarball is an existing regular file; negated here, so download only when missing.
26 |     wget http://www.hyperrealm.com/libconfig/${LIBCONFIG_NAME}.tar.gz
27 | fi
28 | 
29 | if [ ! -d "${LIBCONFIG_NAME}" ];then
30 |     tar -xvf ${LIBCONFIG_NAME}.tar.gz
31 | fi
32 | 
33 | cd ${LIBCONFIG_NAME}
34 | pwd
35 | ./configure --prefix=${LIB_PREFIX}
36 | make;
37 | make install;
38 | cd ..
39 | 
40 | # Fetch the libev tarball only when it is not already present in dep-lib.
41 | if [ ! -f "${LIBEV_NAME}.tar.gz" ];then
42 |     wget http://dist.schmorp.de/libev/Attic/${LIBEV_NAME}.tar.gz
43 | fi
44 | 
45 | if [ ! -d "${LIBEV_NAME}" ];then
46 |     tar -xvf ${LIBEV_NAME}.tar.gz
47 | fi
48 | 
49 | cd ${LIBEV_NAME}
50 | pwd
51 | ./configure --prefix=${LIB_PREFIX}
52 | make;
53 | make install;
54 | cd ..
55 | 
56 | 
57 | if [ ! -f "${BDB_NAME}.tar.gz" ];then
58 |     wget http://download.oracle.com/berkeley-db/${BDB_NAME}.tar.gz
59 | fi
60 | 
61 | if [ ! -d "${BDB_NAME}" ];then
62 |     tar -xvf ${BDB_NAME}.tar.gz
63 | fi
64 | 
65 | cd ${BDB_NAME}
66 | pwd
67 | cd build_unix
68 | ../dist/configure --prefix=${LIB_PREFIX}
69 | make;
70 | make install;
71 | cd ..
72 | cd ..
73 | 
74 | cd ..
75 | 


--------------------------------------------------------------------------------
/utils/queue/tailq.c:
--------------------------------------------------------------------------------
 1 | #include <stdlib.h>
 2 | #include <sys/queue.h>
 3 | 
 4 | /* Queue element: carries only the TAILQ linkage, no payload. */
 5 | struct entry {
 6 | 	// element
 7 | 	TAILQ_ENTRY(entry)	entries;
 8 | } *n1;
 9 | 
10 | TAILQ_HEAD(, entry) head;
11 | 
12 | /*
13 |  * Exercise a <sys/queue.h> tail queue: append one heap-allocated entry,
14 |  * then drain the queue, freeing each entry as it is removed.
15 |  * Returns 0 on success, 1 if the entry cannot be allocated.
16 |  */
17 | int main(int argc, char const *argv[])
18 | {
19 | 	TAILQ_INIT(&head);
20 | 
21 | 	n1 = malloc(sizeof *n1);
22 | 	if (n1 == NULL) {
23 | 		return 1;
24 | 	}
25 | 
26 | 	TAILQ_INSERT_TAIL(&head, n1, entries);
27 | 
28 | 	while (!TAILQ_EMPTY(&head)) {
29 | 		n1 = TAILQ_FIRST(&head);
30 | 		TAILQ_REMOVE(&head, n1, entries);
31 | 		free(n1);
32 | 	}
33 | 
34 | 	return 0;
35 | }
36 | 
37 | 


--------------------------------------------------------------------------------
/utils/rbtree/include/compiler.h:
--------------------------------------------------------------------------------
  1 | #ifndef __LINUX_COMPILER_H
  2 | #define __LINUX_COMPILER_H
  3 | 
  4 | #ifndef __ASSEMBLY__
  5 | 
  6 | #ifdef __CHECKER__
  7 | # define __user		__attribute__((noderef, address_space(1)))
  8 | # define __kernel	__attribute__((address_space(0)))
  9 | # define __safe		__attribute__((safe))
 10 | # define __force	__attribute__((force))
 11 | # define __nocast	__attribute__((nocast))
 12 | # define __iomem	__attribute__((noderef, address_space(2)))
 13 | # define __must_hold(x)	__attribute__((context(x,1,1)))
 14 | # define __acquires(x)	__attribute__((context(x,0,1)))
 15 | # define __releases(x)	__attribute__((context(x,1,0)))
 16 | # define __acquire(x)	__context__(x,1)
 17 | # define __release(x)	__context__(x,-1)
 18 | # define __cond_lock(x,c)	((c) ? ({ __acquire(x); 1; }) : 0)
 19 | # define __percpu	__attribute__((noderef, address_space(3)))
 20 | #ifdef CONFIG_SPARSE_RCU_POINTER
 21 | # define __rcu		__attribute__((noderef, address_space(4)))
 22 | #else
 23 | # define __rcu
 24 | #endif
 25 | extern void __chk_user_ptr(const volatile void __user *);
 26 | extern void __chk_io_ptr(const volatile void __iomem *);
 27 | #else
 28 | # define __user
 29 | # define __kernel
 30 | # define __safe
 31 | # define __force
 32 | # define __nocast
 33 | # define __iomem
 34 | # define __chk_user_ptr(x) (void)0
 35 | # define __chk_io_ptr(x) (void)0
 36 | //# define __builtin_warning(x, y...) (1)
 37 | # define __must_hold(x)
 38 | # define __acquires(x)
 39 | # define __releases(x)
 40 | # define __acquire(x) (void)0
 41 | # define __release(x) (void)0
 42 | # define __cond_lock(x,c) (c)
 43 | # define __percpu
 44 | # define __rcu
 45 | #endif
 46 | 
 47 | /* Indirect macros required for expanded argument pasting, eg. __LINE__. */
 48 | #define ___PASTE(a,b) a##b
 49 | #define __PASTE(a,b) ___PASTE(a,b)
 50 | 
 51 | #ifdef __KERNEL__
 52 | 
 53 | #ifdef __GNUC__
 54 | #include <linux/compiler-gcc.h>
 55 | #endif
 56 | 
 57 | #define notrace __attribute__((no_instrument_function))
 58 | 
 59 | /* Intel compiler defines __GNUC__. So we will overwrite implementations
 60 |  * coming from above header files here
 61 |  */
 62 | #ifdef __INTEL_COMPILER
 63 | # include <linux/compiler-intel.h>
 64 | #endif
 65 | 
 66 | /*
 67 |  * Generic compiler-dependent macros required for kernel
 68 |  * build go below this comment. Actual compiler/compiler version
 69 |  * specific implementations come from the above header files
 70 |  */
 71 | 
 72 | struct ftrace_branch_data {
 73 | 	const char *func;
 74 | 	const char *file;
 75 | 	unsigned line;
 76 | 	union {
 77 | 		struct {
 78 | 			unsigned long correct;
 79 | 			unsigned long incorrect;
 80 | 		};
 81 | 		struct {
 82 | 			unsigned long miss;
 83 | 			unsigned long hit;
 84 | 		};
 85 | 		unsigned long miss_hit[2];
 86 | 	};
 87 | };
 88 | 
 89 | /*
 90 |  * Note: DISABLE_BRANCH_PROFILING can be used by special lowlevel code
 91 |  * to disable branch tracing on a per file basis.
 92 |  */
 93 | #if defined(CONFIG_TRACE_BRANCH_PROFILING) \
 94 |     && !defined(DISABLE_BRANCH_PROFILING) && !defined(__CHECKER__)
 95 | void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect);
 96 | 
 97 | #define likely_notrace(x)	__builtin_expect(!!(x), 1)
 98 | #define unlikely_notrace(x)	__builtin_expect(!!(x), 0)
 99 | 
100 | #define __branch_check__(x, expect) ({					\
101 | 			int ______r;					\
102 | 			static struct ftrace_branch_data		\
103 | 				__attribute__((__aligned__(4)))		\
104 | 				__attribute__((section("_ftrace_annotated_branch"))) \
105 | 				______f = {				\
106 | 				.func = __func__,			\
107 | 				.file = __FILE__,			\
108 | 				.line = __LINE__,			\
109 | 			};						\
110 | 			______r = likely_notrace(x);			\
111 | 			ftrace_likely_update(&______f, ______r, expect); \
112 | 			______r;					\
113 | 		})
114 | 
115 | /*
116 |  * Using __builtin_constant_p(x) to ignore cases where the return
117 |  * value is always the same.  This idea is taken from a similar patch
118 |  * written by Daniel Walker.
119 |  */
120 | # ifndef likely
121 | #  define likely(x)	(__builtin_constant_p(x) ? !!(x) : __branch_check__(x, 1))
122 | # endif
123 | # ifndef unlikely
124 | #  define unlikely(x)	(__builtin_constant_p(x) ? !!(x) : __branch_check__(x, 0))
125 | # endif
126 | 
127 | #ifdef CONFIG_PROFILE_ALL_BRANCHES
128 | /*
129 |  * "Define 'is'", Bill Clinton
130 |  * "Define 'if'", Steven Rostedt
131 |  */
132 | #define if(cond, ...) __trace_if( (cond , ## __VA_ARGS__) )
133 | #define __trace_if(cond) \
134 | 	if (__builtin_constant_p((cond)) ? !!(cond) :			\
135 | 	({								\
136 | 		int ______r;						\
137 | 		static struct ftrace_branch_data			\
138 | 			__attribute__((__aligned__(4)))			\
139 | 			__attribute__((section("_ftrace_branch")))	\
140 | 			______f = {					\
141 | 				.func = __func__,			\
142 | 				.file = __FILE__,			\
143 | 				.line = __LINE__,			\
144 | 			};						\
145 | 		______r = !!(cond);					\
146 | 		______f.miss_hit[______r]++;					\
147 | 		______r;						\
148 | 	}))
149 | #endif /* CONFIG_PROFILE_ALL_BRANCHES */
150 | 
151 | #else
152 | # define likely(x)	__builtin_expect(!!(x), 1)
153 | # define unlikely(x)	__builtin_expect(!!(x), 0)
154 | #endif
155 | 
156 | /* Optimization barrier */
157 | #ifndef barrier
158 | # define barrier() __memory_barrier()
159 | #endif
160 | 
161 | /* Unreachable code */
162 | #ifndef unreachable
163 | # define unreachable() do { } while (1)
164 | #endif
165 | 
166 | #ifndef RELOC_HIDE
167 | # define RELOC_HIDE(ptr, off)					\
168 |   ({ unsigned long __ptr;					\
169 |      __ptr = (unsigned long) (ptr);				\
170 |     (typeof(ptr)) (__ptr + (off)); })
171 | #endif
172 | 
173 | /* Not-quite-unique ID. */
174 | #ifndef __UNIQUE_ID
175 | # define __UNIQUE_ID(prefix) __PASTE(__PASTE(__UNIQUE_ID_, prefix), __LINE__)
176 | #endif
177 | 
178 | #endif /* __KERNEL__ */
179 | 
180 | #endif /* __ASSEMBLY__ */
181 | 
182 | #ifdef __KERNEL__
183 | /*
184 |  * Allow us to mark functions as 'deprecated' and have gcc emit a nice
185 |  * warning for each use, in hopes of speeding the functions removal.
186 |  * Usage is:
187 |  * 		int __deprecated foo(void)
188 |  */
189 | #ifndef __deprecated
190 | # define __deprecated		/* unimplemented */
191 | #endif
192 | 
193 | #ifdef MODULE
194 | #define __deprecated_for_modules __deprecated
195 | #else
196 | #define __deprecated_for_modules
197 | #endif
198 | 
199 | #ifndef __must_check
200 | #define __must_check
201 | #endif
202 | 
203 | #ifndef CONFIG_ENABLE_MUST_CHECK
204 | #undef __must_check
205 | #define __must_check
206 | #endif
207 | #ifndef CONFIG_ENABLE_WARN_DEPRECATED
208 | #undef __deprecated
209 | #undef __deprecated_for_modules
210 | #define __deprecated
211 | #define __deprecated_for_modules
212 | #endif
213 | 
214 | /*
215 |  * Allow us to avoid 'defined but not used' warnings on functions and data,
216 |  * as well as force them to be emitted to the assembly file.
217 |  *
218 |  * As of gcc 3.4, static functions that are not marked with attribute((used))
219 |  * may be elided from the assembly file.  As of gcc 3.4, static data not so
220 |  * marked will not be elided, but this may change in a future gcc version.
221 |  *
222 |  * NOTE: Because distributions shipped with a backported unit-at-a-time
223 |  * compiler in gcc 3.3, we must define __used to be __attribute__((used))
224 |  * for gcc >=3.3 instead of 3.4.
225 |  *
226 |  * In prior versions of gcc, such functions and data would be emitted, but
227 |  * would be warned about except with attribute((unused)).
228 |  *
229 |  * Mark functions that are referenced only in inline assembly as __used so
230 |  * the code is emitted even though it appears to be unreferenced.
231 |  */
232 | #ifndef __used
233 | # define __used			/* unimplemented */
234 | #endif
235 | 
236 | #ifndef __maybe_unused
237 | # define __maybe_unused		/* unimplemented */
238 | #endif
239 | 
240 | #ifndef __always_unused
241 | # define __always_unused	/* unimplemented */
242 | #endif
243 | 
244 | #ifndef noinline
245 | #define noinline
246 | #endif
247 | 
248 | /*
249 |  * Rather then using noinline to prevent stack consumption, use
250 |  * noinline_for_stack instead.  For documentation reasons.
251 |  */
252 | #define noinline_for_stack noinline
253 | 
254 | #ifndef __always_inline
255 | #define __always_inline inline
256 | #endif
257 | 
258 | #endif /* __KERNEL__ */
259 | 
260 | /*
261 |  * From the GCC manual:
262 |  *
263 |  * Many functions do not examine any values except their arguments,
264 |  * and have no effects except the return value.  Basically this is
265 |  * just slightly more strict class than the `pure' attribute above,
266 |  * since function is not allowed to read global memory.
267 |  *
268 |  * Note that a function that has pointer arguments and examines the
269 |  * data pointed to must _not_ be declared `const'.  Likewise, a
270 |  * function that calls a non-`const' function usually must not be
271 |  * `const'.  It does not make sense for a `const' function to return
272 |  * `void'.
273 |  */
274 | #ifndef __attribute_const__
275 | # define __attribute_const__	/* unimplemented */
276 | #endif
277 | 
278 | /*
279 |  * Tell gcc if a function is cold. The compiler will assume any path
280 |  * directly leading to the call is unlikely.
281 |  */
282 | 
283 | #ifndef __cold
284 | #define __cold
285 | #endif
286 | 
287 | /* Simple shorthand for a section definition */
288 | #ifndef __section
289 | # define __section(S) __attribute__ ((__section__(#S)))
290 | #endif
291 | 
292 | #ifndef __visible
293 | #define __visible
294 | #endif
295 | 
296 | /* Are two types/vars the same type (ignoring qualifiers)? */
297 | #ifndef __same_type
298 | # define __same_type(a, b) __builtin_types_compatible_p(typeof(a), typeof(b))
299 | #endif
300 | 
301 | /* Compile time object size, -1 for unknown */
302 | #ifndef __compiletime_object_size
303 | # define __compiletime_object_size(obj) -1
304 | #endif
305 | #ifndef __compiletime_warning
306 | # define __compiletime_warning(message)
307 | #endif
308 | #ifndef __compiletime_error
309 | # define __compiletime_error(message)
310 | # define __compiletime_error_fallback(condition) \
311 | 	do { ((void)sizeof(char[1 - 2 * condition])); } while (0)
312 | #else
313 | # define __compiletime_error_fallback(condition) do { } while (0)
314 | #endif
315 | 
316 | #define __compiletime_assert(condition, msg, prefix, suffix)		\
317 | 	do {								\
318 | 		bool __cond = !(condition);				\
319 | 		extern void prefix ## suffix(void) __compiletime_error(msg); \
320 | 		if (__cond)						\
321 | 			prefix ## suffix();				\
322 | 		__compiletime_error_fallback(__cond);			\
323 | 	} while (0)
324 | 
325 | #define _compiletime_assert(condition, msg, prefix, suffix) \
326 | 	__compiletime_assert(condition, msg, prefix, suffix)
327 | 
328 | /**
329 |  * compiletime_assert - break build and emit msg if condition is false
330 |  * @condition: a compile-time constant condition to check
331 |  * @msg:       a message to emit if condition is false
332 |  *
333 |  * In tradition of POSIX assert, this macro will break the build if the
334 |  * supplied condition is *false*, emitting the supplied error message if the
335 |  * compiler has support to do so.
336 |  */
337 | #define compiletime_assert(condition, msg) \
338 | 	_compiletime_assert(condition, msg, __compiletime_assert_, __LINE__)
339 | 
340 | /*
341 |  * Prevent the compiler from merging or refetching accesses.  The compiler
342 |  * is also forbidden from reordering successive instances of ACCESS_ONCE(),
343 |  * but only when the compiler is aware of some particular ordering.  One way
344 |  * to make the compiler aware of ordering is to put the two invocations of
345 |  * ACCESS_ONCE() in different C statements.
346 |  *
347 |  * This macro does absolutely -nothing- to prevent the CPU from reordering,
348 |  * merging, or refetching absolutely anything at any time.  Its main intended
349 |  * use is to mediate communication between process-level code and irq/NMI
350 |  * handlers, all running on the same CPU.
351 |  */
352 | #define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))
353 | 
354 | /* Ignore/forbid kprobes attach on very low level functions marked by this attribute: */
355 | #ifdef CONFIG_KPROBES
356 | # define __kprobes	__attribute__((__section__(".kprobes.text")))
357 | #else
358 | # define __kprobes
359 | #endif
360 | #endif /* __LINUX_COMPILER_H */
361 | 


--------------------------------------------------------------------------------
/utils/rbtree/include/export.h:
--------------------------------------------------------------------------------
 1 | #ifndef _LINUX_EXPORT_H
 2 | #define _LINUX_EXPORT_H
 3 | /*
 4 |  * Export symbols from the kernel to modules.  Forked from module.h
 5 |  * to reduce the amount of pointless cruft we feed to gcc when only
 6 |  * exporting a simple symbol or two.
 7 |  *
 8 |  * Try not to add #includes here.  It slows compilation and makes kernel
 9 |  * hackers place grumpy comments in header files.
10 |  */
11 | 
12 | /* Some toolchains use a `_' prefix for all user symbols. */
13 | #ifdef CONFIG_HAVE_UNDERSCORE_SYMBOL_PREFIX
14 | #define __VMLINUX_SYMBOL(x) _##x
15 | #define __VMLINUX_SYMBOL_STR(x) "_" #x
16 | #else
17 | #define __VMLINUX_SYMBOL(x) x
18 | #define __VMLINUX_SYMBOL_STR(x) #x
19 | #endif
20 | 
21 | /* Indirect, so macros are expanded before pasting. */
22 | #define VMLINUX_SYMBOL(x) __VMLINUX_SYMBOL(x)
23 | #define VMLINUX_SYMBOL_STR(x) __VMLINUX_SYMBOL_STR(x)
24 | 
25 | #ifndef __ASSEMBLY__
26 | struct kernel_symbol
27 | {
28 | 	unsigned long value;
29 | 	const char *name;
30 | };
31 | 
32 | #ifdef MODULE
33 | extern struct module __this_module;
34 | #define THIS_MODULE (&__this_module)
35 | #else
36 | #define THIS_MODULE ((struct module *)0)
37 | #endif
38 | 
39 | #ifdef CONFIG_MODULES
40 | 
41 | #ifndef __GENKSYMS__
42 | #ifdef CONFIG_MODVERSIONS
43 | /* Mark the CRC weak since genksyms apparently decides not to
44 |  * generate a checksums for some symbols */
45 | #define __CRC_SYMBOL(sym, sec)					\
46 | 	extern void *__crc_##sym __attribute__((weak));		\
47 | 	static const unsigned long __kcrctab_##sym		\
48 | 	__used							\
49 | 	__attribute__((section("___kcrctab" sec "+" #sym), unused))	\
50 | 	= (unsigned long) &__crc_##sym;
51 | #else
52 | #define __CRC_SYMBOL(sym, sec)
53 | #endif
54 | 
55 | /* For every exported symbol, place a struct in the __ksymtab section */
56 | #define __EXPORT_SYMBOL(sym, sec)				\
57 | 	extern typeof(sym) sym;					\
58 | 	__CRC_SYMBOL(sym, sec)					\
59 | 	static const char __kstrtab_##sym[]			\
60 | 	__attribute__((section("__ksymtab_strings"), aligned(1))) \
61 | 	= VMLINUX_SYMBOL_STR(sym);				\
62 | 	static const struct kernel_symbol __ksymtab_##sym	\
63 | 	__used							\
64 | 	__attribute__((section("___ksymtab" sec "+" #sym), unused))	\
65 | 	= { (unsigned long)&sym, __kstrtab_##sym }
66 | 
67 | #define EXPORT_SYMBOL(sym)					\
68 | 	__EXPORT_SYMBOL(sym, "")
69 | 
70 | #define EXPORT_SYMBOL_GPL(sym)					\
71 | 	__EXPORT_SYMBOL(sym, "_gpl")
72 | 
73 | #define EXPORT_SYMBOL_GPL_FUTURE(sym)				\
74 | 	__EXPORT_SYMBOL(sym, "_gpl_future")
75 | 
76 | #ifdef CONFIG_UNUSED_SYMBOLS
77 | #define EXPORT_UNUSED_SYMBOL(sym) __EXPORT_SYMBOL(sym, "_unused")
78 | #define EXPORT_UNUSED_SYMBOL_GPL(sym) __EXPORT_SYMBOL(sym, "_unused_gpl")
79 | #else
80 | #define EXPORT_UNUSED_SYMBOL(sym)
81 | #define EXPORT_UNUSED_SYMBOL_GPL(sym)
82 | #endif
83 | 
84 | #endif	/* __GENKSYMS__ */
85 | 
86 | #else /* !CONFIG_MODULES... */
87 | 
88 | #define EXPORT_SYMBOL(sym)
89 | #define EXPORT_SYMBOL_GPL(sym)
90 | #define EXPORT_SYMBOL_GPL_FUTURE(sym)
91 | #define EXPORT_UNUSED_SYMBOL(sym)
92 | #define EXPORT_UNUSED_SYMBOL_GPL(sym)
93 | 
94 | #endif /* CONFIG_MODULES */
95 | #endif /* !__ASSEMBLY__ */
96 | 
97 | #endif /* _LINUX_EXPORT_H */
98 | 


--------------------------------------------------------------------------------
/utils/rbtree/include/rbtree.h:
--------------------------------------------------------------------------------
  1 | /*
  2 |   Red Black Trees
  3 |   (C) 1999  Andrea Arcangeli <andrea@suse.de>
  4 |   
  5 |   This program is free software; you can redistribute it and/or modify
  6 |   it under the terms of the GNU General Public License as published by
  7 |   the Free Software Foundation; either version 2 of the License, or
  8 |   (at your option) any later version.
  9 | 
 10 |   This program is distributed in the hope that it will be useful,
 11 |   but WITHOUT ANY WARRANTY; without even the implied warranty of
 12 |   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 13 |   GNU General Public License for more details.
 14 | 
 15 |   You should have received a copy of the GNU General Public License
 16 |   along with this program; if not, write to the Free Software
 17 |   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 18 | 
 19 |   linux/include/linux/rbtree.h
 20 | 
 21 |   To use rbtrees you'll have to implement your own insert and search cores.
 22 |   This will avoid us to use callbacks and to drop drammatically performances.
 23 |   I know it's not the cleaner way,  but in C (not in C++) to get
 24 |   performances and genericity...
 25 | 
 26 |   See Documentation/rbtree.txt for documentation and samples.
 27 | */
 28 | 
 29 | #ifndef	_LINUX_RBTREE_H
 30 | #define	_LINUX_RBTREE_H
 31 | 
 32 | //#include <linux/kernel.h>
 33 | 
 34 | 
 35 | /* #include <linux/stddef.h> */
 36 | #undef NULL
 37 | #define NULL ((void *)0)
 38 | 
 39 | enum {
 40 | 	false	= 0,
 41 | 	true	= 1
 42 | };
 43 | 
 44 | enum {
 45 | 	FALSE	= 0,
 46 | 	TRUE	= 1
 47 | };
 48 | 
 49 | #ifndef container_of
 50 | /**
 51 |  * container_of - cast a member of a structure out to the containing structure
 52 |  * @ptr:	the pointer to the member.
 53 |  * @type:	the type of the container struct this is embedded in.
 54 |  * @member:	the name of the member within the struct.
 55 |  *
 56 |  */
 57 | #define container_of(ptr, type, member) ({			\
 58 | 	const typeof( ((type *)0)->member ) *__mptr = (ptr);	\
 59 | 	(type *)( (char *)__mptr - offsetof(type,member) );})
 60 | #endif
 61 | 
 62 | #undef offsetof
 63 | #ifdef __compiler_offsetof
 64 | #define offsetof(TYPE,MEMBER) __compiler_offsetof(TYPE,MEMBER)
 65 | #else	/* compiler builtin: needs no size_t declaration and no null-pointer member access */
 66 | #define offsetof(TYPE, MEMBER) __builtin_offsetof(TYPE, MEMBER)
 67 | #endif
 68 | 
 69 |    
 70 | 
 71 | struct rb_node {
 72 | 	unsigned long  __rb_parent_color;
 73 | 	struct rb_node *rb_right;
 74 | 	struct rb_node *rb_left;
 75 | } __attribute__((aligned(sizeof(long))));
 76 |     /* The alignment might seem pointless, but allegedly CRIS needs it */
 77 | 
 78 | struct rb_root {
 79 | 	struct rb_node *rb_node;
 80 | };
 81 | 
 82 | 
 83 | #define rb_parent(r)   ((struct rb_node *)((r)->__rb_parent_color & ~3))
 84 | 
 85 | #define RB_ROOT	(struct rb_root) { NULL, }
 86 | #define	rb_entry(ptr, type, member) container_of(ptr, type, member)
 87 | 
 88 | #define RB_EMPTY_ROOT(root)  ((root)->rb_node == NULL)
 89 | 
 90 | /* 'empty' nodes are nodes that are known not to be inserted in an rbree */
 91 | #define RB_EMPTY_NODE(node)  \
 92 | 	((node)->__rb_parent_color == (unsigned long)(node))
 93 | #define RB_CLEAR_NODE(node)  \
 94 | 	((node)->__rb_parent_color = (unsigned long)(node))
 95 | 
 96 | 
 97 | extern void rb_insert_color(struct rb_node *, struct rb_root *);
 98 | extern void rb_erase(struct rb_node *, struct rb_root *);
 99 | 
100 | 
101 | /* Find logical next and previous nodes in a tree */
102 | extern struct rb_node *rb_next(const struct rb_node *);
103 | extern struct rb_node *rb_prev(const struct rb_node *);
104 | extern struct rb_node *rb_first(const struct rb_root *);
105 | extern struct rb_node *rb_last(const struct rb_root *);
106 | 
107 | /* Postorder iteration - always visit the parent after its children */
108 | extern struct rb_node *rb_first_postorder(const struct rb_root *);
109 | extern struct rb_node *rb_next_postorder(const struct rb_node *);
110 | 
111 | /* Fast replacement of a single node without remove/rebalance/add/rebalance */
112 | extern void rb_replace_node(struct rb_node *victim, struct rb_node *new, 
113 | 			    struct rb_root *root);
114 | 
115 | static inline void rb_link_node(struct rb_node *node, struct rb_node *parent,
116 | 				struct rb_node **rb_link)
117 | {
118 | 	/* Give @node no children, record @parent, then store @node through @rb_link. */
119 | 	node->rb_right = node->rb_left = NULL;
120 | 	node->__rb_parent_color = (unsigned long)parent;
121 | 	*rb_link = node;
122 | }
123 | 
124 | #define rb_entry_safe(ptr, type, member) \
125 | 	({ typeof(ptr) ____ptr = (ptr); \
126 | 	   ____ptr ? rb_entry(____ptr, type, member) : NULL; \
127 | 	})
128 | 
129 | /**
130 |  * rbtree_postorder_for_each_entry_safe - iterate over rb_root in post order of
131 |  * given type safe against removal of rb_node entry
132 |  *
133 |  * @pos:	the 'type *' to use as a loop cursor.
134 |  * @n:		another 'type *' to use as temporary storage
135 |  * @root:	'rb_root *' of the rbtree.
136 |  * @field:	the name of the rb_node field within 'type'.
137 |  */
138 | #define rbtree_postorder_for_each_entry_safe(pos, n, root, field) \
139 | 	for (pos = rb_entry_safe(rb_first_postorder(root), typeof(*pos), field); \
140 | 	     pos && ({ n = rb_entry_safe(rb_next_postorder(&pos->field), \
141 | 			typeof(*pos), field); 1; }); \
142 | 	     pos = n)
143 | 
144 | #endif	/* _LINUX_RBTREE_H */
145 | 


--------------------------------------------------------------------------------
/utils/rbtree/include/rbtree_augmented.h:
--------------------------------------------------------------------------------
  1 | /*
  2 |   Red Black Trees
  3 |   (C) 1999  Andrea Arcangeli <andrea@suse.de>
  4 |   (C) 2002  David Woodhouse <dwmw2@infradead.org>
  5 |   (C) 2012  Michel Lespinasse <walken@google.com>
  6 | 
  7 |   This program is free software; you can redistribute it and/or modify
  8 |   it under the terms of the GNU General Public License as published by
  9 |   the Free Software Foundation; either version 2 of the License, or
 10 |   (at your option) any later version.
 11 | 
 12 |   This program is distributed in the hope that it will be useful,
 13 |   but WITHOUT ANY WARRANTY; without even the implied warranty of
 14 |   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 15 |   GNU General Public License for more details.
 16 | 
 17 |   You should have received a copy of the GNU General Public License
 18 |   along with this program; if not, write to the Free Software
 19 |   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 20 | 
 21 |   linux/include/linux/rbtree_augmented.h
 22 | */
 23 | 
 24 | #ifndef _LINUX_RBTREE_AUGMENTED_H
 25 | #define _LINUX_RBTREE_AUGMENTED_H
 26 | 
 27 | #include <compiler.h>
 28 | #include <rbtree.h>
 29 | 
 30 | /*
 31 |  * Please note - only struct rb_augment_callbacks and the prototypes for
 32 |  * rb_insert_augmented() and rb_erase_augmented() are intended to be public.
 33 |  * The rest are implementation details you are not expected to depend on.
 34 |  *
 35 |  * See Documentation/rbtree.txt for documentation and samples.
 36 |  */
 37 | 
/*
 * User-supplied hooks that keep per-node augmented data consistent while the
 * tree is modified.  All three callbacks must be provided.
 */
struct rb_augment_callbacks {
	/*
	 * Recompute the augmented value for @node and its ancestors, up to
	 * (but not including) @stop; a NULL @stop means all the way to the
	 * root.
	 */
	void (*propagate)(struct rb_node *node, struct rb_node *stop);
	/*
	 * Copy the augmented value of subtree root @old to @new, which is
	 * taking over as root of that subtree.
	 */
	void (*copy)(struct rb_node *old, struct rb_node *new);
	/*
	 * Like copy, and additionally recompute the augmented value of the
	 * former subtree root @old after a rotation.
	 */
	void (*rotate)(struct rb_node *old, struct rb_node *new);
};
 43 | 
 44 | extern void __rb_insert_augmented(struct rb_node *node, struct rb_root *root,
 45 | 	void (*augment_rotate)(struct rb_node *old, struct rb_node *new));
 46 | static inline void
 47 | rb_insert_augmented(struct rb_node *node, struct rb_root *root,
 48 | 		    const struct rb_augment_callbacks *augment)
 49 | {
 50 | 	__rb_insert_augmented(node, root, augment->rotate);
 51 | }
 52 | 
/*
 * RB_DECLARE_CALLBACKS - generate the three augmentation callbacks and the
 * rb_augment_callbacks instance that bundles them.
 *
 * @rbstatic:    storage-class prefix placed before the generated
 *               'const struct rb_augment_callbacks' object
 * @rbname:      name of that object; also the prefix of the generated
 *               rbname_propagate / rbname_copy / rbname_rotate functions
 * @rbstruct:    type of the structure embedding the rb_node
 * @rbfield:     name of the rb_node member inside @rbstruct
 * @rbtype:      type of the augmented value
 * @rbaugmented: name of the member of @rbstruct holding the augmented value
 * @rbcompute:   function or macro taking an 'rbstruct *' and returning the
 *               freshly computed augmented value for that node
 *
 * Generated behaviour:
 *  - _propagate walks from @rb towards the root, recomputing the augmented
 *    value; it stops at @stop or as soon as a node's value is unchanged.
 *  - _copy assigns the old node's augmented value to the new node.
 *  - _rotate does the same copy, then recomputes the old node's value.
 */
#define RB_DECLARE_CALLBACKS(rbstatic, rbname, rbstruct, rbfield,	\
			     rbtype, rbaugmented, rbcompute)		\
static inline void							\
rbname ## _propagate(struct rb_node *rb, struct rb_node *stop)		\
{									\
	while (rb != stop) {						\
		rbstruct *node = rb_entry(rb, rbstruct, rbfield);	\
		rbtype augmented = rbcompute(node);			\
		if (node->rbaugmented == augmented)			\
			break;						\
		node->rbaugmented = augmented;				\
		rb = rb_parent(&node->rbfield);				\
	}								\
}									\
static inline void							\
rbname ## _copy(struct rb_node *rb_old, struct rb_node *rb_new)		\
{									\
	rbstruct *old = rb_entry(rb_old, rbstruct, rbfield);		\
	rbstruct *new = rb_entry(rb_new, rbstruct, rbfield);		\
	new->rbaugmented = old->rbaugmented;				\
}									\
static void								\
rbname ## _rotate(struct rb_node *rb_old, struct rb_node *rb_new)	\
{									\
	rbstruct *old = rb_entry(rb_old, rbstruct, rbfield);		\
	rbstruct *new = rb_entry(rb_new, rbstruct, rbfield);		\
	new->rbaugmented = old->rbaugmented;				\
	old->rbaugmented = rbcompute(old);				\
}									\
rbstatic const struct rb_augment_callbacks rbname = {			\
	rbname ## _propagate, rbname ## _copy, rbname ## _rotate	\
};
 85 | 
 86 | 
 87 | #define	RB_RED		0
 88 | #define	RB_BLACK	1
 89 | 
 90 | #define __rb_parent(pc)    ((struct rb_node *)(pc & ~3))
 91 | 
 92 | #define __rb_color(pc)     ((pc) & 1)
 93 | #define __rb_is_black(pc)  __rb_color(pc)
 94 | #define __rb_is_red(pc)    (!__rb_color(pc))
 95 | #define rb_color(rb)       __rb_color((rb)->__rb_parent_color)
 96 | #define rb_is_red(rb)      __rb_is_red((rb)->__rb_parent_color)
 97 | #define rb_is_black(rb)    __rb_is_black((rb)->__rb_parent_color)
 98 | 
 99 | static inline void rb_set_parent(struct rb_node *rb, struct rb_node *p)
100 | {
101 | 	rb->__rb_parent_color = rb_color(rb) | (unsigned long)p;
102 | }
103 | 
104 | static inline void rb_set_parent_color(struct rb_node *rb,
105 | 				       struct rb_node *p, int color)
106 | {
107 | 	rb->__rb_parent_color = (unsigned long)p | color;
108 | }
109 | 
110 | static inline void
111 | __rb_change_child(struct rb_node *old, struct rb_node *new,
112 | 		  struct rb_node *parent, struct rb_root *root)
113 | {
114 | 	if (parent) {
115 | 		if (parent->rb_left == old)
116 | 			parent->rb_left = new;
117 | 		else
118 | 			parent->rb_right = new;
119 | 	} else
120 | 		root->rb_node = new;
121 | }
122 | 
/*
 * Post-removal rebalancing step, defined outside this header.
 * rb_erase_augmented() calls it only when __rb_erase_augmented() returned a
 * non-NULL node, passing that node as @parent and the user's rotate hook as
 * @augment_rotate.
 */
extern void __rb_erase_color(struct rb_node *parent, struct rb_root *root,
	void (*augment_rotate)(struct rb_node *old, struct rb_node *new));
125 | 
/*
 * __rb_erase_augmented - unlink @node from the tree, without rebalancing
 * @node:    node to remove
 * @root:    tree containing @node
 * @augment: callbacks used to keep augmented data consistent
 *
 * Splices @node out (replacing it with its only child, or with its in-order
 * successor when it has two children), fixes up parent links and augmented
 * values, and reports whether the tree still needs rebalancing.
 *
 * Returns the node to hand to __rb_erase_color() as its 'parent' argument
 * when a black node was removed without a child to replace it, or NULL when
 * no rebalancing is needed.
 */
static inline struct rb_node *
__rb_erase_augmented(struct rb_node *node, struct rb_root *root,
		     const struct rb_augment_callbacks *augment)
{
	struct rb_node *child = node->rb_right, *tmp = node->rb_left;
	struct rb_node *parent, *rebalance;
	unsigned long pc;

	if (!tmp) {
		/*
		 * Case 1: node to erase has no more than 1 child (easy!)
		 *
		 * Note that if there is one child it must be red due to 5)
		 * and node must be black due to 4). We adjust colors locally
		 * so as to bypass __rb_erase_color() later on.
		 */
		pc = node->__rb_parent_color;
		parent = __rb_parent(pc);
		__rb_change_child(node, child, parent, root);
		if (child) {
			/* child takes over node's parent and colour */
			child->__rb_parent_color = pc;
			rebalance = NULL;
		} else
			rebalance = __rb_is_black(pc) ? parent : NULL;
		/* augmented data is re-propagated from node's old parent */
		tmp = parent;
	} else if (!child) {
		/* Still case 1, but this time the child is node->rb_left */
		tmp->__rb_parent_color = pc = node->__rb_parent_color;
		parent = __rb_parent(pc);
		__rb_change_child(node, tmp, parent, root);
		rebalance = NULL;
		tmp = parent;
	} else {
		struct rb_node *successor = child, *child2;
		tmp = child->rb_left;
		if (!tmp) {
			/*
			 * Case 2: node's successor is its right child
			 *
			 *    (n)          (s)
			 *    / \          / \
			 *  (x) (s)  ->  (x) (c)
			 *        \
			 *        (c)
			 */
			parent = successor;
			child2 = successor->rb_right;
			augment->copy(node, successor);
		} else {
			/*
			 * Case 3: node's successor is leftmost under
			 * node's right child subtree
			 *
			 *    (n)          (s)
			 *    / \          / \
			 *  (x) (y)  ->  (x) (y)
			 *      /            /
			 *    (p)          (p)
			 *    /            /
			 *  (s)          (c)
			 *    \
			 *    (c)
			 */
			/* descend to the leftmost node, tracking its parent */
			do {
				parent = successor;
				successor = tmp;
				tmp = tmp->rb_left;
			} while (tmp);
			/* successor's right child fills successor's old slot */
			parent->rb_left = child2 = successor->rb_right;
			successor->rb_right = child;
			rb_set_parent(child, successor);
			augment->copy(node, successor);
			/*
			 * recompute augmented data from successor's old
			 * parent up to, but not including, successor
			 */
			augment->propagate(parent, successor);
		}

		/* successor adopts node's left subtree */
		successor->rb_left = tmp = node->rb_left;
		rb_set_parent(tmp, successor);

		pc = node->__rb_parent_color;
		tmp = __rb_parent(pc);
		__rb_change_child(node, successor, tmp, root);
		if (child2) {
			/*
			 * successor takes node's parent and colour; child2 is
			 * attached under 'parent' and painted black
			 */
			successor->__rb_parent_color = pc;
			rb_set_parent_color(child2, parent, RB_BLACK);
			rebalance = NULL;
		} else {
			/*
			 * save successor's own colour before overwriting it:
			 * it decides whether rebalancing is needed
			 */
			unsigned long pc2 = successor->__rb_parent_color;
			successor->__rb_parent_color = pc;
			rebalance = __rb_is_black(pc2) ? parent : NULL;
		}
		tmp = successor;
	}

	/* NULL stop: propagate augmented data all the way to the root */
	augment->propagate(tmp, NULL);
	return rebalance;
}
222 | 
223 | static inline void
224 | rb_erase_augmented(struct rb_node *node, struct rb_root *root,
225 | 		   const struct rb_augment_callbacks *augment)
226 | {
227 | 	struct rb_node *rebalance = __rb_erase_augmented(node, root, augment);
228 | 	if (rebalance)
229 | 		__rb_erase_color(rebalance, root, augment->rotate);
230 | }
231 | 
232 | #endif	/* _LINUX_RBTREE_AUGMENTED_H */
233 | 


--------------------------------------------------------------------------------
/utils/rbtree/rbtree.txt:
--------------------------------------------------------------------------------
  1 | Red-black Trees (rbtree) in Linux
  2 | January 18, 2007
  3 | Rob Landley <rob@landley.net>
  4 | =============================
  5 | 
  6 | What are red-black trees, and what are they for?
  7 | ------------------------------------------------
  8 | 
  9 | Red-black trees are a type of self-balancing binary search tree, used for
 10 | storing sortable key/value data pairs.  This differs from radix trees (which
 11 | are used to efficiently store sparse arrays and thus use long integer indexes
 12 | to insert/access/delete nodes) and hash tables (which are not kept sorted to
 13 | be easily traversed in order, and must be tuned for a specific size and
 14 | hash function where rbtrees scale gracefully storing arbitrary keys).
 15 | 
 16 | Red-black trees are similar to AVL trees, but provide faster real-time bounded
 17 | worst case performance for insertion and deletion (at most two rotations and
 18 | three rotations, respectively, to balance the tree), with slightly slower
 19 | (but still O(log n)) lookup time.
 20 | 
 21 | To quote Linux Weekly News:
 22 | 
 23 |     There are a number of red-black trees in use in the kernel.
 24 |     The deadline and CFQ I/O schedulers employ rbtrees to
 25 |     track requests; the packet CD/DVD driver does the same.
 26 |     The high-resolution timer code uses an rbtree to organize outstanding
 27 |     timer requests.  The ext3 filesystem tracks directory entries in a
 28 |     red-black tree.  Virtual memory areas (VMAs) are tracked with red-black
 29 |     trees, as are epoll file descriptors, cryptographic keys, and network
 30 |     packets in the "hierarchical token bucket" scheduler.
 31 | 
 32 | This document covers use of the Linux rbtree implementation.  For more
 33 | information on the nature and implementation of Red Black Trees,  see:
 34 | 
 35 |   Linux Weekly News article on red-black trees
 36 |     http://lwn.net/Articles/184495/
 37 | 
 38 |   Wikipedia entry on red-black trees
 39 |     http://en.wikipedia.org/wiki/Red-black_tree
 40 | 
 41 | Linux implementation of red-black trees
 42 | ---------------------------------------
 43 | 
 44 | Linux's rbtree implementation lives in the file "lib/rbtree.c".  To use it,
 45 | "#include <linux/rbtree.h>".
 46 | 
 47 | The Linux rbtree implementation is optimized for speed, and thus has one
 48 | less layer of indirection (and better cache locality) than more traditional
 49 | tree implementations.  Instead of using pointers to separate rb_node and data
 50 | structures, each instance of struct rb_node is embedded in the data structure
 51 | it organizes.  And instead of using a comparison callback function pointer,
 52 | users are expected to write their own tree search and insert functions
 53 | which call the provided rbtree functions.  Locking is also left up to the
 54 | user of the rbtree code.
 55 | 
 56 | Creating a new rbtree
 57 | ---------------------
 58 | 
 59 | Data nodes in an rbtree are structures containing a struct rb_node member:
 60 | 
 61 |   struct mytype {
 62 |   	struct rb_node node;
 63 |   	char *keystring;
 64 |   };
 65 | 
 66 | When dealing with a pointer to the embedded struct rb_node, the containing data
 67 | structure may be accessed with the standard container_of() macro.  In addition,
 68 | individual members may be accessed directly via rb_entry(node, type, member).
 69 | 
 70 | At the root of each rbtree is an rb_root structure, which is initialized to be
 71 | empty via:
 72 | 
 73 |   struct rb_root mytree = RB_ROOT;
 74 | 
 75 | Searching for a value in an rbtree
 76 | ----------------------------------
 77 | 
 78 | Writing a search function for your tree is fairly straightforward: start at the
 79 | root, compare each value, and follow the left or right branch as necessary.
 80 | 
 81 | Example:
 82 | 
 83 |   struct mytype *my_search(struct rb_root *root, char *string)
 84 |   {
 85 |   	struct rb_node *node = root->rb_node;
 86 | 
 87 |   	while (node) {
 88 |   		struct mytype *data = container_of(node, struct mytype, node);
 89 | 		int result;
 90 | 
 91 | 		result = strcmp(string, data->keystring);
 92 | 
 93 | 		if (result < 0)
 94 |   			node = node->rb_left;
 95 | 		else if (result > 0)
 96 |   			node = node->rb_right;
 97 | 		else
 98 |   			return data;
 99 | 	}
100 | 	return NULL;
101 |   }
102 | 
103 | Inserting data into an rbtree
104 | -----------------------------
105 | 
106 | Inserting data in the tree involves first searching for the place to insert the
107 | new node, then inserting the node and rebalancing ("recoloring") the tree.
108 | 
109 | The search for insertion differs from the previous search by finding the
110 | location of the pointer on which to graft the new node.  The new node also
111 | needs a link to its parent node for rebalancing purposes.
112 | 
113 | Example:
114 | 
115 |   int my_insert(struct rb_root *root, struct mytype *data)
116 |   {
117 |   	struct rb_node **new = &(root->rb_node), *parent = NULL;
118 | 
119 |   	/* Figure out where to put new node */
120 |   	while (*new) {
121 |   		struct mytype *this = container_of(*new, struct mytype, node);
122 |   		int result = strcmp(data->keystring, this->keystring);
123 | 
124 | 		parent = *new;
125 |   		if (result < 0)
126 |   			new = &((*new)->rb_left);
127 |   		else if (result > 0)
128 |   			new = &((*new)->rb_right);
129 |   		else
130 |   			return FALSE;
131 |   	}
132 | 
133 |   	/* Add new node and rebalance tree. */
134 |   	rb_link_node(&data->node, parent, new);
135 |   	rb_insert_color(&data->node, root);
136 | 
137 | 	return TRUE;
138 |   }
139 | 
140 | Removing or replacing existing data in an rbtree
141 | ------------------------------------------------
142 | 
143 | To remove an existing node from a tree, call:
144 | 
145 |   void rb_erase(struct rb_node *victim, struct rb_root *tree);
146 | 
147 | Example:
148 | 
149 |   struct mytype *data = my_search(&mytree, "walrus");
150 | 
151 |   if (data) {
152 |   	rb_erase(&data->node, &mytree);
153 |   	myfree(data);
154 |   }
155 | 
156 | To replace an existing node in a tree with a new one with the same key, call:
157 | 
158 |   void rb_replace_node(struct rb_node *old, struct rb_node *new,
159 |   			struct rb_root *tree);
160 | 
161 | Replacing a node this way does not re-sort the tree: If the new node doesn't
162 | have the same key as the old node, the rbtree will probably become corrupted.
163 | 
164 | Iterating through the elements stored in an rbtree (in sort order)
165 | ------------------------------------------------------------------
166 | 
167 | Four functions are provided for iterating through an rbtree's contents in
168 | sorted order.  These work on arbitrary trees, and should not need to be
169 | modified or wrapped (except for locking purposes):
170 | 
171 |   struct rb_node *rb_first(struct rb_root *tree);
172 |   struct rb_node *rb_last(struct rb_root *tree);
173 |   struct rb_node *rb_next(struct rb_node *node);
174 |   struct rb_node *rb_prev(struct rb_node *node);
175 | 
176 | To start iterating, call rb_first() or rb_last() with a pointer to the root
177 | of the tree, which will return a pointer to the node structure contained in
178 | the first or last element in the tree.  To continue, fetch the next or previous
179 | node by calling rb_next() or rb_prev() on the current node.  This will return
180 | NULL when there are no more nodes left.
181 | 
182 | The iterator functions return a pointer to the embedded struct rb_node, from
183 | which the containing data structure may be accessed with the container_of()
184 | macro, and individual members may be accessed directly via
185 | rb_entry(node, type, member).
186 | 
187 | Example:
188 | 
189 |   struct rb_node *node;
190 |   for (node = rb_first(&mytree); node; node = rb_next(node))
191 | 	printk("key=%s\n", rb_entry(node, struct mytype, node)->keystring);
192 | 
193 | Support for Augmented rbtrees
194 | -----------------------------
195 | 
196 | Augmented rbtree is an rbtree with "some" additional data stored in
197 | each node, where the additional data for node N must be a function of
198 | the contents of all nodes in the subtree rooted at N. This data can
199 | be used to augment some new functionality to rbtree. Augmented rbtree
200 | is an optional feature built on top of basic rbtree infrastructure.
201 | An rbtree user who wants this feature will have to call the augmentation
202 | functions with the user provided augmentation callback when inserting
203 | and erasing nodes.
204 | 
205 | C files implementing augmented rbtree manipulation must include
206 | <linux/rbtree_augmented.h> instead of <linux/rbtree.h>. Note that
207 | linux/rbtree_augmented.h exposes some rbtree implementation details
208 | you are not expected to rely on; please stick to the documented APIs
209 | there and do not include <linux/rbtree_augmented.h> from header files
210 | either so as to minimize chances of your users accidentally relying on
211 | such implementation details.
212 | 
213 | On insertion, the user must update the augmented information on the path
214 | leading to the inserted node, then call rb_link_node() as usual and
215 | rb_insert_augmented() instead of the usual rb_insert_color() call.
216 | If rb_insert_augmented() rebalances the rbtree, it will callback into
217 | a user provided function to update the augmented information on the
218 | affected subtrees.
219 | 
220 | When erasing a node, the user must call rb_erase_augmented() instead of
221 | rb_erase(). rb_erase_augmented() calls back into user provided functions
222 | to update the augmented information on affected subtrees.
223 | 
224 | In both cases, the callbacks are provided through struct rb_augment_callbacks.
225 | 3 callbacks must be defined:
226 | 
227 | - A propagation callback, which updates the augmented value for a given
228 |   node and its ancestors, up to a given stop point (or NULL to update
229 |   all the way to the root).
230 | 
231 | - A copy callback, which copies the augmented value for a given subtree
232 |   to a newly assigned subtree root.
233 | 
234 | - A tree rotation callback, which copies the augmented value for a given
235 |   subtree to a newly assigned subtree root AND recomputes the augmented
236 |   information for the former subtree root.
237 | 
238 | The compiled code for rb_erase_augmented() may inline the propagation and
239 | copy callbacks, which results in a large function, so each augmented rbtree
240 | user should have a single rb_erase_augmented() call site in order to limit
241 | compiled code size.
242 | 
243 | 
244 | Sample usage:
245 | 
246 | Interval tree is an example of augmented rb tree. Reference -
247 | "Introduction to Algorithms" by Cormen, Leiserson, Rivest and Stein.
248 | More details about interval trees:
249 | 
250 | Classical rbtree has a single key and it cannot be directly used to store
251 | interval ranges like [lo:hi] and do a quick lookup for any overlap with a new
252 | lo:hi or to find whether there is an exact match for a new lo:hi.
253 | 
254 | However, rbtree can be augmented to store such interval ranges in a structured
255 | way making it possible to do efficient lookup and exact match.
256 | 
257 | This "extra information" stored in each node is the maximum hi
258 | (max_hi) value among all the nodes that are its descendants. This
259 | information can be maintained at each node just by looking at the node
260 | and its immediate children. And this will be used in O(log n) lookup
261 | for lowest match (lowest start address among all possible matches)
262 | with something like:
263 | 
264 | struct interval_tree_node *
265 | interval_tree_first_match(struct rb_root *root,
266 | 			  unsigned long start, unsigned long last)
267 | {
268 | 	struct interval_tree_node *node;
269 | 
270 | 	if (!root->rb_node)
271 | 		return NULL;
272 | 	node = rb_entry(root->rb_node, struct interval_tree_node, rb);
273 | 
274 | 	while (true) {
275 | 		if (node->rb.rb_left) {
276 | 			struct interval_tree_node *left =
277 | 				rb_entry(node->rb.rb_left,
278 | 					 struct interval_tree_node, rb);
279 | 			if (left->__subtree_last >= start) {
280 | 				/*
281 | 				 * Some nodes in left subtree satisfy Cond2.
282 | 				 * Iterate to find the leftmost such node N.
283 | 				 * If it also satisfies Cond1, that's the match
284 | 				 * we are looking for. Otherwise, there is no
285 | 				 * matching interval as nodes to the right of N
286 | 				 * can't satisfy Cond1 either.
287 | 				 */
288 | 				node = left;
289 | 				continue;
290 | 			}
291 | 		}
292 | 		if (node->start <= last) {		/* Cond1 */
293 | 			if (node->last >= start)	/* Cond2 */
294 | 				return node;	/* node is leftmost match */
295 | 			if (node->rb.rb_right) {
296 | 				node = rb_entry(node->rb.rb_right,
297 | 					struct interval_tree_node, rb);
298 | 				if (node->__subtree_last >= start)
299 | 					continue;
300 | 			}
301 | 		}
302 | 		return NULL;	/* No match */
303 | 	}
304 | }
305 | 
306 | Insertion/removal are defined using the following augmented callbacks:
307 | 
308 | static inline unsigned long
309 | compute_subtree_last(struct interval_tree_node *node)
310 | {
311 | 	unsigned long max = node->last, subtree_last;
312 | 	if (node->rb.rb_left) {
313 | 		subtree_last = rb_entry(node->rb.rb_left,
314 | 			struct interval_tree_node, rb)->__subtree_last;
315 | 		if (max < subtree_last)
316 | 			max = subtree_last;
317 | 	}
318 | 	if (node->rb.rb_right) {
319 | 		subtree_last = rb_entry(node->rb.rb_right,
320 | 			struct interval_tree_node, rb)->__subtree_last;
321 | 		if (max < subtree_last)
322 | 			max = subtree_last;
323 | 	}
324 | 	return max;
325 | }
326 | 
327 | static void augment_propagate(struct rb_node *rb, struct rb_node *stop)
328 | {
329 | 	while (rb != stop) {
330 | 		struct interval_tree_node *node =
331 | 			rb_entry(rb, struct interval_tree_node, rb);
332 | 		unsigned long subtree_last = compute_subtree_last(node);
333 | 		if (node->__subtree_last == subtree_last)
334 | 			break;
335 | 		node->__subtree_last = subtree_last;
336 | 		rb = rb_parent(&node->rb);
337 | 	}
338 | }
339 | 
340 | static void augment_copy(struct rb_node *rb_old, struct rb_node *rb_new)
341 | {
342 | 	struct interval_tree_node *old =
343 | 		rb_entry(rb_old, struct interval_tree_node, rb);
344 | 	struct interval_tree_node *new =
345 | 		rb_entry(rb_new, struct interval_tree_node, rb);
346 | 
347 | 	new->__subtree_last = old->__subtree_last;
348 | }
349 | 
350 | static void augment_rotate(struct rb_node *rb_old, struct rb_node *rb_new)
351 | {
352 | 	struct interval_tree_node *old =
353 | 		rb_entry(rb_old, struct interval_tree_node, rb);
354 | 	struct interval_tree_node *new =
355 | 		rb_entry(rb_new, struct interval_tree_node, rb);
356 | 
357 | 	new->__subtree_last = old->__subtree_last;
358 | 	old->__subtree_last = compute_subtree_last(old);
359 | }
360 | 
361 | static const struct rb_augment_callbacks augment_callbacks = {
362 | 	augment_propagate, augment_copy, augment_rotate
363 | };
364 | 
365 | void interval_tree_insert(struct interval_tree_node *node,
366 | 			  struct rb_root *root)
367 | {
368 | 	struct rb_node **link = &root->rb_node, *rb_parent = NULL;
369 | 	unsigned long start = node->start, last = node->last;
370 | 	struct interval_tree_node *parent;
371 | 
372 | 	while (*link) {
373 | 		rb_parent = *link;
374 | 		parent = rb_entry(rb_parent, struct interval_tree_node, rb);
375 | 		if (parent->__subtree_last < last)
376 | 			parent->__subtree_last = last;
377 | 		if (start < parent->start)
378 | 			link = &parent->rb.rb_left;
379 | 		else
380 | 			link = &parent->rb.rb_right;
381 | 	}
382 | 
383 | 	node->__subtree_last = last;
384 | 	rb_link_node(&node->rb, rb_parent, link);
385 | 	rb_insert_augmented(&node->rb, root, &augment_callbacks);
386 | }
387 | 
388 | void interval_tree_remove(struct interval_tree_node *node,
389 | 			  struct rb_root *root)
390 | {
391 | 	rb_erase_augmented(&node->rb, root, &augment_callbacks);
392 | }
393 | 


--------------------------------------------------------------------------------
/utils/rbtree/src/rbtree.c:
--------------------------------------------------------------------------------
  1 | /*
  2 |   Red Black Trees
  3 |   (C) 1999  Andrea Arcangeli <andrea@suse.de>
  4 |   (C) 2002  David Woodhouse <dwmw2@infradead.org>
  5 |   (C) 2012  Michel Lespinasse <walken@google.com>
  6 | 
  7 |   This program is free software; you can redistribute it and/or modify
  8 |   it under the terms of the GNU General Public License as published by
  9 |   the Free Software Foundation; either version 2 of the License, or
 10 |   (at your option) any later version.
 11 | 
 12 |   This program is distributed in the hope that it will be useful,
 13 |   but WITHOUT ANY WARRANTY; without even the implied warranty of
 14 |   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 15 |   GNU General Public License for more details.
 16 | 
 17 |   You should have received a copy of the GNU General Public License
 18 |   along with this program; if not, write to the Free Software
 19 |   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 20 | 
 21 |   linux/lib/rbtree.c
 22 | */
 23 | 
 24 | #include <rbtree_augmented.h>
 25 | #include <export.h>
 26 | 
 27 | /*
 28 |  * red-black trees properties:  http://en.wikipedia.org/wiki/Rbtree
 29 |  *
 30 |  *  1) A node is either red or black
 31 |  *  2) The root is black
 32 |  *  3) All leaves (NULL) are black
 33 |  *  4) Both children of every red node are black
 34 |  *  5) Every simple path from root to leaves contains the same number
 35 |  *     of black nodes.
 36 |  *
 37 |  *  4 and 5 give the O(log n) guarantee, since 4 implies you cannot have two
 38 |  *  consecutive red nodes in a path and every red node is therefore followed by
 39 |  *  a black. So if B is the number of black nodes on every simple path (as per
 40 |  *  5), then the longest possible path due to 4 is 2B.
 41 |  *
 42 |  *  We shall indicate color with case, where black nodes are uppercase and red
 43 |  *  nodes will be lowercase. Unknown color nodes shall be drawn as red within
 44 |  *  parentheses and have some accompanying text comment.
 45 |  */
 46 | 
 47 | static inline void rb_set_black(struct rb_node *rb)
 48 | {
 49 | 	rb->__rb_parent_color |= RB_BLACK;
 50 | }
 51 | 
 52 | static inline struct rb_node *rb_red_parent(struct rb_node *red)
 53 | {
 54 | 	return (struct rb_node *)red->__rb_parent_color;
 55 | }
 56 | 
 57 | /*
 58 |  * Helper function for rotations:
 59 |  * - old's parent and color get assigned to new
 60 |  * - old gets assigned new as a parent and 'color' as a color.
 61 |  */
 62 | static inline void
 63 | __rb_rotate_set_parents(struct rb_node *old, struct rb_node *new,
 64 | 			struct rb_root *root, int color)
 65 | {
 66 | 	struct rb_node *parent = rb_parent(old);
 67 | 	new->__rb_parent_color = old->__rb_parent_color;
 68 | 	rb_set_parent_color(old, new, color);
 69 | 	__rb_change_child(old, new, parent, root);
 70 | }
 71 | 
/*
 * __rb_insert - restore the red-black properties after linking @node
 * @node:           newly linked node; its parent word is read with
 *                  rb_red_parent(), so it is treated as red on entry
 * @root:           tree the node was linked into
 * @augment_rotate: hook invoked after every rotation as
 *                  augment_rotate(old subtree root, new subtree root)
 *
 * Walks up from @node.  While the parent is red it either recolours
 * (Case 1, then continues from the grandparent) or performs one or two
 * rotations (Cases 2 and 3) and stops.  The second half of the loop body
 * is the mirror image of the first, for a parent that is a right child.
 */
static inline void
__rb_insert(struct rb_node *node, struct rb_root *root,
	    void (*augment_rotate)(struct rb_node *old, struct rb_node *new))
{
	struct rb_node *parent = rb_red_parent(node), *gparent, *tmp;

	while (true) {
		/*
		 * Loop invariant: node is red
		 *
		 * If there is a black parent, we are done.
		 * Otherwise, take some corrective action as we don't
		 * want a red root or two consecutive red nodes.
		 */
		if (!parent) {
			/* node is the root: paint it black and stop */
			rb_set_parent_color(node, NULL, RB_BLACK);
			break;
		} else if (rb_is_black(parent))
			break;

		/* parent is red here, so its word is the bare gparent pointer */
		gparent = rb_red_parent(parent);

		tmp = gparent->rb_right;
		if (parent != tmp) {	/* parent == gparent->rb_left */
			if (tmp && rb_is_red(tmp)) {
				/*
				 * Case 1 - color flips
				 *
				 *       G            g
				 *      / \          / \
				 *     p   u  -->   P   U
				 *    /            /
				 *   n            N
				 *
				 * However, since g's parent might be red, and
				 * 4) does not allow this, we need to recurse
				 * at g.
				 */
				rb_set_parent_color(tmp, gparent, RB_BLACK);
				rb_set_parent_color(parent, gparent, RB_BLACK);
				node = gparent;
				parent = rb_parent(node);
				rb_set_parent_color(node, parent, RB_RED);
				continue;
			}

			tmp = parent->rb_right;
			if (node == tmp) {
				/*
				 * Case 2 - left rotate at parent
				 *
				 *      G             G
				 *     / \           / \
				 *    p   U  -->    n   U
				 *     \           /
				 *      n         p
				 *
				 * This still leaves us in violation of 4), the
				 * continuation into Case 3 will fix that.
				 */
				parent->rb_right = tmp = node->rb_left;
				node->rb_left = parent;
				if (tmp)
					rb_set_parent_color(tmp, parent,
							    RB_BLACK);
				rb_set_parent_color(parent, node, RB_RED);
				augment_rotate(parent, node);
				/* continue into Case 3 with node as the parent */
				parent = node;
				tmp = node->rb_right;
			}

			/*
			 * Case 3 - right rotate at gparent
			 *
			 *        G           P
			 *       / \         / \
			 *      p   U  -->  n   g
			 *     /                 \
			 *    n                   U
			 */
			gparent->rb_left = tmp;  /* == parent->rb_right */
			parent->rb_right = gparent;
			if (tmp)
				rb_set_parent_color(tmp, gparent, RB_BLACK);
			__rb_rotate_set_parents(gparent, parent, root, RB_RED);
			augment_rotate(gparent, parent);
			break;
		} else {
			/* mirror image: parent == gparent->rb_right */
			tmp = gparent->rb_left;
			if (tmp && rb_is_red(tmp)) {
				/* Case 1 - color flips */
				rb_set_parent_color(tmp, gparent, RB_BLACK);
				rb_set_parent_color(parent, gparent, RB_BLACK);
				node = gparent;
				parent = rb_parent(node);
				rb_set_parent_color(node, parent, RB_RED);
				continue;
			}

			tmp = parent->rb_left;
			if (node == tmp) {
				/* Case 2 - right rotate at parent */
				parent->rb_left = tmp = node->rb_right;
				node->rb_right = parent;
				if (tmp)
					rb_set_parent_color(tmp, parent,
							    RB_BLACK);
				rb_set_parent_color(parent, node, RB_RED);
				augment_rotate(parent, node);
				parent = node;
				tmp = node->rb_left;
			}

			/* Case 3 - left rotate at gparent */
			gparent->rb_right = tmp;  /* == parent->rb_left */
			parent->rb_left = gparent;
			if (tmp)
				rb_set_parent_color(tmp, gparent, RB_BLACK);
			__rb_rotate_set_parents(gparent, parent, root, RB_RED);
			augment_rotate(gparent, parent);
			break;
		}
	}
}
196 | 
197 | /*
198 |  * Rebalance after an erase, starting at @parent (never NULL); @augment_rotate
199 |  * is called after every rotation. Inline so rb_erase() can drop dummy_rotate.
200 |  */
201 | static inline void
202 | ____rb_erase_color(struct rb_node *parent, struct rb_root *root,
203 | 	void (*augment_rotate)(struct rb_node *old, struct rb_node *new))
204 | {
205 | 	struct rb_node *node = NULL, *sibling, *tmp1, *tmp2;
206 | 
207 | 	while (true) {
208 | 		/*
209 | 		 * Loop invariants:
210 | 		 * - node is black (or NULL on first iteration)
211 | 		 * - node is not the root (parent is not NULL)
212 | 		 * - All leaf paths going through parent and node have a
213 | 		 *   black node count that is 1 lower than other leaf paths.
214 | 		 */
215 | 		sibling = parent->rb_right;
216 | 		if (node != sibling) {	/* node == parent->rb_left */
217 | 			if (rb_is_red(sibling)) {
218 | 				/*
219 | 				 * Case 1 - left rotate at parent
220 | 				 *
221 | 				 *     P               S
222 | 				 *    / \             / \
223 | 				 *   N   s    -->    p   Sr
224 | 				 *      / \         / \
225 | 				 *     Sl  Sr      N   Sl
226 | 				 */
227 | 				parent->rb_right = tmp1 = sibling->rb_left;
228 | 				sibling->rb_left = parent;
229 | 				rb_set_parent_color(tmp1, parent, RB_BLACK);
230 | 				__rb_rotate_set_parents(parent, sibling, root,
231 | 							RB_RED);
232 | 				augment_rotate(parent, sibling);
233 | 				sibling = tmp1;
234 | 			}
235 | 			tmp1 = sibling->rb_right;
236 | 			if (!tmp1 || rb_is_black(tmp1)) {
237 | 				tmp2 = sibling->rb_left;
238 | 				if (!tmp2 || rb_is_black(tmp2)) {
239 | 					/*
240 | 					 * Case 2 - sibling color flip
241 | 					 * (p could be either color here)
242 | 					 *
243 | 					 *    (p)           (p)
244 | 					 *    / \           / \
245 | 					 *   N   S    -->  N   s
246 | 					 *      / \           / \
247 | 					 *     Sl  Sr        Sl  Sr
248 | 					 *
249 | 					 * This leaves us violating 5) which
250 | 					 * can be fixed by flipping p to black
251 | 					 * if it was red, or by recursing at p.
252 | 					 * p is red when coming from Case 1.
253 | 					 */
254 | 					rb_set_parent_color(sibling, parent,
255 | 							    RB_RED);
256 | 					if (rb_is_red(parent))
257 | 						rb_set_black(parent);
258 | 					else {
259 | 						node = parent;
260 | 						parent = rb_parent(node);
261 | 						if (parent)
262 | 							continue;
263 | 					}
264 | 					break;
265 | 				}
266 | 				/*
267 | 				 * Case 3 - right rotate at sibling
268 | 				 * (p could be either color here)
269 | 				 *
270 | 				 *   (p)           (p)
271 | 				 *   / \           / \
272 | 				 *  N   S    -->  N   Sl
273 | 				 *     / \             \
274 | 				 *    sl  Sr            s
275 | 				 *                       \
276 | 				 *                        Sr
277 | 				 */
278 | 				sibling->rb_left = tmp1 = tmp2->rb_right;
279 | 				tmp2->rb_right = sibling;
280 | 				parent->rb_right = tmp2;
281 | 				if (tmp1)
282 | 					rb_set_parent_color(tmp1, sibling,
283 | 							    RB_BLACK);
284 | 				augment_rotate(sibling, tmp2);
285 | 				tmp1 = sibling;	/* Case 4 below sets tmp1 black */
286 | 				sibling = tmp2;
287 | 			}
288 | 			/*
289 | 			 * Case 4 - left rotate at parent + color flips
290 | 			 * (p and sl could be either color here.
291 | 			 *  After rotation, p becomes black, s acquires
292 | 			 *  p's color, and sl keeps its color)
293 | 			 *
294 | 			 *      (p)             (s)
295 | 			 *      / \             / \
296 | 			 *     N   S     -->   P   Sr
297 | 			 *        / \         / \
298 | 			 *      (sl) sr      N  (sl)
299 | 			 */
300 | 			parent->rb_right = tmp2 = sibling->rb_left;
301 | 			sibling->rb_left = parent;
302 | 			rb_set_parent_color(tmp1, sibling, RB_BLACK);
303 | 			if (tmp2)
304 | 				rb_set_parent(tmp2, parent);
305 | 			__rb_rotate_set_parents(parent, sibling, root,
306 | 						RB_BLACK);
307 | 			augment_rotate(parent, sibling);
308 | 			break;
309 | 		} else {	/* mirror image: node == parent->rb_right */
310 | 			sibling = parent->rb_left;
311 | 			if (rb_is_red(sibling)) {
312 | 				/* Case 1 - right rotate at parent */
313 | 				parent->rb_left = tmp1 = sibling->rb_right;
314 | 				sibling->rb_right = parent;
315 | 				rb_set_parent_color(tmp1, parent, RB_BLACK);
316 | 				__rb_rotate_set_parents(parent, sibling, root,
317 | 							RB_RED);
318 | 				augment_rotate(parent, sibling);
319 | 				sibling = tmp1;
320 | 			}
321 | 			tmp1 = sibling->rb_left;
322 | 			if (!tmp1 || rb_is_black(tmp1)) {
323 | 				tmp2 = sibling->rb_right;
324 | 				if (!tmp2 || rb_is_black(tmp2)) {
325 | 					/* Case 2 - sibling color flip */
326 | 					rb_set_parent_color(sibling, parent,
327 | 							    RB_RED);
328 | 					if (rb_is_red(parent))
329 | 						rb_set_black(parent);
330 | 					else {
331 | 						node = parent;
332 | 						parent = rb_parent(node);
333 | 						if (parent)
334 | 							continue;
335 | 					}
336 | 					break;
337 | 				}
338 | 				/* Case 3 - left rotate at sibling */
339 | 				sibling->rb_right = tmp1 = tmp2->rb_left;
340 | 				tmp2->rb_left = sibling;
341 | 				parent->rb_left = tmp2;
342 | 				if (tmp1)
343 | 					rb_set_parent_color(tmp1, sibling,
344 | 							    RB_BLACK);
345 | 				augment_rotate(sibling, tmp2);
346 | 				tmp1 = sibling;	/* Case 4 below sets tmp1 black */
347 | 				sibling = tmp2;
348 | 			}
349 | 			/* Case 4 - right rotate at parent + color flips */
350 | 			parent->rb_left = tmp2 = sibling->rb_right;
351 | 			sibling->rb_right = parent;
352 | 			rb_set_parent_color(tmp1, sibling, RB_BLACK);
353 | 			if (tmp2)
354 | 				rb_set_parent(tmp2, parent);
355 | 			__rb_rotate_set_parents(parent, sibling, root,
356 | 						RB_BLACK);
357 | 			augment_rotate(parent, sibling);
358 | 			break;
359 | 		}
360 | 	}
361 | }
362 | 
363 | /* Out-of-line entry to ____rb_erase_color(), for rb_erase_augmented() use */
364 | void __rb_erase_color(struct rb_node *parent, struct rb_root *root,
365 | 	void (*augment_rotate)(struct rb_node *old, struct rb_node *new))
366 | {
367 | 	____rb_erase_color(parent, root, augment_rotate);
368 | }
369 | EXPORT_SYMBOL(__rb_erase_color);
370 | 
371 | /*
372 |  * Non-augmented rbtree manipulation functions.
373 |  *
374 |  * rb_insert_color() and rb_erase() pass these empty callbacks where the
375 |  * augmented code expects hooks, so that the compiler can optimize them out.
376 |  */
377 | 
378 | static inline void dummy_propagate(struct rb_node *node, struct rb_node *stop) {}
379 | static inline void dummy_copy(struct rb_node *old, struct rb_node *new) {}
380 | static inline void dummy_rotate(struct rb_node *old, struct rb_node *new) {}
381 | 
382 | static const struct rb_augment_callbacks dummy_callbacks = {
383 | 	dummy_propagate, dummy_copy, dummy_rotate	/* positional: relies on the struct's field order */
384 | };
385 | 
386 | void rb_insert_color(struct rb_node *node, struct rb_root *root)
387 | {
388 | 	__rb_insert(node, root, dummy_rotate);	/* non-augmented: the rotate hook is a no-op */
389 | }
390 | EXPORT_SYMBOL(rb_insert_color);
391 | 
392 | void rb_erase(struct rb_node *node, struct rb_root *root)
393 | {
394 | 	struct rb_node *fixup = __rb_erase_augmented(node, root, &dummy_callbacks);
395 | 
396 | 	if (fixup != NULL)	/* non-NULL: rebalance starting at this parent */
397 | 		____rb_erase_color(fixup, root, dummy_rotate);
398 | }
399 | EXPORT_SYMBOL(rb_erase);
400 | 
401 | /*
402 |  * Augmented rbtree manipulation functions.
403 |  *
404 |  * This runs the same __rb_insert() code as the non-augmented case, but
405 |  * forwards the caller-supplied augment_rotate callback instead of a dummy.
406 |  */
407 | 
408 | void __rb_insert_augmented(struct rb_node *node, struct rb_root *root,
409 | 	void (*augment_rotate)(struct rb_node *old, struct rb_node *new))
410 | {
411 | 	__rb_insert(node, root, augment_rotate);
412 | }
413 | EXPORT_SYMBOL(__rb_insert_augmented);
414 | 
415 | /*
416 |  * rb_first - leftmost (first in sort order) node of @root, NULL if empty.
417 |  */
418 | struct rb_node *rb_first(const struct rb_root *root)
419 | {
420 | 	struct rb_node *cursor = root->rb_node;
421 | 	struct rb_node *leftmost = NULL;
422 | 
423 | 	/* Walk the left spine; the last node visited is the one returned. */
424 | 	for (; cursor != NULL; cursor = cursor->rb_left)
425 | 		leftmost = cursor;
426 | 
427 | 	return leftmost;
428 | }
429 | EXPORT_SYMBOL(rb_first);
430 | 
431 | struct rb_node *rb_last(const struct rb_root *root)
432 | {
433 | 	struct rb_node *cursor = root->rb_node;
434 | 	struct rb_node *rightmost = NULL;
435 | 
436 | 	/* Walk the right spine; the last node visited is returned (NULL if none). */
437 | 	for (; cursor != NULL; cursor = cursor->rb_right)
438 | 		rightmost = cursor;
439 | 
440 | 	return rightmost;
441 | }
442 | EXPORT_SYMBOL(rb_last);
443 | 
444 | struct rb_node *rb_next(const struct rb_node *node)
445 | {
446 | 	struct rb_node *up;
447 | 
448 | 	/* Nodes flagged by RB_EMPTY_NODE() have no successor. */
449 | 	if (RB_EMPTY_NODE(node))
450 | 		return NULL;
451 | 
452 | 	/* A right subtree exists: its leftmost node is the successor. */
453 | 	if (node->rb_right != NULL) {
454 | 		const struct rb_node *low = node->rb_right;
455 | 
456 | 		while (low->rb_left != NULL)
457 | 			low = low->rb_left;
458 | 		return (struct rb_node *)low;
459 | 	}
460 | 
461 | 	/*
462 | 	 * No right subtree: climb. As long as we come up from a right child
463 | 	 * the ancestor sorts before us, so keep going; the first ancestor
464 | 	 * entered from its left child is the successor. Running out of
465 | 	 * parents (NULL) means there is none.
466 | 	 */
467 | 	for (;;) {
468 | 		up = rb_parent(node);
469 | 		if (up == NULL || node != up->rb_right)
470 | 			return up;
471 | 		node = up;
472 | 	}
473 | }
474 | EXPORT_SYMBOL(rb_next);
475 | 
476 | struct rb_node *rb_prev(const struct rb_node *node)
477 | {
478 | 	struct rb_node *up;
479 | 
480 | 	/* Nodes flagged by RB_EMPTY_NODE() have no predecessor. */
481 | 	if (RB_EMPTY_NODE(node))
482 | 		return NULL;
483 | 
484 | 	if (node->rb_left != NULL) {	/* predecessor: rightmost of left subtree */
485 | 		const struct rb_node *low = node->rb_left;
486 | 
487 | 		while (low->rb_right != NULL)
488 | 			low = low->rb_right;
489 | 		return (struct rb_node *)low;
490 | 	}
491 | 
492 | 	/*
493 | 	 * No left subtree: climb while arriving from a left child. The first
494 | 	 * ancestor entered from its right child is returned (NULL if none).
495 | 	 */
496 | 	for (;;) {
497 | 		up = rb_parent(node);
498 | 		if (up == NULL || node != up->rb_left)
499 | 			return up;
500 | 		node = up;
501 | 	}
502 | }
503 | EXPORT_SYMBOL(rb_prev);
504 | 
505 | void rb_replace_node(struct rb_node *victim, struct rb_node *new,
506 | 		     struct rb_root *root)
507 | {
508 | 	struct rb_node *left = victim->rb_left;
509 | 	struct rb_node *right = victim->rb_right;
510 | 
511 | 	/* Re-point the surrounding nodes (parent side, then children) at @new. */
512 | 	__rb_change_child(victim, new, rb_parent(victim), root);
513 | 	if (left != NULL)
514 | 		rb_set_parent(left, new);
515 | 	if (right != NULL)
516 | 		rb_set_parent(right, new);
517 | 	/* @new takes over @victim's own pointers and colour wholesale. */
518 | 	*new = *victim;
519 | }
520 | EXPORT_SYMBOL(rb_replace_node);
521 | 
522 | static struct rb_node *rb_left_deepest_node(const struct rb_node *node)
523 | {
524 | 	/* Step to the left child when present, else the right; stop at a leaf. */
525 | 	while (node->rb_left != NULL || node->rb_right != NULL) {
526 | 		if (node->rb_left != NULL)
527 | 			node = node->rb_left;
528 | 		else
529 | 			node = node->rb_right;
530 | 	}
531 | 	return (struct rb_node *)node;
532 | }
533 | 
534 | struct rb_node *rb_next_postorder(const struct rb_node *node)
535 | {
536 | 	const struct rb_node *up;
537 | 
538 | 	if (node == NULL)
539 | 		return NULL;
540 | 	up = rb_parent(node);
541 | 
542 | 	/* No parent left to return to: nothing follows @node. */
543 | 	if (up == NULL)
544 | 		return NULL;
545 | 	/* Left child with a right sibling: continue in the sibling's subtree. */
546 | 	if (node == up->rb_left && up->rb_right != NULL)
547 | 		return rb_left_deepest_node(up->rb_right);
548 | 	/* Otherwise the parent's children are all done; the parent is next. */
549 | 	return (struct rb_node *)up;
550 | }
551 | EXPORT_SYMBOL(rb_next_postorder);
552 | 
553 | struct rb_node *rb_first_postorder(const struct rb_root *root)
554 | {
555 | 	const struct rb_node *top = root->rb_node;
556 | 
557 | 	/* Empty tree has no first node; else start at the left-deepest node. */
558 | 	return top ? rb_left_deepest_node(top) : NULL;
559 | }
560 | EXPORT_SYMBOL(rb_first_postorder);
561 | 


--------------------------------------------------------------------------------