├── .eqc_ci ├── EQC_CI_LICENCE.txt ├── LICENSE ├── Makefile ├── README.md ├── doc ├── 5HT.css ├── cr.htm └── images │ ├── log.svg │ ├── merging.svg │ ├── replicas.svg │ └── sup.png ├── include ├── cr.hrl ├── rafter.hrl ├── rafter_consensus_fsm.hrl └── rafter_opts.hrl ├── mad ├── otp.mk ├── rebar.config ├── src ├── backends │ └── cr_kvs.erl ├── consensus │ ├── README.md │ ├── cr_config.erl │ ├── cr_log.erl │ ├── cr_paxon.erl │ ├── cr_rafter.erl │ └── cr_replication.erl ├── cr.app.src ├── cr.erl ├── cr_app.erl ├── cr_hash.erl ├── cr_heart.erl ├── cr_vnode.erl └── tcp │ ├── cr_client.erl │ ├── cr_connection.erl │ ├── cr_interconnect.erl │ ├── cr_ping.erl │ └── cr_tcp.erl ├── sys.config └── vm.args /.eqc_ci: -------------------------------------------------------------------------------- 1 | {build, "./mad dep com pla"}. 2 | {test_path, "ebin"}. 3 | {deps, "deps"}. 4 | {test_root, "test"}. 5 | -------------------------------------------------------------------------------- /EQC_CI_LICENCE.txt: -------------------------------------------------------------------------------- 1 | This file is an agreement between Quviq AB ("Quviq"), Sven Hultins 2 | Gata 9, Gothenburg, Sweden, and the committers to the github 3 | repository in which the file appears ("the owner"). By placing this 4 | file in a github repository, the owner agrees to the terms below. 5 | 6 | The purpose of the agreement is to enable Quviq AB to provide a 7 | continuous integration service to the owner, whereby the code in the 8 | repository ("the source code") is tested using Quviq's test tools, and 9 | the test results are made available on the web. The test results 10 | include test output, generated test cases, and a copy of the source 11 | code in the repository annotated with coverage information ("the test 12 | results"). 13 | 14 | The owner agrees that Quviq may run the tests in the source code and 15 | display the test results on the web, without obligation. 
16 | 17 | The owner warrants that running the tests in the source code and 18 | displaying the test results on the web violates no laws, licences or other 19 | agreements. In the event of such a violation, the owner accepts full 20 | responsibility. 21 | 22 | The owner warrants that the source code is not malicious, and will not 23 | mount an attack on either Quviq's server or any other server--for 24 | example by taking part in a denial of service attack, or by attempting 25 | to send unsolicited emails. 26 | 27 | The owner warrants that the source code does not attempt to reverse 28 | engineer Quviq's code. 29 | 30 | Quviq reserves the right to exclude repositories that break this 31 | agreement from its continuous integration service. 32 | 33 | Any dispute arising from the use of Quviq's service will be resolved 34 | under Swedish law. 35 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2015 Maxim Sokhatsky, Synrc Research Center 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | Software may only be used for the great good and the true happiness of all sentient beings. 11 | 12 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 13 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 14 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 15 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 16 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 17 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 18 | THE SOFTWARE. 19 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | RELEASE := cr 2 | COOKIE := node_runner 3 | VER := 1.0.0 4 | 5 | NAME ?= cr 6 | 7 | default: compile 8 | 9 | include otp.mk 10 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Byzantine Chain Replication Database 2 | ==================================== 3 | 4 | [![Join the chat at https://gitter.im/spawnproc/cr](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/spawnproc/cr?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) 5 | 6 | In banking system demands are very tight. Database 7 | should be at least tripled, stand-by nodes should pick up 8 | master reads from failover node, writes should be 9 | accepted on a reasonable quorum, failover must be followed by recovery, database 10 | should be able to scale even with the RAM/DISC limitations. 11 | 12 | No data should be treated as written otherwise that commited to all replicas. 13 | All this circumstances leads us to chain replication protocol as a simple and natural 14 | feedback to this challenge. 15 | 16 | Different replication techniques exists to satisfy replication demands. 17 | Master-slave replication is most widely known type of replication 18 | used before in such products like GFS, HDFS, mongodb, etc. Quorum Intersection 19 | is another technique used in databases like Cassandra or Amazon Dynamo. 
20 | They mostly provide a consistent distributed repository 21 | for event tables or for file storage. In banking industry 22 | we synchronize account balances and need simple and managable 23 | protocol for storage consistency issuing high demand on system integrity. 24 | 25 | There are several classes of error usually implied when dealing with failure detection. 26 | The most weak class is fail-stop events, when the outage is normal or predictable. 27 | The second class is crash-failures, the ubnormal terminations and outages. The most strong 28 | type of failures are byzantine failures resistant to bit-flips, 29 | hacked parties or any types of compromising the transaction objects. 30 | For banking applications the byzantine fault tolerance is desired, 31 | despite it affects the latency. 32 | 33 | Features 34 | -------- 35 | 36 | * Highly-available CP database :-) 37 | * 2N+1 nodes tolerates N failures 38 | * Consistent hashing DHT 39 | * RAFT for managing server configurations timeline 40 | * HMAC signing for Byzantine capabilities 41 | * Various database backends: mnesia, riak, redis, fs, sql 42 | * High-performance non-blocking TCP acceptor 43 | * Separate endpoints for HEART, CLIENT and SERVER protocols 44 | * Pure, clean and understandable codebase 45 | * Article about CR implementation details: http://synrc.space/apps/cr/doc/cr.htm 46 | * Business Processing Erlang book: http://synrc.space/apps/bpe/doc/book.pdf 47 | 48 | Launch 49 | ------ 50 | 51 | ```bash 52 | make console NAME=cr 53 | make console NAME=cr2 54 | make console NAME=cr3 55 | ``` 56 | 57 | You could start all nodes in separate console sesions or you 58 | can `make start NAME=cr2` nodes and later attach to them with `make attach NAME=cr2`. 59 | Also the start is compatible within single folders, which cause no single problem. 60 | 61 | ```erlang 62 | > timer:tc(cr,test,[500]). 
63 | 64 | =INFO REPORT==== 7-Apr-2015::00:56:34 === 65 | cr:Already in Database: 14020 66 | New record will be applied: 500 67 | {214369,{transactions,11510}} 68 | ``` 69 | 70 | Fore generating sample data, let say 500 transactions you may run with `cr:test(500)`. 71 | By measuring accepring performance it's like `2000 Req/s`. 72 | 73 | ```erlang 74 | > cr:dump(). 75 | 76 | vnode i n top latency 77 | 121791803110908576516973736059690251637994378581 1 1 391 2/198/64 78 | 243583606221817153033947472119380503275988757162 2 1 400 2/183/72 79 | 365375409332725729550921208179070754913983135743 3 1 388 3/195/64 80 | 487167212443634306067894944238761006551977514324 4 1 357 2/183/53 81 | 608959015554542882584868680298451258189971892905 5 2 12994 2/198/67 82 | 730750818665451459101842416358141509827966271486 6 2 13017 3/184/66 83 | 852542621776360035618816152417831761465960650067 7 2 13019 2/201/75 84 | 974334424887268612135789888477522013103955028648 8 2 13020 3/178/62 85 | 1096126227998177188652763624537212264741949407229 9 3 13021 2/190/68 86 | 1217918031109085765169737360596902516379943785810 10 3 13028 3/206/65 87 | 1339709834219994341686711096656592768017938164391 11 3 13030 2/208/55 88 | 1461501637330902918203684832716283019655932542972 12 3 13031 2/185/58 89 | ok 90 | ``` 91 | 92 | The latency in last column `~70 ms` means the moment data is stored on all `mnesia` replicas. 93 | The latency in a given example is for storing async_dirty using KVS 94 | chain linking (from `1 to 3` msg per write operation, from `1 to 2` msg for lookups) 95 | clustered in `3 nodes` with same replicas number. 96 | 97 | Let's say we want to see all the operations log of a given replica `391`. 98 | 99 | ```erlang 100 | > cr:dump(391). 
101 | operation id prev i size 102 | transaction:389:feed::false: 391 387 1 480 103 | transaction:399:feed::false: 387 382 1 500 104 | transaction:375:feed::false: 382 379 1 446 105 | transaction:373:feed::false: 379 378 1 446 106 | transaction:383:feed::false: 378 376 1 473 107 | transaction:392:feed::false: 376 374 1 500 108 | transaction:360:feed::false: 374 371 1 446 109 | transaction:366:feed::false: 371 370 1 473 110 | transaction:370:feed::false: 370 369 1 446 111 | transaction:371:feed::false: 369 368 1 446 112 | ok 113 | ``` 114 | 115 | You may check this from the other side. First retrieve the operation and then 116 | retrieve the transaction created during operation. 117 | 118 | ```erlang 119 | > kvs:get(operation,391). 120 | {ok,#operation{id = 391,version = undefined,container = log, 121 | feed_id = {121791803110908576516973736059690251637994378581,1}, % VNODE 122 | prev = 387,next = undefined,feeds = [],guard = false, 123 | etc = undefined, 124 | body = {prepare,{<0.41.0>,{1428,358105,840469}}, 125 | [{121791803110908576516973736059690251637994378581,1}, % SIGNATURES 126 | {608959015554542882584868680298451258189971892905,2}], 127 | #transaction{id = 389,version = undefined,container = feed, 128 | feed_id = undefined,prev = undefined,next = undefined, 129 | feeds = [],guard = false,etc = undefined, 130 | timestamp = undefined,beneficiary = undefined,...}}, 131 | name = prepare,status = pending}} 132 | ``` 133 | 134 | The transaction. For linking transaction to the link you should use full XA 135 | protocol with two-stage confirmation (1) the PUT operation followed 136 | with (2) LINK operation to some feed, such as user account or customer admin list. 137 | 138 | ```erlang 139 | > kvs:get(transaction,389). 
140 | {ok,#transaction{id = 389,version = undefined, 141 | container = feed, feed_id = undefined, prev = undefined, 142 | next = undefined, feeds = [], guard = false, etc = undefined, 143 | timestamp = [], beneficiary = [], 144 | subsidiary = [], amount = [],tax = [], 145 | ballance = [], currency = [], 146 | description = [], info = [], 147 | prevdate = [], rate = [], item = []}} 148 | ``` 149 | 150 | The actiual Erlang business logic, banking transaction from `db` schema 151 | application is stored under 389 id. So you can easlity grab it unlinked 152 | as it was stored as atomic PUT. 153 | 154 | Licenses 155 | -------- 156 | 157 | * consensus protols 1) raft and 2) paxos are distributed under the terms of Apache 2.0 http://www.apache.org/licenses/LICENSE-2.0.html 158 | * cr itself is distributed under the DHARMA license: http://5ht.co/license.htm 159 | 160 | Credits 161 | ------- 162 | 163 | Copyright (c) 2015 Synrc Research Center s.r.o. 164 | 165 | * Maxim Sokhatsky 166 | * Vladimir Kirillov 167 | * Sergey Klimenko 168 | * Valery Meleshkin 169 | * Victor Sovietov 170 | 171 | OM A HUM 172 | -------------------------------------------------------------------------------- /doc/5HT.css: -------------------------------------------------------------------------------- 1 | pre { padding:4px;white-space:pre;background-color:#F1F1F1;font-family:monospace;font-size:14pt;} 2 | code { padding:4px;white-space:pre;font-family:monospace;font-size:14pt;} 3 | body { font-family: local; font-size: 16pt; color: #888; } 4 | h1 { font-size: 34pt; } 5 | h2 { font-size: 24pt; margin-top: 50px; } 6 | h3 { margin-top: 40px; } 7 | h4 { margin-top: 40px; } 8 | h5 { margin-top: -20px; } 9 | p { margin-top: 10px; } 10 | .note { margin-top: 0px; } 11 | .note p { margin-top: 20px; } 12 | .menu { text-align: right;} 13 | a { margin-top: 10px; padding: 10px; } 14 | .app { margin:100px auto;min-width:300px;max-width:800px; } 15 | .message { align: center; } 16 | .note { 
margin-left:0px;margin-top:0px;background-color:#F1F1F1;padding:4px 10px 4px 24px;color:gray;} 17 | ul {margin-left:70px;} 18 | 19 | a { color: blue; text-decoration: none } 20 | a:hover { color:blue; } 21 | a:hover, a:active { outline: 0 } 22 | 23 | @font-face { 24 | font-family: 'local'; 25 | src: url('Geometria-Light.otf'); 26 | font-weight: normal; 27 | font-style: normal 28 | } 29 | -------------------------------------------------------------------------------- /doc/cr.htm: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | CR 9 | 10 | 11 | 12 |
13 | 14 |
FROM: 5HT
15 | TO: PUB
16 | DATE: 4 APR 2015
17 | 18 |

Chain Replication Database

19 | 20 |
21 | 22 |

In banking system demands are very tight. Database 23 | should be at least tripled, stand-by nodes should pick up 24 | master reads from failover node, writes should be 25 | accepted on a reasonable quorum, failover must be followed by recovery, database 26 | should be able to scale even with the RAM/DISC limitations.

27 | 28 |

No data should be treated as written otherwise than committed to all replicas. 29 | All these circumstances lead us to chain replication protocol as a simple and natural 30 | feedback to this challenge.

31 | 32 |

Different replication techniques exist to satisfy replication demands. 33 | Master-slave replication is most widely known type of replication 34 | used before in such products like GFS, HDFS, mongodb, etc. Quorum Intersection 35 | is another technique used in databases like Cassandra or Amazon Dynamo. 36 | They mostly provide a consistent distributed repository 37 | for event tables or for file storage. In banking industry 38 | we synchronize account balances and need simple and manageable 39 | protocol for storage consistency issuing high demand on system integrity. 

41 | 42 |

There are several classes of error usually implied when dealing with failure detection. 43 | The weakest class is fail-stop events, when the outage is normal or predictable. 44 | The second class is crash-failures, the abnormal terminations and outages. The strongest 45 | type of failures are byzantine failures resistant to bit-flips, 46 | hacked parties or any types of compromising the transaction objects. 47 | For banking applications the byzantine fault tolerance is desired, 48 | despite it affects the latency. However we will show that CR latency 49 | is acceptable even in comparison with web applications.

50 | 51 |

Features

52 | 53 |

54 |

64 | 65 |

Consistent Hash Ring

66 | 67 |

Building a consistent hash ring is a key feature 68 | that opens a door to the distributed system. 69 | CR is using only five functions to model the DHT ring. 70 | Ring provides a desirable probability in series 71 | of nines of working event condition.

72 | 73 |

74 |

75 |
 76 |  > cr:ring().
 77 | 
 78 | {12,
 79 |  [{0,0},
 80 |   {121791803110908576516973736059690251637994378581,1},
 81 |   {243583606221817153033947472119380503275988757162,1},
 82 |   {365375409332725729550921208179070754913983135743,1},
 83 |   {487167212443634306067894944238761006551977514324,1},
 84 |   {608959015554542882584868680298451258189971892905,2},
 85 |   {730750818665451459101842416358141509827966271486,2},
 86 |   {852542621776360035618816152417831761465960650067,2},
 87 |   {974334424887268612135789888477522013103955028648,2},
 88 |   {1096126227998177188652763624537212264741949407229,3},
 89 |   {1217918031109085765169737360596902516379943785810,3},
 90 |   {1339709834219994341686711096656592768017938164391,3},
 91 |   {1461501637330902918203684832716283019655932542972,3}]}
 92 | 
93 |

94 | 95 |

The ring or configuration is partitioned by shards or peers.

96 | 97 |

98 |

99 |
100 | > cr:peers().
101 | 
102 | [{'cr1@127.0.0.1',9000,9001,9002},
103 |  {'cr2@127.0.0.1',9004,9005,9006},
104 |  {'cr3@127.0.0.1',9008,9009,9010}]
105 | 
106 |

107 | 108 |

Each peer is running several replica protocol vnodes. Each vnode is a 109 | replica process that serves a specific key-range.

110 | 111 |

112 |

113 |
114 | > cr:local().
115 | 
116 | [{487167212443634306067894944238761006551977514324,<0.200.0>},
117 |  {365375409332725729550921208179070754913983135743,<0.199.0>},
118 |  {243583606221817153033947472119380503275988757162,<0.198.0>},
119 |  {121791803110908576516973736059690251637994378581,<0.197.0>}]
120 | 
121 |

122 | 123 |
124 |
125 | > cr:chain(foo).
126 | 
127 | [{1461501637330902918203684832716283019655932542972,3},
128 |  {487167212443634306067894944238761006551977514324,1},
129 |  {974334424887268612135789888477522013103955028648,2}]
130 | 131 |

Chain Replication Protocol

132 | 133 |

Command 134 | 135 |

136 |
137 |

Command is an atomic event that can be performed 138 | in single process context at a single machine.

139 | 140 |

CR provides extensible set of possible commands:

141 | 142 |

143 |

148 | 149 |

This set of commands refers to KVS the database framework for 150 | storing the doubly linked lists (it can be called chains/feeds/sequences) 151 | using the two basic record types: #container, who store the top of a chain along 152 | with chain aggregation counters; and #iterator, who provides next and prev 153 | fields for traversal.

154 | 155 |

Distributed Transaction 156 | 157 |

158 |
159 |

All replicas are sequenced into the chains. Transaction is a 160 | command performing forward over the ordered chain of replicas. This chain 161 | is called configuration. All writes come to the chain's head, 162 | all reads come to chain's tail.

163 | 164 |

Picture 1. Chain

165 | 166 |

Replication Log 167 | 168 |

169 |
170 |

During transaction, the command is saved in replication 171 | log on each replica of the transaction. This log is append-only 172 | disk structure and is also called this history of replica's operations.

173 | 174 |

The replication log is also uses KVS as underlying storage. 175 | As a replication log container it uses #log type and command is stored 176 | as #operation record. Each replica has its own log.

177 | 178 |

Picture 2. Log

179 | 180 |

Replica Protocol

181 | 182 |

Some assumptions are implied during protocol description.

183 | 184 |
  • 1) each peer has at least one non-faulty vnode;
  • 185 |
  • 2) ring is tracked by external consensus or
  • 186 |
  • 3) ring has at least one peer with no faulty vnodes.
  • 187 |
188 | 189 |

#operation [Vnode,Chain,Operation] — Any active replica Vnode in 190 | configuration Chain can issue an operation command only if each 191 | preceding replica in Chain, if any, has done likewise and there 192 | is no conflicting operation for s in its history. Vnode also adds 193 | a new order proof to its history.

194 | 195 |

#suspend [Vnode] — An active replica Vnode can 196 | suspend updating its history by becoming immutable at any time. 197 | Only heart monitor can issue a becomeImmutable message. 198 | The replica signs a wedged statement to notify heart monitor 199 | that it is immutable and what its history is.

200 | 201 |

#resume [Vnode,Configuration,History] — A pending 202 | replica Vnode in Configuration can resume handling operations 203 | if the Heart Monitor has synchronized the history between 204 | nodes to the greatest common prefix log.

205 | 206 |

Failures

207 | 208 |

Configuration Tracking 209 | 210 |

211 |
212 |

The configuration is a dynamic property of transaction. 213 | During transaction it may change due to byzantine failures, 214 | leading us to reconfigure the replicas in a chain. The another consistent 215 | system is needed to track the dynamic configurations.

216 | 217 |

To make the shard highly available, we use replication 218 | and dynamically change the configuration of replicas 219 | in order to deal with crash failures and unresponsiveness. 220 | Each machine in a cluster has single append-only configuration 221 | log which is not based on KVS due to latency requirements. 222 | Configuration log is a binary file written by RAFT protocol commands. 223 | There are only two commands which could be performed over the configuration log:

224 | 225 |

226 |

230 | 231 |

Heart Monitor Protocol

232 | 233 |

#reconfig [Node,Configuration,NewConfiguration] — 234 | The heart monitor waits for a set of valid histories from 235 | a quorum of replicas in current configuration. 236 | A valid history contains at most one record per operation. 237 | The oracle then issues an #resume message for all nodes in NewConfiguration 238 | with the log position of maximal common prefix (last replica in previous Configuration). 239 | The heart monitor can issue at most one #resume message per Configuration generation.

240 | 241 |

#ping — Round-Robin ping over nodes of Configuration. In initial 242 | configuration all nodes are active or resumed.

243 | 244 |

Safety

245 | 246 |

Stable Operation Log 247 | 248 |

The equation specifies what operations O are safe, when all its replicas are committed, 249 | but not when or in what order to do them. 250 | In other words, the system is asynchronous. In this formula we call stable 251 | operation log having operations committed on all replicas.

252 | 253 | 254 | Stable = [ R || R <- replicas(O), 255 | status(R) == commited, 256 | length(R) == N ] 257 |

258 |

259 |
260 |

NOTE: due to asynchronous nature of transaction service the operations 261 | log will be always unordered. As on Picture 3 it should GCP = 2.

262 | 263 |

Picture 3. Greatest common prefix

264 |

265 | 266 |

Liveness

267 | 268 |

There is always eventually a configuration in which all replicas 269 | are correct and do not become suspended. Failure detection of liveness 270 | is tracked by Heart Monitor which pings each node and reconfigures the 271 | nodes for synchronizing the configuration consensus log.

272 | 273 | 274 |

OTP protocol

275 | 276 |

Some types are embedded in L core to resolve main tasks during 277 | type inference, type unification and pattern matching compilation. 278 | L has following basic types which are used by infer/unify/match core. 279 | These types are also shared with Type Inspector.

280 | 281 |

INTERCONNECT

    282 |
  • transaction
  • 283 |
  • get
  • 284 |
  • sync
  • 285 |

286 |

PING

    287 |
  • ping
  • 288 |
  • join
  • 289 |
  • leave
  • 290 |

291 | 292 |

Implementation

293 | 294 |

The chain replication protocol is implemented as Erlang/OTP application cr 295 | that could be embedded in any toplevel application. We use one supervision 296 | tree and gen_server per one TCP endpoint along with separate 297 | vnode_sup supervision for VNODE transactional contexts per hashring vnode.

298 | 299 |

The Chain Replication Database application is built using Synrc Application Stack. 300 | Among them we have fs native file-system listener, sh shell executor 301 | for running external commands, powerful mad rebar replacement which is 302 | able to pack application inside single-file bundle. During development we 303 | also use otp.mk and active file reloader that uses native 304 | filesystem event on each platform. The database itself built using 305 | kvs with mnesia backend and db banking schema as example.

306 | 307 |

308 |
309 | > application:which_applications().
310 | 
311 | [{cr,"Chain Replication","0.1"},
312 |  {sh,"VXZ SH Executor","0.9"},
313 |  {mad,"MAD VXZ Build Tool","2.2"},
314 |  {db,"Bank Database","1"},
315 |  {active,"ACT VXZ Continuous Compilation","0.9"},
316 |  {kvs,"KVS Abstract Term Database","1"},
317 |  {mnesia,"MNESIA  CXC 138 12","4.12.3"},
318 |  {fs,"VXZ FS Listener","0.9.1"},
319 |  {stdlib,"ERTS  CXC 138 10","2.2"},
320 |  {kernel,"ERTS  CXC 138 10","3.0.3"}]
321 | 
322 |
323 |

324 | 325 |

Supervision tree of chain replication supervisor:

326 | 327 |

Picture 4. Supervision

328 | 329 |

330 |

331 |
332 | > cr:sup().
333 | 
334 | [{vnode_sup,<0.52.0>},
335 |  {client_sup,<0.51.0>},
336 |  {client,<0.50.0>},
337 |  {ping_sup,<0.289.0>},
338 |  {ping,<0.48.0>},
339 |  {interconnect_sup,<0.47.0>},
340 |  {interconnect,<0.46.0>}]
341 | 
342 |

343 | 344 |

For benchmarking database please populate it with data but without 345 | overloading the database:

346 | 347 |
348 |
349 |     [
350 |       begin
351 |           cr:test(500),
352 |           timer:sleep(1000)
353 |       end
354 |           || ___ <- lists:seq(1,10)
355 |     ].
356 | 
357 | > cr:dump().
358 | 
359 |
360 |                                                vnode   i  n        top      log        latency
361 |     121791803110908576516973736059690251637994378581   1  1       6506     1607       1/315/97
362 |     243583606221817153033947472119380503275988757162   2  1       6508     1662      1/317/100
363 |     365375409332725729550921208179070754913983135743   3  1       6510     1658      2/317/105
364 |     487167212443634306067894944238761006551977514324   4  1       6505     1583      1/317/104
365 |     608959015554542882584868680298451258189971892905   5  2       6499     1637      3/317/115
366 |     730750818665451459101842416358141509827966271486   6  2       6510     1664      2/318/117
367 |     852542621776360035618816152417831761465960650067   7  2       6501     1634      2/311/115
368 |     974334424887268612135789888477522013103955028648   8  2       6500     1575       3/290/96
369 |    1096126227998177188652763624537212264741949407229   9  3       6497     1607      3/316/118
370 |    1217918031109085765169737360596902516379943785810  10  3       6510     1662      3/318/117
371 |    1339709834219994341686711096656592768017938164391  11  3       6496     1658      3/311/106
372 |    1461501637330902918203684832716283019655932542972  12  3       6505     1583      2/295/104
373 | 
374 | 375 |

Literature

376 | 377 |  [1]. Hussam Abu-Libdeh, Robbert van Renesse, Ymir Vigfusson.
378 | 379 |      Leveraging Sharding in the Design of Scalable Replication Protocols

380 | 381 | [2]. Robbert van Renesse, Chi Ho, Nicolas Schiper.
382 | 383 |      Byzantine Chain Replication

384 | 385 | [3]. Robbert van Renesse, Nicolas Schiper.
386 | 387 |      Chain Replication for 388 | Supporting High Throughput and Availability 389 | 390 |

Credits

391 | 392 |

393 |

400 | 401 |

402 |
2015 © Synrc Research Center, s.r.o.
403 | 404 |
405 |
406 | 407 | 408 | -------------------------------------------------------------------------------- /doc/images/log.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | log 5 | Created with Sketch. 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | #transaction 27 | 28 | 29 | #client 30 | 31 | 32 | #account 33 | 34 | 35 | #log 36 | 37 | 38 | <- live data 39 | 40 | 41 | <- failover 42 | 43 | 44 | <- epoch 45 | 46 | 47 | <- log head 48 | 49 | 50 | 51 | -------------------------------------------------------------------------------- /doc/images/merging.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | merging 5 | Created with Sketch. 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | Replica A 24 | 25 | 26 | Operations 27 | 28 | 29 | 1 30 | 31 | 32 | 2 33 | 34 | 35 | 3 36 | 37 | 38 | 4 39 | 40 | 41 | Replica B 42 | 43 | 44 | Replica C 45 | 46 | 47 | 48 | GCP 49 | 50 | 51 | 52 | -------------------------------------------------------------------------------- /doc/images/replicas.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | replicas 5 | Created with Sketch. 
6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | write -> 16 | 17 | 18 | <- read 19 | 20 | 21 | <- TX context 22 | 23 | 24 | <- head 25 | 26 | 27 | 28 | -------------------------------------------------------------------------------- /doc/images/sup.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/synrc/cr/b4a30dc55d30500a1c239d6234444e1ecff5aab5/doc/images/sup.png -------------------------------------------------------------------------------- /include/cr.hrl: -------------------------------------------------------------------------------- 1 | -define(GEN_SERVER, [init/1, handle_call/3, handle_cast/2, handle_info/2, terminate/2, code_change/3]). 2 | -define(GEN_FSM,[init/1, handle_event/3, handle_sync_event/4, handle_info/3, terminate/3, code_change/4]). 3 | 4 | -include_lib("kvs/include/kvs.hrl"). 5 | 6 | -type mode() :: active | pending | immutable | sync. 7 | 8 | -record(ens, {eseq,key,val}). 9 | 10 | 11 | -------------------------------------------------------------------------------- /include/rafter.hrl: -------------------------------------------------------------------------------- 1 | -type peer() :: atom() | {atom(), atom()}. 2 | 3 | %% Transport Independent MESSAGES 4 | -record(request_vote, { 5 | term :: non_neg_integer(), 6 | from :: atom(), 7 | last_log_index :: non_neg_integer(), 8 | last_log_term :: non_neg_integer()}). 9 | 10 | -record(vote, { 11 | from :: atom(), 12 | term :: non_neg_integer(), 13 | success :: boolean()}). 14 | 15 | -record(append_entries, { 16 | term :: non_neg_integer(), 17 | from :: atom(), 18 | prev_log_index :: non_neg_integer(), 19 | prev_log_term :: non_neg_integer(), 20 | entries :: term(), 21 | commit_index :: non_neg_integer(), 22 | 23 | %% This is used during read-only operations 24 | send_clock :: non_neg_integer()}). 
25 | 26 | -record(append_entries_rpy, { 27 | from :: atom(), 28 | term :: non_neg_integer(), 29 | 30 | %% This field isn't in the raft paper. However, for this implementation 31 | %% it prevents duplicate responses from causing recommits and helps 32 | %% maintain safety. In the raft reference implementation (logcabin) 33 | %% they cancel the in flight RPC's instead. That's difficult 34 | %% to do correctly(without races) in erlang with asynchronous 35 | %% messaging and mailboxes. 36 | index :: non_neg_integer(), 37 | 38 | %% This is used during read-only operations 39 | send_clock :: non_neg_integer(), 40 | 41 | success :: boolean()}). 42 | 43 | -record(rafter_entry, { 44 | type :: noop | config | op, 45 | term :: non_neg_integer(), 46 | index :: non_neg_integer(), 47 | cmd :: term()}). 48 | 49 | -record(meta, { 50 | voted_for :: peer(), 51 | term = 0 :: non_neg_integer()}). 52 | 53 | -record(config, { 54 | state = blank :: 55 | %% The configuration specifies no servers. Servers that are new to the 56 | %% cluster and have empty logs start in this state. 57 | blank | 58 | %% The configuration specifies a single list of servers: a quorum 59 | %% requires any majority of oldservers. 60 | stable | 61 | %% The configuration specifies two lists of servers: a quorum requires 62 | %% any majority of oldservers, but the newservers also receive log entries. 63 | staging | 64 | %% The configuration specifies two lists of servers: a quorum requires 65 | %% any majority of oldservers and any majority of the newservers. 66 | transitional, 67 | 68 | oldservers = [] :: list(), 69 | newservers = [] :: list() 70 | }). 
71 | 72 | -------------------------------------------------------------------------------- /include/rafter_consensus_fsm.hrl: -------------------------------------------------------------------------------- 1 | -record(client_req, { 2 | id :: binary(), 3 | timer :: timer:tref(), 4 | from :: term(), 5 | index :: non_neg_integer(), 6 | term :: non_neg_integer(), 7 | 8 | %% only used during read_only commands 9 | cmd :: term()}). 10 | 11 | -record(state, { 12 | leader :: term(), 13 | term = 0 :: non_neg_integer(), 14 | voted_for :: term(), 15 | commit_index = 0 :: non_neg_integer(), 16 | init_config :: undefined | list() | complete | no_client, 17 | 18 | %% Used for Election and Heartbeat timeouts 19 | timer :: reference(), 20 | 21 | %% leader state: contains nextIndex for each peer 22 | followers, 23 | 24 | %% Dict keyed by peer id. 25 | %% contains true as val when candidate 26 | %% contains match_indexes as val when leader 27 | responses, 28 | 29 | %% Logical clock to allow read linearizability 30 | %% Reset to 0 on leader election. 31 | send_clock = 0 :: non_neg_integer(), 32 | 33 | %% Keep track of the highest send_clock received from each peer 34 | %% Reset on leader election 35 | send_clock_responses, 36 | 37 | %% Outstanding Client Write Requests 38 | client_reqs = [] :: [#client_req{}], 39 | 40 | %% Outstanding Client Read Requests 41 | %% Keyed on send_clock, Val = [#client_req{}] 42 | read_reqs, 43 | 44 | %% All servers making up the ensemble 45 | me :: string(), 46 | 47 | config :: term(), 48 | 49 | %% We allow pluggable backend state machine modules. 50 | state_machine :: atom(), 51 | backend_state :: term()}). 52 | -------------------------------------------------------------------------------- /include/rafter_opts.hrl: -------------------------------------------------------------------------------- 1 | -record(rafter_opts, {state_machine = cr_rafterback, 2 | cluster, 3 | logdir = "data"}). 
4 | -------------------------------------------------------------------------------- /mad: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/synrc/cr/b4a30dc55d30500a1c239d6234444e1ecff5aab5/mad -------------------------------------------------------------------------------- /otp.mk: -------------------------------------------------------------------------------- 1 | ifeq ($(OS),Windows_NT) 2 | SEPARATOR=; 3 | else 4 | SEPARATOR=: 5 | endif 6 | 7 | MAD := ./mad 8 | VM := vm.args 9 | SYS := sys.config 10 | PLT_NAME := ~/.n2o_dialyzer.plt 11 | ERL_ARGS := -args_file $(VM) -config $(SYS) -setcookie $(COOKIE) -name $(NAME)@127.0.0.1 12 | RUN_DIR := data/$(NAME)/log 13 | LOG_DIR := data/$(NAME)/log 14 | empty := 15 | ROOTS := . deps 16 | space := $(empty) $(empty) 17 | comma := $(empty),$(empty) 18 | VSN := $(shell git rev-parse HEAD | head -c 6) 19 | DATE := $(shell date "+%Y%m%d-%H%M%S") 20 | ERL_LIBS := $(subst $(space),$(SEPARATOR),$(ROOTS)) 21 | relx := "{release,{$(RELEASE),\"$(VER)\"},[$(RELEASE)]}.\\n{include_erts,true}.\ 22 | \\n{extended_start_script,true}.\\n{generate_start_script,true}.\\n{sys_config,\"$(SYS)\"}.\ 23 | \\n{vm_args,\"$(VM)\"}.\\n{overlay,[{mkdir,\"log/sasl\"}]}." 
24 | 25 | test: eunit ct 26 | deps up: 27 | $(MAD) $@ 28 | compile: deps 29 | $(MAD) compile skip_deps=true 30 | clean: 31 | rm -f .applist 32 | $(MAD) $@ 33 | .applist: compile 34 | $(MAD) plan 35 | $(RUN_DIR) $(LOG_DIR): 36 | mkdir -p $(RUN_DIR) & mkdir -p $(LOG_DIR) 37 | console: .applist 38 | mkdir -p data 39 | ERL_LIBS="$(ERL_LIBS)" erl $(ERL_ARGS) -eval '[application:start(A) || A <- $(shell cat .applist)]' 40 | start: $(RUN_DIR) $(LOG_DIR) .applist 41 | RUN_ERL_LOG_GENERATIONS=1000 RUN_ERL_LOG_MAXSIZE=20000000 \ 42 | ERL_LIBS=$(ERL_LIBS) run_erl -daemon $(RUN_DIR)/ $(LOG_DIR)/ "exec $(MAKE) console" 43 | attach: 44 | to_erl $(RUN_DIR)/ 45 | release: 46 | echo $(relx) > relx.config && relx 47 | stop: 48 | @kill -9 $(shell ps ax -o pid= -o command=|grep $(RELEASE)|grep $(COOKIE)|awk '{print $$1}') 49 | $(PLT_NAME): 50 | $(eval APPS := $(subst deps/,,$(subst apps/,,$(shell find apps deps -maxdepth 1 -mindepth 1 -type d)))) 51 | ERL_LIBS=$(ERL_LIBS) dialyzer --build_plt --output_plt $(PLT_NAME) --apps $(APPS) || true 52 | dialyze: $(PLT_NAME) compile 53 | $(eval APPS := $(shell find apps deps -maxdepth 1 -mindepth 1 -type d)) 54 | @$(foreach var,$(APPS),(echo "Process $(var)"; dialyzer -q $(var)/ebin --plt $(PLT_NAME) --no_native -Werror_handling -Wunderspecs -Wrace_conditions -Wno_undefined_callbacks);) 55 | tar: release 56 | tar zcvf $(RELEASE)-$(VSN)-$(DATE).tar.gz _rel/lib/*/ebin _rel/lib/*/priv _rel/bin _rel/releases 57 | eunit: 58 | rebar eunit skip_deps=true 59 | ct: 60 | rebar ct skip_deps=true verbose=1 61 | 62 | .PHONY: deps up compile clean console start attach release update-deps dialyze ct eunit tar 63 | -------------------------------------------------------------------------------- /rebar.config: -------------------------------------------------------------------------------- 1 | {deps,[ 2 | {kvs, ".*", {git, "git://github.com/synrc/kvs", {tag,"3.4"}}}, 3 | {db, ".*", {git, "git://github.com/spawnproc/db", "HEAD"}}, 4 | {active, ".*", {git, 
"git://github.com/synrc/active", "HEAD"}} 5 | ]}. 6 | -------------------------------------------------------------------------------- /src/backends/cr_kvs.erl: -------------------------------------------------------------------------------- 1 | -module(cr_kvs). 2 | -copyright('Maxim Sokhatsky'). 3 | -include("cr.hrl"). 4 | -compile(export_all). 5 | 6 | dispatch({prepare,_,_,Tx}, {state,Name,_,_,_}) -> 7 | % io:format("KVS PUT ~p:~p~n",[element(1,Tx),element(2,Tx)]), 8 | kvs:put(Tx); 9 | 10 | dispatch({commit,_,_,Tx}, {state,Name,_,_,_}) -> 11 | % io:format("KVS LINK ~p:~p~n",[element(1,Tx),element(2,Tx)]), 12 | kvs:link(Tx); 13 | 14 | dispatch({rollback,_,_,Tx}, {state,Name,_,_,_}) -> 15 | % io:format("KVS REMOVE ~p:~p~n",[element(1,Tx),element(2,Tx)]), 16 | kvs:remove(Tx); 17 | 18 | dispatch(_,_) -> ok. 19 | -------------------------------------------------------------------------------- /src/consensus/README.md: -------------------------------------------------------------------------------- 1 | Consensus Modules 2 | ================= 3 | 4 | The original ideas is to have replacable consunsus modules: 5 | 6 | * PAXOS (cr_paxon) 7 | * RAFT (cr_rafter) 8 | -------------------------------------------------------------------------------- /src/consensus/cr_config.erl: -------------------------------------------------------------------------------- 1 | -module(cr_config). 2 | -compile(export_all). 3 | -include("rafter.hrl"). 
4 | %% quorum_max/3: the highest value (e.g. match index) that a quorum of the configuration's voters has reached; 0 for a blank config. 5 | quorum_max(_Me, #config{state=blank}, _) -> 0; 6 | quorum_max(Me, #config{state=stable, newservers=OldServers}, Responses) -> quorum_max(Me, OldServers, Responses); %% NOTE(review): reads newservers although rafter.hrl documents stable quorums over oldservers -- confirm this fork's intent 7 | quorum_max(Me, #config{state=staging, newservers=OldServers}, Responses) -> quorum_max(Me, OldServers, Responses); %% NOTE(review): same newservers choice for staging; upstream rafter consults oldservers here 8 | quorum_max(Me, #config{state=transitional, oldservers=Old, newservers=New}, Responses) -> min(quorum_max(Me, Old, Responses), quorum_max(Me, New, Responses)); 9 | %% List form: median-style pick -- the largest value that a majority of Servers (Me's own copy included via sorted_values/3) has reached. 10 | quorum_max(_, [], _) -> 0; 11 | quorum_max(Me, Servers, Responses) when (length(Servers) rem 2) =:= 0-> 12 | Values = sorted_values(Me, Servers, Responses), 13 | lists:nth(length(Values) div 2, Values); 14 | quorum_max(Me, Servers, Responses) -> 15 | Values = sorted_values(Me, Servers, Responses), 16 | lists:nth(length(Values) div 2 + 1, Values). 17 | %% quorum/3: true once a majority of the relevant server list has answered true in Responses (a dict); Me counts as an implicit yes vote when it is a member. 18 | quorum(_Me, #config{state=blank}, _Responses) -> false; 19 | quorum(Me, #config{state=stable,newservers=Servers}, Responses) -> quorum(Me, Servers, Responses); 20 | quorum(Me, #config{state=staging,newservers=Servers}, Responses) -> quorum(Me, Servers, Responses); 21 | quorum(Me, #config{state=transitional,oldservers=Old, newservers=New}, Responses) -> quorum(Me, Old, Responses) andalso quorum(Me, New, Responses); 22 | quorum(Me, Servers, Responses) -> 23 | TrueResponses = [R || {Peer, R} <- dict:to_list(Responses), 24 | R =:= true, 25 | lists:member(Peer, Servers)], 26 | case lists:member(Me, Servers) of 27 | true -> length(TrueResponses) + 1 > length(Servers)/2; 28 | false -> length(TrueResponses) > length(Servers)/2 end. 29 | %% voters/1,2: the set of peers with a vote; voters/2 additionally excludes Me itself. 30 | voters(Me, Config) -> lists:delete(Me, voters(Config)). 31 | voters(#config{oldservers=Old, newservers=New}) -> sets:to_list(sets:from_list(Old ++ New)); %% NOTE(review): this head matches every #config{} (records always carry both fields), so the clause below is unreachable; upstream rafter restricts this head to state=transitional 32 | voters(#config{newservers=Old}) -> Old.
33 | 34 | has_vote(_Me, #config{state=blank}) -> false; 35 | has_vote(Me, #config{oldservers=Old, newservers=New})-> lists:member(Me, Old) orelse lists:member(Me, New); 36 | has_vote(Me, #config{newservers=Old}) -> lists:member(Me, Old). 37 | 38 | followers(Me, #config{oldservers=Old, newservers=New}) -> lists:delete(Me, sets:to_list(sets:from_list(Old ++ New))); 39 | followers(Me, #config{newservers=Old}) -> lists:delete(Me, Old). 40 | 41 | reconfig(#config{state=Blank,newservers=OldNew}=Config, Servers) -> 42 | Config#config{state=stable,oldservers=OldNew, newservers=Servers}. 43 | 44 | allow_config(#config{state=blank}, _NewServers) -> true; 45 | allow_config(#config{newservers=OldServers}, NewServers) when NewServers =/= OldServers -> true; 46 | allow_config(_Config, _NewServers) -> {error, config_not_allowed}. 47 | 48 | sorted_values(Me, Servers, Responses) -> 49 | Vals = lists:sort(lists:map(fun(S) -> value(S, Responses) end, Servers)), 50 | case lists:member(Me, Servers) of 51 | true -> [_ | T] = Vals, lists:reverse([lists:max(Vals) | lists:reverse(T)]); 52 | false -> Vals end. 53 | 54 | value(Peer, Responses) -> 55 | case dict:find(Peer, Responses) of 56 | {ok, Value} -> Value; 57 | error -> 0 end. 58 | -------------------------------------------------------------------------------- /src/consensus/cr_log.erl: -------------------------------------------------------------------------------- 1 | -module(cr_log). 2 | -description('RAFT protocol replication log'). 3 | -compile(export_all). 4 | -behaviour(gen_server). 5 | -include_lib("kernel/include/file.hrl"). 6 | -include("cr.hrl"). 7 | -include("rafter.hrl"). 8 | -include("rafter_opts.hrl"). 9 | -export(?GEN_SERVER). 10 | 11 | show() -> show(cr:node()). 12 | show(Node) -> 13 | [ {I,element(2,cr_log:get_entry(Node,I))} || I <- lists:seq(1,cr_log:get_last_index(Node)) ]. 
14 | 15 | %%============================================================================= 16 | %% Logfile Structure 17 | %%============================================================================= 18 | %% @doc A log is made up of a file header and entries. The header contains file 19 | %% metadata and is written once at file creation. Each entry is a binary 20 | %% of arbitrary size containing header information and is followed by a trailer. 21 | %% The formats of the file header and entries are described below. 22 | %% 23 | %% File Header Format 24 | %% ----------------------- 25 | %% <> 26 | %% 27 | %% Entry Format 28 | %% ---------------- 29 | %% <> 30 | %% 31 | %% Sha1 - hash of the rest of the entry, 32 | %% Type - ?CONFIG | ?OP 33 | %% Term - The term of the entry 34 | %% Index - The log index of the entry 35 | %% DataSize - The size of Data in bytes 36 | %% Data - Data encoded with term_to_binary/1 37 | %% 38 | %% After each log entry a trailer is written. The trailer is used for 39 | %% detecting incomplete/corrupted writes, pointing to the latest config and 40 | %% traversing the log file backwards. 41 | %% 42 | %% Trailer Format 43 | %% ---------------- 44 | %% <> 45 | %% 46 | %% Crc - checksum, computed with erlang:crc32/1, of the rest of the trailer 47 | %% ConfigStart - file location of last seen config, 48 | %% EntryStart - file location of the start of this entry 49 | %% ?MAGIC - magic number marking the end of the trailer. 50 | %% A fully consistent log should always have 51 | %% the following magic number as the last 8 bytes: 52 | %% <<"\xFE\xED\xFE\xED\xFE\xED\xFE\xED">> 53 | %% 54 | 55 | -define(MAX_HINTS, 1000). 56 | 57 | -type index() :: non_neg_integer(). 58 | -type offset() :: non_neg_integer(). 
59 | 60 | -record(state, { 61 | logfile :: file:io_device(), 62 | version :: non_neg_integer(), 63 | meta_filename :: string(), 64 | write_location = 0 :: non_neg_integer(), 65 | config :: #config{}, 66 | config_loc :: offset(), 67 | meta :: #meta{}, 68 | last_entry :: #rafter_entry{}, 69 | index = 0 :: index(), 70 | hints :: ets:tid(), 71 | hint_prunes = 0 :: non_neg_integer(), 72 | 73 | %% frequency of number of entries scanned in get_entry/2 calls 74 | seek_counts = dict:new()}). 75 | 76 | -define(MAGIC, <<"\xFE\xED\xFE\xED\xFE\xED\xFE\xED">>). 77 | -define(MAGIC_SIZE, 8). 78 | -define(HEADER_SIZE, 41). 79 | -define(TRAILER_SIZE, 28). 80 | -define(FILE_HEADER_SIZE, 1). 81 | -define(READ_BLOCK_SIZE, 1048576). %% 1MB 82 | -define(LATEST_VERSION, 1). 83 | 84 | %% Entry Types 85 | -define(NOOP, 0). 86 | -define(CONFIG, 1). 87 | -define(OP, 2). 88 | -define(ALL, [?CONFIG, ?OP]). 89 | 90 | -ifdef(TEST). 91 | -define(ETS_OPTS, [ordered_set, protected]). 92 | -else. 93 | -define(ETS_OPTS, [named_table, ordered_set, protected]). 94 | -endif. 95 | 96 | 97 | %%==================================================================== 98 | %% API 99 | %%==================================================================== 100 | entry_to_binary(#rafter_entry{type=noop, term=Term, index=Index, cmd=noop}) -> 101 | entry_to_binary(?NOOP, Term, Index, noop); 102 | entry_to_binary(#rafter_entry{type=config, term=Term, index=Index, cmd=Data}) -> 103 | entry_to_binary(?CONFIG, Term, Index, Data); 104 | entry_to_binary(#rafter_entry{type=op, term=Term, index=Index, cmd=Data}) -> 105 | entry_to_binary(?OP, Term, Index, Data). 106 | 107 | entry_to_binary(Type, Term, Index, Data) -> 108 | BinData = term_to_binary(Data), 109 | B0 = <>, 110 | Sha1 = crypto:hash(sha, B0), 111 | <>. 
binary_to_entry(<<Sha1:20/binary, Type:8, Term:64, Index:64, Size:32, Data:Size/binary>>) -> 114 | %% We want to crash on badmatch here if our log is corrupt 115 | %% TODO: Allow an operator to repair the log by truncating at that point 116 | %% or repair each entry 1 by 1 by consulting a good log. 117 | Sha1 = crypto:hash(sha, <<Type:8, Term:64, Index:64, Size:32, Data/binary>>), 118 | binary_to_entry(Type, Term, Index, Data).
158 | 159 | get_last_term(Peer) -> 160 | case get_last_entry(Peer) of 161 | {ok, #rafter_entry{term=Term}} -> 162 | Term; 163 | {ok, not_found} -> 164 | 0 165 | end. 166 | 167 | get_metadata(Peer) -> 168 | gen_server:call(logname(Peer), get_metadata). 169 | 170 | set_metadata(Peer, VotedFor, Term) -> 171 | gen_server:call(logname(Peer), {set_metadata, VotedFor, Term}). 172 | 173 | get_entry(Peer, Index) -> 174 | gen_server:call(logname(Peer), {get_entry, Index}). 175 | 176 | get_term(Peer, Index) -> 177 | case get_entry(Peer, Index) of 178 | {ok, #rafter_entry{term=Term}} -> 179 | Term; 180 | {ok, not_found} -> 181 | 0 182 | end. 183 | 184 | %%==================================================================== 185 | %% gen_server callbacks 186 | %%==================================================================== 187 | init([Name, #rafter_opts{logdir = Logdir}]) -> 188 | LogName = lists:concat([Logdir,"/",Name,".log"]), 189 | MetaName = lists:concat([Logdir,"/",Name,".meta"]), 190 | {ok, LogFile} = file:open(LogName, [append, read, binary, raw]), 191 | {ok, #file_info{size=Size}} = file:read_file_info(LogName), 192 | {ok, Meta} = read_metadata(MetaName, Size), 193 | {ConfigLoc, Config, _Term, Index, WriteLocation, Version} = init_file(LogFile, Size), 194 | LastEntry = find_last_entry(LogFile, WriteLocation), 195 | HintsTable = list_to_atom("rafter_hints_" ++ atom_to_list(Name)), 196 | {ok, #state{logfile=LogFile, 197 | version=Version, 198 | meta_filename=MetaName, 199 | write_location=WriteLocation, 200 | index=Index, 201 | meta=Meta, 202 | config=Config, 203 | config_loc = ConfigLoc, 204 | last_entry=LastEntry, 205 | hints=ets:new(HintsTable, ?ETS_OPTS)}}. 206 | 207 | format_status(_, [_, State]) -> 208 | [{data, [{"StateData", State}]}]. 
209 | 210 | handle_call({kvs_log, Operation}, _From, #state{logfile=File}=State) -> 211 | {reply, kvs:add(Operation#operation{id=kvs:next_id(operation,1)}), State}; 212 | 213 | handle_call({kvs_replay, Operation, {state,Name,Nodes,Storage,L}, Status}, _From, #state{}=State) -> 214 | Storage:dispatch(Operation#operation.body,{state,Name,Nodes,Storage,L}), 215 | {reply, ok, State}; 216 | 217 | %% Leader Append. Entries do NOT have Indexes, as they are unlogged entries as a 218 | %% result of client operations. Appends are based on the current index of the log. 219 | %% Just append to the next location in the log for each entry. 220 | handle_call({append, Entries}, _From, #state{logfile=File}=State) -> 221 | NewState = append_entries(File, Entries, State), 222 | Index = NewState#state.index, 223 | {reply, {ok, Index}, NewState}; 224 | 225 | handle_call(get_config, _From, #state{config=Config}=State) -> 226 | {reply, Config, State}; 227 | 228 | handle_call(get_last_entry, _From, #state{last_entry=undefined}=State) -> 229 | {reply, {ok, not_found}, State}; 230 | handle_call(get_last_entry, _From, #state{last_entry=LastEntry}=State) -> 231 | {reply, {ok, LastEntry}, State}; 232 | 233 | handle_call(get_last_index, _From, #state{index=Index}=State) -> 234 | {reply, Index, State}; 235 | 236 | handle_call(get_metadata, _, #state{meta=Meta}=State) -> 237 | {reply, Meta, State}; 238 | 239 | handle_call({set_metadata, VotedFor, Term}, _, #state{meta_filename=Name}=S) -> 240 | Meta = #meta{voted_for=VotedFor, term=Term}, 241 | ok = write_metadata(Name, Meta), 242 | {reply, ok, S#state{meta=Meta}}; 243 | 244 | %% Follower append. Logs may not match. Write the first entry at the given index 245 | %% and reset the current index maintained in #state{}. Note that Entries 246 | %% actually contain correct indexes, since they are sent from the leader. 247 | %% Return the last index written. 
248 | handle_call({check_and_append, Entries, Index}, _From, #state{logfile=File, 249 | hints=Hints}=S) -> 250 | Loc0 = closest_forward_offset(Hints, Index), 251 | {Loc, Count} = get_pos(File, Loc0, Index), 252 | State = update_counters(Count, 0, S), 253 | #state{index=NewIndex}=NewState = maybe_append(Loc, Entries, State), 254 | {reply, {ok, NewIndex}, NewState}; 255 | 256 | handle_call({get_entry, Index}, _From, #state{logfile=File, 257 | hints=Hints}=State0) -> 258 | Loc = closest_forward_offset(Hints, Index), 259 | {Res, NewState} = 260 | case find_entry(File, Loc, Index) of 261 | {not_found, Count} -> 262 | State = update_counters(Count, 0, State0), 263 | {not_found, State}; 264 | {Entry, NextLoc, Count} -> 265 | Prunes = add_hint(Hints, Index, NextLoc), 266 | State = update_counters(Count, Prunes, State0), 267 | {Entry, State} 268 | end, 269 | {reply, {ok, Res}, NewState}. 270 | 271 | -spec update_counters(offset(), non_neg_integer(), #state{}) -> #state{}. 272 | update_counters(Distance, Prunes, #state{hint_prunes=Prunes0, 273 | seek_counts=Dict0} 274 | =State) -> 275 | Dict = dict:update_counter(Distance, 1, Dict0), 276 | State#state{hint_prunes=Prunes0 + Prunes, seek_counts=Dict}. 277 | 278 | -spec closest_forward_offset(ets:tid(), index()) -> offset(). 279 | closest_forward_offset(Hints, Index) -> 280 | case ets:prev(Hints, Index) of 281 | '$end_of_table' -> 282 | ?FILE_HEADER_SIZE; 283 | Key -> 284 | [{Key, Loc0}] = ets:lookup(Hints, Key), 285 | Loc0 286 | end. 287 | 288 | -spec add_hint(ets:tid(), index(), offset()) -> non_neg_integer(). 289 | add_hint(Hints, Index, Loc) -> 290 | {size, Size} = lists:keyfind(size, 1, ets:info(Hints)), 291 | case Size >= ?MAX_HINTS of 292 | true -> 293 | delete_hints(Hints), 294 | true = ets:insert(Hints, {Index, Loc}), 295 | 1; 296 | false -> 297 | true = ets:insert(Hints, {Index, Loc}), 298 | 0 299 | end. 
300 | 301 | %% Delete every 10th hint 302 | delete_hints(Hints) -> 303 | L = ets:tab2list(Hints), 304 | {_, ToDelete} = 305 | lists:foldl(fun({Index, _}, {Count, Deleted}) when Count rem 10 =:= 0 -> 306 | {Count+1, [Index | Deleted]}; 307 | ({_, _}, {Count, Deleted}) -> 308 | {Count+1, Deleted} 309 | end, {0, []}, L), 310 | [true = ets:delete(Hints, Index) || Index <- ToDelete]. 311 | 312 | handle_cast(stop, #state{logfile=File}=State) -> 313 | ok = file:close(File), 314 | {stop, normal, State}; 315 | handle_cast(_Msg, State) -> 316 | {noreply, State}. 317 | 318 | handle_info(_Info, State) -> 319 | {noreply, State}. 320 | 321 | terminate(_Reason, _State) -> 322 | ok. 323 | 324 | code_change(_OldVsn, State, _Extra) -> 325 | {ok, State}. 326 | 327 | 328 | %%==================================================================== 329 | %% Internal Functions 330 | %%==================================================================== 331 | 332 | maybe_append(_, [], State) -> 333 | State; 334 | maybe_append(eof, [Entry | Entries], State) -> 335 | NewState = write_entry(Entry, State), 336 | maybe_append(eof, Entries, NewState); 337 | maybe_append(Loc, [Entry | Entries], State=#state{logfile=File}) -> 338 | #rafter_entry{index=Index, term=Term}=Entry, 339 | case read_entry(File, Loc) of 340 | {entry, Data, NewLocation} -> 341 | case binary_to_entry(Data) of 342 | %% We already have this entry in the log. Continue. 343 | #rafter_entry{index=Index, term=Term} -> 344 | maybe_append(NewLocation, Entries, State); 345 | #rafter_entry{index=Index, term=_} -> 346 | NewState = truncate_and_write(File, Loc, Entry, State), 347 | maybe_append(eof, Entries, NewState) 348 | end; 349 | eof -> 350 | NewState = truncate_and_write(File, Loc, Entry, State), 351 | maybe_append(eof, Entries, NewState) 352 | end. 
353 | 354 | truncate_and_write(File, Loc, Entry, State0) -> 355 | ok = truncate(File, Loc), 356 | State1 = maybe_reset_config(File, Loc, State0), 357 | State2 = State1#state{write_location=Loc}, 358 | write_entry(Entry, State2). 359 | 360 | -spec maybe_reset_config(file:io_device(), non_neg_integer(), #state{}) -> 361 | #state{}. 362 | maybe_reset_config(File, Loc, #state{config_loc=ConfigLoc}=State) -> 363 | case ConfigLoc >= Loc of 364 | true -> 365 | reset_config(File, Loc, State); 366 | false -> 367 | State 368 | end. 369 | 370 | -spec reset_config(file:io_device(), non_neg_integer(), #state{}) -> #state{}. 371 | reset_config(File, Loc, State) -> 372 | case Loc of 373 | ?FILE_HEADER_SIZE -> 374 | %% Empty file, so reset to blank config 375 | State#state{config_loc=0, config=#config{}}; 376 | _ -> 377 | %% Get config from the previous trailer 378 | TrailerLoc = Loc - ?TRAILER_SIZE, 379 | {ok, Trailer} = file:pread(File, TrailerLoc, ?TRAILER_SIZE), 380 | <> = Trailer, 381 | %% validate checksum, fail fast. 382 | CRC = erlang:crc32(Rest), 383 | <> = Rest, 384 | case ConfigLoc of 385 | 0 -> 386 | State#state{config_loc=0, config=#config{}}; 387 | _ -> 388 | {ok, Config} = read_config(File, ConfigLoc), 389 | State#state{config_loc=ConfigLoc, config=Config} 390 | end 391 | end. 392 | 393 | logname(IndexNode) -> list_to_atom(lists:concat(["log:",IndexNode])). 394 | 395 | init_file(File, 0) -> 396 | {ok, Loc} = write_file_header(File), 397 | {0, #config{}, 0, 0, Loc, ?LATEST_VERSION}; 398 | init_file(File, Size) -> 399 | case repair_file(File, Size) of 400 | {ok, ConfigLoc, Term, Index, WriteLoc} -> 401 | {ok, Version} = read_file_header(File), 402 | {ok, Config} = read_config(File, ConfigLoc), 403 | {ConfigLoc, Config, Term, Index, WriteLoc, Version}; 404 | empty_file -> 405 | {ok, Loc} = write_file_header(File), 406 | {0, #config{}, 0, 0, Loc, ?LATEST_VERSION} 407 | end. 
read_file_header(File) -> 410 | {ok, <<Version:8>>} = file:pread(File, 0, ?FILE_HEADER_SIZE), 411 | {ok, Version}. 412 | 413 | write_file_header(File) -> 414 | ok = file:write(File, <<?LATEST_VERSION:8>>), 415 | {ok, ?FILE_HEADER_SIZE}. 416 | 417 | make_trailer(EntryStart, ConfigStart) -> 418 | T = <<ConfigStart:64, EntryStart:64, ?MAGIC/binary>>, 419 | Crc = erlang:crc32(T), 420 | <<Crc:32, T/binary>>. 421 | 422 | append_entries(File, Entries, State) -> 423 | NewState = lists:foldl(fun append_entry/2, State, Entries), 424 | ok = file:sync(File), 425 | NewState. 426 | 427 | %% Append an entry at the next location in the log. The entry does not yet have an 428 | %% index, so add one. 429 | append_entry(Entry, State=#state{index=Index}) -> 430 | NewIndex = Index + 1, 431 | NewEntry = Entry#rafter_entry{index=NewIndex}, 432 | write_entry(NewEntry, State). 433 | 434 | %% Precondition: each entry must have an index at this point. 435 | write_entry(Entry, State) -> 436 | #rafter_entry{index=Index, type=Type, cmd=Cmd}=Entry, 437 | #state{write_location=Loc, config=Config, config_loc=ConfigLoc, 438 | logfile=File} = State, 439 | BinEntry = entry_to_binary(Entry), 440 | {NewConfigLoc, NewConfig} = 441 | maybe_update_config(Type, Loc, Cmd, ConfigLoc, Config), 442 | Trailer = make_trailer(Loc, NewConfigLoc), 443 | ok = file:write(File, <<BinEntry/binary, Trailer/binary>>),
463 | write_metadata(Filename, Meta) -> 464 | ok = file:write_file(Filename, term_to_binary(Meta)). 465 | 466 | read_metadata(Filename, FileSize) -> 467 | case file:read_file(Filename) of 468 | {ok, Bin} -> 469 | {ok, binary_to_term(Bin)}; 470 | {error, enoent} when FileSize =< ?FILE_HEADER_SIZE -> 471 | {ok, #meta{}}; 472 | {error, Reason} -> 473 | io:format("Failed to open metadata file: ~p. Reason = ~p~n", 474 | [Filename, Reason]), 475 | {ok, #meta{}} 476 | end. 477 | 478 | truncate(File, Pos) -> 479 | {ok, _} = file:position(File, Pos), 480 | file:truncate(File). 481 | 482 | maybe_truncate(File, TruncateAt, FileSize) -> 483 | case TruncateAt < FileSize of 484 | true -> 485 | ok = truncate(File, TruncateAt); 486 | false -> 487 | ok 488 | end. 489 | 490 | repair_file(File, Size) -> 491 | case scan_for_trailer(File, Size) of 492 | {ok, ConfigStart, EntryStart, TruncateAt} -> 493 | maybe_truncate(File, TruncateAt, Size), 494 | {entry, Data, _} = read_entry(File, EntryStart), 495 | #rafter_entry{term=Term, index=Index} = binary_to_entry(Data), 496 | {ok, ConfigStart, Term, Index, TruncateAt}; 497 | not_found -> 498 | io:format("NOT FOUND: Size = ~p~n", [Size]), 499 | ok = truncate(File, 0), 500 | empty_file 501 | end. 502 | 503 | scan_for_trailer(File, Loc) -> 504 | case find_magic_number(File, Loc) of 505 | {ok, MagicLoc} -> 506 | case file:pread(File, MagicLoc - (?TRAILER_SIZE-?MAGIC_SIZE), ?TRAILER_SIZE) of 507 | {ok, <>} -> 508 | case erlang:crc32(<>) of 509 | Crc -> 510 | {ok, ConfigStart, EntryStart, MagicLoc + 8}; 511 | _ -> 512 | scan_for_trailer(File, MagicLoc) 513 | end; 514 | eof -> 515 | not_found 516 | end; 517 | not_found -> 518 | not_found 519 | end. 520 | 521 | read_block(File, Loc) -> 522 | case Loc < ?READ_BLOCK_SIZE of 523 | true -> 524 | {ok, Buffer} = file:pread(File, 0, Loc), 525 | {Buffer, 0}; 526 | false -> 527 | Start = Loc - ?READ_BLOCK_SIZE, 528 | {ok, Buffer} = file:pread(File, Start, ?READ_BLOCK_SIZE), 529 | {Buffer, Start} 530 | end. 
531 | 532 | %% @doc Continuously read blocks from the file and search backwards until the 533 | %% magic number is found or we reach the beginning of the file. 534 | find_magic_number(File, Loc) -> 535 | {Block, Start} = read_block(File, Loc), 536 | case find_last_magic_number_in_block(Block) of 537 | {ok, Offset} -> 538 | io:format("Magic Number found at ~p~n", [Start+Offset]), 539 | {ok, Start+Offset}; 540 | not_found -> 541 | case Start of 542 | 0 -> 543 | not_found; 544 | _ -> 545 | %% Ensure we search the overlapping 8 bytes between blocks 546 | find_magic_number(File, Start+8) 547 | end 548 | end. 549 | 550 | -spec find_last_magic_number_in_block(binary()) -> 551 | {ok, non_neg_integer()} | not_found. 552 | find_last_magic_number_in_block(Block) -> 553 | case string:rstr(binary_to_list(Block), binary_to_list(?MAGIC)) of 554 | 0 -> 555 | not_found; 556 | Index -> 557 | %% We want the 0 based binary offset, not the 1 based list offset. 558 | {ok, Index - 1} 559 | end. 560 | 561 | get_pos(File, Loc, Index) -> 562 | get_pos(File, Loc, Index, 0). 563 | 564 | get_pos(File, Loc, Index, Count) -> 565 | case file:pread(File, Loc, ?HEADER_SIZE) of 566 | {ok, <<_Sha1:20/binary, _Type:8, _Term:64, Index:64, _DataSize:32>>} -> 567 | {Loc, Count}; 568 | {ok, <<_:37/binary, DataSize:32>>} -> 569 | get_pos(File, next_entry_loc(Loc, DataSize), Index, Count+1); 570 | eof -> 571 | {eof, Count} 572 | end. 573 | 574 | %% @doc Find an entry at the given index in a file. Search forward from Loc. 575 | find_entry(File, Loc, Index) -> 576 | find_entry(File, Loc, Index, 0). 577 | 578 | find_entry(File, Loc, Index, Count) -> 579 | case file:pread(File, Loc, ?HEADER_SIZE) of 580 | {ok, <<_Sha1:20/binary, _Type:8, _Term:64, Index:64, _DataSize:32>>=Header} -> 581 | case read_data(File, Loc + ?HEADER_SIZE, Header) of 582 | {entry, Entry, _} -> 583 | {binary_to_entry(Entry), Loc, Count}; 584 | eof -> 585 | %% This should only occur if the entry is currently being written. 
586 | {not_found, Count} 587 | end; 588 | {ok, <<_:37/binary, DataSize:32>>} -> 589 | NextLoc = next_entry_loc(Loc, DataSize), 590 | find_entry(File, NextLoc, Index, Count+1); 591 | eof -> 592 | {not_found, Count} 593 | end. 594 | 595 | next_entry_loc(Loc, DataSize) -> 596 | Loc + ?HEADER_SIZE + DataSize + ?TRAILER_SIZE. 597 | 598 | find_last_entry(_File, WriteLocation) when WriteLocation =< ?FILE_HEADER_SIZE -> 599 | undefined; 600 | find_last_entry(File, WriteLocation) -> 601 | {ok, <<_:32, _:64, EntryStart:64, _/binary>>} = 602 | file:pread(File, WriteLocation - ?TRAILER_SIZE, ?TRAILER_SIZE), 603 | {entry, Entry, _} = read_entry(File, EntryStart), 604 | binary_to_entry(Entry). 605 | 606 | %% @doc This function reads the next entry from the log at the given location 607 | %% and returns {entry, Entry, NewLocation}. If the end of file has been reached, 608 | %% return eof to the client. Errors are fail-fast. 609 | -spec read_entry(file:io_device(), non_neg_integer()) -> 610 | {entry, binary(), non_neg_integer()} | {skip, non_neg_integer()} | eof. 611 | read_entry(File, Location) -> 612 | case file:pread(File, Location, ?HEADER_SIZE) of 613 | {ok, <<_Sha1:20/binary, _Type:8, _Term:64, _Index:64, _DataSize:32>>=Header} -> 614 | read_data(File, Location + ?HEADER_SIZE, Header); 615 | eof -> 616 | eof 617 | end. 618 | 619 | -spec read_data(file:io_device(), non_neg_integer(), binary()) -> 620 | {entry, binary(), non_neg_integer()} | eof. 621 | read_data(File, Location, <>=H) -> 622 | case file:pread(File, Location, Size) of 623 | {ok, Data} -> 624 | %% Fail-fast Integrity check. TODO: Offer user repair options? 625 | Sha1 = crypto:hash(sha, <>), 626 | NewLocation = Location + Size + ?TRAILER_SIZE, 627 | {entry, <>, NewLocation}; 628 | eof -> 629 | eof 630 | end. 631 | 632 | -ifdef(TEST). 633 | -include_lib("eunit/include/eunit.hrl"). 634 | -define(PEER, test). 635 | 636 | cleanup() -> 637 | os:cmd("rm -rf /tmp/rafter_test*"). 
%% REGRESSION: - see https://github.com/andrewjstone/rafter/pull/32
%% Exercises leader appends, follower overwrites, and verifies that the cached
%% config location in the log server state is truncated/reset correctly.
log_overwrite_test() ->
    cleanup(),
    Opts = #rafter_opts{logdir="/tmp"},
    {ok, _Pid} = rafter_log:start_link(?PEER, Opts),
    assert_empty(),

    %% We are appending Entry1 as the leader, so it has no index.
    Entry1 = #rafter_entry{type=config, term=1, index=undefined,
                           cmd=#config{state=stable}},
    assert_leader_append(1, 1, Entry1),
    ConfigLoc0 = assert_stable_config(),

    Entry2 = #rafter_entry{type=noop, term=1, index=undefined, cmd=noop},
    assert_leader_append(2, 1, Entry2),
    ConfigLoc1 = assert_stable_config(),
    ?assertEqual(ConfigLoc0, ConfigLoc1),

    %% A new leader takes over and this log gets its entry overwritten.
    %% In reality index 1 will always be a #config{}, but this validates the
    %% test that config gets reset.
    Entry = #rafter_entry{type=noop, term=2, index=1, cmd=noop},
    assert_follower_append(Entry),
    assert_blank_config(),

    %% This peer becomes leader again and appends 2 configs
    Entry3 = #rafter_entry{type=config, term=3, cmd=#config{state=stable}},
    assert_leader_append(2, 3, Entry3),
    ConfigLoc2 = assert_stable_config(),

    Entry4 = #rafter_entry{type=config, term=3, cmd=#config{state=stable}},
    assert_leader_append(3, 3, Entry4),
    ConfigLoc3 = assert_stable_config(),
    ?assertNotEqual(ConfigLoc2, ConfigLoc3),

    %% A new leader takes over and truncates the last config
    Entry5 = #rafter_entry{type=noop, term=4, index=3, cmd=noop},
    assert_follower_append(Entry5),
    ConfigLoc4 = assert_stable_config(),
    ?assertEqual(ConfigLoc2, ConfigLoc4),
    Index = rafter_log:get_last_index(?PEER),
    %% FIX: ?assertEqual takes (Expected, Actual) — expected value comes first,
    %% otherwise failure reports are misleading.
    ?assertEqual(3, Index),
    {ok, Entry6} = rafter_log:get_last_entry(?PEER),
    ?assertEqual(Entry5, Entry6),

    %% A new leader takes over and truncates the last stable config
    %% New config is at position 0
    Entry7 = #rafter_entry{type=noop, term=5, index=2, cmd=noop},
    assert_follower_append(Entry7),
    assert_blank_config(),
    Index2 = rafter_log:get_last_index(?PEER),
    ?assertEqual(2, Index2),
    {ok, Entry8} = rafter_log:get_last_entry(?PEER),
    ?assertEqual(Entry7, Entry8),

    rafter_log:stop(?PEER).

%% Append as leader and verify index/term bookkeeping on the written entry.
assert_leader_append(ExpectedIndex, ExpectedTerm, Entry) ->
    {ok, Index} = rafter_log:append(?PEER, [Entry]),
    ?assertEqual(ExpectedIndex, Index),
    {ok, Entry1} = rafter_log:get_entry(?PEER, Index),
    {ok, Entry1} = rafter_log:get_last_entry(?PEER),
    Index = rafter_log:get_last_index(?PEER),
    %% FIX: expected value goes first in ?assertEqual.
    ?assertEqual(ExpectedIndex, Entry1#rafter_entry.index),
    ?assertEqual(ExpectedTerm, Entry1#rafter_entry.term).

assert_follower_append(Entry) ->
    %% Note that follower appends always have indexes since they are sent
    %% from the leader who has already written the entry to its log.
    Index = Entry#rafter_entry.index,
    {ok, Index} = rafter_log:check_and_append(?PEER, [Entry], Index),
    {ok, Entry1} = rafter_log:get_entry(?PEER, Index),
    ?assertEqual(Entry, Entry1).

%% Config must be blank and the cached config location reset to 0.
assert_blank_config() ->
    Config = rafter_log:get_config(?PEER),
    ?assertEqual(blank, Config#config.state),
    State = sys:get_state(logname(?PEER)),
    ?assertEqual(0, State#state.config_loc).

%% Config must be stable; returns its location so callers can compare.
assert_stable_config() ->
    Config = rafter_log:get_config(?PEER),
    ?assertEqual(stable, Config#config.state),
    State = sys:get_state(logname(?PEER)),
    ConfigLoc = State#state.config_loc,
    ?assertNotEqual(0, ConfigLoc),
    ConfigLoc.

assert_empty() ->
    ?assertEqual({ok, not_found}, rafter_log:get_last_entry(?PEER)),
    ?assertEqual(0, rafter_log:get_last_index(?PEER)),
    assert_blank_config().

-endif.
-------------------------------------------------------------------------------- /src/consensus/cr_paxon.erl: --------------------------------------------------------------------------------
-module(cr_paxon).
-author('Uenishi Kota').
-behaviour(gen_fsm).
-compile(export_all).
-include("cr.hrl").
-export(?GEN_FSM).

-export([nil/2,
         preparing/2,
         proposing/2,
         acceptor/2,
         learner/2,
         decided/2]).

-define(TIMEOUT, 3000).

%% subject: the topic being agreed on; n: current proposal number;
%% all/quorum: cluster size and majority threshold; current: responses so far;
%% return_pids: clients notified once a value is decided.
-record(state, {subject, n, value,
                all, quorum, current=0, others, init_n,
                return_pids=[]}).

version_info() -> {?MODULE, 1}.

start(S, InitN, V, Others, ReturnPids) ->
    All = length(Others)+1, Quorum = All / 2,
    InitStateData = #state{subject=S, n=InitN, value=V,
                           all=All, quorum=Quorum, others=Others, init_n=InitN,
                           return_pids=ReturnPids},
    gen_fsm:start_link(
        generate_global_address(node(), S),   %% FsmName (globally registered per subject)
        ?MODULE,
        InitStateData,
        [{timeout, ?TIMEOUT}]).

stop(S) -> gen_fsm:send_all_state_event(generate_global_address(node(), S), stop).
get_result(S) -> gen_fsm:sync_send_all_state_event(generate_global_address(node(), S), result).

init(InitStateData) ->
    io:format("~p ~p: ~p~n", [?MODULE, started, InitStateData]),
    process_flag(trap_exit, true),
    {ok,
     nil,            %% initial statename
     InitStateData,  %% initial state data
     ?TIMEOUT        %% initial state timeout
    }.

%% Send Message to the FSM for subject S on every other node.
broadcast(Others, S, Message) ->
    PaxosOthers = [generate_global_address(P, S) || P <- Others],
    lists:map(fun(Other) -> gen_fsm:send_event(Other, Message) end, PaxosOthers).

send(Node, S, Message) -> gen_fsm:send_event(generate_global_address(Node, S), Message).
%% Next proposal number in this node's ballot sequence (round-robin by All).
get_next_n(N, All) -> ((N div All)+1) * All.
generate_global_address(Node, Subject) -> {global, {?MODULE, Node, Subject}}.

nil({prepare, {S, N, _V, From}}, StateData) when N > StateData#state.n ->
    send(From, S, {prepare_result, {S, 0, nil, node()}}),
    NewStateData = StateData#state{n=N},
    {next_state, acceptor, NewStateData, ?TIMEOUT};
nil({prepare, {S, N, _V, From}}, StateData) when N < StateData#state.n ->
    send(From, S, {prepare_result, {S, StateData#state.n, StateData#state.value, node()}}),
    {next_state, nil, StateData, ?TIMEOUT};
nil({decide, {S, N, V, _From}}, StateData) -> % when N == Nc
    S = StateData#state.subject,   %% assertion: decide must match our subject
    decided_callback(StateData#state{n=N, value=V});
nil(timeout, StateData) ->
    %% Nobody contacted us; start a prepare round with a fresh proposal number.
    NewN = get_next_n(StateData#state.n, StateData#state.all) + StateData#state.init_n,
    io:format("PAXON ~p. ~n", [[NewN, StateData]]),
    S = StateData#state.subject,
    V = StateData#state.value,
    Result = broadcast(StateData#state.others, S, {prepare, {S, NewN, V, node()}}),
    io:format("BROADCAST: ~p. ~n", [Result]),
    {next_state, preparing, StateData#state{n=NewN, current=1}, ?TIMEOUT};
nil(UnknownEvent, StateData) -> % ignore
    io:format("unknown event: ~p, ~p : all ignored.~n", [UnknownEvent, StateData]),
    {next_state, nil, StateData, ?TIMEOUT}.

preparing({prepare, {S, N, _V, From}}, StateData) when N < StateData#state.n ->
    send(From, S, {prepare, {S, StateData#state.n, StateData#state.value, node()}}),
    {next_state, preparing, StateData, ?TIMEOUT};
preparing({prepare, {S, N, V, From}}, StateData) when N > StateData#state.n ->
    send(From, S, {prepare_result, {S, StateData#state.n, StateData#state.value, node()}}),
    io:format("sending prepare_result and going acceptor...~n", []),
    {next_state, acceptor, StateData#state{subject=S, n=N, value=V}, ?TIMEOUT};
preparing({prepare_result, {S, N, V, From}}, StateData) when N > StateData#state.n ->
    send(From, S, {prepare_result, {S, N, V, node()}}),
    {next_state, acceptor, StateData#state{subject=S, n=N, value=V}, ?TIMEOUT};
preparing({prepare_result, {S, _N, _V, _From}}, StateData) when StateData#state.current > StateData#state.quorum ->
    broadcast(StateData#state.others, S, {propose, {S, StateData#state.n, StateData#state.value, node()}}),
    {next_state, proposing, StateData#state{current=1}, ?TIMEOUT};
preparing({prepare_result, {S, N, V, _From}}, StateData)
        when S == StateData#state.subject, N == StateData#state.n, V == StateData#state.value ->
    Current = StateData#state.current,
    {next_state, proposing, StateData#state{current=Current+1}, ?TIMEOUT};
preparing({prepare_result, {S, N, _V, _From}}, StateData) when N < StateData#state.n ->
    case (StateData#state.current + 1 > StateData#state.quorum) of
        true ->
            io:format("got quorum at prepare!~n", []),
            broadcast(StateData#state.others, S, {propose, {S, StateData#state.n, StateData#state.value, node()}}),
            {next_state, proposing, StateData#state{current=1}, ?TIMEOUT};
        false ->
            Current = StateData#state.current,
            {next_state, preparing, StateData#state{current=Current+1}, ?TIMEOUT}
    end;
preparing({propose, {S, N, V, From}}, StateData) when N > StateData#state.n ->
    send(From, S, {propose_result, {S, N, V, node()}}),
    {next_state, learner, StateData#state{n=N, value=V}, ?TIMEOUT};
preparing({propose_result, {S, N, V, From}}, StateData) when N > StateData#state.n ->
    send(From, S, {propose_result, {S, N, V, node()}}),
    {next_state, learner, StateData#state{n=N, value=V}, ?TIMEOUT};
preparing({decide, {_S, N, V, _From}}, StateData) ->
    decided_callback(StateData#state{n=N, value=V});

preparing(timeout, StateData) ->
    {next_state, nil, StateData#state{current=0}, ?TIMEOUT}.

proposing({prepare, {S, N, V, From}}, StateData) when N > StateData#state.n ->
    send(From, S, {prepare_result, {S, StateData#state.n, StateData#state.value, node()}}),
    {next_state, acceptor, StateData#state{n=N, value=V}, ?TIMEOUT};
proposing({prepare_result, {S, N, V, From}}, StateData) when N > StateData#state.n ->
    send(From, S, {prepare_result, {S, StateData#state.n, StateData#state.value, node()}}),
    {next_state, acceptor, StateData#state{n=N, value=V}, ?TIMEOUT};
proposing({propose, {S, N, V, From}}, StateData) when N > StateData#state.n ->
    send(From, S, {propose_result, {S, StateData#state.n, StateData#state.value, node()}}),
    {next_state, learner, StateData#state{n=N, value=V}, ?TIMEOUT};
proposing({propose_result, {S, N, V, _From}}, StateData)
        when N == StateData#state.n, V == StateData#state.value, StateData#state.quorum > StateData#state.current+1 ->
    S = StateData#state.subject,
    Current = StateData#state.current,
    {next_state, proposing, StateData#state{current=Current+1}, ?TIMEOUT};
proposing({propose_result, {S, N, V, _From}}, StateData) when N == StateData#state.n, V == StateData#state.value ->
    io:format("PROPOSING quorum result~n", []),
    broadcast(StateData#state.others, S, {decide, {S, N, V, node()}}),
    Current = StateData#state.current,
    decided_callback(StateData#state{current=Current+1});
proposing({propose_result, {S, N, V, From}}, StateData) when N > StateData#state.n ->
    send(From, S, {propose_result, {S, N, V, node()}}),
    {next_state, learner, StateData#state{n=N, value=V}, ?TIMEOUT};
proposing({decide, {S, N, V, _From}}, StateData) when N >= StateData#state.n ->
    S = StateData#state.subject,
    decided_callback(StateData#state{n=N, value=V});
proposing(timeout, StateData) ->
    io:format("PROPOSING timeout state: ~p~n", [StateData]),
    {next_state, nil, StateData#state{current=1}, ?TIMEOUT};
proposing(_Event, StateData) ->
    {next_state, proposing, StateData}.

acceptor({prepare, {S, N, _V, From}}, StateData) when N < StateData#state.n ->
    send(From, S, {prepare_result, {S, StateData#state.n, StateData#state.value, node()}}),
    {next_state, acceptor, StateData, ?TIMEOUT};
acceptor({prepare, {S, N, V, From}}, StateData) when N >= StateData#state.n ->
    send(From, S, {prepare_result, {S, StateData#state.n, StateData#state.value, node()}}),
    {next_state, acceptor, StateData#state{n=N, value=V}, ?TIMEOUT};
acceptor({propose, {S, N, _V, _From}}, StateData) when N < StateData#state.n ->
    io:format("bad state: ~p (N,Nc)=(~p)~n", [{propose}, {N, StateData#state.n}]),
    S = StateData#state.subject,
    %% BUG FIX: `propose` is not a state of this FSM (states are nil/preparing/
    %% proposing/acceptor/learner/decided); transitioning to it would crash the
    %% gen_fsm on the next event. Stay in acceptor instead.
    {next_state, acceptor, StateData, ?TIMEOUT};
acceptor({propose, {S, N, V, From}}, StateData) when N > StateData#state.n ->
    send(From, S, {propose_result, {S, StateData#state.n, StateData#state.value, node()}}),
    {next_state, learner, StateData#state{n=N, value=V}, ?TIMEOUT};
acceptor({propose, {S, N, V, From}}, StateData) -> % when N == Nc
    {N, V} = {StateData#state.n, StateData#state.value},   %% assertion
    send(From, S, {propose_result, {S, StateData#state.n, StateData#state.value, node()}}),
    {next_state, learner, StateData, ?TIMEOUT};
acceptor({decide, {S, N, V, _From}}, StateData) when N >= StateData#state.n ->
    S = StateData#state.subject,
    decided_callback(StateData#state{n=N, value=V});
acceptor(timeout, StateData) ->
    io:format("ACCEPTOR timeout: ~p (N,V)=(~p)~n", [{propose}, {StateData#state.n, StateData#state.value}]),
    {next_state, nil, StateData#state{current=1}, ?TIMEOUT};

acceptor(_Event, StateData) ->
    io:format("ACCEPTOR unknown event: ~p ,~p~n", [_Event, StateData]),
    {next_state, acceptor, StateData}.

learner({prepare, {S, N, V, From}}, StateData) when N > StateData#state.n ->
    send(From, S, {prepare_result, {S, N, V, node()}}),
    {next_state, acceptor, StateData#state{n=N, value=V}, ?TIMEOUT};
learner({prepare_result, {S, _N, _V, _From}}, StateData) ->
    S = StateData#state.subject,
    {next_state, learner, StateData, ?TIMEOUT};
learner({propose, {S, N, _V, _From}}, StateData) when N < StateData#state.n ->
    S = StateData#state.subject,
    {next_state, learner, StateData, ?TIMEOUT};
learner({propose, {S, N, V, From}}, StateData) when N > StateData#state.n ->
    send(From, S, {propose_result, {S, N, V, node()}}),
    {next_state, learner, StateData#state{n=N, value=V}, ?TIMEOUT};
learner({decide, {S, N, V, _From}}, StateData) when N >= StateData#state.n ->
    S = StateData#state.subject,
    decided_callback(StateData#state{n=N, value=V});
learner(timeout, StateData) ->
    {next_state, nil, StateData#state{current=0}, ?TIMEOUT};
learner(_Event, StateData) ->
    {next_state, learner, StateData}.

%% Terminal state: answer any message with the decided value, then stop on idle.
decided({_Message, {S, _N, _V, From}}, StateData) ->
    send(From, S, {decide, {S, StateData#state.n, StateData#state.value, node()}}),
    {next_state, decided, StateData, ?TIMEOUT};
decided(timeout, StateData) ->
    io:format("PAXON mediation: ~p/~p~n", [StateData#state.value, StateData#state.n]),
    {stop, normal, StateData}.

decided_callback(StateData) ->
    callback(StateData#state.subject, StateData#state.value, StateData#state.return_pids),
    {next_state, decided, StateData, ?TIMEOUT}.

callback(S, V, ReturnPids) ->
    lists:map(fun(ReturnPid) -> ReturnPid ! {self(), result, {S, V}} end, ReturnPids).

%% BUG FIX: code_change/4 must return {ok, NextStateName, NewStateData},
%% not a bare `ok`, or a release upgrade kills the FSM.
code_change(_OldVsn, StateName, StateData, _Extra) -> {ok, StateName, StateData}.
handle_event(stop, _StateName, StateData) -> {stop, normal, StateData}.
%% BUG FIX: handle_info/3 must return a state tuple; `ok` crashes the FSM on
%% any stray message (we trap exits in init/1, so 'EXIT' messages do arrive).
handle_info(_Info, StateName, StateData) -> {next_state, StateName, StateData, ?TIMEOUT}.
%% BUG FIX: the value lives in StateData (the #state{} record); StateName is a
%% bare atom, so StateName#state.value was a guaranteed badrecord crash.
handle_sync_event(result, _From, StateName, StateData) ->
    {reply, {StateName, StateData#state.value}, StateName, StateData};
%% BUG FIX: previous code returned {stop, From, StateName, StateData}, using
%% the caller as the stop reason and the state name as the reply. Reply `ok`
%% and stop normally.
handle_sync_event(stop, _From, _StateName, StateData) ->
    {stop, normal, ok, StateData}.
terminate(Reason, StateName, StateData) ->
    io:format("Module ~p terminated with reason: ~p~n", [?MODULE, Reason]),
    io:format("State ~p with data: ~p~n", [StateName, StateData]),
    ok.
-------------------------------------------------------------------------------- /src/consensus/cr_rafter.erl: --------------------------------------------------------------------------------
-module(cr_rafter).
-author('Andrew J. Stone').
-description('RAFT protocol').
-behaviour(gen_fsm).
-include("rafter.hrl").
-include("rafter_consensus_fsm.hrl").
-include("rafter_opts.hrl").
-include("cr.hrl").
-export(?GEN_FSM).
-compile(export_all).
-export([follower/2, follower/3, candidate/2, candidate/3, leader/2, leader/3]).

-define(CLIENT_TIMEOUT, 2000).
-define(ELECTION_TIMEOUT_MIN, 500).
-define(ELECTION_TIMEOUT_MAX, 1000).
-define(HEARTBEAT_TIMEOUT, 100).

start_link({Index,Node}, Opts) ->
    %% NOTE(review): the concatenated name is computed but registration uses
    %% Node directly — confirm whether {local,Name} was intended.
    _Name = list_to_atom(lists:concat([Index,':',Node])),
    io:format("RAFTER start_link ~p~n",[{Index,Node}]),
    gen_fsm:start_link({local,Node},?MODULE, [Node, Opts], []).

%% NOTE(review): list_to_atom on dynamic input grows the atom table; fine for
%% a fixed cluster, unsafe for unbounded names.
raftname(Name) -> list_to_atom(lists:concat(["rafter:",Name])).
%% Load persisted metadata, initialise the backend state machine, and start as
%% a follower. If a config already exists on disk, initial configuration is
%% considered complete.
init([Me, #rafter_opts{state_machine=StateMachine,cluster=Nodes}]) ->
    Timer = gen_fsm:send_event_after(election_timeout(), timeout),
    #meta{voted_for=VotedFor, term=Term} = cr_log:get_metadata(Me),
    BackendState = StateMachine:init(Me),
    io:format("RAFTER INIT Me: ~p~n",[Me]),
    BaseState = #state{term=Term,
                       voted_for=VotedFor,
                       me=Me,
                       responses=dict:new(),
                       followers=dict:new(),
                       commit_index = cr_log:get_last_index(cr:node()),
                       timer=Timer,
                       state_machine=StateMachine,
                       backend_state=BackendState},
    Config = cr:config(),
    NewState = case Config#config.state of
                   blank -> BaseState#state{config=Config};
                   _     -> BaseState#state{config=Config, init_config=complete}
               end,
    {ok, follower, NewState}.

%% Client-facing API: every call is routed to the locally registered FSM.
stop(Pid) -> gen_fsm:send_all_state_event({Pid,Pid}, stop).
op(Command) -> gen_fsm:sync_send_event(get_leader(cr:node()), {op, Command}).
op(Peer, Command) -> gen_fsm:sync_send_event({Peer,Peer}, {op, Command}).
read_op(Peer, Command) -> gen_fsm:sync_send_event({Peer,Peer}, {read_op, Command}).
set_config(Peer, Config) -> gen_fsm:sync_send_event({Peer,Peer}, {set_config, Config}).
get_leader(Pid) -> gen_fsm:sync_send_all_state_event({Pid,Pid}, get_leader).
send(To, Msg) -> catch gen_fsm:send_event({To,To}, Msg).
send_sync(To, Msg) -> Timeout=100, gen_fsm:sync_send_event(To, Msg, Timeout).
format_status(_, [_, State]) -> Data = lager:pr(State, ?MODULE), [{data, [{"StateData", Data}]}].

handle_event(stop, _, State) ->
    {stop, normal, State};
handle_event(_Event, _StateName, State) ->
    {stop, {error, badmsg}, State}.

handle_sync_event(get_leader, _, StateName, State=#state{leader=Leader}) ->
    {reply, Leader, StateName, State};
handle_sync_event(_Event, _From, _StateName, State) ->
    {stop, badmsg, State}.

%% A pending read request timed out: notify the client and drop the request
%% from the per-clock orddict.
handle_info({client_read_timeout, Clock, Id}, StateName,
            #state{read_reqs=Reqs}=State) ->
    ClientRequests = orddict:fetch(Clock, Reqs),
    {ok, ClientReq} = find_client_req(Id, ClientRequests),
    send_client_timeout_reply(ClientReq),
    NewClientRequests = delete_client_req(Id, ClientRequests),
    NewReqs = orddict:store(Clock, NewClientRequests, Reqs),
    {next_state, StateName, State#state{read_reqs=NewReqs}};

%% A pending write request timed out; it may already have been answered.
handle_info({client_timeout, Id}, StateName, #state{client_reqs=Reqs}=State) ->
    case find_client_req(Id, Reqs) of
        {ok, ClientReq} ->
            send_client_timeout_reply(ClientReq),
            {next_state, StateName,
             State#state{client_reqs=delete_client_req(Id, Reqs)}};
        not_found ->
            {next_state, StateName, State}
    end;
handle_info(_, _, State) ->
    {stop, badmsg, State}.

terminate(_, _, _) ->
    ok.

code_change(_OldVsn, StateName, State, _Extra) ->
    {ok, StateName, State}.

%%=============================================================================
%% States
%%
%% Note: All RPC's and client requests get answered in State/3 functions.
%% RPC Responses get handled in State/2 functions.
%%=============================================================================

%% Election timeout fired: become candidate, but only if this peer has a vote
%% in the current configuration.
follower(timeout, #state{config=Config, me=Me}=State0) ->
    io:format("RAFTER FOLLOWER timeout~n",[]),
    case cr_config:has_vote(Me, Config) of
        false ->
            Restarted = reset_timer(election_timeout(), State0),
            {next_state, follower, Restarted#state{leader=undefined}};
        true ->
            {next_state, candidate, become_candidate(State0)}
    end;

%% Stale vote / append replies are simply dropped in follower state.
follower(#vote{}, State) ->
    io:format("RAFTER FOLLOWER #vote~n",[]),
    {next_state, follower, State};
follower(#append_entries_rpy{}, State) ->
    {next_state, follower, State}.

%% A candidate asked for our vote.
follower(#request_vote{}=RequestVote, _From, State) ->
    io:format("RAFTER FOLLOWER #req_vote~n",[]),
    handle_request_vote(RequestVote, State);

%% Reject append_entries from a leader with an older term.
follower(#append_entries{term=Term}, _From,
         #state{term=CurrentTerm, me=Me}=State) when CurrentTerm > Term ->
    Rpy = #append_entries_rpy{from=Me, term=CurrentTerm, success=false},
    io:format("RAFTER FOLLOWER #append Me: ~p success: false~n",[Me]),
    {reply, Rpy, follower, State};

%% Valid leader append: run the consistency check and, on success, append the
%% entries and advance our commit index.
follower(#append_entries{term=Term, from=From, prev_log_index=PrevLogIndex,
                         entries=Entries, commit_index=CommitIndex,
                         send_clock=Clock}=AppendEntries,
         _From, #state{me=Me}=State) ->
    State2 = set_term(Term, State),
    Rpy = #append_entries_rpy{send_clock=Clock,
                              term=Term,
                              success=false,
                              from=Me},
    %% Always reset the election timer here, since the leader is valid,
    %% but may have conflicting data to sync
    State3 = reset_timer(election_timeout(), State2),
    case consistency_check(AppendEntries, State3) of
        false ->
            {reply, Rpy, follower, State3};
        true ->
            {ok, CurrentIndex} = cr_log:check_and_append(Me,Entries, PrevLogIndex+1),
            Config = cr_log:get_config(Me),
            NewRpy = Rpy#append_entries_rpy{success=true, index=CurrentIndex},
            State4 = commit_entries(CommitIndex, State3),
            State5 = State4#state{leader=From, config=Config},
            {reply, NewRpy, follower, State5}
    end;

%% Client requests while following: error if no leader is known, otherwise
%% redirect the caller to the current leader.
follower({set_config, _}, _From, #state{leader=undefined, me=Me, config=C}=State) ->
    io:format("RAFTER FOLLOWER set_config ~p~n",[Me]),
    {reply, {error, no_leader_error(Me, C)}, follower, State};

follower({set_config, _}, _From, #state{leader=Leader}=State) ->
    io:format("RAFTER FOLLOWER set_config ~p~n",[Leader]),
    {reply, {error, {redirect, Leader}}, follower, State};

follower({read_op, _}, _From, #state{me=Me, config=Config, leader=undefined}=State) ->
    io:format("RAFTER FOLLOWER read_op ~p~n",[Me]),
    {reply, {error, no_leader_error(Me, Config)}, follower, State};

follower({read_op, _}, _From, #state{leader=Leader}=State) ->
    io:format("RAFTER FOLLOWER read_op~n",[]),
    {reply, {error, {redirect, Leader}}, follower, State};

follower({op, _Command}, _From, #state{me=Me, config=Config, leader=undefined}=State) ->
    io:format("RAFTER FOLLOWER read_op~n",[]),
    {reply, {error, no_leader_error(Me, Config)}, follower, State};

follower({op, _Command}, _From, #state{leader=Leader}=State) ->
    io:format("RAFTER FOLLOWER read_op~n",[]),
    {reply, {error, {redirect, Leader}}, follower, State}.

%% This is the initial election to set the initial config. We did not
%% get a quorum for our votes, so just reply to the user here and keep trying
%% until the other nodes come up.
%% Initial-config election timed out with the client still waiting: reply with
%% an error and keep campaigning without a client attached.
candidate(timeout, #state{term=1, init_config=[_Id, From]}=S) ->
    io:format("RAFTER CANDIDATE timeout ~n",[]),
    State0 = reset_timer(election_timeout(), S),
    gen_fsm:reply(From, {error, peers_not_responding}),
    {next_state, candidate, State0#state{init_config=no_client}};

%% Election timeout elapsed: start a fresh election round.
candidate(timeout, State) ->
    io:format("RAFTER CANDIDATE timeout~n",[]),
    {next_state, candidate, become_candidate(State)};

%% This should only happen if two machines are configured differently during
%% initial configuration such that one configuration includes both proposed leaders
%% and the other only itself. Additionally, there is not a quorum of either
%% configuration's servers running.
%%
%% (i.e. rafter:set_config(b, [k, b, j]), rafter:set_config(d, [i,k,b,d,o]).
%% when only b and d are running.)
%%
%% Thank you EQC for finding this one :)
candidate(#vote{term=VoteTerm, success=false},
          #state{term=Term, init_config=[_Id, From]}=State)
        when VoteTerm > Term ->
    io:format("RAFTER CANDIDATE #vote~n",[]),
    gen_fsm:reply(From, {error, invalid_initial_config}),
    Cleared = State#state{init_config=undefined, config=#config{state=blank}},
    {next_state, follower, step_down(VoteTerm, Cleared)};

%% Our term is stale; fall back to follower.
candidate(#vote{term=VoteTerm, success=false}, #state{term=Term}=State)
        when VoteTerm > Term ->
    io:format("RAFTER CANDIDATE #vote~n",[]),
    {next_state, follower, step_down(VoteTerm, State)};

%% Stale vote from an earlier request — ignore.
candidate(#vote{term=VoteTerm}, #state{term=CurrentTerm}=State)
        when VoteTerm < CurrentTerm ->
    io:format("RAFTER CANDIDATE #vote~n",[]),
    {next_state, candidate, State};

%% Record a refusal.
candidate(#vote{success=false, from=From}, #state{responses=Responses}=State) ->
    NewResponses = dict:store(From, false, Responses),
    io:format("RAFTER CANDIDATE #vote~n",[]),
    {next_state, candidate, State#state{responses=NewResponses}};

%% Record a granted vote and check whether we now hold a quorum.
candidate(#vote{success=true, from=From}, #state{responses=Responses, me=Me,
                                                 config=Config}=State) ->
    io:format("RAFTER CANDIDATE #vote ~p~n",[Config]),
    NewResponses = dict:store(From, true, Responses),
    case cr_config:quorum(Me, Config, NewResponses) of
        true ->
            {next_state, leader, become_leader(State)};
        false ->
            {next_state, candidate, State#state{responses=NewResponses}}
    end.

%% Reject config changes during an election.
%% NOTE(review): this clause replies then names `follower` as the next state
%% while we are mid-election — confirm whether `candidate` was intended.
candidate({set_config, _}, _From, State) ->
    io:format("RAFTER CANDIDATE set_config~n",[]),
    {reply, {error, election_in_progress}, follower, State};

%% A Peer is simultaneously trying to become the leader
%% If it has a higher term, step down and become follower.
candidate(#request_vote{term=RequestTerm}=RequestVote, _From,
          #state{term=Term}=State) when RequestTerm > Term ->
    NewState = step_down(RequestTerm, State),
    io:format("RAFTER CANDIDATE #req_vote~n",[]),
    handle_request_vote(RequestVote, NewState);
candidate(#request_vote{}, _From, #state{term=CurrentTerm, me=Me}=State) ->
    Vote = #vote{term=CurrentTerm, success=false, from=Me},
    io:format("RAFTER CANDIDATE #req_vote~n",[]),
    {reply, Vote, candidate, State};

%% Another peer is asserting itself as leader, and it must be correct because
%% it was elected. We are still in initial config, which must have been a
%% misconfiguration. Clear the initial configuration and step down. Since we
%% still have an outstanding client request for inital config send an error
%% response.
candidate(#append_entries{term=RequestTerm}, _From,
          #state{init_config=[_, Client]}=State) ->
    io:format("RAFTER CANDIDATE #append~n"),
    gen_fsm:reply(Client, {error, invalid_initial_config}),
    %% Set to complete, we don't want another misconfiguration
    State2 = State#state{init_config=complete, config=#config{state=blank}},
    {next_state, follower, step_down(RequestTerm, State2)};

%% Same as the above clause, but we don't need to send an error response.
candidate(#append_entries{term=RequestTerm}, _From,
          #state{init_config=no_client}=State) ->
    %% Set to complete, we don't want another misconfiguration
    io:format("RAFTER CANDIDATE #append~n"),
    State2 = State#state{init_config=complete, config=#config{state=blank}},
    {next_state, follower, step_down(RequestTerm, State2)};

%% Another peer is asserting itself as leader. If it has a current term
%% step down and become follower. Otherwise do nothing.
candidate(#append_entries{term=RequestTerm}, _From, #state{term=CurrentTerm}=State)
        when RequestTerm >= CurrentTerm ->
    io:format("RAFTER CANDIDATE #append~n"),
    {next_state, follower, step_down(RequestTerm, State)};
candidate(#append_entries{}, _From, State) ->
    io:format("RAFTER CANDIDATE #append~n"),
    {next_state, candidate, State};

%% NOTE(review): unreachable — the {set_config, _} clause above matches first.
candidate({set_config, {NewServer, AddRemove}}, From, #state{me=Me, followers=F, term=Term, config=C}=State) ->
    % change_config(NewServer, AddRemove, From, Me, F, Term, C, State, candidate);
    {reply, {error, election_in_progress}, candidate, State};

%% We are in the middle of an election.
%% Leader should always be undefined here.
candidate({read_op, _}, _, #state{leader=undefined}=State) ->
    io:format("RAFTER CANDIDATE read_op~n"),
    {reply, {error, election_in_progress}, candidate, State};
candidate({op, _Command}, _From, #state{leader=undefined}=State) ->
    io:format("RAFTER CANDIDATE op~n"),
    {reply, {error, election_in_progress}, candidate, State}.

%% Newly elected without a waiting client: append the initial config entry and
%% mark initial configuration complete.
leader(timeout, #state{term=Term,
                       init_config=no_client,
                       config=C}=S) ->
    io:format("RAFTER LEADER timeout ~p~n",[no_client]),
    Entry = #rafter_entry{type=config, term=Term, cmd=C},
    State0 = append(Entry, S),
    State = reset_timer(heartbeat_timeout(), State0),
    NewState = State#state{init_config=complete},
    {next_state, leader, NewState};

%% We have just been elected leader because of an initial configuration.
%% Append the initial config and set init_config=complete.
333 | leader(timeout, #state{term=Term, init_config=[Id, From], config=C}=S) -> 334 | io:format("RAFTER LEADER timeout ~p~n",[{Id,From}]), 335 | State0 = reset_timer(heartbeat_timeout(), S), 336 | Entry = #rafter_entry{type=config, term=Term, cmd=C}, 337 | State = append(Id, From, Entry, State0, leader), 338 | NewState = State#state{init_config=complete}, 339 | {next_state, leader, NewState}; 340 | 341 | leader(timeout, State0) -> 342 | State = reset_timer(heartbeat_timeout(), State0), 343 | NewState = send_append_entries(State), 344 | {next_state, leader, NewState}; 345 | 346 | %% We are out of date. Go back to follower state. 347 | leader(#append_entries_rpy{term=Term, success=false}, 348 | #state{term=CurrentTerm}=State) when Term > CurrentTerm -> 349 | NewState = step_down(Term, State), 350 | {next_state, follower, NewState}; 351 | 352 | %% This is a stale reply from an old request. Ignore it. 353 | leader(#append_entries_rpy{term=Term, success=true}, 354 | #state{term=CurrentTerm}=State) when CurrentTerm > Term -> 355 | {next_state, leader, State}; 356 | 357 | %% The follower is not synced yet. Try the previous entry 358 | leader(#append_entries_rpy{from=From, success=false}, 359 | #state{followers=Followers, config=C, me=Me}=State) -> 360 | case lists:member(From, cr_config:followers(Me, C)) of 361 | true -> 362 | NextIndex = decrement_follower_index(From, Followers), 363 | NewFollowers = dict:store(From, NextIndex, Followers), 364 | NewState = State#state{followers=NewFollowers}, 365 | {next_state, leader, NewState}; 366 | false -> 367 | %% This is a reply from a previous configuration. Ignore it. 368 | {next_state, leader, State} 369 | end; 370 | 371 | %% Success! 
372 | leader(#append_entries_rpy{from=From, success=true}=Rpy, 373 | #state{followers=Followers, config=C, me=Me}=State) -> 374 | case lists:member(From, cr_config:followers(Me, C)) of 375 | true -> 376 | NewState = save_rpy(Rpy, State), 377 | State2 = maybe_commit(NewState), 378 | State3 = maybe_send_read_replies(State2), 379 | case State3#state.leader of 380 | undefined -> 381 | %% We just committed a config that doesn't include ourselves 382 | {next_state, follower, State3}; 383 | _ -> 384 | State4 = 385 | maybe_increment_follower_index(From, Followers, State3), 386 | {next_state, leader, State4} 387 | end; 388 | false -> 389 | %% This is a reply from a previous configuration. Ignore it. 390 | {next_state, leader, State} 391 | end; 392 | 393 | %% Ignore stale votes. 394 | leader(#vote{}, State) -> 395 | io:format("RAFTER LEADER #vote~n"), 396 | {next_state, leader, State}. 397 | 398 | %% An out of date leader is sending append_entries, tell it to step down. 399 | leader(#append_entries{term=Term}, _From, #state{term=CurrentTerm, me=Me}=State) 400 | when Term < CurrentTerm -> 401 | Rpy = #append_entries_rpy{from=Me, term=CurrentTerm, success=false}, 402 | io:format("RAFTER LEADER #append~n"), 403 | {reply, Rpy, leader, State}; 404 | 405 | %% We are out of date. Step down 406 | leader(#append_entries{term=Term}, _From, #state{term=CurrentTerm}=State) 407 | when Term > CurrentTerm -> 408 | NewState = step_down(Term, State), 409 | io:format("RAFTER LEADER #append~n"), 410 | {next_state, follower, NewState}; 411 | 412 | %% We are out of date. Step down 413 | leader(#request_vote{term=Term}, _From, #state{term=CurrentTerm}=State) 414 | when Term > CurrentTerm -> 415 | NewState = step_down(Term, State), 416 | io:format("RAFTER LEADER #req_vote~n"), 417 | {next_state, follower, NewState}; 418 | 419 | %% An out of date candidate is trying to steal our leadership role. Stop it. 
420 | leader(#request_vote{}, _From, #state{me=Me, term=CurrentTerm}=State) -> 421 | Rpy = #vote{from=Me, term=CurrentTerm, success=false}, 422 | io:format("RAFTER LEADER #req_vote~n"), 423 | {reply, Rpy, leader, State}; 424 | 425 | leader({set_config, {NewServer, AddRemove}}, From, #state{me=Me, followers=F, term=Term, config=C}=State) -> 426 | change_config(NewServer, AddRemove, From, Me, F, Term, C, State, leader); 427 | 428 | %% Handle client requests 429 | leader({read_op, {Id, Command}}, From, State) -> 430 | NewState = setup_read_request(Id, From, Command, State), 431 | io:format("RAFTER LEADER read_op~n"), 432 | {next_state, leader, NewState}; 433 | 434 | leader({op, {Id, Command}}, From, 435 | #state{term=Term}=State) -> 436 | Entry = #rafter_entry{type=op, term=Term, cmd=Command}, 437 | NewState = append(Id, From, Entry, State, leader), 438 | io:format("RAFTER LEADER op~n"), 439 | {next_state, leader, NewState}. 440 | 441 | %%============================================================================= 442 | %% Internal Functions 443 | %%============================================================================= 444 | 445 | change_config(NewServer, AddRemove, From, Me, F, Term, C, State, FSMState) -> 446 | Id = os:timestamp(), 447 | #config{newservers=PreviousConfiguration} = C, 448 | WithoutNew = lists:delete(NewServer, sets:to_list(sets:from_list(PreviousConfiguration))), 449 | NewServers = case AddRemove of 450 | add -> [NewServer|WithoutNew]; 451 | remove -> WithoutNew end, 452 | io:format("RAFTER LEADER set_config~n~p~n~p~n",[C,NewServers]), 453 | case cr_config:allow_config(C, NewServers) of 454 | true -> 455 | {Followers, Config} = reconfig(Me, F, C, NewServers, State), 456 | Entry = #rafter_entry{type=config, term=Term, cmd=Config}, 457 | NewState0 = State#state{followers=Followers}, 458 | NewState = append(Id, From, Entry, NewState0, leader), 459 | io:format("RAFTER new config: ~p~n",[Config]), 460 | {next_state, FSMState, NewState}; 461 | 
Error -> 462 | io:format("set_config error: ~p~n",[Error]), 463 | {reply, Error, FSMState, State} end. 464 | 465 | no_leader_error(Me, Config) -> 466 | case cr_config:has_vote(Me, Config) of 467 | false -> 468 | not_consensus_group_member; 469 | true -> 470 | election_in_progress 471 | end. 472 | 473 | reconfig(Me, OldFollowers, Config0, NewServers, State) -> 474 | Config = cr_config:reconfig(Config0, NewServers), 475 | NewFollowers = cr_config:followers(Me, Config), 476 | OldSet = sets:from_list([K || {K, _} <- dict:to_list(OldFollowers)]), 477 | NewSet = sets:from_list(NewFollowers), 478 | AddedServers = sets:to_list(sets:subtract(NewSet, OldSet)), 479 | RemovedServers = sets:to_list(sets:subtract(OldSet, NewSet)), 480 | Followers0 = add_followers(AddedServers, OldFollowers, State), 481 | Followers = remove_followers(RemovedServers, Followers0), 482 | {Followers, Config}. 483 | 484 | add_followers(NewServers, Followers, #state{me=Me}) -> 485 | NextIndex = cr_log:get_last_index(Me) + 1, 486 | NewFollowers = [{S, NextIndex} || S <- NewServers], 487 | dict:from_list(NewFollowers ++ dict:to_list(Followers)). 488 | 489 | remove_followers(Servers, Followers0) -> 490 | lists:foldl(fun(S, Followers) -> 491 | dict:erase(S, Followers) 492 | end, Followers0, Servers). 493 | 494 | append(Entry, #state{me=Me}=State) -> 495 | io:format("RAFTER APPEND Me: ~p Entry ~p~n",[Me,Entry]), 496 | {ok, _Index} = cr_log:append(Me, [Entry]), 497 | send_append_entries(State). 498 | 499 | append(Id, From, Entry, State, leader) -> 500 | NewState = append(Id, From, Entry, State), 501 | send_append_entries(NewState). 
502 | 503 | append(Id, From, Entry, 504 | #state{me=Me, term=Term, client_reqs=Reqs}=State) -> 505 | io:format("RAFTER APPEND Me: ~p Entry ~p~n",[Me,Entry]), 506 | {ok, Index} = cr_log:append(Me, [Entry]), 507 | {ok, Timer} = timer:send_after(?CLIENT_TIMEOUT, Me, {client_timeout, Id}), 508 | ClientRequest = #client_req{id=Id, 509 | from=From, 510 | index=Index, 511 | term=Term, 512 | timer=Timer}, 513 | State#state{client_reqs=[ClientRequest | Reqs]}. 514 | 515 | setup_read_request(Id, From, Command, #state{send_clock=Clock, 516 | me=Me, 517 | term=Term}=State) -> 518 | {ok, Timer} = timer:send_after(?CLIENT_TIMEOUT, Me, {client_read_timeout, Clock, Id}), 519 | ReadRequest = #client_req{id=Id, 520 | from=From, 521 | term=Term, 522 | cmd=Command, 523 | timer=Timer}, 524 | NewState = save_read_request(ReadRequest, State), 525 | send_append_entries(NewState). 526 | 527 | save_read_request(ReadRequest, #state{send_clock=Clock, 528 | read_reqs=Requests}=State) -> 529 | NewRequests = 530 | case orddict:find(Clock, Requests) of 531 | {ok, ReadRequests} -> 532 | orddict:store(Clock, [ReadRequest | ReadRequests], Requests); 533 | error -> 534 | orddict:store(Clock, [ReadRequest], Requests) 535 | end, 536 | State#state{read_reqs=NewRequests}. 537 | 538 | send_client_timeout_reply(#client_req{from=From}) -> 539 | gen_fsm:reply(From, {error, timeout}). 540 | 541 | send_client_reply(#client_req{timer=Timer, from=From}, Result) -> 542 | {ok, cancel} = timer:cancel(Timer), 543 | gen_fsm:reply(From, Result). 544 | 545 | find_client_req(Id, ClientRequests) -> 546 | Result = lists:filter(fun(Req) -> 547 | Req#client_req.id =:= Id 548 | end, ClientRequests), 549 | case Result of 550 | [Request] -> 551 | {ok, Request}; 552 | [] -> 553 | not_found 554 | end. 555 | 556 | delete_client_req(Id, ClientRequests) -> 557 | lists:filter(fun(Req) -> 558 | Req#client_req.id =/= Id 559 | end, ClientRequests). 
560 | 561 | find_client_req_by_index(Index, ClientRequests) -> 562 | Result = lists:filter(fun(Req) -> 563 | Req#client_req.index =:= Index 564 | end, ClientRequests), 565 | case Result of 566 | [Request] -> 567 | {ok, Request}; 568 | [] -> 569 | not_found 570 | end. 571 | 572 | delete_client_req_by_index(Index, ClientRequests) -> 573 | lists:filter(fun(Req) -> 574 | Req#client_req.index =/= Index 575 | end, ClientRequests). 576 | 577 | %% @doc Commit entries between the previous commit index and the new one. 578 | %% Apply them to the local state machine and respond to any outstanding 579 | %% client requests that these commits affect. Return the new state. 580 | %% Ignore already committed entries. 581 | commit_entries(NewCommitIndex, #state{commit_index=CommitIndex}=State) 582 | when CommitIndex >= NewCommitIndex -> State; 583 | commit_entries(NewCommitIndex, #state{commit_index=CommitIndex, 584 | state_machine=StateMachine, 585 | backend_state=BackendState, 586 | me=Me}=State) -> 587 | LastIndex = min(cr_log:get_last_index(Me), NewCommitIndex), 588 | lists:foldl(fun(Index, #state{client_reqs=CliReqs}=State1) -> 589 | NewState = State1#state{commit_index=Index}, 590 | case cr_log:get_entry(Me, Index) of 591 | 592 | %% Noop - Ignore this request 593 | {ok, #rafter_entry{type=noop}} -> 594 | NewState; 595 | 596 | %% Normal Operation. Apply Command to StateMachine. 597 | {ok, #rafter_entry{type=op, cmd=Command}} -> 598 | {Result, NewBackendState} = 599 | StateMachine:write(Command, BackendState), 600 | NewState2 = NewState#state{backend_state=NewBackendState}, 601 | maybe_send_client_reply(Index, CliReqs, NewState2, Result); 602 | 603 | %% We have a committed transitional state, so reply 604 | %% successfully to the client. Then set the new stable 605 | %% configuration. 
606 | {ok, #rafter_entry{type=config, 607 | cmd=#config{state=transitional}=C}} -> 608 | S = stabilize_config(C, NewState), 609 | Reply = {ok, S#state.config}, 610 | maybe_send_client_reply(Index, CliReqs, S, Reply); 611 | 612 | %% The configuration has already been set. Initial configuration goes 613 | %% directly to stable state so needs to send a reply. Checking for 614 | %% a client request is expensive, but config changes happen 615 | %% infrequently. 616 | {ok, #rafter_entry{type=config, 617 | cmd=#config{state=stable}}} -> 618 | Reply = {ok, NewState#state.config}, 619 | maybe_send_client_reply(Index, CliReqs, NewState, Reply) 620 | end 621 | end, State, lists:seq(CommitIndex+1, LastIndex)). 622 | 623 | stabilize_config(#config{state=transitional, newservers=New}=C, 624 | #state{me=Me, term=Term}=S) when S#state.leader =:= S#state.me -> 625 | Config = C#config{state=stable, oldservers=New, newservers=[]}, 626 | Entry = #rafter_entry{type=config, term=Term, cmd=Config}, 627 | State = S#state{config=Config}, 628 | {ok, _Index} = cr_log:append(Me, [Entry]), 629 | send_append_entries(State); 630 | stabilize_config(_, State) -> 631 | State. 632 | 633 | maybe_send_client_reply(Index, CliReqs, S, Result) when S#state.leader =:= S#state.me -> 634 | case find_client_req_by_index(Index, CliReqs) of 635 | {ok, Req} -> 636 | send_client_reply(Req, Result), 637 | Reqs = delete_client_req_by_index(Index, CliReqs), 638 | S#state{client_reqs=Reqs}; 639 | not_found -> 640 | S 641 | end; 642 | maybe_send_client_reply(_, _, State, _) -> 643 | State. 644 | 645 | maybe_send_read_replies(#state{me=Me, 646 | config=Config, 647 | send_clock_responses=Responses}=State0) -> 648 | Clock = cr_config:quorum_max(Me, Config, Responses), 649 | {ok, Requests, State} = find_eligible_read_requests(Clock, State0), 650 | NewState = send_client_read_replies(Requests, State), 651 | NewState. 652 | 653 | eligible_request(SendClock) -> 654 | fun({Clock, _}) -> 655 | SendClock > Clock 656 | end. 
657 | 658 | find_eligible_read_requests(SendClock, #state{read_reqs=Requests}=State) -> 659 | EligibleReq = eligible_request(SendClock), 660 | Eligible = lists:takewhile(EligibleReq, Requests), 661 | NewRequests = lists:dropwhile(EligibleReq, Requests), 662 | NewState = State#state{read_reqs=NewRequests}, 663 | {ok, Eligible, NewState}. 664 | 665 | send_client_read_replies([], State) -> 666 | State; 667 | send_client_read_replies(Requests, State=#state{state_machine=StateMachine, 668 | backend_state=BackendState}) -> 669 | NewBackendState = 670 | lists:foldl(fun({_Clock, ClientReqs}, BeState) -> 671 | read_and_send(ClientReqs, StateMachine, BeState) 672 | end, BackendState, Requests), 673 | State#state{backend_state=NewBackendState}. 674 | 675 | read_and_send(ClientRequests, StateMachine, BackendState) -> 676 | lists:foldl(fun(Req, Acc) -> 677 | {Val, NewAcc} = 678 | StateMachine:read(Req#client_req.cmd, Acc), 679 | send_client_reply(Req, Val), 680 | NewAcc 681 | end, BackendState, ClientRequests). 682 | 683 | maybe_commit(#state{me=Me, 684 | commit_index=CommitIndex, 685 | config=Config, 686 | responses=Responses}=State) -> 687 | Min = cr_config:quorum_max(Me, Config, Responses), 688 | case Min > CommitIndex andalso safe_to_commit(Min, State) of 689 | true -> 690 | NewState = commit_entries(Min, State), 691 | case cr_config:has_vote(Me, NewState#state.config) of 692 | true -> 693 | NewState; 694 | false -> 695 | %% We just committed a config that doesn't include ourself 696 | step_down(NewState#state.term, NewState) 697 | end; 698 | false -> 699 | State 700 | end. 701 | 702 | safe_to_commit(Index, #state{term=CurrentTerm, me=Me}) -> 703 | CurrentTerm =:= cr_log:get_term(Me, Index). 704 | 705 | %% We are about to transition to the follower state. Reset the necessary state. 
706 | %% TODO: send errors to any outstanding client read or write requests and cleanup 707 | %% timers 708 | step_down(NewTerm, State0) -> 709 | State = reset_timer(election_timeout(), State0), 710 | NewState = State#state{term=NewTerm, 711 | responses=dict:new(), 712 | leader=undefined}, 713 | set_metadata(undefined, NewState). 714 | 715 | save_rpy(#append_entries_rpy{from=From, index=Index, send_clock=Clock}, 716 | #state{responses=Responses, send_clock_responses=ClockResponses}=State) -> 717 | NewResponses = save_greater(From, Index, Responses), 718 | NewClockResponses = save_greater(From, Clock, ClockResponses), 719 | State#state{responses=NewResponses, send_clock_responses=NewClockResponses}. 720 | 721 | save_greater(Key, Val, Dict) -> save_greater(Key, Val, Dict, dict:find(Key, Dict)). 722 | save_greater(_Key, Val, Dict, {ok, CurrentVal}) when CurrentVal > Val -> Dict; 723 | save_greater(_Key, CurrentVal, Dict, {ok, CurrentVal}) -> Dict; 724 | save_greater(Key, Val, Dict, {ok, _}) -> dict:store(Key, Val, Dict); 725 | save_greater(Key, Val, Dict, error) -> dict:store(Key, Val, Dict). 726 | 727 | handle_request_vote(#request_vote{from=CandidateId, term=Term}=RequestVote, 728 | State) -> 729 | State2 = set_term(Term, State), 730 | {ok, Vote} = vote(RequestVote, State2), 731 | case Vote#vote.success of 732 | true -> 733 | State3 = set_metadata(CandidateId, State2), 734 | State4 = reset_timer(election_timeout(), State3), 735 | {reply, Vote, follower, State4}; 736 | false -> 737 | {reply, Vote, follower, State2} 738 | end. 739 | 740 | set_metadata(CandidateId, State=#state{me=Me, term=Term}) -> 741 | NewState = State#state{voted_for=CandidateId}, 742 | ok = cr_log:set_metadata(Me, CandidateId, Term), 743 | NewState. 
744 | 745 | maybe_increment_follower_index(From, Followers, State=#state{me=Me}) -> 746 | LastLogIndex = cr_log:get_last_index(Me), 747 | {ok, Index} = dict:find(From, Followers), 748 | case Index =< LastLogIndex of 749 | true -> 750 | State#state{followers=dict:store(From, Index+1, Followers)}; 751 | false -> 752 | State 753 | end. 754 | 755 | get_prev(Me, Index) -> 756 | case Index - 1 of 757 | 0 -> 758 | {0, 0}; 759 | PrevIndex -> 760 | {PrevIndex, 761 | cr_log:get_term(Me, PrevIndex)} 762 | end. 763 | 764 | %% TODO: Return a block of entries if more than one exist 765 | get_entries(Me, Index) -> 766 | case cr_log:get_entry(Me, Index) of 767 | {ok, not_found} -> 768 | []; 769 | {ok, Entry} -> 770 | [Entry] 771 | end. 772 | 773 | send_entry(Peer, Index, #state{me=Me, 774 | term=Term, 775 | send_clock=Clock, 776 | commit_index=CIdx}) -> 777 | {PrevLogIndex, PrevLogTerm} = get_prev(Me, Index), 778 | Entries = get_entries(Me, Index), 779 | AppendEntries = #append_entries{term=Term, 780 | from=Me, 781 | prev_log_index=PrevLogIndex, 782 | prev_log_term=PrevLogTerm, 783 | entries=Entries, 784 | commit_index=CIdx, 785 | send_clock=Clock}, 786 | rsend(Peer, AppendEntries). 787 | 788 | send_append_entries(#state{followers=Followers, send_clock=SendClock}=State) -> 789 | NewState = State#state{send_clock=SendClock+1}, 790 | _ = [send_entry(Peer, Index, NewState) || 791 | {Peer, Index} <- dict:to_list(Followers)], 792 | NewState. 793 | 794 | decrement_follower_index(From, Followers) -> 795 | case dict:find(From, Followers) of 796 | {ok, 1} -> 797 | 1; 798 | {ok, Num} -> 799 | Num - 1 800 | end. 801 | 802 | %% @doc Start a process to send a syncrhonous rpc to each peer. Votes will be sent 803 | %% back as messages when the process receives them from the peer. If 804 | %% there is an error or a timeout no message is sent. 
This helps preserve 805 | %% the asynchrnony of the consensus fsm, while maintaining the rpc 806 | %% semantics for the request_vote message as described in the raft paper. 807 | request_votes(#state{config=Config, term=Term, me=Me}) -> 808 | Voters = cr_config:voters(Me, Config), 809 | Msg = #request_vote{term=Term, 810 | from=Me, 811 | last_log_index=cr_log:get_last_index(Me), 812 | last_log_term=cr_log:get_last_term(Me)}, 813 | [rsend(Peer, Msg) || Peer <- Voters]. 814 | 815 | become_candidate(#state{term=CurrentTerm, me=Me}=State0) -> 816 | State = reset_timer(election_timeout(), State0), 817 | State2 = State#state{term=CurrentTerm + 1, 818 | responses=dict:new(), 819 | leader=undefined}, 820 | State3 = set_metadata(Me, State2), 821 | _ = request_votes(State3), 822 | State3. 823 | 824 | become_leader(#state{me=Me, term=Term, init_config=InitConfig}=State) -> 825 | NewState = State#state{leader=Me, 826 | responses=dict:new(), 827 | followers=initialize_followers(State), 828 | send_clock = 0, 829 | send_clock_responses = dict:new(), 830 | read_reqs = orddict:new()}, 831 | 832 | case InitConfig of 833 | complete -> 834 | %% Commit a noop entry to the log so we can move the commit index 835 | Entry = #rafter_entry{type=noop, term=Term, cmd=noop}, 836 | append(Entry, NewState); 837 | _ -> 838 | %% First entry must always be a config entry 839 | NewState 840 | end. 841 | 842 | 843 | initialize_followers(#state{me=Me, config=Config}) -> 844 | Peers = cr_config:followers(Me, Config), 845 | NextIndex = cr_log:get_last_index(Me) + 1, 846 | Followers = [{Peer, NextIndex} || Peer <- Peers], 847 | dict:from_list(Followers). 848 | 849 | %% There is no entry at t=0, so just return true. 
850 | consistency_check(#append_entries{prev_log_index=0, 851 | prev_log_term=0}, _State) -> 852 | true; 853 | consistency_check(#append_entries{prev_log_index=Index, 854 | prev_log_term=Term}, #state{me=Me}) -> 855 | case cr_log:get_entry(Me, Index) of 856 | {ok, not_found} -> 857 | false; 858 | {ok, #rafter_entry{term=Term}} -> 859 | true; 860 | {ok, #rafter_entry{term=_DifferentTerm}} -> 861 | false 862 | end. 863 | 864 | set_term(Term, #state{term=CurrentTerm}=State) when Term < CurrentTerm -> State; 865 | set_term(Term, #state{term=CurrentTerm}=State) when Term > CurrentTerm -> set_metadata(undefined, State#state{term=Term}); 866 | set_term(Term, #state{term=Term}=State) -> State. 867 | 868 | vote(#request_vote{term=Term}, #state{term=CurrentTerm, me=Me}) 869 | when Term < CurrentTerm -> 870 | fail_vote(CurrentTerm, Me); 871 | vote(#request_vote{from=CandidateId, term=CurrentTerm}=RequestVote, 872 | #state{voted_for=CandidateId, term=CurrentTerm, me=Me}=State) -> 873 | maybe_successful_vote(RequestVote, CurrentTerm, Me, State); 874 | vote(#request_vote{term=CurrentTerm}=RequestVote, 875 | #state{voted_for=undefined, term=CurrentTerm, me=Me}=State) -> 876 | maybe_successful_vote(RequestVote, CurrentTerm, Me, State); 877 | vote(#request_vote{from=CandidateId, term=CurrentTerm}, 878 | #state{voted_for=AnotherId, term=CurrentTerm, me=Me}) 879 | when AnotherId =/= CandidateId -> 880 | fail_vote(CurrentTerm, Me). 881 | 882 | maybe_successful_vote(RequestVote, CurrentTerm, Me, State) -> 883 | case candidate_log_up_to_date(RequestVote, State) of 884 | true -> 885 | successful_vote(CurrentTerm, Me); 886 | false -> 887 | fail_vote(CurrentTerm, Me) 888 | end. 889 | 890 | candidate_log_up_to_date(#request_vote{last_log_term=CandidateTerm, 891 | last_log_index=CandidateIndex}, 892 | #state{me=Me}) -> 893 | candidate_log_up_to_date(CandidateTerm, 894 | CandidateIndex, 895 | cr_log:get_last_term(Me), 896 | cr_log:get_last_index(Me)). 
897 | 898 | candidate_log_up_to_date(CandidateTerm, _CandidateIndex, LogTerm, _LogIndex) when CandidateTerm > LogTerm -> true; 899 | candidate_log_up_to_date(CandidateTerm, _CandidateIndex, LogTerm, _LogIndex) when CandidateTerm < LogTerm -> false; 900 | candidate_log_up_to_date(Term, CandidateIndex, Term, LogIndex) when CandidateIndex > LogIndex -> true; 901 | candidate_log_up_to_date(Term, CandidateIndex, Term, LogIndex) when CandidateIndex < LogIndex -> false; 902 | candidate_log_up_to_date(Term, Index, Term, Index) -> true. 903 | 904 | successful_vote(CurrentTerm, Me) -> {ok, #vote{term=CurrentTerm, success=true, from=Me}}. 905 | fail_vote(CurrentTerm, Me) -> {ok, #vote{term=CurrentTerm, success=false, from=Me}}. 906 | election_timeout() -> crypto:rand_uniform(?ELECTION_TIMEOUT_MIN, ?ELECTION_TIMEOUT_MAX). 907 | heartbeat_timeout() -> ?HEARTBEAT_TIMEOUT. 908 | 909 | reset_timer(Duration, State=#state{timer=Timer}) -> 910 | _ = gen_fsm:cancel_timer(Timer), 911 | NewTimer = gen_fsm:send_event_after(Duration, timeout), 912 | State#state{timer=NewTimer}. 913 | 914 | rsend(To, #request_vote{from=From}=Msg) -> rsend(To, From, Msg); 915 | rsend(To, #append_entries{from=From}=Msg) -> rsend(To, From, Msg). 916 | rsend(To, From, Msg) -> 917 | spawn(fun() -> 918 | case cr_rafter:send_sync({To,To}, Msg) of 919 | Rpy when is_record(Rpy, vote) orelse 920 | is_record(Rpy, append_entries_rpy) -> 921 | cr_rafter:send(From, Rpy); 922 | E -> 923 | io:format("Error sending ~p to To ~p: ~p", [Msg, To, E]) 924 | end 925 | end). 926 | -------------------------------------------------------------------------------- /src/consensus/cr_replication.erl: -------------------------------------------------------------------------------- 1 | -module(cr_replication). 2 | -description('RAFT protocol replication log backend'). 3 | -behaviour(rafter_backend). 4 | -export([init/1, stop/1, read/2, write/2]). 5 | -record(state, {peer :: atom() | {atom(), atom()}}). 
6 | 7 | % Issue commands only if you want them to be saved in cluster status log. 8 | 9 | init(Peer) -> 10 | State = #state{peer=Peer}, 11 | NewState = stop(State), 12 | _Tid1 = ets:new(rafter, [set, named_table, public]), 13 | _Tid2 = ets:new(rafter_tables, [set, named_table, public]), 14 | io:format("RAFTER BACK INIT ~p~n~p~n",[Peer,{_Tid1,_Tid2}]), 15 | 16 | NewState. 17 | 18 | stop(State) -> 19 | catch ets:delete(rafter), 20 | catch ets:delete(rafter_tables), 21 | State. 22 | 23 | read({get, Table, Key}, State) -> 24 | io:format("CONS GET: ~p~n",[{Table, Key}]), 25 | Val = try 26 | case ets:lookup(Table, Key) of 27 | [{Key, Value}] -> 28 | {ok, Value}; 29 | [] -> 30 | {ok, not_found} 31 | end 32 | catch _:E -> 33 | {error, E} 34 | end, 35 | {Val, State}; 36 | read(list_tables, State) -> 37 | io:format("CONS DIR~n",[]), 38 | {{ok, [Table || {Table} <- ets:tab2list(rafter_tables)]}, 39 | State}; 40 | read({list_keys, Table}, State) -> 41 | io:format("CONS ALL: ~p~n",[{Table}]), 42 | Val = try 43 | list_keys(Table) 44 | catch _:E -> 45 | {error, E} 46 | end, 47 | {Val, State}; 48 | read(_, State) -> 49 | {{error, ets_read_badarg}, State}. 
50 | 51 | write({new, Name}, State) -> 52 | io:format("CONS NEW: ~p~n",[{Name}]), 53 | Val = try 54 | _Tid = ets:new((Name), [ordered_set, named_table, public]), 55 | ets:insert(rafter_tables, {Name}), 56 | {ok, Name} 57 | catch _:E -> 58 | {error, E} 59 | end, 60 | {Val, State}; 61 | 62 | write({put, Table, Key, Value}, State) -> 63 | io:format("CONS PUT: ~p~n",[{Table, Key, Value}]), 64 | Val = try 65 | ets:insert(Table, {Key, Value}), 66 | {ok, Value} 67 | catch _:E -> 68 | {error, E} 69 | end, 70 | {Val, State}; 71 | write({delete, Table}, State) -> 72 | io:format("CONS DELETE: ~p~n",[{Table}]), 73 | Val = 74 | try 75 | ets:delete(Table), 76 | ets:delete(rafter_tables, Table), 77 | {ok, true} 78 | catch _:E -> 79 | {error, E} 80 | end, 81 | {Val, State}; 82 | write({delete, Table, Key}, State) -> 83 | io:format("CONS DELETE: ~p~n",[{Table,Key}]), 84 | Val = try 85 | {ok, ets:delete(Table, Key)} 86 | catch _:E -> 87 | {error, E} 88 | end, 89 | {Val, State}; 90 | write(Data, State) -> 91 | io:format("CONS WRITE: ~p~n",[{Data}]), 92 | {{error, ets_write_badarg}, State}. 93 | 94 | list_keys(Table) -> 95 | list_keys(ets:first(Table), Table, []). 96 | 97 | list_keys('$end_of_table', _Table, Keys) -> 98 | {ok, Keys}; 99 | list_keys(Key, Table, Keys) -> 100 | list_keys(ets:next(Table, Key), Table, [Key | Keys]). 101 | -------------------------------------------------------------------------------- /src/cr.app.src: -------------------------------------------------------------------------------- 1 | {application, cr, 2 | [ 3 | {description, "Chain Replication"}, 4 | {vsn, "0.1"}, 5 | {registered, []}, 6 | {applications, [kernel,stdlib,kvs]}, 7 | {mod, { cr_app, []}} 8 | ]}. 9 | -------------------------------------------------------------------------------- /src/cr.erl: -------------------------------------------------------------------------------- 1 | -module(cr). 2 | -description('Distributed Transaction Coordinator'). 3 | -copyright('Maxim Sokhatsky'). 
4 | -include("cr.hrl"). 5 | -include_lib("db/include/transaction.hrl"). 6 | -include("rafter.hrl"). 7 | -compile(export_all). 8 | -compile({no_auto_import,[node/0]}). 9 | 10 | main(A) -> mad_repl:main(A,[]). 11 | 12 | encode(Msg) -> term_to_binary(Msg). 13 | decode(Bin) -> binary_to_term(Bin). 14 | 15 | set_socket(Pid, Socket) when is_pid(Pid), is_port(Socket) -> gen_fsm:send_event(Pid, {socket_ready, Socket}). 16 | send(Pid, Message) when is_pid(Pid) -> gen_fsm:send_event(Pid, {out, Message}). 17 | 18 | config() -> {ok,Peers} = application:get_env(cr,peers), 19 | N = lists:map(fun({N,_,_,_})->N end,Peers), 20 | #config{state=stable,oldservers=N,newservers=N}. 21 | local(Object) -> {I,N}=lists:keyfind(cr:nodex(cr:node()),2,cr:chain(Object)), 22 | {I,P,_,_}=lists:keyfind(I,1,supervisor:which_children(vnode_sup)), P. 23 | secret() -> application:get_env(cr,secret,<<"ThisIsClassified">>). 24 | peers() -> {ok,Peers}=application:get_env(cr,peers),Peers. 25 | peers(N) -> lists:zip(lists:seq(1,N),lists:seq(1,N)). 26 | hash(Object) -> hd(seq(Object)). 27 | rep(Object) -> roll(element(2,hash(Object))). 28 | roll(N) -> lists:seq(N,length(peers())) ++ lists:seq(1,N-1). 29 | seq(Object) -> lists:keydelete(0,1,cr_hash:succ(cr_hash:key_of(Object),ring())). 30 | peer({I,N}) -> element(1,lists:nth(N,peers())). 31 | nodex(Node) -> string:str(cr:peers(),[lists:keyfind(Node,1,cr:peers())]). 32 | node() -> list_to_atom(lists:concat([os:getenv("NAME"),'@127.0.0.1'])). 33 | vpid({I,Node}) -> {I,P,_,_}=lists:keyfind(I,1,supervisor:which_children({vnode_sup,Node})), P. 34 | ring() -> ring(4). 35 | ring(C) -> {Nodes,[{0,1}|Rest]} = cr_hash:fresh(length(peers())*C,1), 36 | {Nodes,[{0,0}|lists:map(fun({{I,1},X})->{I,(X-1) div C+1} end, 37 | lists:zip(Rest,lists:seq(1,length(Rest))))]}. 38 | 39 | chain(Object) -> 40 | {N,_} = cr:ring(), 41 | lists:map(fun(X) -> lists:nth((X-1)*4+1,cr:seq(Object)) end, 42 | cr:roll(element(2,cr:hash(Object)))). 
43 | 44 | tx(Record) when is_tuple(Record) -> 45 | gen_server:cast(local(Record), 46 | {client,{self(),os:timestamp()}, 47 | chain(element(2,Record)), 48 | Record}). 49 | 50 | stack(Error, Reason) -> 51 | Stacktrace = [case A of 52 | { Module,Function,Arity,Location} -> 53 | { Module,Function,Arity,proplists:get_value(line, Location) }; 54 | Else -> Else end 55 | || A <- erlang:get_stacktrace()], 56 | [Error, Reason, Stacktrace]. 57 | 58 | error_page(Class,Error) -> 59 | io_lib:format("ERROR: ~w:~w~n~n",[Class,Error]) ++ 60 | "STACK: " ++ 61 | [ io_lib:format("\t~w:~w/~w:~w\n", 62 | [ Module,Function,Arity,proplists:get_value(line, Location) ]) 63 | || { Module,Function,Arity,Location} <- erlang:get_stacktrace() ]. 64 | 65 | test() -> test(10). 66 | test(Num) -> 67 | O1 = lists:foldl(fun({_,_,_,A,_,_},Acc) -> A+Acc end,0,kvs:all(log)), 68 | T1 = length(kvs:all(transaction)), 69 | io:format("Already in Database: ~p~n" 70 | "New record will be applied: ~p~n",[O1,Num]), 71 | [cr:tx(#transaction{id=kvs:next_id(transaction,1)})||I<-lists:seq(1,Num)], 72 | O2 = lists:foldl(fun({_,_,_,A,_,_},Acc) -> A+Acc end,0,kvs:all(log)), 73 | {transactions,T2 = length(kvs:all(transaction))}. 74 | 75 | log_size({I,N}) -> 76 | {ok,Log} = kvs:get(log,{I,N}), 77 | {Log#log.top,length(kvs:entries({ok,Log},operation,-1))}. 78 | 79 | dump() -> 80 | {N,Nodes} = cr:ring(), 81 | io:format("~52w ~3w ~2w ~10w ~11w~n",[vnode,i,n,top,latency]), 82 | [ begin 83 | {A,B} = rpc(rpc:call(cr:peer({I,N}),cr,log_size,[{I,N}])), 84 | {Min,Max,Avg} = latency({I,N}), 85 | L = lists:concat([Min,'/',Max,'/',Avg]), 86 | io:format("~52w ~3w ~2w ~10w ~11s~n",[I,P,N,A,L]) 87 | end || {{I,N},P} <- lists:zip(lists:keydelete(0,1,Nodes),lists:seq(1,length(Nodes)-1))], 88 | ok. 89 | 90 | string(O) -> 91 | lists:concat(lists:flatten([lists:map(fun(undefined) -> ''; (X) -> [X,':'] end, tuple_to_list(O))])). 
92 | 93 | dump(N) when N < 13 -> {_,X} = cr:ring(), 94 | Nodes = lists:keydelete(0,1,X), 95 | {I,P} = lists:nth(N,Nodes), 96 | Pos = string:str(Nodes,[{I,P}]), 97 | {ok,C} = rpc:call(cr:peer({I,P}),kvs,get,[log,{I,P}]), 98 | dump_op(Pos,rpc(rpc:call(cr:peer({I,P}),kvs,entries,[C,operation,10]))); 99 | 100 | dump(N) -> {_,X} = cr:ring(), 101 | Nodes = lists:keydelete(0,1,X), 102 | {ok,Oo} = kvs:get(operation,N), 103 | {I,P} = lists:keyfind(element(1,Oo#operation.feed_id),1,Nodes), 104 | Pos = string:str(Nodes,[{I,P}]), 105 | dump_op(Pos,kvs:traversal(operation,Oo#operation.id,10,#iterator.prev)). 106 | 107 | dump_op(Pos,List) -> 108 | io:format("~50s ~10w ~10w ~4w ~10w~n",[operation,id,prev,i,size]), 109 | [ io:format("~50s ~10w ~10w ~4w ~10w~n",[ 110 | string(Tx), 111 | element(2,O), 112 | rpc(element(#iterator.prev,O)), 113 | rpc(Pos), 114 | size(term_to_binary(O))]) 115 | || #operation{name=Name,body={Cmd,_,Chain,Tx}}=O <- List], 116 | ok. 117 | 118 | latency({I,N}) -> gen_server:call(cr:vpid({I,cr:peer({I,N})}),{latency}). 119 | 120 | rpc(undefined) -> []; 121 | rpc({badrpc,_}) -> {error,error}; 122 | rpc(Value) -> Value. 123 | 124 | clean() -> kvs:destroy(), kvs:join(). 125 | 126 | log_modules() -> [cr,cr_log,cr_rafter,cr_heart]. 127 | 128 | sup() -> [{T,Pid}||{T,Pid,_,_}<-supervisor:which_children(cr_sup)]. 129 | heart() -> [{_,P,_,_}]=supervisor:which_children(heart_sup), gen_server:call(P,{heart}). 130 | local() -> [{I,P}||{I,P,_,_} <- supervisor:which_children(vnode_sup)]. 131 | 132 | % Integrity Functions 133 | 134 | % consensus_log checks that the length of RAFT log is the same on all nodes. 135 | % node_log checks that the sum of chains of all vnodes equals the the overal operations counts. 
% operation_log checks that on all nodes all operations logs are ok
% cluster_status checks that all logs on all nodes are ok

%% consensus_log/0: every peer's RAFT last index must match ours.
consensus_log() ->
    Entries = cr_log:get_last_index(cr:node()),
    case lists:all(fun({H, _, _, _}) ->
                       rpc:call(H, cr_log, get_last_index, [H]) == Entries
                   end, cr:peers()) of
        true  -> {ok, Entries};
        false -> {error, consensus_log}
    end.

%% node_log/0: sum of the per-log counters must equal the total
%% operation count on this node.
%% NOTE(review): the original also evaluated
%%     length(kvs:entries(kvs:get(log,Id),operation,-1)) == Num
%% inside the begin..end but discarded the boolean, so the per-log
%% entry count was never actually verified. The dead expression is
%% removed here (observable behavior unchanged); if per-log
%% verification was intended it must be asserted explicitly.
node_log() ->
    Operations = length(kvs:all(operation)),
    Sum = lists:sum([Num || {log, _Id, _, Num, _, _} <- kvs:all(log)]),
    case Sum == Operations of
        true  -> {ok, Operations};
        false -> {error, node_log}
    end.

%% operation_log/0: every peer must report the same operation count.
operation_log() ->
    Operations = length(kvs:all(operation)),
    case lists:all(fun({H, _, _, _}) ->
                       case rpc:call(H, cr, node_log, []) of
                           {ok, Operations} -> true;   % bound: equality match
                           _ -> false
                       end
                   end, cr:peers()) of
        true  -> {ok, Operations};
        false -> {error, operation_log}
    end.

%% cluster_status/0: crash (badmatch) unless both checks succeed.
cluster_status() ->
    {ok, _} = consensus_log(),
    {ok, _} = operation_log().

%% ===================== src/cr_app.erl =====================

-module(cr_app).
-behaviour(application).
-export([start/2, stop/1]).
-copyright('Maxim Sokhatsky').
-include("rafter_opts.hrl").
-compile(export_all).  % NOTE(review): avoid export_all in production code

%% Child-spec helpers for the top-level supervisor.

%% TCP listener gen_server.
tcp(Name, Port, Mod, Nodes) ->
    {Name, {cr_tcp, start_link, [Name, Port, Mod, Nodes]},
     permanent, 2000, worker, [cr_tcp]}.

%% Empty one_for_one pool supervisor (children added dynamically).
pool(SupName) ->
    {SupName, {supervisor, start_link, [{local, SupName}, cr_connection, []]},
     permanent, infinity, supervisor, []}.

%% One ring-replica worker backed by the cr_kvs storage module.
vnode({I, N}) ->
    {I, {cr_vnode, start_link, [{I, N}, cr_kvs]},
     permanent, 2000, worker, [cr_vnode]}.

%% Cluster heartbeat monitor.
heart(Nodes) ->
    {heart, {cr_heart, start_link, ["heart", Nodes]},
     permanent, 2000, worker, [cr_heart]}.
%% RAFT log child for the local node.
log({_I, N}, Nodes) ->
    {cr_log:logname(N), {cr_log, start_link, [N, #rafter_opts{cluster = Nodes}]},
     permanent, 2000, worker, [cr_log]}.

%% RAFT consensus fsm child for the local node.
rafter({I, N}, Nodes) ->
    {N, {cr_rafter, start_link,
         [{I, N}, #rafter_opts{state_machine = cr_replication, cluster = Nodes}]},
     permanent, 2000, worker, [cr_rafter]}.

%% Supervisor callback: RAFT log + consensus fsm for the local node,
%% a listener/pool pair per protocol, plus the heart and vnode pools.
init([Nodes, Opts]) ->
    {ok, {{one_for_one, 5, 60},
          lists:flatten([log({0, N}, Nodes)    || {N, _, _, _} <- Nodes, N == cr:node()]
                     ++ [rafter({0, N}, Nodes) || {N, _, _, _} <- Nodes, N == cr:node()]
                     ++ [protocol(O, Nodes)    || O <- Opts]
                     ++ [pool(heart_sup)]
                     ++ [pool(vnode_sup)])}}.

stop(_) -> ok.

start() -> start(normal, []).
%% Application entry: read peer config, start the supervision tree,
%% start the vnodes owned by this node, then launch the heart monitor.
start(_, _) ->
    io:format("Node: ~p~n", [cr:node()]),
    {ok, Peers} = application:get_env(cr, peers),
    {_Node, P1, P2, P3} = lists:keyfind(cr:node(), 1, Peers),
    {_, VNodes} = cr:ring(),
    kvs:join(),
    Sup = supervisor:start_link({local, cr_sup}, ?MODULE,
              [Peers, [{interconnect, P1, cr_interconnect},
                       {ping,         P2, cr_ping},
                       {client,       P3, cr_client}]]),
    io:format("Supervision: ~p~n", [supervisor:which_children(cr_sup)]),
    [start_vnode({Index, Node}, Peers)
     || {Index, Node} <- VNodes, Node == cr:nodex(cr:node())],
    spawn(fun() -> supervisor:start_child(heart_sup, heart(Peers)) end),
    Sup.

protocol({Name, Port, Mod}, Nodes) ->
    SupName = list_to_atom(lists:concat([Name, '_', sup])),
    [tcp(Name, Port, Mod, Nodes),   % TCP listener gen_server
     pool(SupName)].                % accepted-clients supervisor

%% Index 0 is the ring origin — no vnode is started for it.
start_vnode({0, _Name}, _Peers) -> skip;
start_vnode({Index, Name}, _) ->
    supervisor:start_child(vnode_sup, vnode({Index, Name})).

%% ===================== src/cr_hash.erl =====================

-module(cr_hash).
-description('Consistent Hash Ring').
-copyright('Synrc Research Center s.r.o.').
-compile(export_all).
-define(RINGTOP, trunc(math:pow(2,160)-1)). % SHA-1 space

% Our consistent ring hash module consists of five functions
% Why need we have more?

%% key_of/1: 160-bit SHA-1 digest of the term — a point on the ring.
key_of(Object) -> crypto:hash(sha, term_to_binary(Object)).

%% inc/1: ring-segment width for N vnodes.
inc(N) -> ?RINGTOP div N.

%% fresh/2: N evenly spaced ring positions, all owned by Seed.
fresh(N, Seed) ->
    {N, [{Int, Seed} || Int <- lists:seq(0, (?RINGTOP - 1), inc(N))]}.

%% succ/2: preference list for key Idx — the ring rotated so the
%% successor vnode of Idx comes first.
%% NOTE(review): the binary pattern was garbled in the source dump
%% ("<> = Idx", leaving Int unbound). Reconstructed as a 160-bit
%% integer match — the only shape consistent with key_of/1 producing
%% a 20-byte SHA-1 digest.
succ(Idx, {N, Nodes}) ->
    <<Int:160/integer>> = Idx,
    {A, B} = lists:split((Int div inc(N)) + 1, Nodes),
    B ++ A.

%% ===================== src/cr_heart.erl =====================

-module(cr_heart).
-description('Heart Monitor').
-author('Maxim Sokhatsky').
-copyright('Synrc Research Center s.r.o.').
-include("cr.hrl").
-include("rafter.hrl").
-compile(export_all).
-record(state, {name, nodes, timers}).
-export(?GEN_SERVER).

%% Heart Monitor module is a single process, monitoring other cluster peers.
%% The Configuration of Ring is tracked by RAFT protocol and its log.

start_link(Name, Nodes) ->
    gen_server:start_link(?MODULE, [Name, Nodes], []).

%% init/1: schedule one ping timer per remote peer; each timer message
%% carries the connect target {Addr,PingPort} and the peer node name.
%% NOTE(review): Addr is parsed from OUR OWN node name's host part,
%% not the peer's — works only when all peers share one host; confirm.
init([Name, Nodes]) ->
    Timers = [begin
                  [_, Addr] = string:tokens(atom_to_list(cr:node()), "@"),
                  {ok, Parsed} = inet:parse_address(Addr),
                  Timer = erlang:send_after(1000, self(),
                              {timer, ping, {Parsed, P2}, Node, undefined}),
                  {Node, Timer}
              end || {Node, _, P2, _} <- Nodes, Node /= cr:node()],
    io:format("HEART PROTOCOL: started: ~p~n"
              "Nodes: ~p~n", [Name, Timers]),
    {ok, #state{name = Name, nodes = Nodes, timers = Timers}}.

%% timer_restart/4: re-arm the ping timer after an {H,M,S} delay.
timer_restart(Diff, Connect, Node, Socket) ->
    {X, Y, Z} = Diff,
    erlang:send_after(1000 * (1 + Z + 60 * Y + 60 * 60 * X), self(),
                      {timer, ping, Connect, Node, Socket}).

%% setkey/4: lists:keyreplace that inserts when the key is absent.
setkey(Name, Pos, List, New) ->
    case lists:keyfind(Name, Pos, List) of
        false    -> [New | List];
        _Element -> lists:keyreplace(Name, Pos, List, New)
    end.
handle_info({'EXIT', _Pid, _}, #state{} = State) ->
    io:format("HEART: EXIT~n", []),
    {noreply, State};

handle_info({carrier, lost, N}, State = #state{}) ->
    io:format("HOST CARRIER LOST ~p~n", [N]),
    {noreply, State};

%% Ping one peer: reuse (or re-open) its TCP socket, send {ping} and
%% expect a 10-byte {pong}. On the outcome either add or remove the
%% peer in the RAFT server configuration, then re-arm the timer.
handle_info({timer, ping, {A, P}, N, S}, State = #state{timers = Timers}) ->
    #config{newservers = Servers} = cr_log:get_config(cr:node()),

    {N, Timer} = lists:keyfind(N, 1, Timers),
    case Timer of undefined -> skip; _ -> erlang:cancel_timer(Timer) end,

    %% best-effort send over the cached socket, reconnecting on failure
    Socket = try gen_tcp:send(S, term_to_binary({ping})), S
             catch _:_ ->
                 case gen_tcp:connect(A, P, [{packet, 0}, {active, false}]) of
                     {ok, S1} -> gen_tcp:send(S1, term_to_binary({ping})), S1;
                     {error, _SErr} -> undefined
                 end
             end,

    %% a wrong-sized reply (or recv on `undefined`) falls through the
    %% case and is converted to {error,recv} by the surrounding try
    Data = try case gen_tcp:recv(Socket, 0) of
                   {error, _RErr} -> {error, undefined};
                   {ok, Pong} when length(Pong) == 10 -> {ok, Socket}
               end
           catch _:_ -> {error, recv}
           end,

    {T, Operation, Online} =
        case Data of
            {error, _} -> {timer_restart({0,0,5}, {A,P}, N, undefined), remove, undefined};
            {ok, Sx}   -> {timer_restart({0,0,5}, {A,P}, N, Sx), add, Sx}
        end,

    case change(S, Online, N, Servers) of
        true ->
            try
                case cr_rafter:set_config(cr:node(), {N, Operation}) of
                    {error, _} -> skip;
                    _ -> io:format("Server Config Changed S/T ~p~n",
                                   [{N, Operation}])
                end
            catch _:Err -> io:format("CONFIG ERROR ~p~n", [Err])
            end,
            ok;
        false -> skip
    end,

    {noreply, State#state{timers = setkey(N, 1, Timers, {N, T})}};

handle_info(_Info, State) ->
    io:format("HEART: Info ~p~n", [_Info]),
    {noreply, State}.

%% {heart} returns the whole server state for inspection (see cr:heart/0).
handle_call({heart}, _, Proc) ->
    {reply, Proc, Proc};

handle_call(Request, _, Proc) ->
    io:format("HEART: Call ~p~n", [Request]),
    {reply, ok, Proc}.
handle_cast(Msg, State) ->
    io:format("HEART: Cast ~p", [Msg]),
    {stop, {error, {unknown_cast, Msg}}, State}.

terminate(_Reason, #state{}) -> ok.
code_change(_OldVsn, State, _Extra) -> {ok, State}.

%% change(OldSocket, NewSocket, Node, Servers) -> boolean().
%% Decide whether the peer's ring membership must be updated:
%% still-offline peers only matter if they are listed as servers;
%% any online/offline transition or socket change triggers an update.
change(undefined, undefined, N, Servers) -> lists:member(N, Servers);
change(undefined, _, _, _) -> true;
change(_, undefined, _, _) -> true;
change(A, A, _, _) -> false;
change(_, _, _, _) -> true.

%% ===================== src/cr_vnode.erl =====================

-module(cr_vnode).
-description('Ring Replica').
-author('Maxim Sokhatsky').
-copyright('Synrc Research Center s.r.o.').
-include("cr.hrl").
-include_lib("kvs/include/kvs.hrl").
-include_lib("db/include/transaction.hrl").
-compile(export_all).
-record(state, {name, nodes, storage, latency = {inf,0,0,0}}). % latency {min,max,sum,count}
-export(?GEN_SERVER).

%% Ring Replica vnode is single point of execution inside CR DHT.
%% Each Node in Cluster has several replica vnodes.

start_link(Name, Storage) ->
    gen_server:start_link(?MODULE, [Name, Storage], []).

%% init/1: replay any operations still pending in this vnode's log.
%% NOTE(review): Name is the {I,N} tuple, which gen_server:cast will
%% interpret as {RegisteredName, Node} — confirm that is intended.
init([Name, Storage]) ->
    [gen_server:cast(Name, O)
     || O <- kvs:entries(kvs:get(log, {pending, Name}), operation, -1)],
    io:format("VNODE PROTOCOL: started: ~p.~n", [Name]),
    {ok, #state{name = Name, storage = Storage}}.

handle_info({'EXIT', _Pid, _}, #state{} = State) ->
    io:format("VNODE: EXIT~n", []),
    {noreply, State};

handle_info(_Info, State) ->
    % io:format("VNODE: Info ~p~n",[_Info]),
    {noreply, State}.
%% kvs_log/2: persist the incoming XA message as a pending operation in
%% the RAFT log, then cast the saved record to ourselves for replay.
kvs_log({Cmd, _Self, [{_I, _N} | _T], _Tx} = Message, #state{name = Name}) ->
    % io:format("XA RECEIVE: ~p~n",[{element(2,_Tx),Message,Name}]),
    Operation = #operation{name = Cmd, body = Message,
                           feed_id = Name, status = pending},
    %% was: kvs:add(Operation#operation{id=kvs:next_id(operation,1)})
    {ok, Saved} = cr_log:kvs_log(cr:node(), Operation),
    try gen_server:cast(self(), Saved)
    catch E:R -> io:format("LOG ERROR ~p~n", [cr:stack(E, R)])
    end.

%% continuation/3: forward the command to the next vnode in the chain;
%% an empty chain means the transaction has fully traversed the ring.
continuation(_Next, {_, _, [], _Tx}, State) -> {noreply, State};
continuation(Next, {_Cmd, _Sender, [{I, N} | _T], _Tx} = Command, State) ->
    Peer = cr:peer({I, N}),
    Vpid = cr:vpid({I, Peer}),
    case gen_server:cast(Vpid, {pending, Command}) of
        ok -> % io:format("XA SENT OK from ~p to ~p~n",[cr:node(),Peer]),
              {noreply, State};
        _Error ->
              timer:sleep(1000),
              continuation(Next, Command, State)
    end.

%% Synchronous enqueue of a chain command: log it and ack the caller.
handle_call({pending, {_Cmd, _Self, [{_I, _N} | _T], _Tx} = Message}, _, State) ->
    kvs_log(Message, State),
    {reply, {ok, queued}, State};

%% Report latency in milliseconds; before any sample Min is the atom
%% 'inf', so the division raises and the raw triple is returned.
handle_call({latency}, _, #state{latency = {Min, Max, Avg, N}} = State) ->
    L = try {Min div 1000, Max div 1000, Avg div (N * 1000)}
        catch _:_ -> {Min, Max, Avg}
        end,
    {reply, L, State};

handle_call(Request, _, Proc) ->
    io:format("VNODE: Call ~p~n", [Request]),
    {reply, ok, Proc}.

%% Client entry point: route a prepare to the head of the chain —
%% locally when we own it, otherwise to the owning peer's vnode.
handle_cast({client, Client, Chain, Record}, State) ->
    {I, N} = hd(Chain),
    Self = cr:node(),
    gen_server:cast(case cr:peer({I, N}) of
                        Self -> cr:local(Record);
                        Node -> cr:vpid({I, Node})
                    end,
                    {pending, {prepare, Client, Chain, Record}}),
    {noreply, State};

handle_cast({pending, {_Cmd, _Self, [{_I, _N} | _T], _Tx} = Message}, State) ->
    kvs_log(Message, State),
    {noreply, State};

%% Replay a logged operation, then either roll back on failure,
%% finish (we are the chain tail) or forward down the chain.
handle_cast(#operation{name = Command, body = Message} = Operation,
            #state{name = Name} = State) ->
    {Command, Sender, [H | T] = Chain, Tx} = Message,
    Replay = try cr_log:kvs_replay(cr:node(), Operation, State, status(Command))
             catch E:R -> % io:format("~p REPLAY ~p~n",[code(Command),cr:stack(E,R)]),
                          {rollback, {E, R}, Chain, Tx}
             end,
    {Forward, Latency} =
        case [Chain, Replay] of
            [_, A = {rollback, _, _, _}] -> {A, State#state.latency};
            [[Name], _]                  -> last(Operation, State);
            [[H | T], _]                 -> {{Command, Sender, T, Tx},
                                             State#state.latency}
        end,
    try continuation(H, Forward, State)
    catch X:Y -> io:format("~p SEND ~p~n", [code(Command), cr:stack(X, Y)])
    end,
    {noreply, State#state{latency = Latency}}.

terminate(_Reason, #state{}) -> ok.
code_change(_OldVsn, State, _Extra) -> {ok, State}.

%% status/1: map a command to the status stored with the log entry.
status(commit)  -> commited;   % sic — misspelled atom kept: it is persisted data
status(prepare) -> prepared;
status(Unknown) -> Unknown.

% XA PROTOCOL
% last(#operation{body={prepare,{Sender,Time},_,Tx}},S) -> {{commit,{Sender,Time},cr:chain(element(2,Tx)),Tx},S#state.latency};
% last(#operation{body={commit,{Sender,Time},_,Tx}},S) -> {{nop,{Sender,Time},[],[]},new_latency(Time,S)};

% CR PROTOCOL
%% last/2: chain tail reached — emit a nop and record the latency.
last(#operation{body = {_, {Sender, Time}, _, _Tx}}, S) ->
    {{nop, {Sender, Time}, [], []}, new_latency(Time, S)}.
%% new_latency/2: fold one round-trip (microseconds) into the
%% {Min, Max, Sum, Count} accumulator. Before the first sample Min is
%% the atom 'inf', which compares greater than any number, so the
%% first measurement always replaces it.
new_latency(Time, #state{latency = {Min, Max, Avg, N}}) ->
    L = time_diff(Time, os:timestamp()),
    {NMin, NMax} = if
                       L > Max -> {Min, L};
                       L < Min -> {L, Max};
                       true    -> {Min, Max}
                   end,
    {NMin, NMax, Avg + L, N + 1}.

%% ms/1: erlang timestamp -> microseconds.
ms({Mega, Sec, Micro}) -> (Mega * 1000000 + Sec) * 1000000 + Micro.
time_diff(Then, Now) -> ms(Now) - ms(Then).

code(prepare)  -> "PREPARE";
code(commit)   -> "COMMIT";
code(rollback) -> "ROLLBACK";
code(Unknown)  -> Unknown.

%% ===================== src/tcp/cr_client.erl =====================

-module(cr_client).
-copyright('Maxim Sokhatsky').
-include("cr.hrl").
-compile(export_all).
-record(state, {succ, pred, port, name, socket, module, nodes}).

sup() -> client_sup.

init([Name, Mod, Socket, Nodes]) ->
    #state{name = Name, module = Mod, socket = Socket, nodes = Nodes}.

%% dispatch/2: run the transaction through the ring and echo the
%% result back over the client's TCP socket.
dispatch({transaction, Object}, #state{socket = Socket} = State) ->
    Result = cr:tx(Object),
    gen_tcp:send(Socket, term_to_binary(Result)),
    State;

dispatch(_, State) -> State.

%% ===================== src/tcp/cr_connection.erl =====================

-module(cr_connection).
-copyright('Synrc Research Center s.r.o.').
-include("cr.hrl").
-behaviour(gen_fsm).
-compile(export_all).
-record(state, {port, name, socket, module, peer, state, nodes}).
-export(?GEN_FSM).
-export([listen/2, transfer/2]).
-define(TIMEOUT, 10000).
%% start_connection/3: register an accepted socket as a child of the
%% protocol's pool supervisor; the child key embeds the peer address
%% so every connection gets a unique name.
start_connection(Module, Socket, Nodes) ->
    {ok, {IP, Port}} =
        try inet:peername(Socket)
        catch _:_ ->
            %% peername fails on an already-closed socket; fall back to
            %% a unique placeholder so the child name stays unique.
            %% (was now(), deprecated since OTP 18)
            {ok, {{127,0,0,1}, erlang:unique_integer([positive])}}
        end,
    Restart = permanent,
    Shutdown = 2000,
    UniqueName = {Module, IP, Port},
    ChildSpec = {UniqueName,
                 {cr_connection, start_link, [UniqueName, Module, Socket, Nodes]},
                 Restart, Shutdown, worker, [Module]},
    Sup = supervisor:start_child(Module:sup(), ChildSpec),
    io:format("SERVER: starting ~p listener: ~p~n",
              [Sup, {Module, IP, Port, Socket}]),
    Sup.

%% FSM state 'listen': waiting for the listener to hand over the socket.
listen({socket_ready, Socket}, State) ->
    io:format("SERVER: Socket Ready ~p~n", [Socket]),
    inet:setopts(Socket, [{active, once}, {packet, 0}, binary]),
    {next_state, transfer, State#state{socket = Socket}, ?TIMEOUT};

listen(Other, State) ->
    io:format("SERVER: Unexpected message during listening ~p~n", [Other]),
    {next_state, listen, State, ?TIMEOUT}.

%% FSM state 'transfer': stream protocol messages in both directions,
%% delegating decoded input to the protocol module's dispatch/2.
transfer({in, Binary}, #state{state = SubState, module = Module} = State) ->
    % io:format("SERVER: RECV ~p~n", [Binary]),
    NewSubState = Module:dispatch(cr:decode(Binary), SubState),
    {next_state, transfer, State#state{state = NewSubState}, ?TIMEOUT};

transfer({out, Message}, #state{socket = Socket} = State) ->
    % io:format("SERVER: SEND ~p~n", [Message]),
    gen_tcp:send(Socket, cr:encode(Message)),
    {next_state, transfer, State, ?TIMEOUT};

transfer(timeout, State) ->
    % idle longer than ?TIMEOUT — drop the connection
    {stop, normal, State};

transfer(_Data, State) ->
    io:format("SERVER: unknown Data during transfer: ~p\n", [_Data]),
    {stop, normal, State}.

start_link(Name, Mod, Socket, Nodes) ->
    gen_fsm:start_link(?MODULE, [Name, Mod, Socket, Nodes], []).
%% init/1 with [] is the pool-supervisor callback (this module doubles
%% as the supervisor callback module — see cr_app:pool/1): one_for_one
%% with no static children.
init([]) ->
    RestartStrategy = one_for_one,
    MaxRestarts = 1,
    MaxSecondsBetweenRestarts = 600,
    SupFlags = {RestartStrategy, MaxRestarts, MaxSecondsBetweenRestarts},
    {ok, {SupFlags, []}};

%% gen_fsm callback for one accepted connection.
init([Name, Mod, Socket, Nodes]) ->
    io:format("PROTOCOL: starting ~p listener: ~p~n", [self(), {Name, Mod}]),
    process_flag(trap_exit, true),
    {ok, listen, #state{module = Mod, name = Name,
                        socket = Socket, nodes = Nodes,
                        state = Mod:init([Name, Mod, Socket, Nodes])}}.

%% Re-arm {active,once} and feed the bytes into the current FSM state.
handle_info({tcp, Socket, Bin}, StateName, State) ->
    inet:setopts(Socket, [{active, once}]),
    ?MODULE:StateName({in, Bin}, State);

handle_info({tcp_closed, _S}, _, State) ->
    io:format("SERVER: TCP closed~n", []),
    {stop, normal, State};

handle_info({'EXIT', _Pid, _}, StateName, #state{} = State) ->
    io:format("SERVER: EXIT~n", []),
    {next_state, StateName, State};

%% BUGFIX: gen_fsm's handle_info/3 must return {next_state, ...}; the
%% original returned {noreply, StateName, State} (a gen_server shape),
%% which would crash the FSM on any unexpected message.
handle_info(_Info, StateName, State) ->
    io:format("SERVER: Info ~p~n", [_Info]),
    {next_state, StateName, State}.

handle_event(Event, StateName, State) ->
    {stop, {StateName, undefined_event, Event}, State}.
handle_sync_event(Event, _From, StateName, State) ->
    {stop, {StateName, undefined_event, Event}, State}.
terminate(_Reason, _StateName, #state{socket = Socket}) -> gen_tcp:close(Socket).
code_change(_OldVsn, StateName, State, _Extra) -> {ok, StateName, State}.

%% ===================== src/tcp/cr_interconnect.erl =====================

-module(cr_interconnect).
-copyright('Synrc Research Center s.r.o.').
-include("cr.hrl").
-compile(export_all).
-record(state, {succ, pred, port, name, socket, module, nodes}).

sup() -> interconnect_sup.
init([Name, Mod, Socket, Nodes]) ->
    #state{name = Name, module = Mod, socket = Socket, nodes = Nodes}.
%% reply/3: send a term back over the connection, state unchanged.
reply(Socket, Result, State) ->
    gen_tcp:send(Socket, term_to_binary(Result)),
    State.

%% 2-tuples are vnode commands routed by consistent hash.
dispatch({Command, Object}, #state{socket = Socket} = State) ->
    io:format("CONS {_,_} VNODE command: ~p~n", [{Object}]),
    reply(Socket,
          gen_server:call(cr:peer(cr:hash(Object)), {Command, Object}),
          State);

%% 3-tuples are XA transaction commands addressed by transaction id.
dispatch({Command, Tx, Transaction}, #state{socket = Socket} = State) ->
    io:format("CONS {_,_,_} XA command: ~p~n", [{Transaction}]),
    reply(Socket,
          gen_server:call(element(2, Tx), {Command, Transaction}),
          State);

dispatch(_, State) -> State.

%% ===================== src/tcp/cr_ping.erl =====================

-module(cr_ping).
-copyright('Synrc Research Center s.r.o.').
-include("cr.hrl").
-compile(export_all).
-record(state, {port, name, socket, module, nodes}).

sup() -> ping_sup.

init([_Name, _Mod, Socket, Nodes]) -> #state{socket = Socket, nodes = Nodes}.

dispatch({'join', Object}, State) ->
    io:format("PING: Join request: ~p~n", [Object]),
    State;

%% Answer a heartbeat probe with {pong} (consumed by cr_heart).
dispatch({ping}, #state{socket = Socket} = State) ->
    io:format("PING: Message: ~p~n", [self()]),
    gen_tcp:send(Socket, term_to_binary({pong})),
    State;

dispatch({'leave', _Object}, State) -> State;

dispatch(_, State) -> State.

%% ===================== src/tcp/cr_tcp.erl =====================

-module(cr_tcp).
-description('prim_inet based TCP non-blocking listener').
-copyright('Synrc Research Center s.r.o.').
-behaviour(gen_server).
-include("cr.hrl").
-export(?GEN_SERVER).
-compile(export_all).
-record(state, {listener, acceptor, module, name, port, ring}).
%% Async-accept completion: hand the client socket to a fresh
%% cr_connection child, then immediately re-arm the accept.
handle_info({inet_async, ListSock, Ref, Message},
            #state{listener = ListSock, acceptor = Ref,
                   module = Module, ring = HashRing} = State) ->
    %% NOTE(review): Message may be {error,Reason}; the {ok,_} match
    %% lets the listener crash on accept errors (let-it-crash).
    {ok, CliSocket} = Message,
    set_sockopt(ListSock, CliSocket),
    {ok, Pid} = cr_connection:start_connection(Module, CliSocket, HashRing),
    gen_tcp:controlling_process(CliSocket, Pid),
    cr:set_socket(Pid, CliSocket),
    Acceptor = case prim_inet:async_accept(ListSock, -1) of
                   {ok, NewRef} -> NewRef;
                   {error, Reason} ->
                       io:format("TCP: Accept Error: ~p~n", [Reason]),
                       Reason
               end,
    {noreply, State#state{acceptor = Acceptor}};

handle_info(_Info, State) -> {noreply, State}.
terminate(_Reason, State) -> gen_tcp:close(State#state.listener), ok.
code_change(_OldVsn, State, _Extra) -> {ok, State}.
handle_call(Request, _From, State) -> {stop, {unknown_call, Request}, State}.
handle_cast(_Msg, State) -> {noreply, State}.

start_link(Name, Port, Module, HashRing) ->
    gen_server:start_link({local, Name}, ?MODULE,
                          [Name, Port, Module, HashRing], []).

%% Open the listen socket and post the first asynchronous accept.
init([Name, Port, Module, HashRing]) ->
    process_flag(trap_exit, true),
    Opts = [binary, {packet, 1}, {reuseaddr, true}, {keepalive, true},
            {backlog, 30}, {active, false}],
    case gen_tcp:listen(Port, Opts) of
        {ok, ListenSocket} ->
            {ok, Ref} = prim_inet:async_accept(ListenSocket, -1),
            {ok, #state{listener = ListenSocket,
                        acceptor = Ref,
                        ring     = HashRing,
                        module   = Module,
                        port     = Port,
                        name     = Name}};
        {error, Reason} -> {stop, Reason}
    end.
%% set_sockopt/2: register the accepted socket with inet_db and copy
%% the listener's options onto it. On a setopts failure the socket is
%% closed and the error returned; on a getopts failure the listener
%% itself is considered broken and the process exits.
set_sockopt(ListSock, CliSocket) ->
    true = inet_db:register_socket(CliSocket, inet_tcp),
    case prim_inet:getopts(ListSock, [active, nodelay, keepalive,
                                      delay_send, priority, tos]) of
        {ok, Opts} ->
            case prim_inet:setopts(CliSocket, Opts) of
                ok -> ok;
                Error ->
                    io:format("TCP OPT Socket Error ~p~n", [Error]),
                    gen_tcp:close(CliSocket),
                    Error
            end;
        Error ->
            gen_tcp:close(CliSocket),
            io:format("TCP Socket Error ~p~n", [Error]),
            exit({set_sockopt, Error})
    end.

%% ===================== sys.config =====================
[
 {riak_ensemble, [{data_root, "data"}]},
 {cr,  [{peers,[{ 'cr@127.0.0.1',9000,9001,9002},
                { 'cr2@127.0.0.1',9004,9005,9006},
                { 'cr3@127.0.0.1',9008,9009,9010}]}]},
 {kvs, [{dba,store_mnesia},
        {log_modules,cr},
        {user,[{interval,5,10,user},
               {interval,10,100,user2}]},
        {schema, [ db_config,
                   kvs_feed, kvs_user, kvs_subscription ]} ]}
].

%% ===================== vm.args =====================
+pc unicode
+K true
+A 5
-env ERL_MAX_PORTS 4096
-env ERL_FULLSWEEP_AFTER 10