├── .eqc_ci ├── EQC_CI_LICENCE.txt ├── LICENSE ├── Makefile ├── README.md ├── doc ├── 5HT.css ├── cr.htm └── images │ ├── log.svg │ ├── merging.svg │ ├── replicas.svg │ └── sup.png ├── include ├── cr.hrl ├── rafter.hrl ├── rafter_consensus_fsm.hrl └── rafter_opts.hrl ├── mad ├── otp.mk ├── rebar.config ├── src ├── backends │ └── cr_kvs.erl ├── consensus │ ├── README.md │ ├── cr_config.erl │ ├── cr_log.erl │ ├── cr_paxon.erl │ ├── cr_rafter.erl │ └── cr_replication.erl ├── cr.app.src ├── cr.erl ├── cr_app.erl ├── cr_hash.erl ├── cr_heart.erl ├── cr_vnode.erl └── tcp │ ├── cr_client.erl │ ├── cr_connection.erl │ ├── cr_interconnect.erl │ ├── cr_ping.erl │ └── cr_tcp.erl ├── sys.config └── vm.args /.eqc_ci: -------------------------------------------------------------------------------- 1 | {build, "./mad dep com pla"}. 2 | {test_path, "ebin"}. 3 | {deps, "deps"}. 4 | {test_root, "test"}. 5 | -------------------------------------------------------------------------------- /EQC_CI_LICENCE.txt: -------------------------------------------------------------------------------- 1 | This file is an agreement between Quviq AB ("Quviq"), Sven Hultins 2 | Gata 9, Gothenburg, Sweden, and the committers to the github 3 | repository in which the file appears ("the owner"). By placing this 4 | file in a github repository, the owner agrees to the terms below. 5 | 6 | The purpose of the agreement is to enable Quviq AB to provide a 7 | continuous integration service to the owner, whereby the code in the 8 | repository ("the source code") is tested using Quviq's test tools, and 9 | the test results are made available on the web. The test results 10 | include test output, generated test cases, and a copy of the source 11 | code in the repository annotated with coverage information ("the test 12 | results"). 13 | 14 | The owner agrees that Quviq may run the tests in the source code and 15 | display the test results on the web, without obligation. 
16 | 17 | The owner warrants that running the tests in the source code and 18 | displaying the test results on the web violates no laws, licences or other 19 | agreements. In the event of such a violation, the owner accepts full 20 | responsibility. 21 | 22 | The owner warrants that the source code is not malicious, and will not 23 | mount an attack on either Quviq's server or any other server--for 24 | example by taking part in a denial of service attack, or by attempting 25 | to send unsolicited emails. 26 | 27 | The owner warrants that the source code does not attempt to reverse 28 | engineer Quviq's code. 29 | 30 | Quviq reserves the right to exclude repositories that break this 31 | agreement from its continuous integration service. 32 | 33 | Any dispute arising from the use of Quviq's service will be resolved 34 | under Swedish law. 35 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2015 Maxim Sokhatsky, Synrc Research Center 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | Software may only be used for the great good and the true happiness of all sentient beings. 11 | 12 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 13 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 14 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 15 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 16 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 17 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 18 | THE SOFTWARE. 19 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | RELEASE := cr 2 | COOKIE := node_runner 3 | VER := 1.0.0 4 | 5 | NAME ?= cr 6 | 7 | default: compile 8 | 9 | include otp.mk 10 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Byzantine Chain Replication Database 2 | ==================================== 3 | 4 | [![Join the chat at https://gitter.im/spawnproc/cr](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/spawnproc/cr?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) 5 | 6 | In banking system demands are very tight. Database 7 | should be at least tripled, stand-by nodes should pick up 8 | master reads from failover node, writes should be 9 | accepted on a reasonable quorum, failover must be followed by recovery, database 10 | should be able to scale even with the RAM/DISC limitations. 11 | 12 | No data should be treated as written otherwise that commited to all replicas. 13 | All this circumstances leads us to chain replication protocol as a simple and natural 14 | feedback to this challenge. 15 | 16 | Different replication techniques exists to satisfy replication demands. 17 | Master-slave replication is most widely known type of replication 18 | used before in such products like GFS, HDFS, mongodb, etc. Quorum Intersection 19 | is another technique used in databases like Cassandra or Amazon Dynamo. 
20 | They mostly provide a consistent distributed repository 21 | for event tables or for file storage. In banking industry 22 | we synchronize account balances and need simple and managable 23 | protocol for storage consistency issuing high demand on system integrity. 24 | 25 | There are several classes of error usually implied when dealing with failure detection. 26 | The most weak class is fail-stop events, when the outage is normal or predictable. 27 | The second class is crash-failures, the ubnormal terminations and outages. The most strong 28 | type of failures are byzantine failures resistant to bit-flips, 29 | hacked parties or any types of compromising the transaction objects. 30 | For banking applications the byzantine fault tolerance is desired, 31 | despite it affects the latency. 32 | 33 | Features 34 | -------- 35 | 36 | * Highly-available CP database :-) 37 | * 2N+1 nodes tolerates N failures 38 | * Consistent hashing DHT 39 | * RAFT for managing server configurations timeline 40 | * HMAC signing for Byzantine capabilities 41 | * Various database backends: mnesia, riak, redis, fs, sql 42 | * High-performance non-blocking TCP acceptor 43 | * Separate endpoints for HEART, CLIENT and SERVER protocols 44 | * Pure, clean and understandable codebase 45 | * Article about CR implementation details: http://synrc.space/apps/cr/doc/cr.htm 46 | * Business Processing Erlang book: http://synrc.space/apps/bpe/doc/book.pdf 47 | 48 | Launch 49 | ------ 50 | 51 | ```bash 52 | make console NAME=cr 53 | make console NAME=cr2 54 | make console NAME=cr3 55 | ``` 56 | 57 | You could start all nodes in separate console sesions or you 58 | can `make start NAME=cr2` nodes and later attach to them with `make attach NAME=cr2`. 59 | Also the start is compatible within single folders, which cause no single problem. 60 | 61 | ```erlang 62 | > timer:tc(cr,test,[500]). 
63 | 64 | =INFO REPORT==== 7-Apr-2015::00:56:34 === 65 | cr:Already in Database: 14020 66 | New record will be applied: 500 67 | {214369,{transactions,11510}} 68 | ``` 69 | 70 | Fore generating sample data, let say 500 transactions you may run with `cr:test(500)`. 71 | By measuring accepring performance it's like `2000 Req/s`. 72 | 73 | ```erlang 74 | > cr:dump(). 75 | 76 | vnode i n top latency 77 | 121791803110908576516973736059690251637994378581 1 1 391 2/198/64 78 | 243583606221817153033947472119380503275988757162 2 1 400 2/183/72 79 | 365375409332725729550921208179070754913983135743 3 1 388 3/195/64 80 | 487167212443634306067894944238761006551977514324 4 1 357 2/183/53 81 | 608959015554542882584868680298451258189971892905 5 2 12994 2/198/67 82 | 730750818665451459101842416358141509827966271486 6 2 13017 3/184/66 83 | 852542621776360035618816152417831761465960650067 7 2 13019 2/201/75 84 | 974334424887268612135789888477522013103955028648 8 2 13020 3/178/62 85 | 1096126227998177188652763624537212264741949407229 9 3 13021 2/190/68 86 | 1217918031109085765169737360596902516379943785810 10 3 13028 3/206/65 87 | 1339709834219994341686711096656592768017938164391 11 3 13030 2/208/55 88 | 1461501637330902918203684832716283019655932542972 12 3 13031 2/185/58 89 | ok 90 | ``` 91 | 92 | The latency in last column `~70 ms` means the moment data is stored on all `mnesia` replicas. 93 | The latency in a given example is for storing async_dirty using KVS 94 | chain linking (from `1 to 3` msg per write operation, from `1 to 2` msg for lookups) 95 | clustered in `3 nodes` with same replicas number. 96 | 97 | Let's say we want to see all the operations log of a given replica `391`. 98 | 99 | ```erlang 100 | > cr:dump(391). 
101 | operation id prev i size 102 | transaction:389:feed::false: 391 387 1 480 103 | transaction:399:feed::false: 387 382 1 500 104 | transaction:375:feed::false: 382 379 1 446 105 | transaction:373:feed::false: 379 378 1 446 106 | transaction:383:feed::false: 378 376 1 473 107 | transaction:392:feed::false: 376 374 1 500 108 | transaction:360:feed::false: 374 371 1 446 109 | transaction:366:feed::false: 371 370 1 473 110 | transaction:370:feed::false: 370 369 1 446 111 | transaction:371:feed::false: 369 368 1 446 112 | ok 113 | ``` 114 | 115 | You may check this from the other side. First retrieve the operation and then 116 | retrieve the transaction created during operation. 117 | 118 | ```erlang 119 | > kvs:get(operation,391). 120 | {ok,#operation{id = 391,version = undefined,container = log, 121 | feed_id = {121791803110908576516973736059690251637994378581,1}, % VNODE 122 | prev = 387,next = undefined,feeds = [],guard = false, 123 | etc = undefined, 124 | body = {prepare,{<0.41.0>,{1428,358105,840469}}, 125 | [{121791803110908576516973736059690251637994378581,1}, % SIGNATURES 126 | {608959015554542882584868680298451258189971892905,2}], 127 | #transaction{id = 389,version = undefined,container = feed, 128 | feed_id = undefined,prev = undefined,next = undefined, 129 | feeds = [],guard = false,etc = undefined, 130 | timestamp = undefined,beneficiary = undefined,...}}, 131 | name = prepare,status = pending}} 132 | ``` 133 | 134 | The transaction. For linking transaction to the link you should use full XA 135 | protocol with two-stage confirmation (1) the PUT operation followed 136 | with (2) LINK operation to some feed, such as user account or customer admin list. 137 | 138 | ```erlang 139 | > kvs:get(transaction,389). 
140 | {ok,#transaction{id = 389,version = undefined, 141 | container = feed, feed_id = undefined, prev = undefined, 142 | next = undefined, feeds = [], guard = false, etc = undefined, 143 | timestamp = [], beneficiary = [], 144 | subsidiary = [], amount = [],tax = [], 145 | ballance = [], currency = [], 146 | description = [], info = [], 147 | prevdate = [], rate = [], item = []}} 148 | ``` 149 | 150 | The actiual Erlang business logic, banking transaction from `db` schema 151 | application is stored under 389 id. So you can easlity grab it unlinked 152 | as it was stored as atomic PUT. 153 | 154 | Licenses 155 | -------- 156 | 157 | * consensus protols 1) raft and 2) paxos are distributed under the terms of Apache 2.0 http://www.apache.org/licenses/LICENSE-2.0.html 158 | * cr itself is distributed under the DHARMA license: http://5ht.co/license.htm 159 | 160 | Credits 161 | ------- 162 | 163 | Copyright (c) 2015 Synrc Research Center s.r.o. 164 | 165 | * Maxim Sokhatsky 166 | * Vladimir Kirillov 167 | * Sergey Klimenko 168 | * Valery Meleshkin 169 | * Victor Sovietov 170 | 171 | OM A HUM 172 | -------------------------------------------------------------------------------- /doc/5HT.css: -------------------------------------------------------------------------------- 1 | pre { padding:4px;white-space:pre;background-color:#F1F1F1;font-family:monospace;font-size:14pt;} 2 | code { padding:4px;white-space:pre;font-family:monospace;font-size:14pt;} 3 | body { font-family: local; font-size: 16pt; color: #888; } 4 | h1 { font-size: 34pt; } 5 | h2 { font-size: 24pt; margin-top: 50px; } 6 | h3 { margin-top: 40px; } 7 | h4 { margin-top: 40px; } 8 | h5 { margin-top: -20px; } 9 | p { margin-top: 10px; } 10 | .note { margin-top: 0px; } 11 | .note p { margin-top: 20px; } 12 | .menu { text-align: right;} 13 | a { margin-top: 10px; padding: 10px; } 14 | .app { margin:100px auto;min-width:300px;max-width:800px; } 15 | .message { align: center; } 16 | .note { 
margin-left:0px;margin-top:0px;background-color:#F1F1F1;padding:4px 10px 4px 24px;color:gray;} 17 | ul {margin-left:70px;} 18 | 19 | a { color: blue; text-decoration: none } 20 | a:hover { color:blue; } 21 | a:hover, a:active { outline: 0 } 22 | 23 | @font-face { 24 | font-family: 'local'; 25 | src: url('Geometria-Light.otf'); 26 | font-weight: normal; 27 | font-style: normal 28 | } 29 | -------------------------------------------------------------------------------- /doc/cr.htm: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | CR 9 | 10 | 11 | 12 |
13 | 14 |
FROM: 5HT
15 | TO: PUB
16 | DATE: 4 APR 2015
17 | 18 |

Chain Replication Database

19 | 20 |
21 | 22 |

In banking system demands are very tight. Database 23 | should be at least tripled, stand-by nodes should pick up 24 | master reads from failover node, writes should be 25 | accepted on a reasonable quorum, failover must be followed by recovery, database 26 | should be able to scale even with the RAM/DISC limitations.

27 | 28 |

No data should be treated as written otherwise than committed to all replicas. 29 | All these circumstances lead us to chain replication protocol as a simple and natural 30 | feedback to this challenge.

31 | 32 |

Different replication techniques exist to satisfy replication demands. 33 | Master-slave replication is most widely known type of replication 34 | used before in such products like GFS, HDFS, mongodb, etc. Quorum Intersection 35 | is another technique used in databases like Cassandra or Amazon Dynamo. 36 | They mostly provide a consistent distributed repository 37 | for event tables or for file storage. In banking industry 38 | we synchronize account balances and need simple and manageable 39 | protocol for storage consistency issuing high demand on system integrity. 

41 | 42 |

There are several classes of error usually implied when dealing with failure detection. 43 | The weakest class is fail-stop events, when the outage is normal or predictable. 44 | The second class is crash-failures, the abnormal terminations and outages. The strongest 45 | type of failures are byzantine failures resistant to bit-flips, 46 | hacked parties or any types of compromising the transaction objects. 47 | For banking applications the byzantine fault tolerance is desired, 48 | despite it affects the latency. However we will show that CR latency 49 | is acceptable even in comparison with web applications.

50 | 51 |

Features

52 | 53 |

54 |

64 | 65 |

Consistent Hash Ring

66 | 67 |

Building a consistent hash ring is a key feature 68 | that opens a door to the distributed system. 69 | CR is using only five functions to model the DHT ring. 70 | Ring provides a desirable probability in series 71 | of nines of working event condition.

72 | 73 |

74 |

75 |
 76 |  > cr:ring().
 77 | 
 78 | {12,
 79 |  [{0,0},
 80 |   {121791803110908576516973736059690251637994378581,1},
 81 |   {243583606221817153033947472119380503275988757162,1},
 82 |   {365375409332725729550921208179070754913983135743,1},
 83 |   {487167212443634306067894944238761006551977514324,1},
 84 |   {608959015554542882584868680298451258189971892905,2},
 85 |   {730750818665451459101842416358141509827966271486,2},
 86 |   {852542621776360035618816152417831761465960650067,2},
 87 |   {974334424887268612135789888477522013103955028648,2},
 88 |   {1096126227998177188652763624537212264741949407229,3},
 89 |   {1217918031109085765169737360596902516379943785810,3},
 90 |   {1339709834219994341686711096656592768017938164391,3},
 91 |   {1461501637330902918203684832716283019655932542972,3}]}
 92 | 
93 |

94 | 95 |

The ring or configuration is partitioned by shards or peers.

96 | 97 |

98 |

99 |
100 | > cr:peers().
101 | 
102 | [{'cr1@127.0.0.1',9000,9001,9002},
103 |  {'cr2@127.0.0.1',9004,9005,9006},
104 |  {'cr3@127.0.0.1',9008,9009,9010}]
105 | 
106 |

107 | 108 |

Each peer is running several replica protocol vnodes. Each vnode is a 109 | replica process that serves a specific key-range.

110 | 111 |

112 |

113 |
114 | > cr:local().
115 | 
116 | [{487167212443634306067894944238761006551977514324,<0.200.0>},
117 |  {365375409332725729550921208179070754913983135743,<0.199.0>},
118 |  {243583606221817153033947472119380503275988757162,<0.198.0>},
119 |  {121791803110908576516973736059690251637994378581,<0.197.0>}]
120 | 
121 |

122 | 123 |
124 |
125 | > cr:chain(foo).
126 | 
127 | [{1461501637330902918203684832716283019655932542972,3},
128 |  {487167212443634306067894944238761006551977514324,1},
129 |  {974334424887268612135789888477522013103955028648,2}]
130 | 131 |

Chain Replication Protocol

132 | 133 |

Command 134 | 135 |

136 |
137 |

Command is an atomic event that can be performed 138 | in single process context at a single machine.

139 | 140 |

CR provides extensible set of possible commands:

141 | 142 |

143 |

148 | 149 |

This set of commands refers to KVS the database framework for 150 | storing the doubly linked lists (it can be called chains/feeds/sequences) 151 | using the two basic record types: #container, who store the top of a chain along 152 | with chain aggregation counters; and #iterator, who provides next and prev 153 | fields for traversal.

154 | 155 |

Distributed Transaction 156 | 157 |

158 |
159 |

All replicas are sequenced into the chains. Transaction is a 160 | command performing forward over the ordered chain of replicas. This chain 161 | is called configuration. All writes come to the chain's head, 162 | all reads come to chain's tail.

163 | 164 |

Picture 1. Chain

165 | 166 |

Replication Log 167 | 168 |

169 |
170 |

During transaction, the command is saved in replication 171 | log on each replica of the transaction. This log is append-only 172 | disk structure and is also called this history of replica's operations.

173 | 174 |

The replication log is also uses KVS as underlying storage. 175 | As a replication log container it uses #log type and command is stored 176 | as #operation record. Each replica has its own log.

177 | 178 |

Picture 2. Log

179 | 180 |

Replica Protocol

181 | 182 |

Some assumptions are implied during protocol description.

183 | 184 |
  • 1) each peer has at least one non-faulty vnode;
  • 185 |
  • 2) ring is tracked by external consensus or
  • 186 |
  • 3) ring has at least one peer with no faulty vnodes.
  • 187 |
188 | 189 |

#operation [Vnode,Chain,Operation] — Any active replica Vnode in 190 | configuration Chain can issue an operation command only if each 191 | preceding replica in Chain, if any, has done likewise and there 192 | is no conflicting operation for s in its history. Vnode also adds 193 | a new order proof to its history.

194 | 195 |

#suspend [Vnode] — An active replica Vnode can 196 | suspend updating its history by becoming immutable at any time. 197 | Only heart monitor can issue a becomeImmutable message. 198 | The replica signs a wedged statement to notify heart monitor 199 | that it is immutable and what its history is.

200 | 201 |

#resume [Vnode,Configuration,History] — A pending 202 | replica Vnode in Configuration can resume handling operations 203 | if the Heart Monitor has synchronized the history between 204 | nodes to the greatest common prefix log.

205 | 206 |

Failures

207 | 208 |

Configuration Tracking 209 | 210 |

211 |
212 |

The configuration is a dynamic property of transaction. 213 | During transaction it may change due to byzantine failures, 214 | leading us to reconfigure the replicas in a chain. The another consistent 215 | system is needed to track the dynamic configurations.

216 | 217 |

To make the shard highly available, we use replication 218 | and dynamically change the configuration of replicas 219 | in order to deal with crash failures and unresponsiveness. 220 | Each machine in a cluster has single append-only configuration 221 | log which is not based on KVS due to latency requirements. 222 | Configuration log is a binary file written by RAFT protocol commands. 223 | There are only two commands which could be performed over the configuration log:

224 | 225 |

226 |

230 | 231 |

Heart Monitor Protocol

232 | 233 |

#reconfig [Node,Configuration,NewConfiguration] — 234 | The heart monitor waits for a set of valid histories from 235 | a quorum of replicas in current configuration. 236 | A valid history contains at most one record per operation. 237 | The oracle then issues an #resume message for all nodes in NewConfiguration 238 | with the log position of maximal common prefix (last replica in previous Configuration). 239 | The heart monitor can issue at most one #resume message per Configuration generation.

240 | 241 |

#ping — Round-Robin ping over nodes of Configuration. In initial 242 | configuration all nodes are active or resumed.

243 | 244 |

Safety

245 | 246 |

Stable Operation Log 247 | 248 |

The equation specifies what operations O are safe, when all its replicas are committed, 249 | but not when or in what order to do them. 250 | In other words, the system is asynchronous. In this formula we call stable 251 | operation log having operations committed on all replicas.

252 | 253 | 254 | Stable = [ R || R <- replicas(O), 255 | status(R) == commited, 256 | length(R) == N ] 257 |

258 |

259 |
260 |

NOTE: due to asynchronous nature of transaction service the operations 261 | log will be always unordered. As on Picture 3 it should GCP = 2.

262 | 263 |

Picture 3. Greatest common prefix

264 |

265 | 266 |

Liveness

267 | 268 |

There is always eventually a configuration in which all replicas 269 | are correct and do not become suspended. Failure detection of liveness 270 | is tracked by Heart Monitor which pings each node and reconfigures the 271 | nodes for synchronizing the configuration consensus log.

272 | 273 | 274 |

OTP protocol

275 | 276 |

Some types are embedded in L core to resolve main tasks during 277 | type inference, type unification and pattern matching compilation. 278 | L has following basic types which are used by infer/unify/match core. 279 | These types are also shared with Type Inspector.

280 | 281 |

INTERCONNECT

    282 |
  • transaction
  • 283 |
  • get
  • 284 |
  • sync
  • 285 |

286 |

PING

    287 |
  • ping
  • 288 |
  • join
  • 289 |
  • leave
  • 290 |

291 | 292 |

Implementation

293 | 294 |

The chain replication protocol is implemented as Erlang/OTP application cr 295 | that could be embedded in any toplevel application. We use one supervision 296 | tree and gen_server per one TCP endpoint along with separate 297 | vnode_sup supervision for VNODE transactional contexts per hashring vnode.

298 | 299 |

The Chain Replication Database application is built using Synrc Application Stack. 300 | Among them we have fs native file-system listener, sh shell executor 301 | for running external commands, powerful mad rebar replacement which is 302 | able to pack application inside single-file bundle. During development we 303 | also use otp.mk and active file reloader that uses native 304 | filesystem event on each platform. The database itself built using 305 | kvs with mnesia backend and db banking schema as example.

306 | 307 |

308 |
309 | > application:which_applications().
310 | 
311 | [{cr,"Chain Replication","0.1"},
312 |  {sh,"VXZ SH Executor","0.9"},
313 |  {mad,"MAD VXZ Build Tool","2.2"},
314 |  {db,"Bank Database","1"},
315 |  {active,"ACT VXZ Continuous Compilation","0.9"},
316 |  {kvs,"KVS Abstract Term Database","1"},
317 |  {mnesia,"MNESIA  CXC 138 12","4.12.3"},
318 |  {fs,"VXZ FS Listener","0.9.1"},
319 |  {stdlib,"ERTS  CXC 138 10","2.2"},
320 |  {kernel,"ERTS  CXC 138 10","3.0.3"}]
321 | 
322 |
323 |

324 | 325 |

Supervision tree of chain replication supervisor:

326 | 327 |

Picture 4. Supervision

328 | 329 |

330 |

331 |
332 | > cr:sup().
333 | 
334 | [{vnode_sup,<0.52.0>},
335 |  {client_sup,<0.51.0>},
336 |  {client,<0.50.0>},
337 |  {ping_sup,<0.289.0>},
338 |  {ping,<0.48.0>},
339 |  {interconnect_sup,<0.47.0>},
340 |  {interconnect,<0.46.0>}]
341 | 
342 |

343 | 344 |

For benchmarking database please populate it with data but without 345 | overloading the database:

346 | 347 |
348 |
349 |     [
350 |       begin
351 |           cr:test(500),
352 |           timer:sleep(1000)
353 |       end
354 |           || ___ <- lists:seq(1,10)
355 |     ].
356 | 
357 | > cr:dump().
358 | 
359 |
360 |                                                vnode   i  n        top      log        latency
361 |     121791803110908576516973736059690251637994378581   1  1       6506     1607       1/315/97
362 |     243583606221817153033947472119380503275988757162   2  1       6508     1662      1/317/100
363 |     365375409332725729550921208179070754913983135743   3  1       6510     1658      2/317/105
364 |     487167212443634306067894944238761006551977514324   4  1       6505     1583      1/317/104
365 |     608959015554542882584868680298451258189971892905   5  2       6499     1637      3/317/115
366 |     730750818665451459101842416358141509827966271486   6  2       6510     1664      2/318/117
367 |     852542621776360035618816152417831761465960650067   7  2       6501     1634      2/311/115
368 |     974334424887268612135789888477522013103955028648   8  2       6500     1575       3/290/96
369 |    1096126227998177188652763624537212264741949407229   9  3       6497     1607      3/316/118
370 |    1217918031109085765169737360596902516379943785810  10  3       6510     1662      3/318/117
371 |    1339709834219994341686711096656592768017938164391  11  3       6496     1658      3/311/106
372 |    1461501637330902918203684832716283019655932542972  12  3       6505     1583      2/295/104
373 | 
374 | 375 |

Literature

376 | 377 |  [1]. Hussam Abu-Libdeh, Robbert van Renesse, Ymir Vigfusson.
378 | 379 |      Leveraging Sharding in the Design of Scalable Replication Protocols

380 | 381 | [2]. Robbert van Renesse, Chi Ho, Nicolas Schiper.
382 | 383 |      Byzantine Chain Replication

384 | 385 | [3]. Robbert van Renesse, Nicolas Schiper.
386 | 387 |      Chain Replication for 388 | Supporting High Throughput and Availability 389 | 390 |

Credits

391 | 392 |

393 |

400 | 401 |

402 |
2015 © Synrc Research Center, s.r.o.
403 | 404 |
405 |
406 | 407 | 408 | -------------------------------------------------------------------------------- /doc/images/log.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | log 5 | Created with Sketch. 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | #transaction 27 | 28 | 29 | #client 30 | 31 | 32 | #account 33 | 34 | 35 | #log 36 | 37 | 38 | <- live data 39 | 40 | 41 | <- failover 42 | 43 | 44 | <- epoch 45 | 46 | 47 | <- log head 48 | 49 | 50 | 51 | -------------------------------------------------------------------------------- /doc/images/merging.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | merging 5 | Created with Sketch. 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | Replica A 24 | 25 | 26 | Operations 27 | 28 | 29 | 1 30 | 31 | 32 | 2 33 | 34 | 35 | 3 36 | 37 | 38 | 4 39 | 40 | 41 | Replica B 42 | 43 | 44 | Replica C 45 | 46 | 47 | 48 | GCP 49 | 50 | 51 | 52 | -------------------------------------------------------------------------------- /doc/images/replicas.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | replicas 5 | Created with Sketch. 
6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | write -> 16 | 17 | 18 | <- read 19 | 20 | 21 | <- TX context 22 | 23 | 24 | <- head 25 | 26 | 27 | 28 | -------------------------------------------------------------------------------- /doc/images/sup.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/synrc/cr/b4a30dc55d30500a1c239d6234444e1ecff5aab5/doc/images/sup.png -------------------------------------------------------------------------------- /include/cr.hrl: -------------------------------------------------------------------------------- 1 | -define(GEN_SERVER, [init/1, handle_call/3, handle_cast/2, handle_info/2, terminate/2, code_change/3]). 2 | -define(GEN_FSM,[init/1, handle_event/3, handle_sync_event/4, handle_info/3, terminate/3, code_change/4]). 3 | 4 | -include_lib("kvs/include/kvs.hrl"). 5 | 6 | -type mode() :: active | pending | immutable | sync. 7 | 8 | -record(ens, {eseq,key,val}). 9 | 10 | 11 | -------------------------------------------------------------------------------- /include/rafter.hrl: -------------------------------------------------------------------------------- 1 | -type peer() :: atom() | {atom(), atom()}. 2 | 3 | %% Transport Independent MESSAGES 4 | -record(request_vote, { 5 | term :: non_neg_integer(), 6 | from :: atom(), 7 | last_log_index :: non_neg_integer(), 8 | last_log_term :: non_neg_integer()}). 9 | 10 | -record(vote, { 11 | from :: atom(), 12 | term :: non_neg_integer(), 13 | success :: boolean()}). 14 | 15 | -record(append_entries, { 16 | term :: non_neg_integer(), 17 | from :: atom(), 18 | prev_log_index :: non_neg_integer(), 19 | prev_log_term :: non_neg_integer(), 20 | entries :: term(), 21 | commit_index :: non_neg_integer(), 22 | 23 | %% This is used during read-only operations 24 | send_clock :: non_neg_integer()}). 
25 | 26 | -record(append_entries_rpy, { 27 | from :: atom(), 28 | term :: non_neg_integer(), 29 | 30 | %% This field isn't in the raft paper. However, for this implementation 31 | %% it prevents duplicate responses from causing recommits and helps 32 | %% maintain safety. In the raft reference implementation (logcabin) 33 | %% they cancel the in flight RPC's instead. That's difficult 34 | %% to do correctly(without races) in erlang with asynchronous 35 | %% messaging and mailboxes. 36 | index :: non_neg_integer(), 37 | 38 | %% This is used during read-only operations 39 | send_clock :: non_neg_integer(), 40 | 41 | success :: boolean()}). 42 | 43 | -record(rafter_entry, { 44 | type :: noop | config | op, 45 | term :: non_neg_integer(), 46 | index :: non_neg_integer(), 47 | cmd :: term()}). 48 | 49 | -record(meta, { 50 | voted_for :: peer(), 51 | term = 0 :: non_neg_integer()}). 52 | 53 | -record(config, { 54 | state = blank :: 55 | %% The configuration specifies no servers. Servers that are new to the 56 | %% cluster and have empty logs start in this state. 57 | blank | 58 | %% The configuration specifies a single list of servers: a quorum 59 | %% requires any majority of oldservers. 60 | stable | 61 | %% The configuration specifies two lists of servers: a quorum requires 62 | %% any majority of oldservers, but the newservers also receive log entries. 63 | staging | 64 | %% The configuration specifies two lists of servers: a quorum requires 65 | %% any majority of oldservers and any majority of the newservers. 66 | transitional, 67 | 68 | oldservers = [] :: list(), 69 | newservers = [] :: list() 70 | }). 
71 | 72 | -------------------------------------------------------------------------------- /include/rafter_consensus_fsm.hrl: -------------------------------------------------------------------------------- 1 | -record(client_req, { 2 | id :: binary(), 3 | timer :: timer:tref(), 4 | from :: term(), 5 | index :: non_neg_integer(), 6 | term :: non_neg_integer(), 7 | 8 | %% only used during read_only commands 9 | cmd :: term()}). 10 | 11 | -record(state, { 12 | leader :: term(), 13 | term = 0 :: non_neg_integer(), 14 | voted_for :: term(), 15 | commit_index = 0 :: non_neg_integer(), 16 | init_config :: undefined | list() | complete | no_client, 17 | 18 | %% Used for Election and Heartbeat timeouts 19 | timer :: reference(), 20 | 21 | %% leader state: contains nextIndex for each peer 22 | followers, 23 | 24 | %% Dict keyed by peer id. 25 | %% contains true as val when candidate 26 | %% contains match_indexes as val when leader 27 | responses, 28 | 29 | %% Logical clock to allow read linearizability 30 | %% Reset to 0 on leader election. 31 | send_clock = 0 :: non_neg_integer(), 32 | 33 | %% Keep track of the highest send_clock received from each peer 34 | %% Reset on leader election 35 | send_clock_responses, 36 | 37 | %% Outstanding Client Write Requests 38 | client_reqs = [] :: [#client_req{}], 39 | 40 | %% Outstanding Client Read Requests 41 | %% Keyed on send_clock, Val = [#client_req{}] 42 | read_reqs, 43 | 44 | %% All servers making up the ensemble 45 | me :: string(), 46 | 47 | config :: term(), 48 | 49 | %% We allow pluggable backend state machine modules. 50 | state_machine :: atom(), 51 | backend_state :: term()}). 52 | -------------------------------------------------------------------------------- /include/rafter_opts.hrl: -------------------------------------------------------------------------------- 1 | -record(rafter_opts, {state_machine = cr_rafterback, 2 | cluster, 3 | logdir = "data"}). 
4 | -------------------------------------------------------------------------------- /mad: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/synrc/cr/b4a30dc55d30500a1c239d6234444e1ecff5aab5/mad -------------------------------------------------------------------------------- /otp.mk: -------------------------------------------------------------------------------- 1 | ifeq ($(OS),Windows_NT) 2 | SEPARATOR=; 3 | else 4 | SEPARATOR=: 5 | endif 6 | 7 | MAD := ./mad 8 | VM := vm.args 9 | SYS := sys.config 10 | PLT_NAME := ~/.n2o_dialyzer.plt 11 | ERL_ARGS := -args_file $(VM) -config $(SYS) -setcookie $(COOKIE) -name $(NAME)@127.0.0.1 12 | RUN_DIR := data/$(NAME)/log 13 | LOG_DIR := data/$(NAME)/log 14 | empty := 15 | ROOTS := . deps 16 | space := $(empty) $(empty) 17 | comma := $(empty),$(empty) 18 | VSN := $(shell git rev-parse HEAD | head -c 6) 19 | DATE := $(shell date "+%Y%m%d-%H%M%S") 20 | ERL_LIBS := $(subst $(space),$(SEPARATOR),$(ROOTS)) 21 | relx := "{release,{$(RELEASE),\"$(VER)\"},[$(RELEASE)]}.\\n{include_erts,true}.\ 22 | \\n{extended_start_script,true}.\\n{generate_start_script,true}.\\n{sys_config,\"$(SYS)\"}.\ 23 | \\n{vm_args,\"$(VM)\"}.\\n{overlay,[{mkdir,\"log/sasl\"}]}." 
24 | 25 | test: eunit ct 26 | deps up: 27 | $(MAD) $@ 28 | compile: deps 29 | $(MAD) compile skip_deps=true 30 | clean: 31 | rm -f .applist 32 | $(MAD) $@ 33 | .applist: compile 34 | $(MAD) plan 35 | $(RUN_DIR) $(LOG_DIR): 36 | mkdir -p $(RUN_DIR) & mkdir -p $(LOG_DIR) 37 | console: .applist 38 | mkdir -p data 39 | ERL_LIBS="$(ERL_LIBS)" erl $(ERL_ARGS) -eval '[application:start(A) || A <- $(shell cat .applist)]' 40 | start: $(RUN_DIR) $(LOG_DIR) .applist 41 | RUN_ERL_LOG_GENERATIONS=1000 RUN_ERL_LOG_MAXSIZE=20000000 \ 42 | ERL_LIBS=$(ERL_LIBS) run_erl -daemon $(RUN_DIR)/ $(LOG_DIR)/ "exec $(MAKE) console" 43 | attach: 44 | to_erl $(RUN_DIR)/ 45 | release: 46 | echo $(relx) > relx.config && relx 47 | stop: 48 | @kill -9 $(shell ps ax -o pid= -o command=|grep $(RELEASE)|grep $(COOKIE)|awk '{print $$1}') 49 | $(PLT_NAME): 50 | $(eval APPS := $(subst deps/,,$(subst apps/,,$(shell find apps deps -maxdepth 1 -mindepth 1 -type d)))) 51 | ERL_LIBS=$(ERL_LIBS) dialyzer --build_plt --output_plt $(PLT_NAME) --apps $(APPS) || true 52 | dialyze: $(PLT_NAME) compile 53 | $(eval APPS := $(shell find apps deps -maxdepth 1 -mindepth 1 -type d)) 54 | @$(foreach var,$(APPS),(echo "Process $(var)"; dialyzer -q $(var)/ebin --plt $(PLT_NAME) --no_native -Werror_handling -Wunderspecs -Wrace_conditions -Wno_undefined_callbacks);) 55 | tar: release 56 | tar zcvf $(RELEASE)-$(VSN)-$(DATE).tar.gz _rel/lib/*/ebin _rel/lib/*/priv _rel/bin _rel/releases 57 | eunit: 58 | rebar eunit skip_deps=true 59 | ct: 60 | rebar ct skip_deps=true verbose=1 61 | 62 | .PHONY: deps up compile clean console start attach release update-deps dialyze ct eunit tar 63 | -------------------------------------------------------------------------------- /rebar.config: -------------------------------------------------------------------------------- 1 | {deps,[ 2 | {kvs, ".*", {git, "git://github.com/synrc/kvs", {tag,"3.4"}}}, 3 | {db, ".*", {git, "git://github.com/spawnproc/db", "HEAD"}}, 4 | {active, ".*", {git, 
"git://github.com/synrc/active", "HEAD"}} 5 | ]}. 6 | -------------------------------------------------------------------------------- /src/backends/cr_kvs.erl: -------------------------------------------------------------------------------- 1 | -module(cr_kvs). 2 | -copyright('Maxim Sokhatsky'). 3 | -include("cr.hrl"). 4 | -compile(export_all). 5 | 6 | dispatch({prepare,_,_,Tx}, {state,Name,_,_,_}) -> 7 | % io:format("KVS PUT ~p:~p~n",[element(1,Tx),element(2,Tx)]), 8 | kvs:put(Tx); 9 | 10 | dispatch({commit,_,_,Tx}, {state,Name,_,_,_}) -> 11 | % io:format("KVS LINK ~p:~p~n",[element(1,Tx),element(2,Tx)]), 12 | kvs:link(Tx); 13 | 14 | dispatch({rollback,_,_,Tx}, {state,Name,_,_,_}) -> 15 | % io:format("KVS REMOVE ~p:~p~n",[element(1,Tx),element(2,Tx)]), 16 | kvs:remove(Tx); 17 | 18 | dispatch(_,_) -> ok. 19 | -------------------------------------------------------------------------------- /src/consensus/README.md: -------------------------------------------------------------------------------- 1 | Consensus Modules 2 | ================= 3 | 4 | The original ideas is to have replacable consunsus modules: 5 | 6 | * PAXOS (cr_paxon) 7 | * RAFT (cr_rafter) 8 | -------------------------------------------------------------------------------- /src/consensus/cr_config.erl: -------------------------------------------------------------------------------- 1 | -module(cr_config). 2 | -compile(export_all). 3 | -include("rafter.hrl"). 
4 | %% quorum_max/3: the highest value (e.g. match index) that a quorum of the configuration's voters has reached; 0 for a blank config. 5 | quorum_max(_Me, #config{state=blank}, _) -> 0; 6 | quorum_max(Me, #config{state=stable, newservers=OldServers}, Responses) -> quorum_max(Me, OldServers, Responses); %% NOTE(review): reads newservers although rafter.hrl documents stable quorums over oldservers -- confirm this fork's intent 7 | quorum_max(Me, #config{state=staging, newservers=OldServers}, Responses) -> quorum_max(Me, OldServers, Responses); %% NOTE(review): same newservers choice for staging; upstream rafter consults oldservers here 8 | quorum_max(Me, #config{state=transitional, oldservers=Old, newservers=New}, Responses) -> min(quorum_max(Me, Old, Responses), quorum_max(Me, New, Responses)); 9 | %% List form: median-style pick -- the largest value that a majority of Servers (Me's own copy included via sorted_values/3) has reached. 10 | quorum_max(_, [], _) -> 0; 11 | quorum_max(Me, Servers, Responses) when (length(Servers) rem 2) =:= 0-> 12 | Values = sorted_values(Me, Servers, Responses), 13 | lists:nth(length(Values) div 2, Values); 14 | quorum_max(Me, Servers, Responses) -> 15 | Values = sorted_values(Me, Servers, Responses), 16 | lists:nth(length(Values) div 2 + 1, Values). 17 | %% quorum/3: true once a majority of the relevant server list has answered true in Responses (a dict); Me counts as an implicit yes vote when it is a member. 18 | quorum(_Me, #config{state=blank}, _Responses) -> false; 19 | quorum(Me, #config{state=stable,newservers=Servers}, Responses) -> quorum(Me, Servers, Responses); 20 | quorum(Me, #config{state=staging,newservers=Servers}, Responses) -> quorum(Me, Servers, Responses); 21 | quorum(Me, #config{state=transitional,oldservers=Old, newservers=New}, Responses) -> quorum(Me, Old, Responses) andalso quorum(Me, New, Responses); 22 | quorum(Me, Servers, Responses) -> 23 | TrueResponses = [R || {Peer, R} <- dict:to_list(Responses), 24 | R =:= true, 25 | lists:member(Peer, Servers)], 26 | case lists:member(Me, Servers) of 27 | true -> length(TrueResponses) + 1 > length(Servers)/2; 28 | false -> length(TrueResponses) > length(Servers)/2 end. 29 | %% voters/1,2: the set of peers with a vote; voters/2 additionally excludes Me itself. 30 | voters(Me, Config) -> lists:delete(Me, voters(Config)). 31 | voters(#config{oldservers=Old, newservers=New}) -> sets:to_list(sets:from_list(Old ++ New)); %% NOTE(review): this head matches every #config{} (records always carry both fields), so the clause below is unreachable; upstream rafter restricts this head to state=transitional 32 | voters(#config{newservers=Old}) -> Old.
33 | 34 | has_vote(_Me, #config{state=blank}) -> false; 35 | has_vote(Me, #config{oldservers=Old, newservers=New})-> lists:member(Me, Old) orelse lists:member(Me, New); 36 | has_vote(Me, #config{newservers=Old}) -> lists:member(Me, Old). 37 | 38 | followers(Me, #config{oldservers=Old, newservers=New}) -> lists:delete(Me, sets:to_list(sets:from_list(Old ++ New))); 39 | followers(Me, #config{newservers=Old}) -> lists:delete(Me, Old). 40 | 41 | reconfig(#config{state=Blank,newservers=OldNew}=Config, Servers) -> 42 | Config#config{state=stable,oldservers=OldNew, newservers=Servers}. 43 | 44 | allow_config(#config{state=blank}, _NewServers) -> true; 45 | allow_config(#config{newservers=OldServers}, NewServers) when NewServers =/= OldServers -> true; 46 | allow_config(_Config, _NewServers) -> {error, config_not_allowed}. 47 | 48 | sorted_values(Me, Servers, Responses) -> 49 | Vals = lists:sort(lists:map(fun(S) -> value(S, Responses) end, Servers)), 50 | case lists:member(Me, Servers) of 51 | true -> [_ | T] = Vals, lists:reverse([lists:max(Vals) | lists:reverse(T)]); 52 | false -> Vals end. 53 | 54 | value(Peer, Responses) -> 55 | case dict:find(Peer, Responses) of 56 | {ok, Value} -> Value; 57 | error -> 0 end. 58 | -------------------------------------------------------------------------------- /src/consensus/cr_log.erl: -------------------------------------------------------------------------------- 1 | -module(cr_log). 2 | -description('RAFT protocol replication log'). 3 | -compile(export_all). 4 | -behaviour(gen_server). 5 | -include_lib("kernel/include/file.hrl"). 6 | -include("cr.hrl"). 7 | -include("rafter.hrl"). 8 | -include("rafter_opts.hrl"). 9 | -export(?GEN_SERVER). 10 | 11 | show() -> show(cr:node()). 12 | show(Node) -> 13 | [ {I,element(2,cr_log:get_entry(Node,I))} || I <- lists:seq(1,cr_log:get_last_index(Node)) ]. 
14 | 15 | %%============================================================================= 16 | %% Logfile Structure 17 | %%============================================================================= 18 | %% @doc A log is made up of a file header and entries. The header contains file 19 | %% metadata and is written once at file creation. Each entry is a binary 20 | %% of arbitrary size containing header information and is followed by a trailer. 21 | %% The formats of the file header and entries are described below. 22 | %% 23 | %% File Header Format 24 | %% ----------------------- 25 | %% <> 26 | %% 27 | %% Entry Format 28 | %% ---------------- 29 | %% <> 30 | %% 31 | %% Sha1 - hash of the rest of the entry, 32 | %% Type - ?CONFIG | ?OP 33 | %% Term - The term of the entry 34 | %% Index - The log index of the entry 35 | %% DataSize - The size of Data in bytes 36 | %% Data - Data encoded with term_to_binary/1 37 | %% 38 | %% After each log entry a trailer is written. The trailer is used for 39 | %% detecting incomplete/corrupted writes, pointing to the latest config and 40 | %% traversing the log file backwards. 41 | %% 42 | %% Trailer Format 43 | %% ---------------- 44 | %% <> 45 | %% 46 | %% Crc - checksum, computed with erlang:crc32/1, of the rest of the trailer 47 | %% ConfigStart - file location of last seen config, 48 | %% EntryStart - file location of the start of this entry 49 | %% ?MAGIC - magic number marking the end of the trailer. 50 | %% A fully consistent log should always have 51 | %% the following magic number as the last 8 bytes: 52 | %% <<"\xFE\xED\xFE\xED\xFE\xED\xFE\xED">> 53 | %% 54 | 55 | -define(MAX_HINTS, 1000). 56 | 57 | -type index() :: non_neg_integer(). 58 | -type offset() :: non_neg_integer(). 
59 | 60 | -record(state, { 61 | logfile :: file:io_device(), 62 | version :: non_neg_integer(), 63 | meta_filename :: string(), 64 | write_location = 0 :: non_neg_integer(), 65 | config :: #config{}, 66 | config_loc :: offset(), 67 | meta :: #meta{}, 68 | last_entry :: #rafter_entry{}, 69 | index = 0 :: index(), 70 | hints :: ets:tid(), 71 | hint_prunes = 0 :: non_neg_integer(), 72 | 73 | %% frequency of number of entries scanned in get_entry/2 calls 74 | seek_counts = dict:new()}). 75 | 76 | -define(MAGIC, <<"\xFE\xED\xFE\xED\xFE\xED\xFE\xED">>). 77 | -define(MAGIC_SIZE, 8). 78 | -define(HEADER_SIZE, 41). 79 | -define(TRAILER_SIZE, 28). 80 | -define(FILE_HEADER_SIZE, 1). 81 | -define(READ_BLOCK_SIZE, 1048576). %% 1MB 82 | -define(LATEST_VERSION, 1). 83 | 84 | %% Entry Types 85 | -define(NOOP, 0). 86 | -define(CONFIG, 1). 87 | -define(OP, 2). 88 | -define(ALL, [?CONFIG, ?OP]). 89 | 90 | -ifdef(TEST). 91 | -define(ETS_OPTS, [ordered_set, protected]). 92 | -else. 93 | -define(ETS_OPTS, [named_table, ordered_set, protected]). 94 | -endif. 95 | 96 | 97 | %%==================================================================== 98 | %% API 99 | %%==================================================================== 100 | entry_to_binary(#rafter_entry{type=noop, term=Term, index=Index, cmd=noop}) -> 101 | entry_to_binary(?NOOP, Term, Index, noop); 102 | entry_to_binary(#rafter_entry{type=config, term=Term, index=Index, cmd=Data}) -> 103 | entry_to_binary(?CONFIG, Term, Index, Data); 104 | entry_to_binary(#rafter_entry{type=op, term=Term, index=Index, cmd=Data}) -> 105 | entry_to_binary(?OP, Term, Index, Data). 106 | 107 | entry_to_binary(Type, Term, Index, Data) -> 108 | BinData = term_to_binary(Data), 109 | B0 = <>, 110 | Sha1 = crypto:hash(sha, B0), 111 | <>. 
binary_to_entry(<<Sha1:20/binary, Type:8, Term:64, Index:64, Size:32, Data:Size/binary>>) -> 114 | %% We want to crash on badmatch here if our log is corrupt 115 | %% TODO: Allow an operator to repair the log by truncating at that point 116 | %% or repair each entry 1 by 1 by consulting a good log. 117 | Sha1 = crypto:hash(sha, <<Type:8, Term:64, Index:64, Size:32, Data/binary>>), 118 | binary_to_entry(Type, Term, Index, Data).
158 | 159 | get_last_term(Peer) -> 160 | case get_last_entry(Peer) of 161 | {ok, #rafter_entry{term=Term}} -> 162 | Term; 163 | {ok, not_found} -> 164 | 0 165 | end. 166 | 167 | get_metadata(Peer) -> 168 | gen_server:call(logname(Peer), get_metadata). 169 | 170 | set_metadata(Peer, VotedFor, Term) -> 171 | gen_server:call(logname(Peer), {set_metadata, VotedFor, Term}). 172 | 173 | get_entry(Peer, Index) -> 174 | gen_server:call(logname(Peer), {get_entry, Index}). 175 | 176 | get_term(Peer, Index) -> 177 | case get_entry(Peer, Index) of 178 | {ok, #rafter_entry{term=Term}} -> 179 | Term; 180 | {ok, not_found} -> 181 | 0 182 | end. 183 | 184 | %%==================================================================== 185 | %% gen_server callbacks 186 | %%==================================================================== 187 | init([Name, #rafter_opts{logdir = Logdir}]) -> 188 | LogName = lists:concat([Logdir,"/",Name,".log"]), 189 | MetaName = lists:concat([Logdir,"/",Name,".meta"]), 190 | {ok, LogFile} = file:open(LogName, [append, read, binary, raw]), 191 | {ok, #file_info{size=Size}} = file:read_file_info(LogName), 192 | {ok, Meta} = read_metadata(MetaName, Size), 193 | {ConfigLoc, Config, _Term, Index, WriteLocation, Version} = init_file(LogFile, Size), 194 | LastEntry = find_last_entry(LogFile, WriteLocation), 195 | HintsTable = list_to_atom("rafter_hints_" ++ atom_to_list(Name)), 196 | {ok, #state{logfile=LogFile, 197 | version=Version, 198 | meta_filename=MetaName, 199 | write_location=WriteLocation, 200 | index=Index, 201 | meta=Meta, 202 | config=Config, 203 | config_loc = ConfigLoc, 204 | last_entry=LastEntry, 205 | hints=ets:new(HintsTable, ?ETS_OPTS)}}. 206 | 207 | format_status(_, [_, State]) -> 208 | [{data, [{"StateData", State}]}]. 
209 | 210 | handle_call({kvs_log, Operation}, _From, #state{logfile=File}=State) -> 211 | {reply, kvs:add(Operation#operation{id=kvs:next_id(operation,1)}), State}; 212 | 213 | handle_call({kvs_replay, Operation, {state,Name,Nodes,Storage,L}, Status}, _From, #state{}=State) -> 214 | Storage:dispatch(Operation#operation.body,{state,Name,Nodes,Storage,L}), 215 | {reply, ok, State}; 216 | 217 | %% Leader Append. Entries do NOT have Indexes, as they are unlogged entries as a 218 | %% result of client operations. Appends are based on the current index of the log. 219 | %% Just append to the next location in the log for each entry. 220 | handle_call({append, Entries}, _From, #state{logfile=File}=State) -> 221 | NewState = append_entries(File, Entries, State), 222 | Index = NewState#state.index, 223 | {reply, {ok, Index}, NewState}; 224 | 225 | handle_call(get_config, _From, #state{config=Config}=State) -> 226 | {reply, Config, State}; 227 | 228 | handle_call(get_last_entry, _From, #state{last_entry=undefined}=State) -> 229 | {reply, {ok, not_found}, State}; 230 | handle_call(get_last_entry, _From, #state{last_entry=LastEntry}=State) -> 231 | {reply, {ok, LastEntry}, State}; 232 | 233 | handle_call(get_last_index, _From, #state{index=Index}=State) -> 234 | {reply, Index, State}; 235 | 236 | handle_call(get_metadata, _, #state{meta=Meta}=State) -> 237 | {reply, Meta, State}; 238 | 239 | handle_call({set_metadata, VotedFor, Term}, _, #state{meta_filename=Name}=S) -> 240 | Meta = #meta{voted_for=VotedFor, term=Term}, 241 | ok = write_metadata(Name, Meta), 242 | {reply, ok, S#state{meta=Meta}}; 243 | 244 | %% Follower append. Logs may not match. Write the first entry at the given index 245 | %% and reset the current index maintained in #state{}. Note that Entries 246 | %% actually contain correct indexes, since they are sent from the leader. 247 | %% Return the last index written. 
248 | handle_call({check_and_append, Entries, Index}, _From, #state{logfile=File, 249 | hints=Hints}=S) -> 250 | Loc0 = closest_forward_offset(Hints, Index), 251 | {Loc, Count} = get_pos(File, Loc0, Index), 252 | State = update_counters(Count, 0, S), 253 | #state{index=NewIndex}=NewState = maybe_append(Loc, Entries, State), 254 | {reply, {ok, NewIndex}, NewState}; 255 | 256 | handle_call({get_entry, Index}, _From, #state{logfile=File, 257 | hints=Hints}=State0) -> 258 | Loc = closest_forward_offset(Hints, Index), 259 | {Res, NewState} = 260 | case find_entry(File, Loc, Index) of 261 | {not_found, Count} -> 262 | State = update_counters(Count, 0, State0), 263 | {not_found, State}; 264 | {Entry, NextLoc, Count} -> 265 | Prunes = add_hint(Hints, Index, NextLoc), 266 | State = update_counters(Count, Prunes, State0), 267 | {Entry, State} 268 | end, 269 | {reply, {ok, Res}, NewState}. 270 | 271 | -spec update_counters(offset(), non_neg_integer(), #state{}) -> #state{}. 272 | update_counters(Distance, Prunes, #state{hint_prunes=Prunes0, 273 | seek_counts=Dict0} 274 | =State) -> 275 | Dict = dict:update_counter(Distance, 1, Dict0), 276 | State#state{hint_prunes=Prunes0 + Prunes, seek_counts=Dict}. 277 | 278 | -spec closest_forward_offset(ets:tid(), index()) -> offset(). 279 | closest_forward_offset(Hints, Index) -> 280 | case ets:prev(Hints, Index) of 281 | '$end_of_table' -> 282 | ?FILE_HEADER_SIZE; 283 | Key -> 284 | [{Key, Loc0}] = ets:lookup(Hints, Key), 285 | Loc0 286 | end. 287 | 288 | -spec add_hint(ets:tid(), index(), offset()) -> non_neg_integer(). 289 | add_hint(Hints, Index, Loc) -> 290 | {size, Size} = lists:keyfind(size, 1, ets:info(Hints)), 291 | case Size >= ?MAX_HINTS of 292 | true -> 293 | delete_hints(Hints), 294 | true = ets:insert(Hints, {Index, Loc}), 295 | 1; 296 | false -> 297 | true = ets:insert(Hints, {Index, Loc}), 298 | 0 299 | end. 
300 | 301 | %% Delete every 10th hint 302 | delete_hints(Hints) -> 303 | L = ets:tab2list(Hints), 304 | {_, ToDelete} = 305 | lists:foldl(fun({Index, _}, {Count, Deleted}) when Count rem 10 =:= 0 -> 306 | {Count+1, [Index | Deleted]}; 307 | ({_, _}, {Count, Deleted}) -> 308 | {Count+1, Deleted} 309 | end, {0, []}, L), 310 | [true = ets:delete(Hints, Index) || Index <- ToDelete]. 311 | 312 | handle_cast(stop, #state{logfile=File}=State) -> 313 | ok = file:close(File), 314 | {stop, normal, State}; 315 | handle_cast(_Msg, State) -> 316 | {noreply, State}. 317 | 318 | handle_info(_Info, State) -> 319 | {noreply, State}. 320 | 321 | terminate(_Reason, _State) -> 322 | ok. 323 | 324 | code_change(_OldVsn, State, _Extra) -> 325 | {ok, State}. 326 | 327 | 328 | %%==================================================================== 329 | %% Internal Functions 330 | %%==================================================================== 331 | 332 | maybe_append(_, [], State) -> 333 | State; 334 | maybe_append(eof, [Entry | Entries], State) -> 335 | NewState = write_entry(Entry, State), 336 | maybe_append(eof, Entries, NewState); 337 | maybe_append(Loc, [Entry | Entries], State=#state{logfile=File}) -> 338 | #rafter_entry{index=Index, term=Term}=Entry, 339 | case read_entry(File, Loc) of 340 | {entry, Data, NewLocation} -> 341 | case binary_to_entry(Data) of 342 | %% We already have this entry in the log. Continue. 343 | #rafter_entry{index=Index, term=Term} -> 344 | maybe_append(NewLocation, Entries, State); 345 | #rafter_entry{index=Index, term=_} -> 346 | NewState = truncate_and_write(File, Loc, Entry, State), 347 | maybe_append(eof, Entries, NewState) 348 | end; 349 | eof -> 350 | NewState = truncate_and_write(File, Loc, Entry, State), 351 | maybe_append(eof, Entries, NewState) 352 | end. 
353 | 354 | truncate_and_write(File, Loc, Entry, State0) -> 355 | ok = truncate(File, Loc), 356 | State1 = maybe_reset_config(File, Loc, State0), 357 | State2 = State1#state{write_location=Loc}, 358 | write_entry(Entry, State2). 359 | 360 | -spec maybe_reset_config(file:io_device(), non_neg_integer(), #state{}) -> 361 | #state{}. 362 | maybe_reset_config(File, Loc, #state{config_loc=ConfigLoc}=State) -> 363 | case ConfigLoc >= Loc of 364 | true -> 365 | reset_config(File, Loc, State); 366 | false -> 367 | State 368 | end. 369 | 370 | -spec reset_config(file:io_device(), non_neg_integer(), #state{}) -> #state{}. 371 | reset_config(File, Loc, State) -> 372 | case Loc of 373 | ?FILE_HEADER_SIZE -> 374 | %% Empty file, so reset to blank config 375 | State#state{config_loc=0, config=#config{}}; 376 | _ -> 377 | %% Get config from the previous trailer 378 | TrailerLoc = Loc - ?TRAILER_SIZE, 379 | {ok, Trailer} = file:pread(File, TrailerLoc, ?TRAILER_SIZE), 380 | <> = Trailer, 381 | %% validate checksum, fail fast. 382 | CRC = erlang:crc32(Rest), 383 | <> = Rest, 384 | case ConfigLoc of 385 | 0 -> 386 | State#state{config_loc=0, config=#config{}}; 387 | _ -> 388 | {ok, Config} = read_config(File, ConfigLoc), 389 | State#state{config_loc=ConfigLoc, config=Config} 390 | end 391 | end. 392 | 393 | logname(IndexNode) -> list_to_atom(lists:concat(["log:",IndexNode])). 394 | 395 | init_file(File, 0) -> 396 | {ok, Loc} = write_file_header(File), 397 | {0, #config{}, 0, 0, Loc, ?LATEST_VERSION}; 398 | init_file(File, Size) -> 399 | case repair_file(File, Size) of 400 | {ok, ConfigLoc, Term, Index, WriteLoc} -> 401 | {ok, Version} = read_file_header(File), 402 | {ok, Config} = read_config(File, ConfigLoc), 403 | {ConfigLoc, Config, Term, Index, WriteLoc, Version}; 404 | empty_file -> 405 | {ok, Loc} = write_file_header(File), 406 | {0, #config{}, 0, 0, Loc, ?LATEST_VERSION} 407 | end. 
read_file_header(File) -> 410 | {ok, <<Version:8>>} = file:pread(File, 0, ?FILE_HEADER_SIZE), 411 | {ok, Version}. 412 | 413 | write_file_header(File) -> 414 | ok = file:write(File, <<?LATEST_VERSION:8>>), 415 | {ok, ?FILE_HEADER_SIZE}. 416 | 417 | make_trailer(EntryStart, ConfigStart) -> 418 | T = <<ConfigStart:64, EntryStart:64, ?MAGIC/binary>>, 419 | Crc = erlang:crc32(T), 420 | <<Crc:32, T/binary>>. 421 | 422 | append_entries(File, Entries, State) -> 423 | NewState = lists:foldl(fun append_entry/2, State, Entries), 424 | ok = file:sync(File), 425 | NewState. 426 | 427 | %% Append an entry at the next location in the log. The entry does not yet have an 428 | %% index, so add one. 429 | append_entry(Entry, State=#state{index=Index}) -> 430 | NewIndex = Index + 1, 431 | NewEntry = Entry#rafter_entry{index=NewIndex}, 432 | write_entry(NewEntry, State). 433 | 434 | %% Precondition: each entry must have an index at this point. 435 | write_entry(Entry, State) -> 436 | #rafter_entry{index=Index, type=Type, cmd=Cmd}=Entry, 437 | #state{write_location=Loc, config=Config, config_loc=ConfigLoc, 438 | logfile=File} = State, 439 | BinEntry = entry_to_binary(Entry), 440 | {NewConfigLoc, NewConfig} = 441 | maybe_update_config(Type, Loc, Cmd, ConfigLoc, Config), 442 | Trailer = make_trailer(Loc, NewConfigLoc), 443 | ok = file:write(File, <<BinEntry/binary, Trailer/binary>>),
463 | write_metadata(Filename, Meta) -> 464 | ok = file:write_file(Filename, term_to_binary(Meta)). 465 | 466 | read_metadata(Filename, FileSize) -> 467 | case file:read_file(Filename) of 468 | {ok, Bin} -> 469 | {ok, binary_to_term(Bin)}; 470 | {error, enoent} when FileSize =< ?FILE_HEADER_SIZE -> 471 | {ok, #meta{}}; 472 | {error, Reason} -> 473 | io:format("Failed to open metadata file: ~p. Reason = ~p~n", 474 | [Filename, Reason]), 475 | {ok, #meta{}} 476 | end. 477 | 478 | truncate(File, Pos) -> 479 | {ok, _} = file:position(File, Pos), 480 | file:truncate(File). 481 | 482 | maybe_truncate(File, TruncateAt, FileSize) -> 483 | case TruncateAt < FileSize of 484 | true -> 485 | ok = truncate(File, TruncateAt); 486 | false -> 487 | ok 488 | end. 489 | 490 | repair_file(File, Size) -> 491 | case scan_for_trailer(File, Size) of 492 | {ok, ConfigStart, EntryStart, TruncateAt} -> 493 | maybe_truncate(File, TruncateAt, Size), 494 | {entry, Data, _} = read_entry(File, EntryStart), 495 | #rafter_entry{term=Term, index=Index} = binary_to_entry(Data), 496 | {ok, ConfigStart, Term, Index, TruncateAt}; 497 | not_found -> 498 | io:format("NOT FOUND: Size = ~p~n", [Size]), 499 | ok = truncate(File, 0), 500 | empty_file 501 | end. 502 | 503 | scan_for_trailer(File, Loc) -> 504 | case find_magic_number(File, Loc) of 505 | {ok, MagicLoc} -> 506 | case file:pread(File, MagicLoc - (?TRAILER_SIZE-?MAGIC_SIZE), ?TRAILER_SIZE) of 507 | {ok, <>} -> 508 | case erlang:crc32(<>) of 509 | Crc -> 510 | {ok, ConfigStart, EntryStart, MagicLoc + 8}; 511 | _ -> 512 | scan_for_trailer(File, MagicLoc) 513 | end; 514 | eof -> 515 | not_found 516 | end; 517 | not_found -> 518 | not_found 519 | end. 520 | 521 | read_block(File, Loc) -> 522 | case Loc < ?READ_BLOCK_SIZE of 523 | true -> 524 | {ok, Buffer} = file:pread(File, 0, Loc), 525 | {Buffer, 0}; 526 | false -> 527 | Start = Loc - ?READ_BLOCK_SIZE, 528 | {ok, Buffer} = file:pread(File, Start, ?READ_BLOCK_SIZE), 529 | {Buffer, Start} 530 | end. 
531 | 532 | %% @doc Continuously read blocks from the file and search backwards until the 533 | %% magic number is found or we reach the beginning of the file. 534 | find_magic_number(File, Loc) -> 535 | {Block, Start} = read_block(File, Loc), 536 | case find_last_magic_number_in_block(Block) of 537 | {ok, Offset} -> 538 | io:format("Magic Number found at ~p~n", [Start+Offset]), 539 | {ok, Start+Offset}; 540 | not_found -> 541 | case Start of 542 | 0 -> 543 | not_found; 544 | _ -> 545 | %% Ensure we search the overlapping 8 bytes between blocks 546 | find_magic_number(File, Start+8) 547 | end 548 | end. 549 | 550 | -spec find_last_magic_number_in_block(binary()) -> 551 | {ok, non_neg_integer()} | not_found. 552 | find_last_magic_number_in_block(Block) -> 553 | case string:rstr(binary_to_list(Block), binary_to_list(?MAGIC)) of 554 | 0 -> 555 | not_found; 556 | Index -> 557 | %% We want the 0 based binary offset, not the 1 based list offset. 558 | {ok, Index - 1} 559 | end. 560 | 561 | get_pos(File, Loc, Index) -> 562 | get_pos(File, Loc, Index, 0). 563 | 564 | get_pos(File, Loc, Index, Count) -> 565 | case file:pread(File, Loc, ?HEADER_SIZE) of 566 | {ok, <<_Sha1:20/binary, _Type:8, _Term:64, Index:64, _DataSize:32>>} -> 567 | {Loc, Count}; 568 | {ok, <<_:37/binary, DataSize:32>>} -> 569 | get_pos(File, next_entry_loc(Loc, DataSize), Index, Count+1); 570 | eof -> 571 | {eof, Count} 572 | end. 573 | 574 | %% @doc Find an entry at the given index in a file. Search forward from Loc. 575 | find_entry(File, Loc, Index) -> 576 | find_entry(File, Loc, Index, 0). 577 | 578 | find_entry(File, Loc, Index, Count) -> 579 | case file:pread(File, Loc, ?HEADER_SIZE) of 580 | {ok, <<_Sha1:20/binary, _Type:8, _Term:64, Index:64, _DataSize:32>>=Header} -> 581 | case read_data(File, Loc + ?HEADER_SIZE, Header) of 582 | {entry, Entry, _} -> 583 | {binary_to_entry(Entry), Loc, Count}; 584 | eof -> 585 | %% This should only occur if the entry is currently being written. 
586 | {not_found, Count} 587 | end; 588 | {ok, <<_:37/binary, DataSize:32>>} -> 589 | NextLoc = next_entry_loc(Loc, DataSize), 590 | find_entry(File, NextLoc, Index, Count+1); 591 | eof -> 592 | {not_found, Count} 593 | end. 594 | 595 | next_entry_loc(Loc, DataSize) -> 596 | Loc + ?HEADER_SIZE + DataSize + ?TRAILER_SIZE. 597 | 598 | find_last_entry(_File, WriteLocation) when WriteLocation =< ?FILE_HEADER_SIZE -> 599 | undefined; 600 | find_last_entry(File, WriteLocation) -> 601 | {ok, <<_:32, _:64, EntryStart:64, _/binary>>} = 602 | file:pread(File, WriteLocation - ?TRAILER_SIZE, ?TRAILER_SIZE), 603 | {entry, Entry, _} = read_entry(File, EntryStart), 604 | binary_to_entry(Entry). 605 | 606 | %% @doc This function reads the next entry from the log at the given location 607 | %% and returns {entry, Entry, NewLocation}. If the end of file has been reached, 608 | %% return eof to the client. Errors are fail-fast. 609 | -spec read_entry(file:io_device(), non_neg_integer()) -> 610 | {entry, binary(), non_neg_integer()} | {skip, non_neg_integer()} | eof. 611 | read_entry(File, Location) -> 612 | case file:pread(File, Location, ?HEADER_SIZE) of 613 | {ok, <<_Sha1:20/binary, _Type:8, _Term:64, _Index:64, _DataSize:32>>=Header} -> 614 | read_data(File, Location + ?HEADER_SIZE, Header); 615 | eof -> 616 | eof 617 | end. 618 | 619 | -spec read_data(file:io_device(), non_neg_integer(), binary()) -> 620 | {entry, binary(), non_neg_integer()} | eof. 621 | read_data(File, Location, <>=H) -> 622 | case file:pread(File, Location, Size) of 623 | {ok, Data} -> 624 | %% Fail-fast Integrity check. TODO: Offer user repair options? 625 | Sha1 = crypto:hash(sha, <>), 626 | NewLocation = Location + Size + ?TRAILER_SIZE, 627 | {entry, <>, NewLocation}; 628 | eof -> 629 | eof 630 | end. 631 | 632 | -ifdef(TEST). 633 | -include_lib("eunit/include/eunit.hrl"). 634 | -define(PEER, test). 635 | 636 | cleanup() -> 637 | os:cmd("rm -rf /tmp/rafter_test*"). 
%% REGRESSION: - see https://github.com/andrewjstone/rafter/pull/32
%% Exercises leader appends, follower overwrites, and verifies that the cached
%% config location in the log server state is truncated/reset correctly.
log_overwrite_test() ->
    cleanup(),
    Opts = #rafter_opts{logdir="/tmp"},
    {ok, _Pid} = rafter_log:start_link(?PEER, Opts),
    assert_empty(),

    %% We are appending Entry1 as the leader, so it has no index.
    Entry1 = #rafter_entry{type=config, term=1, index=undefined,
                           cmd=#config{state=stable}},
    assert_leader_append(1, 1, Entry1),
    ConfigLoc0 = assert_stable_config(),

    Entry2 = #rafter_entry{type=noop, term=1, index=undefined, cmd=noop},
    assert_leader_append(2, 1, Entry2),
    ConfigLoc1 = assert_stable_config(),
    ?assertEqual(ConfigLoc0, ConfigLoc1),

    %% A new leader takes over and this log gets its entry overwritten.
    %% In reality index 1 will always be a #config{}, but this validates the
    %% test that config gets reset.
    Entry = #rafter_entry{type=noop, term=2, index=1, cmd=noop},
    assert_follower_append(Entry),
    assert_blank_config(),

    %% This peer becomes leader again and appends 2 configs
    Entry3 = #rafter_entry{type=config, term=3, cmd=#config{state=stable}},
    assert_leader_append(2, 3, Entry3),
    ConfigLoc2 = assert_stable_config(),

    Entry4 = #rafter_entry{type=config, term=3, cmd=#config{state=stable}},
    assert_leader_append(3, 3, Entry4),
    ConfigLoc3 = assert_stable_config(),
    ?assertNotEqual(ConfigLoc2, ConfigLoc3),

    %% A new leader takes over and truncates the last config
    Entry5 = #rafter_entry{type=noop, term=4, index=3, cmd=noop},
    assert_follower_append(Entry5),
    ConfigLoc4 = assert_stable_config(),
    ?assertEqual(ConfigLoc2, ConfigLoc4),
    Index = rafter_log:get_last_index(?PEER),
    %% FIX: ?assertEqual takes (Expected, Actual) — expected value comes first,
    %% otherwise failure reports are misleading.
    ?assertEqual(3, Index),
    {ok, Entry6} = rafter_log:get_last_entry(?PEER),
    ?assertEqual(Entry5, Entry6),

    %% A new leader takes over and truncates the last stable config
    %% New config is at position 0
    Entry7 = #rafter_entry{type=noop, term=5, index=2, cmd=noop},
    assert_follower_append(Entry7),
    assert_blank_config(),
    Index2 = rafter_log:get_last_index(?PEER),
    ?assertEqual(2, Index2),
    {ok, Entry8} = rafter_log:get_last_entry(?PEER),
    ?assertEqual(Entry7, Entry8),

    rafter_log:stop(?PEER).

%% Append as leader and verify index/term bookkeeping on the written entry.
assert_leader_append(ExpectedIndex, ExpectedTerm, Entry) ->
    {ok, Index} = rafter_log:append(?PEER, [Entry]),
    ?assertEqual(ExpectedIndex, Index),
    {ok, Entry1} = rafter_log:get_entry(?PEER, Index),
    {ok, Entry1} = rafter_log:get_last_entry(?PEER),
    Index = rafter_log:get_last_index(?PEER),
    %% FIX: expected value goes first in ?assertEqual.
    ?assertEqual(ExpectedIndex, Entry1#rafter_entry.index),
    ?assertEqual(ExpectedTerm, Entry1#rafter_entry.term).

assert_follower_append(Entry) ->
    %% Note that follower appends always have indexes since they are sent
    %% from the leader who has already written the entry to its log.
    Index = Entry#rafter_entry.index,
    {ok, Index} = rafter_log:check_and_append(?PEER, [Entry], Index),
    {ok, Entry1} = rafter_log:get_entry(?PEER, Index),
    ?assertEqual(Entry, Entry1).

%% Config must be blank and the cached config location reset to 0.
assert_blank_config() ->
    Config = rafter_log:get_config(?PEER),
    ?assertEqual(blank, Config#config.state),
    State = sys:get_state(logname(?PEER)),
    ?assertEqual(0, State#state.config_loc).

%% Config must be stable; returns its location so callers can compare.
assert_stable_config() ->
    Config = rafter_log:get_config(?PEER),
    ?assertEqual(stable, Config#config.state),
    State = sys:get_state(logname(?PEER)),
    ConfigLoc = State#state.config_loc,
    ?assertNotEqual(0, ConfigLoc),
    ConfigLoc.

assert_empty() ->
    ?assertEqual({ok, not_found}, rafter_log:get_last_entry(?PEER)),
    ?assertEqual(0, rafter_log:get_last_index(?PEER)),
    assert_blank_config().

-endif.
-------------------------------------------------------------------------------- /src/consensus/cr_paxon.erl: --------------------------------------------------------------------------------
-module(cr_paxon).
-author('Uenishi Kota').
-behaviour(gen_fsm).
-compile(export_all).
-include("cr.hrl").
-export(?GEN_FSM).

-export([nil/2,
         preparing/2,
         proposing/2,
         acceptor/2,
         learner/2,
         decided/2]).

-define(TIMEOUT, 3000).

%% subject: the topic being agreed on; n: current proposal number;
%% all/quorum: cluster size and majority threshold; current: responses so far;
%% return_pids: clients notified once a value is decided.
-record(state, {subject, n, value,
                all, quorum, current=0, others, init_n,
                return_pids=[]}).

version_info() -> {?MODULE, 1}.

start(S, InitN, V, Others, ReturnPids) ->
    All = length(Others)+1, Quorum = All / 2,
    InitStateData = #state{subject=S, n=InitN, value=V,
                           all=All, quorum=Quorum, others=Others, init_n=InitN,
                           return_pids=ReturnPids},
    gen_fsm:start_link(
        generate_global_address(node(), S),   %% FsmName (globally registered per subject)
        ?MODULE,
        InitStateData,
        [{timeout, ?TIMEOUT}]).

stop(S) -> gen_fsm:send_all_state_event(generate_global_address(node(), S), stop).
get_result(S) -> gen_fsm:sync_send_all_state_event(generate_global_address(node(), S), result).

init(InitStateData) ->
    io:format("~p ~p: ~p~n", [?MODULE, started, InitStateData]),
    process_flag(trap_exit, true),
    {ok,
     nil,            %% initial statename
     InitStateData,  %% initial state data
     ?TIMEOUT        %% initial state timeout
    }.

%% Send Message to the FSM for subject S on every other node.
broadcast(Others, S, Message) ->
    PaxosOthers = [generate_global_address(P, S) || P <- Others],
    lists:map(fun(Other) -> gen_fsm:send_event(Other, Message) end, PaxosOthers).

send(Node, S, Message) -> gen_fsm:send_event(generate_global_address(Node, S), Message).
%% Next proposal number in this node's ballot sequence (round-robin by All).
get_next_n(N, All) -> ((N div All)+1) * All.
generate_global_address(Node, Subject) -> {global, {?MODULE, Node, Subject}}.

nil({prepare, {S, N, _V, From}}, StateData) when N > StateData#state.n ->
    send(From, S, {prepare_result, {S, 0, nil, node()}}),
    NewStateData = StateData#state{n=N},
    {next_state, acceptor, NewStateData, ?TIMEOUT};
nil({prepare, {S, N, _V, From}}, StateData) when N < StateData#state.n ->
    send(From, S, {prepare_result, {S, StateData#state.n, StateData#state.value, node()}}),
    {next_state, nil, StateData, ?TIMEOUT};
nil({decide, {S, N, V, _From}}, StateData) -> % when N == Nc
    S = StateData#state.subject,   %% assertion: decide must match our subject
    decided_callback(StateData#state{n=N, value=V});
nil(timeout, StateData) ->
    %% Nobody contacted us; start a prepare round with a fresh proposal number.
    NewN = get_next_n(StateData#state.n, StateData#state.all) + StateData#state.init_n,
    io:format("PAXON ~p. ~n", [[NewN, StateData]]),
    S = StateData#state.subject,
    V = StateData#state.value,
    Result = broadcast(StateData#state.others, S, {prepare, {S, NewN, V, node()}}),
    io:format("BROADCAST: ~p. ~n", [Result]),
    {next_state, preparing, StateData#state{n=NewN, current=1}, ?TIMEOUT};
nil(UnknownEvent, StateData) -> % ignore
    io:format("unknown event: ~p, ~p : all ignored.~n", [UnknownEvent, StateData]),
    {next_state, nil, StateData, ?TIMEOUT}.

preparing({prepare, {S, N, _V, From}}, StateData) when N < StateData#state.n ->
    send(From, S, {prepare, {S, StateData#state.n, StateData#state.value, node()}}),
    {next_state, preparing, StateData, ?TIMEOUT};
preparing({prepare, {S, N, V, From}}, StateData) when N > StateData#state.n ->
    send(From, S, {prepare_result, {S, StateData#state.n, StateData#state.value, node()}}),
    io:format("sending prepare_result and going acceptor...~n", []),
    {next_state, acceptor, StateData#state{subject=S, n=N, value=V}, ?TIMEOUT};
preparing({prepare_result, {S, N, V, From}}, StateData) when N > StateData#state.n ->
    send(From, S, {prepare_result, {S, N, V, node()}}),
    {next_state, acceptor, StateData#state{subject=S, n=N, value=V}, ?TIMEOUT};
preparing({prepare_result, {S, _N, _V, _From}}, StateData) when StateData#state.current > StateData#state.quorum ->
    broadcast(StateData#state.others, S, {propose, {S, StateData#state.n, StateData#state.value, node()}}),
    {next_state, proposing, StateData#state{current=1}, ?TIMEOUT};
preparing({prepare_result, {S, N, V, _From}}, StateData)
        when S == StateData#state.subject, N == StateData#state.n, V == StateData#state.value ->
    Current = StateData#state.current,
    {next_state, proposing, StateData#state{current=Current+1}, ?TIMEOUT};
preparing({prepare_result, {S, N, _V, _From}}, StateData) when N < StateData#state.n ->
    case (StateData#state.current + 1 > StateData#state.quorum) of
        true ->
            io:format("got quorum at prepare!~n", []),
            broadcast(StateData#state.others, S, {propose, {S, StateData#state.n, StateData#state.value, node()}}),
            {next_state, proposing, StateData#state{current=1}, ?TIMEOUT};
        false ->
            Current = StateData#state.current,
            {next_state, preparing, StateData#state{current=Current+1}, ?TIMEOUT}
    end;
preparing({propose, {S, N, V, From}}, StateData) when N > StateData#state.n ->
    send(From, S, {propose_result, {S, N, V, node()}}),
    {next_state, learner, StateData#state{n=N, value=V}, ?TIMEOUT};
preparing({propose_result, {S, N, V, From}}, StateData) when N > StateData#state.n ->
    send(From, S, {propose_result, {S, N, V, node()}}),
    {next_state, learner, StateData#state{n=N, value=V}, ?TIMEOUT};
preparing({decide, {_S, N, V, _From}}, StateData) ->
    decided_callback(StateData#state{n=N, value=V});

preparing(timeout, StateData) ->
    {next_state, nil, StateData#state{current=0}, ?TIMEOUT}.

proposing({prepare, {S, N, V, From}}, StateData) when N > StateData#state.n ->
    send(From, S, {prepare_result, {S, StateData#state.n, StateData#state.value, node()}}),
    {next_state, acceptor, StateData#state{n=N, value=V}, ?TIMEOUT};
proposing({prepare_result, {S, N, V, From}}, StateData) when N > StateData#state.n ->
    send(From, S, {prepare_result, {S, StateData#state.n, StateData#state.value, node()}}),
    {next_state, acceptor, StateData#state{n=N, value=V}, ?TIMEOUT};
proposing({propose, {S, N, V, From}}, StateData) when N > StateData#state.n ->
    send(From, S, {propose_result, {S, StateData#state.n, StateData#state.value, node()}}),
    {next_state, learner, StateData#state{n=N, value=V}, ?TIMEOUT};
proposing({propose_result, {S, N, V, _From}}, StateData)
        when N == StateData#state.n, V == StateData#state.value, StateData#state.quorum > StateData#state.current+1 ->
    S = StateData#state.subject,
    Current = StateData#state.current,
    {next_state, proposing, StateData#state{current=Current+1}, ?TIMEOUT};
proposing({propose_result, {S, N, V, _From}}, StateData) when N == StateData#state.n, V == StateData#state.value ->
    io:format("PROPOSING quorum result~n", []),
    broadcast(StateData#state.others, S, {decide, {S, N, V, node()}}),
    Current = StateData#state.current,
    decided_callback(StateData#state{current=Current+1});
proposing({propose_result, {S, N, V, From}}, StateData) when N > StateData#state.n ->
    send(From, S, {propose_result, {S, N, V, node()}}),
    {next_state, learner, StateData#state{n=N, value=V}, ?TIMEOUT};
proposing({decide, {S, N, V, _From}}, StateData) when N >= StateData#state.n ->
    S = StateData#state.subject,
    decided_callback(StateData#state{n=N, value=V});
proposing(timeout, StateData) ->
    io:format("PROPOSING timeout state: ~p~n", [StateData]),
    {next_state, nil, StateData#state{current=1}, ?TIMEOUT};
proposing(_Event, StateData) ->
    {next_state, proposing, StateData}.

acceptor({prepare, {S, N, _V, From}}, StateData) when N < StateData#state.n ->
    send(From, S, {prepare_result, {S, StateData#state.n, StateData#state.value, node()}}),
    {next_state, acceptor, StateData, ?TIMEOUT};
acceptor({prepare, {S, N, V, From}}, StateData) when N >= StateData#state.n ->
    send(From, S, {prepare_result, {S, StateData#state.n, StateData#state.value, node()}}),
    {next_state, acceptor, StateData#state{n=N, value=V}, ?TIMEOUT};
acceptor({propose, {S, N, _V, _From}}, StateData) when N < StateData#state.n ->
    io:format("bad state: ~p (N,Nc)=(~p)~n", [{propose}, {N, StateData#state.n}]),
    S = StateData#state.subject,
    %% BUG FIX: `propose` is not a state of this FSM (states are nil/preparing/
    %% proposing/acceptor/learner/decided); transitioning to it would crash the
    %% gen_fsm on the next event. Stay in acceptor instead.
    {next_state, acceptor, StateData, ?TIMEOUT};
acceptor({propose, {S, N, V, From}}, StateData) when N > StateData#state.n ->
    send(From, S, {propose_result, {S, StateData#state.n, StateData#state.value, node()}}),
    {next_state, learner, StateData#state{n=N, value=V}, ?TIMEOUT};
acceptor({propose, {S, N, V, From}}, StateData) -> % when N == Nc
    {N, V} = {StateData#state.n, StateData#state.value},   %% assertion
    send(From, S, {propose_result, {S, StateData#state.n, StateData#state.value, node()}}),
    {next_state, learner, StateData, ?TIMEOUT};
acceptor({decide, {S, N, V, _From}}, StateData) when N >= StateData#state.n ->
    S = StateData#state.subject,
    decided_callback(StateData#state{n=N, value=V});
acceptor(timeout, StateData) ->
    io:format("ACCEPTOR timeout: ~p (N,V)=(~p)~n", [{propose}, {StateData#state.n, StateData#state.value}]),
    {next_state, nil, StateData#state{current=1}, ?TIMEOUT};

acceptor(_Event, StateData) ->
    io:format("ACCEPTOR unknown event: ~p ,~p~n", [_Event, StateData]),
    {next_state, acceptor, StateData}.

learner({prepare, {S, N, V, From}}, StateData) when N > StateData#state.n ->
    send(From, S, {prepare_result, {S, N, V, node()}}),
    {next_state, acceptor, StateData#state{n=N, value=V}, ?TIMEOUT};
learner({prepare_result, {S, _N, _V, _From}}, StateData) ->
    S = StateData#state.subject,
    {next_state, learner, StateData, ?TIMEOUT};
learner({propose, {S, N, _V, _From}}, StateData) when N < StateData#state.n ->
    S = StateData#state.subject,
    {next_state, learner, StateData, ?TIMEOUT};
learner({propose, {S, N, V, From}}, StateData) when N > StateData#state.n ->
    send(From, S, {propose_result, {S, N, V, node()}}),
    {next_state, learner, StateData#state{n=N, value=V}, ?TIMEOUT};
learner({decide, {S, N, V, _From}}, StateData) when N >= StateData#state.n ->
    S = StateData#state.subject,
    decided_callback(StateData#state{n=N, value=V});
learner(timeout, StateData) ->
    {next_state, nil, StateData#state{current=0}, ?TIMEOUT};
learner(_Event, StateData) ->
    {next_state, learner, StateData}.

%% Terminal state: answer any message with the decided value, then stop on idle.
decided({_Message, {S, _N, _V, From}}, StateData) ->
    send(From, S, {decide, {S, StateData#state.n, StateData#state.value, node()}}),
    {next_state, decided, StateData, ?TIMEOUT};
decided(timeout, StateData) ->
    io:format("PAXON mediation: ~p/~p~n", [StateData#state.value, StateData#state.n]),
    {stop, normal, StateData}.

decided_callback(StateData) ->
    callback(StateData#state.subject, StateData#state.value, StateData#state.return_pids),
    {next_state, decided, StateData, ?TIMEOUT}.

callback(S, V, ReturnPids) ->
    lists:map(fun(ReturnPid) -> ReturnPid ! {self(), result, {S, V}} end, ReturnPids).

%% BUG FIX: code_change/4 must return {ok, NextStateName, NewStateData},
%% not a bare `ok`, or a release upgrade kills the FSM.
code_change(_OldVsn, StateName, StateData, _Extra) -> {ok, StateName, StateData}.
handle_event(stop, _StateName, StateData) -> {stop, normal, StateData}.
%% BUG FIX: handle_info/3 must return a state tuple; `ok` crashes the FSM on
%% any stray message (we trap exits in init/1, so 'EXIT' messages do arrive).
handle_info(_Info, StateName, StateData) -> {next_state, StateName, StateData, ?TIMEOUT}.
%% BUG FIX: the value lives in StateData (the #state{} record); StateName is a
%% bare atom, so StateName#state.value was a guaranteed badrecord crash.
handle_sync_event(result, _From, StateName, StateData) ->
    {reply, {StateName, StateData#state.value}, StateName, StateData};
%% BUG FIX: previous code returned {stop, From, StateName, StateData}, using
%% the caller as the stop reason and the state name as the reply. Reply `ok`
%% and stop normally.
handle_sync_event(stop, _From, _StateName, StateData) ->
    {stop, normal, ok, StateData}.
terminate(Reason, StateName, StateData) ->
    io:format("Module ~p terminated with reason: ~p~n", [?MODULE, Reason]),
    io:format("State ~p with data: ~p~n", [StateName, StateData]),
    ok.
-------------------------------------------------------------------------------- /src/consensus/cr_rafter.erl: --------------------------------------------------------------------------------
-module(cr_rafter).
-author('Andrew J. Stone').
-description('RAFT protocol').
-behaviour(gen_fsm).
-include("rafter.hrl").
-include("rafter_consensus_fsm.hrl").
-include("rafter_opts.hrl").
-include("cr.hrl").
-export(?GEN_FSM).
-compile(export_all).
-export([follower/2, follower/3, candidate/2, candidate/3, leader/2, leader/3]).

-define(CLIENT_TIMEOUT, 2000).
-define(ELECTION_TIMEOUT_MIN, 500).
-define(ELECTION_TIMEOUT_MAX, 1000).
-define(HEARTBEAT_TIMEOUT, 100).

start_link({Index,Node}, Opts) ->
    %% NOTE(review): the concatenated name is computed but registration uses
    %% Node directly — confirm whether {local,Name} was intended.
    _Name = list_to_atom(lists:concat([Index,':',Node])),
    io:format("RAFTER start_link ~p~n",[{Index,Node}]),
    gen_fsm:start_link({local,Node},?MODULE, [Node, Opts], []).

%% NOTE(review): list_to_atom on dynamic input grows the atom table; fine for
%% a fixed cluster, unsafe for unbounded names.
raftname(Name) -> list_to_atom(lists:concat(["rafter:",Name])).
%% Load persisted metadata, initialise the backend state machine, and start as
%% a follower. If a config already exists on disk, initial configuration is
%% considered complete.
init([Me, #rafter_opts{state_machine=StateMachine,cluster=Nodes}]) ->
    Timer = gen_fsm:send_event_after(election_timeout(), timeout),
    #meta{voted_for=VotedFor, term=Term} = cr_log:get_metadata(Me),
    BackendState = StateMachine:init(Me),
    io:format("RAFTER INIT Me: ~p~n",[Me]),
    BaseState = #state{term=Term,
                       voted_for=VotedFor,
                       me=Me,
                       responses=dict:new(),
                       followers=dict:new(),
                       commit_index = cr_log:get_last_index(cr:node()),
                       timer=Timer,
                       state_machine=StateMachine,
                       backend_state=BackendState},
    Config = cr:config(),
    NewState = case Config#config.state of
                   blank -> BaseState#state{config=Config};
                   _     -> BaseState#state{config=Config, init_config=complete}
               end,
    {ok, follower, NewState}.

%% Client-facing API: every call is routed to the locally registered FSM.
stop(Pid) -> gen_fsm:send_all_state_event({Pid,Pid}, stop).
op(Command) -> gen_fsm:sync_send_event(get_leader(cr:node()), {op, Command}).
op(Peer, Command) -> gen_fsm:sync_send_event({Peer,Peer}, {op, Command}).
read_op(Peer, Command) -> gen_fsm:sync_send_event({Peer,Peer}, {read_op, Command}).
set_config(Peer, Config) -> gen_fsm:sync_send_event({Peer,Peer}, {set_config, Config}).
get_leader(Pid) -> gen_fsm:sync_send_all_state_event({Pid,Pid}, get_leader).
send(To, Msg) -> catch gen_fsm:send_event({To,To}, Msg).
send_sync(To, Msg) -> Timeout=100, gen_fsm:sync_send_event(To, Msg, Timeout).
format_status(_, [_, State]) -> Data = lager:pr(State, ?MODULE), [{data, [{"StateData", Data}]}].

handle_event(stop, _, State) ->
    {stop, normal, State};
handle_event(_Event, _StateName, State) ->
    {stop, {error, badmsg}, State}.

handle_sync_event(get_leader, _, StateName, State=#state{leader=Leader}) ->
    {reply, Leader, StateName, State};
handle_sync_event(_Event, _From, _StateName, State) ->
    {stop, badmsg, State}.

%% A pending read request timed out: notify the client and drop the request
%% from the per-clock orddict.
handle_info({client_read_timeout, Clock, Id}, StateName,
            #state{read_reqs=Reqs}=State) ->
    ClientRequests = orddict:fetch(Clock, Reqs),
    {ok, ClientReq} = find_client_req(Id, ClientRequests),
    send_client_timeout_reply(ClientReq),
    NewClientRequests = delete_client_req(Id, ClientRequests),
    NewReqs = orddict:store(Clock, NewClientRequests, Reqs),
    {next_state, StateName, State#state{read_reqs=NewReqs}};

%% A pending write request timed out; it may already have been answered.
handle_info({client_timeout, Id}, StateName, #state{client_reqs=Reqs}=State) ->
    case find_client_req(Id, Reqs) of
        {ok, ClientReq} ->
            send_client_timeout_reply(ClientReq),
            {next_state, StateName,
             State#state{client_reqs=delete_client_req(Id, Reqs)}};
        not_found ->
            {next_state, StateName, State}
    end;
handle_info(_, _, State) ->
    {stop, badmsg, State}.

terminate(_, _, _) ->
    ok.

code_change(_OldVsn, StateName, State, _Extra) ->
    {ok, StateName, State}.

%%=============================================================================
%% States
%%
%% Note: All RPC's and client requests get answered in State/3 functions.
%% RPC Responses get handled in State/2 functions.
%%=============================================================================

%% Election timeout fired: become candidate, but only if this peer has a vote
%% in the current configuration.
follower(timeout, #state{config=Config, me=Me}=State0) ->
    io:format("RAFTER FOLLOWER timeout~n",[]),
    case cr_config:has_vote(Me, Config) of
        false ->
            Restarted = reset_timer(election_timeout(), State0),
            {next_state, follower, Restarted#state{leader=undefined}};
        true ->
            {next_state, candidate, become_candidate(State0)}
    end;

%% Stale vote / append replies are simply dropped in follower state.
follower(#vote{}, State) ->
    io:format("RAFTER FOLLOWER #vote~n",[]),
    {next_state, follower, State};
follower(#append_entries_rpy{}, State) ->
    {next_state, follower, State}.

%% A candidate asked for our vote.
follower(#request_vote{}=RequestVote, _From, State) ->
    io:format("RAFTER FOLLOWER #req_vote~n",[]),
    handle_request_vote(RequestVote, State);

%% Reject append_entries from a leader with an older term.
follower(#append_entries{term=Term}, _From,
         #state{term=CurrentTerm, me=Me}=State) when CurrentTerm > Term ->
    Rpy = #append_entries_rpy{from=Me, term=CurrentTerm, success=false},
    io:format("RAFTER FOLLOWER #append Me: ~p success: false~n",[Me]),
    {reply, Rpy, follower, State};

%% Valid leader append: run the consistency check and, on success, append the
%% entries and advance our commit index.
follower(#append_entries{term=Term, from=From, prev_log_index=PrevLogIndex,
                         entries=Entries, commit_index=CommitIndex,
                         send_clock=Clock}=AppendEntries,
         _From, #state{me=Me}=State) ->
    State2 = set_term(Term, State),
    Rpy = #append_entries_rpy{send_clock=Clock,
                              term=Term,
                              success=false,
                              from=Me},
    %% Always reset the election timer here, since the leader is valid,
    %% but may have conflicting data to sync
    State3 = reset_timer(election_timeout(), State2),
    case consistency_check(AppendEntries, State3) of
        false ->
            {reply, Rpy, follower, State3};
        true ->
            {ok, CurrentIndex} = cr_log:check_and_append(Me,Entries, PrevLogIndex+1),
            Config = cr_log:get_config(Me),
            NewRpy = Rpy#append_entries_rpy{success=true, index=CurrentIndex},
            State4 = commit_entries(CommitIndex, State3),
            State5 = State4#state{leader=From, config=Config},
            {reply, NewRpy, follower, State5}
    end;

%% Client requests while following: error if no leader is known, otherwise
%% redirect the caller to the current leader.
follower({set_config, _}, _From, #state{leader=undefined, me=Me, config=C}=State) ->
    io:format("RAFTER FOLLOWER set_config ~p~n",[Me]),
    {reply, {error, no_leader_error(Me, C)}, follower, State};

follower({set_config, _}, _From, #state{leader=Leader}=State) ->
    io:format("RAFTER FOLLOWER set_config ~p~n",[Leader]),
    {reply, {error, {redirect, Leader}}, follower, State};

follower({read_op, _}, _From, #state{me=Me, config=Config, leader=undefined}=State) ->
    io:format("RAFTER FOLLOWER read_op ~p~n",[Me]),
    {reply, {error, no_leader_error(Me, Config)}, follower, State};

follower({read_op, _}, _From, #state{leader=Leader}=State) ->
    io:format("RAFTER FOLLOWER read_op~n",[]),
    {reply, {error, {redirect, Leader}}, follower, State};

follower({op, _Command}, _From, #state{me=Me, config=Config, leader=undefined}=State) ->
    io:format("RAFTER FOLLOWER read_op~n",[]),
    {reply, {error, no_leader_error(Me, Config)}, follower, State};

follower({op, _Command}, _From, #state{leader=Leader}=State) ->
    io:format("RAFTER FOLLOWER read_op~n",[]),
    {reply, {error, {redirect, Leader}}, follower, State}.

%% This is the initial election to set the initial config. We did not
%% get a quorum for our votes, so just reply to the user here and keep trying
%% until the other nodes come up.
%% Initial-config election timed out with the client still waiting: reply with
%% an error and keep campaigning without a client attached.
candidate(timeout, #state{term=1, init_config=[_Id, From]}=S) ->
    io:format("RAFTER CANDIDATE timeout ~n",[]),
    State0 = reset_timer(election_timeout(), S),
    gen_fsm:reply(From, {error, peers_not_responding}),
    {next_state, candidate, State0#state{init_config=no_client}};

%% Election timeout elapsed: start a fresh election round.
candidate(timeout, State) ->
    io:format("RAFTER CANDIDATE timeout~n",[]),
    {next_state, candidate, become_candidate(State)};

%% This should only happen if two machines are configured differently during
%% initial configuration such that one configuration includes both proposed leaders
%% and the other only itself. Additionally, there is not a quorum of either
%% configuration's servers running.
%%
%% (i.e. rafter:set_config(b, [k, b, j]), rafter:set_config(d, [i,k,b,d,o]).
%% when only b and d are running.)
%%
%% Thank you EQC for finding this one :)
candidate(#vote{term=VoteTerm, success=false},
          #state{term=Term, init_config=[_Id, From]}=State)
        when VoteTerm > Term ->
    io:format("RAFTER CANDIDATE #vote~n",[]),
    gen_fsm:reply(From, {error, invalid_initial_config}),
    Cleared = State#state{init_config=undefined, config=#config{state=blank}},
    {next_state, follower, step_down(VoteTerm, Cleared)};

%% Our term is stale; fall back to follower.
candidate(#vote{term=VoteTerm, success=false}, #state{term=Term}=State)
        when VoteTerm > Term ->
    io:format("RAFTER CANDIDATE #vote~n",[]),
    {next_state, follower, step_down(VoteTerm, State)};

%% Stale vote from an earlier request — ignore.
candidate(#vote{term=VoteTerm}, #state{term=CurrentTerm}=State)
        when VoteTerm < CurrentTerm ->
    io:format("RAFTER CANDIDATE #vote~n",[]),
    {next_state, candidate, State};

%% Record a refusal.
candidate(#vote{success=false, from=From}, #state{responses=Responses}=State) ->
    NewResponses = dict:store(From, false, Responses),
    io:format("RAFTER CANDIDATE #vote~n",[]),
    {next_state, candidate, State#state{responses=NewResponses}};

%% Record a granted vote and check whether we now hold a quorum.
candidate(#vote{success=true, from=From}, #state{responses=Responses, me=Me,
                                                 config=Config}=State) ->
    io:format("RAFTER CANDIDATE #vote ~p~n",[Config]),
    NewResponses = dict:store(From, true, Responses),
    case cr_config:quorum(Me, Config, NewResponses) of
        true ->
            {next_state, leader, become_leader(State)};
        false ->
            {next_state, candidate, State#state{responses=NewResponses}}
    end.

%% Reject config changes during an election.
%% NOTE(review): this clause replies then names `follower` as the next state
%% while we are mid-election — confirm whether `candidate` was intended.
candidate({set_config, _}, _From, State) ->
    io:format("RAFTER CANDIDATE set_config~n",[]),
    {reply, {error, election_in_progress}, follower, State};

%% A Peer is simultaneously trying to become the leader
%% If it has a higher term, step down and become follower.
candidate(#request_vote{term=RequestTerm}=RequestVote, _From,
          #state{term=Term}=State) when RequestTerm > Term ->
    NewState = step_down(RequestTerm, State),
    io:format("RAFTER CANDIDATE #req_vote~n",[]),
    handle_request_vote(RequestVote, NewState);
candidate(#request_vote{}, _From, #state{term=CurrentTerm, me=Me}=State) ->
    Vote = #vote{term=CurrentTerm, success=false, from=Me},
    io:format("RAFTER CANDIDATE #req_vote~n",[]),
    {reply, Vote, candidate, State};

%% Another peer is asserting itself as leader, and it must be correct because
%% it was elected. We are still in initial config, which must have been a
%% misconfiguration. Clear the initial configuration and step down. Since we
%% still have an outstanding client request for inital config send an error
%% response.
candidate(#append_entries{term=RequestTerm}, _From,
          #state{init_config=[_, Client]}=State) ->
    io:format("RAFTER CANDIDATE #append~n"),
    gen_fsm:reply(Client, {error, invalid_initial_config}),
    %% Set to complete, we don't want another misconfiguration
    State2 = State#state{init_config=complete, config=#config{state=blank}},
    {next_state, follower, step_down(RequestTerm, State2)};

%% Same as the above clause, but we don't need to send an error response.
candidate(#append_entries{term=RequestTerm}, _From,
          #state{init_config=no_client}=State) ->
    %% Set to complete, we don't want another misconfiguration
    io:format("RAFTER CANDIDATE #append~n"),
    State2 = State#state{init_config=complete, config=#config{state=blank}},
    {next_state, follower, step_down(RequestTerm, State2)};

%% Another peer is asserting itself as leader. If it has a current term
%% step down and become follower. Otherwise do nothing.
candidate(#append_entries{term=RequestTerm}, _From, #state{term=CurrentTerm}=State)
        when RequestTerm >= CurrentTerm ->
    io:format("RAFTER CANDIDATE #append~n"),
    {next_state, follower, step_down(RequestTerm, State)};
candidate(#append_entries{}, _From, State) ->
    io:format("RAFTER CANDIDATE #append~n"),
    {next_state, candidate, State};

%% NOTE(review): unreachable — the {set_config, _} clause above matches first.
candidate({set_config, {NewServer, AddRemove}}, From, #state{me=Me, followers=F, term=Term, config=C}=State) ->
    % change_config(NewServer, AddRemove, From, Me, F, Term, C, State, candidate);
    {reply, {error, election_in_progress}, candidate, State};

%% We are in the middle of an election.
%% Leader should always be undefined here.
candidate({read_op, _}, _, #state{leader=undefined}=State) ->
    io:format("RAFTER CANDIDATE read_op~n"),
    {reply, {error, election_in_progress}, candidate, State};
candidate({op, _Command}, _From, #state{leader=undefined}=State) ->
    io:format("RAFTER CANDIDATE op~n"),
    {reply, {error, election_in_progress}, candidate, State}.

%% Newly elected without a waiting client: append the initial config entry and
%% mark initial configuration complete.
leader(timeout, #state{term=Term,
                       init_config=no_client,
                       config=C}=S) ->
    io:format("RAFTER LEADER timeout ~p~n",[no_client]),
    Entry = #rafter_entry{type=config, term=Term, cmd=C},
    State0 = append(Entry, S),
    State = reset_timer(heartbeat_timeout(), State0),
    NewState = State#state{init_config=complete},
    {next_state, leader, NewState};

%% We have just been elected leader because of an initial configuration.
%% Append the initial config and set init_config=complete.
333 | leader(timeout, #state{term=Term, init_config=[Id, From], config=C}=S) -> 334 | io:format("RAFTER LEADER timeout ~p~n",[{Id,From}]), 335 | State0 = reset_timer(heartbeat_timeout(), S), 336 | Entry = #rafter_entry{type=config, term=Term, cmd=C}, 337 | State = append(Id, From, Entry, State0, leader), 338 | NewState = State#state{init_config=complete}, 339 | {next_state, leader, NewState}; 340 | 341 | leader(timeout, State0) -> 342 | State = reset_timer(heartbeat_timeout(), State0), 343 | NewState = send_append_entries(State), 344 | {next_state, leader, NewState}; 345 | 346 | %% We are out of date. Go back to follower state. 347 | leader(#append_entries_rpy{term=Term, success=false}, 348 | #state{term=CurrentTerm}=State) when Term > CurrentTerm -> 349 | NewState = step_down(Term, State), 350 | {next_state, follower, NewState}; 351 | 352 | %% This is a stale reply from an old request. Ignore it. 353 | leader(#append_entries_rpy{term=Term, success=true}, 354 | #state{term=CurrentTerm}=State) when CurrentTerm > Term -> 355 | {next_state, leader, State}; 356 | 357 | %% The follower is not synced yet. Try the previous entry 358 | leader(#append_entries_rpy{from=From, success=false}, 359 | #state{followers=Followers, config=C, me=Me}=State) -> 360 | case lists:member(From, cr_config:followers(Me, C)) of 361 | true -> 362 | NextIndex = decrement_follower_index(From, Followers), 363 | NewFollowers = dict:store(From, NextIndex, Followers), 364 | NewState = State#state{followers=NewFollowers}, 365 | {next_state, leader, NewState}; 366 | false -> 367 | %% This is a reply from a previous configuration. Ignore it. 368 | {next_state, leader, State} 369 | end; 370 | 371 | %% Success! 
372 | leader(#append_entries_rpy{from=From, success=true}=Rpy, 373 | #state{followers=Followers, config=C, me=Me}=State) -> 374 | case lists:member(From, cr_config:followers(Me, C)) of 375 | true -> 376 | NewState = save_rpy(Rpy, State), 377 | State2 = maybe_commit(NewState), 378 | State3 = maybe_send_read_replies(State2), 379 | case State3#state.leader of 380 | undefined -> 381 | %% We just committed a config that doesn't include ourselves 382 | {next_state, follower, State3}; 383 | _ -> 384 | State4 = 385 | maybe_increment_follower_index(From, Followers, State3), 386 | {next_state, leader, State4} 387 | end; 388 | false -> 389 | %% This is a reply from a previous configuration. Ignore it. 390 | {next_state, leader, State} 391 | end; 392 | 393 | %% Ignore stale votes. 394 | leader(#vote{}, State) -> 395 | io:format("RAFTER LEADER #vote~n"), 396 | {next_state, leader, State}. 397 | 398 | %% An out of date leader is sending append_entries, tell it to step down. 399 | leader(#append_entries{term=Term}, _From, #state{term=CurrentTerm, me=Me}=State) 400 | when Term < CurrentTerm -> 401 | Rpy = #append_entries_rpy{from=Me, term=CurrentTerm, success=false}, 402 | io:format("RAFTER LEADER #append~n"), 403 | {reply, Rpy, leader, State}; 404 | 405 | %% We are out of date. Step down 406 | leader(#append_entries{term=Term}, _From, #state{term=CurrentTerm}=State) 407 | when Term > CurrentTerm -> 408 | NewState = step_down(Term, State), 409 | io:format("RAFTER LEADER #append~n"), 410 | {next_state, follower, NewState}; 411 | 412 | %% We are out of date. Step down 413 | leader(#request_vote{term=Term}, _From, #state{term=CurrentTerm}=State) 414 | when Term > CurrentTerm -> 415 | NewState = step_down(Term, State), 416 | io:format("RAFTER LEADER #req_vote~n"), 417 | {next_state, follower, NewState}; 418 | 419 | %% An out of date candidate is trying to steal our leadership role. Stop it. 
420 | leader(#request_vote{}, _From, #state{me=Me, term=CurrentTerm}=State) -> 421 | Rpy = #vote{from=Me, term=CurrentTerm, success=false}, 422 | io:format("RAFTER LEADER #req_vote~n"), 423 | {reply, Rpy, leader, State}; 424 | 425 | leader({set_config, {NewServer, AddRemove}}, From, #state{me=Me, followers=F, term=Term, config=C}=State) -> 426 | change_config(NewServer, AddRemove, From, Me, F, Term, C, State, leader); 427 | 428 | %% Handle client requests 429 | leader({read_op, {Id, Command}}, From, State) -> 430 | NewState = setup_read_request(Id, From, Command, State), 431 | io:format("RAFTER LEADER read_op~n"), 432 | {next_state, leader, NewState}; 433 | 434 | leader({op, {Id, Command}}, From, 435 | #state{term=Term}=State) -> 436 | Entry = #rafter_entry{type=op, term=Term, cmd=Command}, 437 | NewState = append(Id, From, Entry, State, leader), 438 | io:format("RAFTER LEADER op~n"), 439 | {next_state, leader, NewState}. 440 | 441 | %%============================================================================= 442 | %% Internal Functions 443 | %%============================================================================= 444 | 445 | change_config(NewServer, AddRemove, From, Me, F, Term, C, State, FSMState) -> 446 | Id = os:timestamp(), 447 | #config{newservers=PreviousConfiguration} = C, 448 | WithoutNew = lists:delete(NewServer, sets:to_list(sets:from_list(PreviousConfiguration))), 449 | NewServers = case AddRemove of 450 | add -> [NewServer|WithoutNew]; 451 | remove -> WithoutNew end, 452 | io:format("RAFTER LEADER set_config~n~p~n~p~n",[C,NewServers]), 453 | case cr_config:allow_config(C, NewServers) of 454 | true -> 455 | {Followers, Config} = reconfig(Me, F, C, NewServers, State), 456 | Entry = #rafter_entry{type=config, term=Term, cmd=Config}, 457 | NewState0 = State#state{followers=Followers}, 458 | NewState = append(Id, From, Entry, NewState0, leader), 459 | io:format("RAFTER new config: ~p~n",[Config]), 460 | {next_state, FSMState, NewState}; 461 | 
Error -> 462 | io:format("set_config error: ~p~n",[Error]), 463 | {reply, Error, FSMState, State} end. 464 | 465 | no_leader_error(Me, Config) -> 466 | case cr_config:has_vote(Me, Config) of 467 | false -> 468 | not_consensus_group_member; 469 | true -> 470 | election_in_progress 471 | end. 472 | 473 | reconfig(Me, OldFollowers, Config0, NewServers, State) -> 474 | Config = cr_config:reconfig(Config0, NewServers), 475 | NewFollowers = cr_config:followers(Me, Config), 476 | OldSet = sets:from_list([K || {K, _} <- dict:to_list(OldFollowers)]), 477 | NewSet = sets:from_list(NewFollowers), 478 | AddedServers = sets:to_list(sets:subtract(NewSet, OldSet)), 479 | RemovedServers = sets:to_list(sets:subtract(OldSet, NewSet)), 480 | Followers0 = add_followers(AddedServers, OldFollowers, State), 481 | Followers = remove_followers(RemovedServers, Followers0), 482 | {Followers, Config}. 483 | 484 | add_followers(NewServers, Followers, #state{me=Me}) -> 485 | NextIndex = cr_log:get_last_index(Me) + 1, 486 | NewFollowers = [{S, NextIndex} || S <- NewServers], 487 | dict:from_list(NewFollowers ++ dict:to_list(Followers)). 488 | 489 | remove_followers(Servers, Followers0) -> 490 | lists:foldl(fun(S, Followers) -> 491 | dict:erase(S, Followers) 492 | end, Followers0, Servers). 493 | 494 | append(Entry, #state{me=Me}=State) -> 495 | io:format("RAFTER APPEND Me: ~p Entry ~p~n",[Me,Entry]), 496 | {ok, _Index} = cr_log:append(Me, [Entry]), 497 | send_append_entries(State). 498 | 499 | append(Id, From, Entry, State, leader) -> 500 | NewState = append(Id, From, Entry, State), 501 | send_append_entries(NewState). 
502 | 503 | append(Id, From, Entry, 504 | #state{me=Me, term=Term, client_reqs=Reqs}=State) -> 505 | io:format("RAFTER APPEND Me: ~p Entry ~p~n",[Me,Entry]), 506 | {ok, Index} = cr_log:append(Me, [Entry]), 507 | {ok, Timer} = timer:send_after(?CLIENT_TIMEOUT, Me, {client_timeout, Id}), 508 | ClientRequest = #client_req{id=Id, 509 | from=From, 510 | index=Index, 511 | term=Term, 512 | timer=Timer}, 513 | State#state{client_reqs=[ClientRequest | Reqs]}. 514 | 515 | setup_read_request(Id, From, Command, #state{send_clock=Clock, 516 | me=Me, 517 | term=Term}=State) -> 518 | {ok, Timer} = timer:send_after(?CLIENT_TIMEOUT, Me, {client_read_timeout, Clock, Id}), 519 | ReadRequest = #client_req{id=Id, 520 | from=From, 521 | term=Term, 522 | cmd=Command, 523 | timer=Timer}, 524 | NewState = save_read_request(ReadRequest, State), 525 | send_append_entries(NewState). 526 | 527 | save_read_request(ReadRequest, #state{send_clock=Clock, 528 | read_reqs=Requests}=State) -> 529 | NewRequests = 530 | case orddict:find(Clock, Requests) of 531 | {ok, ReadRequests} -> 532 | orddict:store(Clock, [ReadRequest | ReadRequests], Requests); 533 | error -> 534 | orddict:store(Clock, [ReadRequest], Requests) 535 | end, 536 | State#state{read_reqs=NewRequests}. 537 | 538 | send_client_timeout_reply(#client_req{from=From}) -> 539 | gen_fsm:reply(From, {error, timeout}). 540 | 541 | send_client_reply(#client_req{timer=Timer, from=From}, Result) -> 542 | {ok, cancel} = timer:cancel(Timer), 543 | gen_fsm:reply(From, Result). 544 | 545 | find_client_req(Id, ClientRequests) -> 546 | Result = lists:filter(fun(Req) -> 547 | Req#client_req.id =:= Id 548 | end, ClientRequests), 549 | case Result of 550 | [Request] -> 551 | {ok, Request}; 552 | [] -> 553 | not_found 554 | end. 555 | 556 | delete_client_req(Id, ClientRequests) -> 557 | lists:filter(fun(Req) -> 558 | Req#client_req.id =/= Id 559 | end, ClientRequests). 
560 | 561 | find_client_req_by_index(Index, ClientRequests) -> 562 | Result = lists:filter(fun(Req) -> 563 | Req#client_req.index =:= Index 564 | end, ClientRequests), 565 | case Result of 566 | [Request] -> 567 | {ok, Request}; 568 | [] -> 569 | not_found 570 | end. 571 | 572 | delete_client_req_by_index(Index, ClientRequests) -> 573 | lists:filter(fun(Req) -> 574 | Req#client_req.index =/= Index 575 | end, ClientRequests). 576 | 577 | %% @doc Commit entries between the previous commit index and the new one. 578 | %% Apply them to the local state machine and respond to any outstanding 579 | %% client requests that these commits affect. Return the new state. 580 | %% Ignore already committed entries. 581 | commit_entries(NewCommitIndex, #state{commit_index=CommitIndex}=State) 582 | when CommitIndex >= NewCommitIndex -> State; 583 | commit_entries(NewCommitIndex, #state{commit_index=CommitIndex, 584 | state_machine=StateMachine, 585 | backend_state=BackendState, 586 | me=Me}=State) -> 587 | LastIndex = min(cr_log:get_last_index(Me), NewCommitIndex), 588 | lists:foldl(fun(Index, #state{client_reqs=CliReqs}=State1) -> 589 | NewState = State1#state{commit_index=Index}, 590 | case cr_log:get_entry(Me, Index) of 591 | 592 | %% Noop - Ignore this request 593 | {ok, #rafter_entry{type=noop}} -> 594 | NewState; 595 | 596 | %% Normal Operation. Apply Command to StateMachine. 597 | {ok, #rafter_entry{type=op, cmd=Command}} -> 598 | {Result, NewBackendState} = 599 | StateMachine:write(Command, BackendState), 600 | NewState2 = NewState#state{backend_state=NewBackendState}, 601 | maybe_send_client_reply(Index, CliReqs, NewState2, Result); 602 | 603 | %% We have a committed transitional state, so reply 604 | %% successfully to the client. Then set the new stable 605 | %% configuration. 
606 | {ok, #rafter_entry{type=config, 607 | cmd=#config{state=transitional}=C}} -> 608 | S = stabilize_config(C, NewState), 609 | Reply = {ok, S#state.config}, 610 | maybe_send_client_reply(Index, CliReqs, S, Reply); 611 | 612 | %% The configuration has already been set. Initial configuration goes 613 | %% directly to stable state so needs to send a reply. Checking for 614 | %% a client request is expensive, but config changes happen 615 | %% infrequently. 616 | {ok, #rafter_entry{type=config, 617 | cmd=#config{state=stable}}} -> 618 | Reply = {ok, NewState#state.config}, 619 | maybe_send_client_reply(Index, CliReqs, NewState, Reply) 620 | end 621 | end, State, lists:seq(CommitIndex+1, LastIndex)). 622 | 623 | stabilize_config(#config{state=transitional, newservers=New}=C, 624 | #state{me=Me, term=Term}=S) when S#state.leader =:= S#state.me -> 625 | Config = C#config{state=stable, oldservers=New, newservers=[]}, 626 | Entry = #rafter_entry{type=config, term=Term, cmd=Config}, 627 | State = S#state{config=Config}, 628 | {ok, _Index} = cr_log:append(Me, [Entry]), 629 | send_append_entries(State); 630 | stabilize_config(_, State) -> 631 | State. 632 | 633 | maybe_send_client_reply(Index, CliReqs, S, Result) when S#state.leader =:= S#state.me -> 634 | case find_client_req_by_index(Index, CliReqs) of 635 | {ok, Req} -> 636 | send_client_reply(Req, Result), 637 | Reqs = delete_client_req_by_index(Index, CliReqs), 638 | S#state{client_reqs=Reqs}; 639 | not_found -> 640 | S 641 | end; 642 | maybe_send_client_reply(_, _, State, _) -> 643 | State. 644 | 645 | maybe_send_read_replies(#state{me=Me, 646 | config=Config, 647 | send_clock_responses=Responses}=State0) -> 648 | Clock = cr_config:quorum_max(Me, Config, Responses), 649 | {ok, Requests, State} = find_eligible_read_requests(Clock, State0), 650 | NewState = send_client_read_replies(Requests, State), 651 | NewState. 652 | 653 | eligible_request(SendClock) -> 654 | fun({Clock, _}) -> 655 | SendClock > Clock 656 | end. 
657 | 658 | find_eligible_read_requests(SendClock, #state{read_reqs=Requests}=State) -> 659 | EligibleReq = eligible_request(SendClock), 660 | Eligible = lists:takewhile(EligibleReq, Requests), 661 | NewRequests = lists:dropwhile(EligibleReq, Requests), 662 | NewState = State#state{read_reqs=NewRequests}, 663 | {ok, Eligible, NewState}. 664 | 665 | send_client_read_replies([], State) -> 666 | State; 667 | send_client_read_replies(Requests, State=#state{state_machine=StateMachine, 668 | backend_state=BackendState}) -> 669 | NewBackendState = 670 | lists:foldl(fun({_Clock, ClientReqs}, BeState) -> 671 | read_and_send(ClientReqs, StateMachine, BeState) 672 | end, BackendState, Requests), 673 | State#state{backend_state=NewBackendState}. 674 | 675 | read_and_send(ClientRequests, StateMachine, BackendState) -> 676 | lists:foldl(fun(Req, Acc) -> 677 | {Val, NewAcc} = 678 | StateMachine:read(Req#client_req.cmd, Acc), 679 | send_client_reply(Req, Val), 680 | NewAcc 681 | end, BackendState, ClientRequests). 682 | 683 | maybe_commit(#state{me=Me, 684 | commit_index=CommitIndex, 685 | config=Config, 686 | responses=Responses}=State) -> 687 | Min = cr_config:quorum_max(Me, Config, Responses), 688 | case Min > CommitIndex andalso safe_to_commit(Min, State) of 689 | true -> 690 | NewState = commit_entries(Min, State), 691 | case cr_config:has_vote(Me, NewState#state.config) of 692 | true -> 693 | NewState; 694 | false -> 695 | %% We just committed a config that doesn't include ourself 696 | step_down(NewState#state.term, NewState) 697 | end; 698 | false -> 699 | State 700 | end. 701 | 702 | safe_to_commit(Index, #state{term=CurrentTerm, me=Me}) -> 703 | CurrentTerm =:= cr_log:get_term(Me, Index). 704 | 705 | %% We are about to transition to the follower state. Reset the necessary state. 
706 | %% TODO: send errors to any outstanding client read or write requests and cleanup 707 | %% timers 708 | step_down(NewTerm, State0) -> 709 | State = reset_timer(election_timeout(), State0), 710 | NewState = State#state{term=NewTerm, 711 | responses=dict:new(), 712 | leader=undefined}, 713 | set_metadata(undefined, NewState). 714 | 715 | save_rpy(#append_entries_rpy{from=From, index=Index, send_clock=Clock}, 716 | #state{responses=Responses, send_clock_responses=ClockResponses}=State) -> 717 | NewResponses = save_greater(From, Index, Responses), 718 | NewClockResponses = save_greater(From, Clock, ClockResponses), 719 | State#state{responses=NewResponses, send_clock_responses=NewClockResponses}. 720 | 721 | save_greater(Key, Val, Dict) -> save_greater(Key, Val, Dict, dict:find(Key, Dict)). 722 | save_greater(_Key, Val, Dict, {ok, CurrentVal}) when CurrentVal > Val -> Dict; 723 | save_greater(_Key, CurrentVal, Dict, {ok, CurrentVal}) -> Dict; 724 | save_greater(Key, Val, Dict, {ok, _}) -> dict:store(Key, Val, Dict); 725 | save_greater(Key, Val, Dict, error) -> dict:store(Key, Val, Dict). 726 | 727 | handle_request_vote(#request_vote{from=CandidateId, term=Term}=RequestVote, 728 | State) -> 729 | State2 = set_term(Term, State), 730 | {ok, Vote} = vote(RequestVote, State2), 731 | case Vote#vote.success of 732 | true -> 733 | State3 = set_metadata(CandidateId, State2), 734 | State4 = reset_timer(election_timeout(), State3), 735 | {reply, Vote, follower, State4}; 736 | false -> 737 | {reply, Vote, follower, State2} 738 | end. 739 | 740 | set_metadata(CandidateId, State=#state{me=Me, term=Term}) -> 741 | NewState = State#state{voted_for=CandidateId}, 742 | ok = cr_log:set_metadata(Me, CandidateId, Term), 743 | NewState. 
744 | 745 | maybe_increment_follower_index(From, Followers, State=#state{me=Me}) -> 746 | LastLogIndex = cr_log:get_last_index(Me), 747 | {ok, Index} = dict:find(From, Followers), 748 | case Index =< LastLogIndex of 749 | true -> 750 | State#state{followers=dict:store(From, Index+1, Followers)}; 751 | false -> 752 | State 753 | end. 754 | 755 | get_prev(Me, Index) -> 756 | case Index - 1 of 757 | 0 -> 758 | {0, 0}; 759 | PrevIndex -> 760 | {PrevIndex, 761 | cr_log:get_term(Me, PrevIndex)} 762 | end. 763 | 764 | %% TODO: Return a block of entries if more than one exist 765 | get_entries(Me, Index) -> 766 | case cr_log:get_entry(Me, Index) of 767 | {ok, not_found} -> 768 | []; 769 | {ok, Entry} -> 770 | [Entry] 771 | end. 772 | 773 | send_entry(Peer, Index, #state{me=Me, 774 | term=Term, 775 | send_clock=Clock, 776 | commit_index=CIdx}) -> 777 | {PrevLogIndex, PrevLogTerm} = get_prev(Me, Index), 778 | Entries = get_entries(Me, Index), 779 | AppendEntries = #append_entries{term=Term, 780 | from=Me, 781 | prev_log_index=PrevLogIndex, 782 | prev_log_term=PrevLogTerm, 783 | entries=Entries, 784 | commit_index=CIdx, 785 | send_clock=Clock}, 786 | rsend(Peer, AppendEntries). 787 | 788 | send_append_entries(#state{followers=Followers, send_clock=SendClock}=State) -> 789 | NewState = State#state{send_clock=SendClock+1}, 790 | _ = [send_entry(Peer, Index, NewState) || 791 | {Peer, Index} <- dict:to_list(Followers)], 792 | NewState. 793 | 794 | decrement_follower_index(From, Followers) -> 795 | case dict:find(From, Followers) of 796 | {ok, 1} -> 797 | 1; 798 | {ok, Num} -> 799 | Num - 1 800 | end. 801 | 802 | %% @doc Start a process to send a syncrhonous rpc to each peer. Votes will be sent 803 | %% back as messages when the process receives them from the peer. If 804 | %% there is an error or a timeout no message is sent. 
This helps preserve 805 | %% the asynchrnony of the consensus fsm, while maintaining the rpc 806 | %% semantics for the request_vote message as described in the raft paper. 807 | request_votes(#state{config=Config, term=Term, me=Me}) -> 808 | Voters = cr_config:voters(Me, Config), 809 | Msg = #request_vote{term=Term, 810 | from=Me, 811 | last_log_index=cr_log:get_last_index(Me), 812 | last_log_term=cr_log:get_last_term(Me)}, 813 | [rsend(Peer, Msg) || Peer <- Voters]. 814 | 815 | become_candidate(#state{term=CurrentTerm, me=Me}=State0) -> 816 | State = reset_timer(election_timeout(), State0), 817 | State2 = State#state{term=CurrentTerm + 1, 818 | responses=dict:new(), 819 | leader=undefined}, 820 | State3 = set_metadata(Me, State2), 821 | _ = request_votes(State3), 822 | State3. 823 | 824 | become_leader(#state{me=Me, term=Term, init_config=InitConfig}=State) -> 825 | NewState = State#state{leader=Me, 826 | responses=dict:new(), 827 | followers=initialize_followers(State), 828 | send_clock = 0, 829 | send_clock_responses = dict:new(), 830 | read_reqs = orddict:new()}, 831 | 832 | case InitConfig of 833 | complete -> 834 | %% Commit a noop entry to the log so we can move the commit index 835 | Entry = #rafter_entry{type=noop, term=Term, cmd=noop}, 836 | append(Entry, NewState); 837 | _ -> 838 | %% First entry must always be a config entry 839 | NewState 840 | end. 841 | 842 | 843 | initialize_followers(#state{me=Me, config=Config}) -> 844 | Peers = cr_config:followers(Me, Config), 845 | NextIndex = cr_log:get_last_index(Me) + 1, 846 | Followers = [{Peer, NextIndex} || Peer <- Peers], 847 | dict:from_list(Followers). 848 | 849 | %% There is no entry at t=0, so just return true. 
850 | consistency_check(#append_entries{prev_log_index=0, 851 | prev_log_term=0}, _State) -> 852 | true; 853 | consistency_check(#append_entries{prev_log_index=Index, 854 | prev_log_term=Term}, #state{me=Me}) -> 855 | case cr_log:get_entry(Me, Index) of 856 | {ok, not_found} -> 857 | false; 858 | {ok, #rafter_entry{term=Term}} -> 859 | true; 860 | {ok, #rafter_entry{term=_DifferentTerm}} -> 861 | false 862 | end. 863 | 864 | set_term(Term, #state{term=CurrentTerm}=State) when Term < CurrentTerm -> State; 865 | set_term(Term, #state{term=CurrentTerm}=State) when Term > CurrentTerm -> set_metadata(undefined, State#state{term=Term}); 866 | set_term(Term, #state{term=Term}=State) -> State. 867 | 868 | vote(#request_vote{term=Term}, #state{term=CurrentTerm, me=Me}) 869 | when Term < CurrentTerm -> 870 | fail_vote(CurrentTerm, Me); 871 | vote(#request_vote{from=CandidateId, term=CurrentTerm}=RequestVote, 872 | #state{voted_for=CandidateId, term=CurrentTerm, me=Me}=State) -> 873 | maybe_successful_vote(RequestVote, CurrentTerm, Me, State); 874 | vote(#request_vote{term=CurrentTerm}=RequestVote, 875 | #state{voted_for=undefined, term=CurrentTerm, me=Me}=State) -> 876 | maybe_successful_vote(RequestVote, CurrentTerm, Me, State); 877 | vote(#request_vote{from=CandidateId, term=CurrentTerm}, 878 | #state{voted_for=AnotherId, term=CurrentTerm, me=Me}) 879 | when AnotherId =/= CandidateId -> 880 | fail_vote(CurrentTerm, Me). 881 | 882 | maybe_successful_vote(RequestVote, CurrentTerm, Me, State) -> 883 | case candidate_log_up_to_date(RequestVote, State) of 884 | true -> 885 | successful_vote(CurrentTerm, Me); 886 | false -> 887 | fail_vote(CurrentTerm, Me) 888 | end. 889 | 890 | candidate_log_up_to_date(#request_vote{last_log_term=CandidateTerm, 891 | last_log_index=CandidateIndex}, 892 | #state{me=Me}) -> 893 | candidate_log_up_to_date(CandidateTerm, 894 | CandidateIndex, 895 | cr_log:get_last_term(Me), 896 | cr_log:get_last_index(Me)). 
897 | 898 | candidate_log_up_to_date(CandidateTerm, _CandidateIndex, LogTerm, _LogIndex) when CandidateTerm > LogTerm -> true; 899 | candidate_log_up_to_date(CandidateTerm, _CandidateIndex, LogTerm, _LogIndex) when CandidateTerm < LogTerm -> false; 900 | candidate_log_up_to_date(Term, CandidateIndex, Term, LogIndex) when CandidateIndex > LogIndex -> true; 901 | candidate_log_up_to_date(Term, CandidateIndex, Term, LogIndex) when CandidateIndex < LogIndex -> false; 902 | candidate_log_up_to_date(Term, Index, Term, Index) -> true. 903 | 904 | successful_vote(CurrentTerm, Me) -> {ok, #vote{term=CurrentTerm, success=true, from=Me}}. 905 | fail_vote(CurrentTerm, Me) -> {ok, #vote{term=CurrentTerm, success=false, from=Me}}. 906 | election_timeout() -> crypto:rand_uniform(?ELECTION_TIMEOUT_MIN, ?ELECTION_TIMEOUT_MAX). 907 | heartbeat_timeout() -> ?HEARTBEAT_TIMEOUT. 908 | 909 | reset_timer(Duration, State=#state{timer=Timer}) -> 910 | _ = gen_fsm:cancel_timer(Timer), 911 | NewTimer = gen_fsm:send_event_after(Duration, timeout), 912 | State#state{timer=NewTimer}. 913 | 914 | rsend(To, #request_vote{from=From}=Msg) -> rsend(To, From, Msg); 915 | rsend(To, #append_entries{from=From}=Msg) -> rsend(To, From, Msg). 916 | rsend(To, From, Msg) -> 917 | spawn(fun() -> 918 | case cr_rafter:send_sync({To,To}, Msg) of 919 | Rpy when is_record(Rpy, vote) orelse 920 | is_record(Rpy, append_entries_rpy) -> 921 | cr_rafter:send(From, Rpy); 922 | E -> 923 | io:format("Error sending ~p to To ~p: ~p", [Msg, To, E]) 924 | end 925 | end). 926 | -------------------------------------------------------------------------------- /src/consensus/cr_replication.erl: -------------------------------------------------------------------------------- 1 | -module(cr_replication). 2 | -description('RAFT protocol replication log backend'). 3 | -behaviour(rafter_backend). 4 | -export([init/1, stop/1, read/2, write/2]). 5 | -record(state, {peer :: atom() | {atom(), atom()}}). 
6 | 7 | % Issue commands only if you want them to be saved in cluster status log. 8 | 9 | init(Peer) -> 10 | State = #state{peer=Peer}, 11 | NewState = stop(State), 12 | _Tid1 = ets:new(rafter, [set, named_table, public]), 13 | _Tid2 = ets:new(rafter_tables, [set, named_table, public]), 14 | io:format("RAFTER BACK INIT ~p~n~p~n",[Peer,{_Tid1,_Tid2}]), 15 | 16 | NewState. 17 | 18 | stop(State) -> 19 | catch ets:delete(rafter), 20 | catch ets:delete(rafter_tables), 21 | State. 22 | 23 | read({get, Table, Key}, State) -> 24 | io:format("CONS GET: ~p~n",[{Table, Key}]), 25 | Val = try 26 | case ets:lookup(Table, Key) of 27 | [{Key, Value}] -> 28 | {ok, Value}; 29 | [] -> 30 | {ok, not_found} 31 | end 32 | catch _:E -> 33 | {error, E} 34 | end, 35 | {Val, State}; 36 | read(list_tables, State) -> 37 | io:format("CONS DIR~n",[]), 38 | {{ok, [Table || {Table} <- ets:tab2list(rafter_tables)]}, 39 | State}; 40 | read({list_keys, Table}, State) -> 41 | io:format("CONS ALL: ~p~n",[{Table}]), 42 | Val = try 43 | list_keys(Table) 44 | catch _:E -> 45 | {error, E} 46 | end, 47 | {Val, State}; 48 | read(_, State) -> 49 | {{error, ets_read_badarg}, State}. 
50 | 51 | write({new, Name}, State) -> 52 | io:format("CONS NEW: ~p~n",[{Name}]), 53 | Val = try 54 | _Tid = ets:new((Name), [ordered_set, named_table, public]), 55 | ets:insert(rafter_tables, {Name}), 56 | {ok, Name} 57 | catch _:E -> 58 | {error, E} 59 | end, 60 | {Val, State}; 61 | 62 | write({put, Table, Key, Value}, State) -> 63 | io:format("CONS PUT: ~p~n",[{Table, Key, Value}]), 64 | Val = try 65 | ets:insert(Table, {Key, Value}), 66 | {ok, Value} 67 | catch _:E -> 68 | {error, E} 69 | end, 70 | {Val, State}; 71 | write({delete, Table}, State) -> 72 | io:format("CONS DELETE: ~p~n",[{Table}]), 73 | Val = 74 | try 75 | ets:delete(Table), 76 | ets:delete(rafter_tables, Table), 77 | {ok, true} 78 | catch _:E -> 79 | {error, E} 80 | end, 81 | {Val, State}; 82 | write({delete, Table, Key}, State) -> 83 | io:format("CONS DELETE: ~p~n",[{Table,Key}]), 84 | Val = try 85 | {ok, ets:delete(Table, Key)} 86 | catch _:E -> 87 | {error, E} 88 | end, 89 | {Val, State}; 90 | write(Data, State) -> 91 | io:format("CONS WRITE: ~p~n",[{Data}]), 92 | {{error, ets_write_badarg}, State}. 93 | 94 | list_keys(Table) -> 95 | list_keys(ets:first(Table), Table, []). 96 | 97 | list_keys('$end_of_table', _Table, Keys) -> 98 | {ok, Keys}; 99 | list_keys(Key, Table, Keys) -> 100 | list_keys(ets:next(Table, Key), Table, [Key | Keys]). 101 | -------------------------------------------------------------------------------- /src/cr.app.src: -------------------------------------------------------------------------------- 1 | {application, cr, 2 | [ 3 | {description, "Chain Replication"}, 4 | {vsn, "0.1"}, 5 | {registered, []}, 6 | {applications, [kernel,stdlib,kvs]}, 7 | {mod, { cr_app, []}} 8 | ]}. 9 | -------------------------------------------------------------------------------- /src/cr.erl: -------------------------------------------------------------------------------- 1 | -module(cr). 2 | -description('Distributed Transaction Coordinator'). 3 | -copyright('Maxim Sokhatsky'). 
4 | -include("cr.hrl"). 5 | -include_lib("db/include/transaction.hrl"). 6 | -include("rafter.hrl"). 7 | -compile(export_all). 8 | -compile({no_auto_import,[node/0]}). 9 | 10 | main(A) -> mad_repl:main(A,[]). 11 | 12 | encode(Msg) -> term_to_binary(Msg). 13 | decode(Bin) -> binary_to_term(Bin). 14 | 15 | set_socket(Pid, Socket) when is_pid(Pid), is_port(Socket) -> gen_fsm:send_event(Pid, {socket_ready, Socket}). 16 | send(Pid, Message) when is_pid(Pid) -> gen_fsm:send_event(Pid, {out, Message}). 17 | 18 | config() -> {ok,Peers} = application:get_env(cr,peers), 19 | N = lists:map(fun({N,_,_,_})->N end,Peers), 20 | #config{state=stable,oldservers=N,newservers=N}. 21 | local(Object) -> {I,N}=lists:keyfind(cr:nodex(cr:node()),2,cr:chain(Object)), 22 | {I,P,_,_}=lists:keyfind(I,1,supervisor:which_children(vnode_sup)), P. 23 | secret() -> application:get_env(cr,secret,<<"ThisIsClassified">>). 24 | peers() -> {ok,Peers}=application:get_env(cr,peers),Peers. 25 | peers(N) -> lists:zip(lists:seq(1,N),lists:seq(1,N)). 26 | hash(Object) -> hd(seq(Object)). 27 | rep(Object) -> roll(element(2,hash(Object))). 28 | roll(N) -> lists:seq(N,length(peers())) ++ lists:seq(1,N-1). 29 | seq(Object) -> lists:keydelete(0,1,cr_hash:succ(cr_hash:key_of(Object),ring())). 30 | peer({I,N}) -> element(1,lists:nth(N,peers())). 31 | nodex(Node) -> string:str(cr:peers(),[lists:keyfind(Node,1,cr:peers())]). 32 | node() -> list_to_atom(lists:concat([os:getenv("NAME"),'@127.0.0.1'])). 33 | vpid({I,Node}) -> {I,P,_,_}=lists:keyfind(I,1,supervisor:which_children({vnode_sup,Node})), P. 34 | ring() -> ring(4). 35 | ring(C) -> {Nodes,[{0,1}|Rest]} = cr_hash:fresh(length(peers())*C,1), 36 | {Nodes,[{0,0}|lists:map(fun({{I,1},X})->{I,(X-1) div C+1} end, 37 | lists:zip(Rest,lists:seq(1,length(Rest))))]}. 38 | 39 | chain(Object) -> 40 | {N,_} = cr:ring(), 41 | lists:map(fun(X) -> lists:nth((X-1)*4+1,cr:seq(Object)) end, 42 | cr:roll(element(2,cr:hash(Object)))). 
43 | 44 | tx(Record) when is_tuple(Record) -> 45 | gen_server:cast(local(Record), 46 | {client,{self(),os:timestamp()}, 47 | chain(element(2,Record)), 48 | Record}). 49 | 50 | stack(Error, Reason) -> 51 | Stacktrace = [case A of 52 | { Module,Function,Arity,Location} -> 53 | { Module,Function,Arity,proplists:get_value(line, Location) }; 54 | Else -> Else end 55 | || A <- erlang:get_stacktrace()], 56 | [Error, Reason, Stacktrace]. 57 | 58 | error_page(Class,Error) -> 59 | io_lib:format("ERROR: ~w:~w~n~n",[Class,Error]) ++ 60 | "STACK: " ++ 61 | [ io_lib:format("\t~w:~w/~w:~w\n", 62 | [ Module,Function,Arity,proplists:get_value(line, Location) ]) 63 | || { Module,Function,Arity,Location} <- erlang:get_stacktrace() ]. 64 | 65 | test() -> test(10). 66 | test(Num) -> 67 | O1 = lists:foldl(fun({_,_,_,A,_,_},Acc) -> A+Acc end,0,kvs:all(log)), 68 | T1 = length(kvs:all(transaction)), 69 | io:format("Already in Database: ~p~n" 70 | "New record will be applied: ~p~n",[O1,Num]), 71 | [cr:tx(#transaction{id=kvs:next_id(transaction,1)})||I<-lists:seq(1,Num)], 72 | O2 = lists:foldl(fun({_,_,_,A,_,_},Acc) -> A+Acc end,0,kvs:all(log)), 73 | {transactions,T2 = length(kvs:all(transaction))}. 74 | 75 | log_size({I,N}) -> 76 | {ok,Log} = kvs:get(log,{I,N}), 77 | {Log#log.top,length(kvs:entries({ok,Log},operation,-1))}. 78 | 79 | dump() -> 80 | {N,Nodes} = cr:ring(), 81 | io:format("~52w ~3w ~2w ~10w ~11w~n",[vnode,i,n,top,latency]), 82 | [ begin 83 | {A,B} = rpc(rpc:call(cr:peer({I,N}),cr,log_size,[{I,N}])), 84 | {Min,Max,Avg} = latency({I,N}), 85 | L = lists:concat([Min,'/',Max,'/',Avg]), 86 | io:format("~52w ~3w ~2w ~10w ~11s~n",[I,P,N,A,L]) 87 | end || {{I,N},P} <- lists:zip(lists:keydelete(0,1,Nodes),lists:seq(1,length(Nodes)-1))], 88 | ok. 89 | 90 | string(O) -> 91 | lists:concat(lists:flatten([lists:map(fun(undefined) -> ''; (X) -> [X,':'] end, tuple_to_list(O))])). 
92 | 93 | dump(N) when N < 13 -> {_,X} = cr:ring(), 94 | Nodes = lists:keydelete(0,1,X), 95 | {I,P} = lists:nth(N,Nodes), 96 | Pos = string:str(Nodes,[{I,P}]), 97 | {ok,C} = rpc:call(cr:peer({I,P}),kvs,get,[log,{I,P}]), 98 | dump_op(Pos,rpc(rpc:call(cr:peer({I,P}),kvs,entries,[C,operation,10]))); 99 | 100 | dump(N) -> {_,X} = cr:ring(), 101 | Nodes = lists:keydelete(0,1,X), 102 | {ok,Oo} = kvs:get(operation,N), 103 | {I,P} = lists:keyfind(element(1,Oo#operation.feed_id),1,Nodes), 104 | Pos = string:str(Nodes,[{I,P}]), 105 | dump_op(Pos,kvs:traversal(operation,Oo#operation.id,10,#iterator.prev)). 106 | 107 | dump_op(Pos,List) -> 108 | io:format("~50s ~10w ~10w ~4w ~10w~n",[operation,id,prev,i,size]), 109 | [ io:format("~50s ~10w ~10w ~4w ~10w~n",[ 110 | string(Tx), 111 | element(2,O), 112 | rpc(element(#iterator.prev,O)), 113 | rpc(Pos), 114 | size(term_to_binary(O))]) 115 | || #operation{name=Name,body={Cmd,_,Chain,Tx}}=O <- List], 116 | ok. 117 | 118 | latency({I,N}) -> gen_server:call(cr:vpid({I,cr:peer({I,N})}),{latency}). 119 | 120 | rpc(undefined) -> []; 121 | rpc({badrpc,_}) -> {error,error}; 122 | rpc(Value) -> Value. 123 | 124 | clean() -> kvs:destroy(), kvs:join(). 125 | 126 | log_modules() -> [cr,cr_log,cr_rafter,cr_heart]. 127 | 128 | sup() -> [{T,Pid}||{T,Pid,_,_}<-supervisor:which_children(cr_sup)]. 129 | heart() -> [{_,P,_,_}]=supervisor:which_children(heart_sup), gen_server:call(P,{heart}). 130 | local() -> [{I,P}||{I,P,_,_} <- supervisor:which_children(vnode_sup)]. 131 | 132 | % Integrity Functions 133 | 134 | % consensus_log checks that the length of RAFT log is the same on all nodes. 135 | % node_log checks that the sum of chains of all vnodes equals the the overal operations counts. 
% operation_log checks that on all nodes all operations logs are ok
% cluster_status checks that all logs on all nodes are ok

%% consensus_log/0: every peer's RAFT last index must match ours.
consensus_log() ->
    Entries = cr_log:get_last_index(cr:node()),
    case lists:all(fun({H, _, _, _}) ->
                       rpc:call(H, cr_log, get_last_index, [H]) == Entries
                   end, cr:peers()) of
        true  -> {ok, Entries};
        false -> {error, consensus_log}
    end.

%% node_log/0: sum of the per-log counters must equal the total
%% operation count on this node.
%% NOTE(review): the original also evaluated
%%     length(kvs:entries(kvs:get(log,Id),operation,-1)) == Num
%% inside the begin..end but discarded the boolean, so the per-log
%% entry count was never actually verified. The dead expression is
%% removed here (observable behavior unchanged); if per-log
%% verification was intended it must be asserted explicitly.
node_log() ->
    Operations = length(kvs:all(operation)),
    Sum = lists:sum([Num || {log, _Id, _, Num, _, _} <- kvs:all(log)]),
    case Sum == Operations of
        true  -> {ok, Operations};
        false -> {error, node_log}
    end.

%% operation_log/0: every peer must report the same operation count.
operation_log() ->
    Operations = length(kvs:all(operation)),
    case lists:all(fun({H, _, _, _}) ->
                       case rpc:call(H, cr, node_log, []) of
                           {ok, Operations} -> true;   % bound: equality match
                           _ -> false
                       end
                   end, cr:peers()) of
        true  -> {ok, Operations};
        false -> {error, operation_log}
    end.

%% cluster_status/0: crash (badmatch) unless both checks succeed.
cluster_status() ->
    {ok, _} = consensus_log(),
    {ok, _} = operation_log().

%% ===================== src/cr_app.erl =====================

-module(cr_app).
-behaviour(application).
-export([start/2, stop/1]).
-copyright('Maxim Sokhatsky').
-include("rafter_opts.hrl").
-compile(export_all).  % NOTE(review): avoid export_all in production code

%% Child-spec helpers for the top-level supervisor.

%% TCP listener gen_server.
tcp(Name, Port, Mod, Nodes) ->
    {Name, {cr_tcp, start_link, [Name, Port, Mod, Nodes]},
     permanent, 2000, worker, [cr_tcp]}.

%% Empty one_for_one pool supervisor (children added dynamically).
pool(SupName) ->
    {SupName, {supervisor, start_link, [{local, SupName}, cr_connection, []]},
     permanent, infinity, supervisor, []}.

%% One ring-replica worker backed by the cr_kvs storage module.
vnode({I, N}) ->
    {I, {cr_vnode, start_link, [{I, N}, cr_kvs]},
     permanent, 2000, worker, [cr_vnode]}.

%% Cluster heartbeat monitor.
heart(Nodes) ->
    {heart, {cr_heart, start_link, ["heart", Nodes]},
     permanent, 2000, worker, [cr_heart]}.
%% RAFT log child for the local node.
log({_I, N}, Nodes) ->
    {cr_log:logname(N), {cr_log, start_link, [N, #rafter_opts{cluster = Nodes}]},
     permanent, 2000, worker, [cr_log]}.

%% RAFT consensus fsm child for the local node.
rafter({I, N}, Nodes) ->
    {N, {cr_rafter, start_link,
         [{I, N}, #rafter_opts{state_machine = cr_replication, cluster = Nodes}]},
     permanent, 2000, worker, [cr_rafter]}.

%% Supervisor callback: RAFT log + consensus fsm for the local node,
%% a listener/pool pair per protocol, plus the heart and vnode pools.
init([Nodes, Opts]) ->
    {ok, {{one_for_one, 5, 60},
          lists:flatten([log({0, N}, Nodes)    || {N, _, _, _} <- Nodes, N == cr:node()]
                     ++ [rafter({0, N}, Nodes) || {N, _, _, _} <- Nodes, N == cr:node()]
                     ++ [protocol(O, Nodes)    || O <- Opts]
                     ++ [pool(heart_sup)]
                     ++ [pool(vnode_sup)])}}.

stop(_) -> ok.

start() -> start(normal, []).
%% Application entry: read peer config, start the supervision tree,
%% start the vnodes owned by this node, then launch the heart monitor.
start(_, _) ->
    io:format("Node: ~p~n", [cr:node()]),
    {ok, Peers} = application:get_env(cr, peers),
    {_Node, P1, P2, P3} = lists:keyfind(cr:node(), 1, Peers),
    {_, VNodes} = cr:ring(),
    kvs:join(),
    Sup = supervisor:start_link({local, cr_sup}, ?MODULE,
              [Peers, [{interconnect, P1, cr_interconnect},
                       {ping,         P2, cr_ping},
                       {client,       P3, cr_client}]]),
    io:format("Supervision: ~p~n", [supervisor:which_children(cr_sup)]),
    [start_vnode({Index, Node}, Peers)
     || {Index, Node} <- VNodes, Node == cr:nodex(cr:node())],
    spawn(fun() -> supervisor:start_child(heart_sup, heart(Peers)) end),
    Sup.

protocol({Name, Port, Mod}, Nodes) ->
    SupName = list_to_atom(lists:concat([Name, '_', sup])),
    [tcp(Name, Port, Mod, Nodes),   % TCP listener gen_server
     pool(SupName)].                % accepted-clients supervisor

%% Index 0 is the ring origin — no vnode is started for it.
start_vnode({0, _Name}, _Peers) -> skip;
start_vnode({Index, Name}, _) ->
    supervisor:start_child(vnode_sup, vnode({Index, Name})).

%% ===================== src/cr_hash.erl =====================

-module(cr_hash).
-description('Consistent Hash Ring').
-copyright('Synrc Research Center s.r.o.').
-compile(export_all).
-define(RINGTOP, trunc(math:pow(2,160)-1)). % SHA-1 space

% Our consistent ring hash module consists of five functions
% Why need we have more?

%% key_of/1: 160-bit SHA-1 digest of the term — a point on the ring.
key_of(Object) -> crypto:hash(sha, term_to_binary(Object)).

%% inc/1: ring-segment width for N vnodes.
inc(N) -> ?RINGTOP div N.

%% fresh/2: N evenly spaced ring positions, all owned by Seed.
fresh(N, Seed) ->
    {N, [{Int, Seed} || Int <- lists:seq(0, (?RINGTOP - 1), inc(N))]}.

%% succ/2: preference list for key Idx — the ring rotated so the
%% successor vnode of Idx comes first.
%% NOTE(review): the binary pattern was garbled in the source dump
%% ("<> = Idx", leaving Int unbound). Reconstructed as a 160-bit
%% integer match — the only shape consistent with key_of/1 producing
%% a 20-byte SHA-1 digest.
succ(Idx, {N, Nodes}) ->
    <<Int:160/integer>> = Idx,
    {A, B} = lists:split((Int div inc(N)) + 1, Nodes),
    B ++ A.

%% ===================== src/cr_heart.erl =====================

-module(cr_heart).
-description('Heart Monitor').
-author('Maxim Sokhatsky').
-copyright('Synrc Research Center s.r.o.').
-include("cr.hrl").
-include("rafter.hrl").
-compile(export_all).
-record(state, {name, nodes, timers}).
-export(?GEN_SERVER).

%% Heart Monitor module is a single process, monitoring other cluster peers.
%% The Configuration of Ring is tracked by RAFT protocol and its log.

start_link(Name, Nodes) ->
    gen_server:start_link(?MODULE, [Name, Nodes], []).

%% init/1: schedule one ping timer per remote peer; each timer message
%% carries the connect target {Addr,PingPort} and the peer node name.
%% NOTE(review): Addr is parsed from OUR OWN node name's host part,
%% not the peer's — works only when all peers share one host; confirm.
init([Name, Nodes]) ->
    Timers = [begin
                  [_, Addr] = string:tokens(atom_to_list(cr:node()), "@"),
                  {ok, Parsed} = inet:parse_address(Addr),
                  Timer = erlang:send_after(1000, self(),
                              {timer, ping, {Parsed, P2}, Node, undefined}),
                  {Node, Timer}
              end || {Node, _, P2, _} <- Nodes, Node /= cr:node()],
    io:format("HEART PROTOCOL: started: ~p~n"
              "Nodes: ~p~n", [Name, Timers]),
    {ok, #state{name = Name, nodes = Nodes, timers = Timers}}.

%% timer_restart/4: re-arm the ping timer after an {H,M,S} delay.
timer_restart(Diff, Connect, Node, Socket) ->
    {X, Y, Z} = Diff,
    erlang:send_after(1000 * (1 + Z + 60 * Y + 60 * 60 * X), self(),
                      {timer, ping, Connect, Node, Socket}).

%% setkey/4: lists:keyreplace that inserts when the key is absent.
setkey(Name, Pos, List, New) ->
    case lists:keyfind(Name, Pos, List) of
        false    -> [New | List];
        _Element -> lists:keyreplace(Name, Pos, List, New)
    end.
handle_info({'EXIT', _Pid, _}, #state{} = State) ->
    io:format("HEART: EXIT~n", []),
    {noreply, State};

handle_info({carrier, lost, N}, State = #state{}) ->
    io:format("HOST CARRIER LOST ~p~n", [N]),
    {noreply, State};

%% Ping one peer: reuse (or re-open) its TCP socket, send {ping} and
%% expect a 10-byte {pong}. On the outcome either add or remove the
%% peer in the RAFT server configuration, then re-arm the timer.
handle_info({timer, ping, {A, P}, N, S}, State = #state{timers = Timers}) ->
    #config{newservers = Servers} = cr_log:get_config(cr:node()),

    {N, Timer} = lists:keyfind(N, 1, Timers),
    case Timer of undefined -> skip; _ -> erlang:cancel_timer(Timer) end,

    %% best-effort send over the cached socket, reconnecting on failure
    Socket = try gen_tcp:send(S, term_to_binary({ping})), S
             catch _:_ ->
                 case gen_tcp:connect(A, P, [{packet, 0}, {active, false}]) of
                     {ok, S1} -> gen_tcp:send(S1, term_to_binary({ping})), S1;
                     {error, _SErr} -> undefined
                 end
             end,

    %% a wrong-sized reply (or recv on `undefined`) falls through the
    %% case and is converted to {error,recv} by the surrounding try
    Data = try case gen_tcp:recv(Socket, 0) of
                   {error, _RErr} -> {error, undefined};
                   {ok, Pong} when length(Pong) == 10 -> {ok, Socket}
               end
           catch _:_ -> {error, recv}
           end,

    {T, Operation, Online} =
        case Data of
            {error, _} -> {timer_restart({0,0,5}, {A,P}, N, undefined), remove, undefined};
            {ok, Sx}   -> {timer_restart({0,0,5}, {A,P}, N, Sx), add, Sx}
        end,

    case change(S, Online, N, Servers) of
        true ->
            try
                case cr_rafter:set_config(cr:node(), {N, Operation}) of
                    {error, _} -> skip;
                    _ -> io:format("Server Config Changed S/T ~p~n",
                                   [{N, Operation}])
                end
            catch _:Err -> io:format("CONFIG ERROR ~p~n", [Err])
            end,
            ok;
        false -> skip
    end,

    {noreply, State#state{timers = setkey(N, 1, Timers, {N, T})}};

handle_info(_Info, State) ->
    io:format("HEART: Info ~p~n", [_Info]),
    {noreply, State}.

%% {heart} returns the whole server state for inspection (see cr:heart/0).
handle_call({heart}, _, Proc) ->
    {reply, Proc, Proc};

handle_call(Request, _, Proc) ->
    io:format("HEART: Call ~p~n", [Request]),
    {reply, ok, Proc}.
handle_cast(Msg, State) ->
    io:format("HEART: Cast ~p", [Msg]),
    {stop, {error, {unknown_cast, Msg}}, State}.

terminate(_Reason, #state{}) -> ok.
code_change(_OldVsn, State, _Extra) -> {ok, State}.

%% change(OldSocket, NewSocket, Node, Servers) -> boolean().
%% Decide whether the peer's ring membership must be updated:
%% still-offline peers only matter if they are listed as servers;
%% any online/offline transition or socket change triggers an update.
change(undefined, undefined, N, Servers) -> lists:member(N, Servers);
change(undefined, _, _, _) -> true;
change(_, undefined, _, _) -> true;
change(A, A, _, _) -> false;
change(_, _, _, _) -> true.

%% ===================== src/cr_vnode.erl =====================

-module(cr_vnode).
-description('Ring Replica').
-author('Maxim Sokhatsky').
-copyright('Synrc Research Center s.r.o.').
-include("cr.hrl").
-include_lib("kvs/include/kvs.hrl").
-include_lib("db/include/transaction.hrl").
-compile(export_all).
-record(state, {name, nodes, storage, latency = {inf,0,0,0}}). % latency {min,max,sum,count}
-export(?GEN_SERVER).

%% Ring Replica vnode is single point of execution inside CR DHT.
%% Each Node in Cluster has several replica vnodes.

start_link(Name, Storage) ->
    gen_server:start_link(?MODULE, [Name, Storage], []).

%% init/1: replay any operations still pending in this vnode's log.
%% NOTE(review): Name is the {I,N} tuple, which gen_server:cast will
%% interpret as {RegisteredName, Node} — confirm that is intended.
init([Name, Storage]) ->
    [gen_server:cast(Name, O)
     || O <- kvs:entries(kvs:get(log, {pending, Name}), operation, -1)],
    io:format("VNODE PROTOCOL: started: ~p.~n", [Name]),
    {ok, #state{name = Name, storage = Storage}}.

handle_info({'EXIT', _Pid, _}, #state{} = State) ->
    io:format("VNODE: EXIT~n", []),
    {noreply, State};

handle_info(_Info, State) ->
    % io:format("VNODE: Info ~p~n",[_Info]),
    {noreply, State}.
%% kvs_log/2: persist the incoming XA message as a pending operation in
%% the RAFT log, then cast the saved record to ourselves for replay.
kvs_log({Cmd, _Self, [{_I, _N} | _T], _Tx} = Message, #state{name = Name}) ->
    % io:format("XA RECEIVE: ~p~n",[{element(2,_Tx),Message,Name}]),
    Operation = #operation{name = Cmd, body = Message,
                           feed_id = Name, status = pending},
    %% was: kvs:add(Operation#operation{id=kvs:next_id(operation,1)})
    {ok, Saved} = cr_log:kvs_log(cr:node(), Operation),
    try gen_server:cast(self(), Saved)
    catch E:R -> io:format("LOG ERROR ~p~n", [cr:stack(E, R)])
    end.

%% continuation/3: forward the command to the next vnode in the chain;
%% an empty chain means the transaction has fully traversed the ring.
continuation(_Next, {_, _, [], _Tx}, State) -> {noreply, State};
continuation(Next, {_Cmd, _Sender, [{I, N} | _T], _Tx} = Command, State) ->
    Peer = cr:peer({I, N}),
    Vpid = cr:vpid({I, Peer}),
    case gen_server:cast(Vpid, {pending, Command}) of
        ok -> % io:format("XA SENT OK from ~p to ~p~n",[cr:node(),Peer]),
              {noreply, State};
        _Error ->
              timer:sleep(1000),
              continuation(Next, Command, State)
    end.

%% Synchronous enqueue of a chain command: log it and ack the caller.
handle_call({pending, {_Cmd, _Self, [{_I, _N} | _T], _Tx} = Message}, _, State) ->
    kvs_log(Message, State),
    {reply, {ok, queued}, State};

%% Report latency in milliseconds; before any sample Min is the atom
%% 'inf', so the division raises and the raw triple is returned.
handle_call({latency}, _, #state{latency = {Min, Max, Avg, N}} = State) ->
    L = try {Min div 1000, Max div 1000, Avg div (N * 1000)}
        catch _:_ -> {Min, Max, Avg}
        end,
    {reply, L, State};

handle_call(Request, _, Proc) ->
    io:format("VNODE: Call ~p~n", [Request]),
    {reply, ok, Proc}.

%% Client entry point: route a prepare to the head of the chain —
%% locally when we own it, otherwise to the owning peer's vnode.
handle_cast({client, Client, Chain, Record}, State) ->
    {I, N} = hd(Chain),
    Self = cr:node(),
    gen_server:cast(case cr:peer({I, N}) of
                        Self -> cr:local(Record);
                        Node -> cr:vpid({I, Node})
                    end,
                    {pending, {prepare, Client, Chain, Record}}),
    {noreply, State};

handle_cast({pending, {_Cmd, _Self, [{_I, _N} | _T], _Tx} = Message}, State) ->
    kvs_log(Message, State),
    {noreply, State};

%% Replay a logged operation, then either roll back on failure,
%% finish (we are the chain tail) or forward down the chain.
handle_cast(#operation{name = Command, body = Message} = Operation,
            #state{name = Name} = State) ->
    {Command, Sender, [H | T] = Chain, Tx} = Message,
    Replay = try cr_log:kvs_replay(cr:node(), Operation, State, status(Command))
             catch E:R -> % io:format("~p REPLAY ~p~n",[code(Command),cr:stack(E,R)]),
                          {rollback, {E, R}, Chain, Tx}
             end,
    {Forward, Latency} =
        case [Chain, Replay] of
            [_, A = {rollback, _, _, _}] -> {A, State#state.latency};
            [[Name], _]                  -> last(Operation, State);
            [[H | T], _]                 -> {{Command, Sender, T, Tx},
                                             State#state.latency}
        end,
    try continuation(H, Forward, State)
    catch X:Y -> io:format("~p SEND ~p~n", [code(Command), cr:stack(X, Y)])
    end,
    {noreply, State#state{latency = Latency}}.

terminate(_Reason, #state{}) -> ok.
code_change(_OldVsn, State, _Extra) -> {ok, State}.

%% status/1: map a command to the status stored with the log entry.
status(commit)  -> commited;   % sic — misspelled atom kept: it is persisted data
status(prepare) -> prepared;
status(Unknown) -> Unknown.

% XA PROTOCOL
% last(#operation{body={prepare,{Sender,Time},_,Tx}},S) -> {{commit,{Sender,Time},cr:chain(element(2,Tx)),Tx},S#state.latency};
% last(#operation{body={commit,{Sender,Time},_,Tx}},S) -> {{nop,{Sender,Time},[],[]},new_latency(Time,S)};

% CR PROTOCOL
%% last/2: chain tail reached — emit a nop and record the latency.
last(#operation{body = {_, {Sender, Time}, _, _Tx}}, S) ->
    {{nop, {Sender, Time}, [], []}, new_latency(Time, S)}.
%% new_latency/2: fold one round-trip (microseconds) into the
%% {Min, Max, Sum, Count} accumulator. Before the first sample Min is
%% the atom 'inf', which compares greater than any number, so the
%% first measurement always replaces it.
new_latency(Time, #state{latency = {Min, Max, Avg, N}}) ->
    L = time_diff(Time, os:timestamp()),
    {NMin, NMax} = if
                       L > Max -> {Min, L};
                       L < Min -> {L, Max};
                       true    -> {Min, Max}
                   end,
    {NMin, NMax, Avg + L, N + 1}.

%% ms/1: erlang timestamp -> microseconds.
ms({Mega, Sec, Micro}) -> (Mega * 1000000 + Sec) * 1000000 + Micro.
time_diff(Then, Now) -> ms(Now) - ms(Then).

code(prepare)  -> "PREPARE";
code(commit)   -> "COMMIT";
code(rollback) -> "ROLLBACK";
code(Unknown)  -> Unknown.

%% ===================== src/tcp/cr_client.erl =====================

-module(cr_client).
-copyright('Maxim Sokhatsky').
-include("cr.hrl").
-compile(export_all).
-record(state, {succ, pred, port, name, socket, module, nodes}).

sup() -> client_sup.

init([Name, Mod, Socket, Nodes]) ->
    #state{name = Name, module = Mod, socket = Socket, nodes = Nodes}.

%% dispatch/2: run the transaction through the ring and echo the
%% result back over the client's TCP socket.
dispatch({transaction, Object}, #state{socket = Socket} = State) ->
    Result = cr:tx(Object),
    gen_tcp:send(Socket, term_to_binary(Result)),
    State;

dispatch(_, State) -> State.

%% ===================== src/tcp/cr_connection.erl =====================

-module(cr_connection).
-copyright('Synrc Research Center s.r.o.').
-include("cr.hrl").
-behaviour(gen_fsm).
-compile(export_all).
-record(state, {port, name, socket, module, peer, state, nodes}).
-export(?GEN_FSM).
-export([listen/2, transfer/2]).
-define(TIMEOUT, 10000).
%% start_connection/3: register an accepted socket as a child of the
%% protocol's pool supervisor; the child key embeds the peer address
%% so every connection gets a unique name.
start_connection(Module, Socket, Nodes) ->
    {ok, {IP, Port}} =
        try inet:peername(Socket)
        catch _:_ ->
            %% peername fails on an already-closed socket; fall back to
            %% a unique placeholder so the child name stays unique.
            %% (was now(), deprecated since OTP 18)
            {ok, {{127,0,0,1}, erlang:unique_integer([positive])}}
        end,
    Restart = permanent,
    Shutdown = 2000,
    UniqueName = {Module, IP, Port},
    ChildSpec = {UniqueName,
                 {cr_connection, start_link, [UniqueName, Module, Socket, Nodes]},
                 Restart, Shutdown, worker, [Module]},
    Sup = supervisor:start_child(Module:sup(), ChildSpec),
    io:format("SERVER: starting ~p listener: ~p~n",
              [Sup, {Module, IP, Port, Socket}]),
    Sup.

%% FSM state 'listen': waiting for the listener to hand over the socket.
listen({socket_ready, Socket}, State) ->
    io:format("SERVER: Socket Ready ~p~n", [Socket]),
    inet:setopts(Socket, [{active, once}, {packet, 0}, binary]),
    {next_state, transfer, State#state{socket = Socket}, ?TIMEOUT};

listen(Other, State) ->
    io:format("SERVER: Unexpected message during listening ~p~n", [Other]),
    {next_state, listen, State, ?TIMEOUT}.

%% FSM state 'transfer': stream protocol messages in both directions,
%% delegating decoded input to the protocol module's dispatch/2.
transfer({in, Binary}, #state{state = SubState, module = Module} = State) ->
    % io:format("SERVER: RECV ~p~n", [Binary]),
    NewSubState = Module:dispatch(cr:decode(Binary), SubState),
    {next_state, transfer, State#state{state = NewSubState}, ?TIMEOUT};

transfer({out, Message}, #state{socket = Socket} = State) ->
    % io:format("SERVER: SEND ~p~n", [Message]),
    gen_tcp:send(Socket, cr:encode(Message)),
    {next_state, transfer, State, ?TIMEOUT};

transfer(timeout, State) ->
    % idle longer than ?TIMEOUT — drop the connection
    {stop, normal, State};

transfer(_Data, State) ->
    io:format("SERVER: unknown Data during transfer: ~p\n", [_Data]),
    {stop, normal, State}.

start_link(Name, Mod, Socket, Nodes) ->
    gen_fsm:start_link(?MODULE, [Name, Mod, Socket, Nodes], []).
%% init/1 with [] is the pool-supervisor callback (this module doubles
%% as the supervisor callback module — see cr_app:pool/1): one_for_one
%% with no static children.
init([]) ->
    RestartStrategy = one_for_one,
    MaxRestarts = 1,
    MaxSecondsBetweenRestarts = 600,
    SupFlags = {RestartStrategy, MaxRestarts, MaxSecondsBetweenRestarts},
    {ok, {SupFlags, []}};

%% gen_fsm callback for one accepted connection.
init([Name, Mod, Socket, Nodes]) ->
    io:format("PROTOCOL: starting ~p listener: ~p~n", [self(), {Name, Mod}]),
    process_flag(trap_exit, true),
    {ok, listen, #state{module = Mod, name = Name,
                        socket = Socket, nodes = Nodes,
                        state = Mod:init([Name, Mod, Socket, Nodes])}}.

%% Re-arm {active,once} and feed the bytes into the current FSM state.
handle_info({tcp, Socket, Bin}, StateName, State) ->
    inet:setopts(Socket, [{active, once}]),
    ?MODULE:StateName({in, Bin}, State);

handle_info({tcp_closed, _S}, _, State) ->
    io:format("SERVER: TCP closed~n", []),
    {stop, normal, State};

handle_info({'EXIT', _Pid, _}, StateName, #state{} = State) ->
    io:format("SERVER: EXIT~n", []),
    {next_state, StateName, State};

%% BUGFIX: gen_fsm's handle_info/3 must return {next_state, ...}; the
%% original returned {noreply, StateName, State} (a gen_server shape),
%% which would crash the FSM on any unexpected message.
handle_info(_Info, StateName, State) ->
    io:format("SERVER: Info ~p~n", [_Info]),
    {next_state, StateName, State}.

handle_event(Event, StateName, State) ->
    {stop, {StateName, undefined_event, Event}, State}.
handle_sync_event(Event, _From, StateName, State) ->
    {stop, {StateName, undefined_event, Event}, State}.
terminate(_Reason, _StateName, #state{socket = Socket}) -> gen_tcp:close(Socket).
code_change(_OldVsn, StateName, State, _Extra) -> {ok, StateName, State}.

%% ===================== src/tcp/cr_interconnect.erl =====================

-module(cr_interconnect).
-copyright('Synrc Research Center s.r.o.').
-include("cr.hrl").
-compile(export_all).
-record(state, {succ, pred, port, name, socket, module, nodes}).

sup() -> interconnect_sup.
init([Name, Mod, Socket, Nodes]) ->
    #state{name = Name, module = Mod, socket = Socket, nodes = Nodes}.
%% reply/3: send a term back over the connection, state unchanged.
reply(Socket, Result, State) ->
    gen_tcp:send(Socket, term_to_binary(Result)),
    State.

%% 2-tuples are vnode commands routed by consistent hash.
dispatch({Command, Object}, #state{socket = Socket} = State) ->
    io:format("CONS {_,_} VNODE command: ~p~n", [{Object}]),
    reply(Socket,
          gen_server:call(cr:peer(cr:hash(Object)), {Command, Object}),
          State);

%% 3-tuples are XA transaction commands addressed by transaction id.
dispatch({Command, Tx, Transaction}, #state{socket = Socket} = State) ->
    io:format("CONS {_,_,_} XA command: ~p~n", [{Transaction}]),
    reply(Socket,
          gen_server:call(element(2, Tx), {Command, Transaction}),
          State);

dispatch(_, State) -> State.

%% ===================== src/tcp/cr_ping.erl =====================

-module(cr_ping).
-copyright('Synrc Research Center s.r.o.').
-include("cr.hrl").
-compile(export_all).
-record(state, {port, name, socket, module, nodes}).

sup() -> ping_sup.

init([_Name, _Mod, Socket, Nodes]) -> #state{socket = Socket, nodes = Nodes}.

dispatch({'join', Object}, State) ->
    io:format("PING: Join request: ~p~n", [Object]),
    State;

%% Answer a heartbeat probe with {pong} (consumed by cr_heart).
dispatch({ping}, #state{socket = Socket} = State) ->
    io:format("PING: Message: ~p~n", [self()]),
    gen_tcp:send(Socket, term_to_binary({pong})),
    State;

dispatch({'leave', _Object}, State) -> State;

dispatch(_, State) -> State.

%% ===================== src/tcp/cr_tcp.erl =====================

-module(cr_tcp).
-description('prim_inet based TCP non-blocking listener').
-copyright('Synrc Research Center s.r.o.').
-behaviour(gen_server).
-include("cr.hrl").
-export(?GEN_SERVER).
-compile(export_all).
-record(state, {listener, acceptor, module, name, port, ring}).
%% Async-accept completion: hand the client socket to a fresh
%% cr_connection child, then immediately re-arm the accept.
handle_info({inet_async, ListSock, Ref, Message},
            #state{listener = ListSock, acceptor = Ref,
                   module = Module, ring = HashRing} = State) ->
    %% NOTE(review): Message may be {error,Reason}; the {ok,_} match
    %% lets the listener crash on accept errors (let-it-crash).
    {ok, CliSocket} = Message,
    set_sockopt(ListSock, CliSocket),
    {ok, Pid} = cr_connection:start_connection(Module, CliSocket, HashRing),
    gen_tcp:controlling_process(CliSocket, Pid),
    cr:set_socket(Pid, CliSocket),
    Acceptor = case prim_inet:async_accept(ListSock, -1) of
                   {ok, NewRef} -> NewRef;
                   {error, Reason} ->
                       io:format("TCP: Accept Error: ~p~n", [Reason]),
                       Reason
               end,
    {noreply, State#state{acceptor = Acceptor}};

handle_info(_Info, State) -> {noreply, State}.
terminate(_Reason, State) -> gen_tcp:close(State#state.listener), ok.
code_change(_OldVsn, State, _Extra) -> {ok, State}.
handle_call(Request, _From, State) -> {stop, {unknown_call, Request}, State}.
handle_cast(_Msg, State) -> {noreply, State}.

start_link(Name, Port, Module, HashRing) ->
    gen_server:start_link({local, Name}, ?MODULE,
                          [Name, Port, Module, HashRing], []).

%% Open the listen socket and post the first asynchronous accept.
init([Name, Port, Module, HashRing]) ->
    process_flag(trap_exit, true),
    Opts = [binary, {packet, 1}, {reuseaddr, true}, {keepalive, true},
            {backlog, 30}, {active, false}],
    case gen_tcp:listen(Port, Opts) of
        {ok, ListenSocket} ->
            {ok, Ref} = prim_inet:async_accept(ListenSocket, -1),
            {ok, #state{listener = ListenSocket,
                        acceptor = Ref,
                        ring     = HashRing,
                        module   = Module,
                        port     = Port,
                        name     = Name}};
        {error, Reason} -> {stop, Reason}
    end.
%% set_sockopt/2: register the accepted socket with inet_db and copy
%% the listener's options onto it. On a setopts failure the socket is
%% closed and the error returned; on a getopts failure the listener
%% itself is considered broken and the process exits.
set_sockopt(ListSock, CliSocket) ->
    true = inet_db:register_socket(CliSocket, inet_tcp),
    case prim_inet:getopts(ListSock, [active, nodelay, keepalive,
                                      delay_send, priority, tos]) of
        {ok, Opts} ->
            case prim_inet:setopts(CliSocket, Opts) of
                ok -> ok;
                Error ->
                    io:format("TCP OPT Socket Error ~p~n", [Error]),
                    gen_tcp:close(CliSocket),
                    Error
            end;
        Error ->
            gen_tcp:close(CliSocket),
            io:format("TCP Socket Error ~p~n", [Error]),
            exit({set_sockopt, Error})
    end.

%% ===================== sys.config =====================
[
 {riak_ensemble, [{data_root, "data"}]},
 {cr,  [{peers,[{ 'cr@127.0.0.1',9000,9001,9002},
                { 'cr2@127.0.0.1',9004,9005,9006},
                { 'cr3@127.0.0.1',9008,9009,9010}]}]},
 {kvs, [{dba,store_mnesia},
        {log_modules,cr},
        {user,[{interval,5,10,user},
               {interval,10,100,user2}]},
        {schema, [ db_config,
                   kvs_feed, kvs_user, kvs_subscription ]} ]}
].

%% ===================== vm.args =====================
+pc unicode
+K true
+A 5
-env ERL_MAX_PORTS 4096
-env ERL_FULLSWEEP_AFTER 10