├── .gitignore
├── LICENSE
├── README.md
├── ReplicaEngine
│   └── tla
│       ├── ReplicaEngine.tla
│       └── ReplicaEngine.toolbox
│           ├── .project
│           ├── .settings
│           │   └── org.lamport.tla.toolbox.prefs
│           └── ReplicaEngine___model.launch
├── Storage
│   └── tla
│       ├── Storage.tla
│       └── Storage.toolbox
│           └── Storage___model.launch
├── ZenWithTerms
│   └── tla
│       ├── ZenWithTerms.tla
│       └── ZenWithTerms.toolbox
│           ├── .project
│           ├── .settings
│           │   └── org.lamport.tla.toolbox.prefs
│           └── ZenWithTerms___model.launch
├── cluster
│   ├── isabelle
│   │   ├── Implementation.thy
│   │   ├── Monadic.thy
│   │   ├── OneSlot.thy
│   │   ├── Preliminaries.thy
│   │   ├── ROOT
│   │   ├── Zen.thy
│   │   └── document
│   │       └── root.tex
│   └── tla
│       ├── consensus.tla
│       └── consensus.toolbox
│           ├── .project
│           ├── .settings
│           │   └── org.lamport.tla.toolbox.prefs
│           └── consensus___model.launch
└── data
    └── tla
        ├── replication.tla
        └── replication.toolbox
            ├── .project
            ├── .settings
            │   └── org.lamport.tla.toolbox.prefs
            └── replication___model.launch
/.gitignore: -------------------------------------------------------------------------------- 1 | **/.DS_Store 2 | **/tla/*.toolbox/model 3 | **/tla/*.toolbox/*aux 4 | **/tla/*.toolbox/*.log 5 | **/tla/*.toolbox/*.pdf 6 | **/tla/*.toolbox/*.tex 7 | **/tla/*.toolbox/*___model_SnapShot*.launch 8 | **/tla/*.toolbox/**/*.tla 9 | **/tla/*.toolbox/**/*.out 10 | **/tla/*.toolbox/**/MC.cfg 11 | **/tla/*.pdf 12 | **/tla/*.old 13 | **/*~ 14 | cluster/isabelle/output 15 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Formal models of core Elasticsearch algorithms 2 | 3 | This repository contains formal models of core [Elasticsearch](https://github.com/elastic/elasticsearch) algorithms and is directly related to implementation efforts around [data replication](https://github.com/elastic/elasticsearch/issues/10708) and [cluster coordination](https://github.com/elastic/elasticsearch/issues/32006). 
The models in this repository might represent past, current and future designs of Elasticsearch and can differ from their implementations in substantial ways. The formal models mainly serve to illustrate some of the high-level concepts and help to validate resiliency-related aspects. 4 | 5 | ## Models 6 | 7 | ### Cluster coordination model 8 | 9 | The cluster coordination TLA+ model verifies the consistency of cluster state updates and represents the core [cluster coordination](https://github.com/elastic/elasticsearch/issues/32006) and metadata replication algorithm implemented in Elasticsearch 7.0. It consists of two files: 10 | 11 | - [TLA+ specification](ZenWithTerms/tla/ZenWithTerms.tla), which has a [direct one-to-one implementation in Elasticsearch](https://github.com/elastic/elasticsearch/blob/master/server/src/main/java/org/elasticsearch/cluster/coordination/CoordinationState.java) 12 | - [TLC model checking configuration](ZenWithTerms/tla/ZenWithTerms.toolbox/ZenWithTerms___model.launch) 13 | 14 | ### Data replication model 15 | 16 | The data replication TLA+ model describes the Elasticsearch [sequence number](https://github.com/elastic/elasticsearch/issues/10708)-based data replication approach, implemented since Elasticsearch 6.0. It consists of two files: 17 | 18 | - [TLA+ specification](data/tla/replication.tla) 19 | - [TLC model checking configuration](data/tla/replication.toolbox/replication___model.launch) 20 | 21 | ### Replica engine 22 | 23 | A TLA+ model of how the 24 | [engine](https://github.com/elastic/elasticsearch/blob/00fd73acc4a2991f96438f8c1948016c5b9eefb2/server/src/main/java/org/elasticsearch/index/engine/InternalEngine.java) 25 | handles replication requests. 26 | 27 | - [TLA+ specification](ReplicaEngine/tla/ReplicaEngine.tla) 28 | - [TLC model checking configuration](ReplicaEngine/tla/ReplicaEngine.toolbox/ReplicaEngine___model.launch) 29 | 30 | ### Alternative cluster coordination model 31 | 32 | The alternative cluster coordination TLA+ model consists of two files: 33 | 34 | - [TLA+ specification](cluster/tla/consensus.tla) 35 | - [TLC model checking configuration](cluster/tla/consensus.toolbox/consensus___model.launch) 36 | 37 | The alternative cluster consensus Isabelle model consists of the following theories: 38 | 39 | - [Basic definitions](cluster/isabelle/Preliminaries.thy) 40 | - [An implementation in functional style](cluster/isabelle/Implementation.thy) 41 | - [An implementation in monadic style, along with a proof that it is equivalent to the previous one](cluster/isabelle/Monadic.thy) 42 | - [The proof that each slot is consistent, based on Lamport's Synod algorithm](cluster/isabelle/OneSlot.thy) 43 | - [The proof that the implementation ensures consistency](cluster/isabelle/Zen.thy) 44 | 45 | ## How to edit/run TLA+: 46 | 47 | - Install the [TLA Toolbox](http://research.microsoft.com/en-us/um/people/lamport/tla/toolbox.html) 48 | - If on Mac OS, [move the downloaded app to the Applications folder first](https://groups.google.com/forum/#!topic/tlaplus/bL04c6BiYxo) 49 | - Read some [documentation](http://research.microsoft.com/en-us/um/people/lamport/tla/book.html) 50 | 51 | How to run the model checker in headless mode: 52 | 53 | - Download [tla2tools.jar](http://research.microsoft.com/en-us/um/people/lamport/tla/tools.html) 54 | - Run the model checker once in TLA+ Toolbox on desktop (can be aborted once started). 
This generates the folder `elasticsearch.toolbox/model/` that contains all model files that are required to run the model checker in headless mode. 55 | - Copy the above folder and `tla2tools.jar` to the server running in headless mode. 56 | - `cd` to the folder and run `java -Xmx30G -cp ../tla2tools.jar tlc2.TLC MC -deadlock -workers 12`. The setting `-Xmx30G` denotes the amount of memory to allocate to the model checker and `-workers 12` the number of worker threads (should be equal to the number of cores on machine). The setting `-deadlock` ensures that TLC explores the full reachable state space, not searching for deadlocks. 57 | -------------------------------------------------------------------------------- /ReplicaEngine/tla/ReplicaEngine.toolbox/.project: -------------------------------------------------------------------------------- 1 | 2 | 3 | ReplicaEngine 4 | 5 | 6 | 7 | 8 | 9 | toolbox.builder.TLAParserBuilder 10 | 11 | 12 | 13 | 14 | toolbox.builder.PCalAlgorithmSearchingBuilder 15 | 16 | 17 | 18 | 19 | 20 | toolbox.natures.TLANature 21 | 22 | 23 | 24 | ReplicaEngine.tla 25 | 1 26 | PARENT-1-PROJECT_LOC/ReplicaEngine.tla 27 | 28 | 29 | 30 | -------------------------------------------------------------------------------- /ReplicaEngine/tla/ReplicaEngine.toolbox/.settings/org.lamport.tla.toolbox.prefs: -------------------------------------------------------------------------------- 1 | ProjectRootFile=PARENT-1-PROJECT_LOC/ReplicaEngine.tla 2 | eclipse.preferences.version=1 3 | -------------------------------------------------------------------------------- /ReplicaEngine/tla/ReplicaEngine.toolbox/ReplicaEngine___model.launch: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | -------------------------------------------------------------------------------- /Storage/tla/Storage.tla: -------------------------------------------------------------------------------- 1 | ------------------------------ MODULE Storage ------------------------------ 2 | EXTENDS Integers, FiniteSets, TLC 3 | 4 | CONSTANTS 5 | MaxNewMeta, \* maximum generation of newMeta to limit the state space 6 | MetaDataContent \* content that is written to the metadata file 7 | 8 | VARIABLES 9 | metadata, \* metaData[i] = MetaDataContent if metadata of generation i is present 10 | manifest, \* manifest[j] is generation of metadata j-th manifest is referencing 11 | newMeta, \* generation of newly created metadata file 12 | newManifest, \* generation of newly created manifest file 13 | state, \* current state, describes what to do next 14 | possibleStates \* set of generations of metadata that limits what can be read from disk 15 | 16 | -------------------------------------------------------------------------- 17 | (*************************************************************************) 18 | (* First we define some helper functions to work with files abstraction. *) 19 | (* Files is a function from file generation to some content. *) 20 | (*************************************************************************) 21 | 22 | (*************************************************************************) 23 | (* CurrentGeneration returns the maximum file generation. 
If there are *) 24 | (* no files then -1 is returned. *) 25 | (*************************************************************************) 26 | CurrentGeneration(files) == 27 | IF DOMAIN files = {} 28 | THEN -1 29 | ELSE 30 | CHOOSE gen \in DOMAIN files : 31 | \A otherGen \in DOMAIN files : gen \geq otherGen 32 | 33 | (*************************************************************************) 34 | (* DeleteFile removes file with generation delGen. *) 35 | (*************************************************************************) 36 | DeleteFile(files, delGen) == [gen \in DOMAIN files \ {delGen} |-> files[gen]] 37 | 38 | (*************************************************************************) 39 | (* DeleteFilesExcept removes all files except keepGen. *) 40 | (*************************************************************************) 41 | DeleteFilesExcept(files, keepGen) == (keepGen :> files[keepGen]) 42 | 43 | (*************************************************************************) 44 | (* WriteFile creates new file with specified generation and content. *) 45 | (*************************************************************************) 46 | WriteFile(files, gen, content) == (gen :> content) @@ files 47 | 48 | -------------------------------------------------------------------------- 49 | (*************************************************************************) 50 | (* Now we define functions to emulate write and cleanup of the metadata. *) 51 | (*************************************************************************) 52 | WriteMetaOk(gen) == 53 | /\ metadata' = WriteFile(metadata, gen, MetaDataContent) 54 | /\ state' = "writeManifest" 55 | 56 | WriteMetaFail(gen) == 57 | /\ metadata' = metadata 58 | /\ state' = "writeMeta" 59 | 60 | WriteMetaDirty(gen) == 61 | /\ \/ metadata' = WriteFile(metadata, gen, MetaDataContent) 62 | \/ metadata' = metadata 63 | /\ state' = "deleteNewMeta" 64 | 65 | DeleteNewMeta == 66 | /\ \/ metadata' = DeleteFile(metadata, newMeta) 67 | \/ metadata' = metadata 68 | /\ state' = "writeMeta" 69 | /\ UNCHANGED <> 70 | 71 | DeleteOldMeta == 72 | /\ \/ metadata' = DeleteFilesExcept(metadata, newMeta) 73 | \/ metadata' = metadata 74 | /\ state' = "writeMeta" 75 | /\ UNCHANGED <> 76 | 77 | WriteMeta == 78 | LET gen == CurrentGeneration(metadata) + 1 IN 79 | /\ newMeta' = gen 80 | /\ \/ WriteMetaOk(gen) 81 | \/ WriteMetaFail(gen) 82 | \/ WriteMetaDirty(gen) 83 | /\ UNCHANGED <> 84 | 85 | -------------------------------------------------------------------------- 86 | (*************************************************************************) 87 | (* Now we define functions to emulate write and cleanup of the manifest *) 88 | (* file. 
*) 89 | (*************************************************************************) 90 | WriteManifestOk(gen) == 91 | /\ manifest' = WriteFile(manifest, gen, newMeta) 92 | /\ state' = "deleteOldManifest" 93 | /\ possibleStates' = {newMeta} 94 | 95 | WriteManifestFail(gen) == 96 | /\ manifest' = manifest 97 | /\ state' = "deleteNewMeta" 98 | /\ possibleStates' = possibleStates 99 | 100 | WriteManifestDirty(gen) == 101 | /\ \/ manifest' = WriteFile(manifest, gen, newMeta) 102 | \/ manifest' = manifest 103 | /\ state' = "deleteNewManifest" 104 | /\ possibleStates' = possibleStates \union {newMeta} 105 | 106 | WriteManifest == 107 | LET gen == CurrentGeneration(manifest) + 1 IN 108 | /\ newManifest' = gen 109 | /\ \/ WriteManifestOk(gen) 110 | \/ WriteManifestFail(gen) 111 | \/ WriteManifestDirty(gen) 112 | /\ UNCHANGED <> 113 | 114 | DeleteOldManifest == 115 | /\ \/ manifest' = DeleteFilesExcept(manifest, newManifest) 116 | \/ manifest' = manifest 117 | /\ state' = "deleteOldMeta" 118 | /\ UNCHANGED <> 119 | 120 | -------------------------------------------------------------------------- 121 | (*************************************************************************) 122 | (* Below are 3 versions of the same function, that is called when *) 123 | (* manifest write was dirty. The buggy one was initially implemented and *) 124 | (* was caught by https://github.com/elastic/elasticsearch/issues/39077. *) 125 | (* Pick one and use in Next function. *) 126 | (* https://github.com/elastic/elasticsearch/pull/40519 implements *) 127 | (* DeleteNewManifestEasy. *) 128 | (*************************************************************************) 129 | DeleteNewManifestBuggy == 130 | /\ \/ manifest' = DeleteFile(manifest, newManifest) 131 | \/ manifest' = manifest 132 | /\ state' = "deleteNewMeta" 133 | /\ UNCHANGED <> 134 | 135 | DeleteNewManifestEasy == 136 | /\ \/ manifest' = DeleteFile(manifest, newManifest) 137 | \/ manifest' = manifest 138 | /\ state' = "writeMeta" 139 | /\ UNCHANGED <> 140 | 141 | DeleteNewManifestHard == 142 | /\ \/ /\ manifest' = DeleteFile(manifest, newManifest) 143 | /\ state' = "deleteNewMeta" 144 | \/ /\ manifest' = manifest 145 | /\ state' = "writeMeta" 146 | /\ UNCHANGED <> 147 | -------------------------------------------------------------------------- 148 | (*************************************************************************) 149 | (* We can define Init and Next functions now. *) 150 | (*************************************************************************) 151 | Init == 152 | /\ metadata = <<>> 153 | /\ manifest = <<>> 154 | /\ newMeta = -1 \* no latest metadata file 155 | /\ newManifest = -1 \* no latest manifest file 156 | /\ state = "writeMeta" \* we start with writing metadata file 157 | /\ possibleStates = {} \* no metadata can be read from disk 158 | 159 | Next == 160 | \/ (state = "writeMeta" /\ WriteMeta) 161 | \/ (state = "writeManifest" /\ WriteManifest) 162 | \/ (state = "deleteOldManifest" /\ DeleteOldManifest) 163 | \/ (state = "deleteOldMeta" /\ DeleteOldMeta) 164 | \/ (state = "deleteNewManifest" /\ DeleteNewManifestEasy) \* try DeleteNewManifestBuggy and DeleteNewManifestHard 165 | \/ (state = "deleteNewMeta" /\ DeleteNewMeta) 166 | -------------------------------------------------------------------------- 167 | (*************************************************************************) 168 | (* Our model has 2 invariants. 
*) 169 | (*************************************************************************) 170 | MetadataFileReferencedByManifestExists == 171 | CurrentGeneration(manifest) /= -1 172 | => 173 | manifest[CurrentGeneration(manifest)] \in DOMAIN metadata 174 | 175 | MetadataReferencedByManifestIsValid == 176 | CurrentGeneration(manifest) /= -1 177 | => 178 | \E meta \in possibleStates : meta = manifest[CurrentGeneration(manifest)] 179 | ============ -------------------------------------------------------------------------------- /Storage/tla/Storage.toolbox/Storage___model.launch: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | -------------------------------------------------------------------------------- /ZenWithTerms/tla/ZenWithTerms.tla: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------------------------------- 2 | 3 | -------------------------------- MODULE ZenWithTerms -------------------------------- 4 | \* Imported modules used in this specification 5 | EXTENDS Naturals, FiniteSets, Sequences, TLC 6 | 7 | ---- 8 | 9 | CONSTANTS Values 10 | 11 | \* Set of node ids (all master-eligible nodes) 12 | CONSTANTS Nodes 13 | 14 | \* RPC message types 15 | CONSTANTS 16 | Join, 17 | PublishRequest, 18 | PublishResponse, 19 | Commit 20 | 21 | ---- 22 | 23 | \* Set of requests and responses sent between nodes. 24 | VARIABLE messages 25 | 26 | \* Transitive closure of value updates as done by leaders 27 | VARIABLE descendant 28 | 29 | \* Values to bootstrap the cluster 30 | VARIABLE initialConfiguration 31 | VARIABLE initialValue 32 | VARIABLE initialAcceptedVersion 33 | 34 | \* node state (map from node id to state) 35 | VARIABLE currentTerm 36 | VARIABLE lastCommittedConfiguration 37 | VARIABLE lastAcceptedTerm 38 | VARIABLE lastAcceptedVersion 39 | VARIABLE lastAcceptedValue 40 | VARIABLE lastAcceptedConfiguration 41 | VARIABLE joinVotes 42 | VARIABLE startedJoinSinceLastReboot 43 | VARIABLE electionWon 44 | VARIABLE lastPublishedVersion 45 | VARIABLE lastPublishedConfiguration 46 | VARIABLE publishVotes 47 | 48 | ---- 49 | 50 | Terms == Nat 51 | 52 | Versions == Nat 53 | 54 | \* set of valid configurations (i.e. 
the set of all non-empty subsets of Nodes) 55 | ValidConfigs == SUBSET(Nodes) \ {{}} 56 | 57 | \* cluster-state versions that might have come from older systems 58 | InitialVersions == Nat 59 | 60 | \* quorums correspond to majority of votes in a config 61 | IsQuorum(votes, config) == Cardinality(votes \cap config) * 2 > Cardinality(config) 62 | 63 | IsElectionQuorum(n, votes) == 64 | /\ IsQuorum(votes, lastCommittedConfiguration[n]) 65 | /\ IsQuorum(votes, lastAcceptedConfiguration[n]) 66 | 67 | IsPublishQuorum(n, votes) == 68 | /\ IsQuorum(votes, lastCommittedConfiguration[n]) 69 | /\ IsQuorum(votes, lastPublishedConfiguration[n]) 70 | 71 | \* initial model state 72 | Init == /\ messages = {} 73 | /\ descendant = {} 74 | /\ initialConfiguration \in ValidConfigs 75 | /\ initialValue \in Values 76 | /\ initialAcceptedVersion \in [Nodes -> InitialVersions] 77 | /\ currentTerm = [n \in Nodes |-> 0] 78 | /\ lastCommittedConfiguration = [n \in Nodes |-> {}] \* empty config 79 | /\ lastAcceptedTerm = [n \in Nodes |-> 0] 80 | /\ lastAcceptedVersion = initialAcceptedVersion 81 | /\ lastAcceptedValue \in {[n \in Nodes |-> v] : v \in Values} \* all agree on initial value 82 | /\ lastAcceptedConfiguration = [n \in Nodes |-> lastCommittedConfiguration[n]] 83 | /\ joinVotes = [n \in Nodes |-> {}] 84 | /\ startedJoinSinceLastReboot = [n \in Nodes |-> FALSE] 85 | /\ electionWon = [n \in Nodes |-> FALSE] 86 | /\ lastPublishedVersion = [n \in Nodes |-> 0] 87 | /\ lastPublishedConfiguration = [n \in Nodes |-> lastCommittedConfiguration[n]] 88 | /\ publishVotes = [n \in Nodes |-> {}] 89 | 90 | \* Bootstrap node n with the initial state and config 91 | SetInitialState(n) == 92 | /\ lastAcceptedConfiguration[n] = {} \* not already bootstrapped 93 | /\ Assert(lastAcceptedTerm[n] = 0, "lastAcceptedTerm should be 0") 94 | /\ Assert(lastCommittedConfiguration[n] = {}, "lastCommittedConfiguration should be empty") 95 | /\ Assert(lastPublishedVersion[n] = 0, "lastPublishedVersion should be 0") 96 | /\ Assert(lastPublishedConfiguration[n] = {}, "lastPublishedConfiguration should be empty") 97 | /\ Assert(electionWon[n] = FALSE, "electionWon should be FALSE") 98 | /\ Assert(joinVotes[n] = {}, "joinVotes should be empty") 99 | /\ Assert(publishVotes[n] = {}, "publishVotes should be empty") 100 | /\ lastAcceptedConfiguration' = [lastAcceptedConfiguration EXCEPT ![n] = initialConfiguration] 101 | /\ lastAcceptedValue' = [lastAcceptedValue EXCEPT ![n] = initialValue] 102 | /\ lastCommittedConfiguration' = [lastCommittedConfiguration EXCEPT ![n] = initialConfiguration] 103 | /\ Assert(lastAcceptedTerm[n] = 0, "lastAcceptedTerm should be 0") 104 | /\ Assert(lastAcceptedConfiguration'[n] /= {}, "lastAcceptedConfiguration should be non-empty") 105 | /\ Assert(lastCommittedConfiguration'[n] /= {}, "lastCommittedConfiguration should be non-empty") 106 | /\ UNCHANGED <> 109 | 110 | \* Send join request from node n to node nm for term t 111 | HandleStartJoin(n, nm, t) == 112 | /\ t > currentTerm[n] 113 | /\ LET 114 | joinRequest == [method |-> Join, 115 | source |-> n, 116 | dest |-> nm, 117 | term |-> t, 118 | laTerm |-> lastAcceptedTerm[n], 119 | laVersion |-> lastAcceptedVersion[n]] 120 | IN 121 | /\ currentTerm' = [currentTerm EXCEPT ![n] = t] 122 | /\ lastPublishedVersion' = [lastPublishedVersion EXCEPT ![n] = 0] 123 | /\ lastPublishedConfiguration' = [lastPublishedConfiguration EXCEPT ![n] = lastAcceptedConfiguration[n]] 124 | /\ startedJoinSinceLastReboot' = [startedJoinSinceLastReboot EXCEPT ![n] = TRUE] 125 | /\ 
electionWon' = [electionWon EXCEPT ![n] = FALSE] 126 | /\ joinVotes' = [joinVotes EXCEPT ![n] = {}] 127 | /\ publishVotes' = [publishVotes EXCEPT ![n] = {}] 128 | /\ messages' = messages \cup { joinRequest } 129 | /\ UNCHANGED <> 131 | 132 | \* node n handles a join request and checks if it has received enough joins (= votes) 133 | \* for its term to be elected as master 134 | HandleJoin(n, m) == 135 | /\ m.method = Join 136 | /\ m.term = currentTerm[n] 137 | /\ startedJoinSinceLastReboot[n] 138 | /\ \/ m.laTerm < lastAcceptedTerm[n] 139 | \/ /\ m.laTerm = lastAcceptedTerm[n] 140 | /\ m.laVersion <= lastAcceptedVersion[n] 141 | /\ lastAcceptedConfiguration[n] /= {} \* must be bootstrapped 142 | /\ joinVotes' = [joinVotes EXCEPT ![n] = @ \cup { m.source }] 143 | /\ electionWon' = [electionWon EXCEPT ![n] = IsElectionQuorum(n, joinVotes'[n])] 144 | /\ IF electionWon[n] = FALSE /\ electionWon'[n] 145 | THEN 146 | \* initiating publish version with last accepted version to enable client requests 147 | /\ lastPublishedVersion' = [lastPublishedVersion EXCEPT ![n] = lastAcceptedVersion[n]] 148 | ELSE 149 | UNCHANGED <> 150 | /\ UNCHANGED <> 154 | 155 | \* client causes a cluster state change val with configuration cfg 156 | HandleClientValue(n, t, v, val, cfg) == 157 | /\ electionWon[n] 158 | /\ lastPublishedVersion[n] = lastAcceptedVersion[n] \* means we have the last published value / config (useful for CAS operations, where we need to read the previous value first) 159 | /\ t = currentTerm[n] 160 | /\ v > lastPublishedVersion[n] 161 | /\ cfg /= lastAcceptedConfiguration[n] => lastCommittedConfiguration[n] = lastAcceptedConfiguration[n] \* only allow reconfiguration if there is not already a reconfiguration in progress 162 | /\ IsQuorum(joinVotes[n], cfg) \* only allow reconfiguration if we have a quorum of (join) votes for the new config 163 | /\ LET 164 | publishRequests == { [method |-> PublishRequest, 165 | source |-> n, 166 | dest |-> ns, 167 | term |-> t, 168 | version |-> v, 169 | value |-> val, 170 | config |-> cfg, 171 | commConf |-> lastCommittedConfiguration[n]] : ns \in Nodes } 172 | newEntry == [prevT |-> lastAcceptedTerm[n], 173 | prevV |-> lastAcceptedVersion[n], 174 | nextT |-> t, 175 | nextV |-> v] 176 | matchingElems == { e \in descendant : 177 | /\ e.nextT = newEntry.prevT 178 | /\ e.nextV = newEntry.prevV } 179 | newTransitiveElems == { [prevT |-> e.prevT, 180 | prevV |-> e.prevV, 181 | nextT |-> newEntry.nextT, 182 | nextV |-> newEntry.nextV] : e \in matchingElems } 183 | IN 184 | /\ descendant' = descendant \cup {newEntry} \cup newTransitiveElems 185 | /\ lastPublishedVersion' = [lastPublishedVersion EXCEPT ![n] = v] 186 | /\ lastPublishedConfiguration' = [lastPublishedConfiguration EXCEPT ![n] = cfg] 187 | /\ publishVotes' = [publishVotes EXCEPT ![n] = {}] \* publishVotes are only counted per publish version 188 | /\ messages' = messages \cup publishRequests 189 | /\ UNCHANGED <> 192 | 193 | \* handle publish request m on node n 194 | HandlePublishRequest(n, m) == 195 | /\ m.method = PublishRequest 196 | /\ m.term = currentTerm[n] 197 | /\ (m.term = lastAcceptedTerm[n]) => (m.version > lastAcceptedVersion[n]) 198 | /\ lastAcceptedTerm' = [lastAcceptedTerm EXCEPT ![n] = m.term] 199 | /\ lastAcceptedVersion' = [lastAcceptedVersion EXCEPT ![n] = m.version] 200 | /\ lastAcceptedValue' = [lastAcceptedValue EXCEPT ![n] = m.value] 201 | /\ lastAcceptedConfiguration' = [lastAcceptedConfiguration EXCEPT ![n] = m.config] 202 | /\ lastCommittedConfiguration' = 
[lastCommittedConfiguration EXCEPT ![n] = m.commConf] 203 | /\ LET 204 | response == [method |-> PublishResponse, 205 | source |-> n, 206 | dest |-> m.source, 207 | term |-> m.term, 208 | version |-> m.version] 209 | IN 210 | /\ messages' = messages \cup {response} 211 | /\ UNCHANGED <> 214 | 215 | \* node n commits a change 216 | HandlePublishResponse(n, m) == 217 | /\ m.method = PublishResponse 218 | /\ electionWon[n] 219 | /\ m.term = currentTerm[n] 220 | /\ m.version = lastPublishedVersion[n] 221 | /\ publishVotes' = [publishVotes EXCEPT ![n] = @ \cup {m.source}] 222 | /\ IF 223 | IsPublishQuorum(n, publishVotes'[n]) 224 | THEN 225 | LET 226 | commitRequests == { [method |-> Commit, 227 | source |-> n, 228 | dest |-> ns, 229 | term |-> currentTerm[n], 230 | version |-> lastPublishedVersion[n]] : ns \in Nodes } 231 | IN 232 | /\ messages' = messages \cup commitRequests 233 | ELSE 234 | UNCHANGED <> 235 | /\ UNCHANGED <> 239 | 240 | \* apply committed configuration to node n 241 | HandleCommit(n, m) == 242 | /\ m.method = Commit 243 | /\ m.term = currentTerm[n] 244 | /\ m.term = lastAcceptedTerm[n] 245 | /\ m.version = lastAcceptedVersion[n] 246 | /\ (electionWon[n] => lastAcceptedVersion[n] = lastPublishedVersion[n]) 247 | /\ lastCommittedConfiguration' = [lastCommittedConfiguration EXCEPT ![n] = lastAcceptedConfiguration[n]] 248 | /\ UNCHANGED <> 251 | 252 | \* crash/restart node n (loses ephemeral state) 253 | RestartNode(n) == 254 | /\ joinVotes' = [joinVotes EXCEPT ![n] = {}] 255 | /\ startedJoinSinceLastReboot' = [startedJoinSinceLastReboot EXCEPT ![n] = FALSE] 256 | /\ electionWon' = [electionWon EXCEPT ![n] = FALSE] 257 | /\ lastPublishedVersion' = [lastPublishedVersion EXCEPT ![n] = 0] 258 | /\ lastPublishedConfiguration' = [lastPublishedConfiguration EXCEPT ![n] = lastAcceptedConfiguration[n]] 259 | /\ publishVotes' = [publishVotes EXCEPT ![n] = {}] 260 | /\ UNCHANGED <> 263 | 264 | \* next-step relation 265 | Next == 266 | \/ \E n \in Nodes : SetInitialState(n) 267 | \/ \E n, nm \in Nodes : \E t \in Terms : HandleStartJoin(n, nm, t) 268 | \/ \E m \in messages : HandleJoin(m.dest, m) 269 | \/ \E n \in Nodes : \E t \in Terms : \E v \in Versions : \E val \in Values : \E vs \in ValidConfigs : HandleClientValue(n, t, v, val, vs) 270 | \/ \E m \in messages : HandlePublishRequest(m.dest, m) 271 | \/ \E m \in messages : HandlePublishResponse(m.dest, m) 272 | \/ \E m \in messages : HandleCommit(m.dest, m) 273 | \/ \E n \in Nodes : RestartNode(n) 274 | 275 | ---- 276 | 277 | \* Invariants 278 | 279 | SingleNodeInvariant == 280 | \A n \in Nodes : 281 | /\ lastAcceptedTerm[n] <= currentTerm[n] 282 | /\ electionWon[n] = IsElectionQuorum(n, joinVotes[n]) \* cached value is consistent 283 | /\ IF electionWon[n] THEN lastPublishedVersion[n] >= lastAcceptedVersion[n] ELSE lastPublishedVersion[n] = 0 284 | /\ electionWon[n] => startedJoinSinceLastReboot[n] 285 | /\ publishVotes[n] /= {} => electionWon[n] 286 | 287 | OneMasterPerTerm == 288 | \A m1, m2 \in messages: 289 | /\ m1.method = PublishRequest 290 | /\ m2.method = PublishRequest 291 | /\ m1.term = m2.term 292 | => m1.source = m2.source 293 | 294 | LogMatching == 295 | \A m1, m2 \in messages: 296 | /\ m1.method = PublishRequest 297 | /\ m2.method = PublishRequest 298 | /\ m1.term = m2.term 299 | /\ m1.version = m2.version 300 | => m1.value = m2.value 301 | 302 | CommittedPublishRequest(mp) == 303 | /\ mp.method = PublishRequest 304 | /\ \E mc \in messages: 305 | /\ mc.method = Commit 306 | /\ mp.term = mc.term 307 | /\ mp.version = 
mc.version 308 | 309 | DescendantRelationIsStrictlyOrdered == 310 | \A d \in descendant: 311 | /\ d.prevT <= d.nextT 312 | /\ d.prevV < d.nextV 313 | 314 | DescendantRelationIsTransitive == 315 | \A d1, d2 \in descendant: 316 | d1.nextT = d2.prevT /\ d1.nextV = d2.prevV 317 | => [prevT |-> d1.prevT, prevV |-> d1.prevV, nextT |-> d2.nextT, nextV |-> d2.nextV] \in descendant 318 | 319 | NewerOpsBasedOnOlderCommittedOps == 320 | \A m1, m2 \in messages : 321 | /\ CommittedPublishRequest(m1) 322 | /\ m2.method = PublishRequest 323 | /\ m2.term >= m1.term 324 | /\ m2.version > m1.version 325 | => [prevT |-> m1.term, prevV |-> m1.version, nextT |-> m2.term, nextV |-> m2.version] \in descendant 326 | 327 | \* main invariant (follows from NewerOpsBasedOnOlderCommittedOps): 328 | CommittedValuesDescendantsFromCommittedValues == 329 | \A m1, m2 \in messages : 330 | /\ CommittedPublishRequest(m1) 331 | /\ CommittedPublishRequest(m2) 332 | /\ \/ m1.term /= m2.term 333 | \/ m1.version /= m2.version 334 | => 335 | \/ [prevT |-> m1.term, prevV |-> m1.version, nextT |-> m2.term, nextV |-> m2.version] \in descendant 336 | \/ [prevT |-> m2.term, prevV |-> m2.version, nextT |-> m1.term, nextV |-> m1.version] \in descendant 337 | 338 | CommittedValuesDescendantsFromInitialValue == 339 | \E v \in InitialVersions : 340 | /\ \E n \in Nodes : v = initialAcceptedVersion[n] 341 | /\ \E votes \in SUBSET(initialConfiguration) : 342 | /\ IsQuorum(votes, initialConfiguration) 343 | /\ \A n \in votes : initialAcceptedVersion[n] <= v 344 | /\ \A m \in messages : 345 | CommittedPublishRequest(m) 346 | => 347 | [prevT |-> 0, prevV |-> v, nextT |-> m.term, nextV |-> m.version] \in descendant 348 | 349 | CommitHasQuorumVsPreviousCommittedConfiguration == 350 | \A mc \in messages: mc.method = Commit 351 | => (\A mprq \in messages: (/\ mprq.method = PublishRequest 352 | /\ mprq.term = mc.term 353 | /\ mprq.version = mc.version) 354 | 355 | => IsQuorum({mprs.source: mprs \in {mprs \in messages: /\ mprs.method = PublishResponse 356 | /\ mprs.term = mprq.term 357 | /\ mprs.version = mprq.version 358 | }}, mprq.commConf)) 359 | 360 | P2bInvariant == 361 | \A mc \in messages: mc.method = Commit 362 | => (\A mprq \in messages: mprq.method = PublishRequest 363 | => (mprq.term > mc.term => mprq.version > mc.version)) 364 | 365 | \* State-exploration limits 366 | StateConstraint == 367 | /\ \A n \in Nodes: IF currentTerm[n] <= 1 THEN lastPublishedVersion[n] <= 2 ELSE lastPublishedVersion[n] <= 3 368 | /\ Cardinality(messages) <= 15 369 | 370 | ==================================================================================================== 371 | -------------------------------------------------------------------------------- /ZenWithTerms/tla/ZenWithTerms.toolbox/.project: -------------------------------------------------------------------------------- 1 | 2 | 3 | ZenWithTerms 4 | 5 | 6 | 7 | 8 | 9 | toolbox.builder.TLAParserBuilder 10 | 11 | 12 | 13 | 14 | toolbox.builder.PCalAlgorithmSearchingBuilder 15 | 16 | 17 | 18 | 19 | 20 | toolbox.natures.TLANature 21 | 22 | 23 | 24 | ZenWithTerms.tla 25 | 1 26 | PARENT-1-PROJECT_LOC/ZenWithTerms.tla 27 | 28 | 29 | 30 | -------------------------------------------------------------------------------- /ZenWithTerms/tla/ZenWithTerms.toolbox/.settings/org.lamport.tla.toolbox.prefs: -------------------------------------------------------------------------------- 1 | ProjectRootFile=PARENT-1-PROJECT_LOC/ZenWithTerms.tla 2 | eclipse.preferences.version=1 3 | 
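The Toolbox model configuration for this specification is kept in the `ZenWithTerms___model.launch` file below, whose XML content is not reproduced here. For running TLC on `ZenWithTerms.tla` outside the Toolbox, a hand-written model along the following lines should work. It is only a sketch: the module name `MC`, the finite bounds `MCTerms`/`MCVersions`/`MCInitialVersions`, the three-node/two-value constant assignments and the choice of invariants are illustrative assumptions, not the settings encoded in the actual launch file. The definition overrides are needed because `Terms`, `Versions` and `InitialVersions` are defined as `Nat` in the specification, which TLC cannot enumerate.

---- MODULE MC ----
EXTENDS ZenWithTerms
\* Finite bounds replacing the unbounded sets of the specification (assumed values)
MCTerms == 0..2
MCVersions == 0..3
MCInitialVersions == 0..1
====

\* MC.cfg (hypothetical; the real settings live in ZenWithTerms___model.launch)
INIT Init
NEXT Next
CONSTANTS
    \* model values standing in for master-eligible nodes and cluster-state values
    Nodes = {n1, n2, n3}
    Values = {v1, v2}
    \* RPC message types bound to model values of the same name
    Join = Join
    PublishRequest = PublishRequest
    PublishResponse = PublishResponse
    Commit = Commit
    \* definition overrides bounding the state space
    Terms <- MCTerms
    Versions <- MCVersions
    InitialVersions <- MCInitialVersions
CONSTRAINT StateConstraint
INVARIANTS
    OneMasterPerTerm
    LogMatching
    CommittedValuesDescendantsFromCommittedValues

The other invariants defined at the end of the specification can be added to the `INVARIANTS` list in the same way. Saved next to `ZenWithTerms.tla`, this pair can be checked headlessly as the README describes, e.g. `java -cp tla2tools.jar tlc2.TLC -deadlock -workers 12 MC`.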
-------------------------------------------------------------------------------- /ZenWithTerms/tla/ZenWithTerms.toolbox/ZenWithTerms___model.launch: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | -------------------------------------------------------------------------------- /cluster/isabelle/Implementation.thy: -------------------------------------------------------------------------------- 1 | section \Implementation\ 2 | 3 | text \This section presents the implementation of the algorithm.\ 4 | 5 | theory Implementation 6 | imports Preliminaries 7 | begin 8 | 9 | subsection \Protocol messages\ 10 | 11 | text \The 12 | proven-safe core of the protocol works by sending messages as described here. The remainder of the 13 | protocol may send other messages too, and may drop, reorder or duplicate any of these messages, but 14 | must not send these messages itself to ensure safety. Another way of thinking of these messages is 15 | to consider them as ``fire-and-forget'' RPC invocations that, on receipt, call some local method, maybe 16 | update the receiving node's state, and maybe yield some further messages. The @{type nat} parameter to each 17 | message refers to a slot number.\ 18 | 19 | datatype TermOption = NO_TERM | SomeTerm Term 20 | 21 | instantiation TermOption :: linorder 22 | begin 23 | 24 | fun less_TermOption :: "TermOption \ TermOption \ bool" 25 | where "t < NO_TERM = False" 26 | | "NO_TERM < SomeTerm t = True" 27 | | "SomeTerm t\<^sub>1 < SomeTerm t\<^sub>2 = (t\<^sub>1 < t\<^sub>2)" 28 | 29 | definition less_eq_TermOption :: "TermOption \ TermOption \ bool" 30 | where "(t\<^sub>1 :: TermOption) \ t\<^sub>2 \ t\<^sub>1 = t\<^sub>2 \ t\<^sub>1 < t\<^sub>2" 31 | 32 | instance proof 33 | fix x y z :: TermOption 34 | show "(x < y) = (x \ y \ \ y \ x)" unfolding less_eq_TermOption_def apply auto 35 | using less_TermOption.elims apply fastforce 36 | by (metis less_TermOption.elims(2) less_TermOption.simps(3) less_not_sym) 37 | 38 | show "x \ x" by (simp add: less_eq_TermOption_def) 39 | 40 | show "x \ y \ y \ z \ x \ z" unfolding less_eq_TermOption_def apply auto 41 | by (metis TermOption.distinct(1) TermOption.inject dual_order.strict_trans less_TermOption.elims(2) less_TermOption.elims(3)) 42 | 43 | show "x \ y \ y \ x \ x = y" unfolding less_eq_TermOption_def apply auto 44 | using \(x < y) = (x \ y \ \ y \ x)\ less_eq_TermOption_def by blast 45 | 46 | show "x \ y \ y \ x" unfolding less_eq_TermOption_def apply auto 47 | by (metis TermOption.distinct(1) TermOption.inject less_TermOption.elims(3) neqE) 48 | qed 49 | 50 | end 51 | 52 | lemma NO_TERM_le [simp]: "NO_TERM \ t" by (cases t, simp_all add: less_eq_TermOption_def) 53 | lemma le_NO_TERM [simp]: "(t \ NO_TERM) = (t = NO_TERM)" by (cases t, simp_all add: less_eq_TermOption_def) 54 | lemma le_SomeTerm [simp]: "(SomeTerm t\<^sub>1 \ SomeTerm t\<^sub>2) = (t\<^sub>1 \ t\<^sub>2)" by (auto simp add: less_eq_TermOption_def) 55 | 56 | datatype Message 57 | = StartJoin Term 58 | | Vote Slot Term TermOption 59 | | ClientValue Value 60 | | PublishRequest Slot Term Value 61 | | PublishResponse Slot Term 62 | | ApplyCommit Slot Term 63 | | CatchUpRequest 64 | | CatchUpResponse Slot "Node 
set" ClusterState 65 | | DiscardJoinVotes 66 | | Reboot 67 | 68 | text \Some prose descriptions of these messages follows, in order to give a bit more of an 69 | intuitive understanding of their purposes.\ 70 | 71 | text \The message @{term "StartJoin t"} may be sent by any node to attempt to start a master 72 | election in the given term @{term t}.\ 73 | 74 | text \The message @{term "Vote i t a"} may be sent by a node in response 75 | to a @{term StartJoin} message. It indicates that the sender knows all committed values for slots 76 | strictly below @{term i}, and that the sender will no longer vote (i.e. send an @{term 77 | PublishResponse}) in any term prior to @{term t}. The field @{term a} is either @{term 78 | None} or @{term "Some t'"}. In the former case this indicates that 79 | the node has not yet sent any @{term PublishResponse} message in slot @{term i}, and in the latter 80 | case it indicates that the largest term in which it has previously sent an @{term PublishResponse} 81 | message is @{term t'}. All 82 | nodes must avoid sending a @{term Vote} message to two different masters in the same term.\ 83 | 84 | text \The message @{term "ClientValue x"} may be sent by any node and indicates an attempt to 85 | reach consensus on the value @{term x}.\ 86 | 87 | text \The message @{term "PublishRequest i t v"} may be sent by the elected master of term 88 | @{term t} to request the other master-eligible nodes to vote for value @{term v} to be committed in 89 | slot @{term i}.\ 90 | 91 | text \The message @{term "PublishResponse i t"} may be sent by node in response to 92 | the corresponding @{term PublishRequest} message, indicating that the sender votes for the value 93 | proposed by the master of term @{term t} to be committed in slot @{term i}.\ 94 | 95 | text \The message @{term "ApplyCommit i t"} indicates that the value proposed by the master of 96 | term @{term t} in slot @{term i} received a quorum of votes and is therefore committed.\ 97 | 98 | text \The message @{term Reboot} may be sent by any node to represent the restart of a node, which 99 | loses any ephemeral state.\ 100 | 101 | text \The abstract model of Zen keeps track of the set of all messages that have ever been 102 | sent, and asserts that this set obeys certain invariants, listed below. 
Further below, it will be 103 | shown that these invariants imply that each slot obeys the @{term oneSlot} invariants above and 104 | hence that each slot cannot see inconsistent committed values.\ 105 | 106 | datatype Destination = Broadcast | OneNode Node 107 | 108 | record RoutedMessage = 109 | sender :: Node 110 | destination :: Destination 111 | payload :: Message 112 | 113 | text \It will be useful to be able to choose the optional term with the greater term, 114 | so here is a function that does that.\ 115 | 116 | subsection \Node implementation\ 117 | 118 | text \Each node holds the following local data.\ 119 | 120 | record TermValue = 121 | tvTerm :: Term 122 | tvValue :: Value 123 | 124 | record NodeData = 125 | currentNode :: Node 126 | currentTerm :: Term 127 | (* committed state *) 128 | firstUncommittedSlot :: Slot 129 | currentVotingNodes :: "Node set" 130 | currentClusterState :: ClusterState 131 | (* accepted state *) 132 | lastAcceptedData :: "TermValue option" 133 | (* election state *) 134 | joinVotes :: "Node set" 135 | electionWon :: bool 136 | (* publish state *) 137 | publishPermitted :: bool 138 | publishVotes :: "Node set" 139 | 140 | definition lastAcceptedValue :: "NodeData \ Value" 141 | where "lastAcceptedValue nd \ tvValue (THE lad. lastAcceptedData nd = Some lad)" 142 | 143 | definition lastAcceptedTerm :: "NodeData \ TermOption" 144 | where "lastAcceptedTerm nd \ case lastAcceptedData nd of None \ NO_TERM | Some lad \ SomeTerm (tvTerm lad)" 145 | 146 | definition isQuorum :: "NodeData \ Node set \ bool" 147 | where "isQuorum nd q \ q \ majorities (currentVotingNodes nd)" 148 | 149 | lemma lastAcceptedValue_joinVotes_update[simp]: "lastAcceptedValue (joinVotes_update f nd) = lastAcceptedValue nd" by (simp add: lastAcceptedValue_def) 150 | lemma lastAcceptedTerm_joinVotes_update[simp]: "lastAcceptedTerm (joinVotes_update f nd) = lastAcceptedTerm nd" by (simp add: lastAcceptedTerm_def) 151 | 152 | lemma lastAcceptedValue_electionWon_update[simp]: "lastAcceptedValue (electionWon_update f nd) = lastAcceptedValue nd" by (simp add: lastAcceptedValue_def) 153 | lemma lastAcceptedTerm_electionWon_update[simp]: "lastAcceptedTerm (electionWon_update f nd) = lastAcceptedTerm nd" by (simp add: lastAcceptedTerm_def) 154 | 155 | text \This method publishes a value via a @{term PublishRequest} message.\ 156 | 157 | definition publishValue :: "Value \ NodeData \ (NodeData * Message option)" 158 | where 159 | "publishValue x nd \ 160 | if electionWon nd \ publishPermitted nd 161 | then ( nd \ publishPermitted := False \ 162 | , Some (PublishRequest 163 | (firstUncommittedSlot nd) 164 | (currentTerm nd) x) ) 165 | else (nd, None)" 166 | 167 | text \This method updates the node's current term (if necessary) and discards any data associated 168 | with the previous term.\ 169 | 170 | definition ensureCurrentTerm :: "Term \ NodeData \ NodeData" 171 | where 172 | "ensureCurrentTerm t nd \ 173 | if t \ currentTerm nd 174 | then nd 175 | else nd 176 | \ joinVotes := {} 177 | , currentTerm := t 178 | , electionWon := False 179 | , publishPermitted := True 180 | , publishVotes := {} \" 181 | 182 | text \This method updates the node's state on receipt of a vote (a @{term Vote}) in an election.\ 183 | 184 | definition addElectionVote :: "Node \ Slot => TermOption \ NodeData \ NodeData" 185 | where 186 | "addElectionVote s i a nd \ let newVotes = insert s (joinVotes nd) 187 | in nd \ joinVotes := newVotes 188 | , electionWon := isQuorum nd newVotes \" 189 | 190 | text \Clients request 
the cluster to achieve consensus on certain values using the @{term ClientValue} 191 | message which is handled as follows.\ 192 | 193 | definition handleClientValue :: "Value \ NodeData \ (NodeData * Message option)" 194 | where 195 | "handleClientValue x nd \ if lastAcceptedTerm nd = NO_TERM then publishValue x nd else (nd, None)" 196 | 197 | text \A @{term StartJoin} message is checked for acceptability and then handled by updating the 198 | node's term and yielding a @{term Vote} message as follows.\ 199 | 200 | definition handleStartJoin :: "Term \ NodeData \ (NodeData * Message option)" 201 | where 202 | "handleStartJoin t nd \ 203 | if currentTerm nd < t 204 | then ( ensureCurrentTerm t nd 205 | , Some (Vote (firstUncommittedSlot nd) 206 | t 207 | (lastAcceptedTerm nd))) 208 | else (nd, None)" 209 | 210 | text \A @{term Vote} message is checked for acceptability and then handled as follows, perhaps 211 | yielding a @{term PublishRequest} message.\ 212 | 213 | definition handleVote :: "Node \ Slot \ Term \ TermOption \ NodeData \ (NodeData * Message option)" 214 | where 215 | "handleVote s i t a nd \ 216 | if t = currentTerm nd 217 | \ (i < firstUncommittedSlot nd 218 | \ (i = firstUncommittedSlot nd \ a \ lastAcceptedTerm nd)) 219 | then let nd1 = addElectionVote s i a nd 220 | in (if lastAcceptedTerm nd = NO_TERM then (nd1, None) else publishValue (lastAcceptedValue nd1) nd1) 221 | else (nd, None)" 222 | 223 | text \A @{term PublishRequest} message is checked for acceptability and then handled as follows, 224 | yielding a @{term PublishResponse} message.\ 225 | 226 | definition handlePublishRequest :: "Slot \ Term \ Value \ NodeData \ (NodeData * Message option)" 227 | where 228 | "handlePublishRequest i t x nd \ 229 | if i = firstUncommittedSlot nd 230 | \ t = currentTerm nd 231 | then ( nd \ lastAcceptedData := Some \ tvTerm = t, tvValue = x \ \ 232 | , Some (PublishResponse i t)) 233 | else (nd, None)" 234 | 235 | text \This method sends an @{term ApplyCommit} message if a quorum of votes has been received.\ 236 | 237 | definition commitIfQuorate :: "NodeData \ (NodeData * Message option)" 238 | where 239 | "commitIfQuorate nd = (nd, if isQuorum nd (publishVotes nd) 240 | then Some (ApplyCommit (firstUncommittedSlot nd) (currentTerm nd)) else None)" 241 | 242 | text \A @{term PublishResponse} message is checked for acceptability and handled as follows. 
If 243 | this message, together with the previously-received messages, forms a quorum of votes then the 244 | value is committed, yielding an @{term ApplyCommit} message.\ 245 | 246 | definition handlePublishResponse :: "Node \ Slot \ Term \ NodeData \ (NodeData * Message option)" 247 | where 248 | "handlePublishResponse s i t nd \ 249 | if i = firstUncommittedSlot nd \ t = currentTerm nd 250 | then commitIfQuorate (nd \ publishVotes := insert s (publishVotes nd) \) 251 | else (nd, None)" 252 | 253 | text \This method updates the node's state when a value is committed.\ 254 | 255 | definition applyAcceptedValue :: "NodeData \ NodeData" 256 | where 257 | "applyAcceptedValue nd \ case lastAcceptedValue nd of 258 | NoOp \ nd 259 | | Reconfigure votingNodes \ nd 260 | \ currentVotingNodes := set votingNodes 261 | , electionWon := joinVotes nd \ majorities (set votingNodes) \ 262 | | ClusterStateDiff diff \ nd \ currentClusterState := diff (currentClusterState nd) \" 263 | 264 | text \An @{term ApplyCommit} message is applied to the current node's state, updating its configuration 265 | and \texttt{ClusterState} via the @{term applyValue} method. It yields no messages.\ 266 | 267 | definition handleApplyCommit :: "Slot \ Term \ NodeData \ NodeData" 268 | where 269 | "handleApplyCommit i t nd \ 270 | if i = firstUncommittedSlot nd \ lastAcceptedTerm nd = SomeTerm t 271 | then (applyAcceptedValue nd) 272 | \ firstUncommittedSlot := i + 1 273 | , lastAcceptedData := None 274 | , publishPermitted := True 275 | , publishVotes := {} \ 276 | else nd" 277 | 278 | definition handleCatchUpRequest :: "NodeData \ (NodeData * Message option)" 279 | where 280 | "handleCatchUpRequest nd = (nd, Some (CatchUpResponse (firstUncommittedSlot nd) 281 | (currentVotingNodes nd) (currentClusterState nd)))" 282 | 283 | definition handleCatchUpResponse :: "Slot \ Node set \ ClusterState \ NodeData \ NodeData" 284 | where 285 | "handleCatchUpResponse i conf cs nd \ 286 | if firstUncommittedSlot nd < i 287 | then nd \ firstUncommittedSlot := i 288 | , publishPermitted := True 289 | , publishVotes := {} 290 | , currentVotingNodes := conf 291 | , currentClusterState := cs 292 | , lastAcceptedData := None 293 | , joinVotes := {} 294 | , electionWon := False \ 295 | else nd" 296 | 297 | text \A @{term Reboot} message simulates the effect of a reboot, discarding any ephemeral state but 298 | preserving the persistent state. It yields no messages.\ 299 | 300 | definition handleReboot :: "NodeData \ NodeData" 301 | where 302 | "handleReboot nd \ 303 | \ currentNode = currentNode nd 304 | , currentTerm = currentTerm nd 305 | , firstUncommittedSlot = firstUncommittedSlot nd 306 | , currentVotingNodes = currentVotingNodes nd 307 | , currentClusterState = currentClusterState nd 308 | , lastAcceptedData = lastAcceptedData nd 309 | , joinVotes = {} 310 | , electionWon = False 311 | , publishPermitted = False 312 | , publishVotes = {} \" 313 | 314 | text \A @{term DiscardJoinVotes} message discards the votes received by a node. It yields 315 | no messages.\ 316 | 317 | definition handleDiscardJoinVotes :: "NodeData \ NodeData" 318 | where 319 | "handleDiscardJoinVotes nd \ nd \ electionWon := False, joinVotes := {} \" 320 | 321 | text \This function dispatches incoming messages to the appropriate handler method, and 322 | routes any responses to the appropriate places. 
In particular, @{term Vote} messages 323 | (sent by the @{term handleStartJoin} method) and 324 | @{term PublishResponse} messages (sent by the @{term handlePublishRequest} method) are 325 | only sent to a single node, whereas all other responses are broadcast to all nodes.\ 326 | 327 | definition ProcessMessage :: "NodeData \ RoutedMessage \ (NodeData * RoutedMessage option)" 328 | where 329 | "ProcessMessage nd msg \ 330 | let respondTo = 331 | (\ d (nd, mmsg). case mmsg of 332 | None \ (nd, None) 333 | | Some msg \ (nd, 334 | Some \ sender = currentNode nd, destination = d, 335 | payload = msg \)); 336 | respondToSender = respondTo (OneNode (sender msg)); 337 | respondToAll = respondTo Broadcast 338 | in 339 | if destination msg \ { Broadcast, OneNode (currentNode nd) } 340 | then case payload msg of 341 | StartJoin t 342 | \ respondToSender (handleStartJoin t nd) 343 | | Vote i t a 344 | \ respondToAll (handleVote (sender msg) i t a nd) 345 | | ClientValue x 346 | \ respondToAll (handleClientValue x nd) 347 | | PublishRequest i t x 348 | \ respondToSender (handlePublishRequest i t x nd) 349 | | PublishResponse i t 350 | \ respondToAll (handlePublishResponse (sender msg) i t nd) 351 | | ApplyCommit i t 352 | \ (handleApplyCommit i t nd, None) 353 | | CatchUpRequest 354 | \ respondToSender (handleCatchUpRequest nd) 355 | | CatchUpResponse i conf cs 356 | \ (handleCatchUpResponse i conf cs nd, None) 357 | | DiscardJoinVotes 358 | \ (handleDiscardJoinVotes nd, None) 359 | | Reboot 360 | \ (handleReboot nd, None) 361 | else (nd, None)" 362 | 363 | text \Nodes are initialised to this state. The data required is the initial configuration, @{term Q\<^sub>0} 364 | and the initial \texttt{ClusterState}, here shown as @{term "ClusterState 0"}.\ 365 | 366 | definition initialNodeState :: "Node \ NodeData" 367 | where "initialNodeState n = 368 | \ currentNode = n 369 | , currentTerm = 0 370 | , firstUncommittedSlot = 0 371 | , currentVotingNodes = V\<^sub>0 372 | , currentClusterState = CS\<^sub>0 373 | , lastAcceptedData = None 374 | , joinVotes = {} 375 | , electionWon = False 376 | , publishPermitted = False 377 | , publishVotes = {} \" 378 | (* Note: publishPermitted could be True initially, but in the actual implementation we call the 379 | same constructor whether we're starting up from afresh or recovering from a reboot, and the value 380 | is really unimportant as we need to run an election in a new term before becoming master anyway, 381 | so it's hard to justify putting any effort into calculating different values for these two cases. 382 | Instead just set it to False initially.*) 383 | 384 | end 385 | -------------------------------------------------------------------------------- /cluster/isabelle/Monadic.thy: -------------------------------------------------------------------------------- 1 | theory Monadic 2 | imports Implementation "~~/src/HOL/Library/Monad_Syntax" 3 | begin 4 | 5 | datatype Exception = IllegalArgumentException 6 | 7 | datatype ('e,'a) Result = Success 'a | Exception 'e 8 | 9 | datatype 'a Action = Action "NodeData \ (NodeData * RoutedMessage list * (Exception,'a) Result)" 10 | 11 | definition runM :: "'a Action \ NodeData \ (NodeData * RoutedMessage list * (Exception,'a) Result)" 12 | where "runM ma \ case ma of Action unwrapped_ma \ unwrapped_ma" 13 | 14 | lemma runM_Action[simp]: "runM (Action f) = f" by (simp add: runM_def) 15 | lemma runM_inject[intro]: "(\nd. 
runM ma nd = runM mb nd) \ ma = mb" by (cases ma, cases mb, auto simp add: runM_def) 16 | 17 | definition return :: "'a \ 'a Action" where "return a \ Action (\ nd. (nd, [], Success a))" 18 | 19 | lemma runM_return[simp]: "runM (return a) nd = (nd, [], Success a)" unfolding runM_def return_def by simp 20 | 21 | definition Action_bind :: "'a Action \ ('a \ 'b Action) \ 'b Action" 22 | where "Action_bind ma mf \ Action (\ nd0. case runM ma nd0 of 23 | (nd1, msgs1, result1) \ (case result1 of 24 | Exception e \ (nd1, msgs1, Exception e) 25 | | Success a \ (case runM (mf a) nd1 of 26 | (nd2, msgs2, result2) \ (nd2, msgs1 @ msgs2, result2))))" 27 | 28 | adhoc_overloading bind Action_bind 29 | 30 | lemma runM_bind: "runM (a \ f) nd0 = (case runM a nd0 of (nd1, msgs1, result1) \ (case result1 of Exception e \ (nd1, msgs1, Exception e) | Success b \ (case runM (f b) nd1 of (nd2, msgs2, c) \ (nd2, msgs1@msgs2, c))))" 31 | unfolding Action_bind_def by auto 32 | 33 | lemma return_bind[simp]: "do { a' <- return a; f a' } = f a" 34 | apply (intro runM_inject) by (simp add: runM_bind) 35 | lemma bind_return[simp]: "do { a' <- f; return a' } = f" 36 | proof (intro runM_inject) 37 | fix nd 38 | obtain nd1 msgs1 result1 where result1: "runM f nd = (nd1, msgs1, result1)" by (cases "runM f nd", blast) 39 | show "runM (f \ return) nd = runM f nd" 40 | by (cases result1, simp_all add: runM_bind result1) 41 | qed 42 | 43 | lemma bind_bind_assoc[simp]: 44 | fixes f :: "'a Action" 45 | shows "do { b <- do { a <- f; g a }; h b } = do { a <- f; b <- g a; h b }" (is "?LHS = ?RHS") 46 | proof (intro runM_inject) 47 | fix nd0 48 | show "runM ?LHS nd0 = runM ?RHS nd0" 49 | proof (cases "runM f nd0") 50 | case fields1: (fields nd1 msgs1 result1) 51 | show ?thesis 52 | proof (cases result1) 53 | case Exception show ?thesis by (simp add: runM_bind fields1 Exception) 54 | next 55 | case Success1: (Success b) 56 | show ?thesis 57 | proof (cases "runM (g b) nd1") 58 | case fields2: (fields nd2 msgs2 result2) 59 | show ?thesis 60 | proof (cases result2) 61 | case Exception show ?thesis by (simp add: runM_bind fields1 fields2 Success1 Exception) 62 | next 63 | case Success2: (Success c) 64 | show ?thesis 65 | by (cases "runM (h c) nd2", simp add: runM_bind fields1 Success1 fields2 Success2) 66 | qed 67 | qed 68 | qed 69 | qed 70 | qed 71 | 72 | definition getNodeData :: "NodeData Action" where "getNodeData \ Action (\nd. (nd, [], Success nd))" 73 | definition setNodeData :: "NodeData \ unit Action" where "setNodeData nd \ Action (\_. 
(nd, [], Success ()))" 74 | 75 | lemma runM_getNodeData[simp]: "runM getNodeData nd = (nd, [], Success nd)" by (simp add: runM_def getNodeData_def) 76 | lemma runM_setNodeData[simp]: "runM (setNodeData nd') nd = (nd', [], Success ())" by (simp add: runM_def setNodeData_def) 77 | 78 | lemma runM_getNodeData_continue[simp]: "runM (do { nd' <- getNodeData; f nd' }) nd = runM (f nd) nd" by (simp add: runM_bind) 79 | lemma runM_setNodeData_continue[simp]: "runM (do { setNodeData nd'; f }) nd = runM f nd'" by (simp add: runM_bind) 80 | 81 | definition modifyNodeData :: "(NodeData \ NodeData) \ unit Action" where "modifyNodeData f = getNodeData \ (setNodeData \ f)" 82 | 83 | lemma runM_modifyNodeData[simp]: "runM (modifyNodeData f) nd = (f nd, [], Success ())" by (simp add: modifyNodeData_def runM_bind) 84 | lemma runM_modifyNodeData_continue[simp]: "runM (do { modifyNodeData f; a }) nd = runM a (f nd)" by (simp add: runM_bind) 85 | 86 | definition tell :: "RoutedMessage list \ unit Action" where "tell rms \ Action (\nd. (nd, rms, Success ()))" 87 | lemma runM_tell[simp]: "runM (tell rms) nd = (nd, rms, Success ())" by (simp add: runM_def tell_def) 88 | lemma runM_tell_contiue[simp]: "runM (do { tell rms; a }) nd = (let (nd, rms', x) = runM a nd in (nd, rms@rms', x))" by (simp add: runM_bind tell_def) 89 | 90 | definition send :: "RoutedMessage \ unit Action" where "send rm = tell [rm]" 91 | 92 | definition throw :: "Exception \ 'a Action" where "throw e = Action (\nd. (nd, [], Exception e))" 93 | lemma runM_throw[simp]: "runM (throw e) nd = (nd, [], Exception e)" by (simp add: runM_def throw_def) 94 | lemma throw_continue[simp]: "do { throw e; a } = throw e" by (intro runM_inject, simp add: runM_bind) 95 | 96 | definition catch :: "'a Action \ (Exception \ 'a Action) \ 'a Action" 97 | where "catch go onException = Action (\nd0. 
case runM go nd0 of (nd1, rms1, result1) \ (case result1 of Success _ \ (nd1, rms1, result1) | Exception e \ runM (tell rms1 \ onException e) nd1))" 98 | lemma catch_throw[simp]: "catch (throw e) handle = handle e" by (intro runM_inject, simp add: catch_def) 99 | lemma catch_return[simp]: "catch (return a) handle = return a" by (intro runM_inject, simp add: catch_def) 100 | 101 | lemma catch_getNodeData[simp]: "catch getNodeData handle = getNodeData" by (intro runM_inject, simp add: catch_def) 102 | lemma catch_getNodeData_continue[simp]: "catch (do { nd <- getNodeData; f nd }) handle = do { nd <- getNodeData; catch (f nd) handle }" by (intro runM_inject, simp add: catch_def) 103 | lemma catch_setNodeData[simp]: "catch (setNodeData nd) handle = setNodeData nd" by (intro runM_inject, simp add: catch_def) 104 | lemma catch_setNodeData_continue[simp]: "catch (do { setNodeData nd; f }) handle = do { setNodeData nd; catch f handle }" by (intro runM_inject, simp add: catch_def) 105 | lemma catch_modifyNodeData[simp]: "catch (modifyNodeData f) handle = modifyNodeData f" by (intro runM_inject, simp add: catch_def) 106 | lemma catch_modifyNodeData_continue[simp]: "catch (do { modifyNodeData f; g }) handle = do { modifyNodeData f; catch g handle }" by (intro runM_inject, simp add: catch_def) 107 | lemma catch_tell[simp]: "catch (tell rms) handle = tell rms" by (intro runM_inject, simp add: catch_def) 108 | lemma catch_tell_continue[simp]: "catch (do { tell rms; f }) handle = do { tell rms; catch f handle }" 109 | proof (intro runM_inject) 110 | fix nd0 111 | show "runM (catch (do { tell rms; f }) handle) nd0 = runM (do { tell rms; catch f handle }) nd0" 112 | proof (cases "runM f nd0") 113 | case fields1: (fields nd1 msgs1 result1) 114 | show ?thesis 115 | proof (cases result1) 116 | case (Exception e) show ?thesis by (cases "runM (handle e) nd1", simp add: catch_def fields1 Exception) 117 | next 118 | case Success1: (Success b) 119 | show ?thesis 120 | by (simp add: catch_def fields1 Success1) 121 | qed 122 | qed 123 | qed 124 | lemma catch_send[simp]: "catch (send rm) handle = send rm" by (simp add: send_def) 125 | lemma catch_send_continue[simp]: "catch (do { send rm; f }) handle = do { send rm; catch f handle }" by (simp add: send_def) 126 | 127 | definition gets :: "(NodeData \ 'a) \ 'a Action" where "gets f \ do { nd <- getNodeData; return (f nd) }" 128 | definition getCurrentClusterState where "getCurrentClusterState = gets currentClusterState" 129 | definition getCurrentNode where "getCurrentNode = gets currentNode" 130 | definition getCurrentTerm where "getCurrentTerm = gets currentTerm" 131 | definition getCurrentVotingNodes where "getCurrentVotingNodes = gets currentVotingNodes" 132 | definition getElectionWon where "getElectionWon = gets electionWon" 133 | definition getFirstUncommittedSlot where "getFirstUncommittedSlot = gets firstUncommittedSlot" 134 | definition getJoinVotes where "getJoinVotes = gets joinVotes" 135 | definition getLastAcceptedData where "getLastAcceptedData = gets lastAcceptedData" 136 | definition getPublishPermitted where "getPublishPermitted = gets publishPermitted" 137 | definition getPublishVotes where "getPublishVotes = gets publishVotes" 138 | 139 | definition sets where "sets f x = modifyNodeData (f (\_. 
x))" 140 | definition setCurrentClusterState where "setCurrentClusterState = sets currentClusterState_update" 141 | definition setCurrentNode where "setCurrentNode = sets currentNode_update" 142 | definition setCurrentTerm where "setCurrentTerm = sets currentTerm_update" 143 | definition setCurrentVotingNodes where "setCurrentVotingNodes = sets currentVotingNodes_update" 144 | definition setElectionWon where "setElectionWon = sets electionWon_update" 145 | definition setFirstUncommittedSlot where "setFirstUncommittedSlot = sets firstUncommittedSlot_update" 146 | definition setJoinVotes where "setJoinVotes = sets joinVotes_update" 147 | definition setLastAcceptedData where "setLastAcceptedData = sets lastAcceptedData_update" 148 | definition setPublishPermitted where "setPublishPermitted = sets publishPermitted_update" 149 | definition setPublishVotes where "setPublishVotes = sets publishVotes_update" 150 | 151 | definition modifies where "modifies f g = modifyNodeData (f g)" 152 | definition modifyJoinVotes where "modifyJoinVotes = modifies joinVotes_update" 153 | definition modifyPublishVotes where "modifyPublishVotes = modifies publishVotes_update" 154 | definition modifyCurrentClusterState where "modifyCurrentClusterState = modifies currentClusterState_update" 155 | 156 | definition "when" :: "bool \ unit Action \ unit Action" where "when c a \ if c then a else return ()" 157 | definition unless :: "bool \ unit Action \ unit Action" where "unless \ when \ Not" 158 | 159 | lemma runM_when: "runM (when c a) nd = (if c then runM a nd else (nd, [], Success ()))" 160 | by (auto simp add: when_def) 161 | lemma runM_unless: "runM (unless c a) nd = (if c then (nd, [], Success ()) else runM a nd)" 162 | by (auto simp add: unless_def when_def) 163 | 164 | lemma runM_when_continue: "runM (do { when c a; b }) nd = (if c then runM (do {a;b}) nd else runM b nd)" 165 | by (auto simp add: when_def) 166 | lemma runM_unless_continue: "runM (do { unless c a; b }) nd = (if c then runM b nd else runM (do {a;b}) nd)" 167 | by (auto simp add: unless_def when_def) 168 | 169 | lemma catch_when[simp]: "catch (when c a) onException = when c (catch a onException)" 170 | by (intro runM_inject, simp add: catch_def runM_when) 171 | lemma catch_unless[simp]: "catch (unless c a) onException = unless c (catch a onException)" 172 | by (intro runM_inject, simp add: catch_def runM_unless) 173 | 174 | lemma catch_when_continue[simp]: "catch (do { when c a; b }) onException = (if c then catch (do {a;b}) onException else catch b onException)" 175 | by (intro runM_inject, simp add: catch_def runM_when_continue) 176 | lemma catch_unless_continue[simp]: "catch (do { unless c a; b }) onException = (if c then catch b onException else catch (do {a;b}) onException)" 177 | by (intro runM_inject, simp add: catch_def runM_unless_continue) 178 | 179 | definition ensureCorrectDestination :: "Destination \ unit Action" 180 | where "ensureCorrectDestination d \ do { 181 | n <- getCurrentNode; 182 | when (d \ { Broadcast, OneNode n }) (throw IllegalArgumentException) 183 | }" 184 | 185 | lemma runM_ensureCorrectDestination_continue: 186 | "runM (do { ensureCorrectDestination d; go }) nd = (if d \ { Broadcast, OneNode (currentNode nd) } then runM go nd else (nd, [], Exception IllegalArgumentException))" 187 | by (simp add: ensureCorrectDestination_def getCurrentNode_def gets_def runM_when_continue) 188 | 189 | definition broadcast :: "Message \ unit Action" 190 | where "broadcast msg \ do { 191 | n <- getCurrentNode; 192 | send \ sender = n, 
destination = Broadcast, payload = msg \ 193 | }" 194 | 195 | lemma runM_broadcast[simp]: "runM (broadcast msg) nd = (nd, [\ sender = currentNode nd, destination = Broadcast, payload = msg \], Success ())" 196 | by (simp add: broadcast_def getCurrentNode_def gets_def send_def) 197 | 198 | definition sendTo :: "Node \ Message \ unit Action" 199 | where "sendTo d msg \ do { 200 | n <- getCurrentNode; 201 | send \ sender = n, destination = OneNode d, payload = msg \ 202 | }" 203 | 204 | lemma runM_sendTo[simp]: "runM (sendTo d msg) nd = (nd, [\ sender = currentNode nd, destination = OneNode d, payload = msg \], Success ())" 205 | by (simp add: sendTo_def getCurrentNode_def gets_def send_def) 206 | 207 | definition ignoringExceptions :: "unit Action \ unit Action" where "ignoringExceptions go \ catch go (\_. return ())" 208 | 209 | lemma None_lt[simp]: "NO_TERM < t = (t \ NO_TERM)" by (cases t, simp_all) 210 | 211 | definition getLastAcceptedTerm :: "TermOption Action" 212 | where 213 | "getLastAcceptedTerm \ do { 214 | lastAcceptedData <- getLastAcceptedData; 215 | case lastAcceptedData of 216 | None \ return NO_TERM 217 | | Some tv \ return (SomeTerm (tvTerm tv)) 218 | }" 219 | 220 | definition doStartJoin :: "Node \ Term \ unit Action" 221 | where 222 | "doStartJoin newMaster newTerm \ do { 223 | currentTerm <- getCurrentTerm; 224 | 225 | when (newTerm \ currentTerm) (throw IllegalArgumentException); 226 | 227 | setCurrentTerm newTerm; 228 | setJoinVotes {}; 229 | setElectionWon False; 230 | setPublishPermitted True; 231 | setPublishVotes {}; 232 | 233 | firstUncommittedSlot <- getFirstUncommittedSlot; 234 | lastAcceptedTerm <- getLastAcceptedTerm; 235 | sendTo newMaster (Vote firstUncommittedSlot newTerm lastAcceptedTerm) 236 | 237 | }" 238 | 239 | definition doVote :: "Node \ Slot \ Term \ TermOption \ unit Action" 240 | where 241 | "doVote sourceNode voteFirstUncommittedSlot voteTerm voteLastAcceptedTerm \ do { 242 | 243 | currentTerm <- getCurrentTerm; 244 | when (voteTerm \ currentTerm) (throw IllegalArgumentException); 245 | 246 | firstUncommittedSlot <- getFirstUncommittedSlot; 247 | when (voteFirstUncommittedSlot > firstUncommittedSlot) (throw IllegalArgumentException); 248 | 249 | lastAcceptedTerm <- getLastAcceptedTerm; 250 | when (voteFirstUncommittedSlot = firstUncommittedSlot 251 | \ voteLastAcceptedTerm > lastAcceptedTerm) 252 | (throw IllegalArgumentException); 253 | 254 | modifyJoinVotes (insert sourceNode); 255 | 256 | joinVotes <- getJoinVotes; 257 | currentVotingNodes <- getCurrentVotingNodes; 258 | 259 | let electionWon' = card (joinVotes \ currentVotingNodes) * 2 > card currentVotingNodes; 260 | setElectionWon electionWon'; 261 | publishPermitted <- getPublishPermitted; 262 | when (electionWon' \ publishPermitted \ lastAcceptedTerm \ NO_TERM) (do { 263 | setPublishPermitted False; 264 | 265 | lastAcceptedValue <- gets lastAcceptedValue; (* NB must be present since lastAcceptedTermInSlot \ NO_TERM *) 266 | broadcast (PublishRequest firstUncommittedSlot currentTerm lastAcceptedValue) 267 | }) 268 | }" 269 | 270 | definition doPublishRequest :: "Node \ Slot \ TermValue \ unit Action" 271 | where 272 | "doPublishRequest sourceNode requestSlot newAcceptedState \ do { 273 | 274 | currentTerm <- getCurrentTerm; 275 | when (tvTerm newAcceptedState \ currentTerm) (throw IllegalArgumentException); 276 | 277 | firstUncommittedSlot <- getFirstUncommittedSlot; 278 | when (requestSlot \ firstUncommittedSlot) (throw IllegalArgumentException); 279 | 280 | setLastAcceptedData (Some 
newAcceptedState); 281 | sendTo sourceNode (PublishResponse requestSlot (tvTerm newAcceptedState)) 282 | }" 283 | 284 | record SlotTerm = 285 | stSlot :: Slot 286 | stTerm :: Term 287 | 288 | definition ApplyCommitFromSlotTerm :: "SlotTerm \ Message" 289 | where "ApplyCommitFromSlotTerm st = ApplyCommit (stSlot st) (stTerm st)" 290 | 291 | definition doPublishResponse :: "Node \ SlotTerm \ unit Action" 292 | where 293 | "doPublishResponse sourceNode slotTerm \ do { 294 | 295 | currentTerm <- getCurrentTerm; 296 | when (stTerm slotTerm \ currentTerm) (throw IllegalArgumentException); 297 | 298 | firstUncommittedSlot <- getFirstUncommittedSlot; 299 | when (stSlot slotTerm \ firstUncommittedSlot) (throw IllegalArgumentException); 300 | 301 | modifyPublishVotes (insert sourceNode); 302 | publishVotes <- getPublishVotes; 303 | currentVotingNodes <- getCurrentVotingNodes; 304 | when (card (publishVotes \ currentVotingNodes) * 2 > card currentVotingNodes) 305 | (broadcast (ApplyCommitFromSlotTerm slotTerm)) 306 | }" 307 | 308 | definition doCommit :: "SlotTerm \ unit Action" 309 | where 310 | "doCommit slotTerm \ do { 311 | 312 | lastAcceptedTermInSlot <- getLastAcceptedTerm; 313 | when (SomeTerm (stTerm slotTerm) \ lastAcceptedTermInSlot) (throw IllegalArgumentException); 314 | 315 | firstUncommittedSlot <- getFirstUncommittedSlot; 316 | when (stSlot slotTerm \ firstUncommittedSlot) (throw IllegalArgumentException); 317 | 318 | lastAcceptedValue <- gets lastAcceptedValue; (* NB must be not None since lastAcceptedTerm = Some t *) 319 | (case lastAcceptedValue of 320 | ClusterStateDiff diff 321 | \ modifyCurrentClusterState diff 322 | | Reconfigure votingNodes \ do { 323 | setCurrentVotingNodes (set votingNodes); 324 | joinVotes <- getJoinVotes; 325 | setElectionWon (card (joinVotes \ (set votingNodes)) * 2 > card (set votingNodes)) 326 | } 327 | | NoOp \ return ()); 328 | 329 | setFirstUncommittedSlot (firstUncommittedSlot + 1); 330 | setLastAcceptedData None; 331 | setPublishPermitted True; 332 | setPublishVotes {} 333 | }" 334 | 335 | definition generateCatchup :: "Node \ unit Action" 336 | where 337 | "generateCatchup sourceNode \ do { 338 | 339 | firstUncommittedSlot <- getFirstUncommittedSlot; 340 | currentVotingNodes <- getCurrentVotingNodes; 341 | currentClusterState <- getCurrentClusterState; 342 | 343 | sendTo sourceNode (CatchUpResponse firstUncommittedSlot currentVotingNodes currentClusterState) 344 | }" 345 | 346 | definition applyCatchup :: "Slot \ Node set \ ClusterState \ unit Action" 347 | where 348 | "applyCatchup catchUpSlot catchUpConfiguration catchUpState \ do { 349 | 350 | firstUncommittedSlot <- getFirstUncommittedSlot; 351 | when (catchUpSlot \ firstUncommittedSlot) (throw IllegalArgumentException); 352 | 353 | setFirstUncommittedSlot catchUpSlot; 354 | setCurrentVotingNodes catchUpConfiguration; 355 | setCurrentClusterState catchUpState; 356 | setLastAcceptedData None; 357 | 358 | setJoinVotes {}; 359 | setElectionWon False; 360 | 361 | setPublishVotes {}; 362 | setPublishPermitted True 363 | }" 364 | 365 | definition doClientValue :: "Value \ unit Action" 366 | where 367 | "doClientValue x \ do { 368 | 369 | electionWon <- getElectionWon; 370 | when (\ electionWon) (throw IllegalArgumentException); 371 | 372 | publishPermitted <- getPublishPermitted; 373 | when (\ publishPermitted) (throw IllegalArgumentException); 374 | 375 | lastAcceptedTermInSlot <- getLastAcceptedTerm; 376 | when (lastAcceptedTermInSlot \ NO_TERM) (throw IllegalArgumentException); 377 | 378 | 
setPublishPermitted False; 379 | 380 | currentTerm <- getCurrentTerm; 381 | firstUncommittedSlot <- getFirstUncommittedSlot; 382 | broadcast (PublishRequest firstUncommittedSlot currentTerm x) 383 | }" 384 | 385 | definition doDiscardJoinVotes :: "unit Action" 386 | where 387 | "doDiscardJoinVotes \ do { 388 | setJoinVotes {}; 389 | setElectionWon False 390 | }" 391 | 392 | definition doReboot :: "unit Action" 393 | where 394 | "doReboot \ modifyNodeData (\nd. 395 | (* persistent fields *) 396 | \ currentNode = currentNode nd 397 | , currentTerm = currentTerm nd 398 | , firstUncommittedSlot = firstUncommittedSlot nd 399 | , currentVotingNodes = currentVotingNodes nd 400 | , currentClusterState = currentClusterState nd 401 | , lastAcceptedData = lastAcceptedData nd 402 | (* transient fields *) 403 | , joinVotes = {} 404 | , electionWon = False 405 | , publishPermitted = False 406 | , publishVotes = {} \)" 407 | 408 | definition ProcessMessageAction :: "RoutedMessage \ unit Action" 409 | where "ProcessMessageAction rm \ Action (\nd. case ProcessMessage nd rm of (nd', messageOption) \ (nd', case messageOption of None \ [] | Some m \ [m], Success ()))" 410 | 411 | definition dispatchMessageInner :: "RoutedMessage \ unit Action" 412 | where "dispatchMessageInner m \ case payload m of 413 | StartJoin t \ doStartJoin (sender m) t 414 | | Vote i t a \ doVote (sender m) i t a 415 | | ClientValue x \ doClientValue x 416 | | PublishRequest i t x \ doPublishRequest (sender m) i \ tvTerm = t, tvValue = x \ 417 | | PublishResponse i t \ doPublishResponse (sender m) \ stSlot = i, stTerm = t \ 418 | | ApplyCommit i t \ doCommit \ stSlot = i, stTerm = t \ 419 | | CatchUpRequest \ generateCatchup (sender m) 420 | | CatchUpResponse i conf cs \ applyCatchup i conf cs 421 | | DiscardJoinVotes \ doDiscardJoinVotes 422 | | Reboot \ doReboot" 423 | 424 | definition dispatchMessage :: "RoutedMessage \ unit Action" 425 | where "dispatchMessage m \ ignoringExceptions (do { 426 | ensureCorrectDestination (destination m); 427 | dispatchMessageInner m 428 | })" 429 | 430 | lemma getLastAcceptedTermInSlot_gets[simp]: "getLastAcceptedTerm = gets lastAcceptedTerm" 431 | proof (intro runM_inject) 432 | fix nd 433 | show "runM getLastAcceptedTerm nd = runM (gets lastAcceptedTerm) nd" 434 | by (cases "lastAcceptedData nd", simp_all add: gets_def getLastAcceptedTerm_def getLastAcceptedData_def 435 | getFirstUncommittedSlot_def lastAcceptedTerm_def) 436 | qed 437 | 438 | lemma monadic_implementation_is_faithful: 439 | "dispatchMessage = ProcessMessageAction" 440 | proof (intro ext runM_inject) 441 | fix rm nd 442 | show "runM (dispatchMessage rm) nd = runM (ProcessMessageAction rm) nd" (is "?LHS = ?RHS") 443 | proof (cases "destination rm \ {Broadcast, OneNode (currentNode nd)}") 444 | case False 445 | 446 | hence 1: "\f. runM (do { ensureCorrectDestination (destination rm); f }) nd = (nd, [], Exception IllegalArgumentException)" 447 | by (simp add: runM_ensureCorrectDestination_continue) 448 | 449 | from False 450 | show ?thesis 451 | unfolding ProcessMessageAction_def dispatchMessage_def 452 | by (simp add: ignoringExceptions_def catch_def 1 ProcessMessage_def) 453 | next 454 | case dest_ok: True 455 | 456 | hence 1: "runM (dispatchMessage rm) nd = runM (ignoringExceptions (dispatchMessageInner rm)) nd" 457 | by (simp add: dispatchMessage_def ignoringExceptions_def catch_def runM_ensureCorrectDestination_continue) 458 | 459 | also have "... 
= runM (ProcessMessageAction rm) nd" (is "?LHS = ?RHS") 460 | proof (cases "payload rm") 461 | case (StartJoin t) 462 | 463 | have "?LHS = runM (ignoringExceptions (doStartJoin (sender rm) t)) nd" (is "_ = ?STEP") 464 | by (simp add: dispatchMessageInner_def StartJoin) 465 | 466 | also consider 467 | (a) "t \ currentTerm nd" 468 | | (b) "currentTerm nd < t" "case lastAcceptedTerm nd of NO_TERM \ False | SomeTerm x \ t \ x" 469 | | (c) "currentTerm nd < t" "case lastAcceptedTerm nd of NO_TERM \ True | SomeTerm x \ x < t" 470 | proof (cases "t \ currentTerm nd") 471 | case True thus ?thesis by (intro a) 472 | next 473 | case 1: False 474 | with b c show ?thesis 475 | by (cases "case lastAcceptedTerm nd of NO_TERM \ False | SomeTerm x \ t \ x", auto, cases "lastAcceptedTerm nd", auto) 476 | qed 477 | 478 | hence "?STEP = ?RHS" 479 | proof cases 480 | case a 481 | thus ?thesis 482 | by (simp add: StartJoin ProcessMessageAction_def dispatchMessage_def ProcessMessage_def Let_def runM_unless 483 | doStartJoin_def getCurrentTerm_def gets_def setJoinVotes_def sets_def setCurrentTerm_def 484 | setPublishPermitted_def setPublishVotes_def getFirstUncommittedSlot_def handleStartJoin_def ensureCurrentTerm_def setElectionWon_def 485 | ignoringExceptions_def catch_def runM_when_continue) 486 | next 487 | case b 488 | with StartJoin dest_ok show ?thesis 489 | by (cases "lastAcceptedTerm nd ", simp_all add: ProcessMessageAction_def dispatchMessage_def ProcessMessage_def Let_def 490 | doStartJoin_def getCurrentTerm_def gets_def setJoinVotes_def sets_def setCurrentTerm_def runM_unless lastAcceptedTerm_def 491 | setPublishPermitted_def setPublishVotes_def getFirstUncommittedSlot_def handleStartJoin_def ensureCurrentTerm_def setElectionWon_def 492 | ignoringExceptions_def catch_def runM_when_continue) 493 | next 494 | case c with StartJoin dest_ok show ?thesis 495 | by (cases "lastAcceptedTerm nd", simp_all add: ProcessMessageAction_def dispatchMessage_def ProcessMessage_def Let_def 496 | doStartJoin_def getCurrentTerm_def gets_def setJoinVotes_def sets_def setCurrentTerm_def runM_unless lastAcceptedTerm_def 497 | setPublishPermitted_def setPublishVotes_def getFirstUncommittedSlot_def handleStartJoin_def ensureCurrentTerm_def setElectionWon_def 498 | ignoringExceptions_def catch_def runM_when_continue) 499 | qed 500 | 501 | finally show ?thesis by simp 502 | 503 | next 504 | case (Vote i t a) 505 | 506 | have "?LHS = runM (ignoringExceptions (doVote (sender rm) i t a)) nd" (is "_ = ?STEP") 507 | by (simp add: dispatchMessageInner_def Vote) 508 | 509 | also have "... 
= ?RHS" 510 | proof (cases "firstUncommittedSlot nd < i") 511 | case True 512 | with Vote dest_ok show ?thesis 513 | by (simp add: dispatchMessage_def runM_unless 514 | doVote_def gets_def getFirstUncommittedSlot_def ProcessMessage_def 515 | ProcessMessageAction_def handleVote_def ignoringExceptions_def getCurrentTerm_def) 516 | next 517 | case False hence le: "i \ firstUncommittedSlot nd" by simp 518 | 519 | show ?thesis 520 | proof (cases "t = currentTerm nd") 521 | case False 522 | with Vote dest_ok le show ?thesis 523 | by (simp add: dispatchMessage_def runM_when runM_unless 524 | doVote_def gets_def getFirstUncommittedSlot_def getCurrentTerm_def 525 | ProcessMessage_def ProcessMessageAction_def handleVote_def ignoringExceptions_def) 526 | 527 | next 528 | case t: True 529 | 530 | show ?thesis 531 | proof (cases "i = firstUncommittedSlot nd") 532 | case False 533 | with Vote dest_ok le t show ?thesis 534 | by (simp add: dispatchMessage_def Let_def runM_when_continue 535 | doVote_def runM_when runM_unless 536 | gets_def getFirstUncommittedSlot_def getCurrentTerm_def 537 | getJoinVotes_def getCurrentVotingNodes_def 538 | getPublishPermitted_def ignoringExceptions_def broadcast_def getCurrentNode_def 539 | modifies_def modifyJoinVotes_def send_def 540 | sets_def setElectionWon_def setPublishPermitted_def lastAcceptedValue_def 541 | ProcessMessage_def ProcessMessageAction_def handleVote_def 542 | addElectionVote_def publishValue_def isQuorum_def majorities_def) 543 | next 544 | case i: True 545 | show ?thesis 546 | proof (cases a) 547 | case a: NO_TERM 548 | 549 | show ?thesis 550 | proof (cases "isQuorum nd (insert (sender rm) (joinVotes nd))") 551 | case not_quorum: False 552 | hence not_quorum_card: "\ card (currentVotingNodes nd) < card (insert (sender rm) (joinVotes nd) \ currentVotingNodes nd) * 2" 553 | by (simp add: isQuorum_def majorities_def) 554 | 555 | have "?STEP = (nd\electionWon := False, joinVotes := insert (sender rm) (joinVotes nd)\, [], Success ())" 556 | by (simp add: ignoringExceptions_def i t a doVote_def catch_def 557 | gets_def getCurrentTerm_def runM_when_continue getFirstUncommittedSlot_def 558 | modifyJoinVotes_def modifies_def getJoinVotes_def 559 | getCurrentVotingNodes_def Let_def setElectionWon_def sets_def runM_when 560 | not_quorum_card getPublishPermitted_def) 561 | 562 | also from dest_ok have "... = ?RHS" 563 | by (simp add: ProcessMessageAction_def ProcessMessage_def Vote handleVote_def 564 | i t a addElectionVote_def not_quorum publishValue_def Let_def) 565 | 566 | finally show ?thesis . 567 | 568 | next 569 | case quorum: True 570 | hence quorum_card: "card (currentVotingNodes nd) < card (insert (sender rm) (joinVotes nd) \ currentVotingNodes nd) * 2" 571 | by (simp add: isQuorum_def majorities_def) 572 | 573 | show ?thesis 574 | proof (cases "publishPermitted nd \ lastAcceptedTerm nd \ NO_TERM") 575 | case False 576 | 577 | hence "?STEP = (nd\electionWon := True, joinVotes := insert (sender rm) (joinVotes nd)\, [], Success ())" 578 | by (auto simp add: ignoringExceptions_def i t a doVote_def catch_def 579 | gets_def getCurrentTerm_def runM_when_continue getFirstUncommittedSlot_def 580 | modifyJoinVotes_def modifies_def getJoinVotes_def 581 | getCurrentVotingNodes_def Let_def setElectionWon_def sets_def runM_when 582 | quorum_card getPublishPermitted_def) 583 | 584 | also from False dest_ok have "... 
= ?RHS" 585 | by (simp add: ProcessMessageAction_def ProcessMessage_def Vote handleVote_def 586 | i t a addElectionVote_def quorum publishValue_def Let_def) 587 | 588 | finally show ?thesis . 589 | 590 | next 591 | case True 592 | 593 | hence "?STEP = (nd\electionWon := True, publishPermitted := False, 594 | joinVotes := insert (sender rm) (joinVotes nd)\, 595 | [\sender = currentNode nd, destination = Broadcast, 596 | payload = PublishRequest (firstUncommittedSlot nd) (currentTerm nd) 597 | (lastAcceptedValue nd) \], Success ())" 598 | by (auto simp add: ignoringExceptions_def i t a doVote_def catch_def 599 | gets_def getCurrentTerm_def runM_when_continue getFirstUncommittedSlot_def 600 | modifyJoinVotes_def modifies_def getJoinVotes_def 601 | getCurrentVotingNodes_def Let_def setElectionWon_def sets_def runM_when 602 | quorum_card getPublishPermitted_def setPublishPermitted_def lastAcceptedValue_def) 603 | 604 | also from True dest_ok have "... = ?RHS" 605 | by (simp add: ProcessMessageAction_def ProcessMessage_def Vote handleVote_def 606 | i t a addElectionVote_def quorum publishValue_def Let_def lastAcceptedValue_def) 607 | 608 | finally show ?thesis . 609 | 610 | qed 611 | qed 612 | 613 | next 614 | case a: (SomeTerm voteLastAcceptedTerm) 615 | 616 | show ?thesis 617 | proof (cases "lastAcceptedTerm nd") 618 | case lat: NO_TERM 619 | 620 | have "?STEP = (nd, [], Success ())" 621 | by (auto simp add: ignoringExceptions_def i t a lat doVote_def catch_def 622 | gets_def getCurrentTerm_def runM_when_continue getFirstUncommittedSlot_def) 623 | 624 | also from dest_ok have "... = ?RHS" 625 | by (simp add: ProcessMessageAction_def ProcessMessage_def Vote handleVote_def 626 | i t a lat) 627 | 628 | finally show ?thesis . 629 | 630 | next 631 | case lat: (SomeTerm nodeLastAcceptedTerm) 632 | 633 | show ?thesis 634 | proof (cases "voteLastAcceptedTerm \ nodeLastAcceptedTerm") 635 | case False 636 | hence "?STEP = (nd, [], Success ())" 637 | by (simp add: ignoringExceptions_def i t a lat doVote_def catch_def 638 | gets_def getCurrentTerm_def runM_when_continue getFirstUncommittedSlot_def) 639 | also from False dest_ok have "... = ?RHS" 640 | by (simp add: ProcessMessageAction_def ProcessMessage_def Vote handleVote_def 641 | i t a lat max_def addElectionVote_def publishValue_def) 642 | 643 | finally show ?thesis by simp 644 | next 645 | case True 646 | 647 | show ?thesis 648 | proof (cases "isQuorum nd (insert (sender rm) (joinVotes nd))") 649 | case not_quorum: False 650 | hence not_quorum_card: "\ card (currentVotingNodes nd) < card (insert (sender rm) (joinVotes nd) \ currentVotingNodes nd) * 2" 651 | by (simp add: isQuorum_def majorities_def) 652 | 653 | from True 654 | have "?STEP = (nd\electionWon := False, 655 | joinVotes := insert (sender rm) (joinVotes nd)\, [], Success ())" 656 | by (simp add: ignoringExceptions_def i t a lat doVote_def catch_def 657 | gets_def getCurrentTerm_def runM_when_continue getFirstUncommittedSlot_def 658 | modifyJoinVotes_def modifies_def getJoinVotes_def 659 | getCurrentVotingNodes_def Let_def setElectionWon_def sets_def runM_when 660 | not_quorum_card getPublishPermitted_def) 661 | 662 | also from dest_ok True have "... = ?RHS" 663 | by (simp add: ProcessMessageAction_def ProcessMessage_def Vote handleVote_def 664 | i t a lat max_def addElectionVote_def not_quorum publishValue_def Let_def) 665 | 666 | finally show ?thesis . 
667 | 668 | next 669 | case quorum: True 670 | hence quorum_card: "card (currentVotingNodes nd) < card (insert (sender rm) (joinVotes nd) \ currentVotingNodes nd) * 2" 671 | by (simp add: isQuorum_def majorities_def) 672 | 673 | show ?thesis 674 | proof (cases "publishPermitted nd") 675 | case False 676 | 677 | with True 678 | have "?STEP = (nd\electionWon := True, 679 | joinVotes := insert (sender rm) (joinVotes nd)\, [], Success ())" 680 | by (simp add: ignoringExceptions_def i t a lat doVote_def catch_def 681 | gets_def getCurrentTerm_def runM_when_continue getFirstUncommittedSlot_def 682 | modifyJoinVotes_def modifies_def getJoinVotes_def 683 | getCurrentVotingNodes_def Let_def setElectionWon_def sets_def runM_when 684 | quorum_card getPublishPermitted_def setPublishPermitted_def) 685 | 686 | also from False dest_ok have "... = ?RHS" 687 | by (simp add: ProcessMessageAction_def ProcessMessage_def Vote handleVote_def 688 | i t a lat True addElectionVote_def quorum publishValue_def Let_def) 689 | 690 | finally show ?thesis . 691 | 692 | next 693 | case publishPermitted: True 694 | 695 | have "?STEP = (nd\electionWon := True, publishPermitted := False, 696 | joinVotes := insert (sender rm) (joinVotes nd)\, 697 | [\sender = currentNode nd, destination = Broadcast, 698 | payload = PublishRequest (firstUncommittedSlot nd) (currentTerm nd) 699 | (lastAcceptedValue nd) \], Success ())" 700 | apply (auto simp add: ignoringExceptions_def i t a lat True doVote_def catch_def 701 | gets_def getCurrentTerm_def runM_when_continue getFirstUncommittedSlot_def 702 | modifyJoinVotes_def modifies_def getJoinVotes_def 703 | getCurrentVotingNodes_def Let_def setElectionWon_def sets_def runM_when 704 | quorum_card getPublishPermitted_def setPublishPermitted_def lastAcceptedValue_def) 705 | using True publishPermitted by auto 706 | 707 | also from publishPermitted True dest_ok have "... = ?RHS" 708 | by (simp add: ProcessMessageAction_def ProcessMessage_def Vote handleVote_def 709 | i t a lat True addElectionVote_def quorum publishValue_def Let_def lastAcceptedValue_def) 710 | 711 | finally show ?thesis . 712 | 713 | qed 714 | qed 715 | qed 716 | qed 717 | qed 718 | qed 719 | qed 720 | qed 721 | 722 | finally show ?thesis . 
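(* Each of the remaining message types is checked by unfolding the monadic action and the
   corresponding pure handler and letting simp discharge the goal; only the ApplyCommit case
   needs an extra case split on the shape of lastAcceptedValue. *)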
723 | 724 | next 725 | case (ClientValue x) 726 | 727 | with dest_ok show ?thesis 728 | by (simp add: ProcessMessageAction_def dispatchMessageInner_def 729 | doClientValue_def gets_def getElectionWon_def 730 | runM_unless getPublishPermitted_def setPublishPermitted_def sets_def 731 | getCurrentTerm_def getFirstUncommittedSlot_def ProcessMessage_def handleClientValue_def 732 | publishValue_def runM_when ignoringExceptions_def ClientValue catch_def runM_when_continue) 733 | 734 | next 735 | case (PublishRequest i t x) with dest_ok show ?thesis 736 | by (simp add: ProcessMessageAction_def dispatchMessageInner_def 737 | doPublishRequest_def gets_def getCurrentTerm_def getFirstUncommittedSlot_def 738 | sets_def setLastAcceptedData_def ignoringExceptions_def catch_def runM_when_continue 739 | getCurrentNode_def runM_unless send_def 740 | ProcessMessage_def handlePublishRequest_def runM_when) 741 | 742 | next 743 | case (PublishResponse i t) with dest_ok show ?thesis 744 | by (simp add: ProcessMessageAction_def dispatchMessageInner_def 745 | doPublishResponse_def gets_def getCurrentTerm_def getFirstUncommittedSlot_def 746 | broadcast_def getCurrentNode_def runM_unless send_def 747 | modifyPublishVotes_def modifies_def getPublishVotes_def getCurrentVotingNodes_def 748 | runM_when ignoringExceptions_def catch_def runM_when_continue 749 | ProcessMessage_def handlePublishResponse_def commitIfQuorate_def isQuorum_def majorities_def 750 | ApplyCommitFromSlotTerm_def) 751 | 752 | next 753 | case (ApplyCommit i t) 754 | 755 | show ?thesis 756 | proof (cases "lastAcceptedValue nd") 757 | case NoOp 758 | with ApplyCommit dest_ok show ?thesis 759 | by (simp add: ProcessMessageAction_def dispatchMessageInner_def 760 | doCommit_def runM_unless runM_when 761 | gets_def getFirstUncommittedSlot_def 762 | sets_def setFirstUncommittedSlot_def setLastAcceptedData_def 763 | setPublishPermitted_def setPublishVotes_def 764 | ProcessMessage_def handleApplyCommit_def applyAcceptedValue_def 765 | ignoringExceptions_def catch_def runM_when_continue) 766 | next 767 | case Reconfigure 768 | with ApplyCommit dest_ok show ?thesis 769 | by (simp add: ProcessMessageAction_def dispatchMessageInner_def 770 | doCommit_def runM_unless runM_when 771 | gets_def getFirstUncommittedSlot_def 772 | getJoinVotes_def 773 | sets_def setFirstUncommittedSlot_def setLastAcceptedData_def 774 | setPublishPermitted_def setPublishVotes_def 775 | setCurrentVotingNodes_def setElectionWon_def 776 | ProcessMessage_def handleApplyCommit_def applyAcceptedValue_def majorities_def 777 | ignoringExceptions_def catch_def runM_when_continue) 778 | next 779 | case ClusterStateDiff 780 | with ApplyCommit dest_ok show ?thesis 781 | by (simp add: ProcessMessageAction_def dispatchMessageInner_def 782 | doCommit_def runM_unless runM_when 783 | gets_def getFirstUncommittedSlot_def 784 | sets_def setFirstUncommittedSlot_def 785 | modifies_def modifyCurrentClusterState_def 786 | setPublishPermitted_def setPublishVotes_def setLastAcceptedData_def 787 | ProcessMessage_def handleApplyCommit_def applyAcceptedValue_def 788 | ignoringExceptions_def catch_def runM_when_continue) 789 | qed 790 | 791 | next 792 | case CatchUpRequest 793 | with dest_ok show ?thesis 794 | by (simp add: ProcessMessageAction_def dispatchMessageInner_def 795 | generateCatchup_def 796 | gets_def getFirstUncommittedSlot_def getCurrentVotingNodes_def getCurrentClusterState_def 797 | ProcessMessage_def handleCatchUpRequest_def ignoringExceptions_def catch_def runM_when_continue) 798 | 799 | next 800 | 
case (CatchUpResponse i conf cs) 801 | with dest_ok show ?thesis 802 | by (simp add: ProcessMessageAction_def dispatchMessageInner_def 803 | applyCatchup_def gets_def getFirstUncommittedSlot_def 804 | sets_def setFirstUncommittedSlot_def 805 | setPublishPermitted_def setPublishVotes_def setLastAcceptedData_def 806 | setCurrentVotingNodes_def setCurrentClusterState_def setJoinVotes_def 807 | setElectionWon_def runM_unless 808 | ProcessMessage_def handleCatchUpResponse_def 809 | ignoringExceptions_def catch_def runM_when_continue) 810 | 811 | next 812 | case Reboot 813 | with dest_ok show ?thesis 814 | by (simp add: ProcessMessageAction_def dispatchMessageInner_def 815 | doReboot_def ProcessMessage_def handleReboot_def ignoringExceptions_def catch_def runM_when_continue) 816 | 817 | next 818 | case DiscardJoinVotes 819 | with dest_ok show ?thesis 820 | by (simp add: ProcessMessageAction_def dispatchMessageInner_def 821 | doDiscardJoinVotes_def ProcessMessage_def handleDiscardJoinVotes_def ignoringExceptions_def catch_def 822 | runM_when_continue setJoinVotes_def sets_def setElectionWon_def) 823 | 824 | qed 825 | 826 | finally show ?thesis . 827 | qed 828 | qed 829 | 830 | end 831 | -------------------------------------------------------------------------------- /cluster/isabelle/OneSlot.thy: -------------------------------------------------------------------------------- 1 | section \One-slot consistency\ 2 | 3 | text \The replicated state machine determines the values that are committed in each of a sequence 4 | of \textit{slots}. Each slot runs a logically-separate consensus algorithm which is shown to be 5 | consistent here. Further below, the protocol is shown to refine this slot-by-slot model correctly.\ 6 | 7 | text \Consistency is shown to follow from the invariants listed below. Further below, the protocol 8 | is shown to preserve these invariants in each step, which means it is not enormously important 9 | to understand these in detail.\ 10 | 11 | theory OneSlot 12 | imports Preliminaries 13 | begin 14 | 15 | locale oneSlot = 16 | (* basic functions *) 17 | fixes Q :: "Node set set" 18 | fixes v :: "Term \ Value" 19 | (* message-sent predicates *) 20 | fixes promised\<^sub>f :: "Node \ Term \ bool" 21 | fixes promised\<^sub>b :: "Node \ Term \ Term \ bool" 22 | fixes proposed :: "Term \ bool" 23 | fixes accepted :: "Node \ Term \ bool" 24 | fixes committed :: "Term \ bool" 25 | (* other definitions *) 26 | fixes promised :: "Node \ Term \ bool" 27 | defines "promised n t \ promised\<^sub>f n t \ (\ t'. promised\<^sub>b n t t')" 28 | fixes prevAccepted :: "Term \ Node set \ Term set" 29 | defines "prevAccepted t ns \ {t'. \ n \ ns. promised\<^sub>b n t t'}" 30 | (* invariants *) 31 | assumes Q_intersects: "Q \ Q" 32 | assumes promised\<^sub>f: "\ promised\<^sub>f n t; t' < t \ \ \ accepted n t'" 33 | assumes promised\<^sub>b_lt: "promised\<^sub>b n t t' \ t' < t" 34 | assumes promised\<^sub>b_accepted: "promised\<^sub>b n t t' \ accepted n t'" 35 | assumes promised\<^sub>b_max: "\ promised\<^sub>b n t t'; t' < t''; t'' < t \ 36 | \ \ accepted n t''" 37 | assumes proposed: "proposed t 38 | \ \ q \ Q. (\ n \ q. promised n t) 39 | \ (prevAccepted t q = {} 40 | \ (\ t'. v t = v t' \ maxTerm (prevAccepted t q) \ t' \ proposed t' \ t' < t))" 41 | assumes proposed_finite: "finite {t. proposed t}" 42 | assumes accepted: "accepted n t \ proposed t" 43 | assumes committed: "committed t \ \ q \ Q. \ n \ q. 
accepted n t" 44 | 45 | lemma (in oneSlot) prevAccepted_proposed: "prevAccepted t ns \ {t. proposed t}" 46 | using accepted prevAccepted_def promised\<^sub>b_accepted by fastforce 47 | 48 | lemma (in oneSlot) prevAccepted_finite: "finite (prevAccepted p ns)" 49 | using prevAccepted_proposed proposed_finite by (meson rev_finite_subset) 50 | 51 | lemma (in oneSlot) Q_nonempty: "\q. q \ Q \ q \ {}" 52 | using Q_intersects by (auto simp add: intersects_def) 53 | 54 | text \The heart of the consistency proof is property P2b from \textit{Paxos made simple} by Lamport:\ 55 | 56 | lemma (in oneSlot) p2b: 57 | assumes "proposed t\<^sub>1" and "committed t\<^sub>2" and "t\<^sub>2 < t\<^sub>1" 58 | shows "v t\<^sub>1 = v t\<^sub>2" 59 | using assms 60 | proof (induct t\<^sub>1 rule: less_induct) 61 | case (less t\<^sub>1) 62 | 63 | hence hyp: "\ t\<^sub>1'. \ t\<^sub>1' < t\<^sub>1; proposed t\<^sub>1'; t\<^sub>2 \ t\<^sub>1' \ \ v t\<^sub>1' = v t\<^sub>2" 64 | using le_imp_less_or_eq by blast 65 | 66 | from `proposed t\<^sub>1` obtain q\<^sub>1 t\<^sub>1' where 67 | q\<^sub>1_quorum: "q\<^sub>1 \ Q" and 68 | q\<^sub>1_promised: "\n. n \ q\<^sub>1 \ promised n t\<^sub>1" and 69 | q\<^sub>1_value: "prevAccepted t\<^sub>1 q\<^sub>1 = {} \ (v t\<^sub>1 = v t\<^sub>1' \ maxTerm (prevAccepted t\<^sub>1 q\<^sub>1) \ t\<^sub>1' \ proposed t\<^sub>1' \ t\<^sub>1' < t\<^sub>1)" 70 | by (meson proposed) 71 | 72 | from `committed t\<^sub>2` obtain q\<^sub>2 where 73 | q\<^sub>2_quorum: "q\<^sub>2 \ Q" and 74 | q\<^sub>2_accepted: "\n. n \ q\<^sub>2 \ accepted n t\<^sub>2" 75 | using committed by force 76 | 77 | have "q\<^sub>1 \ q\<^sub>2 \ {}" 78 | using Q_intersects intersects_def less.prems q\<^sub>1_quorum q\<^sub>2_quorum by auto 79 | 80 | then obtain n where n\<^sub>1: "n \ q\<^sub>1" and n\<^sub>2: "n \ q\<^sub>2" by auto 81 | 82 | from n\<^sub>1 q\<^sub>1_promised have "promised n t\<^sub>1" by simp 83 | moreover from n\<^sub>2 q\<^sub>2_accepted have "accepted n t\<^sub>2" by simp 84 | ultimately obtain t\<^sub>2' where t\<^sub>2': "promised\<^sub>b n t\<^sub>1 t\<^sub>2'" 85 | using less.prems(3) promised\<^sub>f promised_def by blast 86 | 87 | have q\<^sub>1_value: "v t\<^sub>1 = v t\<^sub>1'" "maxTerm (prevAccepted t\<^sub>1 q\<^sub>1) \ t\<^sub>1'" "proposed t\<^sub>1'" "t\<^sub>1' < t\<^sub>1" 88 | using n\<^sub>1 prevAccepted_def q\<^sub>1_value t\<^sub>2' by auto 89 | 90 | note `v t\<^sub>1 = v t\<^sub>1'` 91 | also have "v t\<^sub>1' = v t\<^sub>2" 92 | proof (intro hyp) 93 | have p: "maxTerm (prevAccepted t\<^sub>1 q\<^sub>1) \ prevAccepted t\<^sub>1 q\<^sub>1" 94 | apply (intro maxTerm_mem prevAccepted_finite) 95 | using n\<^sub>1 prevAccepted_def t\<^sub>2' by auto 96 | 97 | show "t\<^sub>1' < t\<^sub>1" "proposed t\<^sub>1'" using q\<^sub>1_value by simp_all 98 | 99 | have "t\<^sub>2 \ t\<^sub>2'" 100 | by (meson \accepted n t\<^sub>2\ less.prems(3) not_le promised\<^sub>b_max t\<^sub>2') 101 | also have "t\<^sub>2' \ maxTerm (prevAccepted t\<^sub>1 q\<^sub>1)" 102 | using n\<^sub>1 prevAccepted_def t\<^sub>2' prevAccepted_finite by (intro maxTerm_max, auto) 103 | also have "... \ t\<^sub>1'" using q\<^sub>1_value by simp 104 | finally show "t\<^sub>2 \ t\<^sub>1'" . 105 | qed 106 | 107 | finally show ?case . 
108 | qed 109 | 110 | text \From this, it follows that any two committed values are equal as desired.\ 111 | 112 | lemma (in oneSlot) consistent: 113 | assumes "committed t\<^sub>1" and "committed t\<^sub>2" 114 | shows "v t\<^sub>1 = v t\<^sub>2" 115 | using assms by (metis Q_nonempty accepted all_not_in_conv committed not_less_iff_gr_or_eq p2b) 116 | 117 | text \It will be useful later to know the conditions under which a value in a term can be committed, 118 | which is spelled out here:\ 119 | 120 | lemma (in oneSlot) commit: 121 | assumes q_quorum: "q \ Q" 122 | assumes q_accepted: "\n. n \ q \ accepted n t\<^sub>0" 123 | defines "committed' t \ committed t \ t = t\<^sub>0" 124 | shows "oneSlot Q v promised\<^sub>f promised\<^sub>b proposed accepted committed'" 125 | by (smt committed'_def Q_intersects oneSlot_axioms oneSlot_def q_accepted q_quorum) 126 | 127 | end 128 | -------------------------------------------------------------------------------- /cluster/isabelle/Preliminaries.thy: -------------------------------------------------------------------------------- 1 | section \Preliminaries\ 2 | 3 | text \We start with some definitions of the types involved.\ 4 | 5 | theory Preliminaries 6 | imports Main 7 | begin 8 | 9 | subsection \Slots\ 10 | 11 | text \Slots are identified by natural numbers.\ 12 | 13 | type_synonym Slot = nat 14 | 15 | subsection \Terms\ 16 | 17 | text \Terms are identified by natural numbers.\ 18 | 19 | type_synonym Term = nat 20 | 21 | subsubsection \Maximum term of a set\ 22 | 23 | text \A function for finding the maximum term in a set is as follows.\ 24 | 25 | definition maxTerm :: "Term set \ Term" 26 | where "maxTerm S \ THE t. t \ S \ (\ t' \ S. t' \ t)" 27 | 28 | text \It works correctly on finite and nonempty sets as follows:\ 29 | 30 | theorem 31 | fixes S :: "Term set" 32 | assumes finite: "finite S" 33 | shows maxTerm_mem: "S \ {} \ maxTerm S \ S" 34 | and maxTerm_max: "\ t'. t' \ S \ t' \ maxTerm S" 35 | proof - 36 | presume "S \ {}" 37 | with assms 38 | obtain t where t: "t \ S" "\ t'. t' \ S \ t' \ t" 39 | proof (induct arbitrary: thesis) 40 | case empty 41 | then show ?case by simp 42 | next 43 | case (insert t S) 44 | show ?case 45 | proof (cases "S = {}") 46 | case True hence [simp]: "insert t S = {t}" by simp 47 | from insert.prems show ?thesis by simp 48 | next 49 | case False 50 | obtain t' where t': "t' \ S" "\ t'' \ S. t'' \ t'" 51 | by (meson False insert.hyps(3)) 52 | 53 | from t' 54 | show ?thesis 55 | proof (intro insert.prems ballI) 56 | fix t'' assume t'': "t'' \ insert t S" 57 | show "t'' \ (if t \ t' then t' else t)" 58 | proof (cases "t'' = t") 59 | case False 60 | with t'' have "t'' \ S" by simp 61 | with t' have "t'' \ t'" by simp 62 | thus ?thesis by auto 63 | qed simp 64 | qed simp 65 | qed 66 | qed 67 | 68 | from t have "maxTerm S = t" 69 | by (unfold maxTerm_def, intro the_equality, simp_all add: eq_iff) 70 | 71 | with t show "maxTerm S \ S" "\t'. t' \ S \ t' \ maxTerm S" by simp_all 72 | qed auto 73 | 74 | lemma 75 | assumes "\t. 
t \ S \ t \ t'" "finite S" "S \ {}" 76 | shows maxTerm_le: "maxTerm S \ t'" using assms maxTerm_mem by auto 77 | 78 | subsection \Configurations and quorums\ 79 | 80 | text \Nodes are simply identified by a natural number.\ 81 | 82 | datatype Node = Node nat 83 | 84 | definition natOfNode :: "Node \ nat" where "natOfNode node \ case node of Node n \ n" 85 | lemma natOfNode_Node[simp]: "natOfNode (Node n) = n" by (simp add: natOfNode_def) 86 | lemma Node_natOfNode[simp]: "Node (natOfNode n) = n" by (cases n, simp add: natOfNode_def) 87 | lemma natOfNode_inj[simp]: "(natOfNode n\<^sub>1 = natOfNode n\<^sub>2) = (n\<^sub>1 = n\<^sub>2)" by (metis Node_natOfNode) 88 | 89 | text \It is useful to be able to talk about whether sets-of-sets-of nodes mutually intersect or not.\ 90 | 91 | definition intersects :: "Node set set \ Node set set \ bool" (infixl "\" 50) 92 | where "A \ B \ \ a \ A. \ b \ B. a \ b \ {}" 93 | 94 | definition majorities :: "Node set \ Node set set" 95 | where "majorities votingNodes = { q. card votingNodes < card (q \ votingNodes) * 2 }" 96 | 97 | lemma majorities_nonempty: assumes "q \ majorities Q" shows "q \ {}" 98 | using assms by (auto simp add: majorities_def) 99 | 100 | lemma majorities_member: assumes "q \ majorities Q" obtains n where "n \ q" 101 | using majorities_nonempty assms by fastforce 102 | 103 | lemma majorities_intersect: 104 | assumes "finite votingNodes" 105 | shows "majorities votingNodes \ majorities votingNodes" 106 | unfolding intersects_def 107 | proof (intro ballI notI) 108 | fix q\<^sub>1 assume q\<^sub>1: "q\<^sub>1 \ majorities votingNodes" 109 | fix q\<^sub>2 assume q\<^sub>2: "q\<^sub>2 \ majorities votingNodes" 110 | assume disj: "q\<^sub>1 \ q\<^sub>2 = {}" 111 | 112 | have 1: "card ((q\<^sub>1 \ votingNodes) \ (q\<^sub>2 \ votingNodes)) = card (q\<^sub>1 \ votingNodes) + card (q\<^sub>2 \ votingNodes)" 113 | proof (intro card_Un_disjoint) 114 | from assms show "finite (q\<^sub>1 \ votingNodes)" by simp 115 | from assms show "finite (q\<^sub>2 \ votingNodes)" by simp 116 | from disj show "q\<^sub>1 \ votingNodes \ (q\<^sub>2 \ votingNodes) = {}" by auto 117 | qed 118 | 119 | have "card ((q\<^sub>1 \ votingNodes) \ (q\<^sub>2 \ votingNodes)) \ card votingNodes" by (simp add: assms card_mono) 120 | hence 2: "2 * card (q\<^sub>1 \ votingNodes) + 2 * card (q\<^sub>2 \ votingNodes) \ 2 * card votingNodes" by (simp add: 1) 121 | 122 | from q\<^sub>1 q\<^sub>2 have 3: "card votingNodes + card votingNodes < 2 * card (q\<^sub>1 \ votingNodes) + 2 * card (q\<^sub>2 \ votingNodes)" 123 | unfolding majorities_def by auto 124 | 125 | from 2 3 show False by simp 126 | qed 127 | 128 | text \A configuration of the system defines the sets of master-eligible nodes whose votes count when calculating quorums. 129 | The initial configuration of the system is fixed to some arbitrary value.\ 130 | 131 | consts Vs\<^sub>0 :: "Node list" 132 | definition V\<^sub>0 :: "Node set" where "V\<^sub>0 \ set Vs\<^sub>0" 133 | 134 | lemma finite_V\<^sub>0: "finite V\<^sub>0" unfolding V\<^sub>0_def by auto 135 | lemma V\<^sub>0_intersects: "majorities V\<^sub>0 \ majorities V\<^sub>0" using finite_V\<^sub>0 by (intro majorities_intersect) 136 | 137 | subsection \Values\ 138 | 139 | text \The model is a replicated state machine, with transitions that either do nothing, alter 140 | the configuration of the system or set a new \texttt{ClusterState}. 
\texttt{ClusterState} values 141 | are modelled simply as natural numbers.\ 142 | 143 | datatype ClusterState = ClusterState nat 144 | consts CS\<^sub>0 :: ClusterState 145 | 146 | datatype Value 147 | = NoOp 148 | | Reconfigure "Node list" (* update the set of voting nodes. A list rather than a set to force it to be finite *) 149 | | ClusterStateDiff "ClusterState \ ClusterState" (* a ClusterState diff *) 150 | 151 | text \Some useful definitions and lemmas follow.\ 152 | 153 | fun isReconfiguration :: "Value \ bool" 154 | where "isReconfiguration (Reconfigure _) = True" 155 | | "isReconfiguration _ = False" 156 | 157 | fun getConf :: "Value \ Node set" 158 | where "getConf (Reconfigure conf) = set conf" 159 | | "getConf _ = {}" 160 | 161 | lemma getConf_finite: "finite (getConf v)" 162 | by (metis List.finite_set getConf.elims infinite_imp_nonempty) 163 | 164 | lemma getConf_intersects: "majorities (getConf v) \ majorities (getConf v)" 165 | by (simp add: getConf_finite majorities_intersect) 166 | 167 | end 168 | -------------------------------------------------------------------------------- /cluster/isabelle/ROOT: -------------------------------------------------------------------------------- 1 | session "elasticsearch-isabelle" = "HOL" + 2 | options [document = pdf, document_output = "output", document_variants="document:outline=/proof"] 3 | theories [document = false] 4 | (* Foo *) 5 | (* Bar *) 6 | theories 7 | Preliminaries 8 | Implementation 9 | Monadic 10 | OneSlot 11 | Zen 12 | document_files 13 | "root.tex" 14 | -------------------------------------------------------------------------------- /cluster/isabelle/document/root.tex: -------------------------------------------------------------------------------- 1 | \documentclass[11pt,a4paper]{article} 2 | \usepackage{isabelle,isabellesym} 3 | \usepackage{latexsym} 4 | 5 | % further packages required for unusual symbols (see also 6 | % isabellesym.sty), use only when needed 7 | 8 | %\usepackage{amssymb} 9 | %for \, \, \, \, \, \, 10 | %\, \, \, \, \, 11 | %\, \, \ 12 | 13 | %\usepackage{eurosym} 14 | %for \ 15 | 16 | %\usepackage[only,bigsqcap]{stmaryrd} 17 | %for \ 18 | 19 | %\usepackage{eufrak} 20 | %for \ ... \, \ ... \ (also included in amssymb) 21 | 22 | %\usepackage{textcomp} 23 | %for \, \, \, \, \, 24 | %\ 25 | 26 | % this should be the last package used 27 | \usepackage{pdfsetup} 28 | 29 | % urls in roman style, theory text in math-similar italics 30 | \urlstyle{rm} 31 | \isabellestyle{it} 32 | 33 | % for uniform font size 34 | %\renewcommand{\isastyle}{\isastyleminor} 35 | 36 | 37 | \begin{document} 38 | 39 | \title{elasticsearch-isabelle} 40 | \author{David Turner} 41 | \maketitle 42 | 43 | \tableofcontents 44 | 45 | % sane default for proof documents 46 | \parindent 0pt\parskip 0.5ex 47 | 48 | \section{Introduction} 49 | 50 | This is a presentation of an Isabelle/HOL theory that describes, and proves correct, 51 | a protocol for sharing \texttt{ClusterState} updates in Elasticsearch. 
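The session builds the theories \texttt{Preliminaries}, \texttt{Implementation},
\texttt{Monadic}, \texttt{OneSlot} and \texttt{Zen} (see the \texttt{ROOT} file), whose
generated text follows.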
52 | 53 | % generated text of all theories 54 | \input{session} 55 | 56 | % optional bibliography 57 | %\bibliographystyle{abbrv} 58 | %\bibliography{root} 59 | 60 | \end{document} 61 | 62 | %%% Local Variables: 63 | %%% mode: latex 64 | %%% TeX-master: t 65 | %%% End: 66 | -------------------------------------------------------------------------------- /cluster/tla/consensus.tla: -------------------------------------------------------------------------------- 1 | `^\Large\bf 2 | TLA+ Model of an improved Zen consensus algorithm with reconfiguration capabilities ^' 3 | ------------------------------------------------------------------------------------- 4 | 5 | -------------------------------- MODULE consensus ----------------------------------- 6 | \* Imported modules used in this specification 7 | EXTENDS Naturals, FiniteSets, Sequences, TLC 8 | 9 | ---- 10 | 11 | \* `^\Large\bf Constants ^' 12 | 13 | \* The specification first defines the constants of the model, which amount to values or sets of 14 | \* values that are fixed. 15 | 16 | CONSTANTS Values 17 | 18 | \* Set of node ids (all master-eligible nodes) 19 | CONSTANTS Nodes 20 | 21 | \* The constant "Nil" denotes a place-holder for a non-existing value 22 | CONSTANTS Nil 23 | 24 | \* RPC message types 25 | CONSTANTS 26 | Join, \* only request is modeled 27 | PublishRequest, 28 | PublishResponse, 29 | Commit, \* only request is modeled 30 | Catchup \* only response is modeled 31 | 32 | \* Publish request types 33 | CONSTANTS 34 | Reconfigure, 35 | ApplyCSDiff 36 | 37 | ---- 38 | 39 | \* `^\Large\bf Variables ^' 40 | 41 | \* The following describes the variable state of the model. 42 | 43 | \* Set of requests and responses sent between nodes. 44 | VARIABLE messages 45 | 46 | \* node state (map from node id to state) 47 | VARIABLE firstUncommittedSlot 48 | VARIABLE currentTerm 49 | VARIABLE currentConfiguration 50 | VARIABLE currentClusterState 51 | VARIABLE lastAcceptedTerm 52 | VARIABLE lastAcceptedValue 53 | VARIABLE joinVotes 54 | VARIABLE electionWon 55 | VARIABLE publishPermitted 56 | VARIABLE publishVotes 57 | 58 | ---- 59 | 60 | \* set of valid configurations (i.e. 
the set of all non-empty subsets of Nodes) 61 | ValidConfigs == SUBSET(Nodes) \ {{}} 62 | 63 | \* quorums correspond to majority of votes in a config 64 | IsQuorum(votes, config) == Cardinality(votes \cap config) * 2 > Cardinality(config) 65 | 66 | \* checks whether two configurations only have intersecting quorums 67 | IntersectingQuorums(config1, config2) == 68 | /\ \lnot IsQuorum(config1 \ config2, config1) 69 | /\ \lnot IsQuorum(config2 \ config1, config2) 70 | 71 | \* initial model state 72 | Init == /\ messages = {} 73 | /\ firstUncommittedSlot = [n \in Nodes |-> 0] 74 | /\ currentTerm = [n \in Nodes |-> 0] 75 | /\ currentConfiguration \in {[n \in Nodes |-> vc] : vc \in ValidConfigs} \* all agree on initial config 76 | /\ currentClusterState \in {[n \in Nodes |-> v] : v \in Values} \* all agree on initial value 77 | /\ lastAcceptedTerm = [n \in Nodes |-> Nil] 78 | /\ lastAcceptedValue = [n \in Nodes |-> Nil] 79 | /\ joinVotes = [n \in Nodes |-> {}] 80 | /\ electionWon = [n \in Nodes |-> FALSE] 81 | /\ publishPermitted = [n \in Nodes |-> FALSE] 82 | /\ publishVotes = [n \in Nodes |-> {}] 83 | 84 | \* Send join request from node n to node nm for term t 85 | HandleStartJoin(n, nm, t) == 86 | /\ t > currentTerm[n] 87 | /\ LET 88 | joinRequest == [method |-> Join, 89 | source |-> n, 90 | dest |-> nm, 91 | slot |-> firstUncommittedSlot[n], 92 | term |-> t, 93 | laTerm |-> lastAcceptedTerm[n]] 94 | IN 95 | /\ currentTerm' = [currentTerm EXCEPT ![n] = t] 96 | /\ publishPermitted' = [publishPermitted EXCEPT ![n] = TRUE] 97 | /\ electionWon' = [electionWon EXCEPT ![n] = FALSE] 98 | /\ joinVotes' = [joinVotes EXCEPT ![n] = {}] 99 | /\ publishVotes' = [publishVotes EXCEPT ![n] = {}] 100 | /\ messages' = messages \cup { joinRequest } 101 | /\ UNCHANGED <> 103 | 104 | \* node n handles a join request and checks if it has received enough joins (= votes) 105 | \* for its term to be elected as master 106 | HandleJoinRequest(n, m) == 107 | /\ m.method = Join 108 | /\ m.term = currentTerm[n] 109 | /\ \/ /\ m.slot < firstUncommittedSlot[n] 110 | \/ /\ m.slot = firstUncommittedSlot[n] 111 | /\ (m.laTerm /= Nil => lastAcceptedTerm[n] /= Nil /\ m.laTerm <= lastAcceptedTerm[n]) 112 | /\ joinVotes' = [joinVotes EXCEPT ![n] = @ \cup { m.source }] 113 | /\ electionWon' = [electionWon EXCEPT ![n] = IsQuorum(joinVotes'[n], currentConfiguration[n])] 114 | /\ IF electionWon'[n] /\ publishPermitted[n] /\ lastAcceptedTerm[n] /= Nil 115 | THEN LET publishRequests == { [method |-> PublishRequest, 116 | source |-> n, 117 | dest |-> ns, 118 | term |-> currentTerm[n], 119 | slot |-> firstUncommittedSlot[n], 120 | value |-> lastAcceptedValue[n]] : ns \in Nodes } 121 | IN 122 | /\ messages' = messages \cup publishRequests 123 | /\ publishPermitted' = [publishPermitted EXCEPT ![n] = FALSE] 124 | ELSE 125 | /\ UNCHANGED <> 126 | /\ UNCHANGED <> 128 | 129 | \* client causes a cluster state change v 130 | ClientRequest(n, v) == 131 | /\ electionWon[n] 132 | /\ publishPermitted[n] 133 | /\ LET 134 | publishRequests == { [method |-> PublishRequest, 135 | source |-> n, 136 | dest |-> ns, 137 | term |-> currentTerm[n], 138 | slot |-> firstUncommittedSlot[n], 139 | value |-> [type |-> ApplyCSDiff, 140 | val |-> (currentClusterState[n] :> v)] 141 | ] : ns \in Nodes } 142 | IN 143 | /\ publishPermitted' = [publishPermitted EXCEPT ![n] = FALSE] 144 | /\ messages' = messages \cup publishRequests 145 | /\ UNCHANGED <> 147 | 148 | \* change the set of voters 149 | ChangeVoters(n, vs) == 150 | /\ electionWon[n] 151 | /\ 
publishPermitted[n] 152 | /\ LET 153 | publishRequests == { [method |-> PublishRequest, 154 | source |-> n, 155 | dest |-> ns, 156 | term |-> currentTerm[n], 157 | slot |-> firstUncommittedSlot[n], 158 | value |-> [type |-> Reconfigure, val |-> vs]] : ns \in Nodes } 159 | IN 160 | /\ publishPermitted' = [publishPermitted EXCEPT ![n] = FALSE] 161 | /\ messages' = messages \cup publishRequests 162 | /\ UNCHANGED <> 164 | 165 | \* handle publish request m on node n 166 | HandlePublishRequest(n, m) == 167 | /\ m.method = PublishRequest 168 | /\ m.slot = firstUncommittedSlot[n] 169 | /\ m.term = currentTerm[n] 170 | /\ lastAcceptedTerm' = [lastAcceptedTerm EXCEPT ![n] = m.term] 171 | /\ lastAcceptedValue' = [lastAcceptedValue EXCEPT ![n] = m.value] 172 | /\ LET 173 | response == [method |-> PublishResponse, 174 | source |-> n, 175 | dest |-> m.source, 176 | success |-> TRUE, 177 | term |-> m.term, 178 | slot |-> m.slot] 179 | IN 180 | /\ messages' = messages \cup {response} 181 | /\ UNCHANGED <> 183 | 184 | \* node n commits a change 185 | HandlePublishResponse(n, m) == 186 | /\ m.method = PublishResponse 187 | /\ m.slot = firstUncommittedSlot[n] 188 | /\ m.term = currentTerm[n] 189 | /\ publishVotes' = [publishVotes EXCEPT ![n] = @ \cup {m.source}] 190 | /\ IF IsQuorum(publishVotes'[n], currentConfiguration[n]) 191 | THEN 192 | LET 193 | commitRequests == { [method |-> Commit, 194 | source |-> n, 195 | dest |-> ns, 196 | term |-> currentTerm[n], 197 | slot |-> firstUncommittedSlot[n]] : ns \in Nodes } 198 | IN 199 | /\ messages' = messages \cup commitRequests 200 | ELSE 201 | UNCHANGED <> 202 | /\ UNCHANGED <> 205 | 206 | \* apply committed change to node n 207 | HandleCommitRequest(n, m) == 208 | /\ m.method = Commit 209 | /\ m.slot = firstUncommittedSlot[n] 210 | /\ m.term = lastAcceptedTerm[n] 211 | /\ firstUncommittedSlot' = [firstUncommittedSlot EXCEPT ![n] = @ + 1] 212 | /\ lastAcceptedTerm' = [lastAcceptedTerm EXCEPT ![n] = Nil] 213 | /\ lastAcceptedValue' = [lastAcceptedValue EXCEPT ![n] = Nil] 214 | /\ publishPermitted' = [publishPermitted EXCEPT ![n] = TRUE] 215 | /\ publishVotes' = [publishVotes EXCEPT ![n] = {}] 216 | /\ IF lastAcceptedValue[n].type = Reconfigure THEN 217 | /\ currentConfiguration' = [currentConfiguration EXCEPT ![n] = lastAcceptedValue[n].val] 218 | /\ electionWon' = [electionWon EXCEPT ![n] = IsQuorum(joinVotes[n], currentConfiguration'[n])] 219 | /\ UNCHANGED <> 220 | ELSE 221 | /\ Assert(lastAcceptedValue[n].type = ApplyCSDiff, "unexpected type") 222 | /\ Assert(DOMAIN(lastAcceptedValue[n].val) = {currentClusterState[n]}, "diff mismatch") 223 | /\ currentClusterState' = [currentClusterState EXCEPT ![n] = lastAcceptedValue[n].val[@]] \* apply diff 224 | /\ UNCHANGED <> 225 | /\ UNCHANGED <> 226 | 227 | \* node n captures current state and sends a catch up message 228 | SendCatchupResponse(n) == 229 | /\ LET 230 | catchupMessage == [method |-> Catchup, 231 | slot |-> firstUncommittedSlot[n], 232 | config |-> currentConfiguration[n], 233 | state |-> currentClusterState[n]] 234 | IN 235 | /\ messages' = messages \cup { catchupMessage } 236 | /\ UNCHANGED <> 239 | 240 | \* node n handles a catchup message 241 | HandleCatchupResponse(n, m) == 242 | /\ m.method = Catchup 243 | /\ m.slot > firstUncommittedSlot[n] 244 | /\ firstUncommittedSlot' = [firstUncommittedSlot EXCEPT ![n] = m.slot] 245 | /\ lastAcceptedTerm' = [lastAcceptedTerm EXCEPT ![n] = Nil] 246 | /\ lastAcceptedValue' = [lastAcceptedValue EXCEPT ![n] = Nil] 247 | /\ publishPermitted' = [publishPermitted 
EXCEPT ![n] = TRUE] 248 | /\ electionWon' = [electionWon EXCEPT ![n] = FALSE] 249 | /\ currentConfiguration' = [currentConfiguration EXCEPT ![n] = m.config] 250 | /\ currentClusterState' = [currentClusterState EXCEPT ![n] = m.state] 251 | /\ joinVotes' = [joinVotes EXCEPT ![n] = {}] 252 | /\ publishVotes' = [publishVotes EXCEPT ![n] = {}] 253 | /\ UNCHANGED <> 254 | 255 | 256 | \* crash/restart node n (loses ephemeral state) 257 | RestartNode(n) == 258 | /\ electionWon' = [electionWon EXCEPT ![n] = FALSE] 259 | /\ publishPermitted' = [publishPermitted EXCEPT ![n] = FALSE] 260 | /\ joinVotes' = [joinVotes EXCEPT ![n] = {}] 261 | /\ publishVotes' = [publishVotes EXCEPT ![n] = {}] 262 | /\ UNCHANGED <> 264 | 265 | \* next-step relation 266 | Next == 267 | \/ \E n, nm \in Nodes : HandleStartJoin(n, nm, currentTerm[n] + 1) 268 | \/ \E m \in messages : HandleJoinRequest(m.dest, m) 269 | \/ \E n \in Nodes : \E v \in Values : ClientRequest(n, v) 270 | \/ \E m \in messages : HandlePublishRequest(m.dest, m) 271 | \/ \E m \in messages : HandlePublishResponse(m.dest, m) 272 | \/ \E m \in messages : HandleCommitRequest(m.dest, m) 273 | \/ \E n \in Nodes : RestartNode(n) 274 | \/ \E n \in Nodes : \E vs \in ValidConfigs : ChangeVoters(n, vs) 275 | \/ \E n \in Nodes : SendCatchupResponse(n) 276 | \/ \E n \in Nodes : \E m \in messages : HandleCatchupResponse(n, m) 277 | 278 | ---- 279 | 280 | \* main invariant: 281 | StateMachineSafety == 282 | \A n1, n2 \in Nodes : 283 | firstUncommittedSlot[n1] = firstUncommittedSlot[n2] => 284 | /\ currentClusterState[n1] = currentClusterState[n2] 285 | /\ currentConfiguration[n1] = currentConfiguration[n2] 286 | 287 | OneMasterPerTerm == 288 | \A n1, n2 \in Nodes : 289 | /\ electionWon[n1] 290 | /\ electionWon[n2] 291 | /\ currentTerm[n1] = currentTerm[n2] 292 | /\ IntersectingQuorums(currentConfiguration[n1], currentConfiguration[n2]) 293 | => n1 = n2 294 | 295 | LogMatching == 296 | \A n1, n2 \in Nodes : 297 | /\ firstUncommittedSlot[n1] = firstUncommittedSlot[n2] 298 | /\ lastAcceptedTerm[n1] = lastAcceptedTerm[n2] 299 | => lastAcceptedValue[n1] = lastAcceptedValue[n2] 300 | 301 | SingleNodeInvariant == 302 | \A n \in Nodes : 303 | /\ (lastAcceptedTerm[n] = Nil) = (lastAcceptedValue[n] = Nil) 304 | /\ lastAcceptedTerm[n] /= Nil => (lastAcceptedTerm[n] <= currentTerm[n]) 305 | /\ electionWon[n] = IsQuorum(joinVotes[n], currentConfiguration[n]) \* cached value is consistent 306 | /\ electionWon[n] /\ publishPermitted[n] => lastAcceptedValue[n] = Nil 307 | 308 | LogMatchingMessages == 309 | \A m1, m2 \in messages: 310 | /\ m1.method = PublishRequest 311 | /\ m2.method = PublishRequest 312 | /\ m1.slot = m2.slot 313 | /\ m1.term = m2.term 314 | => m1.value = m2.value 315 | 316 | SafeCatchupMessages == 317 | \A m1, m2 \in messages: 318 | /\ m1.method = Catchup 319 | /\ m2.method = Catchup 320 | /\ m1.slot = m2.slot 321 | => m1.config = m2.config /\ m1.state = m2.state 322 | 323 | \* State-exploration limits 324 | StateConstraint == 325 | /\ \A n \in Nodes: currentTerm[n] <= 3 326 | /\ \A n \in Nodes: firstUncommittedSlot[n] <= 2 327 | /\ Cardinality(messages) <= 15 328 | 329 | ==================================================================================================== 330 | -------------------------------------------------------------------------------- /cluster/tla/consensus.toolbox/.project: -------------------------------------------------------------------------------- 1 | 2 | 3 | zen 4 | 5 | 6 | 7 | 8 | 9 | toolbox.builder.TLAParserBuilder 10 | 11 | 12 | 13 
| 14 | toolbox.builder.PCalAlgorithmSearchingBuilder 15 | 16 | 17 | 18 | 19 | 20 | toolbox.natures.TLANature 21 | 22 | 23 | 24 | consensus.tla 25 | 1 26 | PARENT-1-PROJECT_LOC/consensus.tla 27 | 28 | 29 | 30 | -------------------------------------------------------------------------------- /cluster/tla/consensus.toolbox/.settings/org.lamport.tla.toolbox.prefs: -------------------------------------------------------------------------------- 1 | ProjectRootFile=PARENT-1-PROJECT_LOC/consensus.tla 2 | eclipse.preferences.version=1 3 | -------------------------------------------------------------------------------- /cluster/tla/consensus.toolbox/consensus___model.launch: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | -------------------------------------------------------------------------------- /data/tla/replication.tla: -------------------------------------------------------------------------------- 1 | `^\Large\bf 2 | TLA+ Model of the Elasticsearch data replication approach ^' 3 | ------------------------------------------------------------------------------------- 4 | 5 | This file provides a formal specification of how data replication will work in future versions of 6 | Elasticsearch. We consider this a work in progress: both the model and the implementation are still evolving 7 | and might differ in substantial ways. 8 | 9 | Introduction 10 | ------------ 11 | 12 | An index, which is a collection of documents, can be divided into multiple pieces, known as shards, 13 | each of which can be stored on different machines. This approach of horizontal scaling enables Elasticsearch 14 | to store much larger indices than could fit on a single machine. To ensure high availability and 15 | scale out read access, shards are usually also replicated onto multiple machines. The main copy 16 | is called the primary; all other copies are simply called replicas. The number of primary shards 17 | for an index is fixed at creation time, which makes it possible to deterministically route requests for specific 18 | documents to the right shard, based on a hash of the document key (called the document "_id"). In the 19 | context of data replication, shards do their work independently of each other. The specification 20 | therefore only considers a single primary shard together with its copies, the replicas. It assumes a 21 | fixed set of nodes, each of which can host either the primary or one of the replicas. Shard allocation 22 | (i.e. which node has which shard) is dynamic and determined by the master, a process in the cluster 23 | that is backed by a consensus module. The TLA+ specification does not model the consensus module; it 24 | assumes that this component is working correctly. It shows, however, how data replication integrates with 25 | the consensus module to achieve its guarantees. 26 | 27 | How data is replicated 28 | ---------------------- 29 | 30 | Clients send requests to an arbitrary node in the cluster. The node then routes the request to the 31 | node in the cluster that has the primary shard for the corresponding document id. The document is 32 | indexed at the primary and then replicated concurrently to the replicas.
The replicas index the 33 | document and confirm successful replication to the primary. The primary then acknowledges successful 34 | replication to the client. 35 | 36 | What's covered / not covered in this model 37 | ------------------------------------------ 38 | 39 | Failures covered by the model: 40 | 41 | - node crashes 42 | 43 | - disconnects between nodes on a per-request basis 44 | 45 | Also covered: 46 | 47 | - cluster state batching / asynchronous application (each node applies the cluster state, which is 48 | the state backed by the consensus module, at different points in time) 49 | 50 | - network delays: messages can arrive out-of-order and be delayed 51 | 52 | Limitations of the model: 53 | 54 | - shard initialization / recovery is not modeled: the model initially assumes shards are already started. 55 | When a shard fails, it is not reallocated / reassigned to another node but stays unassigned. 56 | When a primary shard fails, a random replica is promoted to primary (if a replica exists). 57 | 58 | - only the transaction log is modeled. The Lucene store, as an optimistic consumer of the transaction log, 59 | is not modeled. 60 | 61 | - adding nodes to the cluster is not modeled, whereas in Elasticsearch, nodes can dynamically 62 | be added to the cluster and those nodes will share in the hosting of shard data (shard data is 63 | moved to the new node through the process of recovery, mentioned above, which is also not modeled). 64 | 65 | Other differences between the model and the current Java implementation: 66 | 67 | - the Java implementation uses zero-based numbering for sequence numbers whereas the TLA+ model 68 | starts from one, as this is the natural way to access the first element of a sequence in TLA+. 69 | 70 | - ... 71 | 72 | 73 | ------------------------------ MODULE replication --------------------------------- 74 | 75 | \* Imported modules used in this specification 76 | EXTENDS Naturals, FiniteSets, Sequences, TLC 77 | 78 | ---- 79 | 80 | \* `^\Large\bf Constants ^' 81 | 82 | \* The specification first defines the constants of the model, which amount to values or sets of 83 | \* values that are fixed. 84 | 85 | \* Set of node ids uniquely representing each data node in the cluster 86 | CONSTANTS Nodes 87 | 88 | \* Set of possible document ids (i.e. values for "_id" field) 89 | CONSTANTS DocumentIds 90 | 91 | (* Communication between the nodes is done using an RPC-like mechanism. 92 | For each request there is an associated response. In-flight requests / responses are modeled 93 | using messages, which are represented in TLA+ as record types. To distinguish requests from 94 | responses, the message record type contains a field "request" that denotes whether the message is 95 | a request or a response (takes as value one of {TRUE, FALSE}). It also contains the node id of 96 | the sender and receiver of the call (in the fields "source" and "dest"). The procedure that is 97 | called is found under the field "method". 98 | 99 | For example, sending a replication request from node n1 to n2 would yield the following message: 100 | [request |-> TRUE, 101 | method |-> Replication, 102 | source |-> n1, 103 | dest |-> n2] 104 | 105 | Responses also have a field "success" which indicates whether the call was successful 106 | or not (one of {TRUE, FALSE}). 107 | 108 | The model currently supports two RPC methods, one to replicate data from the primary to the 109 | replicas and another one to trim the translog (explained later).
The reroute logic for rerouting 110 | requests to the primary is not modeled as it has no impact on consistency/durability guarantees. 111 | *) 112 | CONSTANTS Replication, TrimTranslog 113 | 114 | (* Shard allocation is determined by the master and broadcasted to the data nodes in the form of a 115 | routing table, which is a map from node id to one of {Primary, Replica, Unassigned}, denoting 116 | whether the respective node has the primary shard, a replica shard or no shard assigned at all. 117 | *) 118 | CONSTANTS Primary, Replica, Unassigned 119 | 120 | \* The constant "Nil" denotes a non-existing value (e.g. in transaction log) or a place-holder for 121 | \* a value that is to be computed. 122 | CONSTANTS Nil 123 | 124 | ---- 125 | 126 | \* `^\Large\bf Variables ^' 127 | 128 | \* The following describes the variable state of the model. 129 | 130 | \* Set of in-flight requests and responses sent between data nodes. 131 | VARIABLE messages 132 | 133 | (* Beside managing and broadcasting the routing table, the master also tracks if 134 | a primary failed and/or a replica was promoted to primary, incrementing a number called the 135 | "primary term" whenever this happens. Each new primary operates under a new term, allowing nodes 136 | to reject replication requests that come from a primary that has been demoted by the master. 137 | The routing table and primary term together form the cluster state, which is simply a record type 138 | containing both: 139 | 140 | [routingTable |-> 141 | [n1 |-> Primary, 142 | n2 |-> Replica, 143 | n3 |-> Unassigned], 144 | primaryTerm |-> 1] 145 | 146 | The following variable represents the current cluster state on the master, which is not 147 | explicitly modeled as a node. 148 | *) 149 | VARIABLE clusterStateOnMaster 150 | 151 | \* For simplicity we assume that each client (index) request uses a new unique value to be indexed. 152 | \* This is just a natural number incremented on each client operation. 153 | VARIABLE nextClientValue 154 | 155 | \* The set of (acknowledged) client responses. Stored in a variable so that we can make assertions 156 | \* about successfully acknowledged requests (e.g. that they've been successfully stored). 157 | VARIABLE clientResponses 158 | 159 | \* To improve readibility of the specification, the following placeholder clientVars can be used 160 | \* instead of explicitly writing the two variables on the right hand side. 161 | clientVars == <> 162 | 163 | (* After indexing a document into the primary, it is replicated concurrently to the replicas. 164 | Writes are only acknowledged to the client after all replicas have acknowledged their writes to 165 | the primary. To correlate acknowledgements from multiple replicas for the same write on the 166 | primary, we use a unique request id that is shared by the concurrent replication requests going 167 | out to the replicas for each specific indexing operation on the primary. The responses carry the 168 | same request id as the requests they were originating from. 169 | 170 | This is just a natural number denoting the next available request id. It is incremented whenever 171 | a request id is used. 172 | *) 173 | VARIABLE nextRequestId 174 | 175 | 176 | \* The following variables capture state on a per-node basis (maps with domain nodes). 177 | 178 | (* Cluster states are determined by the master and broadcasted to nodes. Due to the distributed 179 | nature of the system, they can arrive and be applied at different times on the nodes. 
ES only 180 | guarantees that an older state is not applied on a node after a new one has been applied on the 181 | same node. It supports batching, however, so cluster states can be skipped if a newer has 182 | already arrived on a node before the old one has been processed on that node. 183 | 184 | Cluster states applied on each node are represented as a map from node id to cluster state that 185 | is currently applied on this node 186 | *) 187 | VARIABLE clusterStateOnNode 188 | 189 | (* The cluster state contains the current term number. Nodes might learn about the highest primary 190 | term number not only through cluster state updates, but also through other node-to-node 191 | communication such as replication requests. They store the most recent information (highest term 192 | they've heard about). The variable "currentTerm" is a map from node id to primary term number, 193 | representing the highest primary term number that is known to the node. 194 | *) 195 | VARIABLE currentTerm 196 | 197 | (* The transaction log is a history of operations. The primary shard determines the order in which 198 | operations occur by assigning consecutive sequence numbers to the operations that are indexed. 199 | The sequence number represents a slot in the transaction log that is occupied by the operation. 200 | When a write on a primary is replicated to replicas, the replication request contains the sequence 201 | number that was assigned to this operation on the primary. The replica then assigns the operation 202 | to the same slot in its transaction log. Due to the concurrent nature of replication, replicas 203 | might fill these slots out-of-order. If the primary crashes or some replication requests don't make 204 | it to the replica, the replica can end up in a state where its transaction log has holes in it 205 | (slots that are not filled while subsequent slots are filled). 206 | 207 | Example of a transaction log on the primary and a replica: 208 | `. 209 | --------------------- 210 | | 1 | 2 | 3 | 4 | 5 | 211 | primary |-------------------| 212 | | x | x | x | x | | 213 | --------------------- 214 | 215 | --------------------- 216 | | 1 | 2 | 3 | 4 | 5 | 217 | replica |-------------------| (request for slot 2 is still in-flight) 218 | | x | | x | x | | 219 | --------------------- 220 | .' 221 | 222 | The transaction log is modeled as a map from node id to map from sequence number to record type 223 | consisting of document id, value to be stored, primary term and "pending confirmation" marker 224 | (more on that later). 225 | *) 226 | VARIABLE tlog 227 | 228 | (* Having a transaction log in place, it is useful to know about the highest slot in the transaction 229 | log where all slots below it have been successfully replicated to all replicas, i.e. the common 230 | history shared by all in-sync shard copies. It is useful because in the case where a primary 231 | fails, a replica which is promoted to primary knows that it has to only worry about being out of sync 232 | with other replicas on slots that are beyond this slot. The primary is in charge of tracking 233 | this information, also called global checkpoint. For this, replica shards share information with the 234 | primary on the highest slot they have filled where all lower slots are filled as well, called the 235 | local checkpoint. 
The primary then establishes the global checkpoint as the minimum of the local 236 | checkpoint value received from all shard copies (including its own local checkpoint) and broadcasts 237 | this information to the replicas. 238 | 239 | The global checkpoint is modeled as a map from node id to sequence number. 240 | *) 241 | VARIABLE globalCheckPoint 242 | 243 | (* The local checkpoint is modeled as a map from node id (node that is doing the tracking) 244 | to node id (node for which the local checkpoint is being tracked) to sequence number. 245 | 246 | Only the primary maintains the local checkpoints from all replicas, but because of the possibility of 247 | the primary changing over time, and in order to separate the state for each node more clearly, we maintain 248 | a node id to local checkpoint map for each node in the cluster. 249 | *) 250 | VARIABLE localCheckPoint 251 | 252 | \* The placeholder "nodeVars" is used as a shorthand for all node variables 253 | nodeVars == <> 254 | 255 | 256 | ---- 257 | 258 | \* `^\Large\bf General helper functions ^' 259 | 260 | \* Return the minimum value from a set, or undefined if the set is empty. 261 | Min(s) == CHOOSE x \in s : \A y \in s : x <= y 262 | 263 | \* Return the maximum value from a set, or undefined if the set is empty. 264 | Max(s) == CHOOSE x \in s : \A y \in s : x >= y 265 | 266 | ---- 267 | 268 | \* `^\Large\bf Helper functions on routing table ^' 269 | 270 | (* Note, in this section, the terms "shard" and "node id" are conflated, because 271 | we are only considering one shard and all its copies, so each shard can be 272 | uniquely identified by the node it resides on. The term "shard" or "node" is chosen 273 | solely based on the context of what is being explained or specified. 274 | *) 275 | 276 | \* Returns shards that are marked as Primary in routing table 277 | Primaries(routingTable) == {n \in DOMAIN routingTable : routingTable[n] = Primary} 278 | 279 | \* Returns shards that are marked as Replica in routing table 280 | Replicas(routingTable) == {n \in DOMAIN routingTable : routingTable[n] = Replica} 281 | 282 | \* Returns shards that are marked as Primary or Replica in routing table 283 | Assigned(routingTable) == {n \in DOMAIN routingTable : routingTable[n] /= Unassigned} 284 | 285 | \* Determines whether the shard on node n was promoted to primary when a cluster state update occurs 286 | ShardWasPromotedToPrimary(n, incomingRoutingTable, localRoutingTable) == 287 | LET oldPrimaries == Primaries(localRoutingTable) 288 | newPrimaries == Primaries(incomingRoutingTable) 289 | IN /\ n \notin oldPrimaries 290 | /\ n \in newPrimaries 291 | 292 | \* Calculates new cluster state based on shard failure on node n 293 | FailShardOnMaster(n) == 294 | LET rt == clusterStateOnMaster.routingTable 295 | IN 296 | IF rt[n] = Unassigned THEN 297 | UNCHANGED <> 298 | ELSE 299 | \* increase primary term on primary failure 300 | LET newPt == IF rt[n] = Primary THEN 301 | clusterStateOnMaster.primaryTerm + 1 302 | ELSE 303 | clusterStateOnMaster.primaryTerm 304 | IN 305 | IF rt[n] = Primary /\ Cardinality(Replicas(rt)) > 0 THEN 306 | \* promote replica to primary 307 | \E r \in Replicas(rt): 308 | clusterStateOnMaster' = [clusterStateOnMaster EXCEPT 309 | !.routingTable = [rt EXCEPT ![n] = Unassigned, ![r] = Primary], 310 | !.primaryTerm = newPt] 311 | ELSE 312 | clusterStateOnMaster' = [clusterStateOnMaster EXCEPT 313 | !.routingTable[n] = Unassigned, 314 | !.primaryTerm = newPt] 315 | 316 | ---- 317 | 318 | \* `^\Large\bf Helper functions 
for sending/receiving messages ^' 319 | 320 | \* Remove request from the set of messages and add response instead 321 | Reply(response, request) == messages' = {response} \cup (messages \ {request}) 322 | 323 | \* Generate default replication response based on replication request m 324 | \* Copies most of the fields over for convenience (to restore caller information upon return) 325 | DefaultReplicationResponse(m) == [request |-> FALSE, 326 | method |-> m.method, 327 | source |-> m.dest, 328 | dest |-> m.source, 329 | req |-> m.req, 330 | id |-> m.id, 331 | seq |-> m.seq, 332 | rterm |-> m.rterm, 333 | sterm |-> m.sterm, 334 | value |-> m.value, 335 | client |-> m.client, 336 | localCP |-> 0, 337 | success |-> TRUE] 338 | 339 | \* Generate default trim translog response based on trim translog request m 340 | DefaultTrimTranslogResponse(m) == [request |-> FALSE, 341 | method |-> m.method, 342 | source |-> m.dest, 343 | dest |-> m.source, 344 | req |-> m.req, 345 | maxseq |-> m.maxseq, \* trim above this sequence number 346 | term |-> m.term, \* trim entries with term lower than this 347 | success |-> TRUE] 348 | 349 | \* Generate default response based on request m 350 | DefaultResponse(m) == IF m.method = Replication THEN 351 | DefaultReplicationResponse(m) 352 | ELSE 353 | DefaultTrimTranslogResponse(m) 354 | 355 | \* Generate default failure response based on request m 356 | FailedResponse(m) == [DefaultResponse(m) EXCEPT !.success = FALSE] 357 | 358 | ---- 359 | 360 | \* `^\Large\bf Helper functions on translog ^' 361 | 362 | (* When a request comes to a primary, it has to select a slot (the sequence number) to put the 363 | request into. It corresponds to the slot in the transaction log right after the highest slot 364 | that's filled. 365 | The following function yields the highest slot that's filled in the transaction log, or 0 if 366 | no such slot exists. 367 | *) 368 | MaxSeq(ntlog) == Max(DOMAIN ntlog \cup {0}) 369 | 370 | (* `^\bf\large MaxConfirmedSeq ^' 371 | The local checkpoint is defined as the highest slot in the translog, where all lower slots 372 | are filled. It is not only holes in the translog that prevent the local checkpoint from moving foward. 373 | We actually want to prevent the local checkpoint from moving past slots which are filled but marked as 374 | pending confirmation. Pending entries in the translog are entries that appear after the global checkpoint that are not 375 | allowed to contribute to the advancement of the local checkpoint until a resyncing phase happens due to 376 | the primary shard changing nodes. Translog entries thus have a "pc" (pending confirmation) marker 377 | (\in {TRUE, FALSE}) that says whether the local checkpoint can move past them. 378 | This function yields highest sequence number which are not pending confirmation and where 379 | all lower slots are not pending confirmation either. 380 | Yields 0 if no such number exists. 381 | 382 | Examples: (T stands for TRUE, F for FALSE) 383 | `. 384 | --------------------- 385 | | 1 | 2 | 3 | 4 | 5 | 386 | |-------------------| MaxConfirmedSeq = 1 387 | | x | x | x | x | | 388 | pc markers: | F | T | T | F | | 389 | --------------------- 390 | 391 | --------------------- 392 | | 1 | 2 | 3 | 4 | 5 | 393 | |-------------------| MaxConfirmedSeq = 2 394 | | x | x | | x | | 395 | pc markers: | F | F | | F | | 396 | --------------------- 397 | .' 
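As a concrete check (a hedged sketch; entry fields other than "pc" are elided), the first diagram
corresponds to ntlog = (1 :> [pc |-> FALSE]) @@ (2 :> [pc |-> TRUE]) @@ (3 :> [pc |-> TRUE])
@@ (4 :> [pc |-> FALSE]), for which the operator defined just below gives MaxConfirmedSeq(ntlog) = 1:
slot 1 is filled and confirmed, while slot 2, although filled, is still pending confirmation.
In the second diagram the result is 2, because slots 1 and 2 are filled and confirmed and the gap
at slot 3 stops the local checkpoint from advancing any further.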
398 | *) 399 | MaxConfirmedSeq(ntlog) == 400 | LET ConfirmedTlogSlot(i) == i \in DOMAIN ntlog /\ ntlog[i].pc = FALSE 401 | IN CHOOSE i \in (DOMAIN ntlog \cup {0}) : /\ ConfirmedTlogSlot(i+1) = FALSE 402 | /\ \A j \in 1..i : ConfirmedTlogSlot(j) 403 | 404 | (* `^\bf\large MarkPC ^' 405 | Yields translog where all entries at position strictly larger than "globalCP" (representing the 406 | global checkpoint) are marked as "pending confirmation". 407 | 408 | Example: 409 | `. 410 | --------------------- MarkPC --------------------- 411 | | 1 | 2 | 3 | 4 | 5 | -----------> | 1 | 2 | 3 | 4 | 5 | 412 | |-------------------| globalCP = 1 |-------------------| 413 | | x | x | | x | | | x | x | | x | | 414 | | F | F | | F | | | F | T | | T | | 415 | --------------------- --------------------- 416 | .' 417 | *) 418 | MarkPC(ntlog, globalCP) == 419 | [j \in DOMAIN ntlog |-> [ntlog[j] EXCEPT !.pc = IF j > globalCP THEN TRUE ELSE @]] 420 | 421 | (* `^\bf\large FillAndUnmarkPC ^' 422 | Yields translog where all gaps are filled and the pending confirmation marker is set to FALSE 423 | for values at position strictly larger than "globalCP" representing the global checkpoint 424 | 425 | Example: 426 | `. 427 | --------------------- FillAndUnmarkPC --------------------- 428 | | 1 | 2 | 3 | 4 | 5 | -----------> | 1 | 2 | 3 | 4 | 5 | 429 | |-------------------| globalCP = 1 |-------------------| 430 | | x | x | | x | | | x | x | x | x | | 431 | | T | T | | F | | | T | F | F | F | | 432 | --------------------- --------------------- 433 | .' 434 | *) 435 | FillAndUnmarkPC(ntlog, storedTerm, globalCP) == 436 | [j \in 1..Max(DOMAIN ntlog \cup {0}) |-> 437 | IF j > globalCP THEN 438 | IF j \in DOMAIN ntlog THEN 439 | [ntlog[j] EXCEPT !.pc = FALSE] 440 | ELSE 441 | [id |-> Nil, 442 | term |-> storedTerm, 443 | value |-> Nil, 444 | pc |-> FALSE] 445 | ELSE 446 | ntlog[j]] 447 | 448 | (* `^\bf\large TrimTlog ^' 449 | Trim elements from translog with position strictly greater than maxseq and 450 | term strictly lower than minterm. 451 | 452 | Example: 453 | `. 454 | --------------------- TrimTlog --------------------- 455 | | 1 | 2 | 3 | 4 | 5 | -----------> | 1 | 2 | 3 | 4 | 5 | 456 | |-------------------| maxseq = 2 |-------------------| 457 | | x | x | x | x | x | minterm = 2 | x | x | x | | x | 458 | | T | T | T | F | T | | T | T | T | | F | 459 | terms: | 1 | 1 | 2 | 1 | 2 | | 1 | 1 | 2 | | 2 | 460 | --------------------- --------------------- 461 | .' 
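Concretely (a hedged sketch matching the diagram above): with entries in slots 1..5 carrying terms
1, 1, 2, 1 and 2, trimming with maxseq = 2 and minterm = 2 keeps exactly the slots
{i \in DOMAIN ntlog : i <= 2 \/ ntlog[i].term >= 2} = {1, 2, 3, 5}; only slot 4 is dropped, since it
lies above maxseq and its term 1 is below minterm.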
462 | *) 463 | TrimTlog(ntlog, maxseq, minterm) == 464 | [j \in {i \in DOMAIN ntlog : i <= maxseq \/ ntlog[i].term >= minterm} |-> ntlog[j]] 465 | 466 | ---- 467 | 468 | \* `^\Large\bf Initial states ^' 469 | 470 | \* All possible routing tables where there is one primary 471 | RoutingTablesWithPrimary == 472 | UNION { 473 | { 474 | [n \in {pn} |-> Primary] @@ \* pn has the primary 475 | [n \in rs |-> Replica] @@ \* rs is the subset of nodes having replicas 476 | [n \in ((Nodes \ rs) \ {pn}) |-> Unassigned] \* remaining nodes have unassigned shards 477 | : rs \in SUBSET (Nodes \ {pn}) 478 | } : pn \in Nodes 479 | } 480 | 481 | \* Possible initial routing tables are those which have a primary or where all shards are unassigned 482 | InitialRoutingTables == RoutingTablesWithPrimary \cup {[n \in Nodes |-> Unassigned]} 483 | 484 | \* The following constant denotes the set of possible initial cluster states that are to be 485 | \* considered for exploring the model, containing cluster states such as 486 | \* [ routingTable |-> [n1 |-> Primary, n2 |-> Replica, n3 |-> Replica], primaryTerm |-> 1 ] 487 | InitialClusterStates == { [routingTable |-> rt, primaryTerm |-> 1] : rt \in InitialRoutingTables } 488 | 489 | Init == /\ clusterStateOnMaster \in InitialClusterStates 490 | /\ messages = {} 491 | /\ nextClientValue = 1 492 | /\ clientResponses = {} 493 | /\ nextRequestId = 1 494 | /\ tlog = [n \in Nodes |-> << >>] 495 | /\ localCheckPoint = [n1 \in Nodes |-> [n2 \in Nodes |-> 0]] 496 | /\ globalCheckPoint = [n \in Nodes |-> 0] 497 | /\ clusterStateOnNode = [n \in Nodes |-> clusterStateOnMaster] 498 | /\ currentTerm = [n \in Nodes |-> clusterStateOnMaster.primaryTerm] 499 | 500 | ---- 501 | 502 | \* `^\Large\bf Next-step relations ^' 503 | 504 | \* Index request arrives on node n with document id docId 505 | ClientRequest(n, docId) == 506 | /\ clusterStateOnNode[n].routingTable[n] = Primary \* node believes itself to be the primary 507 | /\ LET 508 | replicas == Replicas(clusterStateOnNode[n].routingTable) 509 | primaryTerm == currentTerm[n] 510 | tlogEntry == [id |-> docId, 511 | term |-> primaryTerm, 512 | value |-> nextClientValue, 513 | pc |-> FALSE] 514 | seq == MaxSeq(tlog[n]) + 1 515 | \* create replication requests for each replica that the primary knows about 516 | replRequests == {([request |-> TRUE, 517 | method |-> Replication, 518 | source |-> n, 519 | dest |-> rn, 520 | req |-> nextRequestId, 521 | id |-> docId, 522 | value |-> nextClientValue, 523 | seq |-> seq, 524 | rterm |-> primaryTerm, \* current term when issuing request 525 | sterm |-> primaryTerm, \* term to be stored (differs for fast resync) 526 | client |-> TRUE, \* it's a replication request initiated by client 527 | globalCP |-> globalCheckPoint[n]]) : rn \in replicas} 528 | IN 529 | \* put entry into translog 530 | /\ tlog' = [tlog EXCEPT ![n] = (seq :> tlogEntry) @@ @] 531 | \* Make sure that each client request uses a unique value 532 | /\ nextClientValue' = nextClientValue + 1 533 | \* set next unique key to use for replication requests so that we can relate responses 534 | /\ nextRequestId' = nextRequestId + 1 535 | \* update local checkpoint 536 | /\ localCheckPoint' = [localCheckPoint EXCEPT ![n][n] = seq] 537 | /\ Assert(localCheckPoint'[n][n] = localCheckPoint[n][n] + 1, "localCheckPoint incremented") 538 | \* send out replication requests 539 | /\ messages' = messages \cup replRequests 540 | /\ IF replicas = {} THEN 541 | \* no replicas, directly acknowledge to the client 542 | /\ clientResponses' = 
clientResponses \cup {[success |-> TRUE, 543 | id |-> docId, 544 | value |-> nextClientValue, 545 | seq |-> seq, 546 | term |-> primaryTerm]} 547 | ELSE 548 | \* replication requests sent out, wait for responses before acking to the client 549 | /\ UNCHANGED <> 550 | /\ UNCHANGED <> 551 | 552 | \* Helper function for marking translog entries as pending confirmation if the incoming term is higher 553 | \* than the current term on node n. 554 | MaybeMarkPC(incomingTerm, n, globalCP) == 555 | IF incomingTerm > currentTerm[n] THEN 556 | \* there is a new primary, so the local checkpoint cannot safely be advanced 557 | \* before the resync is done: move it back to the global checkpoint and add the 558 | \* pending confirmation marker to all entries above the global checkpoint 559 | MarkPC(tlog[n], globalCP) 560 | ELSE 561 | tlog[n] 562 | 563 | (* 564 | `^\bf Note on handling replication requests with higher primary terms^' 565 | If a replica receives a replication request with a primary term greater than its current primary term, 566 | it means that a new primary has been promoted. The `^na\"ive^' handling of this case would be to forget all translog 567 | operations above the global checkpoint, reset the local checkpoint to the global checkpoint, and await replay 568 | of the operations above the global checkpoint from the newly promoted primary. 569 | However, this approach does not work: the newly promoted primary might fail during the resync process, 570 | and if our replica is then promoted to primary, it may miss operations above the global checkpoint, effectively 571 | losing acknowledged writes. 572 | 573 | That is why we preserve existing translog entries above the global checkpoint but mark them as 574 | pending confirmation. During replay of operations, the replica replaces its translog entry with the entry received 575 | from the primary (which can be a noop) and resets the pending confirmation flag.
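For instance (a hedged sketch with hypothetical values): a replica at term 1 with filled slots 1..3
and global checkpoint 1 that receives a replication request for slot 2 at term 2 first marks slots 2
and 3 as pending confirmation (MaybeMarkPC takes the MarkPC(tlog[n], 1) branch) and then overwrites
slot 2 with the incoming entry, whose pc flag is FALSE, i.e. newTlog = (2 :> incomingEntry) @@ MarkPC(tlog[n], 1),
mirroring the newTlog definition in HandleReplicationRequest below. Slot 3 is preserved but pending,
so the recomputed local checkpoint is MaxConfirmedSeq(newTlog) = 2 rather than 3, and slot 3 is only
confirmed again once the new primary's resync replays or trims it.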
576 | *) 577 | \* Replication request arrives on node n with message m 578 | HandleReplicationRequest(n, m) == 579 | /\ m.request = TRUE 580 | /\ m.method = Replication 581 | /\ IF m.rterm < currentTerm[n] THEN 582 | \* don't accept replication requests with lower term than we have 583 | \* lower term means that it's coming from a primary that has since been demoted 584 | /\ Reply(FailedResponse(m), m) 585 | /\ UNCHANGED <> 586 | ELSE 587 | /\ LET 588 | tlogEntry == [id |-> m.id, 589 | term |-> m.sterm, 590 | value |-> m.value, 591 | pc |-> FALSE] 592 | newGlobalCP == Max({m.globalCP, globalCheckPoint[n]}) 593 | \* mark translog entries as pending if higher term and write request into translog 594 | newTlog == (m.seq :> tlogEntry) @@ MaybeMarkPC(m.rterm, n, newGlobalCP) 595 | \* recompute local checkpoint 596 | localCP == MaxConfirmedSeq(newTlog) 597 | IN 598 | /\ tlog' = [tlog EXCEPT ![n] = newTlog] 599 | /\ currentTerm' = [currentTerm EXCEPT ![n] = m.rterm] 600 | /\ globalCheckPoint' = [globalCheckPoint EXCEPT ![n] = newGlobalCP] 601 | /\ Reply([DefaultResponse(m) EXCEPT !.localCP = localCP], m) 602 | /\ UNCHANGED <> 604 | 605 | \* Trim translog request arrives on node n with message m 606 | HandleTrimTranslogRequest(n, m) == 607 | /\ m.request = TRUE 608 | /\ m.method = TrimTranslog 609 | /\ IF m.term < currentTerm[n] THEN 610 | \* don't handle requests with lower term than we have 611 | \* lower term means that it's coming from a primary that has since been demoted 612 | /\ Reply(FailedResponse(m), m) 613 | /\ UNCHANGED <> 614 | ELSE 615 | /\ LET 616 | newGlobalCP == Max({m.globalCP, globalCheckPoint[n]}) 617 | \* mark translog entries as pending if higher term and trim translog 618 | newTlog == TrimTlog(MaybeMarkPC(m.term, n, newGlobalCP), m.maxseq, m.term) 619 | IN 620 | /\ tlog' = [tlog EXCEPT ![n] = newTlog] 621 | /\ globalCheckPoint' = [globalCheckPoint EXCEPT ![n] = newGlobalCP] 622 | /\ currentTerm' = [currentTerm EXCEPT ![n] = m.term] 623 | /\ Reply(DefaultResponse(m), m) 624 | /\ UNCHANGED <> 626 | 627 | \* Helper function for handling replication responses 628 | FinishIfNeeded(m) == 629 | \* check if this is the last response we're waiting for 630 | IF /\ m.client 631 | /\ { ms \in messages : ms.req = m.req } = {m} 632 | \* check if the request has not been failed already to the client 633 | /\ { cr \in clientResponses : cr.success = FALSE /\ cr.req = m.req } = {} THEN 634 | clientResponses' = clientResponses \cup {[success |-> TRUE, 635 | id |-> m.id, 636 | value |-> m.value, 637 | seq |-> m.seq, 638 | term |-> m.rterm]} 639 | ELSE 640 | UNCHANGED <> 641 | 642 | \* Helper function for handling replication responses 643 | FinishAsFailed(m) == 644 | /\ clientResponses' = clientResponses \cup {[success |-> FALSE, 645 | req |-> m.req]} 646 | 647 | \* Replication response arrives on node n from node rn with message m 648 | HandleReplicationResponse(n, rn, m) == 649 | /\ m.request = FALSE 650 | /\ m.method = Replication 651 | \* are we still interested in the response or already marked the overall client request as failed? 652 | /\ IF m.success THEN 653 | \* is it a newer local checkpoint than we have? 654 | /\ IF m.localCP > localCheckPoint[n][rn] /\ 655 | \* is the shard still active on this node? 
656 | clusterStateOnNode[n].routingTable[n] /= Unassigned THEN 657 | LET 658 | newLocalCheckPoint == [localCheckPoint EXCEPT ![n][rn] = m.localCP] 659 | assigned == Assigned(clusterStateOnNode[n].routingTable) 660 | computedGlobalCP == Min({newLocalCheckPoint[n][i] : i \in assigned}) 661 | IN 662 | /\ localCheckPoint' = newLocalCheckPoint 663 | \* also update global checkpoint if necessary 664 | /\ globalCheckPoint' = [globalCheckPoint EXCEPT ![n] = computedGlobalCP] 665 | ELSE 666 | UNCHANGED <> 667 | /\ UNCHANGED <> 668 | /\ FinishIfNeeded(m) 669 | ELSE 670 | \* replication failed, ask master to fail shard 671 | /\ IF m.rterm < clusterStateOnMaster.primaryTerm THEN 672 | \* term outdated, fail itself and don't ack client write 673 | /\ FinishAsFailed(m) 674 | /\ UNCHANGED <> 675 | ELSE 676 | \* fail shard and respond to client 677 | /\ FailShardOnMaster(rn) 678 | /\ FinishIfNeeded(m) 679 | /\ UNCHANGED <> 680 | /\ messages' = messages \ {m} 681 | /\ UNCHANGED <> 683 | 684 | 685 | \* Trim translog response arrives on node n from node rn with message m 686 | HandleTrimTranslogResponse(n, rn, m) == 687 | /\ m.request = FALSE 688 | /\ m.method = TrimTranslog 689 | /\ messages' = messages \ {m} 690 | /\ IF m.success = FALSE /\ m.term >= clusterStateOnMaster.primaryTerm THEN 691 | \* fail shard 692 | FailShardOnMaster(rn) 693 | ELSE 694 | UNCHANGED <> 695 | /\ UNCHANGED <> 696 | 697 | \* Cluster state propagated from master is applied to node n 698 | ApplyClusterStateFromMaster(n) == 699 | /\ clusterStateOnNode[n] /= clusterStateOnMaster 700 | /\ clusterStateOnNode' = [clusterStateOnNode EXCEPT ![n] = clusterStateOnMaster] 701 | /\ IF ShardWasPromotedToPrimary(n, clusterStateOnMaster.routingTable, 702 | clusterStateOnNode[n].routingTable) THEN 703 | \* shard promoted to primary, resync with replicas 704 | LET 705 | ntlog == tlog[n] 706 | globalCP == globalCheckPoint[n] 707 | newTerm == clusterStateOnMaster.primaryTerm 708 | \* fill gaps in tlog and remove pending confirmation marker 709 | newTlog == FillAndUnmarkPC(ntlog, newTerm, globalCP) 710 | replicas == Replicas(clusterStateOnMaster.routingTable) 711 | numReplicas == Cardinality(replicas) 712 | startSeq == globalCP + 1 713 | endSeq == Max((DOMAIN ntlog) \cup {0}) 714 | numDocs == endSeq + 1 - startSeq 715 | \* resend all translog entries above global checkpoint to replicas 716 | replRequests == {([request |-> TRUE, 717 | method |-> Replication, 718 | source |-> n, 719 | dest |-> rn, 720 | req |-> nextRequestId + (i - startSeq), 721 | seq |-> i, 722 | rterm |-> newTerm, \* new term when issuing request 723 | sterm |-> newTlog[i].term, \* stored term for entry 724 | id |-> newTlog[i].id, 725 | value |-> newTlog[i].value, 726 | client |-> FALSE, \* request not initiated by client 727 | globalCP |-> globalCP]) : i \in startSeq..endSeq, rn \in replicas} 728 | \* send trim request to replicas 729 | trimRequests == {[request |-> TRUE, 730 | method |-> TrimTranslog, 731 | source |-> n, 732 | dest |-> rn, 733 | req |-> nextRequestId + numDocs, 734 | maxseq |-> endSeq, 735 | term |-> newTerm, 736 | client |-> FALSE, 737 | globalCP |-> globalCP] : rn \in replicas} 738 | IN 739 | /\ currentTerm' = [currentTerm EXCEPT ![n] = newTerm] 740 | /\ tlog' = [tlog EXCEPT ![n] = newTlog] 741 | /\ localCheckPoint' = [localCheckPoint EXCEPT ![n][n] = MaxConfirmedSeq(newTlog)] 742 | /\ messages' = messages \cup replRequests \cup trimRequests 743 | /\ nextRequestId' = nextRequestId + numDocs + 1 744 | ELSE 745 | /\ UNCHANGED <> 746 | /\ UNCHANGED <> 748 | 749 | 
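(* For illustration (a hedged sketch with hypothetical values): suppose node n1 is the primary, all
   three shard copies are assigned, and n1 currently tracks localCheckPoint[n1] = [n1 |-> 5, n2 |-> 3, n3 |-> 4].
   A successful replication response from n2 carrying localCP = 4 updates this map to
   [n1 |-> 5, n2 |-> 4, n3 |-> 4], and HandleReplicationResponse above then recomputes the global
   checkpoint as Min({5, 4, 4}) = 4: every slot up to 4 is known to be filled on all assigned copies,
   while slot 5 still awaits confirmation from n2 and n3. *)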
750 | \* Fail request message 751 | FailRequestMessage(m) == 752 | /\ m.request = TRUE 753 | /\ Reply(FailedResponse(m), m) 754 | /\ UNCHANGED <> 755 | 756 | \* Fail response message 757 | FailResponseMessage(m) == 758 | /\ m.request = FALSE 759 | /\ m.success = TRUE 760 | /\ Reply([m EXCEPT !.success = FALSE], m) 761 | /\ UNCHANGED <> 762 | 763 | \* Node fault detection on master finds node n to be isolated from the cluster or crashed 764 | NodeFaultDetectionKicksNodeOut(n) == 765 | /\ clusterStateOnMaster.routingTable[n] /= Unassigned \* not already unassigned 766 | /\ FailShardOnMaster(n) 767 | /\ UNCHANGED <> 768 | 769 | \* Defines how the variables may transition. 770 | Next == \/ \E n \in Nodes : \E docId \in DocumentIds : ClientRequest(n, docId) 771 | \/ \E m \in messages : HandleReplicationRequest(m.dest, m) 772 | \/ \E m \in messages : HandleReplicationResponse(m.dest, m.source, m) 773 | \/ \E m \in messages : HandleTrimTranslogRequest(m.dest, m) 774 | \/ \E m \in messages : HandleTrimTranslogResponse(m.dest, m.source, m) 775 | \/ \E m \in messages : FailRequestMessage(m) 776 | \/ \E m \in messages : FailResponseMessage(m) 777 | \/ \E n \in Nodes : ApplyClusterStateFromMaster(n) 778 | \/ \E n \in Nodes : NodeFaultDetectionKicksNodeOut(n) 779 | 780 | ---- 781 | 782 | \* `^\Large\bf Helper functions for making assertions ^' 783 | 784 | \* no active messages 785 | NoActiveMessages == messages = {} 786 | 787 | \* shard that is considered active by the master 788 | ActiveShard(n) == clusterStateOnMaster.routingTable[n] /= Unassigned 789 | 790 | \* cluster state on master has been applied to all nodes that are still supposed to have an active shard 791 | ClusterStateAppliedOnAllNodesWithActiveShards == 792 | \A n \in Nodes : ActiveShard(n) => clusterStateOnNode[n] = clusterStateOnMaster 793 | 794 | \* everything in the translog up to and including slot i 795 | UpToSlot(ntlog, i) == [j \in 1..i |-> ntlog[j]] 796 | 797 | \* copy of translog, where we ignore the pending confirmation marker 798 | ExceptPC(ntlog) == [j \in DOMAIN ntlog |-> [r \in DOMAIN ntlog[j] \ {"pc"} |-> ntlog[j][r] ]] 799 | 800 | \* all shard copies contain same data 801 | AllCopiesSameContents == 802 | \A n1, n2 \in Nodes: 803 | /\ n1 /= n2 804 | /\ ActiveShard(n1) 805 | /\ ActiveShard(n2) 806 | => ExceptPC(tlog[n1]) = ExceptPC(tlog[n2]) 807 | 808 | ---- 809 | 810 | \* `^\Large\bf Main invariants ^' 811 | 812 | \* checks if the translog for all nodes are equivalent up to their global checkpoint, only differing 813 | \* in the safety marker (which can be false sometimes if the global checkpoint on one shard is lower 814 | \* than on another one) 815 | SameTranslogUpToGlobalCheckPoint == 816 | \A n1, n2 \in Nodes: 817 | /\ n1 /= n2 818 | /\ ActiveShard(n1) 819 | /\ ActiveShard(n2) 820 | => ExceptPC(UpToSlot(tlog[n1], globalCheckPoint[n1])) = 821 | ExceptPC(UpToSlot(tlog[n2], globalCheckPoint[n1])) 822 | 823 | \* checks if the translog for all nodes is eventually the same 824 | AllCopiesSameContentsOnQuietDown == 825 | (/\ NoActiveMessages 826 | /\ ClusterStateAppliedOnAllNodesWithActiveShards) 827 | => AllCopiesSameContents 828 | 829 | \* checks if all (acked) responses to client are successfully and correctly stored 830 | AllAckedResponsesStored == 831 | \A r \in clientResponses : \A n \in Nodes : 832 | /\ r.success = TRUE 833 | /\ ActiveShard(n) 834 | => /\ r.seq \in DOMAIN tlog[n] 835 | /\ tlog[n][r.seq].id = r.id 836 | /\ tlog[n][r.seq].value = r.value 837 | /\ tlog[n][r.seq].term = r.term 838 | 839 | \* checks that 
the global checkpoint is the same as or below the local checkpoint on each node 840 | GlobalCheckPointBelowLocalCheckPoints == 841 | \A n \in Nodes : globalCheckPoint[n] <= MaxConfirmedSeq(tlog[n]) 842 | 843 | \* local checkpoint always corresponds to MaxSeq and MaxConfirmedSeq on the primary node 844 | LocalCheckPointMatchesMaxConfirmedSeq == 845 | \A n \in Nodes : clusterStateOnNode[n].routingTable[n] = Primary 846 | => /\ localCheckPoint[n][n] = MaxConfirmedSeq(tlog[n]) 847 | /\ MaxSeq(tlog[n]) = MaxConfirmedSeq(tlog[n]) 848 | 849 | \* routing table is well-formed (has at most one primary, and has no replicas if no primaries) 850 | WellFormedRoutingTable(routingTable) == 851 | /\ Cardinality(Primaries(routingTable)) <= 1 852 | /\ ( Cardinality(Primaries(routingTable)) = 0 853 | => Cardinality(Assigned (routingTable)) = 0) 854 | 855 | StateConstraint == 856 | /\ nextClientValue <= 3 857 | /\ Cardinality(messages) <= 5 858 | /\ 0 < Cardinality(Assigned (clusterStateOnMaster.routingTable)) 859 | 860 | ============================================================================= 861 | -------------------------------------------------------------------------------- /data/tla/replication.toolbox/.project: -------------------------------------------------------------------------------- 1 | 2 | 3 | elastic 4 | 5 | 6 | 7 | 8 | 9 | toolbox.builder.TLAParserBuilder 10 | 11 | 12 | 13 | 14 | toolbox.builder.PCalAlgorithmSearchingBuilder 15 | 16 | 17 | 18 | 19 | 20 | toolbox.natures.TLANature 21 | 22 | 23 | 24 | replication.tla 25 | 1 26 | PARENT-1-PROJECT_LOC/replication.tla 27 | 28 | 29 | 30 | -------------------------------------------------------------------------------- /data/tla/replication.toolbox/.settings/org.lamport.tla.toolbox.prefs: -------------------------------------------------------------------------------- 1 | ProjectRootFile=PARENT-1-PROJECT_LOC/replication.tla 2 | eclipse.preferences.version=1 3 | -------------------------------------------------------------------------------- /data/tla/replication.toolbox/replication___model.launch: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | --------------------------------------------------------------------------------
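The StateConstraint operator above bounds the state space that TLC has to explore. As a rough,
hypothetical sketch only (the constant sets and the choice of invariants may well differ from what
the authors check in the TLA+ Toolbox), a hand-written TLC configuration for replication.tla could
instantiate the CONSTANTS as model values and check the invariants defined above under the state
constraint, for example:

    CONSTANTS
      Nodes = {n1, n2}
      DocumentIds = {d1}
      Replication = Replication
      TrimTranslog = TrimTranslog
      Primary = Primary
      Replica = Replica
      Unassigned = Unassigned
      Nil = Nil
    INIT Init
    NEXT Next
    CONSTRAINT StateConstraint
    INVARIANTS
      AllAckedResponsesStored
      AllCopiesSameContentsOnQuietDown
      SameTranslogUpToGlobalCheckPoint
      GlobalCheckPointBelowLocalCheckPoints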