├── LICENSE ├── README.md ├── config.go ├── lab_res ├── Lab2A.PNG ├── Lab2B.PNG ├── Lab2C.PNG └── Lab2D.PNG ├── persister.go ├── raft.go ├── test_test.go └── util.go /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Holdonbush 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # MIT6.824-Lab2-Raft 2 | An implementation of MIT 6.824 Lab 2 (Raft). 3 | 4 | ## Lab2A 5 | On the master/main/Lab2A/Lab2B/Lab2C/Lab2D branches, run `go test -run 2A -race` 6 | ![Lab2A](./lab_res/Lab2A.PNG) 7 | 8 | ## Lab2B 9 | On the master/main/Lab2B/Lab2C/Lab2D branches, run `go test -run 2B` or `time go test -run 2B` 10 | ![Lab2B](./lab_res/Lab2B.PNG) 11 | 12 | ## Lab2C 13 | On the master/main/Lab2C/Lab2D branches, run `go test -run 2C -race` 14 | ![Lab2C](./lab_res/Lab2C.PNG) 15 | 16 | ## Lab2D 17 | On the master/main/Lab2D branch, run `go test -run 2D` 18 | ![Lab2D](./lab_res/Lab2D.PNG) 19 | -------------------------------------------------------------------------------- /config.go: -------------------------------------------------------------------------------- 1 | package raft 2 | 3 | // 4 | // support for Raft tester. 5 | // 6 | // we will use the original config.go to test your code for grading. 7 | // so, while you can modify this code to help you debug, please 8 | // test with the original before submitting.
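//
// Editor's note (not part of the upstream tester; a hedged sketch only): a test
// built on this harness typically looks like the following, using only helpers
// defined in this file -- make_config, begin, checkOneLeader, one, cleanup, end:
//
//	cfg := make_config(t, 3, false, false) // 3 peers, reliable network, no snapshots
//	defer cfg.cleanup()
//	cfg.begin("Test (2B): example agreement")
//	cfg.checkOneLeader()  // wait until exactly one leader exists
//	cfg.one(100, 3, true) // submit command 100 and wait for all 3 peers to commit it
//	cfg.end()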
9 | // 10 | 11 | import "6.824/labgob" 12 | import "6.824/labrpc" 13 | import "bytes" 14 | import "log" 15 | import "sync" 16 | import "testing" 17 | import "runtime" 18 | import "math/rand" 19 | import crand "crypto/rand" 20 | import "math/big" 21 | import "encoding/base64" 22 | import "time" 23 | import "fmt" 24 | 25 | func randstring(n int) string { 26 | b := make([]byte, 2*n) 27 | crand.Read(b) 28 | s := base64.URLEncoding.EncodeToString(b) 29 | return s[0:n] 30 | } 31 | 32 | func makeSeed() int64 { 33 | max := big.NewInt(int64(1) << 62) 34 | bigx, _ := crand.Int(crand.Reader, max) 35 | x := bigx.Int64() 36 | return x 37 | } 38 | 39 | type config struct { 40 | mu sync.Mutex 41 | t *testing.T 42 | net *labrpc.Network 43 | n int 44 | rafts []*Raft 45 | applyErr []string // from apply channel readers 46 | connected []bool // whether each server is on the net 47 | saved []*Persister 48 | endnames [][]string // the port file names each sends to 49 | logs []map[int]interface{} // copy of each server's committed entries 50 | start time.Time // time at which make_config() was called 51 | // begin()/end() statistics 52 | t0 time.Time // time at which test_test.go called cfg.begin() 53 | rpcs0 int // rpcTotal() at start of test 54 | cmds0 int // number of agreements 55 | bytes0 int64 56 | maxIndex int 57 | maxIndex0 int 58 | } 59 | 60 | var ncpu_once sync.Once 61 | 62 | func make_config(t *testing.T, n int, unreliable bool, snapshot bool) *config { 63 | ncpu_once.Do(func() { 64 | if runtime.NumCPU() < 2 { 65 | fmt.Printf("warning: only one CPU, which may conceal locking bugs\n") 66 | } 67 | rand.Seed(makeSeed()) 68 | }) 69 | runtime.GOMAXPROCS(4) 70 | cfg := &config{} 71 | cfg.t = t 72 | cfg.net = labrpc.MakeNetwork() 73 | cfg.n = n 74 | cfg.applyErr = make([]string, cfg.n) 75 | cfg.rafts = make([]*Raft, cfg.n) 76 | cfg.connected = make([]bool, cfg.n) 77 | cfg.saved = make([]*Persister, cfg.n) 78 | cfg.endnames = make([][]string, cfg.n) 79 | cfg.logs = make([]map[int]interface{}, cfg.n) 80 | cfg.start = time.Now() 81 | 82 | cfg.setunreliable(unreliable) 83 | 84 | cfg.net.LongDelays(true) 85 | 86 | applier := cfg.applier 87 | if snapshot { 88 | applier = cfg.applierSnap 89 | } 90 | // create a full set of Rafts. 91 | for i := 0; i < cfg.n; i++ { 92 | cfg.logs[i] = map[int]interface{}{} 93 | cfg.start1(i, applier) 94 | } 95 | 96 | // connect everyone 97 | for i := 0; i < cfg.n; i++ { 98 | cfg.connect(i) 99 | } 100 | 101 | return cfg 102 | } 103 | 104 | // shut down a Raft server but save its persistent state. 105 | func (cfg *config) crash1(i int) { 106 | cfg.disconnect(i) 107 | cfg.net.DeleteServer(i) // disable client connections to the server. 108 | 109 | cfg.mu.Lock() 110 | defer cfg.mu.Unlock() 111 | 112 | // a fresh persister, in case old instance 113 | // continues to update the Persister. 114 | // but copy old persister's content so that we always 115 | // pass Make() the last persisted state. 
116 | if cfg.saved[i] != nil { 117 | cfg.saved[i] = cfg.saved[i].Copy() 118 | } 119 | 120 | rf := cfg.rafts[i] 121 | if rf != nil { 122 | cfg.mu.Unlock() 123 | rf.Kill() 124 | cfg.mu.Lock() 125 | cfg.rafts[i] = nil 126 | } 127 | 128 | if cfg.saved[i] != nil { 129 | raftlog := cfg.saved[i].ReadRaftState() 130 | snapshot := cfg.saved[i].ReadSnapshot() 131 | cfg.saved[i] = &Persister{} 132 | cfg.saved[i].SaveStateAndSnapshot(raftlog, snapshot) 133 | } 134 | } 135 | 136 | func (cfg *config) checkLogs(i int, m ApplyMsg) (string, bool) { 137 | err_msg := "" 138 | v := m.Command 139 | for j := 0; j < len(cfg.logs); j++ { 140 | if old, oldok := cfg.logs[j][m.CommandIndex]; oldok && old != v { 141 | log.Printf("%v: log %v; server %v\n", i, cfg.logs[i], cfg.logs[j]) 142 | // some server has already committed a different value for this entry! 143 | err_msg = fmt.Sprintf("commit index=%v server=%v %v != server=%v %v", 144 | m.CommandIndex, i, m.Command, j, old) 145 | } 146 | } 147 | _, prevok := cfg.logs[i][m.CommandIndex-1] 148 | cfg.logs[i][m.CommandIndex] = v 149 | if m.CommandIndex > cfg.maxIndex { 150 | cfg.maxIndex = m.CommandIndex 151 | } 152 | return err_msg, prevok 153 | } 154 | 155 | // applier reads message from apply ch and checks that they match the log 156 | // contents 157 | func (cfg *config) applier(i int, applyCh chan ApplyMsg) { 158 | for m := range applyCh { 159 | if m.CommandValid == false { 160 | // ignore other types of ApplyMsg 161 | } else { 162 | cfg.mu.Lock() 163 | err_msg, prevok := cfg.checkLogs(i, m) 164 | cfg.mu.Unlock() 165 | if m.CommandIndex > 1 && prevok == false { 166 | err_msg = fmt.Sprintf("server %v apply out of order %v", i, m.CommandIndex) 167 | } 168 | if err_msg != "" { 169 | log.Fatalf("apply error: %v\n", err_msg) 170 | cfg.applyErr[i] = err_msg 171 | // keep reading after error so that Raft doesn't block 172 | // holding locks... 173 | } 174 | } 175 | } 176 | } 177 | 178 | const SnapShotInterval = 10 179 | 180 | // periodically snapshot raft state 181 | func (cfg *config) applierSnap(i int, applyCh chan ApplyMsg) { 182 | lastApplied := 0 183 | for m := range applyCh { 184 | if m.SnapshotValid { 185 | //DPrintf("Installsnapshot %v %v\n", m.SnapshotIndex, lastApplied) 186 | cfg.mu.Lock() 187 | if cfg.rafts[i].CondInstallSnapshot(m.SnapshotTerm, 188 | m.SnapshotIndex, m.Snapshot) { 189 | cfg.logs[i] = make(map[int]interface{}) 190 | r := bytes.NewBuffer(m.Snapshot) 191 | d := labgob.NewDecoder(r) 192 | var v int 193 | if d.Decode(&v) != nil { 194 | log.Fatalf("decode error\n") 195 | } 196 | cfg.logs[i][m.SnapshotIndex] = v 197 | lastApplied = m.SnapshotIndex 198 | } 199 | cfg.mu.Unlock() 200 | } else if m.CommandValid && m.CommandIndex > lastApplied { 201 | //DPrintf("apply %v lastApplied %v\n", m.CommandIndex, lastApplied) 202 | cfg.mu.Lock() 203 | err_msg, prevok := cfg.checkLogs(i, m) 204 | cfg.mu.Unlock() 205 | if m.CommandIndex > 1 && prevok == false { 206 | err_msg = fmt.Sprintf("server %v apply out of order %v", i, m.CommandIndex) 207 | } 208 | if err_msg != "" { 209 | log.Fatalf("apply error: %v\n", err_msg) 210 | cfg.applyErr[i] = err_msg 211 | // keep reading after error so that Raft doesn't block 212 | // holding locks... 
213 | } 214 | lastApplied = m.CommandIndex 215 | if (m.CommandIndex+1)%SnapShotInterval == 0 { 216 | w := new(bytes.Buffer) 217 | e := labgob.NewEncoder(w) 218 | v := m.Command 219 | e.Encode(v) 220 | cfg.rafts[i].Snapshot(m.CommandIndex, w.Bytes()) 221 | } 222 | } else { 223 | // Ignore other types of ApplyMsg or old 224 | // commands. Old command may never happen, 225 | // depending on the Raft implementation, but 226 | // just in case. 227 | // DPrintf("Ignore: Index %v lastApplied %v\n", m.CommandIndex, lastApplied) 228 | 229 | } 230 | } 231 | } 232 | 233 | // 234 | // start or re-start a Raft. 235 | // if one already exists, "kill" it first. 236 | // allocate new outgoing port file names, and a new 237 | // state persister, to isolate previous instance of 238 | // this server. since we cannot really kill it. 239 | // 240 | func (cfg *config) start1(i int, applier func(int, chan ApplyMsg)) { 241 | cfg.crash1(i) 242 | 243 | // a fresh set of outgoing ClientEnd names. 244 | // so that old crashed instance's ClientEnds can't send. 245 | cfg.endnames[i] = make([]string, cfg.n) 246 | for j := 0; j < cfg.n; j++ { 247 | cfg.endnames[i][j] = randstring(20) 248 | } 249 | 250 | // a fresh set of ClientEnds. 251 | ends := make([]*labrpc.ClientEnd, cfg.n) 252 | for j := 0; j < cfg.n; j++ { 253 | ends[j] = cfg.net.MakeEnd(cfg.endnames[i][j]) 254 | cfg.net.Connect(cfg.endnames[i][j], j) 255 | } 256 | 257 | cfg.mu.Lock() 258 | 259 | // a fresh persister, so old instance doesn't overwrite 260 | // new instance's persisted state. 261 | // but copy old persister's content so that we always 262 | // pass Make() the last persisted state. 263 | if cfg.saved[i] != nil { 264 | cfg.saved[i] = cfg.saved[i].Copy() 265 | } else { 266 | cfg.saved[i] = MakePersister() 267 | } 268 | 269 | cfg.mu.Unlock() 270 | 271 | applyCh := make(chan ApplyMsg) 272 | 273 | rf := Make(ends, i, cfg.saved[i], applyCh) 274 | 275 | cfg.mu.Lock() 276 | cfg.rafts[i] = rf 277 | cfg.mu.Unlock() 278 | 279 | go applier(i, applyCh) 280 | 281 | svc := labrpc.MakeService(rf) 282 | srv := labrpc.MakeServer() 283 | srv.AddService(svc) 284 | cfg.net.AddServer(i, srv) 285 | } 286 | 287 | func (cfg *config) checkTimeout() { 288 | // enforce a two minute real-time limit on each test 289 | if !cfg.t.Failed() && time.Since(cfg.start) > 120*time.Second { 290 | cfg.t.Fatal("test took longer than 120 seconds") 291 | } 292 | } 293 | 294 | func (cfg *config) cleanup() { 295 | for i := 0; i < len(cfg.rafts); i++ { 296 | if cfg.rafts[i] != nil { 297 | cfg.rafts[i].Kill() 298 | } 299 | } 300 | cfg.net.Cleanup() 301 | cfg.checkTimeout() 302 | } 303 | 304 | // attach server i to the net. 305 | func (cfg *config) connect(i int) { 306 | // fmt.Printf("connect(%d)\n", i) 307 | 308 | cfg.connected[i] = true 309 | 310 | // outgoing ClientEnds 311 | for j := 0; j < cfg.n; j++ { 312 | if cfg.connected[j] { 313 | endname := cfg.endnames[i][j] 314 | cfg.net.Enable(endname, true) 315 | } 316 | } 317 | 318 | // incoming ClientEnds 319 | for j := 0; j < cfg.n; j++ { 320 | if cfg.connected[j] { 321 | endname := cfg.endnames[j][i] 322 | cfg.net.Enable(endname, true) 323 | } 324 | } 325 | } 326 | 327 | // detach server i from the net. 
328 | func (cfg *config) disconnect(i int) { 329 | // fmt.Printf("disconnect(%d)\n", i) 330 | 331 | cfg.connected[i] = false 332 | 333 | // outgoing ClientEnds 334 | for j := 0; j < cfg.n; j++ { 335 | if cfg.endnames[i] != nil { 336 | endname := cfg.endnames[i][j] 337 | cfg.net.Enable(endname, false) 338 | } 339 | } 340 | 341 | // incoming ClientEnds 342 | for j := 0; j < cfg.n; j++ { 343 | if cfg.endnames[j] != nil { 344 | endname := cfg.endnames[j][i] 345 | cfg.net.Enable(endname, false) 346 | } 347 | } 348 | } 349 | 350 | func (cfg *config) rpcCount(server int) int { 351 | return cfg.net.GetCount(server) 352 | } 353 | 354 | func (cfg *config) rpcTotal() int { 355 | return cfg.net.GetTotalCount() 356 | } 357 | 358 | func (cfg *config) setunreliable(unrel bool) { 359 | cfg.net.Reliable(!unrel) 360 | } 361 | 362 | func (cfg *config) bytesTotal() int64 { 363 | return cfg.net.GetTotalBytes() 364 | } 365 | 366 | func (cfg *config) setlongreordering(longrel bool) { 367 | cfg.net.LongReordering(longrel) 368 | } 369 | 370 | // check that there's exactly one leader. 371 | // try a few times in case re-elections are needed. 372 | func (cfg *config) checkOneLeader() int { 373 | for iters := 0; iters < 10; iters++ { 374 | ms := 450 + (rand.Int63() % 100) 375 | time.Sleep(time.Duration(ms) * time.Millisecond) 376 | 377 | leaders := make(map[int][]int) 378 | for i := 0; i < cfg.n; i++ { 379 | if cfg.connected[i] { 380 | if term, leader := cfg.rafts[i].GetState(); leader { 381 | leaders[term] = append(leaders[term], i) 382 | } 383 | } 384 | } 385 | 386 | lastTermWithLeader := -1 387 | for term, leaders := range leaders { 388 | if len(leaders) > 1 { 389 | cfg.t.Fatalf("term %d has %d (>1) leaders", term, len(leaders)) 390 | } 391 | if term > lastTermWithLeader { 392 | lastTermWithLeader = term 393 | } 394 | } 395 | 396 | if len(leaders) != 0 { 397 | return leaders[lastTermWithLeader][0] 398 | } 399 | } 400 | cfg.t.Fatalf("expected one leader, got none") 401 | return -1 402 | } 403 | 404 | // check that everyone agrees on the term. 405 | func (cfg *config) checkTerms() int { 406 | term := -1 407 | for i := 0; i < cfg.n; i++ { 408 | if cfg.connected[i] { 409 | xterm, _ := cfg.rafts[i].GetState() 410 | if term == -1 { 411 | term = xterm 412 | } else if term != xterm { 413 | cfg.t.Fatalf("servers disagree on term") 414 | } 415 | } 416 | } 417 | return term 418 | } 419 | 420 | // check that there's no leader 421 | func (cfg *config) checkNoLeader() { 422 | for i := 0; i < cfg.n; i++ { 423 | if cfg.connected[i] { 424 | _, is_leader := cfg.rafts[i].GetState() 425 | if is_leader { 426 | cfg.t.Fatalf("expected no leader, but %v claims to be leader", i) 427 | } 428 | } 429 | } 430 | } 431 | 432 | // how many servers think a log entry is committed? 433 | func (cfg *config) nCommitted(index int) (int, interface{}) { 434 | count := 0 435 | var cmd interface{} = nil 436 | for i := 0; i < len(cfg.rafts); i++ { 437 | if cfg.applyErr[i] != "" { 438 | cfg.t.Fatal(cfg.applyErr[i]) 439 | } 440 | 441 | cfg.mu.Lock() 442 | cmd1, ok := cfg.logs[i][index] 443 | cfg.mu.Unlock() 444 | 445 | if ok { 446 | if count > 0 && cmd != cmd1 { 447 | cfg.t.Fatalf("committed values do not match: index %v, %v, %v\n", 448 | index, cmd, cmd1) 449 | } 450 | count += 1 451 | cmd = cmd1 452 | } 453 | } 454 | return count, cmd 455 | } 456 | 457 | // wait for at least n servers to commit. 458 | // but don't wait forever. 
459 | func (cfg *config) wait(index int, n int, startTerm int) interface{} { 460 | to := 10 * time.Millisecond 461 | for iters := 0; iters < 30; iters++ { 462 | nd, _ := cfg.nCommitted(index) 463 | if nd >= n { 464 | break 465 | } 466 | time.Sleep(to) 467 | if to < time.Second { 468 | to *= 2 469 | } 470 | if startTerm > -1 { 471 | for _, r := range cfg.rafts { 472 | if t, _ := r.GetState(); t > startTerm { 473 | // someone has moved on 474 | // can no longer guarantee that we'll "win" 475 | return -1 476 | } 477 | } 478 | } 479 | } 480 | nd, cmd := cfg.nCommitted(index) 481 | if nd < n { 482 | cfg.t.Fatalf("only %d decided for index %d; wanted %d\n", 483 | nd, index, n) 484 | } 485 | return cmd 486 | } 487 | 488 | // do a complete agreement. 489 | // it might choose the wrong leader initially, 490 | // and have to re-submit after giving up. 491 | // entirely gives up after about 10 seconds. 492 | // indirectly checks that the servers agree on the 493 | // same value, since nCommitted() checks this, 494 | // as do the threads that read from applyCh. 495 | // returns index. 496 | // if retry==true, may submit the command multiple 497 | // times, in case a leader fails just after Start(). 498 | // if retry==false, calls Start() only once, in order 499 | // to simplify the early Lab 2B tests. 500 | func (cfg *config) one(cmd interface{}, expectedServers int, retry bool) int { 501 | t0 := time.Now() 502 | starts := 0 503 | for time.Since(t0).Seconds() < 10 { 504 | // try all the servers, maybe one is the leader. 505 | index := -1 506 | for si := 0; si < cfg.n; si++ { 507 | starts = (starts + 1) % cfg.n 508 | var rf *Raft 509 | cfg.mu.Lock() 510 | if cfg.connected[starts] { 511 | rf = cfg.rafts[starts] 512 | } 513 | cfg.mu.Unlock() 514 | if rf != nil { 515 | index1, _, ok := rf.Start(cmd) 516 | if ok { 517 | index = index1 518 | break 519 | } 520 | } 521 | } 522 | 523 | if index != -1 { 524 | // somebody claimed to be the leader and to have 525 | // submitted our command; wait a while for agreement. 526 | t1 := time.Now() 527 | for time.Since(t1).Seconds() < 2 { 528 | nd, cmd1 := cfg.nCommitted(index) 529 | if nd > 0 && nd >= expectedServers { 530 | // committed 531 | if cmd1 == cmd { 532 | // and it was the command we submitted. 533 | return index 534 | } 535 | } 536 | time.Sleep(20 * time.Millisecond) 537 | } 538 | if retry == false { 539 | cfg.t.Fatalf("one(%v) failed to reach agreement", cmd) 540 | } 541 | } else { 542 | time.Sleep(50 * time.Millisecond) 543 | } 544 | } 545 | cfg.t.Fatalf("one(%v) failed to reach agreement", cmd) 546 | return -1 547 | } 548 | 549 | // start a Test. 550 | // print the Test message. 551 | // e.g. cfg.begin("Test (2B): RPC counts aren't too high") 552 | func (cfg *config) begin(description string) { 553 | fmt.Printf("%s ...\n", description) 554 | cfg.t0 = time.Now() 555 | cfg.rpcs0 = cfg.rpcTotal() 556 | cfg.bytes0 = cfg.bytesTotal() 557 | cfg.cmds0 = 0 558 | cfg.maxIndex0 = cfg.maxIndex 559 | } 560 | 561 | // end a Test -- the fact that we got here means there 562 | // was no failure. 563 | // print the Passed message, 564 | // and some performance numbers. 
565 | func (cfg *config) end() { 566 | cfg.checkTimeout() 567 | if cfg.t.Failed() == false { 568 | cfg.mu.Lock() 569 | t := time.Since(cfg.t0).Seconds() // real time 570 | npeers := cfg.n // number of Raft peers 571 | nrpc := cfg.rpcTotal() - cfg.rpcs0 // number of RPC sends 572 | nbytes := cfg.bytesTotal() - cfg.bytes0 // number of bytes 573 | ncmds := cfg.maxIndex - cfg.maxIndex0 // number of Raft agreements reported 574 | cfg.mu.Unlock() 575 | 576 | fmt.Printf(" ... Passed --") 577 | fmt.Printf(" %4.1f %d %4d %7d %4d\n", t, npeers, nrpc, nbytes, ncmds) 578 | } 579 | } 580 | 581 | // Maximum log size across all servers 582 | func (cfg *config) LogSize() int { 583 | logsize := 0 584 | for i := 0; i < cfg.n; i++ { 585 | n := cfg.saved[i].RaftStateSize() 586 | if n > logsize { 587 | logsize = n 588 | } 589 | } 590 | return logsize 591 | } 592 | -------------------------------------------------------------------------------- /lab_res/Lab2A.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/holdonbush/MIT6.824-Lab2-Raft/c240da0adadfcd639cc7de44ff134da5f47444e4/lab_res/Lab2A.PNG -------------------------------------------------------------------------------- /lab_res/Lab2B.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/holdonbush/MIT6.824-Lab2-Raft/c240da0adadfcd639cc7de44ff134da5f47444e4/lab_res/Lab2B.PNG -------------------------------------------------------------------------------- /lab_res/Lab2C.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/holdonbush/MIT6.824-Lab2-Raft/c240da0adadfcd639cc7de44ff134da5f47444e4/lab_res/Lab2C.PNG -------------------------------------------------------------------------------- /lab_res/Lab2D.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/holdonbush/MIT6.824-Lab2-Raft/c240da0adadfcd639cc7de44ff134da5f47444e4/lab_res/Lab2D.PNG -------------------------------------------------------------------------------- /persister.go: -------------------------------------------------------------------------------- 1 | package raft 2 | 3 | // 4 | // support for Raft and kvraft to save persistent 5 | // Raft state (log &c) and k/v server snapshots. 6 | // 7 | // we will use the original persister.go to test your code for grading. 8 | // so, while you can modify this code to help you debug, please 9 | // test with the original before submitting. 
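//
// Editor's note (illustration only, not part of the original file): raft.go's
// persist()/readPersist() drive this Persister roughly as in this sketch,
// assuming labgob-encoded state exactly as raft.go uses:
//
//	w := new(bytes.Buffer)
//	e := labgob.NewEncoder(w)
//	e.Encode(rf.currentTerm) // plus rf.voteFor, rf.logs
//	rf.persister.SaveRaftState(w.Bytes())
//	// ... and on restart:
//	d := labgob.NewDecoder(bytes.NewBuffer(rf.persister.ReadRaftState()))
//	d.Decode(&rf.currentTerm)
//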
10 | // 11 | 12 | import "sync" 13 | 14 | type Persister struct { 15 | mu sync.Mutex 16 | raftstate []byte 17 | snapshot []byte 18 | } 19 | 20 | func MakePersister() *Persister { 21 | return &Persister{} 22 | } 23 | 24 | func clone(orig []byte) []byte { 25 | x := make([]byte, len(orig)) 26 | copy(x, orig) 27 | return x 28 | } 29 | 30 | func (ps *Persister) Copy() *Persister { 31 | ps.mu.Lock() 32 | defer ps.mu.Unlock() 33 | np := MakePersister() 34 | np.raftstate = ps.raftstate 35 | np.snapshot = ps.snapshot 36 | return np 37 | } 38 | 39 | func (ps *Persister) SaveRaftState(state []byte) { 40 | ps.mu.Lock() 41 | defer ps.mu.Unlock() 42 | ps.raftstate = clone(state) 43 | } 44 | 45 | func (ps *Persister) ReadRaftState() []byte { 46 | ps.mu.Lock() 47 | defer ps.mu.Unlock() 48 | return clone(ps.raftstate) 49 | } 50 | 51 | func (ps *Persister) RaftStateSize() int { 52 | ps.mu.Lock() 53 | defer ps.mu.Unlock() 54 | return len(ps.raftstate) 55 | } 56 | 57 | // Save both Raft state and K/V snapshot as a single atomic action, 58 | // to help avoid them getting out of sync. 59 | func (ps *Persister) SaveStateAndSnapshot(state []byte, snapshot []byte) { 60 | ps.mu.Lock() 61 | defer ps.mu.Unlock() 62 | ps.raftstate = clone(state) 63 | ps.snapshot = clone(snapshot) 64 | } 65 | 66 | func (ps *Persister) ReadSnapshot() []byte { 67 | ps.mu.Lock() 68 | defer ps.mu.Unlock() 69 | return clone(ps.snapshot) 70 | } 71 | 72 | func (ps *Persister) SnapshotSize() int { 73 | ps.mu.Lock() 74 | defer ps.mu.Unlock() 75 | return len(ps.snapshot) 76 | } 77 | -------------------------------------------------------------------------------- /raft.go: -------------------------------------------------------------------------------- 1 | package raft 2 | 3 | // 4 | // this is an outline of the API that raft must expose to 5 | // the service (or tester). see comments below for 6 | // each of these functions for more details. 7 | // 8 | // rf = Make(...) 9 | // create a new Raft server. 10 | // rf.Start(command interface{}) (index, term, isleader) 11 | // start agreement on a new log entry 12 | // rf.GetState() (term, isLeader) 13 | // ask a Raft for its current term, and whether it thinks it is leader 14 | // ApplyMsg 15 | // each time a new entry is committed to the log, each Raft peer 16 | // should send an ApplyMsg to the service (or tester) 17 | // in the same server. 18 | // 19 | 20 | import ( 21 | "6.824/labgob" 22 | "bytes" 23 | "fmt" 24 | "math/rand" 25 | 26 | // "bytes" 27 | "sync" 28 | "sync/atomic" 29 | "time" 30 | 31 | // "6.824/labgob" 32 | "6.824/labrpc" 33 | ) 34 | 35 | 36 | // 37 | // as each Raft peer becomes aware that successive log entries are 38 | // committed, the peer should send an ApplyMsg to the service (or 39 | // tester) on the same server, via the applyCh passed to Make(). set 40 | // CommandValid to true to indicate that the ApplyMsg contains a newly 41 | // committed log entry. 42 | // 43 | // in part 2D you'll want to send other kinds of messages (e.g., 44 | // snapshots) on the applyCh, but set CommandValid to false for these 45 | // other uses. 46 | // 47 | type ApplyMsg struct { 48 | CommandValid bool 49 | Command interface{} 50 | CommandIndex int 51 | 52 | // For 2D: 53 | SnapshotValid bool 54 | Snapshot []byte 55 | SnapshotTerm int 56 | SnapshotIndex int 57 | } 58 | 59 | // 60 | // A Go object implementing a single Raft peer. 
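//
// Editor's note (illustration only): the service-facing API outlined at the top
// of this file (Make, Start, GetState, ApplyMsg) is consumed roughly as in this
// sketch; the apply loop is the service's responsibility, and peers, me,
// persister and command stand for whatever the calling service already has:
//
//	applyCh := make(chan ApplyMsg)
//	rf := Make(peers, me, persister, applyCh)
//	go func() {
//		for msg := range applyCh {
//			if msg.CommandValid {
//				// apply msg.Command at msg.CommandIndex to the state machine
//			}
//		}
//	}()
//	if index, term, isLeader := rf.Start(command); isLeader {
//		// the entry may commit at index in term, but this is not guaranteed
//		_, _ = index, term
//	}
//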
61 | // 62 | type Raft struct { 63 | mu sync.Mutex // Lock to protect shared access to this peer's state 64 | peers []*labrpc.ClientEnd // RPC end points of all peers 65 | persister *Persister // Object to hold this peer's persisted state 66 | me int // this peer's index into peers[] 67 | dead int32 // set by Kill() 68 | 69 | // Your data here (2A, 2B, 2C). 70 | // Look at the paper's Figure 2 for a description of what 71 | // state a Raft server must maintain. 72 | currentTerm int // Server当前的term 73 | voteFor int // Server在选举阶段的投票目标 74 | logs []LogEntry 75 | nextIndexs []int // Leader在发送LogEntry时,对应每个其他Server,开始发送的index 76 | matchIndexs []int 77 | commitIndex int // Server已经commit了的Log index 78 | lastApplied int // Server已经apply了的log index 79 | myStatus Status // Server的状态 80 | 81 | timer *time.Ticker // timer 82 | voteTimeout time.Duration // 选举超时时间,选举超时时间是会变动的,所以定义在Raft结构体中 83 | applyChan chan ApplyMsg // 消息channel 84 | 85 | // 2D 86 | lastIncludeIndex int // snapshot保存的最后log的index 87 | lastIncludeTerm int // snapshot保存的最后log的term 88 | snapshotCmd []byte 89 | } 90 | 91 | // LogEntry 92 | type LogEntry struct { 93 | Term int // LogEntry中记录有log的Term 94 | Cmd interface{} // Log的command 95 | } 96 | 97 | // 定义一个全局心跳超时时间 98 | var HeartBeatTimeout = 120*time.Millisecond 99 | 100 | type Status int64 101 | const ( 102 | Follower Status = iota 103 | Candidate 104 | Leader 105 | ) 106 | 107 | // return currentTerm and whether this server 108 | // believes it is the leader. 109 | func (rf *Raft) GetState() (int, bool) { 110 | 111 | var term int 112 | var isleader bool 113 | // Your code here (2A). 114 | // 获取Server当前的Term和是否是Leader 115 | rf.mu.Lock() 116 | term = rf.currentTerm 117 | isleader = rf.myStatus == Leader 118 | rf.mu.Unlock() 119 | return term, isleader 120 | } 121 | 122 | // 123 | // save Raft's persistent state to stable storage, 124 | // where it can later be retrieved after a crash and restart. 125 | // see paper's Figure 2 for a description of what should be persistent. 126 | // 127 | func (rf *Raft) persist() { 128 | // Your code here (2C). 129 | // Example: 130 | // w := new(bytes.Buffer) 131 | // e := labgob.NewEncoder(w) 132 | // e.Encode(rf.xxx) 133 | // e.Encode(rf.yyy) 134 | // data := w.Bytes() 135 | // rf.persister.SaveRaftState(data) 136 | w := new(bytes.Buffer) 137 | e := labgob.NewEncoder(w) 138 | e.Encode(rf.currentTerm) 139 | e.Encode(rf.voteFor) 140 | e.Encode(rf.logs) 141 | data := w.Bytes() 142 | rf.persister.SaveRaftState(data) 143 | } 144 | 145 | 146 | // 147 | // restore previously persisted state. 148 | // 149 | func (rf *Raft) readPersist(data []byte) { 150 | if data == nil || len(data) < 1 { // bootstrap without any state? 151 | return 152 | } 153 | // Your code here (2C). 154 | // Example: 155 | // r := bytes.NewBuffer(data) 156 | // d := labgob.NewDecoder(r) 157 | // var xxx 158 | // var yyy 159 | // if d.Decode(&xxx) != nil || 160 | // d.Decode(&yyy) != nil { 161 | // error... 162 | // } else { 163 | // rf.xxx = xxx 164 | // rf.yyy = yyy 165 | // } 166 | r := bytes.NewBuffer(data) 167 | d := labgob.NewDecoder(r) 168 | var tmpTerm int 169 | var tmpVoteFor int 170 | var tmplogs []LogEntry 171 | if d.Decode(&tmpTerm) != nil || 172 | d.Decode(&tmpVoteFor) != nil || 173 | d.Decode(&tmplogs) != nil { 174 | fmt.Println("decode error") 175 | } else { 176 | rf.currentTerm = tmpTerm 177 | rf.voteFor = tmpVoteFor 178 | rf.logs = tmplogs 179 | } 180 | } 181 | 182 | 183 | // 184 | // A service wants to switch to snapshot. 
Only do so if Raft hasn't 185 | // gotten more recent info since it communicated the snapshot on applyCh. 186 | // 187 | func (rf *Raft) CondInstallSnapshot(lastIncludedTerm int, lastIncludedIndex int, snapshot []byte) bool { 188 | 189 | // Your code here (2D). 190 | // if the local log already has entries beyond lastIncludedIndex, reject the snapshot and return false 191 | rf.mu.Lock() 192 | if len(rf.logs)+rf.lastIncludeIndex > lastIncludedIndex { 193 | rf.mu.Unlock(); return false // unlock before returning, otherwise this peer deadlocks on its next Lock() 194 | } 195 | 196 | rf.snapshotCmd = snapshot 197 | // the largest local log index is <= the snapshot's last included index, so discard the entire log 198 | rf.logs = []LogEntry{} 199 | rf.lastIncludeTerm = lastIncludedTerm 200 | rf.lastIncludeIndex = lastIncludedIndex 201 | 202 | rf.commitIndex = rf.lastIncludeIndex 203 | rf.lastApplied = rf.commitIndex 204 | rf.mu.Unlock() 205 | return true 206 | } 207 | 208 | // the service says it has created a snapshot that has 209 | // all info up to and including index. this means the 210 | // service no longer needs the log through (and including) 211 | // that index. Raft should now trim its log as much as possible. 212 | func (rf *Raft) Snapshot(index int, snapshot []byte) { 213 | // Your code here (2D). 214 | if rf.killed() { 215 | return 216 | } 217 | pos := index - rf.lastIncludeIndex-1 218 | rf.lastIncludeIndex = index 219 | rf.lastIncludeTerm = rf.logs[pos].Term 220 | rf.logs = rf.logs[pos+1:] 221 | rf.snapshotCmd = snapshot 222 | } 223 | 224 | 225 | // 226 | // example RequestVote RPC arguments structure. 227 | // field names must start with capital letters! 228 | // 229 | type VoteErr int64 230 | const ( 231 | Nil VoteErr = iota // no error during the vote 232 | VoteReqOutofDate // the vote request is out of date 233 | CandidateLogTooOld // the candidate's log is not up to date 234 | VotedThisTerm // already voted in this term 235 | RaftKilled // this Raft instance has been killed 236 | ) 237 | 238 | type RequestVoteArgs struct { 239 | // Your data here (2A, 2B). 240 | Term int 241 | Candidate int 242 | LastLogIndex int // for the election restriction: index of the candidate's last log entry 243 | LastLogTerm int // for the election restriction: term of the candidate's last log entry 244 | } 245 | 246 | // 247 | // example RequestVote RPC reply structure. 248 | // field names must start with capital letters! 249 | // 250 | type RequestVoteReply struct { 251 | // Your data here (2A).
252 | Term int 253 | VoteGranted bool //是否同意投票 254 | VoteErr VoteErr //投票操作错误 255 | } 256 | 257 | type AppendEntriesErr int64 258 | const ( 259 | AppendErr_Nil AppendEntriesErr = iota // Append操作无错误 260 | AppendErr_LogsNotMatch // Append操作log不匹配 261 | AppendErr_ReqOutofDate // Append操作请求过期 262 | AppendErr_ReqRepeat // Append请求重复 263 | AppendErr_Commited // Append的log已经commit 264 | AppendErr_RaftKilled // Raft程序终止 265 | ) 266 | 267 | type AppendEntriesArgs struct { 268 | Term int 269 | LeaderId int //Leader标识 270 | PrevLogIndex int //nextIndex前一个index 271 | PrevLogTerm int //nextindex前一个index处的term 272 | Logs []LogEntry 273 | LeaderCommit int //Leader已经commit了的Log index 274 | LogIndex int 275 | } 276 | 277 | type AppendEntriesReply struct { 278 | Term int 279 | Success bool // Append操作结果 280 | AppendErr AppendEntriesErr // Append操作错误情况 281 | NotMatchIndex int // 当前Term的第一个元素(没有被commit的元素)的index 282 | } 283 | 284 | // snapshot 285 | type InstallSnapshotRequest struct { 286 | Term int 287 | LeaderId int 288 | LastIncludeIndex int 289 | LastIncludeTerm int 290 | //Offset int // Lab2D不要求实现 291 | Data []byte 292 | //Done bool // Lab2D不要求实现 293 | } 294 | 295 | type InstallSnapshotErr int64 296 | const ( 297 | InstallSnapshotErr_Nil InstallSnapshotErr = iota 298 | InstallSnapshotErr_ReqOutofDate 299 | InstallSnapshotErr_OldIndex 300 | ) 301 | type InstallSnapshotResponse struct { 302 | Term int 303 | Err InstallSnapshotErr 304 | } 305 | 306 | // 307 | // example RequestVote RPC handler. 308 | // 309 | // 投票过程 310 | func (rf *Raft) RequestVote(args *RequestVoteArgs, reply *RequestVoteReply) { 311 | // Your code here (2A, 2B). 312 | if rf.killed() { 313 | reply.Term = -1 314 | reply.VoteGranted = false 315 | reply.VoteErr = RaftKilled 316 | return 317 | } 318 | rf.mu.Lock() 319 | if args.Term < rf.currentTerm { // 请求term更小,不投票 320 | reply.Term = rf.currentTerm 321 | reply.VoteGranted = false 322 | reply.VoteErr = VoteReqOutofDate 323 | rf.mu.Unlock() 324 | return 325 | } 326 | 327 | if args.Term > rf.currentTerm { 328 | rf.myStatus = Follower 329 | rf.currentTerm = args.Term 330 | rf.voteFor = -1 331 | } 332 | // 选举限制-参数term小于自身的term 333 | candidateLogTermTooOld :=args.LastLogTerm < rf.lastIncludeTerm || (len(rf.logs) > 0 && args.LastLogTerm < rf.logs[len(rf.logs)-1].Term) // 有log,取最后一条log的term比较 || 无log时参数term小于snapshot的term 334 | // 选举限制-term相等,参数index小于自身index 335 | candidateLogIndexTooOld := (args.LastLogIndex < rf.lastIncludeIndex) || (len(rf.logs) > 0 && args.LastLogTerm == rf.logs[len(rf.logs)-1].Term && args.LastLogIndex < len(rf.logs)+rf.lastIncludeIndex) //有log,取最后一条log的index比较 || 无log时取snapshot的index比较 336 | 337 | // 选举限制 338 | if candidateLogIndexTooOld || candidateLogTermTooOld { 339 | rf.currentTerm = args.Term 340 | reply.Term = args.Term 341 | reply.VoteGranted = false 342 | reply.VoteErr = CandidateLogTooOld 343 | rf.persist() 344 | rf.mu.Unlock() 345 | return 346 | } 347 | 348 | if args.Term == rf.currentTerm { 349 | reply.Term = args.Term 350 | // 已经投过票,且投给了同一人,由于某些原因,之前的resp丢失 351 | if rf.voteFor == args.Candidate { 352 | rf.myStatus = Follower 353 | rf.timer.Reset(rf.voteTimeout) 354 | reply.VoteGranted = true 355 | reply.VoteErr = VotedThisTerm 356 | rf.mu.Unlock() 357 | return 358 | } 359 | // 来自同一Term不同Candidate的请求,忽略 360 | if rf.voteFor != -1 { 361 | reply.VoteGranted = false 362 | reply.VoteErr = VotedThisTerm 363 | rf.mu.Unlock() 364 | return 365 | } 366 | } 367 | 368 | // 可以投票 369 | rf.currentTerm = args.Term 370 | rf.voteFor = args.Candidate 371 | rf.myStatus = Follower 
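// Vote granted: the candidate has been recorded in voteFor, this peer has stepped
// down to follower, and the election timer is reset next so it does not start a
// competing election in this term.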
372 | rf.timer.Reset(rf.voteTimeout) 373 | 374 | reply.Term = rf.currentTerm 375 | reply.VoteGranted = true 376 | reply.VoteErr = Nil 377 | rf.persist() 378 | rf.mu.Unlock() 379 | return 380 | } 381 | 382 | // heartbeat / log-append (AppendEntries RPC handler) 383 | func (rf *Raft) AppendEntries(args *AppendEntriesArgs, reply *AppendEntriesReply) { 384 | if rf.killed() { 385 | reply.Term = -1 386 | reply.AppendErr = AppendErr_RaftKilled 387 | reply.Success = false 388 | return 389 | } 390 | rf.mu.Lock() 391 | // stale request 392 | if args.Term < rf.currentTerm || args.PrevLogIndex < rf.lastIncludeIndex { 393 | reply.Term = rf.currentTerm 394 | reply.Success = false 395 | reply.AppendErr = AppendErr_ReqOutofDate 396 | reply.NotMatchIndex = -1 397 | rf.mu.Unlock() 398 | return 399 | } 400 | 401 | rf.currentTerm = args.Term 402 | rf.voteFor = args.LeaderId 403 | rf.myStatus = Follower 404 | rf.timer.Reset(rf.voteTimeout) 405 | 406 | // logs do not match at PrevLogIndex 407 | if (args.PrevLogIndex != rf.lastIncludeIndex && (args.PrevLogIndex >= len(rf.logs)+rf.lastIncludeIndex+1 || args.PrevLogTerm != rf.logs[args.PrevLogIndex-rf.lastIncludeIndex-1].Term)) || 408 | (args.PrevLogIndex == rf.lastIncludeIndex && args.PrevLogTerm != rf.lastIncludeTerm){ 409 | reply.Term = rf.currentTerm 410 | reply.Success = false 411 | reply.AppendErr = AppendErr_LogsNotMatch 412 | reply.NotMatchIndex = rf.lastApplied + 1 413 | rf.persist() 414 | rf.mu.Unlock() 415 | return 416 | } 417 | 418 | if rf.lastApplied > args.PrevLogIndex { 419 | reply.Term = rf.currentTerm 420 | reply.Success = false 421 | reply.AppendErr = AppendErr_Commited 422 | reply.NotMatchIndex = rf.lastApplied+1 423 | rf.persist() 424 | rf.mu.Unlock() 425 | return 426 | } 427 | 428 | // handle the leader's entries 429 | if args.Logs != nil { 430 | rf.logs = rf.logs[:args.PrevLogIndex-rf.lastIncludeIndex] 431 | rf.logs = append(rf.logs, args.Logs...)
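// The local log's suffix after PrevLogIndex has been replaced with the leader's
// entries, so it now matches the leader's log up through args.LogIndex.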
432 | } 433 | for rf.lastApplied < args.LeaderCommit { 434 | rf.lastApplied++ 435 | applyMsg := ApplyMsg{ 436 | CommandValid: true, 437 | CommandIndex: rf.lastApplied, 438 | Command: rf.logs[rf.lastApplied-rf.lastIncludeIndex-1].Cmd, 439 | } 440 | rf.applyChan <- applyMsg 441 | rf.commitIndex = rf.lastApplied 442 | } 443 | 444 | 445 | reply.Term = rf.currentTerm 446 | reply.Success = true 447 | reply.AppendErr = AppendErr_Nil 448 | reply.NotMatchIndex = -1 449 | rf.persist() 450 | rf.mu.Unlock() 451 | return 452 | } 453 | 454 | func (rf *Raft) InstallSnapshot(args *InstallSnapshotRequest, reply *InstallSnapshotResponse) { 455 | if rf.killed() { 456 | reply.Term = args.Term 457 | return 458 | } 459 | 460 | rf.mu.Lock() 461 | //fmt.Println(rf.me,"收到snapshot","来自",args.LeaderId,"自身lastinclueindex",rf.lastIncludeTerm, args.LastIncludeIndex) 462 | // defer rf.mu.Unlock() 463 | 464 | // 过期消息 465 | if args.Term < rf.currentTerm { 466 | reply.Term = rf.currentTerm 467 | //fmt.Println("installsnapshot消息过期") 468 | reply.Err = InstallSnapshotErr_ReqOutofDate 469 | rf.mu.Unlock() 470 | return 471 | } 472 | 473 | // 错误消息 474 | if args.LastIncludeIndex <= rf.lastIncludeIndex { 475 | reply.Term = rf.currentTerm 476 | //fmt.Println("installsnapsho消息lastIncludeIndex错误") 477 | reply.Err = InstallSnapshotErr_OldIndex 478 | rf.timer.Reset(rf.voteTimeout) 479 | rf.mu.Unlock() 480 | return 481 | } 482 | // 创建快照 483 | rf.currentTerm = args.Term 484 | rf.voteFor = args.LeaderId 485 | rf.myStatus = Follower 486 | rf.timer.Reset(rf.voteTimeout) 487 | 488 | if len(rf.logs)+rf.lastIncludeIndex <= args.LastIncludeIndex { 489 | rf.logs = []LogEntry{} 490 | rf.lastIncludeIndex = args.LastIncludeIndex 491 | rf.lastIncludeTerm = args.LastIncludeTerm 492 | } else { 493 | // rf.logs = rf.logs[len(rf.logs)+rf.lastIncludeIndex-args.LastIncludeIndex:] 494 | rf.logs = rf.logs[args.LastIncludeIndex-rf.lastIncludeIndex:] 495 | rf.lastIncludeIndex = args.LastIncludeIndex 496 | rf.lastIncludeTerm = args.LastIncludeTerm 497 | } 498 | 499 | rf.applyChan <- ApplyMsg{ 500 | SnapshotValid: true, 501 | Snapshot: args.Data, 502 | SnapshotTerm: args.LastIncludeTerm, 503 | SnapshotIndex: args.LastIncludeIndex, 504 | } 505 | rf.lastApplied = args.LastIncludeIndex 506 | rf.commitIndex = rf.lastApplied 507 | 508 | reply.Term = rf.currentTerm 509 | reply.Err = InstallSnapshotErr_Nil 510 | rf.mu.Unlock() 511 | return 512 | } 513 | // 514 | // example code to send a RequestVote RPC to a server. 515 | // server is the index of the target server in rf.peers[]. 516 | // expects RPC arguments in args. 517 | // fills in *reply with RPC reply, so caller should 518 | // pass &reply. 519 | // the types of the args and reply passed to Call() must be 520 | // the same as the types of the arguments declared in the 521 | // handler function (including whether they are pointers). 522 | // 523 | // The labrpc package simulates a lossy network, in which servers 524 | // may be unreachable, and in which requests and replies may be lost. 525 | // Call() sends a request and waits for a reply. If a reply arrives 526 | // within a timeout interval, Call() returns true; otherwise 527 | // Call() returns false. Thus Call() may not return for a while. 528 | // A false return can be caused by a dead server, a live server that 529 | // can't be reached, a lost request, or a lost reply. 530 | // 531 | // Call() is guaranteed to return (perhaps after a delay) *except* if the 532 | // handler function on the server side does not return. 
Thus there 533 | // is no need to implement your own timeouts around Call(). 534 | // 535 | // look at the comments in ../labrpc/labrpc.go for more details. 536 | // 537 | // if you're having trouble getting RPC to work, check that you've 538 | // capitalized all field names in structs passed over RPC, and 539 | // that the caller passes the address of the reply struct with &, not 540 | // the struct itself. 541 | // 542 | // 改造函数,添加了一个参数,用于方便实现同一Term内请求的统计 543 | func (rf *Raft) sendRequestVote(server int, args *RequestVoteArgs, reply *RequestVoteReply, voteNum *int) bool { 544 | if rf.killed() { 545 | return false 546 | } 547 | ok := rf.peers[server].Call("Raft.RequestVote", args, reply) 548 | for !ok { 549 | // 失败重传 550 | if rf.killed() { 551 | return false 552 | } 553 | ok := rf.peers[server].Call("Raft.RequestVote", args, reply) 554 | if ok { 555 | break 556 | } 557 | } 558 | 559 | if rf.killed() { 560 | return false 561 | } 562 | rf.mu.Lock() 563 | if args.Term < rf.currentTerm { // 过期请求 564 | rf.mu.Unlock() 565 | return false 566 | } 567 | rf.mu.Unlock() 568 | 569 | switch reply.VoteErr { 570 | case VoteReqOutofDate: 571 | rf.mu.Lock() 572 | rf.myStatus = Follower 573 | rf.timer.Reset(rf.voteTimeout) 574 | if reply.Term > rf.currentTerm { 575 | rf.currentTerm = reply.Term 576 | rf.voteFor = -1 577 | rf.persist() 578 | } 579 | rf.mu.Unlock() 580 | case CandidateLogTooOld: 581 | // 日志不够新 582 | rf.mu.Lock() 583 | rf.myStatus = Follower 584 | rf.timer.Reset(rf.voteTimeout) 585 | if reply.Term > rf.currentTerm { 586 | rf.currentTerm = reply.Term 587 | rf.voteFor = -1 588 | rf.persist() 589 | } 590 | rf.mu.Unlock() 591 | case Nil,VotedThisTerm: 592 | rf.mu.Lock() 593 | //根据是否同意投票,收集选票数量 594 | if reply.VoteGranted && reply.Term == rf.currentTerm && *voteNum <= len(rf.peers)/2 { 595 | *voteNum++ 596 | } 597 | if *voteNum > len(rf.peers)/2 { 598 | *voteNum = 0 599 | if rf.myStatus == Leader { 600 | rf.mu.Unlock() 601 | return ok 602 | } 603 | rf.myStatus = Leader 604 | rf.nextIndexs = make([]int, len(rf.peers)) 605 | for i,_ := range rf.nextIndexs { 606 | rf.nextIndexs[i] = len(rf.logs)+rf.lastIncludeIndex+1 607 | } 608 | rf.timer.Reset(HeartBeatTimeout) 609 | } 610 | rf.mu.Unlock() 611 | case RaftKilled: 612 | return false 613 | } 614 | return ok 615 | } 616 | 617 | func (rf *Raft) sendAppendEntries(server int, args *AppendEntriesArgs, reply *AppendEntriesReply, appendNum *int) bool { 618 | if rf.killed() { 619 | return false 620 | } 621 | ok := rf.peers[server].Call("Raft.AppendEntries", args, reply) 622 | for !ok { 623 | if rf.killed() { 624 | return false 625 | } 626 | ok = rf.peers[server].Call("Raft.AppendEntries", args, reply) 627 | if ok { 628 | break 629 | } 630 | } 631 | 632 | if rf.killed() { 633 | return false 634 | } 635 | rf.mu.Lock() 636 | if args.Term < rf.currentTerm { // 过期消息 637 | rf.mu.Unlock() 638 | return false 639 | } 640 | 641 | switch reply.AppendErr { 642 | case AppendErr_Nil: 643 | if reply.Success && reply.Term == rf.currentTerm && *appendNum <= len(rf.peers)/2 { 644 | *appendNum++ 645 | } 646 | if rf.nextIndexs[server] > args.LogIndex+1 { 647 | rf.mu.Unlock() 648 | return ok 649 | } 650 | rf.nextIndexs[server] = args.LogIndex+1 651 | if *appendNum > len(rf.peers)/2 { 652 | *appendNum = 0 653 | if (args.LogIndex>rf.lastIncludeIndex && rf.logs[args.LogIndex-rf.lastIncludeIndex-1].Term != rf.currentTerm) || 654 | (args.LogIndex == rf.lastIncludeIndex && rf.lastIncludeTerm != rf.currentTerm){ 655 | rf.mu.Unlock() 656 | return false 657 | } 658 | for rf.lastApplied < 
args.LogIndex { 659 | rf.lastApplied++ 660 | applyMsg := ApplyMsg{ 661 | CommandValid: true, 662 | Command: rf.logs[rf.lastApplied-rf.lastIncludeIndex-1].Cmd, 663 | CommandIndex: rf.lastApplied, 664 | } 665 | rf.applyChan <- applyMsg 666 | rf.commitIndex = rf.lastApplied 667 | } 668 | } 669 | case AppendErr_ReqOutofDate: 670 | rf.myStatus = Follower 671 | rf.timer.Reset(rf.voteTimeout) 672 | if reply.Term > rf.currentTerm { 673 | rf.currentTerm = reply.Term 674 | rf.voteFor = -1 675 | rf.persist() 676 | } 677 | case AppendErr_LogsNotMatch: 678 | if args.Term != rf.currentTerm { 679 | rf.mu.Unlock() 680 | return false 681 | } 682 | rf.nextIndexs[server] = reply.NotMatchIndex 683 | case AppendErr_ReqRepeat: 684 | if reply.Term > rf.currentTerm { 685 | rf.myStatus = Follower 686 | rf.currentTerm = reply.Term 687 | rf.voteFor = -1 688 | rf.timer.Reset(rf.voteTimeout) 689 | rf.persist() 690 | } 691 | case AppendErr_Commited: 692 | if args.Term != rf.currentTerm { 693 | rf.mu.Unlock() 694 | return false 695 | } 696 | rf.nextIndexs[server] = reply.NotMatchIndex 697 | case AppendErr_RaftKilled: 698 | rf.mu.Unlock() 699 | return false 700 | } 701 | rf.mu.Unlock() 702 | return ok 703 | } 704 | 705 | func (rf *Raft) sendInstallSnapshot(server int, args *InstallSnapshotRequest, reply *InstallSnapshotResponse) bool { 706 | if rf.killed() { 707 | return false 708 | } 709 | ok := rf.peers[server].Call("Raft.InstallSnapshot", args, reply) 710 | for !ok { 711 | if rf.killed() { 712 | return false 713 | } 714 | ok = rf.peers[server].Call("Raft.InstallSnapshot", args, reply) 715 | if ok { 716 | break 717 | } 718 | } 719 | 720 | if rf.killed() { 721 | return false 722 | } 723 | rf.mu.Lock() 724 | if reply.Term < rf.currentTerm { 725 | rf.mu.Unlock() 726 | return false 727 | } 728 | switch reply.Err { 729 | case InstallSnapshotErr_Nil: 730 | if reply.Term > rf.currentTerm { 731 | rf.myStatus = Follower 732 | rf.currentTerm = reply.Term 733 | rf.voteFor = -1 734 | rf.timer.Reset(rf.voteTimeout) 735 | rf.persist() 736 | } 737 | rf.nextIndexs[server] = args.LastIncludeIndex+1 738 | case InstallSnapshotErr_OldIndex: 739 | if reply.Term > rf.currentTerm { 740 | rf.myStatus = Follower 741 | rf.currentTerm = reply.Term 742 | rf.voteFor = -1 743 | rf.timer.Reset(rf.voteTimeout) 744 | rf.persist() 745 | } 746 | rf.nextIndexs[server] = len(rf.logs)+rf.lastIncludeIndex+1 747 | case InstallSnapshotErr_ReqOutofDate: 748 | } 749 | 750 | 751 | rf.mu.Unlock() 752 | return false 753 | } 754 | // 755 | // the service using Raft (e.g. a k/v server) wants to start 756 | // agreement on the next command to be appended to Raft's log. if this 757 | // server isn't the leader, returns false. otherwise start the 758 | // agreement and return immediately. there is no guarantee that this 759 | // command will ever be committed to the Raft log, since the leader 760 | // may fail or lose an election. even if the Raft instance has been killed, 761 | // this function should return gracefully. 762 | // 763 | // the first return value is the index that the command will appear at 764 | // if it's ever committed. the second return value is the current 765 | // term. the third return value is true if this server believes it is 766 | // the leader. 767 | // 768 | func (rf *Raft) Start(command interface{}) (int, int, bool) { 769 | index := -1 770 | term := -1 771 | isLeader := true 772 | 773 | // Your code here (2B). 
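// Only the leader accepts the command: it is appended to the local log below and
// replicated to followers by the next heartbeat round in ticker().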
774 | // 客户端的log 775 | if rf.killed() { 776 | return index, term, false 777 | } 778 | rf.mu.Lock() 779 | isLeader = rf.myStatus == Leader 780 | if !isLeader { 781 | rf.mu.Unlock() 782 | return index, term, isLeader 783 | } 784 | logEntry := LogEntry{Term: rf.currentTerm, Cmd: command} 785 | rf.logs = append(rf.logs, logEntry) 786 | 787 | index = len(rf.logs)+rf.lastIncludeIndex 788 | term = rf.currentTerm 789 | rf.persist() 790 | rf.mu.Unlock() 791 | 792 | return index, term, isLeader 793 | } 794 | 795 | // 796 | // the tester doesn't halt goroutines created by Raft after each test, 797 | // but it does call the Kill() method. your code can use killed() to 798 | // check whether Kill() has been called. the use of atomic avoids the 799 | // need for a lock. 800 | // 801 | // the issue is that long-running goroutines use memory and may chew 802 | // up CPU time, perhaps causing later tests to fail and generating 803 | // confusing debug output. any goroutine with a long-running loop 804 | // should call killed() to check whether it should stop. 805 | // 806 | func (rf *Raft) Kill() { 807 | atomic.StoreInt32(&rf.dead, 1) 808 | // Your code here, if desired. 809 | rf.mu.Lock() 810 | rf.timer.Stop() 811 | rf.mu.Unlock() 812 | } 813 | 814 | func (rf *Raft) killed() bool { 815 | z := atomic.LoadInt32(&rf.dead) 816 | return z == 1 817 | } 818 | 819 | // The ticker go routine starts a new election if this peer hasn't received 820 | // heartsbeats recently. 821 | func (rf *Raft) ticker() { 822 | for rf.killed() == false { 823 | 824 | // Your code here to check if a leader election should 825 | // be started and to randomize sleeping time using 826 | // time.Sleep(). 827 | select { 828 | case <-rf.timer.C: 829 | if rf.killed() { 830 | return 831 | } 832 | rf.mu.Lock() 833 | currStatus := rf.myStatus 834 | switch currStatus { 835 | case Follower: 836 | rf.myStatus = Candidate 837 | fallthrough 838 | case Candidate: 839 | // 进行选举 840 | rf.currentTerm+=1 841 | rf.voteFor = rf.me 842 | // 每轮选举开始时,重新设置选举超时 843 | rf.voteTimeout = time.Duration(rand.Intn(150)+200)*time.Millisecond 844 | voteNum := 1 845 | rf.persist() 846 | rf.timer.Reset(rf.voteTimeout) 847 | // 构造msg 848 | for i,_ := range rf.peers { 849 | if i == rf.me { 850 | continue 851 | } 852 | voteArgs := &RequestVoteArgs{ 853 | Term: rf.currentTerm, 854 | Candidate: rf.me, 855 | LastLogIndex: len(rf.logs)+rf.lastIncludeIndex, 856 | LastLogTerm: rf.lastIncludeTerm, 857 | } 858 | if len(rf.logs) > 0 { 859 | voteArgs.LastLogTerm = rf.logs[len(rf.logs)-1].Term 860 | } 861 | voteReply := new(RequestVoteReply) 862 | //fmt.Println("发起选举",rf.me,i,voteArgs,rf.currentTerm, rf.lastIncludeIndex, rf.lastIncludeTerm) 863 | go rf.sendRequestVote(i, voteArgs, voteReply, &voteNum) 864 | } 865 | case Leader: 866 | // 进行心跳 867 | appendNum := 1 868 | rf.timer.Reset(HeartBeatTimeout) 869 | // 构造msg 870 | for i,_ := range rf.peers { 871 | if i == rf.me { 872 | continue 873 | } 874 | appendEntriesArgs := &AppendEntriesArgs{ 875 | Term: rf.currentTerm, 876 | LeaderId: rf.me, 877 | PrevLogIndex: 0, 878 | PrevLogTerm: 0, 879 | Logs: nil, 880 | LeaderCommit: rf.commitIndex, 881 | LogIndex: len(rf.logs)+rf.lastIncludeIndex, 882 | } 883 | //installSnapshot,如果rf.nextIndex[i]小于等lastCludeIndex,则发送snapShot 884 | if rf.nextIndexs[i] <= rf.lastIncludeIndex { 885 | installSnapshotReq := &InstallSnapshotRequest{ 886 | Term: rf.currentTerm, 887 | LeaderId: rf.me, 888 | LastIncludeIndex: rf.lastIncludeIndex, 889 | LastIncludeTerm: rf.lastIncludeTerm, 890 | Data: rf.snapshotCmd, 891 | } 
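// Peer i has fallen behind the snapshot boundary (nextIndexs[i] <= lastIncludeIndex),
// so send the whole snapshot via InstallSnapshot instead of log entries.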
892 | installSnapshotReply := &InstallSnapshotResponse{} 893 | //fmt.Println("installsnapshot", rf.me, i, rf.lastIncludeIndex, rf.lastIncludeTerm, rf.currentTerm, installSnapshotReq) 894 | go rf.sendInstallSnapshot(i, installSnapshotReq, installSnapshotReply) 895 | continue 896 | } 897 | for rf.nextIndexs[i] > rf.lastIncludeIndex { 898 | appendEntriesArgs.PrevLogIndex = rf.nextIndexs[i]-1 899 | if appendEntriesArgs.PrevLogIndex >= len(rf.logs)+rf.lastIncludeIndex+1 { 900 | rf.nextIndexs[i]-- 901 | continue 902 | } 903 | if appendEntriesArgs.PrevLogIndex == rf.lastIncludeIndex { 904 | appendEntriesArgs.PrevLogTerm = rf.lastIncludeTerm 905 | } else { 906 | appendEntriesArgs.PrevLogTerm = rf.logs[appendEntriesArgs.PrevLogIndex-rf.lastIncludeIndex-1].Term 907 | } 908 | break 909 | } 910 | if rf.nextIndexs[i] < len(rf.logs)+rf.lastIncludeIndex+1 { 911 | appendEntriesArgs.Logs = make([]LogEntry,appendEntriesArgs.LogIndex+1-rf.nextIndexs[i]) 912 | copy(appendEntriesArgs.Logs, rf.logs[rf.nextIndexs[i]-rf.lastIncludeIndex-1:appendEntriesArgs.LogIndex-rf.lastIncludeIndex]) 913 | } 914 | 915 | appendEntriesReply := new(AppendEntriesReply) 916 | go rf.sendAppendEntries(i, appendEntriesArgs, appendEntriesReply, &appendNum) 917 | } 918 | } 919 | rf.mu.Unlock() 920 | } 921 | } 922 | } 923 | 924 | // 925 | // the service or tester wants to create a Raft server. the ports 926 | // of all the Raft servers (including this one) are in peers[]. this 927 | // server's port is peers[me]. all the servers' peers[] arrays 928 | // have the same order. persister is a place for this server to 929 | // save its persistent state, and also initially holds the most 930 | // recent saved state, if any. applyCh is a channel on which the 931 | // tester or service expects Raft to send ApplyMsg messages. 932 | // Make() must return quickly, so it should start goroutines 933 | // for any long-running work. 934 | // 935 | func Make(peers []*labrpc.ClientEnd, me int, 936 | persister *Persister, applyCh chan ApplyMsg) *Raft { 937 | rf := &Raft{} 938 | rf.peers = peers 939 | rf.persister = persister 940 | rf.me = me 941 | 942 | // Your initialization code here (2A, 2B, 2C). 943 | rf.myStatus = Follower 944 | rf.voteFor = -1 945 | rand.Seed(time.Now().UnixNano()) 946 | rf.voteTimeout = time.Duration(rand.Intn(150)+200)*time.Millisecond 947 | rf.currentTerm, rf.commitIndex, rf.lastApplied = 0,0,0 948 | rf.nextIndexs, rf.matchIndexs, rf.logs = nil, nil, []LogEntry{{0,nil}} 949 | rf.timer = time.NewTicker(rf.voteTimeout) 950 | rf.applyChan = applyCh 951 | 952 | // 2D 953 | rf.lastIncludeIndex = -1 954 | rf.lastIncludeTerm = 0 955 | rf.snapshotCmd = make([]byte, 0) 956 | // initialize from state persisted before a crash 957 | rf.readPersist(persister.ReadRaftState()) 958 | 959 | // start ticker goroutine to start elections 960 | go rf.ticker() 961 | 962 | 963 | return rf 964 | } 965 | -------------------------------------------------------------------------------- /test_test.go: -------------------------------------------------------------------------------- 1 | package raft 2 | 3 | // 4 | // Raft tests. 5 | // 6 | // we will use the original test_test.go to test your code for grading. 7 | // so, while you can modify this code to help you debug, please 8 | // test with the original before submitting. 
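//
// run one group of tests at a time, e.g.:
//
//	go test -run 2A -race
//
// (see README.md for the per-part commands)
//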
9 | // 10 | 11 | import "testing" 12 | import "fmt" 13 | import "time" 14 | import "math/rand" 15 | import "sync/atomic" 16 | import "sync" 17 | 18 | // The tester generously allows solutions to complete elections in one second 19 | // (much more than the paper's range of timeouts). 20 | const RaftElectionTimeout = 1000 * time.Millisecond 21 | 22 | func TestInitialElection2A(t *testing.T) { 23 | servers := 3 24 | cfg := make_config(t, servers, false, false) 25 | defer cfg.cleanup() 26 | 27 | cfg.begin("Test (2A): initial election") 28 | 29 | // is a leader elected? 30 | cfg.checkOneLeader() 31 | 32 | // sleep a bit to avoid racing with followers learning of the 33 | // election, then check that all peers agree on the term. 34 | time.Sleep(50 * time.Millisecond) 35 | term1 := cfg.checkTerms() 36 | if term1 < 1 { 37 | t.Fatalf("term is %v, but should be at least 1", term1) 38 | } 39 | 40 | // does the leader+term stay the same if there is no network failure? 41 | time.Sleep(2 * RaftElectionTimeout) 42 | term2 := cfg.checkTerms() 43 | if term1 != term2 { 44 | fmt.Printf("warning: term changed even though there were no failures") 45 | } 46 | 47 | // there should still be a leader. 48 | cfg.checkOneLeader() 49 | 50 | cfg.end() 51 | } 52 | 53 | func TestReElection2A(t *testing.T) { 54 | servers := 3 55 | cfg := make_config(t, servers, false, false) 56 | defer cfg.cleanup() 57 | 58 | cfg.begin("Test (2A): election after network failure") 59 | 60 | leader1 := cfg.checkOneLeader() 61 | 62 | // if the leader disconnects, a new one should be elected. 63 | cfg.disconnect(leader1) 64 | cfg.checkOneLeader() 65 | 66 | // if the old leader rejoins, that shouldn't 67 | // disturb the new leader. 68 | cfg.connect(leader1) 69 | leader2 := cfg.checkOneLeader() 70 | 71 | // if there's no quorum, no leader should 72 | // be elected. 73 | cfg.disconnect(leader2) 74 | cfg.disconnect((leader2 + 1) % servers) 75 | time.Sleep(2 * RaftElectionTimeout) 76 | cfg.checkNoLeader() 77 | 78 | // if a quorum arises, it should elect a leader. 79 | cfg.connect((leader2 + 1) % servers) 80 | cfg.checkOneLeader() 81 | 82 | // re-join of last node shouldn't prevent leader from existing. 83 | cfg.connect(leader2) 84 | cfg.checkOneLeader() 85 | 86 | cfg.end() 87 | } 88 | 89 | func TestManyElections2A(t *testing.T) { 90 | servers := 7 91 | cfg := make_config(t, servers, false, false) 92 | defer cfg.cleanup() 93 | 94 | cfg.begin("Test (2A): multiple elections") 95 | 96 | cfg.checkOneLeader() 97 | 98 | iters := 10 99 | for ii := 1; ii < iters; ii++ { 100 | // disconnect three nodes 101 | i1 := rand.Int() % servers 102 | i2 := rand.Int() % servers 103 | i3 := rand.Int() % servers 104 | cfg.disconnect(i1) 105 | cfg.disconnect(i2) 106 | cfg.disconnect(i3) 107 | 108 | // either the current leader should still be alive, 109 | // or the remaining four should elect a new one. 
110 | cfg.checkOneLeader() 111 | 112 | cfg.connect(i1) 113 | cfg.connect(i2) 114 | cfg.connect(i3) 115 | } 116 | 117 | cfg.checkOneLeader() 118 | 119 | cfg.end() 120 | } 121 | 122 | func TestBasicAgree2B(t *testing.T) { 123 | servers := 3 124 | cfg := make_config(t, servers, false, false) 125 | defer cfg.cleanup() 126 | 127 | cfg.begin("Test (2B): basic agreement") 128 | 129 | iters := 3 130 | for index := 1; index < iters+1; index++ { 131 | nd, _ := cfg.nCommitted(index) 132 | if nd > 0 { 133 | t.Fatalf("some have committed before Start()") 134 | } 135 | 136 | xindex := cfg.one(index*100, servers, false) 137 | if xindex != index { 138 | t.Fatalf("got index %v but expected %v", xindex, index) 139 | } 140 | } 141 | 142 | cfg.end() 143 | } 144 | 145 | // 146 | // check, based on counting bytes of RPCs, that 147 | // each command is sent to each peer just once. 148 | // 149 | func TestRPCBytes2B(t *testing.T) { 150 | servers := 3 151 | cfg := make_config(t, servers, false, false) 152 | defer cfg.cleanup() 153 | 154 | cfg.begin("Test (2B): RPC byte count") 155 | 156 | cfg.one(99, servers, false) 157 | bytes0 := cfg.bytesTotal() 158 | 159 | iters := 10 160 | var sent int64 = 0 161 | for index := 2; index < iters+2; index++ { 162 | cmd := randstring(5000) 163 | xindex := cfg.one(cmd, servers, false) 164 | if xindex != index { 165 | t.Fatalf("got index %v but expected %v", xindex, index) 166 | } 167 | sent += int64(len(cmd)) 168 | } 169 | 170 | bytes1 := cfg.bytesTotal() 171 | got := bytes1 - bytes0 172 | expected := int64(servers) * sent 173 | if got > expected+50000 { 174 | t.Fatalf("too many RPC bytes; got %v, expected %v", got, expected) 175 | } 176 | 177 | cfg.end() 178 | } 179 | 180 | func TestFailAgree2B(t *testing.T) { 181 | servers := 3 182 | cfg := make_config(t, servers, false, false) 183 | defer cfg.cleanup() 184 | 185 | cfg.begin("Test (2B): agreement despite follower disconnection") 186 | 187 | cfg.one(101, servers, false) 188 | 189 | // disconnect one follower from the network. 190 | leader := cfg.checkOneLeader() 191 | cfg.disconnect((leader + 1) % servers) 192 | 193 | // the leader and remaining follower should be 194 | // able to agree despite the disconnected follower. 195 | cfg.one(102, servers-1, false) 196 | cfg.one(103, servers-1, false) 197 | time.Sleep(RaftElectionTimeout) 198 | cfg.one(104, servers-1, false) 199 | cfg.one(105, servers-1, false) 200 | 201 | // re-connect 202 | cfg.connect((leader + 1) % servers) 203 | 204 | // the full set of servers should preserve 205 | // previous agreements, and be able to agree 206 | // on new commands. 
207 | cfg.one(106, servers, true) 208 | time.Sleep(RaftElectionTimeout) 209 | cfg.one(107, servers, true) 210 | 211 | cfg.end() 212 | } 213 | 214 | func TestFailNoAgree2B(t *testing.T) { 215 | servers := 5 216 | cfg := make_config(t, servers, false, false) 217 | defer cfg.cleanup() 218 | 219 | cfg.begin("Test (2B): no agreement if too many followers disconnect") 220 | 221 | cfg.one(10, servers, false) 222 | 223 | // 3 of 5 followers disconnect 224 | leader := cfg.checkOneLeader() 225 | cfg.disconnect((leader + 1) % servers) 226 | cfg.disconnect((leader + 2) % servers) 227 | cfg.disconnect((leader + 3) % servers) 228 | 229 | index, _, ok := cfg.rafts[leader].Start(20) 230 | if ok != true { 231 | t.Fatalf("leader rejected Start()") 232 | } 233 | if index != 2 { 234 | t.Fatalf("expected index 2, got %v", index) 235 | } 236 | 237 | time.Sleep(2 * RaftElectionTimeout) 238 | 239 | n, _ := cfg.nCommitted(index) 240 | if n > 0 { 241 | t.Fatalf("%v committed but no majority", n) 242 | } 243 | 244 | // repair 245 | cfg.connect((leader + 1) % servers) 246 | cfg.connect((leader + 2) % servers) 247 | cfg.connect((leader + 3) % servers) 248 | 249 | // the disconnected majority may have chosen a leader from 250 | // among their own ranks, forgetting index 2. 251 | leader2 := cfg.checkOneLeader() 252 | index2, _, ok2 := cfg.rafts[leader2].Start(30) 253 | if ok2 == false { 254 | t.Fatalf("leader2 rejected Start()") 255 | } 256 | if index2 < 2 || index2 > 3 { 257 | t.Fatalf("unexpected index %v", index2) 258 | } 259 | 260 | cfg.one(1000, servers, true) 261 | 262 | cfg.end() 263 | } 264 | 265 | func TestConcurrentStarts2B(t *testing.T) { 266 | servers := 3 267 | cfg := make_config(t, servers, false, false) 268 | defer cfg.cleanup() 269 | 270 | cfg.begin("Test (2B): concurrent Start()s") 271 | 272 | var success bool 273 | loop: 274 | for try := 0; try < 5; try++ { 275 | if try > 0 { 276 | // give solution some time to settle 277 | time.Sleep(3 * time.Second) 278 | } 279 | 280 | leader := cfg.checkOneLeader() 281 | _, term, ok := cfg.rafts[leader].Start(1) 282 | if !ok { 283 | // leader moved on really quickly 284 | continue 285 | } 286 | 287 | iters := 5 288 | var wg sync.WaitGroup 289 | is := make(chan int, iters) 290 | for ii := 0; ii < iters; ii++ { 291 | wg.Add(1) 292 | go func(i int) { 293 | defer wg.Done() 294 | i, term1, ok := cfg.rafts[leader].Start(100 + i) 295 | if term1 != term { 296 | return 297 | } 298 | if ok != true { 299 | return 300 | } 301 | is <- i 302 | }(ii) 303 | } 304 | 305 | wg.Wait() 306 | close(is) 307 | 308 | for j := 0; j < servers; j++ { 309 | if t, _ := cfg.rafts[j].GetState(); t != term { 310 | // term changed -- can't expect low RPC counts 311 | continue loop 312 | } 313 | } 314 | 315 | failed := false 316 | cmds := []int{} 317 | for index := range is { 318 | cmd := cfg.wait(index, servers, term) 319 | if ix, ok := cmd.(int); ok { 320 | if ix == -1 { 321 | // peers have moved on to later terms 322 | // so we can't expect all Start()s to 323 | // have succeeded 324 | failed = true 325 | break 326 | } 327 | cmds = append(cmds, ix) 328 | } else { 329 | t.Fatalf("value %v is not an int", cmd) 330 | } 331 | } 332 | 333 | if failed { 334 | // avoid leaking goroutines 335 | go func() { 336 | for range is { 337 | } 338 | }() 339 | continue 340 | } 341 | 342 | for ii := 0; ii < iters; ii++ { 343 | x := 100 + ii 344 | ok := false 345 | for j := 0; j < len(cmds); j++ { 346 | if cmds[j] == x { 347 | ok = true 348 | } 349 | } 350 | if ok == false { 351 | t.Fatalf("cmd %v missing in %v", x, 
cmds) 352 | } 353 | } 354 | 355 | success = true 356 | break 357 | } 358 | 359 | if !success { 360 | t.Fatalf("term changed too often") 361 | } 362 | 363 | cfg.end() 364 | } 365 | 366 | func TestRejoin2B(t *testing.T) { 367 | servers := 3 368 | cfg := make_config(t, servers, false, false) 369 | defer cfg.cleanup() 370 | 371 | cfg.begin("Test (2B): rejoin of partitioned leader") 372 | 373 | cfg.one(101, servers, true) 374 | 375 | // leader network failure 376 | leader1 := cfg.checkOneLeader() 377 | cfg.disconnect(leader1) 378 | 379 | // make old leader try to agree on some entries 380 | cfg.rafts[leader1].Start(102) 381 | cfg.rafts[leader1].Start(103) 382 | cfg.rafts[leader1].Start(104) 383 | 384 | // new leader commits, also for index=2 385 | cfg.one(103, 2, true) 386 | 387 | // new leader network failure 388 | leader2 := cfg.checkOneLeader() 389 | cfg.disconnect(leader2) 390 | 391 | // old leader connected again 392 | cfg.connect(leader1) 393 | 394 | cfg.one(104, 2, true) 395 | 396 | // all together now 397 | cfg.connect(leader2) 398 | 399 | cfg.one(105, servers, true) 400 | 401 | cfg.end() 402 | } 403 | 404 | func TestBackup2B(t *testing.T) { 405 | servers := 5 406 | cfg := make_config(t, servers, false, false) 407 | defer cfg.cleanup() 408 | 409 | cfg.begin("Test (2B): leader backs up quickly over incorrect follower logs") 410 | 411 | cfg.one(rand.Int(), servers, true) 412 | 413 | // put leader and one follower in a partition 414 | leader1 := cfg.checkOneLeader() 415 | cfg.disconnect((leader1 + 2) % servers) 416 | cfg.disconnect((leader1 + 3) % servers) 417 | cfg.disconnect((leader1 + 4) % servers) 418 | 419 | // submit lots of commands that won't commit 420 | for i := 0; i < 50; i++ { 421 | cfg.rafts[leader1].Start(rand.Int()) 422 | } 423 | 424 | time.Sleep(RaftElectionTimeout / 2) 425 | 426 | cfg.disconnect((leader1 + 0) % servers) 427 | cfg.disconnect((leader1 + 1) % servers) 428 | 429 | // allow other partition to recover 430 | cfg.connect((leader1 + 2) % servers) 431 | cfg.connect((leader1 + 3) % servers) 432 | cfg.connect((leader1 + 4) % servers) 433 | 434 | // lots of successful commands to new group. 435 | for i := 0; i < 50; i++ { 436 | cfg.one(rand.Int(), 3, true) 437 | } 438 | 439 | // now another partitioned leader and one follower 440 | leader2 := cfg.checkOneLeader() 441 | other := (leader1 + 2) % servers 442 | if leader2 == other { 443 | other = (leader2 + 1) % servers 444 | } 445 | cfg.disconnect(other) 446 | 447 | // lots more commands that won't commit 448 | for i := 0; i < 50; i++ { 449 | cfg.rafts[leader2].Start(rand.Int()) 450 | } 451 | 452 | time.Sleep(RaftElectionTimeout / 2) 453 | 454 | // bring original leader back to life, 455 | for i := 0; i < servers; i++ { 456 | cfg.disconnect(i) 457 | } 458 | cfg.connect((leader1 + 0) % servers) 459 | cfg.connect((leader1 + 1) % servers) 460 | cfg.connect(other) 461 | 462 | // lots of successful commands to new group. 
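// (of the three connected servers, only other holds the entries committed by the second partition, so it must win the election; leader1 and its follower then have to discard their uncommitted entries: the fast roll-back this test is timing.)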
463 | for i := 0; i < 50; i++ { 464 | cfg.one(rand.Int(), 3, true) 465 | } 466 | 467 | // now everyone 468 | for i := 0; i < servers; i++ { 469 | cfg.connect(i) 470 | } 471 | cfg.one(rand.Int(), servers, true) 472 | 473 | cfg.end() 474 | } 475 | 476 | func TestCount2B(t *testing.T) { 477 | servers := 3 478 | cfg := make_config(t, servers, false, false) 479 | defer cfg.cleanup() 480 | 481 | cfg.begin("Test (2B): RPC counts aren't too high") 482 | 483 | rpcs := func() (n int) { 484 | for j := 0; j < servers; j++ { 485 | n += cfg.rpcCount(j) 486 | } 487 | return 488 | } 489 | 490 | leader := cfg.checkOneLeader() 491 | 492 | total1 := rpcs() 493 | 494 | if total1 > 30 || total1 < 1 { 495 | t.Fatalf("too many or few RPCs (%v) to elect initial leader\n", total1) 496 | } 497 | 498 | var total2 int 499 | var success bool 500 | loop: 501 | for try := 0; try < 5; try++ { 502 | if try > 0 { 503 | // give solution some time to settle 504 | time.Sleep(3 * time.Second) 505 | } 506 | 507 | leader = cfg.checkOneLeader() 508 | total1 = rpcs() 509 | 510 | iters := 10 511 | starti, term, ok := cfg.rafts[leader].Start(1) 512 | if !ok { 513 | // leader moved on really quickly 514 | continue 515 | } 516 | cmds := []int{} 517 | for i := 1; i < iters+2; i++ { 518 | x := int(rand.Int31()) 519 | cmds = append(cmds, x) 520 | index1, term1, ok := cfg.rafts[leader].Start(x) 521 | if term1 != term { 522 | // Term changed while starting 523 | continue loop 524 | } 525 | if !ok { 526 | // No longer the leader, so term has changed 527 | continue loop 528 | } 529 | if starti+i != index1 { 530 | t.Fatalf("Start() failed") 531 | } 532 | } 533 | 534 | for i := 1; i < iters+1; i++ { 535 | cmd := cfg.wait(starti+i, servers, term) 536 | if ix, ok := cmd.(int); ok == false || ix != cmds[i-1] { 537 | if ix == -1 { 538 | // term changed -- try again 539 | continue loop 540 | } 541 | t.Fatalf("wrong value %v committed for index %v; expected %v\n", cmd, starti+i, cmds) 542 | } 543 | } 544 | 545 | failed := false 546 | total2 = 0 547 | for j := 0; j < servers; j++ { 548 | if t, _ := cfg.rafts[j].GetState(); t != term { 549 | // term changed -- can't expect low RPC counts 550 | // need to keep going to update total2 551 | failed = true 552 | } 553 | total2 += cfg.rpcCount(j) 554 | } 555 | 556 | if failed { 557 | continue loop 558 | } 559 | 560 | if total2-total1 > (iters+1+3)*3 { 561 | t.Fatalf("too many RPCs (%v) for %v entries\n", total2-total1, iters) 562 | } 563 | 564 | success = true 565 | break 566 | } 567 | 568 | if !success { 569 | t.Fatalf("term changed too often") 570 | } 571 | 572 | time.Sleep(RaftElectionTimeout) 573 | 574 | total3 := 0 575 | for j := 0; j < servers; j++ { 576 | total3 += cfg.rpcCount(j) 577 | } 578 | 579 | if total3-total2 > 3*20 { 580 | t.Fatalf("too many RPCs (%v) for 1 second of idleness\n", total3-total2) 581 | } 582 | 583 | cfg.end() 584 | } 585 | 586 | func TestPersist12C(t *testing.T) { 587 | servers := 3 588 | cfg := make_config(t, servers, false, false) 589 | defer cfg.cleanup() 590 | 591 | cfg.begin("Test (2C): basic persistence") 592 | 593 | cfg.one(11, servers, true) 594 | 595 | // crash and re-start all 596 | for i := 0; i < servers; i++ { 597 | cfg.start1(i, cfg.applier) 598 | } 599 | for i := 0; i < servers; i++ { 600 | cfg.disconnect(i) 601 | cfg.connect(i) 602 | } 603 | 604 | cfg.one(12, servers, true) 605 | 606 | leader1 := cfg.checkOneLeader() 607 | cfg.disconnect(leader1) 608 | cfg.start1(leader1, cfg.applier) 609 | cfg.connect(leader1) 610 | 611 | cfg.one(13, servers, true) 612 | 613 | 
leader2 := cfg.checkOneLeader() 614 | cfg.disconnect(leader2) 615 | cfg.one(14, servers-1, true) 616 | cfg.start1(leader2, cfg.applier) 617 | cfg.connect(leader2) 618 | 619 | cfg.wait(4, servers, -1) // wait for leader2 to join before killing i3 620 | 621 | i3 := (cfg.checkOneLeader() + 1) % servers 622 | cfg.disconnect(i3) 623 | cfg.one(15, servers-1, true) 624 | cfg.start1(i3, cfg.applier) 625 | cfg.connect(i3) 626 | 627 | cfg.one(16, servers, true) 628 | 629 | cfg.end() 630 | } 631 | 632 | func TestPersist22C(t *testing.T) { 633 | servers := 5 634 | cfg := make_config(t, servers, false, false) 635 | defer cfg.cleanup() 636 | 637 | cfg.begin("Test (2C): more persistence") 638 | 639 | index := 1 640 | for iters := 0; iters < 5; iters++ { 641 | cfg.one(10+index, servers, true) 642 | index++ 643 | 644 | leader1 := cfg.checkOneLeader() 645 | 646 | cfg.disconnect((leader1 + 1) % servers) 647 | cfg.disconnect((leader1 + 2) % servers) 648 | 649 | cfg.one(10+index, servers-2, true) 650 | index++ 651 | 652 | cfg.disconnect((leader1 + 0) % servers) 653 | cfg.disconnect((leader1 + 3) % servers) 654 | cfg.disconnect((leader1 + 4) % servers) 655 | 656 | cfg.start1((leader1+1)%servers, cfg.applier) 657 | cfg.start1((leader1+2)%servers, cfg.applier) 658 | cfg.connect((leader1 + 1) % servers) 659 | cfg.connect((leader1 + 2) % servers) 660 | 661 | time.Sleep(RaftElectionTimeout) 662 | 663 | cfg.start1((leader1+3)%servers, cfg.applier) 664 | cfg.connect((leader1 + 3) % servers) 665 | 666 | cfg.one(10+index, servers-2, true) 667 | index++ 668 | 669 | cfg.connect((leader1 + 4) % servers) 670 | cfg.connect((leader1 + 0) % servers) 671 | } 672 | 673 | cfg.one(1000, servers, true) 674 | 675 | cfg.end() 676 | } 677 | 678 | func TestPersist32C(t *testing.T) { 679 | servers := 3 680 | cfg := make_config(t, servers, false, false) 681 | defer cfg.cleanup() 682 | 683 | cfg.begin("Test (2C): partitioned leader and one follower crash, leader restarts") 684 | 685 | cfg.one(101, 3, true) 686 | 687 | leader := cfg.checkOneLeader() 688 | cfg.disconnect((leader + 2) % servers) 689 | 690 | cfg.one(102, 2, true) 691 | 692 | cfg.crash1((leader + 0) % servers) 693 | cfg.crash1((leader + 1) % servers) 694 | cfg.connect((leader + 2) % servers) 695 | cfg.start1((leader+0)%servers, cfg.applier) 696 | cfg.connect((leader + 0) % servers) 697 | 698 | cfg.one(103, 2, true) 699 | 700 | cfg.start1((leader+1)%servers, cfg.applier) 701 | cfg.connect((leader + 1) % servers) 702 | 703 | cfg.one(104, servers, true) 704 | 705 | cfg.end() 706 | } 707 | 708 | // 709 | // Test the scenarios described in Figure 8 of the extended Raft paper. Each 710 | // iteration asks a leader, if there is one, to insert a command in the Raft 711 | // log. If there is a leader, that leader will fail quickly with a high 712 | // probability (perhaps without committing the command), or crash after a while 713 | // with low probability (most likely committing the command). If the number of 714 | // alive servers isn't enough to form a majority, perhaps start a new server. 715 | // The leader in a new term may try to finish replicating log entries that 716 | // haven't been committed yet.
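// At the end, every server is restarted and reconnected, and the test checks that one final command still commits on all five peers.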
717 | // 718 | func TestFigure82C(t *testing.T) { 719 | servers := 5 720 | cfg := make_config(t, servers, false, false) 721 | defer cfg.cleanup() 722 | 723 | cfg.begin("Test (2C): Figure 8") 724 | 725 | cfg.one(rand.Int(), 1, true) 726 | 727 | nup := servers 728 | for iters := 0; iters < 1000; iters++ { 729 | leader := -1 730 | for i := 0; i < servers; i++ { 731 | if cfg.rafts[i] != nil { 732 | _, _, ok := cfg.rafts[i].Start(rand.Int()) 733 | if ok { 734 | leader = i 735 | } 736 | } 737 | } 738 | 739 | if (rand.Int() % 1000) < 100 { 740 | ms := rand.Int63() % (int64(RaftElectionTimeout/time.Millisecond) / 2) 741 | time.Sleep(time.Duration(ms) * time.Millisecond) 742 | } else { 743 | ms := (rand.Int63() % 13) 744 | time.Sleep(time.Duration(ms) * time.Millisecond) 745 | } 746 | 747 | if leader != -1 { 748 | cfg.crash1(leader) 749 | nup -= 1 750 | } 751 | 752 | if nup < 3 { 753 | s := rand.Int() % servers 754 | if cfg.rafts[s] == nil { 755 | cfg.start1(s, cfg.applier) 756 | cfg.connect(s) 757 | nup += 1 758 | } 759 | } 760 | } 761 | 762 | for i := 0; i < servers; i++ { 763 | if cfg.rafts[i] == nil { 764 | cfg.start1(i, cfg.applier) 765 | cfg.connect(i) 766 | } 767 | } 768 | 769 | cfg.one(rand.Int(), servers, true) 770 | 771 | cfg.end() 772 | } 773 | 774 | func TestUnreliableAgree2C(t *testing.T) { 775 | servers := 5 776 | cfg := make_config(t, servers, true, false) 777 | defer cfg.cleanup() 778 | 779 | cfg.begin("Test (2C): unreliable agreement") 780 | 781 | var wg sync.WaitGroup 782 | 783 | for iters := 1; iters < 50; iters++ { 784 | for j := 0; j < 4; j++ { 785 | wg.Add(1) 786 | go func(iters, j int) { 787 | defer wg.Done() 788 | cfg.one((100*iters)+j, 1, true) 789 | }(iters, j) 790 | } 791 | cfg.one(iters, 1, true) 792 | } 793 | 794 | cfg.setunreliable(false) 795 | 796 | wg.Wait() 797 | 798 | cfg.one(100, servers, true) 799 | 800 | cfg.end() 801 | } 802 | 803 | func TestFigure8Unreliable2C(t *testing.T) { 804 | servers := 5 805 | cfg := make_config(t, servers, true, false) 806 | defer cfg.cleanup() 807 | 808 | cfg.begin("Test (2C): Figure 8 (unreliable)") 809 | 810 | cfg.one(rand.Int()%10000, 1, true) 811 | 812 | nup := servers 813 | for iters := 0; iters < 1000; iters++ { 814 | if iters == 200 { 815 | cfg.setlongreordering(true) 816 | } 817 | leader := -1 818 | for i := 0; i < servers; i++ { 819 | _, _, ok := cfg.rafts[i].Start(rand.Int() % 10000) 820 | if ok && cfg.connected[i] { 821 | leader = i 822 | } 823 | } 824 | 825 | if (rand.Int() % 1000) < 100 { 826 | ms := rand.Int63() % (int64(RaftElectionTimeout/time.Millisecond) / 2) 827 | time.Sleep(time.Duration(ms) * time.Millisecond) 828 | } else { 829 | ms := (rand.Int63() % 13) 830 | time.Sleep(time.Duration(ms) * time.Millisecond) 831 | } 832 | 833 | if leader != -1 && (rand.Int()%1000) < int(RaftElectionTimeout/time.Millisecond)/2 { 834 | cfg.disconnect(leader) 835 | nup -= 1 836 | } 837 | 838 | if nup < 3 { 839 | s := rand.Int() % servers 840 | if cfg.connected[s] == false { 841 | cfg.connect(s) 842 | nup += 1 843 | } 844 | } 845 | } 846 | 847 | for i := 0; i < servers; i++ { 848 | if cfg.connected[i] == false { 849 | cfg.connect(i) 850 | } 851 | } 852 | 853 | cfg.one(rand.Int()%10000, servers, true) 854 | 855 | cfg.end() 856 | } 857 | 858 | func internalChurn(t *testing.T, unreliable bool) { 859 | 860 | servers := 5 861 | cfg := make_config(t, servers, unreliable, false) 862 | defer cfg.cleanup() 863 | 864 | if unreliable { 865 | cfg.begin("Test (2C): unreliable churn") 866 | } else { 867 | cfg.begin("Test (2C): churn") 868 | } 869 
| 870 | stop := int32(0) 871 | 872 | // create concurrent clients 873 | cfn := func(me int, ch chan []int) { 874 | var ret []int 875 | ret = nil 876 | defer func() { ch <- ret }() 877 | values := []int{} 878 | for atomic.LoadInt32(&stop) == 0 { 879 | x := rand.Int() 880 | index := -1 881 | ok := false 882 | for i := 0; i < servers; i++ { 883 | // try them all, maybe one of them is a leader 884 | cfg.mu.Lock() 885 | rf := cfg.rafts[i] 886 | cfg.mu.Unlock() 887 | if rf != nil { 888 | index1, _, ok1 := rf.Start(x) 889 | if ok1 { 890 | ok = ok1 891 | index = index1 892 | } 893 | } 894 | } 895 | if ok { 896 | // maybe leader will commit our value, maybe not. 897 | // but don't wait forever. 898 | for _, to := range []int{10, 20, 50, 100, 200} { 899 | nd, cmd := cfg.nCommitted(index) 900 | if nd > 0 { 901 | if xx, ok := cmd.(int); ok { 902 | if xx == x { 903 | values = append(values, x) 904 | } 905 | } else { 906 | cfg.t.Fatalf("wrong command type") 907 | } 908 | break 909 | } 910 | time.Sleep(time.Duration(to) * time.Millisecond) 911 | } 912 | } else { 913 | time.Sleep(time.Duration(79+me*17) * time.Millisecond) 914 | } 915 | } 916 | ret = values 917 | } 918 | 919 | ncli := 3 920 | cha := []chan []int{} 921 | for i := 0; i < ncli; i++ { 922 | cha = append(cha, make(chan []int)) 923 | go cfn(i, cha[i]) 924 | } 925 | 926 | for iters := 0; iters < 20; iters++ { 927 | if (rand.Int() % 1000) < 200 { 928 | i := rand.Int() % servers 929 | cfg.disconnect(i) 930 | } 931 | 932 | if (rand.Int() % 1000) < 500 { 933 | i := rand.Int() % servers 934 | if cfg.rafts[i] == nil { 935 | cfg.start1(i, cfg.applier) 936 | } 937 | cfg.connect(i) 938 | } 939 | 940 | if (rand.Int() % 1000) < 200 { 941 | i := rand.Int() % servers 942 | if cfg.rafts[i] != nil { 943 | cfg.crash1(i) 944 | } 945 | } 946 | 947 | // Make crash/restart infrequent enough that the peers can often 948 | // keep up, but not so infrequent that everything has settled 949 | // down from one change to the next. Pick a value smaller than 950 | // the election timeout, but not hugely smaller. 951 | time.Sleep((RaftElectionTimeout * 7) / 10) 952 | } 953 | 954 | time.Sleep(RaftElectionTimeout) 955 | cfg.setunreliable(false) 956 | for i := 0; i < servers; i++ { 957 | if cfg.rafts[i] == nil { 958 | cfg.start1(i, cfg.applier) 959 | } 960 | cfg.connect(i) 961 | } 962 | 963 | atomic.StoreInt32(&stop, 1) 964 | 965 | values := []int{} 966 | for i := 0; i < ncli; i++ { 967 | vv := <-cha[i] 968 | if vv == nil { 969 | t.Fatal("client failed") 970 | } 971 | values = append(values, vv...) 
972 | } 973 | 974 | time.Sleep(RaftElectionTimeout) 975 | 976 | lastIndex := cfg.one(rand.Int(), servers, true) 977 | 978 | really := make([]int, lastIndex+1) 979 | for index := 1; index <= lastIndex; index++ { 980 | v := cfg.wait(index, servers, -1) 981 | if vi, ok := v.(int); ok { 982 | really = append(really, vi) 983 | } else { 984 | t.Fatalf("not an int") 985 | } 986 | } 987 | 988 | for _, v1 := range values { 989 | ok := false 990 | for _, v2 := range really { 991 | if v1 == v2 { 992 | ok = true 993 | } 994 | } 995 | if ok == false { 996 | cfg.t.Fatalf("didn't find a value") 997 | } 998 | } 999 | 1000 | cfg.end() 1001 | } 1002 | 1003 | func TestReliableChurn2C(t *testing.T) { 1004 | internalChurn(t, false) 1005 | } 1006 | 1007 | func TestUnreliableChurn2C(t *testing.T) { 1008 | internalChurn(t, true) 1009 | } 1010 | 1011 | const MAXLOGSIZE = 2000 1012 | 1013 | func snapcommon(t *testing.T, name string, disconnect bool, reliable bool, crash bool) { 1014 | iters := 30 1015 | servers := 3 1016 | cfg := make_config(t, servers, !reliable, true) 1017 | defer cfg.cleanup() 1018 | 1019 | cfg.begin(name) 1020 | 1021 | cfg.one(rand.Int(), servers, true) 1022 | leader1 := cfg.checkOneLeader() 1023 | 1024 | for i := 0; i < iters; i++ { 1025 | victim := (leader1 + 1) % servers 1026 | sender := leader1 1027 | if i%3 == 1 { 1028 | sender = (leader1 + 1) % servers 1029 | victim = leader1 1030 | } 1031 | 1032 | if disconnect { 1033 | cfg.disconnect(victim) 1034 | cfg.one(rand.Int(), servers-1, true) 1035 | } 1036 | if crash { 1037 | cfg.crash1(victim) 1038 | cfg.one(rand.Int(), servers-1, true) 1039 | } 1040 | // send enough to get a snapshot 1041 | for i := 0; i < SnapShotInterval+1; i++ { 1042 | cfg.rafts[sender].Start(rand.Int()) 1043 | } 1044 | // let applier threads catch up with the Start()'s 1045 | cfg.one(rand.Int(), servers-1, true) 1046 | 1047 | if cfg.LogSize() >= MAXLOGSIZE { 1048 | cfg.t.Fatalf("Log size too large") 1049 | } 1050 | if disconnect { 1051 | // reconnect a follower, who may be behind and 1052 | // needs to receive a snapshot to catch up.
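// (the leader may already have discarded the log entries the follower is missing, in which case only a snapshot can bring it up to date.)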
1053 | cfg.connect(victim) 1054 | cfg.one(rand.Int(), servers, true) 1055 | leader1 = cfg.checkOneLeader() 1056 | } 1057 | if crash { 1058 | cfg.start1(victim, cfg.applierSnap) 1059 | cfg.connect(victim) 1060 | cfg.one(rand.Int(), servers, true) 1061 | leader1 = cfg.checkOneLeader() 1062 | } 1063 | } 1064 | cfg.end() 1065 | } 1066 | 1067 | func TestSnapshotBasic2D(t *testing.T) { 1068 | snapcommon(t, "Test (2D): snapshots basic", false, true, false) 1069 | } 1070 | 1071 | func TestSnapshotInstall2D(t *testing.T) { 1072 | snapcommon(t, "Test (2D): install snapshots (disconnect)", true, true, false) 1073 | } 1074 | 1075 | func TestSnapshotInstallUnreliable2D(t *testing.T) { 1076 | snapcommon(t, "Test (2D): install snapshots (disconnect+unreliable)", 1077 | true, false, false) 1078 | } 1079 | 1080 | func TestSnapshotInstallCrash2D(t *testing.T) { 1081 | snapcommon(t, "Test (2D): install snapshots (crash)", false, true, true) 1082 | } 1083 | 1084 | func TestSnapshotInstallUnCrash2D(t *testing.T) { 1085 | snapcommon(t, "Test (2D): install snapshots (unreliable+crash)", false, false, true) 1086 | } 1087 | -------------------------------------------------------------------------------- /util.go: -------------------------------------------------------------------------------- 1 | package raft 2 | 3 | import "log" 4 | 5 | // Debugging 6 | const Debug = false 7 | 8 | func DPrintf(format string, a ...interface{}) (n int, err error) { 9 | if Debug { 10 | log.Printf(format, a...) 11 | } 12 | return 13 | } 14 | --------------------------------------------------------------------------------
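A note on timing: the tests above constrain how an implementation paces itself. Elections must finish well inside RaftElectionTimeout (one second), TestCount2B allows only about 3*20 RPCs across the three servers during an idle second, and TestInitialElection2A checks that a leader still exists after two idle timeouts. The sketch below shows one set of timeout choices consistent with those constraints; the constant values and the helper name are illustrative assumptions, not code taken from this repository's raft.go.

package raft

import (
	"math/rand"
	"time"
)

// Illustrative value only: a few heartbeats per election timeout keeps
// followers quiet, while a 3-server cluster stays far below TestCount2B's
// idle budget of 3*20 RPCs per second.
const heartbeatInterval = 120 * time.Millisecond

// randomizedElectionTimeout spreads follower timeouts over a range so that
// split votes are rare, while leaving room for a retry or two inside the
// tester's one-second election allowance. An implementation would typically
// report timeout resets through DPrintf from util.go while debugging.
func randomizedElectionTimeout() time.Duration {
	const lo, hi = 300 * time.Millisecond, 600 * time.Millisecond
	return lo + time.Duration(rand.Int63n(int64(hi-lo)))
}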