├── go.sum ├── go.mod ├── .gitignore ├── main ├── viewd.go ├── pbd.go ├── test-mr-many.sh ├── mrcoordinator.go ├── lockc.go ├── lockd.go ├── pbc.go ├── mrworker.go ├── diskvd.go ├── mrsequential.go ├── test-mr-early.sh └── test-mr.sh ├── raft ├── util.go ├── persister.go ├── raft_snapshot.go ├── raft_vote.go └── raft_append_entries.go ├── mr ├── common.go ├── rpc.go ├── coordinator.go └── worker.go ├── kvraft ├── util.go ├── common.go ├── client.go ├── server.go └── config.go ├── shardctrler ├── util.go ├── client.go ├── common.go ├── config.go ├── test_test.go └── server.go ├── shardkv ├── util.go ├── server_snapshot.go ├── server_op.go ├── common.go ├── client.go ├── server_shard.go ├── server.go ├── server_apply.go └── config.go ├── mrapps ├── early_exit.go ├── nocrash.go ├── jobcount.go ├── indexer.go ├── wc.go ├── crash.go ├── rtiming.go └── mtiming.go ├── README.md ├── porcupine ├── porcupine.go ├── bitset.go ├── model.go └── checker.go ├── models └── kv.go └── labgob ├── test_test.go └── labgob.go /go.sum: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module 6.824 2 | 3 | go 1.17 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.*/ 2 | main/mr-tmp/ 3 | mrtmp.* 4 | 824-mrinput-*.txt 5 | /main/diff.out 6 | /mapreduce/x.txt 7 | /pbservice/x.txt 8 | /kvpaxos/x.txt 9 | *.so 10 | /main/mrcoordinator 11 | /main/mrsequential 12 | /main/mrworker 13 | -------------------------------------------------------------------------------- /main/viewd.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | // 4 | // see directions in pbc.go 5 | // 6 | 7 | import "time" 8 | import "6.824/viewservice" 9 | import "os" 10 | import "fmt" 11 | 12 | func main() { 13 | if len(os.Args) != 2 { 14 | fmt.Printf("Usage: viewd port\n") 15 | os.Exit(1) 16 | } 17 | 18 | viewservice.StartServer(os.Args[1]) 19 | 20 | for { 21 | time.Sleep(100 * time.Second) 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /main/pbd.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | // 4 | // see directions in pbc.go 5 | // 6 | 7 | import "time" 8 | import "6.824/pbservice" 9 | import "os" 10 | import "fmt" 11 | 12 | func main() { 13 | if len(os.Args) != 3 { 14 | fmt.Printf("Usage: pbd viewport myport\n") 15 | os.Exit(1) 16 | } 17 | 18 | pbservice.StartServer(os.Args[1], os.Args[2]) 19 | 20 | for { 21 | time.Sleep(100 * time.Second) 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /main/test-mr-many.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | if [ $# -ne 1 ]; then 4 | echo "Usage: $0 numTrials" 5 | exit 1 6 | fi 7 | 8 | trap 'kill -INT -$pid; exit 1' INT 9 | 10 | # Note: because the socketID is based on the current userID, 11 | # ./test-mr.sh cannot be run in parallel 12 | runs=$1 13 | chmod +x test-mr.sh 14 | 15 | for i in $(seq 1 $runs); do 16 | timeout -k 2s 900s ./test-mr.sh & 17 | pid=$! 18 | if ! 
wait $pid; then 19 | echo '***' FAILED TESTS IN TRIAL $i 20 | exit 1 21 | fi 22 | done 23 | echo '***' PASSED ALL $i TESTING TRIALS 24 | -------------------------------------------------------------------------------- /main/mrcoordinator.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | // 4 | // start the coordinator process, which is implemented 5 | // in ../mr/coordinator.go 6 | // 7 | // go run mrcoordinator.go pg*.txt 8 | // 9 | // Please do not change this file. 10 | // 11 | 12 | import "6.824/mr" 13 | import "time" 14 | import "os" 15 | import "fmt" 16 | 17 | func main() { 18 | if len(os.Args) < 2 { 19 | fmt.Fprintf(os.Stderr, "Usage: mrcoordinator inputfiles...\n") 20 | os.Exit(1) 21 | } 22 | 23 | m := mr.MakeCoordinator(os.Args[1:], 10) 24 | for m.Done() == false { 25 | time.Sleep(time.Second) 26 | } 27 | 28 | time.Sleep(time.Second) 29 | } 30 | -------------------------------------------------------------------------------- /main/lockc.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | // 4 | // see comments in lockd.go 5 | // 6 | 7 | import "6.824/lockservice" 8 | import "os" 9 | import "fmt" 10 | 11 | func usage() { 12 | fmt.Printf("Usage: lockc -l|-u primaryport backupport lockname\n") 13 | os.Exit(1) 14 | } 15 | 16 | func main() { 17 | if len(os.Args) == 5 { 18 | ck := lockservice.MakeClerk(os.Args[2], os.Args[3]) 19 | var ok bool 20 | if os.Args[1] == "-l" { 21 | ok = ck.Lock(os.Args[4]) 22 | } else if os.Args[1] == "-u" { 23 | ok = ck.Unlock(os.Args[4]) 24 | } else { 25 | usage() 26 | } 27 | fmt.Printf("reply: %v\n", ok) 28 | } else { 29 | usage() 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /raft/util.go: -------------------------------------------------------------------------------- 1 | package raft 2 | 3 | import ( 4 | "fmt" 5 | "log" 6 | "os" 7 | "strconv" 8 | "time" 9 | ) 10 | 11 | // Debugging 12 | const Debug = false 13 | 14 | var file *os.File 15 | 16 | func init() { 17 | f, err := os.Create("./tmp/log-" + strconv.Itoa(int(time.Now().Unix())) + ".txt") 18 | if err != nil { 19 | DPrintf("log create file fail!") 20 | } 21 | file = f 22 | } 23 | 24 | //debug下打印日志 25 | func DPrintf(format string, value ...interface{}) { 26 | now := time.Now() 27 | info := fmt.Sprintf("%v-%v-%v %v:%v:%v: ", now.Year(), int(now.Month()), now.Day(), now.Hour(), now.Minute(), now.Second()) + fmt.Sprintf(format+"\n", value...) 28 | 29 | if Debug { 30 | log.Printf(info) 31 | } else { 32 | //file.WriteString(info) 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /mr/common.go: -------------------------------------------------------------------------------- 1 | package mr 2 | 3 | import ( 4 | "fmt" 5 | "log" 6 | "math/rand" 7 | "os" 8 | "strconv" 9 | "time" 10 | ) 11 | 12 | const Debug = false 13 | 14 | var file *os.File 15 | 16 | func init() { 17 | rand.Seed(10) 18 | f, err := os.Create("log-" + strconv.Itoa(int(time.Now().Unix()+rand.Int63n(100))) + ".txt") 19 | if err != nil { 20 | DPrintf("log create file fail!") 21 | } 22 | file = f 23 | } 24 | 25 | //debug下打印日志 26 | func DPrintf(format string, value ...interface{}) { 27 | now := time.Now() 28 | info := fmt.Sprintf("%v-%v-%v %v:%v:%v: ", now.Year(), int(now.Month()), now.Day(), now.Hour(), now.Minute(), now.Second()) + fmt.Sprintf(format+"\n", value...) 
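	// Note (illustrative addition, not in the original code): the same timestamp
	// prefix could be built with Go's reference-time layout instead of assembling
	// the fields by hand, e.g.
	//
	//	prefix := now.Format("2006-01-02 15:04:05") + ": "
	//
	// time.Format zero-pads month/day/hour/minute/second, which the manual
	// fmt.Sprintf above does not.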
29 | 30 | if Debug { 31 | log.Printf(info) 32 | } else { 33 | file.WriteString(info) 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /kvraft/util.go: -------------------------------------------------------------------------------- 1 | package kvraft 2 | 3 | import ( 4 | "fmt" 5 | "log" 6 | "os" 7 | "strconv" 8 | "time" 9 | ) 10 | 11 | // Debugging 12 | const Debug = false 13 | 14 | var file *os.File 15 | 16 | func init() { 17 | f, err := os.Create("./tmp/log-" + strconv.Itoa(int(time.Now().Unix())) + ".txt") 18 | if err != nil { 19 | DPrintf("log create file fail!") 20 | fmt.Println("log create file fail!") 21 | } 22 | file = f 23 | } 24 | 25 | //debug下打印日志 26 | func DPrintf(format string, value ...interface{}) { 27 | now := time.Now() 28 | info := fmt.Sprintf("%v-%v-%v %v:%v:%v: ", now.Year(), int(now.Month()), now.Day(), now.Hour(), now.Minute(), now.Second()) + fmt.Sprintf(format+"\n", value...) 29 | 30 | if Debug { 31 | log.Printf(info) 32 | } else { 33 | file.WriteString(info) 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /shardctrler/util.go: -------------------------------------------------------------------------------- 1 | package shardctrler 2 | 3 | import ( 4 | "fmt" 5 | "log" 6 | "os" 7 | "strconv" 8 | "time" 9 | ) 10 | 11 | // Debugging 12 | const Debug = false 13 | 14 | var file *os.File 15 | 16 | func init() { 17 | f, err := os.Create("./tmp/log-" + strconv.Itoa(int(time.Now().Unix())) + ".txt") 18 | if err != nil { 19 | DPrintf("log create file fail!") 20 | fmt.Println("log create file fail!") 21 | } 22 | file = f 23 | } 24 | 25 | //debug下打印日志 26 | func DPrintf(format string, value ...interface{}) { 27 | now := time.Now() 28 | info := fmt.Sprintf("%v-%v-%v %v:%v:%v: ", now.Year(), int(now.Month()), now.Day(), now.Hour(), now.Minute(), now.Second()) + fmt.Sprintf(format+"\n", value...) 29 | 30 | if Debug { 31 | log.Printf(info) 32 | } else { 33 | file.WriteString(info) 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /shardkv/util.go: -------------------------------------------------------------------------------- 1 | package shardkv 2 | 3 | import ( 4 | "fmt" 5 | "log" 6 | "os" 7 | "strconv" 8 | "time" 9 | ) 10 | 11 | // Debugging 12 | const Debug = false 13 | 14 | var file *os.File 15 | 16 | func init() { 17 | f, err := os.Create("./tmp/log-" + strconv.Itoa(int(time.Now().Unix())) + ".txt") 18 | if err != nil { 19 | DPrintf("", "log create file fail!") 20 | fmt.Println("log create file fail!") 21 | } 22 | file = f 23 | } 24 | 25 | //debug下打印日志 26 | func DPrintf(msg, format string, value ...interface{}) { 27 | now := time.Now() 28 | info := fmt.Sprintf("%v-%v-%v %v:%v:%v: ", now.Year(), int(now.Month()), now.Day(), now.Hour(), now.Minute(), now.Second()) + msg + fmt.Sprintf(format+"\n", value...) 29 | 30 | if Debug { 31 | log.Printf(info) 32 | } else { 33 | file.WriteString(info) 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /main/lockd.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | // export GOPATH=~/6.824 4 | // go build lockd.go 5 | // go build lockc.go 6 | // ./lockd -p a b & 7 | // ./lockd -b a b & 8 | // ./lockc -l a b lx 9 | // ./lockc -u a b lx 10 | // 11 | // on Athena, use /tmp/myname-a and /tmp/myname-b 12 | // instead of a and b. 
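// Flag summary (added note): -p starts the primary lock server and -b the backup;
// both take the primary and backup ports as arguments (see the StartServer calls
// in main below).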
13 | 14 | import "time" 15 | import "6.824/lockservice" 16 | import "os" 17 | import "fmt" 18 | 19 | func main() { 20 | if len(os.Args) == 4 && os.Args[1] == "-p" { 21 | lockservice.StartServer(os.Args[2], os.Args[3], true) 22 | } else if len(os.Args) == 4 && os.Args[1] == "-b" { 23 | lockservice.StartServer(os.Args[2], os.Args[3], false) 24 | } else { 25 | fmt.Printf("Usage: lockd -p|-b primaryport backupport\n") 26 | os.Exit(1) 27 | } 28 | for { 29 | time.Sleep(100 * time.Second) 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /kvraft/common.go: -------------------------------------------------------------------------------- 1 | package kvraft 2 | 3 | const ( 4 | OK = "OK" 5 | ErrNoKey = "ErrNoKey" 6 | ErrWrongLeader = "ErrWrongLeader" 7 | ErrTimeOut = "ErrTimeOut" 8 | ErrServer = "ErrServer" 9 | ) 10 | 11 | type Err string 12 | 13 | // Put or Append 14 | type PutAppendArgs struct { 15 | Key string 16 | Value string 17 | Op string // "Put" or "Append" 18 | // You'll have to add definitions here. 19 | // Field names must start with capital letters, 20 | // otherwise RPC will break. 21 | ClientId int64 22 | CommandId int64 23 | } 24 | 25 | type PutAppendReply struct { 26 | Err Err 27 | } 28 | 29 | type GetArgs struct { 30 | Key string 31 | ClientId int64 32 | CommandId int64 33 | // You'll have to add definitions here. 34 | } 35 | 36 | type GetReply struct { 37 | Err Err 38 | Value string 39 | } 40 | -------------------------------------------------------------------------------- /main/pbc.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | // 4 | // pbservice client application 5 | // 6 | // export GOPATH=~/6.824 7 | // go build viewd.go 8 | // go build pbd.go 9 | // go build pbc.go 10 | // ./viewd /tmp/rtm-v & 11 | // ./pbd /tmp/rtm-v /tmp/rtm-1 & 12 | // ./pbd /tmp/rtm-v /tmp/rtm-2 & 13 | // ./pbc /tmp/rtm-v key1 value1 14 | // ./pbc /tmp/rtm-v key1 15 | // 16 | // change "rtm" to your user name. 17 | // start the pbd programs in separate windows and kill 18 | // and restart them to exercise fault tolerance. 19 | // 20 | 21 | import "6.824/pbservice" 22 | import "os" 23 | import "fmt" 24 | 25 | func usage() { 26 | fmt.Printf("Usage: pbc viewport key\n") 27 | fmt.Printf(" pbc viewport key value\n") 28 | os.Exit(1) 29 | } 30 | 31 | func main() { 32 | if len(os.Args) == 3 { 33 | // get 34 | ck := pbservice.MakeClerk(os.Args[1], "") 35 | v := ck.Get(os.Args[2]) 36 | fmt.Printf("%v\n", v) 37 | } else if len(os.Args) == 4 { 38 | // put 39 | ck := pbservice.MakeClerk(os.Args[1], "") 40 | ck.Put(os.Args[2], os.Args[3]) 41 | } else { 42 | usage() 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /mr/rpc.go: -------------------------------------------------------------------------------- 1 | package mr 2 | 3 | // 4 | // RPC definitions. 5 | // 6 | // remember to capitalize all names. 7 | // 8 | 9 | import "os" 10 | import "strconv" 11 | 12 | // 13 | // example to show how to declare the arguments 14 | // and reply for an RPC. 15 | // 16 | 17 | type ExampleArgs struct { 18 | X int 19 | } 20 | 21 | type ExampleReply struct { 22 | Y int 23 | } 24 | 25 | // Add your RPC definitions here. 
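// Illustrative sketch (not part of the original file): a worker sends these
// argument/reply structs to the coordinator over the UNIX-domain socket
// returned by coordinatorSock() below. Assuming "net/rpc" and "log" are
// imported and the coordinator has registered a method such as
// "Coordinator.GetTask" (name assumed here), a generic call helper might look like:
//
//	func call(rpcname string, args interface{}, reply interface{}) bool {
//		c, err := rpc.DialHTTP("unix", coordinatorSock())
//		if err != nil {
//			log.Fatal("dialing:", err)
//		}
//		defer c.Close()
//		return c.Call(rpcname, args, reply) == nil
//	}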
26 | 27 | //用于获取任务 28 | type TaskArgs struct { 29 | WorkerId int 30 | } 31 | 32 | type TaskReply struct { 33 | Task *Task 34 | } 35 | 36 | //用于worker创建后的注册 37 | type RegArgs struct { 38 | } 39 | 40 | type RegReply struct { 41 | WorkerId int 42 | } 43 | 44 | //用于worker响应任务 45 | type ReportTaskArgs struct { 46 | WorkerId int 47 | Phase TaskPhase 48 | Seq int 49 | Done bool 50 | } 51 | 52 | type ReportTaskReply struct { 53 | } 54 | 55 | // Cook up a unique-ish UNIX-domain socket name 56 | // in /var/tmp, for the coordinator. 57 | // Can't use the current directory since 58 | // Athena AFS doesn't support UNIX-domain sockets. 59 | func coordinatorSock() string { 60 | s := "/var/tmp/824-mr-" 61 | s += strconv.Itoa(os.Getuid()) 62 | return s 63 | } 64 | -------------------------------------------------------------------------------- /mrapps/early_exit.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | // 4 | // a word-count application "plugin" for MapReduce. 5 | // 6 | // go build -buildmode=plugin wc_long.go 7 | // 8 | 9 | import ( 10 | "strconv" 11 | "strings" 12 | "time" 13 | 14 | "6.824/mr" 15 | ) 16 | 17 | // 18 | // The map function is called once for each file of input. 19 | // This map function just returns 1 for each file 20 | // 21 | func Map(filename string, contents string) []mr.KeyValue { 22 | kva := []mr.KeyValue{} 23 | kva = append(kva, mr.KeyValue{filename, "1"}) 24 | return kva 25 | } 26 | 27 | // 28 | // The reduce function is called once for each key generated by the 29 | // map tasks, with a list of all the values created for that key by 30 | // any map task. 31 | // 32 | func Reduce(key string, values []string) string { 33 | // some reduce tasks sleep for a long time; potentially seeing if 34 | // a worker will accidentally exit early 35 | if strings.Contains(key, "sherlock") || strings.Contains(key, "tom") { 36 | time.Sleep(time.Duration(3 * time.Second)) 37 | } 38 | // return the number of occurrences of this file. 39 | return strconv.Itoa(len(values)) 40 | } 41 | -------------------------------------------------------------------------------- /mrapps/nocrash.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | // 4 | // same as crash.go but doesn't actually crash. 5 | // 6 | // go build -buildmode=plugin nocrash.go 7 | // 8 | 9 | import "6.824/mr" 10 | import crand "crypto/rand" 11 | import "math/big" 12 | import "strings" 13 | import "os" 14 | import "sort" 15 | import "strconv" 16 | 17 | func maybeCrash() { 18 | max := big.NewInt(1000) 19 | rr, _ := crand.Int(crand.Reader, max) 20 | if false && rr.Int64() < 500 { 21 | // crash! 22 | os.Exit(1) 23 | } 24 | } 25 | 26 | func Map(filename string, contents string) []mr.KeyValue { 27 | maybeCrash() 28 | 29 | kva := []mr.KeyValue{} 30 | kva = append(kva, mr.KeyValue{"a", filename}) 31 | kva = append(kva, mr.KeyValue{"b", strconv.Itoa(len(filename))}) 32 | kva = append(kva, mr.KeyValue{"c", strconv.Itoa(len(contents))}) 33 | kva = append(kva, mr.KeyValue{"d", "xyzzy"}) 34 | return kva 35 | } 36 | 37 | func Reduce(key string, values []string) string { 38 | maybeCrash() 39 | 40 | // sort values to ensure deterministic output. 
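	// (copying into vv first also avoids reordering the caller's values slice in place)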
41 | vv := make([]string, len(values)) 42 | copy(vv, values) 43 | sort.Strings(vv) 44 | 45 | val := strings.Join(vv, " ") 46 | return val 47 | } 48 | -------------------------------------------------------------------------------- /mrapps/jobcount.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | // 4 | // a MapReduce pseudo-application that counts the number of times map/reduce 5 | // tasks are run, to test whether jobs are assigned multiple times even when 6 | // there is no failure. 7 | // 8 | // go build -buildmode=plugin crash.go 9 | // 10 | 11 | import "6.824/mr" 12 | import "math/rand" 13 | import "strings" 14 | import "strconv" 15 | import "time" 16 | import "fmt" 17 | import "os" 18 | import "io/ioutil" 19 | 20 | var count int 21 | 22 | func Map(filename string, contents string) []mr.KeyValue { 23 | me := os.Getpid() 24 | f := fmt.Sprintf("mr-worker-jobcount-%d-%d", me, count) 25 | count++ 26 | err := ioutil.WriteFile(f, []byte("x"), 0666) 27 | if err != nil { 28 | panic(err) 29 | } 30 | time.Sleep(time.Duration(2000+rand.Intn(3000)) * time.Millisecond) 31 | return []mr.KeyValue{mr.KeyValue{"a", "x"}} 32 | } 33 | 34 | func Reduce(key string, values []string) string { 35 | files, err := ioutil.ReadDir(".") 36 | if err != nil { 37 | panic(err) 38 | } 39 | invocations := 0 40 | for _, f := range files { 41 | if strings.HasPrefix(f.Name(), "mr-worker-jobcount") { 42 | invocations++ 43 | } 44 | } 45 | return strconv.Itoa(invocations) 46 | } 47 | -------------------------------------------------------------------------------- /mrapps/indexer.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | // 4 | // an indexing application "plugin" for MapReduce. 5 | // 6 | // go build -buildmode=plugin indexer.go 7 | // 8 | 9 | import "fmt" 10 | import "6.824/mr" 11 | 12 | import "strings" 13 | import "unicode" 14 | import "sort" 15 | 16 | // The mapping function is called once for each piece of the input. 17 | // In this framework, the key is the name of the file that is being processed, 18 | // and the value is the file's contents. The return value should be a slice of 19 | // key/value pairs, each represented by a mr.KeyValue. 20 | func Map(document string, value string) (res []mr.KeyValue) { 21 | m := make(map[string]bool) 22 | words := strings.FieldsFunc(value, func(x rune) bool { return !unicode.IsLetter(x) }) 23 | for _, w := range words { 24 | m[w] = true 25 | } 26 | for w := range m { 27 | kv := mr.KeyValue{w, document} 28 | res = append(res, kv) 29 | } 30 | return 31 | } 32 | 33 | // The reduce function is called once for each key generated by Map, with a 34 | // list of that key's string value (merged across all inputs). The return value 35 | // should be a single output value for that key. 36 | func Reduce(key string, values []string) string { 37 | sort.Strings(values) 38 | return fmt.Sprintf("%d %s", len(values), strings.Join(values, ",")) 39 | } 40 | -------------------------------------------------------------------------------- /mrapps/wc.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | // 4 | // a word-count application "plugin" for MapReduce. 5 | // 6 | // go build -buildmode=plugin wc.go 7 | // 8 | 9 | import "6.824/mr" 10 | import "unicode" 11 | import "strings" 12 | import "strconv" 13 | 14 | // 15 | // The map function is called once for each file of input. 
The first 16 | // argument is the name of the input file, and the second is the 17 | // file's complete contents. You should ignore the input file name, 18 | // and look only at the contents argument. The return value is a slice 19 | // of key/value pairs. 20 | // 21 | func Map(filename string, contents string) []mr.KeyValue { 22 | // function to detect word separators. 23 | ff := func(r rune) bool { return !unicode.IsLetter(r) } 24 | 25 | // split contents into an array of words. 26 | words := strings.FieldsFunc(contents, ff) 27 | 28 | kva := []mr.KeyValue{} 29 | for _, w := range words { 30 | kv := mr.KeyValue{w, "1"} 31 | kva = append(kva, kv) 32 | } 33 | return kva 34 | } 35 | 36 | // 37 | // The reduce function is called once for each key generated by the 38 | // map tasks, with a list of all the values created for that key by 39 | // any map task. 40 | // 41 | func Reduce(key string, values []string) string { 42 | // return the number of occurrences of this word. 43 | return strconv.Itoa(len(values)) 44 | } 45 | -------------------------------------------------------------------------------- /main/mrworker.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | // 4 | // start a worker process, which is implemented 5 | // in ../mr/worker.go. typically there will be 6 | // multiple worker processes, talking to one coordinator. 7 | // 8 | // go run mrworker.go wc.so 9 | // 10 | // Please do not change this file. 11 | // 12 | 13 | import "6.824/mr" 14 | import "plugin" 15 | import "os" 16 | import "fmt" 17 | import "log" 18 | 19 | func main() { 20 | if len(os.Args) != 2 { 21 | fmt.Fprintf(os.Stderr, "Usage: mrworker xxx.so\n") 22 | os.Exit(1) 23 | } 24 | 25 | mapf, reducef := loadPlugin(os.Args[1]) 26 | 27 | mr.Worker(mapf, reducef) 28 | } 29 | 30 | // 31 | // load the application Map and Reduce functions 32 | // from a plugin file, e.g. 
../mrapps/wc.so 33 | // 34 | func loadPlugin(filename string) (func(string, string) []mr.KeyValue, func(string, []string) string) { 35 | p, err := plugin.Open(filename) 36 | if err != nil { 37 | log.Fatalf("cannot load plugin %v", filename) 38 | } 39 | xmapf, err := p.Lookup("Map") 40 | if err != nil { 41 | log.Fatalf("cannot find Map in %v", filename) 42 | } 43 | mapf := xmapf.(func(string, string) []mr.KeyValue) 44 | xreducef, err := p.Lookup("Reduce") 45 | if err != nil { 46 | log.Fatalf("cannot find Reduce in %v", filename) 47 | } 48 | reducef := xreducef.(func(string, []string) string) 49 | 50 | return mapf, reducef 51 | } 52 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## MIT 6.824 Distributed Systems 2 | [6.824 Schedule: Spring 2022](https://pdos.csail.mit.edu/6.824/schedule.html). All four labs of the course have been completed and documented: 3 | 4 | - Lab01-MapReduce: https://blog.csdn.net/qq_44766883/article/details/124475672 5 | - Lab02-Raft: 6 | - Part 2A (leader election): https://blog.csdn.net/qq_44766883/article/details/126255117 7 | - Part 2B (log replication): https://blog.csdn.net/qq_44766883/article/details/126255214 8 | - Part 2C (persistence): https://blog.csdn.net/qq_44766883/article/details/126255266 9 | - Part 2D (log compaction): https://blog.csdn.net/qq_44766883/article/details/126255298 10 | - Lab03-Fault-tolerant KV Service: 11 | - Part 3A (Key/value service without snapshots): https://blog.csdn.net/qq_44766883/article/details/126333690 12 | - Part 3B (Key/value service with snapshots): https://blog.csdn.net/qq_44766883/article/details/126333739 13 | - Lab04-Sharded KV Service: 14 | - Part 4A (The Shard controller): https://blog.csdn.net/qq_44766883/article/details/126430294 15 | - Part 4B (Sharded Key/Value Server): https://blog.csdn.net/qq_44766883/article/details/126430452 16 | 17 | Related material: 18 | - https://github.com/maemual/raft-zh_cn/blob/master/raft-zh_cn.md 19 | - https://raft.github.io/raft.pdf 20 | - https://pdos.csail.mit.edu/6.824/papers/mapreduce.pdf 21 | - https://github.com/OneSizeFitsQuorum/MIT6.824-2021 22 | - https://github.com/chaozh/MIT-6.824 23 | - https://www.bilibili.com/video/av87684880 24 | - https://shimo.im/docs/xwqvh3kGppJKvHvX?fallback=1 -------------------------------------------------------------------------------- /mrapps/crash.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | // 4 | // a MapReduce pseudo-application that sometimes crashes, 5 | // and sometimes takes a long time, 6 | // to test MapReduce's ability to recover. 7 | // 8 | // go build -buildmode=plugin crash.go 9 | // 10 | 11 | import "6.824/mr" 12 | import crand "crypto/rand" 13 | import "math/big" 14 | import "strings" 15 | import "os" 16 | import "sort" 17 | import "strconv" 18 | import "time" 19 | 20 | func maybeCrash() { 21 | max := big.NewInt(1000) 22 | rr, _ := crand.Int(crand.Reader, max) 23 | if rr.Int64() < 330 { 24 | // crash! 25 | os.Exit(1) 26 | } else if rr.Int64() < 660 { 27 | // delay for a while.
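		// (the delay below is uniform in [0, 10) seconds, drawn from crypto/rand)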
28 | maxms := big.NewInt(10 * 1000) 29 | ms, _ := crand.Int(crand.Reader, maxms) 30 | time.Sleep(time.Duration(ms.Int64()) * time.Millisecond) 31 | } 32 | } 33 | 34 | func Map(filename string, contents string) []mr.KeyValue { 35 | maybeCrash() 36 | 37 | kva := []mr.KeyValue{} 38 | kva = append(kva, mr.KeyValue{"a", filename}) 39 | kva = append(kva, mr.KeyValue{"b", strconv.Itoa(len(filename))}) 40 | kva = append(kva, mr.KeyValue{"c", strconv.Itoa(len(contents))}) 41 | kva = append(kva, mr.KeyValue{"d", "xyzzy"}) 42 | return kva 43 | } 44 | 45 | func Reduce(key string, values []string) string { 46 | maybeCrash() 47 | 48 | // sort values to ensure deterministic output. 49 | vv := make([]string, len(values)) 50 | copy(vv, values) 51 | sort.Strings(vv) 52 | 53 | val := strings.Join(vv, " ") 54 | return val 55 | } 56 | -------------------------------------------------------------------------------- /porcupine/porcupine.go: -------------------------------------------------------------------------------- 1 | package porcupine 2 | 3 | import "time" 4 | 5 | func CheckOperations(model Model, history []Operation) bool { 6 | res, _ := checkOperations(model, history, false, 0) 7 | return res == Ok 8 | } 9 | 10 | // timeout = 0 means no timeout 11 | // if this operation times out, then a false positive is possible 12 | func CheckOperationsTimeout(model Model, history []Operation, timeout time.Duration) CheckResult { 13 | res, _ := checkOperations(model, history, false, timeout) 14 | return res 15 | } 16 | 17 | // timeout = 0 means no timeout 18 | // if this operation times out, then a false positive is possible 19 | func CheckOperationsVerbose(model Model, history []Operation, timeout time.Duration) (CheckResult, linearizationInfo) { 20 | return checkOperations(model, history, true, timeout) 21 | } 22 | 23 | func CheckEvents(model Model, history []Event) bool { 24 | res, _ := checkEvents(model, history, false, 0) 25 | return res == Ok 26 | } 27 | 28 | // timeout = 0 means no timeout 29 | // if this operation times out, then a false positive is possible 30 | func CheckEventsTimeout(model Model, history []Event, timeout time.Duration) CheckResult { 31 | res, _ := checkEvents(model, history, false, timeout) 32 | return res 33 | } 34 | 35 | // timeout = 0 means no timeout 36 | // if this operation times out, then a false positive is possible 37 | func CheckEventsVerbose(model Model, history []Event, timeout time.Duration) (CheckResult, linearizationInfo) { 38 | return checkEvents(model, history, true, timeout) 39 | } 40 | -------------------------------------------------------------------------------- /porcupine/bitset.go: -------------------------------------------------------------------------------- 1 | package porcupine 2 | 3 | import "math/bits" 4 | 5 | type bitset []uint64 6 | 7 | // data layout: 8 | // bits 0-63 are in data[0], the next are in data[1], etc. 
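// For example, bit 70 lives in data[1] at bit position 6 (70/64 = 1, 70%64 = 6),
// which is what bitsetIndex below computes. Illustrative use:
//
//	b := newBitset(128) // two uint64 chunks
//	b.set(70)
//	_ = b.get(70) // true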
9 | 10 | func newBitset(bits uint) bitset { 11 | extra := uint(0) 12 | if bits%64 != 0 { 13 | extra = 1 14 | } 15 | chunks := bits/64 + extra 16 | return bitset(make([]uint64, chunks)) 17 | } 18 | 19 | func (b bitset) clone() bitset { 20 | dataCopy := make([]uint64, len(b)) 21 | copy(dataCopy, b) 22 | return bitset(dataCopy) 23 | } 24 | 25 | func bitsetIndex(pos uint) (uint, uint) { 26 | return pos / 64, pos % 64 27 | } 28 | 29 | func (b bitset) set(pos uint) bitset { 30 | major, minor := bitsetIndex(pos) 31 | b[major] |= (1 << minor) 32 | return b 33 | } 34 | 35 | func (b bitset) clear(pos uint) bitset { 36 | major, minor := bitsetIndex(pos) 37 | b[major] &^= (1 << minor) 38 | return b 39 | } 40 | 41 | func (b bitset) get(pos uint) bool { 42 | major, minor := bitsetIndex(pos) 43 | return b[major]&(1< get, 1 => put, 2 => append 9 | Key string 10 | Value string 11 | } 12 | 13 | type KvOutput struct { 14 | Value string 15 | } 16 | 17 | var KvModel = porcupine.Model{ 18 | Partition: func(history []porcupine.Operation) [][]porcupine.Operation { 19 | m := make(map[string][]porcupine.Operation) 20 | for _, v := range history { 21 | key := v.Input.(KvInput).Key 22 | m[key] = append(m[key], v) 23 | } 24 | keys := make([]string, 0, len(m)) 25 | for k := range m { 26 | keys = append(keys, k) 27 | } 28 | sort.Strings(keys) 29 | ret := make([][]porcupine.Operation, 0, len(keys)) 30 | for _, k := range keys { 31 | ret = append(ret, m[k]) 32 | } 33 | return ret 34 | }, 35 | Init: func() interface{} { 36 | // note: we are modeling a single key's value here; 37 | // we're partitioning by key, so this is okay 38 | return "" 39 | }, 40 | Step: func(state, input, output interface{}) (bool, interface{}) { 41 | inp := input.(KvInput) 42 | out := output.(KvOutput) 43 | st := state.(string) 44 | if inp.Op == 0 { 45 | // get 46 | return out.Value == st, state 47 | } else if inp.Op == 1 { 48 | // put 49 | return true, inp.Value 50 | } else { 51 | // append 52 | return true, (st + inp.Value) 53 | } 54 | }, 55 | DescribeOperation: func(input, output interface{}) string { 56 | inp := input.(KvInput) 57 | out := output.(KvOutput) 58 | switch inp.Op { 59 | case 0: 60 | return fmt.Sprintf("get('%s') -> '%s'", inp.Key, out.Value) 61 | case 1: 62 | return fmt.Sprintf("put('%s', '%s')", inp.Key, inp.Value) 63 | case 2: 64 | return fmt.Sprintf("append('%s', '%s')", inp.Key, inp.Value) 65 | default: 66 | return "" 67 | } 68 | }, 69 | } 70 | -------------------------------------------------------------------------------- /main/diskvd.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | // 4 | // start a diskvd server. it's a member of some replica 5 | // group, which has other members, and it needs to know 6 | // how to talk to the members of the shardmaster service. 7 | // used by ../diskv/test_test.go 8 | // 9 | // arguments: 10 | // -g groupid 11 | // -m masterport1 -m masterport2 ... 12 | // -s replicaport1 -s replicaport2 ... 13 | // -i my-index-in-server-port-list 14 | // -u unreliable 15 | // -d directory 16 | // -r restart 17 | 18 | import "time" 19 | import "6.824/diskv" 20 | import "os" 21 | import "fmt" 22 | import "strconv" 23 | import "runtime" 24 | 25 | func usage() { 26 | fmt.Printf("Usage: diskvd -g gid -m master... -s server... 
-i my-index -d dir\n") 27 | os.Exit(1) 28 | } 29 | 30 | func main() { 31 | var gid int64 = -1 // my replica group ID 32 | masters := []string{} // ports of shardmasters 33 | replicas := []string{} // ports of servers in my replica group 34 | me := -1 // my index in replicas[] 35 | unreliable := false 36 | dir := "" // store persistent data here 37 | restart := false 38 | 39 | for i := 1; i+1 < len(os.Args); i += 2 { 40 | a0 := os.Args[i] 41 | a1 := os.Args[i+1] 42 | if a0 == "-g" { 43 | gid, _ = strconv.ParseInt(a1, 10, 64) 44 | } else if a0 == "-m" { 45 | masters = append(masters, a1) 46 | } else if a0 == "-s" { 47 | replicas = append(replicas, a1) 48 | } else if a0 == "-i" { 49 | me, _ = strconv.Atoi(a1) 50 | } else if a0 == "-u" { 51 | unreliable, _ = strconv.ParseBool(a1) 52 | } else if a0 == "-d" { 53 | dir = a1 54 | } else if a0 == "-r" { 55 | restart, _ = strconv.ParseBool(a1) 56 | } else { 57 | usage() 58 | } 59 | } 60 | 61 | if gid < 0 || me < 0 || len(masters) < 1 || me >= len(replicas) || dir == "" { 62 | usage() 63 | } 64 | 65 | runtime.GOMAXPROCS(4) 66 | 67 | srv := diskv.StartServer(gid, masters, replicas, me, dir, restart) 68 | srv.Setunreliable(unreliable) 69 | 70 | // for safety, force quit after 10 minutes. 71 | time.Sleep(10 * 60 * time.Second) 72 | mep, _ := os.FindProcess(os.Getpid()) 73 | mep.Kill() 74 | } 75 | -------------------------------------------------------------------------------- /mrapps/rtiming.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | // 4 | // a MapReduce pseudo-application to test that workers 5 | // execute reduce tasks in parallel. 6 | // 7 | // go build -buildmode=plugin rtiming.go 8 | // 9 | 10 | import "6.824/mr" 11 | import "fmt" 12 | import "os" 13 | import "syscall" 14 | import "time" 15 | import "io/ioutil" 16 | 17 | func nparallel(phase string) int { 18 | // create a file so that other workers will see that 19 | // we're running at the same time as them. 20 | pid := os.Getpid() 21 | myfilename := fmt.Sprintf("mr-worker-%s-%d", phase, pid) 22 | err := ioutil.WriteFile(myfilename, []byte("x"), 0666) 23 | if err != nil { 24 | panic(err) 25 | } 26 | 27 | // are any other workers running? 28 | // find their PIDs by scanning directory for mr-worker-XXX files. 29 | dd, err := os.Open(".") 30 | if err != nil { 31 | panic(err) 32 | } 33 | names, err := dd.Readdirnames(1000000) 34 | if err != nil { 35 | panic(err) 36 | } 37 | ret := 0 38 | for _, name := range names { 39 | var xpid int 40 | pat := fmt.Sprintf("mr-worker-%s-%%d", phase) 41 | n, err := fmt.Sscanf(name, pat, &xpid) 42 | if n == 1 && err == nil { 43 | err := syscall.Kill(xpid, 0) 44 | if err == nil { 45 | // if err == nil, xpid is alive. 
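					// (kill with signal 0 delivers no signal; it only checks that the
					// process exists and that we are allowed to signal it)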
46 | ret += 1 47 | } 48 | } 49 | } 50 | dd.Close() 51 | 52 | time.Sleep(1 * time.Second) 53 | 54 | err = os.Remove(myfilename) 55 | if err != nil { 56 | panic(err) 57 | } 58 | 59 | return ret 60 | } 61 | 62 | func Map(filename string, contents string) []mr.KeyValue { 63 | 64 | kva := []mr.KeyValue{} 65 | kva = append(kva, mr.KeyValue{"a", "1"}) 66 | kva = append(kva, mr.KeyValue{"b", "1"}) 67 | kva = append(kva, mr.KeyValue{"c", "1"}) 68 | kva = append(kva, mr.KeyValue{"d", "1"}) 69 | kva = append(kva, mr.KeyValue{"e", "1"}) 70 | kva = append(kva, mr.KeyValue{"f", "1"}) 71 | kva = append(kva, mr.KeyValue{"g", "1"}) 72 | kva = append(kva, mr.KeyValue{"h", "1"}) 73 | kva = append(kva, mr.KeyValue{"i", "1"}) 74 | kva = append(kva, mr.KeyValue{"j", "1"}) 75 | return kva 76 | } 77 | 78 | func Reduce(key string, values []string) string { 79 | n := nparallel("reduce") 80 | 81 | val := fmt.Sprintf("%d", n) 82 | 83 | return val 84 | } 85 | -------------------------------------------------------------------------------- /shardkv/server_snapshot.go: -------------------------------------------------------------------------------- 1 | package shardkv 2 | 3 | import ( 4 | "6.824/labgob" 5 | "6.824/shardctrler" 6 | "bytes" 7 | "log" 8 | ) 9 | 10 | //保存快照 11 | func (kv *ShardKV) saveSnapshot(logIndex int) { 12 | //判断条件,满足一定的日志量才能进行持久化 13 | if kv.maxraftstate == -1 || kv.persister.RaftStateSize() < kv.maxraftstate { 14 | return 15 | } 16 | 17 | //生成快照数据 18 | w := new(bytes.Buffer) 19 | e := labgob.NewEncoder(w) 20 | if e.Encode(kv.data) != nil || 21 | e.Encode(kv.lastApplies) != nil || 22 | e.Encode(kv.inputShards) != nil || 23 | e.Encode(kv.outputShards) != nil || 24 | e.Encode(kv.config) != nil || 25 | e.Encode(kv.oldConfig) != nil || 26 | e.Encode(kv.meShards) != nil { 27 | panic("gen snapshot data encode err") 28 | } 29 | data := w.Bytes() 30 | kv.rf.Snapshot(logIndex, data) 31 | } 32 | 33 | //读取快照 34 | //两处调用:初始化阶段;收到Snapshot命令,即接收了leader的Snapshot 35 | func (kv *ShardKV) readPersist(isInit bool, snapshotTerm, snapshotIndex int, data []byte) { 36 | if data == nil || len(data) < 1 { 37 | return 38 | } 39 | //只要不是初始化调用,即如果收到一个Snapshot命令,就要执行该函数 40 | //不知道为什么,只要在ShardKV中调用该函数,就会导致测试一直阻塞,就算该函数为空也没办法通过,只能注释掉,将CondInstallSnapshot的逻辑写到InstallSnapshot RPC的处理代码中 41 | //if !isInit { 42 | // res := kv.rf.CondInstallSnapshot(snapshotTerm, snapshotIndex, data) 43 | // if !res { 44 | // log.Panicln("kv read persist err in CondInstallSnapshot!") 45 | // return 46 | // } 47 | //} 48 | //对数据进行同步 49 | r := bytes.NewBuffer(data) 50 | d := labgob.NewDecoder(r) 51 | var kvData [shardctrler.NShards]map[string]string 52 | var lastApplies [shardctrler.NShards]map[int64]int64 53 | var inputShards map[int]bool 54 | var outputShards map[int]map[int]MergeShardData 55 | var config shardctrler.Config 56 | var oldConfig shardctrler.Config 57 | var meShards map[int]bool 58 | 59 | if d.Decode(&kvData) != nil || 60 | d.Decode(&lastApplies) != nil || 61 | d.Decode(&inputShards) != nil || 62 | d.Decode(&outputShards) != nil || 63 | d.Decode(&config) != nil || 64 | d.Decode(&oldConfig) != nil || 65 | d.Decode(&meShards) != nil { 66 | log.Fatal("kv read persist err") 67 | } else { 68 | kv.data = kvData 69 | kv.lastApplies = lastApplies 70 | kv.inputShards = inputShards 71 | kv.outputShards = outputShards 72 | kv.config = config 73 | kv.oldConfig = oldConfig 74 | kv.meShards = meShards 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /mrapps/mtiming.go: 
-------------------------------------------------------------------------------- 1 | package main 2 | 3 | // 4 | // a MapReduce pseudo-application to test that workers 5 | // execute map tasks in parallel. 6 | // 7 | // go build -buildmode=plugin mtiming.go 8 | // 9 | 10 | import "6.824/mr" 11 | import "strings" 12 | import "fmt" 13 | import "os" 14 | import "syscall" 15 | import "time" 16 | import "sort" 17 | import "io/ioutil" 18 | 19 | func nparallel(phase string) int { 20 | // create a file so that other workers will see that 21 | // we're running at the same time as them. 22 | pid := os.Getpid() 23 | myfilename := fmt.Sprintf("mr-worker-%s-%d", phase, pid) 24 | err := ioutil.WriteFile(myfilename, []byte("x"), 0666) 25 | if err != nil { 26 | panic(err) 27 | } 28 | 29 | // are any other workers running? 30 | // find their PIDs by scanning directory for mr-worker-XXX files. 31 | dd, err := os.Open(".") 32 | if err != nil { 33 | panic(err) 34 | } 35 | names, err := dd.Readdirnames(1000000) 36 | if err != nil { 37 | panic(err) 38 | } 39 | ret := 0 40 | for _, name := range names { 41 | var xpid int 42 | pat := fmt.Sprintf("mr-worker-%s-%%d", phase) 43 | n, err := fmt.Sscanf(name, pat, &xpid) 44 | if n == 1 && err == nil { 45 | err := syscall.Kill(xpid, 0) 46 | if err == nil { 47 | // if err == nil, xpid is alive. 48 | ret += 1 49 | } 50 | } 51 | } 52 | dd.Close() 53 | 54 | time.Sleep(1 * time.Second) 55 | 56 | err = os.Remove(myfilename) 57 | if err != nil { 58 | panic(err) 59 | } 60 | 61 | return ret 62 | } 63 | 64 | func Map(filename string, contents string) []mr.KeyValue { 65 | t0 := time.Now() 66 | ts := float64(t0.Unix()) + (float64(t0.Nanosecond()) / 1000000000.0) 67 | pid := os.Getpid() 68 | 69 | n := nparallel("map") 70 | 71 | kva := []mr.KeyValue{} 72 | kva = append(kva, mr.KeyValue{ 73 | fmt.Sprintf("times-%v", pid), 74 | fmt.Sprintf("%.1f", ts)}) 75 | kva = append(kva, mr.KeyValue{ 76 | fmt.Sprintf("parallel-%v", pid), 77 | fmt.Sprintf("%d", n)}) 78 | return kva 79 | } 80 | 81 | func Reduce(key string, values []string) string { 82 | //n := nparallel("reduce") 83 | 84 | // sort values to ensure deterministic output. 85 | vv := make([]string, len(values)) 86 | copy(vv, values) 87 | sort.Strings(vv) 88 | 89 | val := strings.Join(vv, " ") 90 | return val 91 | } 92 | -------------------------------------------------------------------------------- /shardkv/server_op.go: -------------------------------------------------------------------------------- 1 | package shardkv 2 | 3 | import ( 4 | "time" 5 | ) 6 | 7 | type Op struct { 8 | // Your definitions here. 9 | // Field names must start with capital letters, 10 | // otherwise RPC will break. 11 | ReqId int64 //用来标识commandNotify 12 | CommandId int64 13 | ClientId int64 14 | Key string 15 | Value string 16 | Method string 17 | ConfigNum int 18 | } 19 | 20 | type CommandResult struct { 21 | Err Err 22 | Value string 23 | } 24 | 25 | func (kv *ShardKV) removeCh(reqId int64) { 26 | kv.lock("removeCh") 27 | if _, ok := kv.commandNotifyCh[reqId]; ok { 28 | delete(kv.commandNotifyCh, reqId) 29 | } 30 | kv.unlock("removeCh") 31 | } 32 | 33 | /* 34 | Get和PutAppend RPC的处理 35 | */ 36 | 37 | func (kv *ShardKV) Get(args *GetArgs, reply *GetReply) { 38 | // Your code here. 39 | res := kv.waitCommand(args.ClientId, args.CommandId, "Get", args.Key, "", args.ConfigNum) 40 | reply.Err = res.Err 41 | reply.Value = res.Value 42 | } 43 | 44 | func (kv *ShardKV) PutAppend(args *PutAppendArgs, reply *PutAppendReply) { 45 | // Your code here. 
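	// Put and Append funnel through the same waitCommand path as Get: the op is
	// proposed to Raft via Start() and the handler blocks until it is applied,
	// times out, or the server shuts down (see waitCommand below).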
46 | res := kv.waitCommand(args.ClientId, args.CommandId, args.Op, args.Key, args.Value, args.ConfigNum) 47 | reply.Err = res.Err 48 | } 49 | 50 | func (kv *ShardKV) waitCommand(clientId int64, commandId int64, method, key, value string, configNum int) (res CommandResult) { 51 | kv.log("wait cmd start,clientId:%d,commandId: %d,method: %s,key-value:%s %s,configNum %d", clientId, commandId, method, key, value, configNum) 52 | op := Op{ 53 | ReqId: nrand(), 54 | ClientId: clientId, 55 | CommandId: commandId, 56 | Method: method, 57 | Key: key, 58 | ConfigNum: configNum, 59 | Value: value, 60 | } 61 | index, term, isLeader := kv.rf.Start(op) 62 | if !isLeader { 63 | res.Err = ErrWrongLeader 64 | kv.log("wait cmd NOT LEADER.") 65 | return 66 | } 67 | kv.lock("waitCommand") 68 | ch := make(chan CommandResult, 1) 69 | kv.commandNotifyCh[op.ReqId] = ch 70 | kv.unlock("waitCommand") 71 | kv.log("wait cmd notify,index: %v,term: %v,op: %+v", index, term, op) 72 | t := time.NewTimer(WaitCmdTimeOut) 73 | defer t.Stop() 74 | 75 | select { 76 | case <-t.C: 77 | res.Err = ErrTimeOut 78 | case res = <-ch: 79 | case <-kv.stopCh: 80 | res.Err = ErrServer 81 | } 82 | 83 | kv.removeCh(op.ReqId) 84 | kv.log("wait cmd end,Op: %+v.res:%+v", op, res) 85 | return 86 | 87 | } 88 | -------------------------------------------------------------------------------- /porcupine/model.go: -------------------------------------------------------------------------------- 1 | package porcupine 2 | 3 | import "fmt" 4 | 5 | type Operation struct { 6 | ClientId int // optional, unless you want a visualization; zero-indexed 7 | Input interface{} 8 | Call int64 // invocation time 9 | Output interface{} 10 | Return int64 // response time 11 | } 12 | 13 | type EventKind bool 14 | 15 | const ( 16 | CallEvent EventKind = false 17 | ReturnEvent EventKind = true 18 | ) 19 | 20 | type Event struct { 21 | ClientId int // optional, unless you want a visualization; zero-indexed 22 | Kind EventKind 23 | Value interface{} 24 | Id int 25 | } 26 | 27 | type Model struct { 28 | // Partition functions, such that a history is linearizable if and only 29 | // if each partition is linearizable. If you don't want to implement 30 | // this, you can always use the `NoPartition` functions implemented 31 | // below. 32 | Partition func(history []Operation) [][]Operation 33 | PartitionEvent func(history []Event) [][]Event 34 | // Initial state of the system. 35 | Init func() interface{} 36 | // Step function for the system. Returns whether or not the system 37 | // could take this step with the given inputs and outputs and also 38 | // returns the new state. This should not mutate the existing state. 39 | Step func(state interface{}, input interface{}, output interface{}) (bool, interface{}) 40 | // Equality on states. If you are using a simple data type for states, 41 | // you can use the `ShallowEqual` function implemented below. 42 | Equal func(state1, state2 interface{}) bool 43 | // For visualization, describe an operation as a string. 44 | // For example, "Get('x') -> 'y'". 45 | DescribeOperation func(input interface{}, output interface{}) string 46 | // For visualization purposes, describe a state as a string. 
47 | // For example, "{'x' -> 'y', 'z' -> 'w'}" 48 | DescribeState func(state interface{}) string 49 | } 50 | 51 | func NoPartition(history []Operation) [][]Operation { 52 | return [][]Operation{history} 53 | } 54 | 55 | func NoPartitionEvent(history []Event) [][]Event { 56 | return [][]Event{history} 57 | } 58 | 59 | func ShallowEqual(state1, state2 interface{}) bool { 60 | return state1 == state2 61 | } 62 | 63 | func DefaultDescribeOperation(input interface{}, output interface{}) string { 64 | return fmt.Sprintf("%v -> %v", input, output) 65 | } 66 | 67 | func DefaultDescribeState(state interface{}) string { 68 | return fmt.Sprintf("%v", state) 69 | } 70 | 71 | type CheckResult string 72 | 73 | const ( 74 | Unknown CheckResult = "Unknown" // timed out 75 | Ok = "Ok" 76 | Illegal = "Illegal" 77 | ) 78 | -------------------------------------------------------------------------------- /shardctrler/client.go: -------------------------------------------------------------------------------- 1 | package shardctrler 2 | 3 | // 4 | // Shardctrler clerk. 5 | // 6 | 7 | import "6.824/labrpc" 8 | import "time" 9 | import "crypto/rand" 10 | import "math/big" 11 | 12 | type Clerk struct { 13 | servers []*labrpc.ClientEnd 14 | // Your data here. 15 | clientId int64 16 | } 17 | 18 | func nrand() int64 { 19 | max := big.NewInt(int64(1) << 62) 20 | bigx, _ := rand.Int(rand.Reader, max) 21 | x := bigx.Int64() 22 | return x 23 | } 24 | 25 | func MakeClerk(servers []*labrpc.ClientEnd) *Clerk { 26 | ck := new(Clerk) 27 | ck.servers = servers 28 | // Your code here. 29 | ck.clientId = nrand() 30 | return ck 31 | } 32 | 33 | func (ck *Clerk) Query(num int) Config { 34 | args := &QueryArgs{} 35 | // Your code here. 36 | args.Num = num 37 | args.ClientId = ck.clientId 38 | args.CommandId = nrand() 39 | for { 40 | // try each known server. 41 | for _, srv := range ck.servers { 42 | var reply QueryReply 43 | ok := srv.Call("ShardCtrler.Query", args, &reply) 44 | if ok && reply.WrongLeader == false { 45 | return reply.Config 46 | } 47 | } 48 | time.Sleep(100 * time.Millisecond) 49 | } 50 | } 51 | 52 | func (ck *Clerk) Join(servers map[int][]string) { 53 | args := &JoinArgs{} 54 | // Your code here. 55 | args.Servers = servers 56 | args.ClientId = ck.clientId 57 | args.CommandId = nrand() 58 | 59 | for { 60 | // try each known server. 61 | for _, srv := range ck.servers { 62 | var reply JoinReply 63 | ok := srv.Call("ShardCtrler.Join", args, &reply) 64 | if ok && reply.WrongLeader == false { 65 | return 66 | } 67 | } 68 | time.Sleep(100 * time.Millisecond) 69 | } 70 | } 71 | 72 | func (ck *Clerk) Leave(gids []int) { 73 | args := &LeaveArgs{} 74 | // Your code here. 75 | args.GIDs = gids 76 | args.ClientId = ck.clientId 77 | args.CommandId = nrand() 78 | 79 | for { 80 | // try each known server. 81 | for _, srv := range ck.servers { 82 | var reply LeaveReply 83 | ok := srv.Call("ShardCtrler.Leave", args, &reply) 84 | if ok && reply.WrongLeader == false { 85 | return 86 | } 87 | } 88 | time.Sleep(100 * time.Millisecond) 89 | } 90 | } 91 | 92 | func (ck *Clerk) Move(shard int, gid int) { 93 | args := &MoveArgs{} 94 | // Your code here. 95 | args.Shard = shard 96 | args.GID = gid 97 | args.ClientId = ck.clientId 98 | args.CommandId = nrand() 99 | 100 | for { 101 | // try each known server. 
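		// (same leader-hunting pattern as Query/Join/Leave above: ask every
		// server, and if none acknowledges as leader, back off 100ms and retry)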
102 | for _, srv := range ck.servers { 103 | var reply MoveReply 104 | ok := srv.Call("ShardCtrler.Move", args, &reply) 105 | if ok && reply.WrongLeader == false { 106 | return 107 | } 108 | } 109 | time.Sleep(100 * time.Millisecond) 110 | } 111 | } 112 | -------------------------------------------------------------------------------- /shardctrler/common.go: -------------------------------------------------------------------------------- 1 | package shardctrler 2 | 3 | import ( 4 | "6.824/labgob" 5 | ) 6 | 7 | // 8 | // Shard controler: assigns shards to replication groups. 9 | // 10 | // RPC interface: 11 | // Join(servers) -- add a set of groups (gid -> server-list mapping). 12 | // Leave(gids) -- delete a set of groups. 13 | // Move(shard, gid) -- hand off one shard from current owner to gid. 14 | // Query(num) -> fetch Config # num, or latest config if num==-1. 15 | // 16 | // A Config (configuration) describes a set of replica groups, and the 17 | // replica group responsible for each shard. Configs are numbered. Config 18 | // #0 is the initial configuration, with no groups and all shards 19 | // assigned to group 0 (the invalid group). 20 | // 21 | // You will need to add fields to the RPC argument structs. 22 | // 23 | 24 | type Err string 25 | 26 | // The number of shards. 27 | const NShards = 10 28 | 29 | //状态码 30 | const ( 31 | OK = "OK" 32 | ErrWrongLeader = "wrongLeader" 33 | ErrTimeout = "timeout" 34 | ErrServer = "ErrServer" 35 | ) 36 | 37 | //必须注册才能进行解码和编码 38 | func init() { 39 | labgob.Register(Config{}) 40 | labgob.Register(QueryArgs{}) 41 | labgob.Register(QueryReply{}) 42 | labgob.Register(JoinArgs{}) 43 | labgob.Register(JoinReply{}) 44 | labgob.Register(LeaveArgs{}) 45 | labgob.Register(MoveArgs{}) 46 | labgob.Register(LeaveReply{}) 47 | labgob.Register(MoveReply{}) 48 | } 49 | 50 | // A configuration -- an assignment of shards to groups. 51 | // Please don't change this. 52 | //保存配置信息 53 | type Config struct { 54 | Num int // config number,当前配置的编号 55 | Shards [NShards]int // shard -> gid,每一个分片到replica group id的映射 56 | Groups map[int][]string // gid -> servers[],每一个replica group包含哪些server 57 | } 58 | 59 | type ClientCommandId struct { 60 | ClientId int64 61 | CommandId int64 62 | } 63 | 64 | type JoinArgs struct { 65 | Servers map[int][]string // new GID -> servers mappings 66 | ClientCommandId 67 | } 68 | 69 | type JoinReply struct { 70 | WrongLeader bool 71 | Err Err 72 | } 73 | 74 | type LeaveArgs struct { 75 | GIDs []int 76 | ClientCommandId 77 | } 78 | 79 | type LeaveReply struct { 80 | WrongLeader bool 81 | Err Err 82 | } 83 | 84 | type MoveArgs struct { 85 | Shard int 86 | GID int 87 | ClientCommandId 88 | } 89 | 90 | type MoveReply struct { 91 | WrongLeader bool 92 | Err Err 93 | } 94 | 95 | type QueryArgs struct { 96 | Num int // desired config number 97 | ClientCommandId 98 | } 99 | 100 | type QueryReply struct { 101 | WrongLeader bool 102 | Err Err 103 | Config Config 104 | } 105 | 106 | func (c *Config) Copy() Config { 107 | config := Config{ 108 | Num: c.Num, 109 | Shards: c.Shards, 110 | Groups: make(map[int][]string), 111 | } 112 | for gid, s := range c.Groups { 113 | config.Groups[gid] = append([]string{}, s...) 114 | } 115 | return config 116 | } 117 | -------------------------------------------------------------------------------- /main/mrsequential.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | // 4 | // simple sequential MapReduce. 
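// (reads every input file, calls Map on each, sorts all intermediate
// key/value pairs by key, then calls Reduce once per distinct key and
// writes the results to mr-out-0; see main below)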
5 | // 6 | // go run mrsequential.go wc.so pg*.txt 7 | // 8 | 9 | import "fmt" 10 | import "6.824/mr" 11 | import "plugin" 12 | import "os" 13 | import "log" 14 | import "io/ioutil" 15 | import "sort" 16 | 17 | // for sorting by key. 18 | type ByKey []mr.KeyValue 19 | 20 | // for sorting by key. 21 | func (a ByKey) Len() int { return len(a) } 22 | func (a ByKey) Swap(i, j int) { a[i], a[j] = a[j], a[i] } 23 | func (a ByKey) Less(i, j int) bool { return a[i].Key < a[j].Key } 24 | 25 | func main() { 26 | if len(os.Args) < 3 { 27 | fmt.Fprintf(os.Stderr, "Usage: mrsequential xxx.so inputfiles...\n") 28 | os.Exit(1) 29 | } 30 | 31 | mapf, reducef := loadPlugin(os.Args[1]) 32 | 33 | // 34 | // read each input file, 35 | // pass it to Map, 36 | // accumulate the intermediate Map output. 37 | // 38 | intermediate := []mr.KeyValue{} 39 | for _, filename := range os.Args[2:] { 40 | file, err := os.Open(filename) 41 | if err != nil { 42 | log.Fatalf("cannot open %v", filename) 43 | } 44 | content, err := ioutil.ReadAll(file) 45 | if err != nil { 46 | log.Fatalf("cannot read %v", filename) 47 | } 48 | file.Close() 49 | kva := mapf(filename, string(content)) 50 | intermediate = append(intermediate, kva...) 51 | } 52 | 53 | // 54 | // a big difference from real MapReduce is that all the 55 | // intermediate data is in one place, intermediate[], 56 | // rather than being partitioned into NxM buckets. 57 | // 58 | 59 | sort.Sort(ByKey(intermediate)) 60 | 61 | oname := "mr-out-0" 62 | ofile, _ := os.Create(oname) 63 | 64 | // 65 | // call Reduce on each distinct key in intermediate[], 66 | // and print the result to mr-out-0. 67 | // 68 | i := 0 69 | for i < len(intermediate) { 70 | j := i + 1 71 | for j < len(intermediate) && intermediate[j].Key == intermediate[i].Key { 72 | j++ 73 | } 74 | values := []string{} 75 | for k := i; k < j; k++ { 76 | values = append(values, intermediate[k].Value) 77 | } 78 | output := reducef(intermediate[i].Key, values) 79 | 80 | // this is the correct format for each line of Reduce output. 81 | fmt.Fprintf(ofile, "%v %v\n", intermediate[i].Key, output) 82 | 83 | i = j 84 | } 85 | 86 | ofile.Close() 87 | } 88 | 89 | // 90 | // load the application Map and Reduce functions 91 | // from a plugin file, e.g. ../mrapps/wc.so 92 | // 93 | func loadPlugin(filename string) (func(string, string) []mr.KeyValue, func(string, []string) string) { 94 | p, err := plugin.Open(filename) 95 | if err != nil { 96 | log.Fatalf("cannot load plugin %v", filename) 97 | } 98 | xmapf, err := p.Lookup("Map") 99 | if err != nil { 100 | log.Fatalf("cannot find Map in %v", filename) 101 | } 102 | mapf := xmapf.(func(string, string) []mr.KeyValue) 103 | xreducef, err := p.Lookup("Reduce") 104 | if err != nil { 105 | log.Fatalf("cannot find Reduce in %v", filename) 106 | } 107 | reducef := xreducef.(func(string, []string) string) 108 | 109 | return mapf, reducef 110 | } 111 | -------------------------------------------------------------------------------- /shardkv/common.go: -------------------------------------------------------------------------------- 1 | package shardkv 2 | 3 | import "6.824/labgob" 4 | 5 | // 6 | // Sharded key/value server. 7 | // Lots of replica groups, each running Raft. 8 | // Shardctrler decides which group serves each shard. 9 | // Shardctrler may change shard assignment from time to time. 10 | // 11 | // You will have to modify these definitions. 
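// Illustrative note (the actual mapping lives in the client/server code, which
// is not shown in this file): a key is typically assigned to one of the
// shardctrler.NShards shards by its first byte, e.g.
//
//	func key2shard(key string) int {
//		shard := 0
//		if len(key) > 0 {
//			shard = int(key[0])
//		}
//		return shard % shardctrler.NShards
//	}
//
// and Config.Shards[shard] then names the replica group responsible for that key.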
12 | // 13 | 14 | //回复状态码 15 | const ( 16 | OK = "OK" 17 | ErrNoKey = "ErrNoKey" 18 | ErrWrongGroup = "ErrWrongGroup" 19 | ErrWrongLeader = "ErrWrongLeader" 20 | ErrTimeOut = "ErrTimeOut" 21 | ErrServer = "ErrServer" 22 | ) 23 | 24 | type Err string 25 | 26 | //主要是applyCh的处理中,ApplyMsg的Command是一个interface,因此要向labgob注册具体实现才能进行编解码 27 | func init() { 28 | //labgob.Register(PutAppendArgs{}) 29 | //labgob.Register(PutAppendReply{}) 30 | //labgob.Register(GetArgs{}) 31 | //labgob.Register(GetReply{}) 32 | //labgob.Register(FetchShardDataArgs{}) 33 | //labgob.Register(FetchShardDataReply{}) 34 | labgob.Register(CleanShardDataArgs{}) 35 | //labgob.Register(CleanShardDataReply{}) 36 | labgob.Register(MergeShardData{}) 37 | } 38 | 39 | // Put or Append 40 | type PutAppendArgs struct { 41 | // You'll have to add definitions here. 42 | Key string 43 | Value string 44 | Op string // "Put" or "Append" 45 | // You'll have to add definitions here. 46 | // Field names must start with capital letters, 47 | // otherwise RPC will break. 48 | ClientId int64 49 | CommandId int64 50 | ConfigNum int 51 | } 52 | 53 | type PutAppendReply struct { 54 | Err Err 55 | } 56 | 57 | func (c *PutAppendArgs) copy() PutAppendArgs { 58 | r := PutAppendArgs{ 59 | Key: c.Key, 60 | Value: c.Value, 61 | Op: c.Op, 62 | ClientId: c.ClientId, 63 | CommandId: c.CommandId, 64 | ConfigNum: c.ConfigNum, 65 | } 66 | return r 67 | } 68 | 69 | type GetArgs struct { 70 | Key string 71 | // You'll have to add definitions here. 72 | ClientId int64 73 | CommandId int64 74 | ConfigNum int 75 | } 76 | 77 | type GetReply struct { 78 | Err Err 79 | Value string 80 | } 81 | 82 | func (c *GetArgs) copy() GetArgs { 83 | r := GetArgs{ 84 | Key: c.Key, 85 | ClientId: c.ClientId, 86 | CommandId: c.CommandId, 87 | ConfigNum: c.ConfigNum, 88 | } 89 | return r 90 | } 91 | 92 | //用于向目标节点获取input shard 93 | type FetchShardDataArgs struct { 94 | ConfigNum int 95 | ShardNum int 96 | } 97 | 98 | type FetchShardDataReply struct { 99 | Success bool 100 | CommandIndexes map[int64]int64 101 | Data map[string]string 102 | } 103 | 104 | func (reply *FetchShardDataReply) Copy() FetchShardDataReply { 105 | res := FetchShardDataReply{ 106 | Success: reply.Success, 107 | Data: make(map[string]string), 108 | CommandIndexes: make(map[int64]int64), 109 | } 110 | for k, v := range reply.Data { 111 | res.Data[k] = v 112 | } 113 | for k, v := range reply.CommandIndexes { 114 | res.CommandIndexes[k] = v 115 | } 116 | return res 117 | } 118 | 119 | //用于请求目标节点清除指定的output shard 120 | type CleanShardDataArgs struct { 121 | ConfigNum int 122 | ShardNum int 123 | } 124 | 125 | type CleanShardDataReply struct { 126 | Success bool 127 | } 128 | 129 | //用于存储output shard的数据,以及充当input shard在apply的命令 130 | type MergeShardData struct { 131 | ConfigNum int 132 | ShardNum int 133 | CommandIndexes map[int64]int64 //当前shard的所有客户端的最后一条命令id 134 | Data map[string]string 135 | } 136 | -------------------------------------------------------------------------------- /main/test-mr-early.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | 4 | # 5 | # map-reduce tests 6 | # 7 | 8 | # comment this out to run the tests without the Go race detector. 9 | RACE=-race 10 | 11 | if [[ "$OSTYPE" = "darwin"* ]] 12 | then 13 | if go version | grep 'go1.17.[012345]' 14 | then 15 | # -race with plug-ins on x86 MacOS 12 with 16 | # go1.17 before 1.17.6 sometimes crash. 
17 | RACE= 18 | echo '*** Turning off -race since it may not work on a Mac' 19 | echo ' with ' `go version` 20 | fi 21 | fi 22 | 23 | TIMEOUT=timeout 24 | if timeout 2s sleep 1 > /dev/null 2>&1 25 | then 26 | : 27 | else 28 | if gtimeout 2s sleep 1 > /dev/null 2>&1 29 | then 30 | TIMEOUT=gtimeout 31 | else 32 | # no timeout command 33 | TIMEOUT= 34 | echo '*** Cannot find timeout command; proceeding without timeouts.' 35 | fi 36 | fi 37 | if [ "$TIMEOUT" != "" ] 38 | then 39 | TIMEOUT+=" -k 2s 180s " 40 | fi 41 | 42 | # run the test in a fresh sub-directory. 43 | rm -rf mr-tmp 44 | mkdir mr-tmp || exit 1 45 | cd mr-tmp || exit 1 46 | rm -f mr-* 47 | 48 | # make sure software is freshly built. 49 | (cd ../../mrapps && go clean) 50 | (cd .. && go clean) 51 | (cd ../../mrapps && go build $RACE -buildmode=plugin wc.go) || exit 1 52 | (cd ../../mrapps && go build $RACE -buildmode=plugin indexer.go) || exit 1 53 | (cd ../../mrapps && go build $RACE -buildmode=plugin mtiming.go) || exit 1 54 | (cd ../../mrapps && go build $RACE -buildmode=plugin rtiming.go) || exit 1 55 | (cd ../../mrapps && go build $RACE -buildmode=plugin jobcount.go) || exit 1 56 | (cd ../../mrapps && go build $RACE -buildmode=plugin early_exit.go) || exit 1 57 | (cd ../../mrapps && go build $RACE -buildmode=plugin crash.go) || exit 1 58 | (cd ../../mrapps && go build $RACE -buildmode=plugin nocrash.go) || exit 1 59 | (cd .. && go build $RACE mrcoordinator.go) || exit 1 60 | (cd .. && go build $RACE mrworker.go) || exit 1 61 | (cd .. && go build $RACE mrsequential.go) || exit 1 62 | 63 | failed_any=0 64 | 65 | ######################################################### 66 | 67 | ######################################################### 68 | # test whether any worker or coordinator exits before the 69 | # task has completed (i.e., all output files have been finalized) 70 | rm -f mr-* 71 | 72 | echo '***' Starting early exit test. 73 | 74 | DF=anydone$$ 75 | rm -f $DF 76 | 77 | ($TIMEOUT ../mrcoordinator ../pg*txt ; touch $DF) & 78 | 79 | # give the coordinator time to create the sockets. 80 | sleep 1 81 | 82 | # start multiple workers. 83 | ($TIMEOUT ../mrworker ../../mrapps/early_exit.so ; touch $DF) & 84 | sleep 1 85 | ($TIMEOUT ../mrworker ../../mrapps/early_exit.so ; touch $DF) & 86 | sleep 1 87 | ($TIMEOUT ../mrworker ../../mrapps/early_exit.so ; touch $DF) & 88 | 89 | # wait for any of the coord or workers to exit. 90 | # `jobs` ensures that any completed old processes from other tests 91 | # are not waited upon. 92 | jobs &> /dev/null 93 | if [[ "$OSTYPE" = "darwin"* ]] 94 | then 95 | # bash on the Mac doesn't have wait -n 96 | while [ ! -e $DF ] 97 | do 98 | sleep 0.2 99 | done 100 | else 101 | # the -n causes wait to wait for just one child process, 102 | # rather than waiting for all to finish. 103 | wait -n 104 | fi 105 | 106 | rm -f $DF 107 | 108 | # a process has exited. this means that the output should be finalized 109 | # otherwise, either a worker or the coordinator exited early 110 | sort mr-out* | grep . > mr-wc-all-initial 111 | echo 112 | # wait for remaining workers and coordinator to exit. 113 | wait 114 | 115 | # compare initial and final outputs 116 | sort mr-out* | grep . 
> mr-wc-all-final 117 | if cmp mr-wc-all-final mr-wc-all-initial 118 | then 119 | echo '---' early exit test: PASS 120 | else 121 | echo '---' output changed after first worker exited 122 | echo '---' early exit test: FAIL 123 | failed_any=1 124 | fi 125 | #rm -f mr-* -------------------------------------------------------------------------------- /raft/raft_snapshot.go: -------------------------------------------------------------------------------- 1 | package raft 2 | 3 | import "time" 4 | 5 | type InstallSnapshotArgs struct { 6 | Term int 7 | LeaderId int 8 | LastIncludedIndex int 9 | LastIncludedTerm int 10 | //Offset int 11 | Data []byte 12 | //Done bool 13 | } 14 | 15 | type InstallSnapshotReply struct { 16 | Term int 17 | } 18 | 19 | func (rf *Raft) InstallSnapshot(args *InstallSnapshotArgs, reply *InstallSnapshotReply) { 20 | rf.mu.Lock() 21 | defer rf.mu.Unlock() 22 | 23 | reply.Term = rf.currentTerm 24 | if rf.currentTerm > args.Term { 25 | return 26 | } 27 | 28 | if args.Term > rf.currentTerm || rf.role != Role_Follower { 29 | rf.changeRole(Role_Follower) 30 | rf.votedFor = -1 31 | rf.currentTerm = args.Term 32 | rf.resetElectionTimer() 33 | rf.persist() 34 | } 35 | 36 | //如果自身快照包含的最后一个日志>=leader快照包含的最后一个日志,就没必要接受了 37 | if rf.lastSnapshotIndex >= args.LastIncludedIndex { 38 | return 39 | } 40 | 41 | /********以下内容和CondInstallSnapshot的操作是相同的,因为不知道为什么在lab4B中只要调用CondInstallSnapshot函数就会陷入阻塞,因此将操作逻辑复制到这里一份,lab4中就没有调用CondInstallSnapshot函数了***********/ 42 | 43 | lastIncludedIndex := args.LastIncludedIndex 44 | lastIncludedTerm := args.LastIncludedTerm 45 | _, lastIndex := rf.getLastLogTermAndIndex() 46 | if lastIncludedIndex > lastIndex { 47 | rf.logs = make([]LogEntry, 1) 48 | } else { 49 | installLen := lastIncludedIndex - rf.lastSnapshotIndex 50 | rf.logs = rf.logs[installLen:] 51 | rf.logs[0].Command = nil 52 | } 53 | //0处是空日志,代表了快照日志的标记 54 | rf.logs[0].Term = lastIncludedTerm 55 | 56 | rf.lastSnapshotIndex, rf.lastSnapshotTerm = lastIncludedIndex, lastIncludedTerm 57 | rf.lastApplied, rf.commitIndex = lastIncludedIndex, lastIncludedIndex 58 | //保存快照和状态 59 | rf.persister.SaveStateAndSnapshot(rf.getPersistData(), args.Data) 60 | 61 | /***********************************/ 62 | 63 | //接收发来的快照,并提交一个命令处理 64 | rf.applyCh <- ApplyMsg{ 65 | SnapshotValid: true, 66 | Snapshot: args.Data, 67 | SnapshotTerm: args.LastIncludedTerm, 68 | SnapshotIndex: args.LastIncludedIndex, 69 | } 70 | 71 | } 72 | 73 | //向指定节点发送快照 74 | func (rf *Raft) sendInstallSnapshotToPeer(server int) { 75 | rf.mu.Lock() 76 | args := InstallSnapshotArgs{ 77 | Term: rf.currentTerm, 78 | LeaderId: rf.me, 79 | LastIncludedIndex: rf.lastSnapshotIndex, 80 | LastIncludedTerm: rf.lastSnapshotTerm, 81 | Data: rf.persister.ReadSnapshot(), 82 | } 83 | rf.mu.Unlock() 84 | 85 | timer := time.NewTimer(RPCTimeout) 86 | defer timer.Stop() 87 | DPrintf("%v role: %v, send snapshot to peer,%v,args = %+v,reply = %+v", rf.me, rf.role, server, args) 88 | 89 | for { 90 | timer.Stop() 91 | timer.Reset(RPCTimeout) 92 | 93 | ch := make(chan bool, 1) 94 | reply := &InstallSnapshotReply{} 95 | go func() { 96 | ok := rf.peers[server].Call("Raft.InstallSnapshot", &args, reply) 97 | if !ok { 98 | time.Sleep(time.Millisecond * 10) 99 | } 100 | ch <- ok 101 | }() 102 | 103 | select { 104 | case <-rf.stopCh: 105 | return 106 | case <-timer.C: 107 | DPrintf("%v role: %v, send snapshot to peer %v TIME OUT!!!", rf.me, rf.role, server) 108 | continue 109 | case ok := <-ch: 110 | if !ok { 111 | continue 112 | } 113 | } 114 | 115 | rf.mu.Lock() 116 | defer 
rf.mu.Unlock() 117 | if rf.role != Role_Leader || args.Term != rf.currentTerm { 118 | return 119 | } 120 | if reply.Term > rf.currentTerm { 121 | rf.changeRole(Role_Follower) 122 | rf.currentTerm = reply.Term 123 | rf.resetElectionTimer() 124 | rf.persist() 125 | return 126 | } 127 | 128 | if args.LastIncludedIndex > rf.matchIndex[server] { 129 | rf.matchIndex[server] = args.LastIncludedIndex 130 | } 131 | if args.LastIncludedIndex+1 > rf.nextIndex[server] { 132 | rf.nextIndex[server] = args.LastIncludedIndex + 1 133 | } 134 | return 135 | } 136 | } 137 | -------------------------------------------------------------------------------- /labgob/test_test.go: -------------------------------------------------------------------------------- 1 | package labgob 2 | 3 | import "testing" 4 | 5 | import "bytes" 6 | 7 | type T1 struct { 8 | T1int0 int 9 | T1int1 int 10 | T1string0 string 11 | T1string1 string 12 | } 13 | 14 | type T2 struct { 15 | T2slice []T1 16 | T2map map[int]*T1 17 | T2t3 interface{} 18 | } 19 | 20 | type T3 struct { 21 | T3int999 int 22 | } 23 | 24 | // 25 | // test that we didn't break GOB. 26 | // 27 | func TestGOB(t *testing.T) { 28 | e0 := errorCount 29 | 30 | w := new(bytes.Buffer) 31 | 32 | Register(T3{}) 33 | 34 | { 35 | x0 := 0 36 | x1 := 1 37 | t1 := T1{} 38 | t1.T1int1 = 1 39 | t1.T1string1 = "6.824" 40 | t2 := T2{} 41 | t2.T2slice = []T1{T1{}, t1} 42 | t2.T2map = map[int]*T1{} 43 | t2.T2map[99] = &T1{1, 2, "x", "y"} 44 | t2.T2t3 = T3{999} 45 | 46 | e := NewEncoder(w) 47 | e.Encode(x0) 48 | e.Encode(x1) 49 | e.Encode(t1) 50 | e.Encode(t2) 51 | } 52 | data := w.Bytes() 53 | 54 | { 55 | var x0 int 56 | var x1 int 57 | var t1 T1 58 | var t2 T2 59 | 60 | r := bytes.NewBuffer(data) 61 | d := NewDecoder(r) 62 | if d.Decode(&x0) != nil || 63 | d.Decode(&x1) != nil || 64 | d.Decode(&t1) != nil || 65 | d.Decode(&t2) != nil { 66 | t.Fatalf("Decode failed") 67 | } 68 | 69 | if x0 != 0 { 70 | t.Fatalf("wrong x0 %v\n", x0) 71 | } 72 | if x1 != 1 { 73 | t.Fatalf("wrong x1 %v\n", x1) 74 | } 75 | if t1.T1int0 != 0 { 76 | t.Fatalf("wrong t1.T1int0 %v\n", t1.T1int0) 77 | } 78 | if t1.T1int1 != 1 { 79 | t.Fatalf("wrong t1.T1int1 %v\n", t1.T1int1) 80 | } 81 | if t1.T1string0 != "" { 82 | t.Fatalf("wrong t1.T1string0 %v\n", t1.T1string0) 83 | } 84 | if t1.T1string1 != "6.824" { 85 | t.Fatalf("wrong t1.T1string1 %v\n", t1.T1string1) 86 | } 87 | if len(t2.T2slice) != 2 { 88 | t.Fatalf("wrong t2.T2slice len %v\n", len(t2.T2slice)) 89 | } 90 | if t2.T2slice[1].T1int1 != 1 { 91 | t.Fatalf("wrong slice value\n") 92 | } 93 | if len(t2.T2map) != 1 { 94 | t.Fatalf("wrong t2.T2map len %v\n", len(t2.T2map)) 95 | } 96 | if t2.T2map[99].T1string1 != "y" { 97 | t.Fatalf("wrong map value\n") 98 | } 99 | t3 := (t2.T2t3).(T3) 100 | if t3.T3int999 != 999 { 101 | t.Fatalf("wrong t2.T2t3.T3int999\n") 102 | } 103 | } 104 | 105 | if errorCount != e0 { 106 | t.Fatalf("there were errors, but should not have been") 107 | } 108 | } 109 | 110 | type T4 struct { 111 | Yes int 112 | no int 113 | } 114 | 115 | // 116 | // make sure we check capitalization 117 | // labgob prints one warning during this test. 
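// (the unexported field `no` of T4 above is what trips the check: labgob
// prints its "lower-case field ... will break your Raft" complaint and
// increments errorCount, which is exactly what this test asserts.)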
118 | // 119 | func TestCapital(t *testing.T) { 120 | e0 := errorCount 121 | 122 | v := []map[*T4]int{} 123 | 124 | w := new(bytes.Buffer) 125 | e := NewEncoder(w) 126 | e.Encode(v) 127 | data := w.Bytes() 128 | 129 | var v1 []map[T4]int 130 | r := bytes.NewBuffer(data) 131 | d := NewDecoder(r) 132 | d.Decode(&v1) 133 | 134 | if errorCount != e0+1 { 135 | t.Fatalf("failed to warn about lower-case field") 136 | } 137 | } 138 | 139 | // 140 | // check that we warn when someone sends a default value over 141 | // RPC but the target into which we're decoding holds a non-default 142 | // value, which GOB seems not to overwrite as you'd expect. 143 | // 144 | // labgob does not print a warning. 145 | // 146 | func TestDefault(t *testing.T) { 147 | e0 := errorCount 148 | 149 | type DD struct { 150 | X int 151 | } 152 | 153 | // send a default value... 154 | dd1 := DD{} 155 | 156 | w := new(bytes.Buffer) 157 | e := NewEncoder(w) 158 | e.Encode(dd1) 159 | data := w.Bytes() 160 | 161 | // and receive it into memory that already 162 | // holds non-default values. 163 | reply := DD{99} 164 | 165 | r := bytes.NewBuffer(data) 166 | d := NewDecoder(r) 167 | d.Decode(&reply) 168 | 169 | if errorCount != e0+1 { 170 | t.Fatalf("failed to warn about decoding into non-default value") 171 | } 172 | } 173 | -------------------------------------------------------------------------------- /shardkv/client.go: -------------------------------------------------------------------------------- 1 | package shardkv 2 | 3 | // 4 | // client code to talk to a sharded key/value service. 5 | // 6 | // the client first talks to the shardctrler to find out 7 | // the assignment of shards (keys) to groups, and then 8 | // talks to the group that holds the key's shard. 9 | // 10 | 11 | import ( 12 | "6.824/labrpc" 13 | ) 14 | import "crypto/rand" 15 | import "math/big" 16 | import "6.824/shardctrler" 17 | import "time" 18 | 19 | // 20 | // which shard is a key in? 21 | // please use this function, 22 | // and please do not change it. 23 | // 24 | func key2shard(key string) int { 25 | shard := 0 26 | if len(key) > 0 { 27 | shard = int(key[0]) 28 | } 29 | shard %= shardctrler.NShards 30 | return shard 31 | } 32 | 33 | func nrand() int64 { 34 | max := big.NewInt(int64(1) << 62) 35 | bigx, _ := rand.Int(rand.Reader, max) 36 | x := bigx.Int64() 37 | return x 38 | } 39 | 40 | type Clerk struct { 41 | sm *shardctrler.Clerk 42 | config shardctrler.Config 43 | make_end func(string) *labrpc.ClientEnd 44 | // You will have to modify this struct. 45 | clientId int64 46 | } 47 | 48 | // 49 | // the tester calls MakeClerk. 50 | // 51 | // ctrlers[] is needed to call shardctrler.MakeClerk(). 52 | // 53 | // make_end(servername) turns a server name from a 54 | // Config.Groups[gid][i] into a labrpc.ClientEnd on which you can 55 | // send RPCs. 56 | // 57 | func MakeClerk(ctrlers []*labrpc.ClientEnd, make_end func(string) *labrpc.ClientEnd) *Clerk { 58 | ck := new(Clerk) 59 | ck.sm = shardctrler.MakeClerk(ctrlers) 60 | ck.make_end = make_end 61 | // You'll have to add code here. 62 | ck.clientId = nrand() 63 | return ck 64 | } 65 | 66 | // 67 | // fetch the current value for a key. 68 | // returns "" if the key does not exist. 69 | // keeps trying forever in the face of all other errors. 70 | // You will have to modify this function. 
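// a worked example of key2shard above, assuming shardctrler.NShards is 10:
// for key "a", key[0] is 'a' == 97 and 97 % 10 == 7, so the Clerk reads
// config.Shards[7] to get the owning gid and config.Groups[gid] for that
// group's server names.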
71 | // 72 | func (ck *Clerk) Get(key string) string { 73 | args := GetArgs{} 74 | args.Key = key 75 | args.ClientId = ck.clientId 76 | args.CommandId = nrand() 77 | 78 | for { 79 | args.ConfigNum = ck.config.Num 80 | shard := key2shard(key) 81 | gid := ck.config.Shards[shard] 82 | if servers, ok := ck.config.Groups[gid]; ok { 83 | // try each server for the shard. 84 | for si := 0; si < len(servers); si++ { 85 | srv := ck.make_end(servers[si]) 86 | var reply GetReply 87 | ok := srv.Call("ShardKV.Get", &args, &reply) 88 | if ok && (reply.Err == OK || reply.Err == ErrNoKey) { 89 | return reply.Value 90 | } 91 | if ok && (reply.Err == ErrWrongGroup) { 92 | break 93 | } 94 | // ... not ok, or ErrWrongLeader 95 | } 96 | } 97 | time.Sleep(100 * time.Millisecond) 98 | // ask controler for the latest configuration. 99 | ck.config = ck.sm.Query(-1) 100 | } 101 | 102 | return "" 103 | } 104 | 105 | // 106 | // shared by Put and Append. 107 | // You will have to modify this function. 108 | // 109 | func (ck *Clerk) PutAppend(key string, value string, op string) { 110 | args := PutAppendArgs{} 111 | args.Key = key 112 | args.Value = value 113 | args.Op = op 114 | args.ClientId = ck.clientId 115 | args.CommandId = nrand() 116 | 117 | for { 118 | args.ConfigNum = ck.config.Num 119 | shard := key2shard(key) 120 | gid := ck.config.Shards[shard] 121 | if servers, ok := ck.config.Groups[gid]; ok { 122 | for si := 0; si < len(servers); si++ { 123 | srv := ck.make_end(servers[si]) 124 | var reply PutAppendReply 125 | ok := srv.Call("ShardKV.PutAppend", &args, &reply) 126 | if ok && reply.Err == OK { 127 | return 128 | } 129 | if ok && reply.Err == ErrWrongGroup { 130 | break 131 | } 132 | // ... not ok, or ErrWrongLeader 133 | } 134 | } 135 | time.Sleep(100 * time.Millisecond) 136 | // ask controler for the latest configuration. 137 | ck.config = ck.sm.Query(-1) 138 | } 139 | } 140 | 141 | func (ck *Clerk) Put(key string, value string) { 142 | ck.PutAppend(key, value, "Put") 143 | } 144 | func (ck *Clerk) Append(key string, value string) { 145 | ck.PutAppend(key, value, "Append") 146 | } 147 | -------------------------------------------------------------------------------- /labgob/labgob.go: -------------------------------------------------------------------------------- 1 | package labgob 2 | 3 | // 4 | // trying to send non-capitalized fields over RPC produces a range of 5 | // misbehavior, including both mysterious incorrect computation and 6 | // outright crashes. so this wrapper around Go's encoding/gob warns 7 | // about non-capitalized field names. 
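// usage mirrors encoding/gob; a minimal sketch:
//
//   w := new(bytes.Buffer)
//   e := labgob.NewEncoder(w)
//   e.Encode(&args)        // complains if args has a lower-case field
//   d := labgob.NewDecoder(bytes.NewBuffer(w.Bytes()))
//   d.Decode(&reply)       // also complains if reply already holds
//                          // non-default values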
8 | // 9 | 10 | import "encoding/gob" 11 | import "io" 12 | import "reflect" 13 | import "fmt" 14 | import "sync" 15 | import "unicode" 16 | import "unicode/utf8" 17 | 18 | var mu sync.Mutex 19 | var errorCount int // for TestCapital 20 | var checked map[reflect.Type]bool 21 | 22 | type LabEncoder struct { 23 | gob *gob.Encoder 24 | } 25 | 26 | func NewEncoder(w io.Writer) *LabEncoder { 27 | enc := &LabEncoder{} 28 | enc.gob = gob.NewEncoder(w) 29 | return enc 30 | } 31 | 32 | func (enc *LabEncoder) Encode(e interface{}) error { 33 | checkValue(e) 34 | return enc.gob.Encode(e) 35 | } 36 | 37 | func (enc *LabEncoder) EncodeValue(value reflect.Value) error { 38 | checkValue(value.Interface()) 39 | return enc.gob.EncodeValue(value) 40 | } 41 | 42 | type LabDecoder struct { 43 | gob *gob.Decoder 44 | } 45 | 46 | func NewDecoder(r io.Reader) *LabDecoder { 47 | dec := &LabDecoder{} 48 | dec.gob = gob.NewDecoder(r) 49 | return dec 50 | } 51 | 52 | func (dec *LabDecoder) Decode(e interface{}) error { 53 | checkValue(e) 54 | checkDefault(e) 55 | return dec.gob.Decode(e) 56 | } 57 | 58 | func Register(value interface{}) { 59 | checkValue(value) 60 | gob.Register(value) 61 | } 62 | 63 | func RegisterName(name string, value interface{}) { 64 | checkValue(value) 65 | gob.RegisterName(name, value) 66 | } 67 | 68 | func checkValue(value interface{}) { 69 | checkType(reflect.TypeOf(value)) 70 | } 71 | 72 | func checkType(t reflect.Type) { 73 | k := t.Kind() 74 | 75 | mu.Lock() 76 | // only complain once, and avoid recursion. 77 | if checked == nil { 78 | checked = map[reflect.Type]bool{} 79 | } 80 | if checked[t] { 81 | mu.Unlock() 82 | return 83 | } 84 | checked[t] = true 85 | mu.Unlock() 86 | 87 | switch k { 88 | case reflect.Struct: 89 | for i := 0; i < t.NumField(); i++ { 90 | f := t.Field(i) 91 | rune, _ := utf8.DecodeRuneInString(f.Name) 92 | if unicode.IsUpper(rune) == false { 93 | // ta da 94 | fmt.Printf("labgob error: lower-case field %v of %v in RPC or persist/snapshot will break your Raft\n", 95 | f.Name, t.Name()) 96 | mu.Lock() 97 | errorCount += 1 98 | mu.Unlock() 99 | } 100 | checkType(f.Type) 101 | } 102 | return 103 | case reflect.Slice, reflect.Array, reflect.Ptr: 104 | checkType(t.Elem()) 105 | return 106 | case reflect.Map: 107 | checkType(t.Elem()) 108 | checkType(t.Key()) 109 | return 110 | default: 111 | return 112 | } 113 | } 114 | 115 | // 116 | // warn if the value contains non-default values, 117 | // as it would if one sent an RPC but the reply 118 | // struct was already modified. if the RPC reply 119 | // contains default values, GOB won't overwrite 120 | // the non-default value. 121 | // 122 | func checkDefault(value interface{}) { 123 | if value == nil { 124 | return 125 | } 126 | checkDefault1(reflect.ValueOf(value), 1, "") 127 | } 128 | 129 | func checkDefault1(value reflect.Value, depth int, name string) { 130 | if depth > 3 { 131 | return 132 | } 133 | 134 | t := value.Type() 135 | k := t.Kind() 136 | 137 | switch k { 138 | case reflect.Struct: 139 | for i := 0; i < t.NumField(); i++ { 140 | vv := value.Field(i) 141 | name1 := t.Field(i).Name 142 | if name != "" { 143 | name1 = name + "." 
+ name1 144 | } 145 | checkDefault1(vv, depth+1, name1) 146 | } 147 | return 148 | case reflect.Ptr: 149 | if value.IsNil() { 150 | return 151 | } 152 | checkDefault1(value.Elem(), depth+1, name) 153 | return 154 | case reflect.Bool, 155 | reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64, 156 | reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64, 157 | reflect.Uintptr, reflect.Float32, reflect.Float64, 158 | reflect.String: 159 | if reflect.DeepEqual(reflect.Zero(t).Interface(), value.Interface()) == false { 160 | mu.Lock() 161 | if errorCount < 1 { 162 | what := name 163 | if what == "" { 164 | what = t.Name() 165 | } 166 | // this warning typically arises if code re-uses the same RPC reply 167 | // variable for multiple RPC calls, or if code restores persisted 168 | // state into variable that already have non-default values. 169 | fmt.Printf("labgob warning: Decoding into a non-default variable/field %v may not work\n", 170 | what) 171 | } 172 | errorCount += 1 173 | mu.Unlock() 174 | } 175 | return 176 | } 177 | } 178 | -------------------------------------------------------------------------------- /kvraft/client.go: -------------------------------------------------------------------------------- 1 | package kvraft 2 | 3 | import ( 4 | "6.824/labrpc" 5 | "log" 6 | "time" 7 | ) 8 | import "crypto/rand" 9 | import "math/big" 10 | 11 | const ( 12 | ChangeLeaderInterval = time.Millisecond * 20 13 | ) 14 | 15 | //客户端 16 | type Clerk struct { 17 | servers []*labrpc.ClientEnd 18 | // You will have to modify this struct. 19 | clientId int64 20 | leaderId int 21 | } 22 | 23 | //用于生成一个随机数,可以生成clientId和commandId 24 | func nrand() int64 { 25 | max := big.NewInt(int64(1) << 62) 26 | bigx, _ := rand.Int(rand.Reader, max) 27 | x := bigx.Int64() 28 | return x 29 | } 30 | 31 | //生成一个客户端 32 | func MakeClerk(servers []*labrpc.ClientEnd) *Clerk { 33 | ck := new(Clerk) 34 | ck.servers = servers 35 | ck.clientId = nrand() 36 | // You'll have to add code here. 37 | return ck 38 | } 39 | 40 | // 41 | // fetch the current value for a key. 42 | // returns "" if the key does not exist. 43 | // keeps trying forever in the face of all other errors. 44 | // 45 | // you can send an RPC with code like this: 46 | // ok := ck.servers[i].Call("KVServer.Get", &args, &reply) 47 | // 48 | // the types of args and reply (including whether they are pointers) 49 | // must match the declared types of the RPC handler function's 50 | // arguments. and reply must be passed as a pointer. 51 | // 52 | //根据key获取value 53 | func (ck *Clerk) Get(key string) string { 54 | // You will have to modify this function. 
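// the args carry ClientId plus a fresh CommandId from nrand(), so the
// server can tell a retransmitted request from a new one; the loop below
// reuses the same args and walks leaderId round-robin until some server
// answers with OK or ErrNoKey.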
55 | //DPrintf("%v client get key:%s.", ck.clientId, key) 56 | args := GetArgs{ 57 | Key: key, 58 | ClientId: ck.clientId, 59 | CommandId: nrand(), 60 | } 61 | leaderId := ck.leaderId 62 | for { 63 | reply := GetReply{} 64 | ok := ck.servers[leaderId].Call("KVServer.Get", &args, &reply) 65 | if !ok { 66 | //如果请求失败,等一段时间再请求,换一个节点再请求 67 | DPrintf("%v client get key %v from server %v,not ok.", ck.clientId, key, leaderId) 68 | time.Sleep(ChangeLeaderInterval) 69 | leaderId = (leaderId + 1) % len(ck.servers) 70 | continue 71 | } else if reply.Err != OK { 72 | DPrintf("%v client get key %v from server %v,reply err = %v!", ck.clientId, key, leaderId, reply.Err) 73 | } 74 | 75 | switch reply.Err { 76 | case OK: 77 | DPrintf("%v client get key %v from server %v,value: %v,OK.", ck.clientId, key, leaderId, reply.Value, leaderId) 78 | ck.leaderId = leaderId 79 | return reply.Value 80 | case ErrNoKey: 81 | DPrintf("%v client get key %v from server %v,NO KEY!", ck.clientId, key, leaderId) 82 | ck.leaderId = leaderId 83 | return "" 84 | case ErrTimeOut: 85 | continue 86 | default: 87 | time.Sleep(ChangeLeaderInterval) 88 | leaderId = (leaderId + 1) % len(ck.servers) 89 | continue 90 | } 91 | 92 | } 93 | } 94 | 95 | // 96 | // shared by Put and Append. 97 | // 98 | // you can send an RPC with code like this: 99 | // ok := ck.servers[i].Call("KVServer.PutAppend", &args, &reply) 100 | // 101 | // the types of args and reply (including whether they are pointers) 102 | // must match the declared types of the RPC handler function's 103 | // arguments. and reply must be passed as a pointer. 104 | // 105 | func (ck *Clerk) PutAppend(key string, value string, op string) { 106 | DPrintf("%v client PutAppend,key:%v,value:%v,op:%v", ck.clientId, key, value, op) 107 | // You will have to modify this function. 
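// note that a new PutAppendReply is allocated on every attempt below;
// reusing one reply across RPC calls is exactly the situation labgob's
// "decoding into a non-default variable" warning is about.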
108 | args := PutAppendArgs{ 109 | Key: key, 110 | Value: value, 111 | Op: op, 112 | ClientId: ck.clientId, 113 | CommandId: nrand(), 114 | } 115 | leaderId := ck.leaderId 116 | for { 117 | reply := PutAppendReply{} 118 | ok := ck.servers[leaderId].Call("KVServer.PutAppend", &args, &reply) 119 | if !ok { 120 | //可能当前请求的server不是leader,换一个server再访问 121 | DPrintf("%v client set key %v to %v to server %v,not ok.", ck.clientId, key, value, leaderId) 122 | time.Sleep(ChangeLeaderInterval) 123 | leaderId = (leaderId + 1) % len(ck.servers) 124 | continue 125 | } else if reply.Err != OK { 126 | DPrintf("%v client set key %v to %v to server %v,reply err = %v!", ck.clientId, key, value, leaderId, reply.Err) 127 | } 128 | 129 | switch reply.Err { 130 | case OK: 131 | DPrintf("%v client set key %v to %v to server %v,OK.", ck.clientId, key, value, leaderId) 132 | ck.leaderId = leaderId 133 | return 134 | case ErrNoKey: 135 | DPrintf("%v client set key %v to %v to server %v,NOKEY!", ck.clientId, key, value, leaderId) 136 | return 137 | case ErrTimeOut: 138 | continue 139 | case ErrWrongLeader: 140 | //换一个节点继续请求 141 | time.Sleep(ChangeLeaderInterval) 142 | leaderId = (leaderId + 1) % len(ck.servers) 143 | continue 144 | case ErrServer: 145 | //换一个节点继续请求 146 | time.Sleep(ChangeLeaderInterval) 147 | leaderId = (leaderId + 1) % len(ck.servers) 148 | continue 149 | default: 150 | log.Fatal("client rev unknown err", reply.Err) 151 | } 152 | } 153 | } 154 | 155 | func (ck *Clerk) Put(key string, value string) { 156 | ck.PutAppend(key, value, "Put") 157 | } 158 | func (ck *Clerk) Append(key string, value string) { 159 | ck.PutAppend(key, value, "Append") 160 | } 161 | -------------------------------------------------------------------------------- /raft/raft_vote.go: -------------------------------------------------------------------------------- 1 | package raft 2 | 3 | import ( 4 | "time" 5 | ) 6 | 7 | // 8 | // example RequestVote RPC arguments structure. 9 | // field names must start with capital letters! 10 | // 11 | type RequestVoteArgs struct { 12 | // Your data here (2A, 2B). 13 | Term int 14 | CandidateId int 15 | LastLogIndex int 16 | LastLogTerm int 17 | } 18 | 19 | // 20 | // example RequestVote RPC reply structure. 21 | // field names must start with capital letters! 22 | // 23 | type RequestVoteReply struct { 24 | // Your data here (2A). 25 | Term int 26 | VoteGranted bool 27 | } 28 | 29 | // 30 | // example RequestVote RPC handler. 31 | // 32 | func (rf *Raft) RequestVote(args *RequestVoteArgs, reply *RequestVoteReply) { 33 | // Your code here (2A, 2B). 
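// outline of the checks below: a stale args.Term is rejected outright; an
// equal term grants only if this peer has not voted yet or already voted
// for this candidate; a larger term makes this peer step down to follower
// and clear votedFor; finally the candidate's (LastLogTerm, LastLogIndex)
// must be at least as up-to-date as the local last entry before the vote
// is granted.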
34 | rf.mu.Lock() 35 | defer rf.mu.Unlock() 36 | 37 | //默认失败,返回 38 | lastLogTerm, lastLogIndex := rf.getLastLogTermAndIndex() 39 | reply.Term = rf.currentTerm 40 | reply.VoteGranted = false 41 | 42 | if rf.currentTerm > args.Term { 43 | return 44 | } else if rf.currentTerm == args.Term { 45 | if rf.role == Role_Leader { 46 | return 47 | } 48 | 49 | if args.CandidateId == rf.votedFor { 50 | reply.Term = args.Term 51 | reply.VoteGranted = true 52 | return 53 | } 54 | if rf.votedFor != -1 && args.CandidateId != rf.votedFor { 55 | return 56 | } 57 | 58 | //还有一种情况,没有投过票 59 | } 60 | 61 | if rf.currentTerm < args.Term { 62 | rf.currentTerm = args.Term 63 | rf.changeRole(Role_Follower) 64 | rf.votedFor = -1 65 | reply.Term = rf.currentTerm 66 | rf.persist() 67 | } 68 | 69 | //判断日志完整性 70 | if lastLogTerm > args.LastLogTerm || (lastLogTerm == args.LastLogTerm && lastLogIndex > args.LastLogIndex) { 71 | return 72 | } 73 | 74 | rf.votedFor = args.CandidateId 75 | reply.VoteGranted = true 76 | rf.changeRole(Role_Follower) 77 | rf.resetElectionTimer() 78 | rf.persist() 79 | DPrintf("%v, role:%v,voteFor: %v", rf.me, rf.role, rf.votedFor) 80 | } 81 | 82 | // 83 | // example code to send a RequestVote RPC to a server. 84 | // server is the index of the target server in rf.peers[]. 85 | // expects RPC arguments in args. 86 | // fills in *reply with RPC reply, so caller should 87 | // pass &reply. 88 | // the types of the args and reply passed to Call() must be 89 | // the same as the types of the arguments declared in the 90 | // handler function (including whether they are pointers). 91 | // 92 | // The labrpc package simulates a lossy network, in which servers 93 | // may be unreachable, and in which requests and replies may be lost. 94 | // Call() sends a request and waits for a reply. If a reply arrives 95 | // within a timeout interval, Call() returns true; otherwise 96 | // Call() returns false. Thus Call() may not return for a while. 97 | // A false return can be caused by a dead server, a live server that 98 | // can't be reached, a lost request, or a lost reply. 99 | // 100 | // Call() is guaranteed to return (perhaps after a delay) *except* if the 101 | // handler function on the server side does not return. Thus there 102 | // is no need to implement your own timeouts around Call(). 103 | // 104 | // look at the comments in ../labrpc/labrpc.go for more details. 105 | // 106 | // if you're having trouble getting RPC to work, check that you've 107 | // capitalized all field names in structs passed over RPC, and 108 | // that the caller passes the address of the reply struct with &, not 109 | // the struct itself. 
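//
// sendRequestVote below layers its own RPCTimeout on top of Call(): a
// goroutine retries the RPC up to 10 times while this peer is not killed,
// and the caller gives up once rpcTimer fires, so one unreachable server
// cannot stall startElection's vote counting for long.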
110 | // 111 | func (rf *Raft) sendRequestVote(server int, args *RequestVoteArgs, reply *RequestVoteReply) { 112 | if server < 0 || server > len(rf.peers) || server == rf.me { 113 | panic("server invalid in sendRequestVote!") 114 | } 115 | 116 | rpcTimer := time.NewTimer(RPCTimeout) 117 | defer rpcTimer.Stop() 118 | 119 | ch := make(chan bool, 1) 120 | go func() { 121 | for i := 0; i < 10 && !rf.killed(); i++ { 122 | ok := rf.peers[server].Call("Raft.RequestVote", args, reply) 123 | if !ok { 124 | continue 125 | } else { 126 | ch <- ok 127 | return 128 | } 129 | } 130 | }() 131 | 132 | select { 133 | case <-rpcTimer.C: 134 | DPrintf("%v role: %v, send request vote to peer %v TIME OUT!!!", rf.me, rf.role, server) 135 | return 136 | case <-ch: 137 | return 138 | } 139 | 140 | } 141 | 142 | func (rf *Raft) startElection() { 143 | rf.mu.Lock() 144 | rf.resetElectionTimer() 145 | if rf.role == Role_Leader { 146 | rf.mu.Unlock() 147 | return 148 | } 149 | 150 | rf.changeRole(Role_Candidate) 151 | DPrintf("%v role %v,start election,term: %v", rf.me, rf.role, rf.currentTerm) 152 | 153 | lastLogTerm, lastLogIndex := rf.getLastLogTermAndIndex() 154 | args := RequestVoteArgs{ 155 | CandidateId: rf.me, 156 | Term: rf.currentTerm, 157 | LastLogTerm: lastLogTerm, 158 | LastLogIndex: lastLogIndex, 159 | } 160 | rf.persist() 161 | rf.mu.Unlock() 162 | 163 | allCount := len(rf.peers) 164 | grantedCount := 1 165 | resCount := 1 166 | grantedChan := make(chan bool, len(rf.peers)-1) 167 | for i := 0; i < allCount; i++ { 168 | if i == rf.me { 169 | continue 170 | } 171 | //对每一个其他节点都要发送rpc 172 | go func(gch chan bool, index int) { 173 | reply := RequestVoteReply{} 174 | rf.sendRequestVote(index, &args, &reply) 175 | gch <- reply.VoteGranted 176 | if reply.Term > args.Term { 177 | rf.mu.Lock() 178 | if reply.Term > rf.currentTerm { 179 | //放弃选举 180 | rf.currentTerm = reply.Term 181 | rf.changeRole(Role_Follower) 182 | rf.votedFor = -1 183 | rf.resetElectionTimer() 184 | rf.persist() 185 | } 186 | rf.mu.Unlock() 187 | } 188 | }(grantedChan, i) 189 | 190 | } 191 | 192 | for rf.role == Role_Candidate { 193 | flag := <-grantedChan 194 | resCount++ 195 | if flag { 196 | grantedCount++ 197 | } 198 | DPrintf("vote: %v, allCount: %v, resCount: %v, grantedCount: %v", flag, allCount, resCount, grantedCount) 199 | 200 | if grantedCount > allCount/2 { 201 | //竞选成功 202 | rf.mu.Lock() 203 | DPrintf("before try change to leader,count:%d, args:%+v, currentTerm: %v, argsTerm: %v", grantedCount, args, rf.currentTerm, args.Term) 204 | if rf.role == Role_Candidate && rf.currentTerm == args.Term { 205 | rf.changeRole(Role_Leader) 206 | } 207 | if rf.role == Role_Leader { 208 | rf.resetAppendEntriesTimersZero() 209 | } 210 | rf.persist() 211 | rf.mu.Unlock() 212 | DPrintf("%v current role: %v", rf.me, rf.role) 213 | } else if resCount == allCount || resCount-grantedCount > allCount/2 { 214 | DPrintf("grant fail! 
grantedCount <= len/2:count:%d", grantedCount) 215 | return 216 | } 217 | } 218 | 219 | } 220 | -------------------------------------------------------------------------------- /shardkv/server_shard.go: -------------------------------------------------------------------------------- 1 | package shardkv 2 | 3 | import ( 4 | "6.824/shardctrler" 5 | "time" 6 | ) 7 | 8 | //判断是否存在指定config和指定shardId的output shard 9 | func (kv *ShardKV) OutputDataExist(configNum int, shardId int) bool { 10 | if _, ok := kv.outputShards[configNum]; ok { 11 | if _, ok = kv.outputShards[configNum][shardId]; ok { 12 | return true 13 | } 14 | } 15 | return false 16 | } 17 | 18 | /* 19 | RPC,针对output shard 20 | */ 21 | //请求获取shard 22 | func (kv *ShardKV) FetchShardData(args *FetchShardDataArgs, reply *FetchShardDataReply) { 23 | kv.log("get req fetchsharddata:args:%+v, reply:%+v", args, reply) 24 | defer kv.log("resp fetchsharddata:args:%+v, reply:%+v", args, reply) 25 | kv.lock("fetchShardData") 26 | defer kv.unlock("fetchShardData") 27 | 28 | //必须是过去的config 29 | if args.ConfigNum >= kv.config.Num { 30 | return 31 | } 32 | 33 | reply.Success = false 34 | if configData, ok := kv.outputShards[args.ConfigNum]; ok { 35 | if shardData, ok := configData[args.ShardNum]; ok { 36 | reply.Success = true 37 | reply.Data = make(map[string]string) 38 | reply.CommandIndexes = make(map[int64]int64) 39 | for k, v := range shardData.Data { 40 | reply.Data[k] = v 41 | } 42 | for k, v := range shardData.CommandIndexes { 43 | reply.CommandIndexes[k] = v 44 | } 45 | } 46 | } 47 | return 48 | 49 | } 50 | 51 | //请求清除shard 52 | func (kv *ShardKV) CleanShardData(args *CleanShardDataArgs, reply *CleanShardDataReply) { 53 | kv.log("get req CleanShardData:args:%+v, reply:%+v", args, reply) 54 | defer kv.log("resp CleanShardData:args:%+v, reply:%+v", args, reply) 55 | kv.lock("cleanShardData") 56 | 57 | //必须是过去的config 58 | if args.ConfigNum >= kv.config.Num { 59 | kv.unlock("cleanShardData") 60 | return 61 | } 62 | kv.unlock("cleanShardData") 63 | _, _, isLeader := kv.rf.Start(*args) 64 | if !isLeader { 65 | return 66 | } 67 | 68 | // 简单处理下。。。 69 | for i := 0; i < 10; i++ { 70 | kv.lock("cleanShardData") 71 | exist := kv.OutputDataExist(args.ConfigNum, args.ShardNum) 72 | kv.unlock("cleanShardData") 73 | if !exist { 74 | reply.Success = true 75 | return 76 | } 77 | time.Sleep(time.Millisecond * 20) 78 | } 79 | 80 | //采用下面这种方式获取start结果,其实会慢一些,还会出现锁的问题 81 | //kv.lock("CleanShardData") 82 | //ch := make(chan struct{}, 1) 83 | //kv.cleanOutputDataNotifyCh[fmt.Sprintf("%d%d", args.ConfigNum, args.ShardNum)] = ch 84 | //kv.unlock("CleanShardData") 85 | //t := time.NewTimer(WaitCmdTimeOut) 86 | //defer t.Stop() 87 | // 88 | //select { 89 | //case <-t.C: 90 | //case <-ch: 91 | //case <-kv.stopCh: 92 | //} 93 | // 94 | //kv.lock("removeCh") 95 | ////删除ch 96 | //if _, ok := kv.cleanOutputDataNotifyCh[fmt.Sprintf("%d%d", args.ConfigNum, args.ShardNum)]; ok { 97 | // delete(kv.cleanOutputDataNotifyCh, fmt.Sprintf("%d%d", args.ConfigNum, args.ShardNum)) 98 | //} 99 | ////判断是否还存在 100 | //exist := kv.OutputDataExist(args.ConfigNum, args.ShardNum) 101 | //kv.unlock("removeCh") 102 | //if !exist { 103 | // reply.Success = true 104 | //} 105 | return 106 | 107 | } 108 | 109 | /* 110 | 定时任务,请求input shard 111 | */ 112 | 113 | //定时获取shard 114 | func (kv *ShardKV) fetchShards() { 115 | for { 116 | select { 117 | case <-kv.stopCh: 118 | return 119 | case <-kv.pullShardsTimer.C: 120 | //判断是否有要input的shard 121 | _, isLeader := kv.rf.GetState() 122 | if isLeader { 123 | 
kv.lock("pullshards") 124 | for shardId, _ := range kv.inputShards { 125 | //注意要从上一个config中请求shard的源节点 126 | go kv.fetchShard(shardId, kv.oldConfig) 127 | } 128 | kv.unlock("pullshards") 129 | } 130 | kv.pullShardsTimer.Reset(PullShardsInterval) 131 | 132 | } 133 | } 134 | } 135 | 136 | //获取指定的shard 137 | func (kv *ShardKV) fetchShard(shardId int, config shardctrler.Config) { 138 | args := FetchShardDataArgs{ 139 | ConfigNum: config.Num, 140 | ShardNum: shardId, 141 | } 142 | 143 | t := time.NewTimer(CallPeerFetchShardDataTimeOut) 144 | defer t.Stop() 145 | 146 | for { 147 | //依次请求group中的每个节点,但只要获取一个就好了 148 | for _, s := range config.Groups[config.Shards[shardId]] { 149 | reply := FetchShardDataReply{} 150 | srv := kv.make_end(s) 151 | done := make(chan bool, 1) 152 | go func(args *FetchShardDataArgs, reply *FetchShardDataReply) { 153 | done <- srv.Call("ShardKV.FetchShardData", args, reply) 154 | }(&args, &reply) 155 | 156 | t.Reset(CallPeerFetchShardDataTimeOut) 157 | 158 | select { 159 | case <-kv.stopCh: 160 | return 161 | case <-t.C: 162 | case isDone := <-done: 163 | if isDone && reply.Success == true { 164 | kv.lock("pullShard") 165 | if _, ok := kv.inputShards[shardId]; ok && kv.config.Num == config.Num+1 { 166 | replyCopy := reply.Copy() 167 | mergeShardData := MergeShardData{ 168 | ConfigNum: args.ConfigNum, 169 | ShardNum: args.ShardNum, 170 | Data: replyCopy.Data, 171 | CommandIndexes: replyCopy.CommandIndexes, 172 | } 173 | kv.log("pullShard get data:%+v", mergeShardData) 174 | kv.unlock("pullShard") 175 | kv.rf.Start(mergeShardData) 176 | //不管是不是leader都返回 177 | return 178 | } else { 179 | kv.unlock("pullshard") 180 | } 181 | } 182 | } 183 | 184 | } 185 | } 186 | 187 | } 188 | 189 | /* 190 | 处理好input shard,请求源节点清除output shard 191 | */ 192 | 193 | //发送给shard源节点,可以删除shard数据了 194 | //一般在apply command中处理好input的shard,发送给源节点删除保存的shard数据 195 | func (kv *ShardKV) callPeerCleanShardData(config shardctrler.Config, shardId int) { 196 | args := CleanShardDataArgs{ 197 | ConfigNum: config.Num, 198 | ShardNum: shardId, 199 | } 200 | 201 | t := time.NewTimer(CallPeerCleanShardDataTimeOut) 202 | defer t.Stop() 203 | 204 | for { 205 | //因为并不知道哪一个节点是leader,因此群发吧 206 | for _, group := range config.Groups[config.Shards[shardId]] { 207 | reply := CleanShardDataReply{} 208 | srv := kv.make_end(group) 209 | done := make(chan bool, 1) 210 | 211 | go func(args *CleanShardDataArgs, reply *CleanShardDataReply) { 212 | done <- srv.Call("ShardKV.CleanShardData", args, reply) 213 | }(&args, &reply) 214 | 215 | t.Reset(CallPeerCleanShardDataTimeOut) 216 | 217 | select { 218 | case <-kv.stopCh: 219 | return 220 | case <-t.C: 221 | case isDone := <-done: 222 | if isDone && reply.Success == true { 223 | return 224 | } 225 | } 226 | 227 | } 228 | kv.lock("callPeerCleanShardData") 229 | if kv.config.Num != config.Num+1 || len(kv.inputShards) == 0 { 230 | kv.unlock("callPeerCleanShardData") 231 | break 232 | } 233 | kv.unlock("callPeerCleanShardData") 234 | } 235 | } 236 | -------------------------------------------------------------------------------- /mr/coordinator.go: -------------------------------------------------------------------------------- 1 | package mr 2 | 3 | import ( 4 | "errors" 5 | "log" 6 | "net" 7 | "net/http" 8 | "net/rpc" 9 | "os" 10 | "sync" 11 | "time" 12 | ) 13 | 14 | type TaskPhase int //任务阶段 15 | type TaskStatus int //任务状态 16 | 17 | //任务阶段 18 | const ( 19 | TaskPhase_Map TaskPhase = 0 20 | TaskPhase_Reduce TaskPhase = 1 21 | ) 22 | 23 | //任务状态 24 | const ( 25 | TaskStatus_New TaskStatus 
= 0 //还没有创建 26 | TaskStatus_Ready TaskStatus = 1 //进入队列 27 | TaskStatus_Running TaskStatus = 2 //已经分配,正在运行 28 | TaskStatus_Terminated TaskStatus = 3 //运行结束 29 | TaskStatus_Error TaskStatus = 4 //运行出错 30 | ) 31 | 32 | const ( 33 | ScheduleInterval = time.Millisecond * 500 //扫描任务状态的间隔时间 34 | MaxTaskRunningTime = time.Second * 5 //每个任务的最大执行时间,用于判断是否超时 35 | ) 36 | 37 | //任务 38 | type Task struct { 39 | FileName string //当前任务的文件名 40 | Phase TaskPhase //当前任务状态 41 | Seq int //当前的任务序列 42 | NMap int //map任务/file的数量 43 | NReduce int //reduce任务/分区的数量 44 | Alive bool //是否存活 45 | } 46 | 47 | //任务状态 48 | type TaskState struct { 49 | Status TaskStatus //任务状态 50 | WorkerId int //执行当前Task的workerid 51 | StartTime time.Time //任务开始执行的时间 52 | } 53 | 54 | type Coordinator struct { 55 | files []string //存储要处理的文件 56 | nReduce int //reduce/分区数量 57 | taskPhase TaskPhase //任务阶段 58 | taskStates []TaskState //任务的状态 59 | taskChan chan Task //任务队列 60 | workerSeq int //worker序列 61 | done bool //是否做完 62 | muLock sync.Mutex //互斥锁 63 | 64 | } 65 | 66 | //创建一个task 67 | func (c *Coordinator) NewOneTask(seq int) Task { 68 | task := Task{ 69 | FileName: "", 70 | Phase: c.taskPhase, 71 | NMap: len(c.files), 72 | NReduce: c.nReduce, 73 | Seq: seq, 74 | Alive: true, 75 | } 76 | 77 | DPrintf("m:%+v, taskseq:%d, lenfiles:%d, lents:%d", c, seq, len(c.files), len(c.taskStates)) 78 | 79 | if task.Phase == TaskPhase_Map { 80 | task.FileName = c.files[seq] 81 | } 82 | return task 83 | } 84 | 85 | //扫描任务状态并适当更新 86 | func (c *Coordinator) scanTaskState() { 87 | DPrintf("scanTaskState...") 88 | c.muLock.Lock() 89 | defer c.muLock.Unlock() 90 | 91 | //这里不能使用函数Done(),因为此时已经上锁 92 | if c.done { 93 | return 94 | } 95 | 96 | allDone := true 97 | //循环每个任务的状态 98 | for k, v := range c.taskStates { 99 | switch v.Status { 100 | case TaskStatus_New: 101 | allDone = false 102 | c.taskStates[k].Status = TaskStatus_Ready 103 | c.taskChan <- c.NewOneTask(k) 104 | case TaskStatus_Ready: 105 | allDone = false 106 | case TaskStatus_Running: 107 | allDone = false 108 | //超时重新分配该任务 109 | if time.Now().Sub(v.StartTime) > MaxTaskRunningTime { 110 | c.taskStates[k].Status = TaskStatus_Ready 111 | c.taskChan <- c.NewOneTask(k) 112 | } 113 | case TaskStatus_Terminated: 114 | case TaskStatus_Error: 115 | allDone = false 116 | c.taskStates[k].Status = TaskStatus_Ready 117 | c.taskChan <- c.NewOneTask(k) 118 | default: 119 | panic("t. status err in schedule") 120 | } 121 | } 122 | 123 | //如果当前任务完成了 124 | if allDone { 125 | if c.taskPhase == TaskPhase_Map { 126 | //进入Reduce阶段 127 | DPrintf("init ReduceTask") 128 | c.taskPhase = TaskPhase_Reduce 129 | c.taskStates = make([]TaskState, c.nReduce) 130 | } else { 131 | log.Println("finish all tasks!!!😊") 132 | c.done = true 133 | } 134 | } 135 | } 136 | 137 | //定时更新状态 138 | func (c *Coordinator) schedule() { 139 | for !c.Done() { 140 | c.scanTaskState() 141 | time.Sleep(ScheduleInterval) 142 | } 143 | } 144 | 145 | // Your code here -- RPC handlers for the worker to call. 
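//
// a hedged sketch of how a worker reaches these handlers over the unix
// socket (see the call() helper in mr/worker.go; workerId comes from an
// earlier Coordinator.RegWorker call):
//
//   args := TaskArgs{WorkerId: workerId}
//   reply := TaskReply{}
//   call("Coordinator.GetOneTask", &args, &reply) // reply.Task is a *Task
//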
146 | 147 | //处理Rpc请求:获取任务 148 | func (c *Coordinator) GetOneTask(args *TaskArgs, reply *TaskReply) error { 149 | task := <-c.taskChan 150 | reply.Task = &task 151 | 152 | if task.Alive { 153 | //修改状态 154 | c.muLock.Lock() 155 | if task.Phase != c.taskPhase { 156 | return errors.New("GetOneTask Task phase neq") 157 | } 158 | c.taskStates[task.Seq].WorkerId = args.WorkerId 159 | c.taskStates[task.Seq].Status = TaskStatus_Running 160 | c.taskStates[task.Seq].StartTime = time.Now() 161 | c.muLock.Unlock() 162 | } 163 | 164 | DPrintf("in get one Task, args:%+v, reply:%+v", args, reply) 165 | return nil 166 | } 167 | 168 | //处理Rpc请求:注册worker 169 | func (c *Coordinator) RegWorker(args *RegArgs, reply *RegReply) error { 170 | DPrintf("worker reg!") 171 | c.muLock.Lock() 172 | defer c.muLock.Unlock() 173 | c.workerSeq++ 174 | reply.WorkerId = c.workerSeq 175 | return nil 176 | } 177 | 178 | //处理Rpc请求:worker响应task完成情况 179 | func (c *Coordinator) ReportTask(args *ReportTaskArgs, reply *ReportTaskReply) error { 180 | c.muLock.Lock() 181 | defer c.muLock.Unlock() 182 | 183 | DPrintf("get report task: %+v, taskPhase: %+v", args, c.taskPhase) 184 | 185 | //如果发现阶段不同或者当前任务已经分配给了其它worker就不修改当前任务状态 186 | if c.taskPhase != args.Phase || c.taskStates[args.Seq].WorkerId != args.WorkerId { 187 | DPrintf("in report task,workerId=%v report a useless task=%v", args.WorkerId, args.Seq) 188 | return nil 189 | } 190 | 191 | if args.Done { 192 | c.taskStates[args.Seq].Status = TaskStatus_Terminated 193 | } else { 194 | c.taskStates[args.Seq].Status = TaskStatus_Error 195 | } 196 | 197 | go c.scanTaskState() 198 | return nil 199 | } 200 | 201 | // 202 | // an example RPC handler. 203 | // 204 | // the RPC argument and reply types are defined in rpc.go. 205 | // 206 | //func (c *Coordinator) Example(args *ExampleArgs, reply *ExampleReply) error { 207 | // reply.Y = args.X + 1 208 | // return nil 209 | //} 210 | 211 | // 212 | // start a thread that listens for RPCs from worker.go 213 | // 214 | func (c *Coordinator) server() { 215 | rpc.Register(c) // 注册 RPC 服务 216 | rpc.HandleHTTP() // 将 RPC 服务绑定到 HTTP 服务中去 217 | //l, e := net.Listen("tcp", ":1234") 218 | sockname := coordinatorSock() 219 | os.Remove(sockname) 220 | l, e := net.Listen("unix", sockname) 221 | if e != nil { 222 | log.Fatal("listen error:", e) 223 | } 224 | go http.Serve(l, nil) 225 | } 226 | 227 | // 228 | // main/mrcoordinator.go calls Done() periodically to find out 229 | // if the entire job has finished. 230 | //如果工作全部完成,返回true 231 | // 232 | func (c *Coordinator) Done() bool { 233 | c.muLock.Lock() 234 | defer c.muLock.Unlock() 235 | 236 | return c.done 237 | } 238 | 239 | // 240 | // create a Coordinator. 241 | // main/mrcoordinator.go calls this function. 242 | // nReduce is the number of reduce tasks to use. 
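// MakeCoordinator below sizes taskChan to the larger of len(files) and
// nReduce so that scanTaskState, which sends into the channel while holding
// muLock, cannot block in either phase; it then starts the schedule() loop
// and the RPC server before returning.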
243 | // 244 | func MakeCoordinator(files []string, nReduce int) *Coordinator { 245 | c := Coordinator{ 246 | files: files, 247 | nReduce: nReduce, 248 | taskPhase: TaskPhase_Map, 249 | taskStates: make([]TaskState, len(files)), 250 | workerSeq: 0, 251 | done: false, 252 | } 253 | if len(files) > nReduce { 254 | c.taskChan = make(chan Task, len(files)) 255 | } else { 256 | c.taskChan = make(chan Task, nReduce) 257 | } 258 | 259 | go c.schedule() 260 | c.server() 261 | DPrintf("master init") 262 | 263 | return &c 264 | } 265 | -------------------------------------------------------------------------------- /mr/worker.go: -------------------------------------------------------------------------------- 1 | package mr 2 | 3 | import ( 4 | "encoding/json" 5 | "errors" 6 | "fmt" 7 | "io/ioutil" 8 | "os" 9 | "strings" 10 | ) 11 | import "log" 12 | import "net/rpc" 13 | import "hash/fnv" 14 | 15 | // 16 | // Map functions return a slice of KeyValue. 17 | // 18 | type KeyValue struct { 19 | Key string 20 | Value string 21 | } 22 | 23 | type worker struct { 24 | worerId int 25 | mapF func(string, string) []KeyValue 26 | reduceF func(string, []string) string 27 | } 28 | 29 | // 30 | // main/mrworker.go calls this function. 31 | // 32 | func Worker(mapf func(string, string) []KeyValue, 33 | reducef func(string, []string) string) { 34 | 35 | // Your worker implementation here. 36 | worker := worker{ 37 | mapF: mapf, 38 | reduceF: reducef, 39 | } 40 | 41 | worker.register() 42 | worker.run() 43 | // uncomment to send the Example RPC to the coordinator. 44 | // CallExample() 45 | 46 | } 47 | 48 | func (w *worker) run() { 49 | DPrintf("run") 50 | for { 51 | task, err := w.getTask() 52 | if err != nil { 53 | DPrintf(err.Error()) 54 | continue 55 | } 56 | if !task.Alive { 57 | DPrintf("worker get task not alive, exit") 58 | return 59 | } 60 | w.doTask(*task) 61 | } 62 | } 63 | 64 | //开始做任务 65 | func (w *worker) doTask(task Task) { 66 | switch task.Phase { 67 | case TaskPhase_Map: 68 | w.doMapTask(task) 69 | case TaskPhase_Reduce: 70 | w.doReduceTask(task) 71 | default: 72 | panic(fmt.Sprintf("task phase err: %v", task.Phase)) 73 | } 74 | } 75 | 76 | // 77 | // use ihash(key) % NReduce to choose the reduce 78 | // task number for each KeyValue emitted by Map. 79 | // 80 | func ihash(key string) int { 81 | h := fnv.New32a() 82 | h.Write([]byte(key)) 83 | return int(h.Sum32() & 0x7fffffff) 84 | } 85 | 86 | //map任务时获取要输出的文件名 87 | func (w *worker) getReduceName(mapId, partitionId int) string { 88 | return fmt.Sprintf("mr-kv-%d-%d", mapId, partitionId) 89 | } 90 | 91 | //reduce任务时获取要输出的文件名 92 | func (w *worker) getMergeName(partitionId int) string { 93 | return fmt.Sprintf("mr-out-%d", partitionId) 94 | } 95 | 96 | //做map任务 97 | func (w *worker) doMapTask(task Task) { 98 | DPrintf("%v start read file %v", w.worerId, task.FileName) 99 | cont, err := ioutil.ReadFile(task.FileName) 100 | if err != nil { 101 | DPrintf("%v", err) 102 | w.reportTask(task, false) 103 | return 104 | } 105 | 106 | kvs := w.mapF(task.FileName, string(cont)) 107 | partions := make([][]KeyValue, task.NReduce) 108 | for _, kv := range kvs { 109 | pid := ihash(kv.Key) % task.NReduce 110 | partions[pid] = append(partions[pid], kv) 111 | } 112 | 113 | for k, v := range partions { 114 | fileName := w.getReduceName(task.Seq, k) 115 | file, err := os.Create(fileName) 116 | if err != nil { 117 | DPrintf("create file-%v fail in doMapTask. 
%v", fileName, err) 118 | w.reportTask(task, false) 119 | return 120 | } 121 | encoder := json.NewEncoder(file) 122 | for _, kv := range v { 123 | if err := encoder.Encode(&kv); err != nil { 124 | DPrintf("encode kvs to file-%v fail in doMapTask. %v", fileName, err) 125 | w.reportTask(task, false) 126 | } 127 | } 128 | if err := file.Close(); err != nil { 129 | DPrintf("close file-%v fail in doMapTask. %v", fileName, err) 130 | w.reportTask(task, false) 131 | } 132 | } 133 | w.reportTask(task, true) 134 | } 135 | 136 | //做reduce任务 137 | func (w *worker) doReduceTask(task Task) { 138 | maps := make(map[string][]string) 139 | 140 | for i := 0; i < task.NMap; i++ { 141 | fileName := w.getReduceName(i, task.Seq) 142 | file, err := os.Open(fileName) 143 | if err != nil { 144 | DPrintf("open file-%v fail in doReduceTask. %v", fileName, err) 145 | w.reportTask(task, false) 146 | return 147 | } 148 | decoder := json.NewDecoder(file) 149 | for { 150 | var kv KeyValue 151 | if err := decoder.Decode(&kv); err != nil { 152 | break 153 | } 154 | if _, ok := maps[kv.Key]; !ok { 155 | maps[kv.Key] = make([]string, 0) 156 | } 157 | maps[kv.Key] = append(maps[kv.Key], kv.Value) 158 | } 159 | } 160 | 161 | res := make([]string, 0) 162 | for k, v := range maps { 163 | len := w.reduceF(k, v) 164 | res = append(res, fmt.Sprintf("%v %v\n", k, len)) 165 | } 166 | 167 | fileName := w.getMergeName(task.Seq) 168 | if err := ioutil.WriteFile(fileName, []byte(strings.Join(res, "")), 0600); err != nil { 169 | DPrintf("write file-%v in doReduceTask. %v", fileName, err) 170 | w.reportTask(task, false) 171 | } 172 | 173 | w.reportTask(task, true) 174 | } 175 | 176 | // 177 | // example function to show how to make an RPC call to the coordinator. 178 | // 179 | // the RPC argument and reply types are defined in rpc.go. 180 | // 181 | //func CallExample() { 182 | // 183 | // // declare an argument structure. 184 | // args := ExampleArgs{} 185 | // 186 | // // fill in the argument(s). 187 | // args.X = 99 188 | // 189 | // // declare a reply structure. 190 | // reply := ExampleReply{} 191 | // 192 | // // send the RPC request, wait for the reply. 193 | // // the "Coordinator.Example" tells the 194 | // // receiving server that we'd like to call 195 | // // the Example() method of struct Coordinator. 196 | // ok := call("Coordinator.Example", &args, &reply) 197 | // if ok { 198 | // // reply.Y should be 100. 
199 | // fmt.Printf("reply.Y %v\n", reply.Y) 200 | // } else { 201 | // fmt.Printf("call failed!\n") 202 | // } 203 | //} 204 | 205 | //rpc请求:注册worker 206 | func (w *worker) register() { 207 | DPrintf("reg") 208 | args := &RegArgs{} 209 | reply := &RegReply{} 210 | 211 | if err := call("Coordinator.RegWorker", args, reply); !err { 212 | log.Fatal("worker register error!", err) 213 | } 214 | w.worerId = reply.WorkerId 215 | } 216 | 217 | //rpc请求:请求获取任务 218 | func (w *worker) getTask() (*Task, error) { 219 | args := TaskArgs{WorkerId: w.worerId} 220 | reply := TaskReply{} 221 | 222 | if err := call("Coordinator.GetOneTask", &args, &reply); !err { 223 | return nil, errors.New("worker getTask error!") 224 | } 225 | DPrintf("worker get task:%+v", reply.Task) 226 | return reply.Task, nil 227 | } 228 | 229 | //rpc请求:报告任务状态 230 | func (w *worker) reportTask(task Task, done bool) { 231 | args := ReportTaskArgs{ 232 | WorkerId: w.worerId, 233 | Phase: task.Phase, 234 | Seq: task.Seq, 235 | Done: done, 236 | } 237 | reply := ReportTaskReply{} 238 | if ok := call("Coordinator.ReportTask", &args, &reply); !ok { 239 | DPrintf("report task fail:%+v", args) 240 | } 241 | } 242 | 243 | // 244 | // send an RPC request to the coordinator, wait for the response. 245 | // usually returns true. 246 | // returns false if something goes wrong. 247 | // 248 | func call(rpcname string, args interface{}, reply interface{}) bool { 249 | // c, err := rpc.DialHTTP("tcp", "127.0.0.1"+":1234") 250 | sockname := coordinatorSock() 251 | conn, err := rpc.DialHTTP("unix", sockname) 252 | if err != nil { 253 | log.Fatal("dialing:", err) 254 | } 255 | defer conn.Close() 256 | 257 | err = conn.Call(rpcname, args, reply) //rpcname = 结构体名.方法名 258 | if err == nil { 259 | return true 260 | } 261 | 262 | fmt.Println(err) 263 | return false 264 | } 265 | -------------------------------------------------------------------------------- /shardkv/server.go: -------------------------------------------------------------------------------- 1 | package shardkv 2 | 3 | import ( 4 | "6.824/labrpc" 5 | "6.824/shardctrler" 6 | "fmt" 7 | "time" 8 | ) 9 | import "6.824/raft" 10 | import "sync" 11 | import "6.824/labgob" 12 | 13 | const ( 14 | PullConfigInterval = time.Millisecond * 100 15 | PullShardsInterval = time.Millisecond * 200 16 | WaitCmdTimeOut = time.Millisecond * 500 17 | CallPeerFetchShardDataTimeOut = time.Millisecond * 500 18 | CallPeerCleanShardDataTimeOut = time.Millisecond * 500 19 | MaxLockTime = time.Millisecond * 10 // debug 20 | ) 21 | 22 | type ShardKV struct { 23 | mu sync.Mutex 24 | me int 25 | rf *raft.Raft 26 | applyCh chan raft.ApplyMsg 27 | make_end func(string) *labrpc.ClientEnd 28 | gid int 29 | ctrlers []*labrpc.ClientEnd 30 | maxraftstate int // snapshot if log grows this big 31 | 32 | // Your definitions here. 
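// in brief: commandNotifyCh wakes a waiting RPC handler once its command is
// applied; lastApplies keeps a ClientId->CommandId map per shard for
// duplicate detection; data holds each shard's key/value map; config and
// oldConfig are the current and previous shardctrler configs; meShards is
// the set of shards this group currently owns; inputShards/outputShards
// track shards still migrating in or already handed off (outputShards keyed
// by config number); scc queries the shardctrler; the two timers drive the
// pullConfig and fetchShards loops.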
33 | stopCh chan struct{} 34 | commandNotifyCh map[int64]chan CommandResult //用于命令apply后的唤醒 35 | lastApplies [shardctrler.NShards]map[int64]int64 //k-v:ClientId-CommandId 36 | config shardctrler.Config //记录当前的config 37 | oldConfig shardctrler.Config //保存上一个config,进行shard迁移时,目标节点根据这个config来获取源节点,从而获取shard数据和请求清除shard数据 38 | meShards map[int]bool //记录自己分配到的shard 39 | data [shardctrler.NShards]map[string]string 40 | 41 | inputShards map[int]bool //当前这个config相较于上一个config新指派的shard,只有input为空了才能更新下一个config 42 | outputShards map[int]map[int]MergeShardData // configNum -> shard -> data。当某一个config,当前节点的shard移除,则记录当前config的所有移除shard的mergeShardData 43 | //cleanOutputDataNotifyCh map[string]chan struct{} //用来通知等待协程clean完成 44 | scc *shardctrler.Clerk //保存一个shardctrler的客户端,因为要向shardctrler发送query获取配置信息 45 | 46 | //持久化 47 | persister *raft.Persister 48 | 49 | //定时任务计时器 50 | pullConfigTimer *time.Timer //定期获取config 51 | pullShardsTimer *time.Timer //定期检查inputShard并请求数据 52 | 53 | //用于互斥锁 54 | lockStartTime time.Time 55 | lockEndTime time.Time 56 | lockMsg string 57 | } 58 | 59 | /* 60 | 通用函数 61 | */ 62 | 63 | //自定义锁 64 | func (kv *ShardKV) lock(msg string) { 65 | kv.mu.Lock() 66 | kv.lockStartTime = time.Now() 67 | kv.lockMsg = msg 68 | } 69 | 70 | func (kv *ShardKV) unlock(msg string) { 71 | kv.lockEndTime = time.Now() 72 | duration := kv.lockEndTime.Sub(kv.lockStartTime) 73 | kv.lockMsg = "" 74 | kv.mu.Unlock() 75 | if duration > MaxLockTime { 76 | kv.log("lock too long:%s:%s\n", msg, duration) 77 | } 78 | } 79 | 80 | func (kv *ShardKV) log(format string, value ...interface{}) { 81 | baseMsg := fmt.Sprintf("server me: %d, gid:%d, config:%+v, input:%+v.", 82 | kv.me, kv.gid, kv.config, kv.inputShards) 83 | DPrintf(baseMsg, format, value) 84 | } 85 | 86 | // 87 | // the tester calls Kill() when a ShardKV instance won't 88 | // be needed again. you are not required to do anything 89 | // in Kill(), but it might be convenient to (for example) 90 | // turn off debug output from this instance. 91 | // 92 | func (kv *ShardKV) Kill() { 93 | kv.rf.Kill() 94 | // Your code here, if desired. 95 | close(kv.stopCh) 96 | kv.log("kil kv") 97 | } 98 | 99 | /* 100 | 定时任务 101 | */ 102 | 103 | func (kv *ShardKV) pullConfig() { 104 | for { 105 | select { 106 | case <-kv.stopCh: 107 | return 108 | case <-kv.pullConfigTimer.C: 109 | //只有leader才能获取 110 | _, isLeader := kv.rf.GetState() 111 | if !isLeader { 112 | kv.pullConfigTimer.Reset(PullConfigInterval) 113 | break 114 | } 115 | kv.lock("pullconfig") 116 | lastNum := kv.config.Num 117 | kv.log("pull config,last: %d", lastNum) 118 | kv.unlock("pullconfig") 119 | 120 | config := kv.scc.Query(lastNum + 1) 121 | if config.Num == lastNum+1 { 122 | //找到新的config 123 | kv.log("pull config,new config:%+v", config) 124 | kv.lock("pullconfig") 125 | //这一个判断很关键,必须当前shard全部迁移完成才能获取下一个config 126 | if len(kv.inputShards) == 0 && kv.config.Num+1 == config.Num { 127 | kv.log("pull config,start config:%+v", config) 128 | kv.unlock("pullconfig") 129 | //请求该命令 130 | kv.rf.Start(config.Copy()) 131 | } else { 132 | kv.unlock("pullconfig") 133 | } 134 | } 135 | kv.pullConfigTimer.Reset(PullConfigInterval) 136 | } 137 | } 138 | } 139 | 140 | func (kv *ShardKV) ticker() { 141 | //处理applyCh 142 | go kv.handleApplyCh() 143 | //定时获取config信息 144 | go kv.pullConfig() 145 | //定时获取input shard(如果有的话) 146 | go kv.fetchShards() 147 | } 148 | 149 | /* 150 | 初始服务器 151 | */ 152 | 153 | // 154 | // servers[] contains the ports of the servers in this group. 
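// (the goroutines started from ticker() at the end of StartServer, namely
// handleApplyCh, pullConfig and fetchShards, are the long-running work
// mentioned below; pullConfig only ever asks the shardctrler for config
// Num+1, and only hands it to Raft once inputShards is empty, so
// configurations are adopted strictly one at a time.)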
155 | // 156 | // me is the index of the current server in servers[]. 157 | // 158 | // the k/v server should store snapshots through the underlying Raft 159 | // implementation, which should call persister.SaveStateAndSnapshot() to 160 | // atomically save the Raft state along with the snapshot. 161 | // 162 | // the k/v server should snapshot when Raft's saved state exceeds 163 | // maxraftstate bytes, in order to allow Raft to garbage-collect its 164 | // log. if maxraftstate is -1, you don't need to snapshot. 165 | // 166 | // gid is this group's GID, for interacting with the shardctrler. 167 | // 168 | // pass ctrlers[] to shardctrler.MakeClerk() so you can send 169 | // RPCs to the shardctrler. 170 | // 171 | // make_end(servername) turns a server name from a 172 | // Config.Groups[gid][i] into a labrpc.ClientEnd on which you can 173 | // send RPCs. You'll need this to send RPCs to other groups. 174 | // 175 | // look at client.go for examples of how to use ctrlers[] 176 | // and make_end() to send RPCs to the group owning a specific shard. 177 | // 178 | // StartServer() must return quickly, so it should start goroutines 179 | // for any long-running work. 180 | // 181 | func StartServer(servers []*labrpc.ClientEnd, me int, persister *raft.Persister, maxraftstate int, gid int, ctrlers []*labrpc.ClientEnd, make_end func(string) *labrpc.ClientEnd) *ShardKV { 182 | // call labgob.Register on structures you want 183 | // Go's RPC library to marshall/unmarshall. 184 | labgob.Register(Op{}) 185 | 186 | kv := new(ShardKV) 187 | kv.me = me 188 | kv.maxraftstate = maxraftstate 189 | kv.make_end = make_end 190 | kv.gid = gid 191 | kv.ctrlers = ctrlers 192 | 193 | // Your initialization code here. 194 | kv.persister = persister 195 | kv.scc = shardctrler.MakeClerk(kv.ctrlers) 196 | // Use something like this to talk to the shardctrler: 197 | // kv.mck = shardctrler.MakeClerk(kv.ctrlers) 198 | 199 | kv.applyCh = make(chan raft.ApplyMsg) 200 | kv.stopCh = make(chan struct{}) 201 | kv.rf = raft.Make(servers, me, persister, kv.applyCh) 202 | 203 | //初始化自身数据 204 | kv.data = [shardctrler.NShards]map[string]string{} 205 | for i, _ := range kv.data { 206 | kv.data[i] = make(map[string]string) 207 | } 208 | kv.lastApplies = [shardctrler.NShards]map[int64]int64{} 209 | for i, _ := range kv.lastApplies { 210 | kv.lastApplies[i] = make(map[int64]int64) 211 | } 212 | 213 | kv.inputShards = make(map[int]bool) 214 | kv.outputShards = make(map[int]map[int]MergeShardData) 215 | //kv.cleanOutputDataNotifyCh = make(map[string]chan struct{}) 216 | config := shardctrler.Config{ 217 | Num: 0, 218 | Shards: [shardctrler.NShards]int{}, 219 | Groups: map[int][]string{}, 220 | } 221 | kv.config = config 222 | kv.oldConfig = config 223 | 224 | //读取快照内容 225 | kv.readPersist(true, 0, 0, kv.persister.ReadSnapshot()) 226 | 227 | kv.commandNotifyCh = make(map[int64]chan CommandResult) 228 | //设置定时器 229 | kv.pullConfigTimer = time.NewTimer(PullConfigInterval) 230 | kv.pullShardsTimer = time.NewTimer(PullShardsInterval) 231 | 232 | kv.ticker() 233 | 234 | return kv 235 | } 236 | -------------------------------------------------------------------------------- /shardkv/server_apply.go: -------------------------------------------------------------------------------- 1 | package shardkv 2 | 3 | import ( 4 | "6.824/shardctrler" 5 | ) 6 | 7 | func (kv *ShardKV) notifyWaitCommand(reqId int64, err Err, value string) { 8 | if ch, ok := kv.commandNotifyCh[reqId]; ok { 9 | ch <- CommandResult{ 10 | Err: err, 11 | Value: value, 12 | } 13 
| } 14 | } 15 | 16 | func (kv *ShardKV) getValueByKey(key string) (err Err, value string) { 17 | if v, ok := kv.data[key2shard(key)][key]; ok { 18 | err = OK 19 | value = v 20 | } else { 21 | err = ErrNoKey 22 | value = "" 23 | } 24 | return 25 | } 26 | 27 | //判断能否执行客户端发来的命令 28 | func (kv *ShardKV) ProcessKeyReady(configNum int, key string) Err { 29 | //config不对 30 | if configNum == 0 || configNum != kv.config.Num { 31 | kv.log("process key ready err config.") 32 | return ErrWrongGroup 33 | } 34 | shardId := key2shard(key) 35 | //没有分配该shard 36 | if _, ok := kv.meShards[shardId]; !ok { 37 | kv.log("process key ready err shard.") 38 | return ErrWrongGroup 39 | } 40 | //正在迁移,这里有优化的空间,如果没有迁移完成,可以直接请求目标节点完成操作并返回,但是这样就太复杂了,这里简略了 41 | if _, ok := kv.inputShards[shardId]; ok { 42 | kv.log("process key ready err waitShard.") 43 | return ErrWrongGroup 44 | } 45 | return OK 46 | } 47 | 48 | //应用每一条命令 49 | func (kv *ShardKV) handleApplyCh() { 50 | for { 51 | select { 52 | case <-kv.stopCh: 53 | kv.log("get from stopCh,server-%v stop!", kv.me) 54 | return 55 | case cmd := <-kv.applyCh: 56 | //处理快照命令,读取快照的内容 57 | if cmd.SnapshotValid { 58 | kv.log("%v get install sn,%v %v", kv.me, cmd.SnapshotIndex, cmd.SnapshotTerm) 59 | kv.lock("waitApplyCh_sn") 60 | kv.readPersist(false, cmd.SnapshotTerm, cmd.SnapshotIndex, cmd.Snapshot) 61 | kv.unlock("waitApplyCh_sn") 62 | continue 63 | } 64 | //处理普通命令 65 | if !cmd.CommandValid { 66 | continue 67 | } 68 | cmdIdx := cmd.CommandIndex 69 | //处理不同的命令 70 | if op, ok := cmd.Command.(Op); ok { 71 | kv.handleOpCommand(cmdIdx, op) 72 | } else if config, ok := cmd.Command.(shardctrler.Config); ok { 73 | kv.handleConfigCommand(cmdIdx, config) 74 | } else if mergeData, ok := cmd.Command.(MergeShardData); ok { 75 | kv.handleMergeShardDataCommand(cmdIdx, mergeData) 76 | } else if cleanData, ok := cmd.Command.(CleanShardDataArgs); ok { 77 | kv.handleCleanShardDataCommand(cmdIdx, cleanData) 78 | } else { 79 | panic("apply command,NOT FOUND COMMDN!") 80 | } 81 | 82 | } 83 | 84 | } 85 | 86 | } 87 | 88 | //处理get、put、append命令 89 | func (kv *ShardKV) handleOpCommand(cmdIdx int, op Op) { 90 | kv.log("start apply command %v:%+v", cmdIdx, op) 91 | kv.lock("handleApplyCh") 92 | defer kv.unlock("handleApplyCh") 93 | shardId := key2shard(op.Key) 94 | if err := kv.ProcessKeyReady(op.ConfigNum, op.Key); err != OK { 95 | kv.notifyWaitCommand(op.ReqId, err, "") 96 | return 97 | } 98 | if op.Method == "Get" { 99 | //处理读 100 | e, v := kv.getValueByKey(op.Key) 101 | kv.notifyWaitCommand(op.ReqId, e, v) 102 | } else if op.Method == "Put" || op.Method == "Append" { 103 | //处理写 104 | //判断命令是否重复 105 | isRepeated := false 106 | if v, ok := kv.lastApplies[shardId][op.ClientId]; ok { 107 | if v == op.CommandId { 108 | isRepeated = true 109 | } 110 | } 111 | 112 | if !isRepeated { 113 | switch op.Method { 114 | case "Put": 115 | kv.data[shardId][op.Key] = op.Value 116 | kv.lastApplies[shardId][op.ClientId] = op.CommandId 117 | case "Append": 118 | e, v := kv.getValueByKey(op.Key) 119 | if e == ErrNoKey { 120 | //按put处理 121 | kv.data[shardId][op.Key] = op.Value 122 | kv.lastApplies[shardId][op.ClientId] = op.CommandId 123 | } else { 124 | //追加 125 | kv.data[shardId][op.Key] = v + op.Value 126 | kv.lastApplies[shardId][op.ClientId] = op.CommandId 127 | } 128 | default: 129 | panic("unknown method " + op.Method) 130 | } 131 | 132 | } 133 | //命令处理成功 134 | kv.notifyWaitCommand(op.ReqId, OK, "") 135 | } else { 136 | panic("unknown method " + op.Method) 137 | } 138 | 139 | kv.log("apply op: cmdId:%d, op: %+v, 
data:%v", cmdIdx, op, kv.data[shardId][op.Key]) 140 | //每应用一条命令,就判断是否进行持久化 141 | kv.saveSnapshot(cmdIdx) 142 | } 143 | 144 | //处理config命令,即更新config 145 | //主要是处理meshard、inputshard、outputshard 146 | func (kv *ShardKV) handleConfigCommand(cmdIdx int, config shardctrler.Config) { 147 | kv.log("start handle config %v:%+v", cmdIdx, config) 148 | kv.lock("handleApplyCh") 149 | defer kv.unlock("handleApplyCh") 150 | if config.Num <= kv.config.Num { 151 | kv.saveSnapshot(cmdIdx) 152 | return 153 | } 154 | 155 | if config.Num != kv.config.Num+1 { 156 | panic("applyConfig err") 157 | } 158 | 159 | oldConfig := kv.config.Copy() 160 | outputShards := make([]int, 0, shardctrler.NShards) 161 | inputShards := make([]int, 0, shardctrler.NShards) 162 | meShards := make([]int, 0, shardctrler.NShards) 163 | 164 | for i := 0; i < shardctrler.NShards; i++ { 165 | if config.Shards[i] == kv.gid { 166 | meShards = append(meShards, i) 167 | if oldConfig.Shards[i] != kv.gid { 168 | inputShards = append(inputShards, i) 169 | } 170 | } else { 171 | if oldConfig.Shards[i] == kv.gid { 172 | outputShards = append(outputShards, i) 173 | } 174 | } 175 | } 176 | 177 | //处理当前的shard 178 | kv.meShards = make(map[int]bool) 179 | for _, shardId := range meShards { 180 | kv.meShards[shardId] = true 181 | } 182 | 183 | //处理移出的shard 184 | //保存当前所处配置的所有移除的shard数据 185 | d := make(map[int]MergeShardData) 186 | for _, shardId := range outputShards { 187 | mergeShardData := MergeShardData{ 188 | ConfigNum: oldConfig.Num, 189 | ShardNum: shardId, 190 | Data: kv.data[shardId], 191 | CommandIndexes: kv.lastApplies[shardId], 192 | } 193 | d[shardId] = mergeShardData 194 | //初始化数据 195 | kv.data[shardId] = make(map[string]string) 196 | kv.lastApplies[shardId] = make(map[int64]int64) 197 | } 198 | kv.outputShards[oldConfig.Num] = d 199 | 200 | //处理移入的shard 201 | kv.inputShards = make(map[int]bool) 202 | if oldConfig.Num != 0 { 203 | for _, shardId := range inputShards { 204 | kv.inputShards[shardId] = true 205 | } 206 | } 207 | 208 | kv.config = config 209 | kv.oldConfig = oldConfig 210 | kv.log("apply op: cmdId:%d, config:%+v", cmdIdx, config) 211 | kv.saveSnapshot(cmdIdx) 212 | } 213 | 214 | //处理新的shard数据,即input shard 215 | func (kv *ShardKV) handleMergeShardDataCommand(cmdIdx int, data MergeShardData) { 216 | kv.log("start merge Shard Data %v:%+v", cmdIdx, data) 217 | kv.lock("handleApplyCh") 218 | defer kv.unlock("handleApplyCh") 219 | if kv.config.Num != data.ConfigNum+1 { 220 | return 221 | } 222 | 223 | if _, ok := kv.inputShards[data.ShardNum]; !ok { 224 | return 225 | } 226 | 227 | kv.data[data.ShardNum] = make(map[string]string) 228 | kv.lastApplies[data.ShardNum] = make(map[int64]int64) 229 | 230 | for k, v := range data.Data { 231 | kv.data[data.ShardNum][k] = v 232 | } 233 | for k, v := range data.CommandIndexes { 234 | kv.lastApplies[data.ShardNum][k] = v 235 | } 236 | delete(kv.inputShards, data.ShardNum) 237 | 238 | kv.log("apply op: cmdId:%d, mergeShardData:%+v", cmdIdx, data) 239 | kv.saveSnapshot(cmdIdx) 240 | go kv.callPeerCleanShardData(kv.oldConfig, data.ShardNum) 241 | } 242 | 243 | //处理已经迁移走的shard,即output shard 244 | func (kv *ShardKV) handleCleanShardDataCommand(cmdIdx int, data CleanShardDataArgs) { 245 | kv.log("start clean shard data %v:%+v", cmdIdx, data) 246 | kv.lock("handleApplyCh") 247 | defer kv.unlock("handleApplyCh") 248 | //如果要清除的shard确实是在outputShard中,且没有被清除,则需要清除 249 | if kv.OutputDataExist(data.ConfigNum, data.ShardNum) { 250 | delete(kv.outputShards[data.ConfigNum], data.ShardNum) 251 | } 252 | 253 | 
//通知等待协程 254 | //if ch, ok := kv.cleanOutputDataNotifyCh[fmt.Sprintf("%d%d", data.ConfigNum, data.ShardNum)]; ok { 255 | // ch <- struct{}{} 256 | //} 257 | 258 | kv.saveSnapshot(cmdIdx) 259 | } 260 | -------------------------------------------------------------------------------- /raft/raft_append_entries.go: -------------------------------------------------------------------------------- 1 | package raft 2 | 3 | import ( 4 | "time" 5 | ) 6 | 7 | type AppendEntriesArgs struct { 8 | Term int 9 | LeaderId int 10 | PrevLogIndex int 11 | PrevLogTerm int 12 | Entries []LogEntry 13 | LeaderCommit int 14 | } 15 | 16 | type AppendEntriesReply struct { 17 | Term int 18 | Success bool 19 | NextLogTerm int 20 | NextLogIndex int 21 | } 22 | 23 | //立马发送 24 | func (rf *Raft) resetAppendEntriesTimersZero() { 25 | for _, timer := range rf.appendEntriesTimers { 26 | timer.Stop() 27 | timer.Reset(0) 28 | } 29 | } 30 | 31 | func (rf *Raft) resetAppendEntriesTimerZero(peerId int) { 32 | rf.appendEntriesTimers[peerId].Stop() 33 | rf.appendEntriesTimers[peerId].Reset(0) 34 | } 35 | 36 | //重置单个timer 37 | func (rf *Raft) resetAppendEntriesTimer(peerId int) { 38 | rf.appendEntriesTimers[peerId].Stop() 39 | rf.appendEntriesTimers[peerId].Reset(HeartBeatInterval) 40 | } 41 | 42 | //判断当前raft的日志记录是否超过发送过来的日志记录 43 | func (rf *Raft) isOutOfArgsAppendEntries(args *AppendEntriesArgs) bool { 44 | argsLastLogIndex := args.PrevLogIndex + len(args.Entries) 45 | lastLogTerm, lastLogIndex := rf.getLastLogTermAndIndex() 46 | if lastLogTerm == args.Term && argsLastLogIndex < lastLogIndex { 47 | return true 48 | } 49 | return false 50 | } 51 | 52 | //获取当前存储位置的索引 53 | func (rf *Raft) getStoreIndexByLogIndex(logIndex int) int { 54 | storeIndex := logIndex - rf.lastSnapshotIndex 55 | if storeIndex < 0 { 56 | return -1 57 | } 58 | return storeIndex 59 | } 60 | 61 | //接收端处理rpc 62 | //主要进行三个处理: 63 | // 1. 判断任期 64 | // 2. 判断是否接收数据,success:数据全部接受,或者根本就没有数据 65 | // 3. 判断是否提交数据 66 | func (rf *Raft) AppendEntries(args *AppendEntriesArgs, reply *AppendEntriesReply) { 67 | rf.mu.Lock() 68 | DPrintf("%v receive a appendEntries: %+v", rf.me, args) 69 | reply.Term = rf.currentTerm 70 | if args.Term < rf.currentTerm { 71 | rf.mu.Unlock() 72 | return 73 | } 74 | rf.currentTerm = args.Term 75 | rf.changeRole(Role_Follower) 76 | rf.resetElectionTimer() 77 | 78 | _, lastLogIndex := rf.getLastLogTermAndIndex() 79 | //先判断两边,再判断刚好从快照开始,再判断中间的情况 80 | if args.PrevLogIndex < rf.lastSnapshotIndex { 81 | //1.要插入的前一个index小于快照index,几乎不会发生 82 | reply.Success = false 83 | reply.NextLogIndex = rf.lastSnapshotIndex + 1 84 | } else if args.PrevLogIndex > lastLogIndex { 85 | //2. 要插入的前一个index大于最后一个log的index,说明中间还有log 86 | reply.Success = false 87 | reply.NextLogIndex = lastLogIndex + 1 88 | } else if args.PrevLogIndex == rf.lastSnapshotIndex { 89 | //3. 要插入的前一个index刚好等于快照的index,说明可以全覆盖,但要判断是否是全覆盖 90 | if rf.isOutOfArgsAppendEntries(args) { 91 | reply.Success = false 92 | reply.NextLogIndex = 0 //=0代表着插入会导致乱序 93 | } else { 94 | reply.Success = true 95 | rf.logs = append(rf.logs[:1], args.Entries...) 96 | _, currentLogIndex := rf.getLastLogTermAndIndex() 97 | reply.NextLogIndex = currentLogIndex + 1 98 | } 99 | } else if args.PrevLogTerm == rf.logs[rf.getStoreIndexByLogIndex(args.PrevLogIndex)].Term { 100 | //4. 
中间的情况:索引处的两个term相同 101 | if rf.isOutOfArgsAppendEntries(args) { 102 | reply.Success = false 103 | reply.NextLogIndex = 0 104 | } else { 105 | reply.Success = true 106 | rf.logs = append(rf.logs[:rf.getStoreIndexByLogIndex(args.PrevLogIndex)+1], args.Entries...) 107 | _, currentLogIndex := rf.getLastLogTermAndIndex() 108 | reply.NextLogIndex = currentLogIndex + 1 109 | } 110 | } else { 111 | //5. 中间的情况:索引处的两个term不相同,跳过一个term 112 | term := rf.logs[rf.getStoreIndexByLogIndex(args.PrevLogIndex)].Term 113 | index := args.PrevLogIndex 114 | for index > rf.commitIndex && index > rf.lastSnapshotIndex && rf.logs[rf.getStoreIndexByLogIndex(index)].Term == term { 115 | index-- 116 | } 117 | reply.Success = false 118 | reply.NextLogIndex = index + 1 119 | } 120 | 121 | if reply.Success { 122 | DPrintf("%v current commit: %v, try to commit %v", rf.me, rf.commitIndex, args.LeaderCommit) 123 | if rf.commitIndex < args.LeaderCommit { 124 | rf.commitIndex = args.LeaderCommit 125 | rf.notifyApplyCh <- struct{}{} 126 | } 127 | } 128 | 129 | rf.persist() 130 | DPrintf("%v role: %v, get appendentries finish,args = %v,reply = %+v", rf.me, rf.role, *args, *reply) 131 | rf.mu.Unlock() 132 | 133 | } 134 | 135 | //获取要向指定节点发送的日志 136 | func (rf *Raft) getAppendLogs(peerId int) (prevLogIndex int, prevLogTerm int, logEntries []LogEntry) { 137 | nextIndex := rf.nextIndex[peerId] 138 | lastLogTerm, lastLogIndex := rf.getLastLogTermAndIndex() 139 | if nextIndex <= rf.lastSnapshotIndex || nextIndex > lastLogIndex { 140 | //没有要发送的log 141 | prevLogTerm = lastLogTerm 142 | prevLogIndex = lastLogIndex 143 | return 144 | } 145 | //这里一定要进行深拷贝,不然会和Snapshot()发生数据上的冲突 146 | //logEntries = rf.logs[nextIndex-rf.lastSnapshotIndex:] 147 | logEntries = make([]LogEntry, lastLogIndex-nextIndex+1) 148 | copy(logEntries, rf.logs[nextIndex-rf.lastSnapshotIndex:]) 149 | prevLogIndex = nextIndex - 1 150 | if prevLogIndex == rf.lastSnapshotIndex { 151 | prevLogTerm = rf.lastSnapshotTerm 152 | } else { 153 | prevLogTerm = rf.logs[prevLogIndex-rf.lastSnapshotIndex].Term 154 | } 155 | 156 | return 157 | } 158 | 159 | //尝试去提交日志 160 | //会依次判断,可以提交多个,但不能有间断 161 | func (rf *Raft) tryCommitLog() { 162 | _, lastLogIndex := rf.getLastLogTermAndIndex() 163 | hasCommit := false 164 | 165 | for i := rf.commitIndex + 1; i <= lastLogIndex; i++ { 166 | count := 0 167 | for _, m := range rf.matchIndex { 168 | if m >= i { 169 | count += 1 170 | //提交数达到多数派 171 | if count > len(rf.peers)/2 { 172 | rf.commitIndex = i 173 | hasCommit = true 174 | DPrintf("%v role: %v,commit index %v", rf.me, rf.role, i) 175 | break 176 | } 177 | } 178 | } 179 | if rf.commitIndex != i { 180 | break 181 | } 182 | } 183 | 184 | if hasCommit { 185 | rf.notifyApplyCh <- struct{}{} 186 | } 187 | } 188 | 189 | //发送端发送数据 190 | func (rf *Raft) sendAppendEntries(server int, args *AppendEntriesArgs, reply *AppendEntriesReply) { 191 | rpcTimer := time.NewTimer(RPCTimeout) 192 | defer rpcTimer.Stop() 193 | 194 | ch := make(chan bool, 1) 195 | go func() { 196 | //尝试10次 197 | for i := 0; i < 10 && !rf.killed(); i++ { 198 | ok := rf.peers[server].Call("Raft.AppendEntries", args, reply) 199 | if !ok { 200 | time.Sleep(time.Millisecond * 10) 201 | continue 202 | } else { 203 | ch <- ok 204 | return 205 | } 206 | } 207 | }() 208 | 209 | select { 210 | case <-rpcTimer.C: 211 | DPrintf("%v role: %v, send append entries to peer %v TIME OUT!!!", rf.me, rf.role, server) 212 | return 213 | case <-ch: 214 | return 215 | } 216 | } 217 | 218 | func (rf *Raft) sendAppendEntriesToPeer(peerId int) { 219 | if 
rf.killed() { 220 | return 221 | } 222 | 223 | rf.mu.Lock() 224 | if rf.role != Role_Leader { 225 | rf.resetAppendEntriesTimer(peerId) 226 | rf.mu.Unlock() 227 | return 228 | } 229 | DPrintf("%v send append entries to peer %v", rf.me, peerId) 230 | 231 | prevLogIndex, prevLogTerm, logEntries := rf.getAppendLogs(peerId) 232 | args := AppendEntriesArgs{ 233 | Term: rf.currentTerm, 234 | LeaderId: rf.me, 235 | PrevLogIndex: prevLogIndex, 236 | PrevLogTerm: prevLogTerm, 237 | Entries: logEntries, 238 | LeaderCommit: rf.commitIndex, 239 | } 240 | reply := AppendEntriesReply{} 241 | rf.resetAppendEntriesTimer(peerId) 242 | rf.mu.Unlock() 243 | 244 | rf.sendAppendEntries(peerId, &args, &reply) 245 | 246 | DPrintf("%v role: %v, send append entries to peer finish,%v,args = %+v,reply = %+v", rf.me, rf.role, peerId, args, reply) 247 | 248 | rf.mu.Lock() 249 | if reply.Term > rf.currentTerm { 250 | rf.changeRole(Role_Follower) 251 | rf.currentTerm = reply.Term 252 | rf.resetElectionTimer() 253 | rf.persist() 254 | rf.mu.Unlock() 255 | return 256 | } 257 | 258 | if rf.role != Role_Leader || rf.currentTerm != args.Term { 259 | rf.mu.Unlock() 260 | return 261 | } 262 | 263 | //响应:成功了,即:发送的数据全部接收了,或者根本没有数据 264 | if reply.Success { 265 | if reply.NextLogIndex > rf.nextIndex[peerId] { 266 | rf.nextIndex[peerId] = reply.NextLogIndex 267 | rf.matchIndex[peerId] = reply.NextLogIndex - 1 268 | } 269 | if len(args.Entries) > 0 && args.Entries[len(args.Entries)-1].Term == rf.currentTerm { 270 | //每个leader只能提交自己任期的日志 271 | rf.tryCommitLog() 272 | } 273 | rf.persist() 274 | rf.mu.Unlock() 275 | return 276 | } 277 | 278 | //响应:失败了,此时要修改nextIndex或者不做处理 279 | if reply.NextLogIndex != 0 { 280 | if reply.NextLogIndex > rf.lastSnapshotIndex { 281 | rf.nextIndex[peerId] = reply.NextLogIndex 282 | //为了一致性,立马发送 283 | rf.resetAppendEntriesTimerZero(peerId) 284 | } else { 285 | //发送快照 286 | go rf.sendInstallSnapshotToPeer(peerId) 287 | } 288 | rf.mu.Unlock() 289 | return 290 | } else { 291 | //reply.NextLogIndex = 0,此时如果插入会导致乱序,可以不进行处理 292 | } 293 | 294 | rf.mu.Unlock() 295 | return 296 | 297 | } 298 | -------------------------------------------------------------------------------- /main/test-mr.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # 4 | # map-reduce tests 5 | # 6 | 7 | # comment this out to run the tests without the Go race detector. 8 | RACE=-race 9 | 10 | if [[ "$OSTYPE" = "darwin"* ]] 11 | then 12 | if go version | grep 'go1.17.[012345]' 13 | then 14 | # -race with plug-ins on x86 MacOS 12 with 15 | # go1.17 before 1.17.6 sometimes crash. 16 | RACE= 17 | echo '*** Turning off -race since it may not work on a Mac' 18 | echo ' with ' `go version` 19 | fi 20 | fi 21 | 22 | TIMEOUT=timeout 23 | if timeout 2s sleep 1 > /dev/null 2>&1 24 | then 25 | : 26 | else 27 | if gtimeout 2s sleep 1 > /dev/null 2>&1 28 | then 29 | TIMEOUT=gtimeout 30 | else 31 | # no timeout command 32 | TIMEOUT= 33 | echo '*** Cannot find timeout command; proceeding without timeouts.' 34 | fi 35 | fi 36 | if [ "$TIMEOUT" != "" ] 37 | then 38 | TIMEOUT+=" -k 2s 180s " 39 | fi 40 | 41 | # run the test in a fresh sub-directory. 42 | rm -rf mr-tmp 43 | mkdir mr-tmp || exit 1 44 | cd mr-tmp || exit 1 45 | rm -f mr-* 46 | 47 | # make sure software is freshly built. 48 | (cd ../../mrapps && go clean) 49 | (cd .. 
&& go clean) 50 | (cd ../../mrapps && go build $RACE -buildmode=plugin wc.go) || exit 1 51 | (cd ../../mrapps && go build $RACE -buildmode=plugin indexer.go) || exit 1 52 | (cd ../../mrapps && go build $RACE -buildmode=plugin mtiming.go) || exit 1 53 | (cd ../../mrapps && go build $RACE -buildmode=plugin rtiming.go) || exit 1 54 | (cd ../../mrapps && go build $RACE -buildmode=plugin jobcount.go) || exit 1 55 | (cd ../../mrapps && go build $RACE -buildmode=plugin early_exit.go) || exit 1 56 | (cd ../../mrapps && go build $RACE -buildmode=plugin crash.go) || exit 1 57 | (cd ../../mrapps && go build $RACE -buildmode=plugin nocrash.go) || exit 1 58 | (cd .. && go build $RACE mrcoordinator.go) || exit 1 59 | (cd .. && go build $RACE mrworker.go) || exit 1 60 | (cd .. && go build $RACE mrsequential.go) || exit 1 61 | 62 | failed_any=0 63 | 64 | ######################################################### 65 | # first word-count 66 | 67 | # generate the correct output 68 | ../mrsequential ../../mrapps/wc.so ../pg*txt || exit 1 69 | sort mr-out-0 > mr-correct-wc.txt 70 | rm -f mr-out* 71 | 72 | echo '***' Starting wc test. 73 | 74 | $TIMEOUT ../mrcoordinator ../pg*txt & 75 | pid=$! 76 | 77 | # give the coordinator time to create the sockets. 78 | sleep 1 79 | 80 | # start multiple workers. 81 | $TIMEOUT ../mrworker ../../mrapps/wc.so & 82 | $TIMEOUT ../mrworker ../../mrapps/wc.so & 83 | $TIMEOUT ../mrworker ../../mrapps/wc.so & 84 | 85 | # wait for the coordinator to exit. 86 | wait $pid 87 | 88 | # since workers are required to exit when a job is completely finished, 89 | # and not before, that means the job has finished. 90 | sort mr-out* | grep . > mr-wc-all 91 | if cmp mr-wc-all mr-correct-wc.txt 92 | then 93 | echo '' wc test: PASS 94 | else 95 | echo '---' wc output is not the same as mr-correct-wc.txt 96 | echo '---' wc test: FAIL 97 | failed_any=1 98 | fi 99 | 100 | # wait for remaining workers and coordinator to exit. 101 | wait 102 | 103 | ######################################################### 104 | # now indexer 105 | rm -f mr-* 106 | 107 | # generate the correct output 108 | ../mrsequential ../../mrapps/indexer.so ../pg*txt || exit 1 109 | sort mr-out-0 > mr-correct-indexer.txt 110 | rm -f mr-out* 111 | 112 | echo '***' Starting indexer test. 113 | 114 | $TIMEOUT ../mrcoordinator ../pg*txt & 115 | sleep 1 116 | 117 | # start multiple workers 118 | $TIMEOUT ../mrworker ../../mrapps/indexer.so & 119 | $TIMEOUT ../mrworker ../../mrapps/indexer.so 120 | 121 | sort mr-out* | grep . > mr-indexer-all 122 | if cmp mr-indexer-all mr-correct-indexer.txt 123 | then 124 | echo '---' indexer test: PASS 125 | else 126 | echo '---' indexer output is not the same as mr-correct-indexer.txt 127 | echo '---' indexer test: FAIL 128 | failed_any=1 129 | fi 130 | 131 | wait 132 | 133 | ######################################################### 134 | echo '***' Starting map parallelism test. 
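# Note (inferred from the checks below): the mtiming plugin is assumed to write one
# 'times-' line per worker and a 'parallel ... N' line recording how many map tasks
# overlapped; the test expects exactly two workers and an overlap count of 2.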
135 | 136 | rm -f mr-* 137 | 138 | $TIMEOUT ../mrcoordinator ../pg*txt & 139 | sleep 1 140 | 141 | $TIMEOUT ../mrworker ../../mrapps/mtiming.so & 142 | $TIMEOUT ../mrworker ../../mrapps/mtiming.so 143 | 144 | NT=`cat mr-out* | grep '^times-' | wc -l | sed 's/ //g'` 145 | if [ "$NT" != "2" ] 146 | then 147 | echo '---' saw "$NT" workers rather than 2 148 | echo '---' map parallelism test: FAIL 149 | failed_any=1 150 | fi 151 | 152 | if cat mr-out* | grep '^parallel.* 2' > /dev/null 153 | then 154 | echo '---' map parallelism test: PASS 155 | else 156 | echo '---' map workers did not run in parallel 157 | echo '---' map parallelism test: FAIL 158 | failed_any=1 159 | fi 160 | 161 | wait 162 | 163 | 164 | ######################################################### 165 | echo '***' Starting reduce parallelism test. 166 | 167 | rm -f mr-* 168 | 169 | $TIMEOUT ../mrcoordinator ../pg*txt & 170 | sleep 1 171 | 172 | $TIMEOUT ../mrworker ../../mrapps/rtiming.so & 173 | $TIMEOUT ../mrworker ../../mrapps/rtiming.so 174 | 175 | NT=`cat mr-out* | grep '^[a-z] 2' | wc -l | sed 's/ //g'` 176 | if [ "$NT" -lt "2" ] 177 | then 178 | echo '---' too few parallel reduces. 179 | echo '---' reduce parallelism test: FAIL 180 | failed_any=1 181 | else 182 | echo '---' reduce parallelism test: PASS 183 | fi 184 | 185 | wait 186 | 187 | ######################################################### 188 | echo '***' Starting job count test. 189 | 190 | rm -f mr-* 191 | 192 | $TIMEOUT ../mrcoordinator ../pg*txt & 193 | sleep 1 194 | 195 | $TIMEOUT ../mrworker ../../mrapps/jobcount.so & 196 | $TIMEOUT ../mrworker ../../mrapps/jobcount.so 197 | $TIMEOUT ../mrworker ../../mrapps/jobcount.so & 198 | $TIMEOUT ../mrworker ../../mrapps/jobcount.so 199 | 200 | NT=`cat mr-out* | awk '{print $2}'` 201 | if [ "$NT" -eq "8" ] 202 | then 203 | echo '---' job count test: PASS 204 | else 205 | echo '---' map jobs ran incorrect number of times "($NT != 8)" 206 | echo '---' job count test: FAIL 207 | failed_any=1 208 | fi 209 | 210 | wait 211 | 212 | ######################################################### 213 | # test whether any worker or coordinator exits before the 214 | # task has completed (i.e., all output files have been finalized) 215 | rm -f mr-* 216 | 217 | echo '***' Starting early exit test. 218 | 219 | DF=anydone$$ 220 | rm -f $DF 221 | 222 | ($TIMEOUT ../mrcoordinator ../pg*txt ; touch $DF) & 223 | 224 | # give the coordinator time to create the sockets. 225 | sleep 1 226 | 227 | # start multiple workers. 228 | ($TIMEOUT ../mrworker ../../mrapps/early_exit.so ; touch $DF) & 229 | ($TIMEOUT ../mrworker ../../mrapps/early_exit.so ; touch $DF) & 230 | ($TIMEOUT ../mrworker ../../mrapps/early_exit.so ; touch $DF) & 231 | 232 | # wait for any of the coord or workers to exit. 233 | # `jobs` ensures that any completed old processes from other tests 234 | # are not waited upon. 235 | jobs &> /dev/null 236 | if [[ "$OSTYPE" = "darwin"* ]] 237 | then 238 | # bash on the Mac doesn't have wait -n 239 | while [ ! -e $DF ] 240 | do 241 | sleep 0.2 242 | done 243 | else 244 | # the -n causes wait to wait for just one child process, 245 | # rather than waiting for all to finish. 246 | wait -n 247 | fi 248 | 249 | rm -f $DF 250 | 251 | # a process has exited. this means that the output should be finalized 252 | # otherwise, either a worker or the coordinator exited early 253 | sort mr-out* | grep . > mr-wc-all-initial 254 | 255 | # wait for remaining workers and coordinator to exit. 
256 | wait 257 | 258 | # compare initial and final outputs 259 | sort mr-out* | grep . > mr-wc-all-final 260 | if cmp mr-wc-all-final mr-wc-all-initial 261 | then 262 | echo '---' early exit test: PASS 263 | else 264 | echo '---' output changed after first worker exited 265 | echo '---' early exit test: FAIL 266 | failed_any=1 267 | fi 268 | rm -f mr-* 269 | 270 | ######################################################### 271 | echo '***' Starting crash test. 272 | 273 | # generate the correct output 274 | ../mrsequential ../../mrapps/nocrash.so ../pg*txt || exit 1 275 | sort mr-out-0 > mr-correct-crash.txt 276 | rm -f mr-out* 277 | 278 | rm -f mr-done 279 | ($TIMEOUT ../mrcoordinator ../pg*txt ; touch mr-done ) & 280 | sleep 1 281 | 282 | # start multiple workers 283 | $TIMEOUT ../mrworker ../../mrapps/crash.so & 284 | 285 | # mimic rpc.go's coordinatorSock() 286 | SOCKNAME=/var/tmp/824-mr-`id -u` 287 | 288 | ( while [ -e $SOCKNAME -a ! -f mr-done ] 289 | do 290 | $TIMEOUT ../mrworker ../../mrapps/crash.so 291 | sleep 1 292 | done ) & 293 | 294 | ( while [ -e $SOCKNAME -a ! -f mr-done ] 295 | do 296 | $TIMEOUT ../mrworker ../../mrapps/crash.so 297 | sleep 1 298 | done ) & 299 | 300 | while [ -e $SOCKNAME -a ! -f mr-done ] 301 | do 302 | $TIMEOUT ../mrworker ../../mrapps/crash.so 303 | sleep 1 304 | done 305 | 306 | wait 307 | 308 | rm $SOCKNAME 309 | sort mr-out* | grep . > mr-crash-all 310 | if cmp mr-crash-all mr-correct-crash.txt 311 | then 312 | echo '---' crash test: PASS 313 | else 314 | echo '---' crash output is not the same as mr-correct-crash.txt 315 | echo '---' crash test: FAIL 316 | failed_any=1 317 | fi 318 | 319 | ######################################################### 320 | if [ $failed_any -eq 0 ]; then 321 | echo '***' PASSED ALL TESTS 322 | else 323 | echo '***' FAILED SOME TESTS 324 | exit 1 325 | fi 326 | -------------------------------------------------------------------------------- /shardctrler/config.go: -------------------------------------------------------------------------------- 1 | package shardctrler 2 | 3 | import "6.824/labrpc" 4 | import "6.824/raft" 5 | import "testing" 6 | import "os" 7 | 8 | // import "log" 9 | import crand "crypto/rand" 10 | import "math/rand" 11 | import "encoding/base64" 12 | import "sync" 13 | import "runtime" 14 | import "time" 15 | 16 | func randstring(n int) string { 17 | b := make([]byte, 2*n) 18 | crand.Read(b) 19 | s := base64.URLEncoding.EncodeToString(b) 20 | return s[0:n] 21 | } 22 | 23 | // Randomize server handles 24 | func random_handles(kvh []*labrpc.ClientEnd) []*labrpc.ClientEnd { 25 | sa := make([]*labrpc.ClientEnd, len(kvh)) 26 | copy(sa, kvh) 27 | for i := range sa { 28 | j := rand.Intn(i + 1) 29 | sa[i], sa[j] = sa[j], sa[i] 30 | } 31 | return sa 32 | } 33 | 34 | type config struct { 35 | mu sync.Mutex 36 | t *testing.T 37 | net *labrpc.Network 38 | n int 39 | servers []*ShardCtrler 40 | saved []*raft.Persister 41 | endnames [][]string // names of each server's sending ClientEnds 42 | clerks map[*Clerk][]string 43 | nextClientId int 44 | start time.Time // time at which make_config() was called 45 | } 46 | 47 | func (cfg *config) checkTimeout() { 48 | // enforce a two minute real-time limit on each test 49 | if !cfg.t.Failed() && time.Since(cfg.start) > 120*time.Second { 50 | cfg.t.Fatal("test took longer than 120 seconds") 51 | } 52 | } 53 | 54 | func (cfg *config) cleanup() { 55 | cfg.mu.Lock() 56 | defer cfg.mu.Unlock() 57 | for i := 0; i < len(cfg.servers); i++ { 58 | if cfg.servers[i] != nil { 59 | 
cfg.servers[i].Kill() 60 | } 61 | } 62 | cfg.net.Cleanup() 63 | cfg.checkTimeout() 64 | } 65 | 66 | // Maximum log size across all servers 67 | func (cfg *config) LogSize() int { 68 | logsize := 0 69 | for i := 0; i < cfg.n; i++ { 70 | n := cfg.saved[i].RaftStateSize() 71 | if n > logsize { 72 | logsize = n 73 | } 74 | } 75 | return logsize 76 | } 77 | 78 | // attach server i to servers listed in to 79 | // caller must hold cfg.mu 80 | func (cfg *config) connectUnlocked(i int, to []int) { 81 | // log.Printf("connect peer %d to %v\n", i, to) 82 | 83 | // outgoing socket files 84 | for j := 0; j < len(to); j++ { 85 | endname := cfg.endnames[i][to[j]] 86 | cfg.net.Enable(endname, true) 87 | } 88 | 89 | // incoming socket files 90 | for j := 0; j < len(to); j++ { 91 | endname := cfg.endnames[to[j]][i] 92 | cfg.net.Enable(endname, true) 93 | } 94 | } 95 | 96 | func (cfg *config) connect(i int, to []int) { 97 | cfg.mu.Lock() 98 | defer cfg.mu.Unlock() 99 | cfg.connectUnlocked(i, to) 100 | } 101 | 102 | // detach server i from the servers listed in from 103 | // caller must hold cfg.mu 104 | func (cfg *config) disconnectUnlocked(i int, from []int) { 105 | // log.Printf("disconnect peer %d from %v\n", i, from) 106 | 107 | // outgoing socket files 108 | for j := 0; j < len(from); j++ { 109 | if cfg.endnames[i] != nil { 110 | endname := cfg.endnames[i][from[j]] 111 | cfg.net.Enable(endname, false) 112 | } 113 | } 114 | 115 | // incoming socket files 116 | for j := 0; j < len(from); j++ { 117 | if cfg.endnames[j] != nil { 118 | endname := cfg.endnames[from[j]][i] 119 | cfg.net.Enable(endname, false) 120 | } 121 | } 122 | } 123 | 124 | func (cfg *config) disconnect(i int, from []int) { 125 | cfg.mu.Lock() 126 | defer cfg.mu.Unlock() 127 | cfg.disconnectUnlocked(i, from) 128 | } 129 | 130 | func (cfg *config) All() []int { 131 | all := make([]int, cfg.n) 132 | for i := 0; i < cfg.n; i++ { 133 | all[i] = i 134 | } 135 | return all 136 | } 137 | 138 | func (cfg *config) ConnectAll() { 139 | cfg.mu.Lock() 140 | defer cfg.mu.Unlock() 141 | for i := 0; i < cfg.n; i++ { 142 | cfg.connectUnlocked(i, cfg.All()) 143 | } 144 | } 145 | 146 | // Sets up 2 partitions with connectivity between servers in each partition. 147 | func (cfg *config) partition(p1 []int, p2 []int) { 148 | cfg.mu.Lock() 149 | defer cfg.mu.Unlock() 150 | // log.Printf("partition servers into: %v %v\n", p1, p2) 151 | for i := 0; i < len(p1); i++ { 152 | cfg.disconnectUnlocked(p1[i], p2) 153 | cfg.connectUnlocked(p1[i], p1) 154 | } 155 | for i := 0; i < len(p2); i++ { 156 | cfg.disconnectUnlocked(p2[i], p1) 157 | cfg.connectUnlocked(p2[i], p2) 158 | } 159 | } 160 | 161 | // Create a clerk with clerk specific server names. 162 | // Give it connections to all of the servers, but for 163 | // now enable only connections to servers in to[]. 164 | func (cfg *config) makeClient(to []int) *Clerk { 165 | cfg.mu.Lock() 166 | defer cfg.mu.Unlock() 167 | 168 | // a fresh set of ClientEnds. 
169 | ends := make([]*labrpc.ClientEnd, cfg.n) 170 | endnames := make([]string, cfg.n) 171 | for j := 0; j < cfg.n; j++ { 172 | endnames[j] = randstring(20) 173 | ends[j] = cfg.net.MakeEnd(endnames[j]) 174 | cfg.net.Connect(endnames[j], j) 175 | } 176 | 177 | ck := MakeClerk(random_handles(ends)) 178 | cfg.clerks[ck] = endnames 179 | cfg.nextClientId++ 180 | cfg.ConnectClientUnlocked(ck, to) 181 | return ck 182 | } 183 | 184 | func (cfg *config) deleteClient(ck *Clerk) { 185 | cfg.mu.Lock() 186 | defer cfg.mu.Unlock() 187 | 188 | v := cfg.clerks[ck] 189 | for i := 0; i < len(v); i++ { 190 | os.Remove(v[i]) 191 | } 192 | delete(cfg.clerks, ck) 193 | } 194 | 195 | // caller should hold cfg.mu 196 | func (cfg *config) ConnectClientUnlocked(ck *Clerk, to []int) { 197 | // log.Printf("ConnectClient %v to %v\n", ck, to) 198 | endnames := cfg.clerks[ck] 199 | for j := 0; j < len(to); j++ { 200 | s := endnames[to[j]] 201 | cfg.net.Enable(s, true) 202 | } 203 | } 204 | 205 | func (cfg *config) ConnectClient(ck *Clerk, to []int) { 206 | cfg.mu.Lock() 207 | defer cfg.mu.Unlock() 208 | cfg.ConnectClientUnlocked(ck, to) 209 | } 210 | 211 | // caller should hold cfg.mu 212 | func (cfg *config) DisconnectClientUnlocked(ck *Clerk, from []int) { 213 | // log.Printf("DisconnectClient %v from %v\n", ck, from) 214 | endnames := cfg.clerks[ck] 215 | for j := 0; j < len(from); j++ { 216 | s := endnames[from[j]] 217 | cfg.net.Enable(s, false) 218 | } 219 | } 220 | 221 | func (cfg *config) DisconnectClient(ck *Clerk, from []int) { 222 | cfg.mu.Lock() 223 | defer cfg.mu.Unlock() 224 | cfg.DisconnectClientUnlocked(ck, from) 225 | } 226 | 227 | // Shutdown a server by isolating it 228 | func (cfg *config) ShutdownServer(i int) { 229 | cfg.mu.Lock() 230 | defer cfg.mu.Unlock() 231 | 232 | cfg.disconnectUnlocked(i, cfg.All()) 233 | 234 | // disable client connections to the server. 235 | // it's important to do this before creating 236 | // the new Persister in saved[i], to avoid 237 | // the possibility of the server returning a 238 | // positive reply to an Append but persisting 239 | // the result in the superseded Persister. 240 | cfg.net.DeleteServer(i) 241 | 242 | // a fresh persister, in case old instance 243 | // continues to update the Persister. 244 | // but copy old persister's content so that we always 245 | // pass Make() the last persisted state. 246 | if cfg.saved[i] != nil { 247 | cfg.saved[i] = cfg.saved[i].Copy() 248 | } 249 | 250 | kv := cfg.servers[i] 251 | if kv != nil { 252 | cfg.mu.Unlock() 253 | kv.Kill() 254 | cfg.mu.Lock() 255 | cfg.servers[i] = nil 256 | } 257 | } 258 | 259 | // If restart servers, first call ShutdownServer 260 | func (cfg *config) StartServer(i int) { 261 | cfg.mu.Lock() 262 | 263 | // a fresh set of outgoing ClientEnd names. 264 | cfg.endnames[i] = make([]string, cfg.n) 265 | for j := 0; j < cfg.n; j++ { 266 | cfg.endnames[i][j] = randstring(20) 267 | } 268 | 269 | // a fresh set of ClientEnds. 270 | ends := make([]*labrpc.ClientEnd, cfg.n) 271 | for j := 0; j < cfg.n; j++ { 272 | ends[j] = cfg.net.MakeEnd(cfg.endnames[i][j]) 273 | cfg.net.Connect(cfg.endnames[i][j], j) 274 | } 275 | 276 | // a fresh persister, so old instance doesn't overwrite 277 | // new instance's persisted state. 278 | // give the fresh persister a copy of the old persister's 279 | // state, so that the spec is that we pass StartKVServer() 280 | // the last persisted state. 
281 | if cfg.saved[i] != nil { 282 | cfg.saved[i] = cfg.saved[i].Copy() 283 | } else { 284 | cfg.saved[i] = raft.MakePersister() 285 | } 286 | 287 | cfg.mu.Unlock() 288 | 289 | cfg.servers[i] = StartServer(ends, i, cfg.saved[i]) 290 | 291 | kvsvc := labrpc.MakeService(cfg.servers[i]) 292 | rfsvc := labrpc.MakeService(cfg.servers[i].rf) 293 | srv := labrpc.MakeServer() 294 | srv.AddService(kvsvc) 295 | srv.AddService(rfsvc) 296 | cfg.net.AddServer(i, srv) 297 | } 298 | 299 | func (cfg *config) Leader() (bool, int) { 300 | cfg.mu.Lock() 301 | defer cfg.mu.Unlock() 302 | 303 | for i := 0; i < cfg.n; i++ { 304 | if cfg.servers[i] != nil { 305 | _, is_leader := cfg.servers[i].rf.GetState() 306 | if is_leader { 307 | return true, i 308 | } 309 | } 310 | } 311 | return false, 0 312 | } 313 | 314 | // Partition servers into 2 groups and put current leader in minority 315 | func (cfg *config) make_partition() ([]int, []int) { 316 | _, l := cfg.Leader() 317 | p1 := make([]int, cfg.n/2+1) 318 | p2 := make([]int, cfg.n/2) 319 | j := 0 320 | for i := 0; i < cfg.n; i++ { 321 | if i != l { 322 | if j < len(p1) { 323 | p1[j] = i 324 | } else { 325 | p2[j-len(p1)] = i 326 | } 327 | j++ 328 | } 329 | } 330 | p2[len(p2)-1] = l 331 | return p1, p2 332 | } 333 | 334 | func make_config(t *testing.T, n int, unreliable bool) *config { 335 | runtime.GOMAXPROCS(4) 336 | cfg := &config{} 337 | cfg.t = t 338 | cfg.net = labrpc.MakeNetwork() 339 | cfg.n = n 340 | cfg.servers = make([]*ShardCtrler, cfg.n) 341 | cfg.saved = make([]*raft.Persister, cfg.n) 342 | cfg.endnames = make([][]string, cfg.n) 343 | cfg.clerks = make(map[*Clerk][]string) 344 | cfg.nextClientId = cfg.n + 1000 // client ids start 1000 above the highest serverid 345 | cfg.start = time.Now() 346 | 347 | // create a full set of KV servers. 348 | for i := 0; i < cfg.n; i++ { 349 | cfg.StartServer(i) 350 | } 351 | 352 | cfg.ConnectAll() 353 | 354 | cfg.net.Reliable(!unreliable) 355 | 356 | return cfg 357 | } 358 | -------------------------------------------------------------------------------- /kvraft/server.go: -------------------------------------------------------------------------------- 1 | package kvraft 2 | 3 | import ( 4 | "6.824/labgob" 5 | "6.824/labrpc" 6 | "6.824/raft" 7 | "bytes" 8 | "log" 9 | "sync" 10 | "sync/atomic" 11 | "time" 12 | ) 13 | 14 | const WaitCmdTimeOut = time.Millisecond * 500 // cmd执行超过这个时间,就返回timeout 15 | const MaxLockTime = time.Millisecond * 10 // debug 16 | 17 | type Op struct { 18 | // Your definitions here. 19 | // Field names must start with capital letters, 20 | // otherwise RPC will break. 21 | ReqId int64 //用来标识commandNotify 22 | CommandId int64 23 | ClientId int64 24 | Key string 25 | Value string 26 | Method string 27 | } 28 | 29 | type CommandResult struct { 30 | Err Err 31 | Value string 32 | } 33 | 34 | type KVServer struct { 35 | mu sync.Mutex 36 | me int 37 | rf *raft.Raft 38 | applyCh chan raft.ApplyMsg 39 | dead int32 // set by Kill() 40 | stopCh chan struct{} 41 | 42 | maxraftstate int // snapshot if log grows this big 43 | 44 | // Your definitions here. 
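	// Field roles, as inferred from their use elsewhere in this file:
	// commandNotifyCh maps a request id to the channel its RPC handler waits on in waitCmd();
	// lastApplies records the last applied CommandId per ClientId, for duplicate detection;
	// data is the in-memory key/value store; persister exposes Raft state size and snapshots;
	// lockStartTime/lockEndTime/lockMsg back the instrumented lock()/unlock() helpers.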
45 | commandNotifyCh map[int64]chan CommandResult 46 | lastApplies map[int64]int64 //k-v:ClientId-CommandId 47 | data map[string]string 48 | 49 | //持久化 50 | persister *raft.Persister 51 | 52 | //用于互斥锁 53 | lockStartTime time.Time 54 | lockEndTime time.Time 55 | lockMsg string 56 | } 57 | 58 | //自定义锁 59 | func (kv *KVServer) lock(msg string) { 60 | kv.mu.Lock() 61 | kv.lockStartTime = time.Now() 62 | kv.lockMsg = msg 63 | } 64 | 65 | func (kv *KVServer) unlock(msg string) { 66 | kv.lockEndTime = time.Now() 67 | duration := kv.lockEndTime.Sub(kv.lockStartTime) 68 | kv.lockMsg = "" 69 | kv.mu.Unlock() 70 | if duration > MaxLockTime { 71 | DPrintf("lock too long:%s:%s\n", msg, duration) 72 | } 73 | } 74 | 75 | func (kv *KVServer) removeCh(reqId int64) { 76 | kv.lock("removeCh") 77 | defer kv.unlock("removeCh") 78 | delete(kv.commandNotifyCh, reqId) 79 | } 80 | 81 | //调用start向raft请求命令 82 | func (kv *KVServer) waitCmd(op Op) (res CommandResult) { 83 | DPrintf("server %v wait cmd start,Op: %+v.\n", kv.me, op) 84 | 85 | //提交命令,其实这里的start要改,一个kv数据库get命令可以发生在所有节点上 86 | index, term, isLeader := kv.rf.Start(op) 87 | if !isLeader { 88 | res.Err = ErrWrongLeader 89 | return 90 | } 91 | 92 | kv.lock("waitCmd") 93 | ch := make(chan CommandResult, 1) 94 | kv.commandNotifyCh[op.ReqId] = ch 95 | kv.unlock("waitCmd") 96 | DPrintf("start cmd: index:%d, term:%d, op:%+v", index, term, op) 97 | 98 | t := time.NewTimer(WaitCmdTimeOut) 99 | defer t.Stop() 100 | select { 101 | case <-kv.stopCh: 102 | DPrintf("stop ch waitCmd") 103 | kv.removeCh(op.ReqId) 104 | res.Err = ErrServer 105 | return 106 | case res = <-ch: 107 | kv.removeCh(op.ReqId) 108 | return 109 | case <-t.C: 110 | kv.removeCh(op.ReqId) 111 | res.Err = ErrTimeOut 112 | return 113 | 114 | } 115 | } 116 | 117 | //处理Get rpc 118 | func (kv *KVServer) Get(args *GetArgs, reply *GetReply) { 119 | // Your code here. 120 | DPrintf("server %v in rpc Get,args: %+v", kv.me, args) 121 | 122 | _, isLeader := kv.rf.GetState() 123 | if !isLeader { 124 | reply.Err = ErrWrongLeader 125 | return 126 | } 127 | 128 | op := Op{ 129 | ReqId: nrand(), 130 | ClientId: args.ClientId, 131 | CommandId: args.CommandId, 132 | Key: args.Key, 133 | Method: "Get", 134 | } 135 | //等待命令执行 136 | res := kv.waitCmd(op) 137 | reply.Err = res.Err 138 | reply.Value = res.Value 139 | 140 | DPrintf("server %v in rpc Get,args:%+v,reply:%+v", kv.me, args, reply) 141 | } 142 | 143 | //处理Put rpc 144 | func (kv *KVServer) PutAppend(args *PutAppendArgs, reply *PutAppendReply) { 145 | // Your code here. 146 | DPrintf("server %v in rpc PutAppend,args: %+v", kv.me, args) 147 | _, isLeader := kv.rf.GetState() 148 | if !isLeader { 149 | reply.Err = ErrWrongLeader 150 | return 151 | } 152 | op := Op{ 153 | ReqId: nrand(), 154 | ClientId: args.ClientId, 155 | CommandId: args.CommandId, 156 | Key: args.Key, 157 | Value: args.Value, 158 | Method: args.Op, 159 | } 160 | //等待命令执行 161 | res := kv.waitCmd(op) 162 | reply.Err = res.Err 163 | 164 | DPrintf("server %v in rpc PutAppend,args:%+v,reply:%+v", kv.me, args, reply) 165 | } 166 | 167 | // 168 | // the tester calls Kill() when a KVServer instance won't 169 | // be needed again. for your convenience, we supply 170 | // code to set rf.dead (without needing a lock), 171 | // and a killed() method to test rf.dead in 172 | // long-running loops. you can also add your own 173 | // code to Kill(). you're not required to do anything 174 | // about this, but it may be convenient (for example) 175 | // to suppress debug output from a Kill()ed instance. 
176 | // 177 | func (kv *KVServer) Kill() { 178 | atomic.StoreInt32(&kv.dead, 1) 179 | kv.rf.Kill() 180 | close(kv.stopCh) 181 | // Your code here, if desired. 182 | } 183 | 184 | func (kv *KVServer) killed() bool { 185 | z := atomic.LoadInt32(&kv.dead) 186 | return z == 1 187 | } 188 | 189 | //保存快照 190 | func (kv *KVServer) saveSnapshot(logIndex int) { 191 | if kv.maxraftstate == -1 || kv.persister.RaftStateSize() < kv.maxraftstate { 192 | return 193 | } 194 | 195 | //生成快照数据 196 | w := new(bytes.Buffer) 197 | e := labgob.NewEncoder(w) 198 | if err := e.Encode(kv.data); err != nil { 199 | panic(err) 200 | } 201 | if err := e.Encode(kv.lastApplies); err != nil { 202 | panic(err) 203 | } 204 | data := w.Bytes() 205 | kv.rf.Snapshot(logIndex, data) 206 | } 207 | 208 | //读取快照 209 | //两处调用:初始化阶段;收到Snapshot命令,即接收了leader的Snapshot 210 | func (kv *KVServer) readPersist(isInit bool, snapshotTerm, snapshotIndex int, data []byte) { 211 | if data == nil || len(data) < 1 { 212 | return 213 | } 214 | //只要不是初始化调用,即如果收到一个Snapshot命令,就要执行该函数 215 | if !isInit { 216 | res := kv.rf.CondInstallSnapshot(snapshotTerm, snapshotIndex, data) 217 | if !res { 218 | log.Panicln("kv read persist err in CondInstallSnapshot!") 219 | return 220 | } 221 | } 222 | //对数据进行同步 223 | r := bytes.NewBuffer(data) 224 | d := labgob.NewDecoder(r) 225 | var kvData map[string]string 226 | var lastApplies map[int64]int64 227 | 228 | if d.Decode(&kvData) != nil || 229 | d.Decode(&lastApplies) != nil { 230 | log.Fatal("kv read persist err!") 231 | } else { 232 | kv.data = kvData 233 | kv.lastApplies = lastApplies 234 | } 235 | } 236 | 237 | func (kv *KVServer) getValueByKey(key string) (err Err, value string) { 238 | if v, ok := kv.data[key]; ok { 239 | err = OK 240 | value = v 241 | } else { 242 | err = ErrNoKey 243 | } 244 | return 245 | } 246 | 247 | func (kv *KVServer) notifyWaitCommand(reqId int64, err Err, value string) { 248 | if ch, ok := kv.commandNotifyCh[reqId]; ok { 249 | ch <- CommandResult{ 250 | Err: err, 251 | Value: value, 252 | } 253 | } 254 | } 255 | 256 | //应用每一条命令 257 | func (kv *KVServer) handleApplyCh() { 258 | for { 259 | select { 260 | case <-kv.stopCh: 261 | DPrintf("get from stopCh,server-%v stop!", kv.me) 262 | return 263 | case cmd := <-kv.applyCh: 264 | //处理快照命令,读取快照的内容 265 | if cmd.SnapshotValid { 266 | DPrintf("%v get install sn,%v %v", kv.me, cmd.SnapshotIndex, cmd.SnapshotTerm) 267 | kv.lock("waitApplyCh_sn") 268 | kv.readPersist(false, cmd.SnapshotTerm, cmd.SnapshotIndex, cmd.Snapshot) 269 | kv.unlock("waitApplyCh_sn") 270 | continue 271 | } 272 | //处理普通命令 273 | if !cmd.CommandValid { 274 | continue 275 | } 276 | cmdIdx := cmd.CommandIndex 277 | DPrintf("server %v start apply command %v:%+v", kv.me, cmdIdx, cmd.Command) 278 | op := cmd.Command.(Op) 279 | kv.lock("handleApplyCh") 280 | 281 | if op.Method == "Get" { 282 | //处理读 283 | e, v := kv.getValueByKey(op.Key) 284 | kv.notifyWaitCommand(op.ReqId, e, v) 285 | } else if op.Method == "Put" || op.Method == "Append" { 286 | //处理写 287 | //判断命令是否重复 288 | isRepeated := false 289 | if v, ok := kv.lastApplies[op.ClientId]; ok { 290 | if v == op.CommandId { 291 | isRepeated = true 292 | } 293 | } 294 | 295 | if !isRepeated { 296 | switch op.Method { 297 | case "Put": 298 | kv.data[op.Key] = op.Value 299 | kv.lastApplies[op.ClientId] = op.CommandId 300 | case "Append": 301 | e, v := kv.getValueByKey(op.Key) 302 | if e == ErrNoKey { 303 | //按put处理 304 | kv.data[op.Key] = op.Value 305 | kv.lastApplies[op.ClientId] = op.CommandId 306 | } else { 307 | //追加 308 | 
kv.data[op.Key] = v + op.Value 309 | kv.lastApplies[op.ClientId] = op.CommandId 310 | } 311 | default: 312 | kv.unlock("handleApplyCh") 313 | panic("unknown method " + op.Method) 314 | } 315 | 316 | } 317 | //命令处理成功 318 | kv.notifyWaitCommand(op.ReqId, OK, "") 319 | } else { 320 | kv.unlock("handleApplyCh") 321 | panic("unknown method " + op.Method) 322 | } 323 | 324 | DPrintf("apply op: cmdId:%d, op: %+v, data:%v", cmdIdx, op, kv.data[op.Key]) 325 | //每应用一条命令,就判断是否进行持久化 326 | kv.saveSnapshot(cmdIdx) 327 | 328 | kv.unlock("handleApplyCh") 329 | } 330 | 331 | } 332 | 333 | } 334 | 335 | // 336 | // servers[] contains the ports of the set of 337 | // servers that will cooperate via Raft to 338 | // form the fault-tolerant key/value service. 339 | // me is the index of the current server in servers[]. 340 | // the k/v server should store snapshots through the underlying Raft 341 | // implementation, which should call persister.SaveStateAndSnapshot() to 342 | // atomically save the Raft state along with the snapshot. 343 | // the k/v server should snapshot when Raft's saved state exceeds maxraftstate bytes, 344 | // in order to allow Raft to garbage-collect its log. if maxraftstate is -1, 345 | // you don't need to snapshot. 346 | // StartKVServer() must return quickly, so it should start goroutines 347 | // for any long-running work. 348 | // 349 | func StartKVServer(servers []*labrpc.ClientEnd, me int, persister *raft.Persister, maxraftstate int) *KVServer { 350 | // call labgob.Register on structures you want 351 | // Go's RPC library to marshall/unmarshall. 352 | labgob.Register(Op{}) 353 | 354 | kv := new(KVServer) 355 | kv.me = me 356 | kv.maxraftstate = maxraftstate 357 | kv.persister = persister 358 | 359 | // You may need initialization code here. 360 | kv.lastApplies = make(map[int64]int64) 361 | kv.data = make(map[string]string) 362 | 363 | kv.stopCh = make(chan struct{}) 364 | //读取快照 365 | kv.readPersist(true, 0, 0, kv.persister.ReadSnapshot()) 366 | 367 | kv.commandNotifyCh = make(map[int64]chan CommandResult) 368 | kv.applyCh = make(chan raft.ApplyMsg) 369 | kv.rf = raft.Make(servers, me, persister, kv.applyCh) 370 | 371 | go kv.handleApplyCh() 372 | 373 | return kv 374 | } 375 | -------------------------------------------------------------------------------- /shardctrler/test_test.go: -------------------------------------------------------------------------------- 1 | package shardctrler 2 | 3 | import ( 4 | "fmt" 5 | "sync" 6 | "testing" 7 | "time" 8 | ) 9 | 10 | // import "time" 11 | 12 | func check(t *testing.T, groups []int, ck *Clerk) { 13 | c := ck.Query(-1) 14 | if len(c.Groups) != len(groups) { 15 | t.Fatalf("wanted %v groups, got %v", len(groups), len(c.Groups)) 16 | } 17 | 18 | // are the groups as expected? 19 | for _, g := range groups { 20 | _, ok := c.Groups[g] 21 | if ok != true { 22 | t.Fatalf("missing group %v", g) 23 | } 24 | } 25 | 26 | // any un-allocated shards? 27 | if len(groups) > 0 { 28 | for s, g := range c.Shards { 29 | _, ok := c.Groups[g] 30 | if ok == false { 31 | t.Fatalf("shard %v -> invalid group %v", s, g) 32 | } 33 | } 34 | } 35 | 36 | // more or less balanced sharding? 
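	// "balanced" here means the most-loaded group owns at most one more shard than
	// the least-loaded group; the check below fails when max > min+1.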
37 | counts := map[int]int{} 38 | for _, g := range c.Shards { 39 | counts[g] += 1 40 | } 41 | min := 257 42 | max := 0 43 | for g, _ := range c.Groups { 44 | if counts[g] > max { 45 | max = counts[g] 46 | } 47 | if counts[g] < min { 48 | min = counts[g] 49 | } 50 | } 51 | if max > min+1 { 52 | t.Fatalf("max %v too much larger than min %v", max, min) 53 | } 54 | } 55 | 56 | func check_same_config(t *testing.T, c1 Config, c2 Config) { 57 | if c1.Num != c2.Num { 58 | t.Fatalf("Num wrong") 59 | } 60 | if c1.Shards != c2.Shards { 61 | t.Fatalf("Shards wrong") 62 | } 63 | if len(c1.Groups) != len(c2.Groups) { 64 | t.Fatalf("number of Groups is wrong") 65 | } 66 | for gid, sa := range c1.Groups { 67 | sa1, ok := c2.Groups[gid] 68 | if ok == false || len(sa1) != len(sa) { 69 | t.Fatalf("len(Groups) wrong") 70 | } 71 | if ok && len(sa1) == len(sa) { 72 | for j := 0; j < len(sa); j++ { 73 | if sa[j] != sa1[j] { 74 | t.Fatalf("Groups wrong") 75 | } 76 | } 77 | } 78 | } 79 | } 80 | 81 | func TestBasic(t *testing.T) { 82 | const nservers = 3 83 | cfg := make_config(t, nservers, false) 84 | defer cfg.cleanup() 85 | 86 | ck := cfg.makeClient(cfg.All()) 87 | 88 | fmt.Printf("Test: Basic leave/join ...\n") 89 | 90 | cfa := make([]Config, 6) 91 | cfa[0] = ck.Query(-1) 92 | 93 | check(t, []int{}, ck) 94 | 95 | var gid1 int = 1 96 | ck.Join(map[int][]string{gid1: []string{"x", "y", "z"}}) 97 | check(t, []int{gid1}, ck) 98 | cfa[1] = ck.Query(-1) 99 | 100 | var gid2 int = 2 101 | ck.Join(map[int][]string{gid2: []string{"a", "b", "c"}}) 102 | check(t, []int{gid1, gid2}, ck) 103 | cfa[2] = ck.Query(-1) 104 | 105 | cfx := ck.Query(-1) 106 | sa1 := cfx.Groups[gid1] 107 | if len(sa1) != 3 || sa1[0] != "x" || sa1[1] != "y" || sa1[2] != "z" { 108 | t.Fatalf("wrong servers for gid %v: %v\n", gid1, sa1) 109 | } 110 | sa2 := cfx.Groups[gid2] 111 | if len(sa2) != 3 || sa2[0] != "a" || sa2[1] != "b" || sa2[2] != "c" { 112 | t.Fatalf("wrong servers for gid %v: %v\n", gid2, sa2) 113 | } 114 | 115 | ck.Leave([]int{gid1}) 116 | check(t, []int{gid2}, ck) 117 | cfa[4] = ck.Query(-1) 118 | 119 | ck.Leave([]int{gid2}) 120 | cfa[5] = ck.Query(-1) 121 | 122 | fmt.Printf(" ... Passed\n") 123 | 124 | fmt.Printf("Test: Historical queries ...\n") 125 | 126 | for s := 0; s < nservers; s++ { 127 | cfg.ShutdownServer(s) 128 | for i := 0; i < len(cfa); i++ { 129 | c := ck.Query(cfa[i].Num) 130 | check_same_config(t, c, cfa[i]) 131 | } 132 | cfg.StartServer(s) 133 | cfg.ConnectAll() 134 | } 135 | 136 | fmt.Printf(" ... 
Passed\n") 137 | 138 | fmt.Printf("Test: Move ...\n") 139 | { 140 | var gid3 int = 503 141 | ck.Join(map[int][]string{gid3: []string{"3a", "3b", "3c"}}) 142 | var gid4 int = 504 143 | ck.Join(map[int][]string{gid4: []string{"4a", "4b", "4c"}}) 144 | for i := 0; i < NShards; i++ { 145 | cf := ck.Query(-1) 146 | if i < NShards/2 { 147 | ck.Move(i, gid3) 148 | if cf.Shards[i] != gid3 { 149 | cf1 := ck.Query(-1) 150 | if cf1.Num <= cf.Num { 151 | t.Fatalf("Move should increase Config.Num") 152 | } 153 | } 154 | } else { 155 | ck.Move(i, gid4) 156 | if cf.Shards[i] != gid4 { 157 | cf1 := ck.Query(-1) 158 | if cf1.Num <= cf.Num { 159 | t.Fatalf("Move should increase Config.Num") 160 | } 161 | } 162 | } 163 | } 164 | cf2 := ck.Query(-1) 165 | for i := 0; i < NShards; i++ { 166 | if i < NShards/2 { 167 | if cf2.Shards[i] != gid3 { 168 | t.Fatalf("expected shard %v on gid %v actually %v", 169 | i, gid3, cf2.Shards[i]) 170 | } 171 | } else { 172 | if cf2.Shards[i] != gid4 { 173 | t.Fatalf("expected shard %v on gid %v actually %v", 174 | i, gid4, cf2.Shards[i]) 175 | } 176 | } 177 | } 178 | ck.Leave([]int{gid3}) 179 | ck.Leave([]int{gid4}) 180 | } 181 | fmt.Printf(" ... Passed\n") 182 | 183 | fmt.Printf("Test: Concurrent leave/join ...\n") 184 | 185 | const npara = 10 186 | var cka [npara]*Clerk 187 | for i := 0; i < len(cka); i++ { 188 | cka[i] = cfg.makeClient(cfg.All()) 189 | } 190 | gids := make([]int, npara) 191 | ch := make(chan bool) 192 | for xi := 0; xi < npara; xi++ { 193 | gids[xi] = int((xi * 10) + 100) 194 | go func(i int) { 195 | defer func() { ch <- true }() 196 | var gid int = gids[i] 197 | var sid1 = fmt.Sprintf("s%da", gid) 198 | var sid2 = fmt.Sprintf("s%db", gid) 199 | cka[i].Join(map[int][]string{gid + 1000: []string{sid1}}) 200 | cka[i].Join(map[int][]string{gid: []string{sid2}}) 201 | cka[i].Leave([]int{gid + 1000}) 202 | }(xi) 203 | } 204 | for i := 0; i < npara; i++ { 205 | <-ch 206 | } 207 | check(t, gids, ck) 208 | 209 | fmt.Printf(" ... Passed\n") 210 | 211 | fmt.Printf("Test: Minimal transfers after joins ...\n") 212 | 213 | c1 := ck.Query(-1) 214 | for i := 0; i < 5; i++ { 215 | var gid = int(npara + 1 + i) 216 | ck.Join(map[int][]string{gid: []string{ 217 | fmt.Sprintf("%da", gid), 218 | fmt.Sprintf("%db", gid), 219 | fmt.Sprintf("%db", gid)}}) 220 | } 221 | c2 := ck.Query(-1) 222 | for i := int(1); i <= npara; i++ { 223 | for j := 0; j < len(c1.Shards); j++ { 224 | if c2.Shards[j] == i { 225 | if c1.Shards[j] != i { 226 | t.Fatalf("non-minimal transfer after Join()s") 227 | } 228 | } 229 | } 230 | } 231 | 232 | fmt.Printf(" ... Passed\n") 233 | 234 | fmt.Printf("Test: Minimal transfers after leaves ...\n") 235 | 236 | for i := 0; i < 5; i++ { 237 | ck.Leave([]int{int(npara + 1 + i)}) 238 | } 239 | c3 := ck.Query(-1) 240 | for i := int(1); i <= npara; i++ { 241 | for j := 0; j < len(c1.Shards); j++ { 242 | if c2.Shards[j] == i { 243 | if c3.Shards[j] != i { 244 | t.Fatalf("non-minimal transfer after Leave()s") 245 | } 246 | } 247 | } 248 | } 249 | 250 | fmt.Printf(" ... 
Passed\n") 251 | } 252 | 253 | func TestMulti(t *testing.T) { 254 | const nservers = 3 255 | cfg := make_config(t, nservers, false) 256 | defer cfg.cleanup() 257 | 258 | ck := cfg.makeClient(cfg.All()) 259 | 260 | fmt.Printf("Test: Multi-group join/leave ...\n") 261 | 262 | cfa := make([]Config, 6) 263 | cfa[0] = ck.Query(-1) 264 | 265 | check(t, []int{}, ck) 266 | 267 | var gid1 int = 1 268 | var gid2 int = 2 269 | ck.Join(map[int][]string{ 270 | gid1: []string{"x", "y", "z"}, 271 | gid2: []string{"a", "b", "c"}, 272 | }) 273 | check(t, []int{gid1, gid2}, ck) 274 | cfa[1] = ck.Query(-1) 275 | 276 | var gid3 int = 3 277 | ck.Join(map[int][]string{gid3: []string{"j", "k", "l"}}) 278 | check(t, []int{gid1, gid2, gid3}, ck) 279 | cfa[2] = ck.Query(-1) 280 | 281 | cfx := ck.Query(-1) 282 | sa1 := cfx.Groups[gid1] 283 | if len(sa1) != 3 || sa1[0] != "x" || sa1[1] != "y" || sa1[2] != "z" { 284 | t.Fatalf("wrong servers for gid %v: %v\n", gid1, sa1) 285 | } 286 | sa2 := cfx.Groups[gid2] 287 | if len(sa2) != 3 || sa2[0] != "a" || sa2[1] != "b" || sa2[2] != "c" { 288 | t.Fatalf("wrong servers for gid %v: %v\n", gid2, sa2) 289 | } 290 | sa3 := cfx.Groups[gid3] 291 | if len(sa3) != 3 || sa3[0] != "j" || sa3[1] != "k" || sa3[2] != "l" { 292 | t.Fatalf("wrong servers for gid %v: %v\n", gid3, sa3) 293 | } 294 | 295 | ck.Leave([]int{gid1, gid3}) 296 | check(t, []int{gid2}, ck) 297 | cfa[3] = ck.Query(-1) 298 | 299 | cfx = ck.Query(-1) 300 | sa2 = cfx.Groups[gid2] 301 | if len(sa2) != 3 || sa2[0] != "a" || sa2[1] != "b" || sa2[2] != "c" { 302 | t.Fatalf("wrong servers for gid %v: %v\n", gid2, sa2) 303 | } 304 | 305 | ck.Leave([]int{gid2}) 306 | 307 | fmt.Printf(" ... Passed\n") 308 | 309 | fmt.Printf("Test: Concurrent multi leave/join ...\n") 310 | 311 | const npara = 10 312 | var cka [npara]*Clerk 313 | for i := 0; i < len(cka); i++ { 314 | cka[i] = cfg.makeClient(cfg.All()) 315 | } 316 | gids := make([]int, npara) 317 | var wg sync.WaitGroup 318 | for xi := 0; xi < npara; xi++ { 319 | wg.Add(1) 320 | gids[xi] = int(xi + 1000) 321 | go func(i int) { 322 | defer wg.Done() 323 | var gid int = gids[i] 324 | cka[i].Join(map[int][]string{ 325 | gid: []string{ 326 | fmt.Sprintf("%da", gid), 327 | fmt.Sprintf("%db", gid), 328 | fmt.Sprintf("%dc", gid)}, 329 | gid + 1000: []string{fmt.Sprintf("%da", gid+1000)}, 330 | gid + 2000: []string{fmt.Sprintf("%da", gid+2000)}, 331 | }) 332 | cka[i].Leave([]int{gid + 1000, gid + 2000}) 333 | }(xi) 334 | } 335 | wg.Wait() 336 | check(t, gids, ck) 337 | 338 | fmt.Printf(" ... Passed\n") 339 | 340 | fmt.Printf("Test: Minimal transfers after multijoins ...\n") 341 | 342 | c1 := ck.Query(-1) 343 | m := make(map[int][]string) 344 | for i := 0; i < 5; i++ { 345 | var gid = npara + 1 + i 346 | m[gid] = []string{fmt.Sprintf("%da", gid), fmt.Sprintf("%db", gid)} 347 | } 348 | ck.Join(m) 349 | c2 := ck.Query(-1) 350 | for i := int(1); i <= npara; i++ { 351 | for j := 0; j < len(c1.Shards); j++ { 352 | if c2.Shards[j] == i { 353 | if c1.Shards[j] != i { 354 | t.Fatalf("non-minimal transfer after Join()s") 355 | } 356 | } 357 | } 358 | } 359 | 360 | fmt.Printf(" ... 
Passed\n") 361 | 362 | fmt.Printf("Test: Minimal transfers after multileaves ...\n") 363 | 364 | var l []int 365 | for i := 0; i < 5; i++ { 366 | l = append(l, npara+1+i) 367 | } 368 | ck.Leave(l) 369 | c3 := ck.Query(-1) 370 | for i := int(1); i <= npara; i++ { 371 | for j := 0; j < len(c1.Shards); j++ { 372 | if c2.Shards[j] == i { 373 | if c3.Shards[j] != i { 374 | t.Fatalf("non-minimal transfer after Leave()s") 375 | } 376 | } 377 | } 378 | } 379 | 380 | fmt.Printf(" ... Passed\n") 381 | 382 | fmt.Printf("Test: Check Same config on servers ...\n") 383 | 384 | isLeader, leader := cfg.Leader() 385 | if !isLeader { 386 | t.Fatalf("Leader not found") 387 | } 388 | c := ck.Query(-1) // Config leader claims 389 | 390 | cfg.ShutdownServer(leader) 391 | 392 | attempts := 0 393 | for isLeader, leader = cfg.Leader(); isLeader; time.Sleep(1 * time.Second) { 394 | if attempts++; attempts >= 3 { 395 | t.Fatalf("Leader not found") 396 | } 397 | } 398 | 399 | c1 = ck.Query(-1) 400 | check_same_config(t, c, c1) 401 | 402 | fmt.Printf(" ... Passed\n") 403 | } 404 | -------------------------------------------------------------------------------- /porcupine/checker.go: -------------------------------------------------------------------------------- 1 | package porcupine 2 | 3 | import ( 4 | "sort" 5 | "sync/atomic" 6 | "time" 7 | ) 8 | 9 | type entryKind bool 10 | 11 | const ( 12 | callEntry entryKind = false 13 | returnEntry = true 14 | ) 15 | 16 | type entry struct { 17 | kind entryKind 18 | value interface{} 19 | id int 20 | time int64 21 | clientId int 22 | } 23 | 24 | type linearizationInfo struct { 25 | history [][]entry // for each partition, a list of entries 26 | partialLinearizations [][][]int // for each partition, a set of histories (list of ids) 27 | } 28 | 29 | type byTime []entry 30 | 31 | func (a byTime) Len() int { 32 | return len(a) 33 | } 34 | 35 | func (a byTime) Swap(i, j int) { 36 | a[i], a[j] = a[j], a[i] 37 | } 38 | 39 | func (a byTime) Less(i, j int) bool { 40 | if a[i].time != a[j].time { 41 | return a[i].time < a[j].time 42 | } 43 | // if the timestamps are the same, we need to make sure we order calls 44 | // before returns 45 | return a[i].kind == callEntry && a[j].kind == returnEntry 46 | } 47 | 48 | func makeEntries(history []Operation) []entry { 49 | var entries []entry = nil 50 | id := 0 51 | for _, elem := range history { 52 | entries = append(entries, entry{ 53 | callEntry, elem.Input, id, elem.Call, elem.ClientId}) 54 | entries = append(entries, entry{ 55 | returnEntry, elem.Output, id, elem.Return, elem.ClientId}) 56 | id++ 57 | } 58 | sort.Sort(byTime(entries)) 59 | return entries 60 | } 61 | 62 | type node struct { 63 | value interface{} 64 | match *node // call if match is nil, otherwise return 65 | id int 66 | next *node 67 | prev *node 68 | } 69 | 70 | func insertBefore(n *node, mark *node) *node { 71 | if mark != nil { 72 | beforeMark := mark.prev 73 | mark.prev = n 74 | n.next = mark 75 | if beforeMark != nil { 76 | n.prev = beforeMark 77 | beforeMark.next = n 78 | } 79 | } 80 | return n 81 | } 82 | 83 | func length(n *node) int { 84 | l := 0 85 | for n != nil { 86 | n = n.next 87 | l++ 88 | } 89 | return l 90 | } 91 | 92 | func renumber(events []Event) []Event { 93 | var e []Event 94 | m := make(map[int]int) // renumbering 95 | id := 0 96 | for _, v := range events { 97 | if r, ok := m[v.Id]; ok { 98 | e = append(e, Event{v.ClientId, v.Kind, v.Value, r}) 99 | } else { 100 | e = append(e, Event{v.ClientId, v.Kind, v.Value, id}) 101 | m[v.Id] = id 102 | id++ 103 
| } 104 | } 105 | return e 106 | } 107 | 108 | func convertEntries(events []Event) []entry { 109 | var entries []entry 110 | for i, elem := range events { 111 | kind := callEntry 112 | if elem.Kind == ReturnEvent { 113 | kind = returnEntry 114 | } 115 | // use index as "time" 116 | entries = append(entries, entry{kind, elem.Value, elem.Id, int64(i), elem.ClientId}) 117 | } 118 | return entries 119 | } 120 | 121 | func makeLinkedEntries(entries []entry) *node { 122 | var root *node = nil 123 | match := make(map[int]*node) 124 | for i := len(entries) - 1; i >= 0; i-- { 125 | elem := entries[i] 126 | if elem.kind == returnEntry { 127 | entry := &node{value: elem.value, match: nil, id: elem.id} 128 | match[elem.id] = entry 129 | insertBefore(entry, root) 130 | root = entry 131 | } else { 132 | entry := &node{value: elem.value, match: match[elem.id], id: elem.id} 133 | insertBefore(entry, root) 134 | root = entry 135 | } 136 | } 137 | return root 138 | } 139 | 140 | type cacheEntry struct { 141 | linearized bitset 142 | state interface{} 143 | } 144 | 145 | func cacheContains(model Model, cache map[uint64][]cacheEntry, entry cacheEntry) bool { 146 | for _, elem := range cache[entry.linearized.hash()] { 147 | if entry.linearized.equals(elem.linearized) && model.Equal(entry.state, elem.state) { 148 | return true 149 | } 150 | } 151 | return false 152 | } 153 | 154 | type callsEntry struct { 155 | entry *node 156 | state interface{} 157 | } 158 | 159 | func lift(entry *node) { 160 | entry.prev.next = entry.next 161 | entry.next.prev = entry.prev 162 | match := entry.match 163 | match.prev.next = match.next 164 | if match.next != nil { 165 | match.next.prev = match.prev 166 | } 167 | } 168 | 169 | func unlift(entry *node) { 170 | match := entry.match 171 | match.prev.next = match 172 | if match.next != nil { 173 | match.next.prev = match 174 | } 175 | entry.prev.next = entry 176 | entry.next.prev = entry 177 | } 178 | 179 | func checkSingle(model Model, history []entry, computePartial bool, kill *int32) (bool, []*[]int) { 180 | entry := makeLinkedEntries(history) 181 | n := length(entry) / 2 182 | linearized := newBitset(uint(n)) 183 | cache := make(map[uint64][]cacheEntry) // map from hash to cache entry 184 | var calls []callsEntry 185 | // longest linearizable prefix that includes the given entry 186 | longest := make([]*[]int, n) 187 | 188 | state := model.Init() 189 | headEntry := insertBefore(&node{value: nil, match: nil, id: -1}, entry) 190 | for headEntry.next != nil { 191 | if atomic.LoadInt32(kill) != 0 { 192 | return false, longest 193 | } 194 | if entry.match != nil { 195 | matching := entry.match // the return entry 196 | ok, newState := model.Step(state, entry.value, matching.value) 197 | if ok { 198 | newLinearized := linearized.clone().set(uint(entry.id)) 199 | newCacheEntry := cacheEntry{newLinearized, newState} 200 | if !cacheContains(model, cache, newCacheEntry) { 201 | hash := newLinearized.hash() 202 | cache[hash] = append(cache[hash], newCacheEntry) 203 | calls = append(calls, callsEntry{entry, state}) 204 | state = newState 205 | linearized.set(uint(entry.id)) 206 | lift(entry) 207 | entry = headEntry.next 208 | } else { 209 | entry = entry.next 210 | } 211 | } else { 212 | entry = entry.next 213 | } 214 | } else { 215 | if len(calls) == 0 { 216 | return false, longest 217 | } 218 | // longest 219 | if computePartial { 220 | callsLen := len(calls) 221 | var seq []int = nil 222 | for _, v := range calls { 223 | if longest[v.entry.id] == nil || callsLen > 
len(*longest[v.entry.id]) { 224 | // create seq lazily 225 | if seq == nil { 226 | seq = make([]int, len(calls)) 227 | for i, v := range calls { 228 | seq[i] = v.entry.id 229 | } 230 | } 231 | longest[v.entry.id] = &seq 232 | } 233 | } 234 | } 235 | callsTop := calls[len(calls)-1] 236 | entry = callsTop.entry 237 | state = callsTop.state 238 | linearized.clear(uint(entry.id)) 239 | calls = calls[:len(calls)-1] 240 | unlift(entry) 241 | entry = entry.next 242 | } 243 | } 244 | // longest linearization is the complete linearization, which is calls 245 | seq := make([]int, len(calls)) 246 | for i, v := range calls { 247 | seq[i] = v.entry.id 248 | } 249 | for i := 0; i < n; i++ { 250 | longest[i] = &seq 251 | } 252 | return true, longest 253 | } 254 | 255 | func fillDefault(model Model) Model { 256 | if model.Partition == nil { 257 | model.Partition = NoPartition 258 | } 259 | if model.PartitionEvent == nil { 260 | model.PartitionEvent = NoPartitionEvent 261 | } 262 | if model.Equal == nil { 263 | model.Equal = ShallowEqual 264 | } 265 | if model.DescribeOperation == nil { 266 | model.DescribeOperation = DefaultDescribeOperation 267 | } 268 | if model.DescribeState == nil { 269 | model.DescribeState = DefaultDescribeState 270 | } 271 | return model 272 | } 273 | 274 | func checkParallel(model Model, history [][]entry, computeInfo bool, timeout time.Duration) (CheckResult, linearizationInfo) { 275 | ok := true 276 | timedOut := false 277 | results := make(chan bool, len(history)) 278 | longest := make([][]*[]int, len(history)) 279 | kill := int32(0) 280 | for i, subhistory := range history { 281 | go func(i int, subhistory []entry) { 282 | ok, l := checkSingle(model, subhistory, computeInfo, &kill) 283 | longest[i] = l 284 | results <- ok 285 | }(i, subhistory) 286 | } 287 | var timeoutChan <-chan time.Time 288 | if timeout > 0 { 289 | timeoutChan = time.After(timeout) 290 | } 291 | count := 0 292 | loop: 293 | for { 294 | select { 295 | case result := <-results: 296 | count++ 297 | ok = ok && result 298 | if !ok && !computeInfo { 299 | atomic.StoreInt32(&kill, 1) 300 | break loop 301 | } 302 | if count >= len(history) { 303 | break loop 304 | } 305 | case <-timeoutChan: 306 | timedOut = true 307 | atomic.StoreInt32(&kill, 1) 308 | break loop // if we time out, we might get a false positive 309 | } 310 | } 311 | var info linearizationInfo 312 | if computeInfo { 313 | // make sure we've waited for all goroutines to finish, 314 | // otherwise we might race on access to longest[] 315 | for count < len(history) { 316 | <-results 317 | count++ 318 | } 319 | // return longest linearizable prefixes that include each history element 320 | partialLinearizations := make([][][]int, len(history)) 321 | for i := 0; i < len(history); i++ { 322 | var partials [][]int 323 | // turn longest into a set of unique linearizations 324 | set := make(map[*[]int]struct{}) 325 | for _, v := range longest[i] { 326 | if v != nil { 327 | set[v] = struct{}{} 328 | } 329 | } 330 | for k := range set { 331 | arr := make([]int, len(*k)) 332 | for i, v := range *k { 333 | arr[i] = v 334 | } 335 | partials = append(partials, arr) 336 | } 337 | partialLinearizations[i] = partials 338 | } 339 | info.history = history 340 | info.partialLinearizations = partialLinearizations 341 | } 342 | var result CheckResult 343 | if !ok { 344 | result = Illegal 345 | } else { 346 | if timedOut { 347 | result = Unknown 348 | } else { 349 | result = Ok 350 | } 351 | } 352 | return result, info 353 | } 354 | 355 | func checkEvents(model Model, 
history []Event, verbose bool, timeout time.Duration) (CheckResult, linearizationInfo) { 356 | model = fillDefault(model) 357 | partitions := model.PartitionEvent(history) 358 | l := make([][]entry, len(partitions)) 359 | for i, subhistory := range partitions { 360 | l[i] = convertEntries(renumber(subhistory)) 361 | } 362 | return checkParallel(model, l, verbose, timeout) 363 | } 364 | 365 | func checkOperations(model Model, history []Operation, verbose bool, timeout time.Duration) (CheckResult, linearizationInfo) { 366 | model = fillDefault(model) 367 | partitions := model.Partition(history) 368 | l := make([][]entry, len(partitions)) 369 | for i, subhistory := range partitions { 370 | l[i] = makeEntries(subhistory) 371 | } 372 | return checkParallel(model, l, verbose, timeout) 373 | } 374 | -------------------------------------------------------------------------------- /shardkv/config.go: -------------------------------------------------------------------------------- 1 | package shardkv 2 | 3 | import "6.824/shardctrler" 4 | import "6.824/labrpc" 5 | import "testing" 6 | import "os" 7 | 8 | // import "log" 9 | import crand "crypto/rand" 10 | import "math/big" 11 | import "math/rand" 12 | import "encoding/base64" 13 | import "sync" 14 | import "runtime" 15 | import "6.824/raft" 16 | import "strconv" 17 | import "fmt" 18 | import "time" 19 | 20 | func randstring(n int) string { 21 | b := make([]byte, 2*n) 22 | crand.Read(b) 23 | s := base64.URLEncoding.EncodeToString(b) 24 | return s[0:n] 25 | } 26 | 27 | func makeSeed() int64 { 28 | max := big.NewInt(int64(1) << 62) 29 | bigx, _ := crand.Int(crand.Reader, max) 30 | x := bigx.Int64() 31 | return x 32 | } 33 | 34 | // Randomize server handles 35 | func random_handles(kvh []*labrpc.ClientEnd) []*labrpc.ClientEnd { 36 | sa := make([]*labrpc.ClientEnd, len(kvh)) 37 | copy(sa, kvh) 38 | for i := range sa { 39 | j := rand.Intn(i + 1) 40 | sa[i], sa[j] = sa[j], sa[i] 41 | } 42 | return sa 43 | } 44 | 45 | type group struct { 46 | gid int 47 | servers []*ShardKV 48 | saved []*raft.Persister 49 | endnames [][]string 50 | mendnames [][]string 51 | } 52 | 53 | type config struct { 54 | mu sync.Mutex 55 | t *testing.T 56 | net *labrpc.Network 57 | start time.Time // time at which make_config() was called 58 | 59 | nctrlers int 60 | ctrlerservers []*shardctrler.ShardCtrler 61 | mck *shardctrler.Clerk 62 | 63 | ngroups int 64 | n int // servers per k/v group 65 | groups []*group 66 | 67 | clerks map[*Clerk][]string 68 | nextClientId int 69 | maxraftstate int 70 | } 71 | 72 | func (cfg *config) checkTimeout() { 73 | // enforce a two minute real-time limit on each test 74 | if !cfg.t.Failed() && time.Since(cfg.start) > 120*time.Second { 75 | cfg.t.Fatal("test took longer than 120 seconds") 76 | } 77 | } 78 | 79 | func (cfg *config) cleanup() { 80 | for gi := 0; gi < cfg.ngroups; gi++ { 81 | cfg.ShutdownGroup(gi) 82 | } 83 | for i := 0; i < cfg.nctrlers; i++ { 84 | cfg.ctrlerservers[i].Kill() 85 | } 86 | cfg.net.Cleanup() 87 | cfg.checkTimeout() 88 | } 89 | 90 | // check that no server's log is too big. 
91 | func (cfg *config) checklogs() { 92 | for gi := 0; gi < cfg.ngroups; gi++ { 93 | for i := 0; i < cfg.n; i++ { 94 | raft := cfg.groups[gi].saved[i].RaftStateSize() 95 | snap := len(cfg.groups[gi].saved[i].ReadSnapshot()) 96 | if cfg.maxraftstate >= 0 && raft > 8*cfg.maxraftstate { 97 | cfg.t.Fatalf("persister.RaftStateSize() %v, but maxraftstate %v", 98 | raft, cfg.maxraftstate) 99 | } 100 | if cfg.maxraftstate < 0 && snap > 0 { 101 | cfg.t.Fatalf("maxraftstate is -1, but snapshot is non-empty!") 102 | } 103 | } 104 | } 105 | } 106 | 107 | // controler server name for labrpc. 108 | func (cfg *config) ctrlername(i int) string { 109 | return "ctrler" + strconv.Itoa(i) 110 | } 111 | 112 | // shard server name for labrpc. 113 | // i'th server of group gid. 114 | func (cfg *config) servername(gid int, i int) string { 115 | return "server-" + strconv.Itoa(gid) + "-" + strconv.Itoa(i) 116 | } 117 | 118 | func (cfg *config) makeClient() *Clerk { 119 | cfg.mu.Lock() 120 | defer cfg.mu.Unlock() 121 | 122 | // ClientEnds to talk to controler service. 123 | ends := make([]*labrpc.ClientEnd, cfg.nctrlers) 124 | endnames := make([]string, cfg.n) 125 | for j := 0; j < cfg.nctrlers; j++ { 126 | endnames[j] = randstring(20) 127 | ends[j] = cfg.net.MakeEnd(endnames[j]) 128 | cfg.net.Connect(endnames[j], cfg.ctrlername(j)) 129 | cfg.net.Enable(endnames[j], true) 130 | } 131 | 132 | ck := MakeClerk(ends, func(servername string) *labrpc.ClientEnd { 133 | name := randstring(20) 134 | end := cfg.net.MakeEnd(name) 135 | cfg.net.Connect(name, servername) 136 | cfg.net.Enable(name, true) 137 | return end 138 | }) 139 | cfg.clerks[ck] = endnames 140 | cfg.nextClientId++ 141 | return ck 142 | } 143 | 144 | func (cfg *config) deleteClient(ck *Clerk) { 145 | cfg.mu.Lock() 146 | defer cfg.mu.Unlock() 147 | 148 | v := cfg.clerks[ck] 149 | for i := 0; i < len(v); i++ { 150 | os.Remove(v[i]) 151 | } 152 | delete(cfg.clerks, ck) 153 | } 154 | 155 | // Shutdown i'th server of gi'th group, by isolating it 156 | func (cfg *config) ShutdownServer(gi int, i int) { 157 | cfg.mu.Lock() 158 | defer cfg.mu.Unlock() 159 | 160 | gg := cfg.groups[gi] 161 | 162 | // prevent this server from sending 163 | for j := 0; j < len(gg.servers); j++ { 164 | name := gg.endnames[i][j] 165 | cfg.net.Enable(name, false) 166 | } 167 | for j := 0; j < len(gg.mendnames[i]); j++ { 168 | name := gg.mendnames[i][j] 169 | cfg.net.Enable(name, false) 170 | } 171 | 172 | // disable client connections to the server. 173 | // it's important to do this before creating 174 | // the new Persister in saved[i], to avoid 175 | // the possibility of the server returning a 176 | // positive reply to an Append but persisting 177 | // the result in the superseded Persister. 178 | cfg.net.DeleteServer(cfg.servername(gg.gid, i)) 179 | 180 | // a fresh persister, in case old instance 181 | // continues to update the Persister. 182 | // but copy old persister's content so that we always 183 | // pass Make() the last persisted state. 
184 | if gg.saved[i] != nil { 185 | gg.saved[i] = gg.saved[i].Copy() 186 | } 187 | 188 | kv := gg.servers[i] 189 | if kv != nil { 190 | cfg.mu.Unlock() 191 | kv.Kill() 192 | cfg.mu.Lock() 193 | gg.servers[i] = nil 194 | } 195 | } 196 | 197 | func (cfg *config) ShutdownGroup(gi int) { 198 | for i := 0; i < cfg.n; i++ { 199 | cfg.ShutdownServer(gi, i) 200 | } 201 | } 202 | 203 | // start i'th server in gi'th group 204 | func (cfg *config) StartServer(gi int, i int) { 205 | cfg.mu.Lock() 206 | 207 | gg := cfg.groups[gi] 208 | 209 | // a fresh set of outgoing ClientEnd names 210 | // to talk to other servers in this group. 211 | gg.endnames[i] = make([]string, cfg.n) 212 | for j := 0; j < cfg.n; j++ { 213 | gg.endnames[i][j] = randstring(20) 214 | } 215 | 216 | // and the connections to other servers in this group. 217 | ends := make([]*labrpc.ClientEnd, cfg.n) 218 | for j := 0; j < cfg.n; j++ { 219 | ends[j] = cfg.net.MakeEnd(gg.endnames[i][j]) 220 | cfg.net.Connect(gg.endnames[i][j], cfg.servername(gg.gid, j)) 221 | cfg.net.Enable(gg.endnames[i][j], true) 222 | } 223 | 224 | // ends to talk to shardctrler service 225 | mends := make([]*labrpc.ClientEnd, cfg.nctrlers) 226 | gg.mendnames[i] = make([]string, cfg.nctrlers) 227 | for j := 0; j < cfg.nctrlers; j++ { 228 | gg.mendnames[i][j] = randstring(20) 229 | mends[j] = cfg.net.MakeEnd(gg.mendnames[i][j]) 230 | cfg.net.Connect(gg.mendnames[i][j], cfg.ctrlername(j)) 231 | cfg.net.Enable(gg.mendnames[i][j], true) 232 | } 233 | 234 | // a fresh persister, so old instance doesn't overwrite 235 | // new instance's persisted state. 236 | // give the fresh persister a copy of the old persister's 237 | // state, so that the spec is that we pass StartKVServer() 238 | // the last persisted state. 239 | if gg.saved[i] != nil { 240 | gg.saved[i] = gg.saved[i].Copy() 241 | } else { 242 | gg.saved[i] = raft.MakePersister() 243 | } 244 | cfg.mu.Unlock() 245 | 246 | gg.servers[i] = StartServer(ends, i, gg.saved[i], cfg.maxraftstate, 247 | gg.gid, mends, 248 | func(servername string) *labrpc.ClientEnd { 249 | name := randstring(20) 250 | end := cfg.net.MakeEnd(name) 251 | cfg.net.Connect(name, servername) 252 | cfg.net.Enable(name, true) 253 | return end 254 | }) 255 | 256 | kvsvc := labrpc.MakeService(gg.servers[i]) 257 | rfsvc := labrpc.MakeService(gg.servers[i].rf) 258 | srv := labrpc.MakeServer() 259 | srv.AddService(kvsvc) 260 | srv.AddService(rfsvc) 261 | cfg.net.AddServer(cfg.servername(gg.gid, i), srv) 262 | } 263 | 264 | func (cfg *config) StartGroup(gi int) { 265 | for i := 0; i < cfg.n; i++ { 266 | cfg.StartServer(gi, i) 267 | } 268 | } 269 | 270 | func (cfg *config) StartCtrlerserver(i int) { 271 | // ClientEnds to talk to other controler replicas. 272 | ends := make([]*labrpc.ClientEnd, cfg.nctrlers) 273 | for j := 0; j < cfg.nctrlers; j++ { 274 | endname := randstring(20) 275 | ends[j] = cfg.net.MakeEnd(endname) 276 | cfg.net.Connect(endname, cfg.ctrlername(j)) 277 | cfg.net.Enable(endname, true) 278 | } 279 | 280 | p := raft.MakePersister() 281 | 282 | cfg.ctrlerservers[i] = shardctrler.StartServer(ends, i, p) 283 | 284 | msvc := labrpc.MakeService(cfg.ctrlerservers[i]) 285 | rfsvc := labrpc.MakeService(cfg.ctrlerservers[i].Raft()) 286 | srv := labrpc.MakeServer() 287 | srv.AddService(msvc) 288 | srv.AddService(rfsvc) 289 | cfg.net.AddServer(cfg.ctrlername(i), srv) 290 | } 291 | 292 | func (cfg *config) shardclerk() *shardctrler.Clerk { 293 | // ClientEnds to talk to ctrler service. 
294 | ends := make([]*labrpc.ClientEnd, cfg.nctrlers) 295 | for j := 0; j < cfg.nctrlers; j++ { 296 | name := randstring(20) 297 | ends[j] = cfg.net.MakeEnd(name) 298 | cfg.net.Connect(name, cfg.ctrlername(j)) 299 | cfg.net.Enable(name, true) 300 | } 301 | 302 | return shardctrler.MakeClerk(ends) 303 | } 304 | 305 | // tell the shardctrler that a group is joining. 306 | func (cfg *config) join(gi int) { 307 | cfg.joinm([]int{gi}) 308 | } 309 | 310 | func (cfg *config) joinm(gis []int) { 311 | m := make(map[int][]string, len(gis)) 312 | for _, g := range gis { 313 | gid := cfg.groups[g].gid 314 | servernames := make([]string, cfg.n) 315 | for i := 0; i < cfg.n; i++ { 316 | servernames[i] = cfg.servername(gid, i) 317 | } 318 | m[gid] = servernames 319 | } 320 | cfg.mck.Join(m) 321 | } 322 | 323 | // tell the shardctrler that a group is leaving. 324 | func (cfg *config) leave(gi int) { 325 | cfg.leavem([]int{gi}) 326 | } 327 | 328 | func (cfg *config) leavem(gis []int) { 329 | gids := make([]int, 0, len(gis)) 330 | for _, g := range gis { 331 | gids = append(gids, cfg.groups[g].gid) 332 | } 333 | cfg.mck.Leave(gids) 334 | } 335 | 336 | var ncpu_once sync.Once 337 | 338 | func make_config(t *testing.T, n int, unreliable bool, maxraftstate int) *config { 339 | ncpu_once.Do(func() { 340 | if runtime.NumCPU() < 2 { 341 | fmt.Printf("warning: only one CPU, which may conceal locking bugs\n") 342 | } 343 | rand.Seed(makeSeed()) 344 | }) 345 | runtime.GOMAXPROCS(4) 346 | cfg := &config{} 347 | cfg.t = t 348 | cfg.maxraftstate = maxraftstate 349 | cfg.net = labrpc.MakeNetwork() 350 | cfg.start = time.Now() 351 | 352 | // controler 353 | cfg.nctrlers = 3 354 | cfg.ctrlerservers = make([]*shardctrler.ShardCtrler, cfg.nctrlers) 355 | for i := 0; i < cfg.nctrlers; i++ { 356 | cfg.StartCtrlerserver(i) 357 | } 358 | cfg.mck = cfg.shardclerk() 359 | 360 | cfg.ngroups = 3 361 | cfg.groups = make([]*group, cfg.ngroups) 362 | cfg.n = n 363 | for gi := 0; gi < cfg.ngroups; gi++ { 364 | gg := &group{} 365 | cfg.groups[gi] = gg 366 | gg.gid = 100 + gi 367 | gg.servers = make([]*ShardKV, cfg.n) 368 | gg.saved = make([]*raft.Persister, cfg.n) 369 | gg.endnames = make([][]string, cfg.n) 370 | gg.mendnames = make([][]string, cfg.nctrlers) 371 | for i := 0; i < cfg.n; i++ { 372 | cfg.StartServer(gi, i) 373 | } 374 | } 375 | 376 | cfg.clerks = make(map[*Clerk][]string) 377 | cfg.nextClientId = cfg.n + 1000 // client ids start 1000 above the highest serverid 378 | 379 | cfg.net.Reliable(!unreliable) 380 | 381 | return cfg 382 | } 383 | -------------------------------------------------------------------------------- /kvraft/config.go: -------------------------------------------------------------------------------- 1 | package kvraft 2 | 3 | import "6.824/labrpc" 4 | import "testing" 5 | import "os" 6 | 7 | // import "log" 8 | import crand "crypto/rand" 9 | import "math/big" 10 | import "math/rand" 11 | import "encoding/base64" 12 | import "sync" 13 | import "runtime" 14 | import "6.824/raft" 15 | import "fmt" 16 | import "time" 17 | import "sync/atomic" 18 | 19 | func randstring(n int) string { 20 | b := make([]byte, 2*n) 21 | crand.Read(b) 22 | s := base64.URLEncoding.EncodeToString(b) 23 | return s[0:n] 24 | } 25 | 26 | func makeSeed() int64 { 27 | max := big.NewInt(int64(1) << 62) 28 | bigx, _ := crand.Int(crand.Reader, max) 29 | x := bigx.Int64() 30 | return x 31 | } 32 | 33 | // Randomize server handles 34 | func random_handles(kvh []*labrpc.ClientEnd) []*labrpc.ClientEnd { 35 | sa := make([]*labrpc.ClientEnd, 
len(kvh)) 36 | copy(sa, kvh) 37 | for i := range sa { 38 | j := rand.Intn(i + 1) 39 | sa[i], sa[j] = sa[j], sa[i] 40 | } 41 | return sa 42 | } 43 | 44 | type config struct { 45 | mu sync.Mutex 46 | t *testing.T 47 | net *labrpc.Network 48 | n int 49 | kvservers []*KVServer 50 | saved []*raft.Persister 51 | endnames [][]string // names of each server's sending ClientEnds 52 | clerks map[*Clerk][]string 53 | nextClientId int 54 | maxraftstate int 55 | start time.Time // time at which make_config() was called 56 | // begin()/end() statistics 57 | t0 time.Time // time at which test_test.go called cfg.begin() 58 | rpcs0 int // rpcTotal() at start of test 59 | ops int32 // number of clerk get/put/append method calls 60 | } 61 | 62 | func (cfg *config) checkTimeout() { 63 | // enforce a two minute real-time limit on each test 64 | if !cfg.t.Failed() && time.Since(cfg.start) > 120*time.Second { 65 | cfg.t.Fatal("test took longer than 120 seconds") 66 | } 67 | } 68 | 69 | func (cfg *config) cleanup() { 70 | cfg.mu.Lock() 71 | defer cfg.mu.Unlock() 72 | for i := 0; i < len(cfg.kvservers); i++ { 73 | if cfg.kvservers[i] != nil { 74 | cfg.kvservers[i].Kill() 75 | } 76 | } 77 | cfg.net.Cleanup() 78 | cfg.checkTimeout() 79 | } 80 | 81 | // Maximum log size across all servers 82 | func (cfg *config) LogSize() int { 83 | logsize := 0 84 | for i := 0; i < cfg.n; i++ { 85 | n := cfg.saved[i].RaftStateSize() 86 | if n > logsize { 87 | logsize = n 88 | } 89 | } 90 | return logsize 91 | } 92 | 93 | // Maximum snapshot size across all servers 94 | func (cfg *config) SnapshotSize() int { 95 | snapshotsize := 0 96 | for i := 0; i < cfg.n; i++ { 97 | n := cfg.saved[i].SnapshotSize() 98 | if n > snapshotsize { 99 | snapshotsize = n 100 | } 101 | } 102 | return snapshotsize 103 | } 104 | 105 | // attach server i to servers listed in to 106 | // caller must hold cfg.mu 107 | func (cfg *config) connectUnlocked(i int, to []int) { 108 | // log.Printf("connect peer %d to %v\n", i, to) 109 | 110 | // outgoing socket files 111 | for j := 0; j < len(to); j++ { 112 | endname := cfg.endnames[i][to[j]] 113 | cfg.net.Enable(endname, true) 114 | } 115 | 116 | // incoming socket files 117 | for j := 0; j < len(to); j++ { 118 | endname := cfg.endnames[to[j]][i] 119 | cfg.net.Enable(endname, true) 120 | } 121 | } 122 | 123 | func (cfg *config) connect(i int, to []int) { 124 | cfg.mu.Lock() 125 | defer cfg.mu.Unlock() 126 | cfg.connectUnlocked(i, to) 127 | } 128 | 129 | // detach server i from the servers listed in from 130 | // caller must hold cfg.mu 131 | func (cfg *config) disconnectUnlocked(i int, from []int) { 132 | // log.Printf("disconnect peer %d from %v\n", i, from) 133 | 134 | // outgoing socket files 135 | for j := 0; j < len(from); j++ { 136 | if cfg.endnames[i] != nil { 137 | endname := cfg.endnames[i][from[j]] 138 | cfg.net.Enable(endname, false) 139 | } 140 | } 141 | 142 | // incoming socket files 143 | for j := 0; j < len(from); j++ { 144 | if cfg.endnames[j] != nil { 145 | endname := cfg.endnames[from[j]][i] 146 | cfg.net.Enable(endname, false) 147 | } 148 | } 149 | } 150 | 151 | func (cfg *config) disconnect(i int, from []int) { 152 | cfg.mu.Lock() 153 | defer cfg.mu.Unlock() 154 | cfg.disconnectUnlocked(i, from) 155 | } 156 | 157 | func (cfg *config) All() []int { 158 | all := make([]int, cfg.n) 159 | for i := 0; i < cfg.n; i++ { 160 | all[i] = i 161 | } 162 | return all 163 | } 164 | 165 | func (cfg *config) ConnectAll() { 166 | cfg.mu.Lock() 167 | defer cfg.mu.Unlock() 168 | for i := 0; i < cfg.n; i++ { 169 | 
cfg.connectUnlocked(i, cfg.All()) 170 | } 171 | } 172 | 173 | // Sets up 2 partitions with connectivity between servers in each partition. 174 | func (cfg *config) partition(p1 []int, p2 []int) { 175 | cfg.mu.Lock() 176 | defer cfg.mu.Unlock() 177 | // log.Printf("partition servers into: %v %v\n", p1, p2) 178 | for i := 0; i < len(p1); i++ { 179 | cfg.disconnectUnlocked(p1[i], p2) 180 | cfg.connectUnlocked(p1[i], p1) 181 | } 182 | for i := 0; i < len(p2); i++ { 183 | cfg.disconnectUnlocked(p2[i], p1) 184 | cfg.connectUnlocked(p2[i], p2) 185 | } 186 | } 187 | 188 | // Create a clerk with clerk specific server names. 189 | // Give it connections to all of the servers, but for 190 | // now enable only connections to servers in to[]. 191 | func (cfg *config) makeClient(to []int) *Clerk { 192 | cfg.mu.Lock() 193 | defer cfg.mu.Unlock() 194 | 195 | // a fresh set of ClientEnds. 196 | ends := make([]*labrpc.ClientEnd, cfg.n) 197 | endnames := make([]string, cfg.n) 198 | for j := 0; j < cfg.n; j++ { 199 | endnames[j] = randstring(20) 200 | ends[j] = cfg.net.MakeEnd(endnames[j]) 201 | cfg.net.Connect(endnames[j], j) 202 | } 203 | 204 | ck := MakeClerk(random_handles(ends)) 205 | cfg.clerks[ck] = endnames 206 | cfg.nextClientId++ 207 | cfg.ConnectClientUnlocked(ck, to) 208 | return ck 209 | } 210 | 211 | func (cfg *config) deleteClient(ck *Clerk) { 212 | cfg.mu.Lock() 213 | defer cfg.mu.Unlock() 214 | 215 | v := cfg.clerks[ck] 216 | for i := 0; i < len(v); i++ { 217 | os.Remove(v[i]) 218 | } 219 | delete(cfg.clerks, ck) 220 | } 221 | 222 | // caller should hold cfg.mu 223 | func (cfg *config) ConnectClientUnlocked(ck *Clerk, to []int) { 224 | // log.Printf("ConnectClient %v to %v\n", ck, to) 225 | endnames := cfg.clerks[ck] 226 | for j := 0; j < len(to); j++ { 227 | s := endnames[to[j]] 228 | cfg.net.Enable(s, true) 229 | } 230 | } 231 | 232 | func (cfg *config) ConnectClient(ck *Clerk, to []int) { 233 | cfg.mu.Lock() 234 | defer cfg.mu.Unlock() 235 | cfg.ConnectClientUnlocked(ck, to) 236 | } 237 | 238 | // caller should hold cfg.mu 239 | func (cfg *config) DisconnectClientUnlocked(ck *Clerk, from []int) { 240 | // log.Printf("DisconnectClient %v from %v\n", ck, from) 241 | endnames := cfg.clerks[ck] 242 | for j := 0; j < len(from); j++ { 243 | s := endnames[from[j]] 244 | cfg.net.Enable(s, false) 245 | } 246 | } 247 | 248 | func (cfg *config) DisconnectClient(ck *Clerk, from []int) { 249 | cfg.mu.Lock() 250 | defer cfg.mu.Unlock() 251 | cfg.DisconnectClientUnlocked(ck, from) 252 | } 253 | 254 | // Shutdown a server by isolating it 255 | func (cfg *config) ShutdownServer(i int) { 256 | cfg.mu.Lock() 257 | defer cfg.mu.Unlock() 258 | 259 | cfg.disconnectUnlocked(i, cfg.All()) 260 | 261 | // disable client connections to the server. 262 | // it's important to do this before creating 263 | // the new Persister in saved[i], to avoid 264 | // the possibility of the server returning a 265 | // positive reply to an Append but persisting 266 | // the result in the superseded Persister. 267 | cfg.net.DeleteServer(i) 268 | 269 | // a fresh persister, in case old instance 270 | // continues to update the Persister. 271 | // but copy old persister's content so that we always 272 | // pass Make() the last persisted state. 
273 | if cfg.saved[i] != nil { 274 | cfg.saved[i] = cfg.saved[i].Copy() 275 | } 276 | 277 | kv := cfg.kvservers[i] 278 | if kv != nil { 279 | cfg.mu.Unlock() 280 | kv.Kill() 281 | cfg.mu.Lock() 282 | cfg.kvservers[i] = nil 283 | } 284 | } 285 | 286 | // If restart servers, first call ShutdownServer 287 | func (cfg *config) StartServer(i int) { 288 | cfg.mu.Lock() 289 | 290 | // a fresh set of outgoing ClientEnd names. 291 | cfg.endnames[i] = make([]string, cfg.n) 292 | for j := 0; j < cfg.n; j++ { 293 | cfg.endnames[i][j] = randstring(20) 294 | } 295 | 296 | // a fresh set of ClientEnds. 297 | ends := make([]*labrpc.ClientEnd, cfg.n) 298 | for j := 0; j < cfg.n; j++ { 299 | ends[j] = cfg.net.MakeEnd(cfg.endnames[i][j]) 300 | cfg.net.Connect(cfg.endnames[i][j], j) 301 | } 302 | 303 | // a fresh persister, so old instance doesn't overwrite 304 | // new instance's persisted state. 305 | // give the fresh persister a copy of the old persister's 306 | // state, so that the spec is that we pass StartKVServer() 307 | // the last persisted state. 308 | if cfg.saved[i] != nil { 309 | cfg.saved[i] = cfg.saved[i].Copy() 310 | } else { 311 | cfg.saved[i] = raft.MakePersister() 312 | } 313 | cfg.mu.Unlock() 314 | 315 | cfg.kvservers[i] = StartKVServer(ends, i, cfg.saved[i], cfg.maxraftstate) 316 | 317 | kvsvc := labrpc.MakeService(cfg.kvservers[i]) 318 | rfsvc := labrpc.MakeService(cfg.kvservers[i].rf) 319 | srv := labrpc.MakeServer() 320 | srv.AddService(kvsvc) 321 | srv.AddService(rfsvc) 322 | cfg.net.AddServer(i, srv) 323 | } 324 | 325 | func (cfg *config) Leader() (bool, int) { 326 | cfg.mu.Lock() 327 | defer cfg.mu.Unlock() 328 | 329 | for i := 0; i < cfg.n; i++ { 330 | _, is_leader := cfg.kvservers[i].rf.GetState() 331 | if is_leader { 332 | return true, i 333 | } 334 | } 335 | return false, 0 336 | } 337 | 338 | // Partition servers into 2 groups and put current leader in minority 339 | func (cfg *config) make_partition() ([]int, []int) { 340 | _, l := cfg.Leader() 341 | p1 := make([]int, cfg.n/2+1) 342 | p2 := make([]int, cfg.n/2) 343 | j := 0 344 | for i := 0; i < cfg.n; i++ { 345 | if i != l { 346 | if j < len(p1) { 347 | p1[j] = i 348 | } else { 349 | p2[j-len(p1)] = i 350 | } 351 | j++ 352 | } 353 | } 354 | p2[len(p2)-1] = l 355 | return p1, p2 356 | } 357 | 358 | var ncpu_once sync.Once 359 | 360 | func make_config(t *testing.T, n int, unreliable bool, maxraftstate int) *config { 361 | ncpu_once.Do(func() { 362 | if runtime.NumCPU() < 2 { 363 | fmt.Printf("warning: only one CPU, which may conceal locking bugs\n") 364 | } 365 | rand.Seed(makeSeed()) 366 | }) 367 | runtime.GOMAXPROCS(4) 368 | cfg := &config{} 369 | cfg.t = t 370 | cfg.net = labrpc.MakeNetwork() 371 | cfg.n = n 372 | cfg.kvservers = make([]*KVServer, cfg.n) 373 | cfg.saved = make([]*raft.Persister, cfg.n) 374 | cfg.endnames = make([][]string, cfg.n) 375 | cfg.clerks = make(map[*Clerk][]string) 376 | cfg.nextClientId = cfg.n + 1000 // client ids start 1000 above the highest serverid 377 | cfg.maxraftstate = maxraftstate 378 | cfg.start = time.Now() 379 | 380 | // create a full set of KV servers. 381 | for i := 0; i < cfg.n; i++ { 382 | cfg.StartServer(i) 383 | } 384 | 385 | cfg.ConnectAll() 386 | 387 | cfg.net.Reliable(!unreliable) 388 | 389 | return cfg 390 | } 391 | 392 | func (cfg *config) rpcTotal() int { 393 | return cfg.net.GetTotalCount() 394 | } 395 | 396 | // start a Test. 397 | // print the Test message. 398 | // e.g. 
cfg.begin("Test (2B): RPC counts aren't too high") 399 | func (cfg *config) begin(description string) { 400 | fmt.Printf("%s ...\n", description) 401 | cfg.t0 = time.Now() 402 | cfg.rpcs0 = cfg.rpcTotal() 403 | atomic.StoreInt32(&cfg.ops, 0) 404 | } 405 | 406 | func (cfg *config) op() { 407 | atomic.AddInt32(&cfg.ops, 1) 408 | } 409 | 410 | // end a Test -- the fact that we got here means there 411 | // was no failure. 412 | // print the Passed message, 413 | // and some performance numbers. 414 | func (cfg *config) end() { 415 | cfg.checkTimeout() 416 | if cfg.t.Failed() == false { 417 | t := time.Since(cfg.t0).Seconds() // real time 418 | npeers := cfg.n // number of Raft peers 419 | nrpc := cfg.rpcTotal() - cfg.rpcs0 // number of RPC sends 420 | ops := atomic.LoadInt32(&cfg.ops) // number of clerk get/put/append calls 421 | 422 | fmt.Printf(" ... Passed --") 423 | fmt.Printf(" %4.1f %d %5d %4d\n", t, npeers, nrpc, ops) 424 | } 425 | } 426 | -------------------------------------------------------------------------------- /shardctrler/server.go: -------------------------------------------------------------------------------- 1 | package shardctrler 2 | 3 | import ( 4 | "6.824/labgob" 5 | "6.824/raft" 6 | "sort" 7 | "time" 8 | ) 9 | import "6.824/labrpc" 10 | import "sync" 11 | 12 | const WaitCmdTimeOut = time.Millisecond * 500 // cmd执行超过这个时间,就返回timeout 13 | const MaxLockTime = time.Millisecond * 10 // debug 14 | 15 | type ShardCtrler struct { 16 | mu sync.Mutex 17 | me int 18 | rf *raft.Raft 19 | applyCh chan raft.ApplyMsg 20 | 21 | // Your data here. 22 | stopCh chan struct{} 23 | commandNotifyCh map[int64]chan CommandResult 24 | lastApplies map[int64]int64 //k-v:ClientId-CommandId 25 | 26 | configs []Config // indexed by config num 27 | 28 | //用于互斥锁 29 | lockStartTime time.Time 30 | lockEndTime time.Time 31 | lockMsg string 32 | } 33 | 34 | type CommandResult struct { 35 | Err Err 36 | Config Config 37 | } 38 | 39 | type Op struct { 40 | // Your definitions here. 41 | // Field names must start with capital letters, 42 | // otherwise RPC will break. 43 | ReqId int64 //用来标识commandNotify 44 | CommandId int64 45 | ClientId int64 46 | Args interface{} 47 | Method string 48 | } 49 | 50 | //自定义锁 51 | func (sc *ShardCtrler) lock(msg string) { 52 | sc.mu.Lock() 53 | sc.lockStartTime = time.Now() 54 | sc.lockMsg = msg 55 | } 56 | 57 | func (sc *ShardCtrler) unlock(msg string) { 58 | sc.lockEndTime = time.Now() 59 | duration := sc.lockEndTime.Sub(sc.lockStartTime) 60 | sc.lockMsg = "" 61 | sc.mu.Unlock() 62 | if duration > MaxLockTime { 63 | DPrintf("lock too long:%s:%s\n", msg, duration) 64 | } 65 | } 66 | 67 | // 68 | // the tester calls Kill() when a ShardCtrler instance won't 69 | // be needed again. you are not required to do anything 70 | // in Kill(), but it might be convenient to (for example) 71 | // turn off debug output from this instance. 72 | // 73 | func (sc *ShardCtrler) Kill() { 74 | sc.rf.Kill() 75 | close(sc.stopCh) 76 | // Your code here, if desired. 
77 | } 78 | 79 | func (sc *ShardCtrler) removeCh(reqId int64) { 80 | sc.lock("removeCh") 81 | defer sc.unlock("removeCh") 82 | delete(sc.commandNotifyCh, reqId) 83 | } 84 | 85 | func (sc *ShardCtrler) getConfigByIndex(idx int) Config { 86 | if idx < 0 || idx >= len(sc.configs) { 87 | // new configs are built by modifying an existing one, and the embedded map means a deep copy is required 88 | return sc.configs[len(sc.configs)-1].Copy() 89 | } 90 | return sc.configs[idx].Copy() 91 | } 92 | 93 | // needed by shardkv tester 94 | func (sc *ShardCtrler) Raft() *raft.Raft { 95 | return sc.rf 96 | } 97 | 98 | /* 99 | rpc 100 | */ 101 | 102 | func (sc *ShardCtrler) Join(args *JoinArgs, reply *JoinReply) { 103 | // Your code here. 104 | res := sc.waitCommand(args.ClientId, args.CommandId, "Join", *args) 105 | if res.Err == ErrWrongLeader { 106 | reply.WrongLeader = true 107 | } 108 | reply.Err = res.Err 109 | } 110 | 111 | func (sc *ShardCtrler) Leave(args *LeaveArgs, reply *LeaveReply) { 112 | res := sc.waitCommand(args.ClientId, args.CommandId, "Leave", *args) 113 | if res.Err == ErrWrongLeader { 114 | reply.WrongLeader = true 115 | } 116 | reply.Err = res.Err 117 | } 118 | 119 | func (sc *ShardCtrler) Move(args *MoveArgs, reply *MoveReply) { 120 | res := sc.waitCommand(args.ClientId, args.CommandId, "Move", *args) 121 | if res.Err == ErrWrongLeader { 122 | reply.WrongLeader = true 123 | } 124 | reply.Err = res.Err 125 | } 126 | 127 | func (sc *ShardCtrler) Query(args *QueryArgs, reply *QueryReply) { 128 | // Your code here. 129 | DPrintf("server %v query:args %+v", sc.me, args) 130 | 131 | // a query for a config that already exists can return immediately, since existing configs never change; 132 | // a query for -1 must go through handleApplyCh and be applied in command order, otherwise the answer may be stale. 133 | sc.lock("query") 134 | if args.Num >= 0 && args.Num < len(sc.configs) { 135 | reply.Err = OK 136 | reply.WrongLeader = false 137 | reply.Config = sc.getConfigByIndex(args.Num) 138 | sc.unlock("query") 139 | return 140 | } 141 | sc.unlock("query") 142 | res := sc.waitCommand(args.ClientId, args.CommandId, "Query", *args) 143 | if res.Err == ErrWrongLeader { 144 | reply.WrongLeader = true 145 | } 146 | reply.Err = res.Err 147 | reply.Config = res.Config 148 | } 149 | 150 | func (sc *ShardCtrler) waitCommand(clientId int64, commandId int64, method string, args interface{}) (res CommandResult) { 151 | DPrintf("server %v wait cmd start,clientId:%v,commandId: %v,method: %s,args: %+v", sc.me, clientId, commandId, method, args) 152 | op := Op{ 153 | ReqId: nrand(), 154 | ClientId: clientId, 155 | CommandId: commandId, 156 | Method: method, 157 | Args: args, 158 | } 159 | index, term, isLeader := sc.rf.Start(op) 160 | if !isLeader { 161 | res.Err = ErrWrongLeader 162 | DPrintf("server %v wait cmd NOT LEADER.", sc.me) 163 | return 164 | } 165 | sc.lock("waitCommand") 166 | ch := make(chan CommandResult, 1) 167 | sc.commandNotifyCh[op.ReqId] = ch 168 | sc.unlock("waitCommand") 169 | DPrintf("server %v wait cmd notify,index: %v,term: %v,op: %+v", sc.me, index, term, op) 170 | 171 | t := time.NewTimer(WaitCmdTimeOut) 172 | defer t.Stop() 173 | 174 | select { 175 | case <-t.C: 176 | res.Err = ErrTimeout 177 | case res = <-ch: 178 | case <-sc.stopCh: 179 | res.Err = ErrServer 180 | } 181 | 182 | sc.removeCh(op.ReqId) 183 | DPrintf("server %v wait cmd end,Op: %+v.", sc.me, op) 184 | return 185 | 186 | } 187 | 188 | /* 189 | configuration rebalancing 190 | */ 191 | 192 | // rebalance the shard assignment in a configuration 193 | // the strategy is to disturb the current assignment as little as possible 194 | func (sc *ShardCtrler) adjustConfig(conf *Config) { 195 | // handle the three cases separately 196 | if len(conf.Groups) == 0 { 197 | conf.Shards = [NShards]int{} 198 | } else if len(conf.Groups) == 1 { 199 | for gid, _ := range 
conf.Groups { 200 | for i, _ := range conf.Shards { 201 | conf.Shards[i] = gid 202 | } 203 | } 204 | } else if len(conf.Groups) <= NShards { 205 | // fewer groups than shards, so some groups may be assigned one or more extra shards 206 | avgShardsCount := NShards / len(conf.Groups) 207 | otherShardsCount := NShards - avgShardsCount*len(conf.Groups) 208 | isTryAgain := true 209 | 210 | for isTryAgain { 211 | isTryAgain = false 212 | DPrintf("adjust config,%+v", conf) 213 | // collect all gids 214 | var gids []int 215 | for gid, _ := range conf.Groups { 216 | gids = append(gids, gid) 217 | } 218 | sort.Ints(gids) 219 | // examine each group in turn 220 | for _, gid := range gids { 221 | count := 0 222 | for _, val := range conf.Shards { 223 | if val == gid { 224 | count++ 225 | } 226 | } 227 | 228 | // decide whether this group's assignment needs to change 229 | if count == avgShardsCount { 230 | // no change needed 231 | continue 232 | } else if count > avgShardsCount && otherShardsCount == 0 { 233 | // release the excess shards (set them back to 0) 234 | temp := 0 235 | for k, v := range conf.Shards { 236 | if gid == v { 237 | if temp < avgShardsCount { 238 | temp += 1 239 | } else { 240 | conf.Shards[k] = 0 241 | } 242 | } 243 | } 244 | } else if count > avgShardsCount && otherShardsCount > 0 { 245 | // see whether the surplus shards can all stay with this group 246 | // if they are not fully allocated, the next pass will look again 247 | // if the surplus quota is used up and the group still has too many, set the excess to 0 248 | temp := 0 249 | for k, v := range conf.Shards { 250 | if gid == v { 251 | if temp < avgShardsCount { 252 | temp += 1 253 | } else if temp == avgShardsCount && otherShardsCount != 0 { 254 | otherShardsCount -= 1 255 | } else { 256 | conf.Shards[k] = 0 257 | } 258 | } 259 | } 260 | 261 | } else { 262 | // count < avgShardsCount 263 | for k, v := range conf.Shards { 264 | if v == 0 && count < avgShardsCount { 265 | conf.Shards[k] = gid 266 | count += 1 267 | } 268 | if count == avgShardsCount { 269 | break 270 | } 271 | } 272 | // because of the adjustment order, an earlier group may not have had enough free shards to take; run another pass 273 | if count < avgShardsCount { 274 | DPrintf("adjust config try again.") 275 | isTryAgain = true 276 | continue 277 | } 278 | } 279 | } 280 | 281 | // after adjusting, every group may have reached the average while some shards are still unassigned 282 | // hand those out round-robin 283 | cur := 0 284 | for k, v := range conf.Shards { 285 | // still unassigned 286 | if v == 0 { 287 | conf.Shards[k] = gids[cur] 288 | cur += 1 289 | cur %= len(conf.Groups) 290 | } 291 | } 292 | 293 | } 294 | } else { 295 | // more groups than shards: each group gets at most one shard, and some groups get none 296 | 297 | gidsFlag := make(map[int]int) 298 | emptyShards := make([]int, 0, NShards) 299 | for k, gid := range conf.Shards { 300 | if gid == 0 { 301 | emptyShards = append(emptyShards, k) 302 | continue 303 | } 304 | if _, ok := gidsFlag[gid]; ok { 305 | conf.Shards[k] = 0 306 | emptyShards = append(emptyShards, k) 307 | } else { 308 | gidsFlag[gid] = 1 309 | } 310 | } 311 | if len(emptyShards) > 0 { 312 | var gids []int 313 | for k, _ := range conf.Groups { 314 | gids = append(gids, k) 315 | } 316 | sort.Ints(gids) 317 | temp := 0 318 | for _, gid := range gids { 319 | if _, ok := gidsFlag[gid]; !ok { 320 | conf.Shards[emptyShards[temp]] = gid 321 | temp += 1 322 | } 323 | if temp >= len(emptyShards) { 324 | break 325 | } 326 | } 327 | 328 | } 329 | } 330 | } 331 | 332 | /* 333 | applyCh handling 334 | */ 335 | 336 | func (sc *ShardCtrler) handleJoinCommand(args JoinArgs) { 337 | conf := sc.getConfigByIndex(-1) 338 | conf.Num += 1 339 | 340 | // add the new groups 341 | for k, v := range args.Servers { 342 | conf.Groups[k] = v 343 | } 344 | 345 | sc.adjustConfig(&conf) 346 | sc.configs = append(sc.configs, conf) 347 | } 348 | 349 | func (sc *ShardCtrler) handleLeaveCommand(args LeaveArgs) { 350 | conf := sc.getConfigByIndex(-1) 351 | 
conf.Num += 1 352 | 353 | // remove the departing groups and reset the shards they owned 354 | for _, gid := range args.GIDs { 355 | delete(conf.Groups, gid) 356 | for i, v := range conf.Shards { 357 | if v == gid { 358 | conf.Shards[i] = 0 359 | } 360 | } 361 | } 362 | 363 | sc.adjustConfig(&conf) 364 | sc.configs = append(sc.configs, conf) 365 | } 366 | 367 | func (sc *ShardCtrler) handleMoveCommand(args MoveArgs) { 368 | conf := sc.getConfigByIndex(-1) 369 | conf.Num += 1 370 | conf.Shards[args.Shard] = args.GID 371 | sc.configs = append(sc.configs, conf) 372 | } 373 | 374 | func (sc *ShardCtrler) notifyWaitCommand(reqId int64, err Err, conf Config) { 375 | if ch, ok := sc.commandNotifyCh[reqId]; ok { 376 | ch <- CommandResult{ 377 | Err: err, 378 | Config: conf, 379 | } 380 | } 381 | } 382 | 383 | // process messages from applyCh 384 | func (sc *ShardCtrler) handleApplyCh() { 385 | for { 386 | select { 387 | case <-sc.stopCh: 388 | DPrintf("get from stopCh,server-%v stop!", sc.me) 389 | return 390 | case cmd := <-sc.applyCh: 391 | // snapshot messages: the shardctrler keeps no snapshot, so skip them 392 | if cmd.SnapshotValid { 393 | continue 394 | } 395 | // ordinary commands 396 | if !cmd.CommandValid { 397 | continue 398 | } 399 | cmdIdx := cmd.CommandIndex 400 | DPrintf("server %v start apply command %v:%+v", sc.me, cmdIdx, cmd.Command) 401 | op := cmd.Command.(Op) 402 | sc.lock("handleApplyCh") 403 | 404 | if op.Method == "Query" { 405 | // handle reads 406 | conf := sc.getConfigByIndex(op.Args.(QueryArgs).Num) 407 | sc.notifyWaitCommand(op.ReqId, OK, conf) 408 | } else { 409 | // handle the write commands 410 | // check whether the command is a duplicate 411 | isRepeated := false 412 | if v, ok := sc.lastApplies[op.ClientId]; ok { 413 | if v == op.CommandId { 414 | isRepeated = true 415 | } 416 | } 417 | if !isRepeated { 418 | switch op.Method { 419 | case "Join": 420 | sc.handleJoinCommand(op.Args.(JoinArgs)) 421 | case "Leave": 422 | sc.handleLeaveCommand(op.Args.(LeaveArgs)) 423 | case "Move": 424 | sc.handleMoveCommand(op.Args.(MoveArgs)) 425 | default: 426 | panic("unknown method") 427 | } 428 | } 429 | sc.lastApplies[op.ClientId] = op.CommandId 430 | sc.notifyWaitCommand(op.ReqId, OK, Config{}) 431 | } 432 | 433 | DPrintf("apply op: cmdId:%d, op: %+v", cmdIdx, op) 434 | sc.unlock("handleApplyCh") 435 | } 436 | } 437 | } 438 | 439 | /* 440 | initialization 441 | */ 442 | 443 | // 444 | // servers[] contains the ports of the set of 445 | // servers that will cooperate via Raft to 446 | // form the fault-tolerant shardctrler service. 447 | // me is the index of the current server in servers[]. 448 | // 449 | func StartServer(servers []*labrpc.ClientEnd, me int, persister *raft.Persister) *ShardCtrler { 450 | labgob.Register(Op{}) 451 | 452 | sc := new(ShardCtrler) 453 | sc.me = me 454 | 455 | sc.configs = make([]Config, 1) 456 | sc.configs[0].Groups = map[int][]string{} 457 | 458 | sc.applyCh = make(chan raft.ApplyMsg) 459 | sc.rf = raft.Make(servers, me, persister, sc.applyCh) 460 | 461 | // Your code here. 462 | sc.stopCh = make(chan struct{}) 463 | sc.commandNotifyCh = make(map[int64]chan CommandResult) 464 | sc.lastApplies = make(map[int64]int64) 465 | 466 | go sc.handleApplyCh() 467 | 468 | return sc 469 | } 470 | --------------------------------------------------------------------------------
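A note on the rebalancing code above: after a Join or Leave, adjustConfig aims to leave no shard unassigned and to give every replica group either NShards/G or NShards/G+1 shards (G = number of groups, integer division), while moving as few shards as possible. The helper below is a minimal test-style sketch of how that invariant could be checked from a client. It is not part of the repository; it assumes only the Clerk.Query API, the Config Num/Shards/Groups fields, and the NShards constant that appear in the code above, and it is written as if it lived in the shardctrler package (or a _test file there).

package shardctrler

import "fmt"

// checkBalanced is a hypothetical test helper, not part of the lab code.
// It queries the latest configuration and verifies the invariant that
// adjustConfig is meant to establish after a Join or Leave: no shard is
// left unassigned, and every group owns either NShards/G or NShards/G+1
// shards. When there are more groups than shards this degenerates to 0 or
// 1, and groups owning zero shards simply do not appear in counts.
func checkBalanced(ck *Clerk) error {
	conf := ck.Query(-1) // latest configuration
	if len(conf.Groups) == 0 {
		return nil // with no groups, every shard is expected to stay 0
	}
	counts := make(map[int]int)
	for shard, gid := range conf.Shards {
		if gid == 0 {
			return fmt.Errorf("config %d: shard %d is unassigned", conf.Num, shard)
		}
		counts[gid]++
	}
	lo := NShards / len(conf.Groups)
	hi := lo
	if NShards%len(conf.Groups) != 0 {
		hi = lo + 1
	}
	for gid, c := range counts {
		if c < lo || c > hi {
			return fmt.Errorf("config %d: gid %d owns %d shards, want %d..%d",
				conf.Num, gid, c, lo, hi)
		}
	}
	return nil
}

Note that a Move can deliberately break this balance (handleMoveCommand does not call adjustConfig), so such a check only makes sense after Join/Leave operations. Separately, StartServer above registers only Op{} with labgob; since Op.Args is an interface{}, the concrete argument types (JoinArgs, LeaveArgs, MoveArgs, QueryArgs) presumably also need to be registered with labgob so that Raft can encode and persist Op values, most likely in shardctrler/common.go or client.go, which this excerpt does not show.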