├── go.sum ├── go.mod ├── .gitignore ├── main ├── viewd.go ├── pbd.go ├── test-mr-many.sh ├── mrcoordinator.go ├── lockc.go ├── lockd.go ├── pbc.go ├── mrworker.go ├── diskvd.go ├── mrsequential.go ├── test-mr-early.sh └── test-mr.sh ├── raft ├── util.go ├── persister.go ├── raft_snapshot.go ├── raft_vote.go └── raft_append_entries.go ├── mr ├── common.go ├── rpc.go ├── coordinator.go └── worker.go ├── kvraft ├── util.go ├── common.go ├── client.go ├── server.go └── config.go ├── shardctrler ├── util.go ├── client.go ├── common.go ├── config.go ├── test_test.go └── server.go ├── shardkv ├── util.go ├── server_snapshot.go ├── server_op.go ├── common.go ├── client.go ├── server_shard.go ├── server.go ├── server_apply.go └── config.go ├── mrapps ├── early_exit.go ├── nocrash.go ├── jobcount.go ├── indexer.go ├── wc.go ├── crash.go ├── rtiming.go └── mtiming.go ├── README.md ├── porcupine ├── porcupine.go ├── bitset.go ├── model.go └── checker.go ├── models └── kv.go └── labgob ├── test_test.go └── labgob.go /go.sum: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module 6.824 2 | 3 | go 1.17 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.*/ 2 | main/mr-tmp/ 3 | mrtmp.* 4 | 824-mrinput-*.txt 5 | /main/diff.out 6 | /mapreduce/x.txt 7 | /pbservice/x.txt 8 | /kvpaxos/x.txt 9 | *.so 10 | /main/mrcoordinator 11 | /main/mrsequential 12 | /main/mrworker 13 | -------------------------------------------------------------------------------- /main/viewd.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | // 4 | // see directions in pbc.go 5 | // 6 | 7 | import "time" 8 | import "6.824/viewservice" 9 | import "os" 10 | import "fmt" 11 | 12 | func main() { 13 | if len(os.Args) != 2 { 14 | fmt.Printf("Usage: viewd port\n") 15 | os.Exit(1) 16 | } 17 | 18 | viewservice.StartServer(os.Args[1]) 19 | 20 | for { 21 | time.Sleep(100 * time.Second) 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /main/pbd.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | // 4 | // see directions in pbc.go 5 | // 6 | 7 | import "time" 8 | import "6.824/pbservice" 9 | import "os" 10 | import "fmt" 11 | 12 | func main() { 13 | if len(os.Args) != 3 { 14 | fmt.Printf("Usage: pbd viewport myport\n") 15 | os.Exit(1) 16 | } 17 | 18 | pbservice.StartServer(os.Args[1], os.Args[2]) 19 | 20 | for { 21 | time.Sleep(100 * time.Second) 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /main/test-mr-many.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | if [ $# -ne 1 ]; then 4 | echo "Usage: $0 numTrials" 5 | exit 1 6 | fi 7 | 8 | trap 'kill -INT -$pid; exit 1' INT 9 | 10 | # Note: because the socketID is based on the current userID, 11 | # ./test-mr.sh cannot be run in parallel 12 | runs=$1 13 | chmod +x test-mr.sh 14 | 15 | for i in $(seq 1 $runs); do 16 | timeout -k 2s 900s ./test-mr.sh & 17 | pid=$! 18 | if ! 
wait $pid; then 19 | echo '***' FAILED TESTS IN TRIAL $i 20 | exit 1 21 | fi 22 | done 23 | echo '***' PASSED ALL $i TESTING TRIALS 24 | -------------------------------------------------------------------------------- /main/mrcoordinator.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | // 4 | // start the coordinator process, which is implemented 5 | // in ../mr/coordinator.go 6 | // 7 | // go run mrcoordinator.go pg*.txt 8 | // 9 | // Please do not change this file. 10 | // 11 | 12 | import "6.824/mr" 13 | import "time" 14 | import "os" 15 | import "fmt" 16 | 17 | func main() { 18 | if len(os.Args) < 2 { 19 | fmt.Fprintf(os.Stderr, "Usage: mrcoordinator inputfiles...\n") 20 | os.Exit(1) 21 | } 22 | 23 | m := mr.MakeCoordinator(os.Args[1:], 10) 24 | for m.Done() == false { 25 | time.Sleep(time.Second) 26 | } 27 | 28 | time.Sleep(time.Second) 29 | } 30 | -------------------------------------------------------------------------------- /main/lockc.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | // 4 | // see comments in lockd.go 5 | // 6 | 7 | import "6.824/lockservice" 8 | import "os" 9 | import "fmt" 10 | 11 | func usage() { 12 | fmt.Printf("Usage: lockc -l|-u primaryport backupport lockname\n") 13 | os.Exit(1) 14 | } 15 | 16 | func main() { 17 | if len(os.Args) == 5 { 18 | ck := lockservice.MakeClerk(os.Args[2], os.Args[3]) 19 | var ok bool 20 | if os.Args[1] == "-l" { 21 | ok = ck.Lock(os.Args[4]) 22 | } else if os.Args[1] == "-u" { 23 | ok = ck.Unlock(os.Args[4]) 24 | } else { 25 | usage() 26 | } 27 | fmt.Printf("reply: %v\n", ok) 28 | } else { 29 | usage() 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /raft/util.go: -------------------------------------------------------------------------------- 1 | package raft 2 | 3 | import ( 4 | "fmt" 5 | "log" 6 | "os" 7 | "strconv" 8 | "time" 9 | ) 10 | 11 | // Debugging 12 | const Debug = false 13 | 14 | var file *os.File 15 | 16 | func init() { 17 | f, err := os.Create("./tmp/log-" + strconv.Itoa(int(time.Now().Unix())) + ".txt") 18 | if err != nil { 19 | DPrintf("log create file fail!") 20 | } 21 | file = f 22 | } 23 | 24 | //debug下打印日志 25 | func DPrintf(format string, value ...interface{}) { 26 | now := time.Now() 27 | info := fmt.Sprintf("%v-%v-%v %v:%v:%v: ", now.Year(), int(now.Month()), now.Day(), now.Hour(), now.Minute(), now.Second()) + fmt.Sprintf(format+"\n", value...) 28 | 29 | if Debug { 30 | log.Printf(info) 31 | } else { 32 | //file.WriteString(info) 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /mr/common.go: -------------------------------------------------------------------------------- 1 | package mr 2 | 3 | import ( 4 | "fmt" 5 | "log" 6 | "math/rand" 7 | "os" 8 | "strconv" 9 | "time" 10 | ) 11 | 12 | const Debug = false 13 | 14 | var file *os.File 15 | 16 | func init() { 17 | rand.Seed(10) 18 | f, err := os.Create("log-" + strconv.Itoa(int(time.Now().Unix()+rand.Int63n(100))) + ".txt") 19 | if err != nil { 20 | DPrintf("log create file fail!") 21 | } 22 | file = f 23 | } 24 | 25 | //debug下打印日志 26 | func DPrintf(format string, value ...interface{}) { 27 | now := time.Now() 28 | info := fmt.Sprintf("%v-%v-%v %v:%v:%v: ", now.Year(), int(now.Month()), now.Day(), now.Hour(), now.Minute(), now.Second()) + fmt.Sprintf(format+"\n", value...) 
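	// Note (illustrative addition, not in the original code): the same timestamp
	// prefix could be built with Go's reference-time layout instead of assembling
	// the fields by hand, e.g.
	//
	//	prefix := now.Format("2006-01-02 15:04:05") + ": "
	//
	// time.Format zero-pads month/day/hour/minute/second, which the manual
	// fmt.Sprintf above does not.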
29 | 30 | if Debug { 31 | log.Printf(info) 32 | } else { 33 | file.WriteString(info) 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /kvraft/util.go: -------------------------------------------------------------------------------- 1 | package kvraft 2 | 3 | import ( 4 | "fmt" 5 | "log" 6 | "os" 7 | "strconv" 8 | "time" 9 | ) 10 | 11 | // Debugging 12 | const Debug = false 13 | 14 | var file *os.File 15 | 16 | func init() { 17 | f, err := os.Create("./tmp/log-" + strconv.Itoa(int(time.Now().Unix())) + ".txt") 18 | if err != nil { 19 | DPrintf("log create file fail!") 20 | fmt.Println("log create file fail!") 21 | } 22 | file = f 23 | } 24 | 25 | //debug下打印日志 26 | func DPrintf(format string, value ...interface{}) { 27 | now := time.Now() 28 | info := fmt.Sprintf("%v-%v-%v %v:%v:%v: ", now.Year(), int(now.Month()), now.Day(), now.Hour(), now.Minute(), now.Second()) + fmt.Sprintf(format+"\n", value...) 29 | 30 | if Debug { 31 | log.Printf(info) 32 | } else { 33 | file.WriteString(info) 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /shardctrler/util.go: -------------------------------------------------------------------------------- 1 | package shardctrler 2 | 3 | import ( 4 | "fmt" 5 | "log" 6 | "os" 7 | "strconv" 8 | "time" 9 | ) 10 | 11 | // Debugging 12 | const Debug = false 13 | 14 | var file *os.File 15 | 16 | func init() { 17 | f, err := os.Create("./tmp/log-" + strconv.Itoa(int(time.Now().Unix())) + ".txt") 18 | if err != nil { 19 | DPrintf("log create file fail!") 20 | fmt.Println("log create file fail!") 21 | } 22 | file = f 23 | } 24 | 25 | //debug下打印日志 26 | func DPrintf(format string, value ...interface{}) { 27 | now := time.Now() 28 | info := fmt.Sprintf("%v-%v-%v %v:%v:%v: ", now.Year(), int(now.Month()), now.Day(), now.Hour(), now.Minute(), now.Second()) + fmt.Sprintf(format+"\n", value...) 29 | 30 | if Debug { 31 | log.Printf(info) 32 | } else { 33 | file.WriteString(info) 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /shardkv/util.go: -------------------------------------------------------------------------------- 1 | package shardkv 2 | 3 | import ( 4 | "fmt" 5 | "log" 6 | "os" 7 | "strconv" 8 | "time" 9 | ) 10 | 11 | // Debugging 12 | const Debug = false 13 | 14 | var file *os.File 15 | 16 | func init() { 17 | f, err := os.Create("./tmp/log-" + strconv.Itoa(int(time.Now().Unix())) + ".txt") 18 | if err != nil { 19 | DPrintf("", "log create file fail!") 20 | fmt.Println("log create file fail!") 21 | } 22 | file = f 23 | } 24 | 25 | //debug下打印日志 26 | func DPrintf(msg, format string, value ...interface{}) { 27 | now := time.Now() 28 | info := fmt.Sprintf("%v-%v-%v %v:%v:%v: ", now.Year(), int(now.Month()), now.Day(), now.Hour(), now.Minute(), now.Second()) + msg + fmt.Sprintf(format+"\n", value...) 29 | 30 | if Debug { 31 | log.Printf(info) 32 | } else { 33 | file.WriteString(info) 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /main/lockd.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | // export GOPATH=~/6.824 4 | // go build lockd.go 5 | // go build lockc.go 6 | // ./lockd -p a b & 7 | // ./lockd -b a b & 8 | // ./lockc -l a b lx 9 | // ./lockc -u a b lx 10 | // 11 | // on Athena, use /tmp/myname-a and /tmp/myname-b 12 | // instead of a and b. 
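// Flag summary (added note): -p starts the primary lock server and -b the backup;
// both take the primary and backup ports as arguments (see the StartServer calls
// in main below).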
13 | 14 | import "time" 15 | import "6.824/lockservice" 16 | import "os" 17 | import "fmt" 18 | 19 | func main() { 20 | if len(os.Args) == 4 && os.Args[1] == "-p" { 21 | lockservice.StartServer(os.Args[2], os.Args[3], true) 22 | } else if len(os.Args) == 4 && os.Args[1] == "-b" { 23 | lockservice.StartServer(os.Args[2], os.Args[3], false) 24 | } else { 25 | fmt.Printf("Usage: lockd -p|-b primaryport backupport\n") 26 | os.Exit(1) 27 | } 28 | for { 29 | time.Sleep(100 * time.Second) 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /kvraft/common.go: -------------------------------------------------------------------------------- 1 | package kvraft 2 | 3 | const ( 4 | OK = "OK" 5 | ErrNoKey = "ErrNoKey" 6 | ErrWrongLeader = "ErrWrongLeader" 7 | ErrTimeOut = "ErrTimeOut" 8 | ErrServer = "ErrServer" 9 | ) 10 | 11 | type Err string 12 | 13 | // Put or Append 14 | type PutAppendArgs struct { 15 | Key string 16 | Value string 17 | Op string // "Put" or "Append" 18 | // You'll have to add definitions here. 19 | // Field names must start with capital letters, 20 | // otherwise RPC will break. 21 | ClientId int64 22 | CommandId int64 23 | } 24 | 25 | type PutAppendReply struct { 26 | Err Err 27 | } 28 | 29 | type GetArgs struct { 30 | Key string 31 | ClientId int64 32 | CommandId int64 33 | // You'll have to add definitions here. 34 | } 35 | 36 | type GetReply struct { 37 | Err Err 38 | Value string 39 | } 40 | -------------------------------------------------------------------------------- /main/pbc.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | // 4 | // pbservice client application 5 | // 6 | // export GOPATH=~/6.824 7 | // go build viewd.go 8 | // go build pbd.go 9 | // go build pbc.go 10 | // ./viewd /tmp/rtm-v & 11 | // ./pbd /tmp/rtm-v /tmp/rtm-1 & 12 | // ./pbd /tmp/rtm-v /tmp/rtm-2 & 13 | // ./pbc /tmp/rtm-v key1 value1 14 | // ./pbc /tmp/rtm-v key1 15 | // 16 | // change "rtm" to your user name. 17 | // start the pbd programs in separate windows and kill 18 | // and restart them to exercise fault tolerance. 19 | // 20 | 21 | import "6.824/pbservice" 22 | import "os" 23 | import "fmt" 24 | 25 | func usage() { 26 | fmt.Printf("Usage: pbc viewport key\n") 27 | fmt.Printf(" pbc viewport key value\n") 28 | os.Exit(1) 29 | } 30 | 31 | func main() { 32 | if len(os.Args) == 3 { 33 | // get 34 | ck := pbservice.MakeClerk(os.Args[1], "") 35 | v := ck.Get(os.Args[2]) 36 | fmt.Printf("%v\n", v) 37 | } else if len(os.Args) == 4 { 38 | // put 39 | ck := pbservice.MakeClerk(os.Args[1], "") 40 | ck.Put(os.Args[2], os.Args[3]) 41 | } else { 42 | usage() 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /mr/rpc.go: -------------------------------------------------------------------------------- 1 | package mr 2 | 3 | // 4 | // RPC definitions. 5 | // 6 | // remember to capitalize all names. 7 | // 8 | 9 | import "os" 10 | import "strconv" 11 | 12 | // 13 | // example to show how to declare the arguments 14 | // and reply for an RPC. 15 | // 16 | 17 | type ExampleArgs struct { 18 | X int 19 | } 20 | 21 | type ExampleReply struct { 22 | Y int 23 | } 24 | 25 | // Add your RPC definitions here. 
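// Illustrative sketch (not part of the original file): a worker sends these
// argument/reply structs to the coordinator over the UNIX-domain socket
// returned by coordinatorSock() below. Assuming "net/rpc" and "log" are
// imported and the coordinator has registered a method such as
// "Coordinator.GetTask" (name assumed here), a generic call helper might look like:
//
//	func call(rpcname string, args interface{}, reply interface{}) bool {
//		c, err := rpc.DialHTTP("unix", coordinatorSock())
//		if err != nil {
//			log.Fatal("dialing:", err)
//		}
//		defer c.Close()
//		return c.Call(rpcname, args, reply) == nil
//	}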
26 | 27 | //用于获取任务 28 | type TaskArgs struct { 29 | WorkerId int 30 | } 31 | 32 | type TaskReply struct { 33 | Task *Task 34 | } 35 | 36 | //用于worker创建后的注册 37 | type RegArgs struct { 38 | } 39 | 40 | type RegReply struct { 41 | WorkerId int 42 | } 43 | 44 | //用于worker响应任务 45 | type ReportTaskArgs struct { 46 | WorkerId int 47 | Phase TaskPhase 48 | Seq int 49 | Done bool 50 | } 51 | 52 | type ReportTaskReply struct { 53 | } 54 | 55 | // Cook up a unique-ish UNIX-domain socket name 56 | // in /var/tmp, for the coordinator. 57 | // Can't use the current directory since 58 | // Athena AFS doesn't support UNIX-domain sockets. 59 | func coordinatorSock() string { 60 | s := "/var/tmp/824-mr-" 61 | s += strconv.Itoa(os.Getuid()) 62 | return s 63 | } 64 | -------------------------------------------------------------------------------- /mrapps/early_exit.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | // 4 | // a word-count application "plugin" for MapReduce. 5 | // 6 | // go build -buildmode=plugin wc_long.go 7 | // 8 | 9 | import ( 10 | "strconv" 11 | "strings" 12 | "time" 13 | 14 | "6.824/mr" 15 | ) 16 | 17 | // 18 | // The map function is called once for each file of input. 19 | // This map function just returns 1 for each file 20 | // 21 | func Map(filename string, contents string) []mr.KeyValue { 22 | kva := []mr.KeyValue{} 23 | kva = append(kva, mr.KeyValue{filename, "1"}) 24 | return kva 25 | } 26 | 27 | // 28 | // The reduce function is called once for each key generated by the 29 | // map tasks, with a list of all the values created for that key by 30 | // any map task. 31 | // 32 | func Reduce(key string, values []string) string { 33 | // some reduce tasks sleep for a long time; potentially seeing if 34 | // a worker will accidentally exit early 35 | if strings.Contains(key, "sherlock") || strings.Contains(key, "tom") { 36 | time.Sleep(time.Duration(3 * time.Second)) 37 | } 38 | // return the number of occurrences of this file. 39 | return strconv.Itoa(len(values)) 40 | } 41 | -------------------------------------------------------------------------------- /mrapps/nocrash.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | // 4 | // same as crash.go but doesn't actually crash. 5 | // 6 | // go build -buildmode=plugin nocrash.go 7 | // 8 | 9 | import "6.824/mr" 10 | import crand "crypto/rand" 11 | import "math/big" 12 | import "strings" 13 | import "os" 14 | import "sort" 15 | import "strconv" 16 | 17 | func maybeCrash() { 18 | max := big.NewInt(1000) 19 | rr, _ := crand.Int(crand.Reader, max) 20 | if false && rr.Int64() < 500 { 21 | // crash! 22 | os.Exit(1) 23 | } 24 | } 25 | 26 | func Map(filename string, contents string) []mr.KeyValue { 27 | maybeCrash() 28 | 29 | kva := []mr.KeyValue{} 30 | kva = append(kva, mr.KeyValue{"a", filename}) 31 | kva = append(kva, mr.KeyValue{"b", strconv.Itoa(len(filename))}) 32 | kva = append(kva, mr.KeyValue{"c", strconv.Itoa(len(contents))}) 33 | kva = append(kva, mr.KeyValue{"d", "xyzzy"}) 34 | return kva 35 | } 36 | 37 | func Reduce(key string, values []string) string { 38 | maybeCrash() 39 | 40 | // sort values to ensure deterministic output. 
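	// (copying into vv first also avoids reordering the caller's values slice in place)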
41 | vv := make([]string, len(values)) 42 | copy(vv, values) 43 | sort.Strings(vv) 44 | 45 | val := strings.Join(vv, " ") 46 | return val 47 | } 48 | -------------------------------------------------------------------------------- /mrapps/jobcount.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | // 4 | // a MapReduce pseudo-application that counts the number of times map/reduce 5 | // tasks are run, to test whether jobs are assigned multiple times even when 6 | // there is no failure. 7 | // 8 | // go build -buildmode=plugin crash.go 9 | // 10 | 11 | import "6.824/mr" 12 | import "math/rand" 13 | import "strings" 14 | import "strconv" 15 | import "time" 16 | import "fmt" 17 | import "os" 18 | import "io/ioutil" 19 | 20 | var count int 21 | 22 | func Map(filename string, contents string) []mr.KeyValue { 23 | me := os.Getpid() 24 | f := fmt.Sprintf("mr-worker-jobcount-%d-%d", me, count) 25 | count++ 26 | err := ioutil.WriteFile(f, []byte("x"), 0666) 27 | if err != nil { 28 | panic(err) 29 | } 30 | time.Sleep(time.Duration(2000+rand.Intn(3000)) * time.Millisecond) 31 | return []mr.KeyValue{mr.KeyValue{"a", "x"}} 32 | } 33 | 34 | func Reduce(key string, values []string) string { 35 | files, err := ioutil.ReadDir(".") 36 | if err != nil { 37 | panic(err) 38 | } 39 | invocations := 0 40 | for _, f := range files { 41 | if strings.HasPrefix(f.Name(), "mr-worker-jobcount") { 42 | invocations++ 43 | } 44 | } 45 | return strconv.Itoa(invocations) 46 | } 47 | -------------------------------------------------------------------------------- /mrapps/indexer.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | // 4 | // an indexing application "plugin" for MapReduce. 5 | // 6 | // go build -buildmode=plugin indexer.go 7 | // 8 | 9 | import "fmt" 10 | import "6.824/mr" 11 | 12 | import "strings" 13 | import "unicode" 14 | import "sort" 15 | 16 | // The mapping function is called once for each piece of the input. 17 | // In this framework, the key is the name of the file that is being processed, 18 | // and the value is the file's contents. The return value should be a slice of 19 | // key/value pairs, each represented by a mr.KeyValue. 20 | func Map(document string, value string) (res []mr.KeyValue) { 21 | m := make(map[string]bool) 22 | words := strings.FieldsFunc(value, func(x rune) bool { return !unicode.IsLetter(x) }) 23 | for _, w := range words { 24 | m[w] = true 25 | } 26 | for w := range m { 27 | kv := mr.KeyValue{w, document} 28 | res = append(res, kv) 29 | } 30 | return 31 | } 32 | 33 | // The reduce function is called once for each key generated by Map, with a 34 | // list of that key's string value (merged across all inputs). The return value 35 | // should be a single output value for that key. 36 | func Reduce(key string, values []string) string { 37 | sort.Strings(values) 38 | return fmt.Sprintf("%d %s", len(values), strings.Join(values, ",")) 39 | } 40 | -------------------------------------------------------------------------------- /mrapps/wc.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | // 4 | // a word-count application "plugin" for MapReduce. 5 | // 6 | // go build -buildmode=plugin wc.go 7 | // 8 | 9 | import "6.824/mr" 10 | import "unicode" 11 | import "strings" 12 | import "strconv" 13 | 14 | // 15 | // The map function is called once for each file of input. 
The first 16 | // argument is the name of the input file, and the second is the 17 | // file's complete contents. You should ignore the input file name, 18 | // and look only at the contents argument. The return value is a slice 19 | // of key/value pairs. 20 | // 21 | func Map(filename string, contents string) []mr.KeyValue { 22 | // function to detect word separators. 23 | ff := func(r rune) bool { return !unicode.IsLetter(r) } 24 | 25 | // split contents into an array of words. 26 | words := strings.FieldsFunc(contents, ff) 27 | 28 | kva := []mr.KeyValue{} 29 | for _, w := range words { 30 | kv := mr.KeyValue{w, "1"} 31 | kva = append(kva, kv) 32 | } 33 | return kva 34 | } 35 | 36 | // 37 | // The reduce function is called once for each key generated by the 38 | // map tasks, with a list of all the values created for that key by 39 | // any map task. 40 | // 41 | func Reduce(key string, values []string) string { 42 | // return the number of occurrences of this word. 43 | return strconv.Itoa(len(values)) 44 | } 45 | -------------------------------------------------------------------------------- /main/mrworker.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | // 4 | // start a worker process, which is implemented 5 | // in ../mr/worker.go. typically there will be 6 | // multiple worker processes, talking to one coordinator. 7 | // 8 | // go run mrworker.go wc.so 9 | // 10 | // Please do not change this file. 11 | // 12 | 13 | import "6.824/mr" 14 | import "plugin" 15 | import "os" 16 | import "fmt" 17 | import "log" 18 | 19 | func main() { 20 | if len(os.Args) != 2 { 21 | fmt.Fprintf(os.Stderr, "Usage: mrworker xxx.so\n") 22 | os.Exit(1) 23 | } 24 | 25 | mapf, reducef := loadPlugin(os.Args[1]) 26 | 27 | mr.Worker(mapf, reducef) 28 | } 29 | 30 | // 31 | // load the application Map and Reduce functions 32 | // from a plugin file, e.g. 
../mrapps/wc.so 33 | // 34 | func loadPlugin(filename string) (func(string, string) []mr.KeyValue, func(string, []string) string) { 35 | p, err := plugin.Open(filename) 36 | if err != nil { 37 | log.Fatalf("cannot load plugin %v", filename) 38 | } 39 | xmapf, err := p.Lookup("Map") 40 | if err != nil { 41 | log.Fatalf("cannot find Map in %v", filename) 42 | } 43 | mapf := xmapf.(func(string, string) []mr.KeyValue) 44 | xreducef, err := p.Lookup("Reduce") 45 | if err != nil { 46 | log.Fatalf("cannot find Reduce in %v", filename) 47 | } 48 | reducef := xreducef.(func(string, []string) string) 49 | 50 | return mapf, reducef 51 | } 52 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## MIT 6.824 Distributed Systems 2 | [6.824 Schedule: Spring 2022](https://pdos.csail.mit.edu/6.824/schedule.html). All four labs of the course have been completed and documented: 3 | 4 | - Lab01-MapReduce: https://blog.csdn.net/qq_44766883/article/details/124475672 5 | - Lab02-Raft: 6 | - Part 2A (leader election): https://blog.csdn.net/qq_44766883/article/details/126255117 7 | - Part 2B (log replication): https://blog.csdn.net/qq_44766883/article/details/126255214 8 | - Part 2C (persistence): https://blog.csdn.net/qq_44766883/article/details/126255266 9 | - Part 2D (log compaction): https://blog.csdn.net/qq_44766883/article/details/126255298 10 | - Lab03-Fault-tolerant KV Service: 11 | - Part 3A (Key/value service without snapshots): https://blog.csdn.net/qq_44766883/article/details/126333690 12 | - Part 3B (Key/value service with snapshots): https://blog.csdn.net/qq_44766883/article/details/126333739 13 | - Lab04-Sharded KV Service: 14 | - Part 4A (The Shard controller): https://blog.csdn.net/qq_44766883/article/details/126430294 15 | - Part 4B (Sharded Key/Value Server): https://blog.csdn.net/qq_44766883/article/details/126430452 16 | 17 | Related material: 18 | - https://github.com/maemual/raft-zh_cn/blob/master/raft-zh_cn.md 19 | - https://raft.github.io/raft.pdf 20 | - https://pdos.csail.mit.edu/6.824/papers/mapreduce.pdf 21 | - https://github.com/OneSizeFitsQuorum/MIT6.824-2021 22 | - https://github.com/chaozh/MIT-6.824 23 | - https://www.bilibili.com/video/av87684880 24 | - https://shimo.im/docs/xwqvh3kGppJKvHvX?fallback=1 -------------------------------------------------------------------------------- /mrapps/crash.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | // 4 | // a MapReduce pseudo-application that sometimes crashes, 5 | // and sometimes takes a long time, 6 | // to test MapReduce's ability to recover. 7 | // 8 | // go build -buildmode=plugin crash.go 9 | // 10 | 11 | import "6.824/mr" 12 | import crand "crypto/rand" 13 | import "math/big" 14 | import "strings" 15 | import "os" 16 | import "sort" 17 | import "strconv" 18 | import "time" 19 | 20 | func maybeCrash() { 21 | max := big.NewInt(1000) 22 | rr, _ := crand.Int(crand.Reader, max) 23 | if rr.Int64() < 330 { 24 | // crash! 25 | os.Exit(1) 26 | } else if rr.Int64() < 660 { 27 | // delay for a while.
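		// (the delay below is uniform in [0, 10) seconds, drawn from crypto/rand)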
28 | maxms := big.NewInt(10 * 1000) 29 | ms, _ := crand.Int(crand.Reader, maxms) 30 | time.Sleep(time.Duration(ms.Int64()) * time.Millisecond) 31 | } 32 | } 33 | 34 | func Map(filename string, contents string) []mr.KeyValue { 35 | maybeCrash() 36 | 37 | kva := []mr.KeyValue{} 38 | kva = append(kva, mr.KeyValue{"a", filename}) 39 | kva = append(kva, mr.KeyValue{"b", strconv.Itoa(len(filename))}) 40 | kva = append(kva, mr.KeyValue{"c", strconv.Itoa(len(contents))}) 41 | kva = append(kva, mr.KeyValue{"d", "xyzzy"}) 42 | return kva 43 | } 44 | 45 | func Reduce(key string, values []string) string { 46 | maybeCrash() 47 | 48 | // sort values to ensure deterministic output. 49 | vv := make([]string, len(values)) 50 | copy(vv, values) 51 | sort.Strings(vv) 52 | 53 | val := strings.Join(vv, " ") 54 | return val 55 | } 56 | -------------------------------------------------------------------------------- /porcupine/porcupine.go: -------------------------------------------------------------------------------- 1 | package porcupine 2 | 3 | import "time" 4 | 5 | func CheckOperations(model Model, history []Operation) bool { 6 | res, _ := checkOperations(model, history, false, 0) 7 | return res == Ok 8 | } 9 | 10 | // timeout = 0 means no timeout 11 | // if this operation times out, then a false positive is possible 12 | func CheckOperationsTimeout(model Model, history []Operation, timeout time.Duration) CheckResult { 13 | res, _ := checkOperations(model, history, false, timeout) 14 | return res 15 | } 16 | 17 | // timeout = 0 means no timeout 18 | // if this operation times out, then a false positive is possible 19 | func CheckOperationsVerbose(model Model, history []Operation, timeout time.Duration) (CheckResult, linearizationInfo) { 20 | return checkOperations(model, history, true, timeout) 21 | } 22 | 23 | func CheckEvents(model Model, history []Event) bool { 24 | res, _ := checkEvents(model, history, false, 0) 25 | return res == Ok 26 | } 27 | 28 | // timeout = 0 means no timeout 29 | // if this operation times out, then a false positive is possible 30 | func CheckEventsTimeout(model Model, history []Event, timeout time.Duration) CheckResult { 31 | res, _ := checkEvents(model, history, false, timeout) 32 | return res 33 | } 34 | 35 | // timeout = 0 means no timeout 36 | // if this operation times out, then a false positive is possible 37 | func CheckEventsVerbose(model Model, history []Event, timeout time.Duration) (CheckResult, linearizationInfo) { 38 | return checkEvents(model, history, true, timeout) 39 | } 40 | -------------------------------------------------------------------------------- /porcupine/bitset.go: -------------------------------------------------------------------------------- 1 | package porcupine 2 | 3 | import "math/bits" 4 | 5 | type bitset []uint64 6 | 7 | // data layout: 8 | // bits 0-63 are in data[0], the next are in data[1], etc. 
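// For example, bit 70 lives in data[1] at bit position 6 (70/64 = 1, 70%64 = 6),
// which is what bitsetIndex below computes. Illustrative use:
//
//	b := newBitset(128) // two uint64 chunks
//	b.set(70)
//	_ = b.get(70) // true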
9 | 10 | func newBitset(bits uint) bitset { 11 | extra := uint(0) 12 | if bits%64 != 0 { 13 | extra = 1 14 | } 15 | chunks := bits/64 + extra 16 | return bitset(make([]uint64, chunks)) 17 | } 18 | 19 | func (b bitset) clone() bitset { 20 | dataCopy := make([]uint64, len(b)) 21 | copy(dataCopy, b) 22 | return bitset(dataCopy) 23 | } 24 | 25 | func bitsetIndex(pos uint) (uint, uint) { 26 | return pos / 64, pos % 64 27 | } 28 | 29 | func (b bitset) set(pos uint) bitset { 30 | major, minor := bitsetIndex(pos) 31 | b[major] |= (1 << minor) 32 | return b 33 | } 34 | 35 | func (b bitset) clear(pos uint) bitset { 36 | major, minor := bitsetIndex(pos) 37 | b[major] &^= (1 << minor) 38 | return b 39 | } 40 | 41 | func (b bitset) get(pos uint) bool { 42 | major, minor := bitsetIndex(pos) 43 | return b[major]&(1< get, 1 => put, 2 => append 9 | Key string 10 | Value string 11 | } 12 | 13 | type KvOutput struct { 14 | Value string 15 | } 16 | 17 | var KvModel = porcupine.Model{ 18 | Partition: func(history []porcupine.Operation) [][]porcupine.Operation { 19 | m := make(map[string][]porcupine.Operation) 20 | for _, v := range history { 21 | key := v.Input.(KvInput).Key 22 | m[key] = append(m[key], v) 23 | } 24 | keys := make([]string, 0, len(m)) 25 | for k := range m { 26 | keys = append(keys, k) 27 | } 28 | sort.Strings(keys) 29 | ret := make([][]porcupine.Operation, 0, len(keys)) 30 | for _, k := range keys { 31 | ret = append(ret, m[k]) 32 | } 33 | return ret 34 | }, 35 | Init: func() interface{} { 36 | // note: we are modeling a single key's value here; 37 | // we're partitioning by key, so this is okay 38 | return "" 39 | }, 40 | Step: func(state, input, output interface{}) (bool, interface{}) { 41 | inp := input.(KvInput) 42 | out := output.(KvOutput) 43 | st := state.(string) 44 | if inp.Op == 0 { 45 | // get 46 | return out.Value == st, state 47 | } else if inp.Op == 1 { 48 | // put 49 | return true, inp.Value 50 | } else { 51 | // append 52 | return true, (st + inp.Value) 53 | } 54 | }, 55 | DescribeOperation: func(input, output interface{}) string { 56 | inp := input.(KvInput) 57 | out := output.(KvOutput) 58 | switch inp.Op { 59 | case 0: 60 | return fmt.Sprintf("get('%s') -> '%s'", inp.Key, out.Value) 61 | case 1: 62 | return fmt.Sprintf("put('%s', '%s')", inp.Key, inp.Value) 63 | case 2: 64 | return fmt.Sprintf("append('%s', '%s')", inp.Key, inp.Value) 65 | default: 66 | return "" 67 | } 68 | }, 69 | } 70 | -------------------------------------------------------------------------------- /main/diskvd.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | // 4 | // start a diskvd server. it's a member of some replica 5 | // group, which has other members, and it needs to know 6 | // how to talk to the members of the shardmaster service. 7 | // used by ../diskv/test_test.go 8 | // 9 | // arguments: 10 | // -g groupid 11 | // -m masterport1 -m masterport2 ... 12 | // -s replicaport1 -s replicaport2 ... 13 | // -i my-index-in-server-port-list 14 | // -u unreliable 15 | // -d directory 16 | // -r restart 17 | 18 | import "time" 19 | import "6.824/diskv" 20 | import "os" 21 | import "fmt" 22 | import "strconv" 23 | import "runtime" 24 | 25 | func usage() { 26 | fmt.Printf("Usage: diskvd -g gid -m master... -s server... 
-i my-index -d dir\n") 27 | os.Exit(1) 28 | } 29 | 30 | func main() { 31 | var gid int64 = -1 // my replica group ID 32 | masters := []string{} // ports of shardmasters 33 | replicas := []string{} // ports of servers in my replica group 34 | me := -1 // my index in replicas[] 35 | unreliable := false 36 | dir := "" // store persistent data here 37 | restart := false 38 | 39 | for i := 1; i+1 < len(os.Args); i += 2 { 40 | a0 := os.Args[i] 41 | a1 := os.Args[i+1] 42 | if a0 == "-g" { 43 | gid, _ = strconv.ParseInt(a1, 10, 64) 44 | } else if a0 == "-m" { 45 | masters = append(masters, a1) 46 | } else if a0 == "-s" { 47 | replicas = append(replicas, a1) 48 | } else if a0 == "-i" { 49 | me, _ = strconv.Atoi(a1) 50 | } else if a0 == "-u" { 51 | unreliable, _ = strconv.ParseBool(a1) 52 | } else if a0 == "-d" { 53 | dir = a1 54 | } else if a0 == "-r" { 55 | restart, _ = strconv.ParseBool(a1) 56 | } else { 57 | usage() 58 | } 59 | } 60 | 61 | if gid < 0 || me < 0 || len(masters) < 1 || me >= len(replicas) || dir == "" { 62 | usage() 63 | } 64 | 65 | runtime.GOMAXPROCS(4) 66 | 67 | srv := diskv.StartServer(gid, masters, replicas, me, dir, restart) 68 | srv.Setunreliable(unreliable) 69 | 70 | // for safety, force quit after 10 minutes. 71 | time.Sleep(10 * 60 * time.Second) 72 | mep, _ := os.FindProcess(os.Getpid()) 73 | mep.Kill() 74 | } 75 | -------------------------------------------------------------------------------- /mrapps/rtiming.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | // 4 | // a MapReduce pseudo-application to test that workers 5 | // execute reduce tasks in parallel. 6 | // 7 | // go build -buildmode=plugin rtiming.go 8 | // 9 | 10 | import "6.824/mr" 11 | import "fmt" 12 | import "os" 13 | import "syscall" 14 | import "time" 15 | import "io/ioutil" 16 | 17 | func nparallel(phase string) int { 18 | // create a file so that other workers will see that 19 | // we're running at the same time as them. 20 | pid := os.Getpid() 21 | myfilename := fmt.Sprintf("mr-worker-%s-%d", phase, pid) 22 | err := ioutil.WriteFile(myfilename, []byte("x"), 0666) 23 | if err != nil { 24 | panic(err) 25 | } 26 | 27 | // are any other workers running? 28 | // find their PIDs by scanning directory for mr-worker-XXX files. 29 | dd, err := os.Open(".") 30 | if err != nil { 31 | panic(err) 32 | } 33 | names, err := dd.Readdirnames(1000000) 34 | if err != nil { 35 | panic(err) 36 | } 37 | ret := 0 38 | for _, name := range names { 39 | var xpid int 40 | pat := fmt.Sprintf("mr-worker-%s-%%d", phase) 41 | n, err := fmt.Sscanf(name, pat, &xpid) 42 | if n == 1 && err == nil { 43 | err := syscall.Kill(xpid, 0) 44 | if err == nil { 45 | // if err == nil, xpid is alive. 
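					// (kill with signal 0 delivers no signal; it only checks that the
					// process exists and that we are allowed to signal it)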
46 | ret += 1 47 | } 48 | } 49 | } 50 | dd.Close() 51 | 52 | time.Sleep(1 * time.Second) 53 | 54 | err = os.Remove(myfilename) 55 | if err != nil { 56 | panic(err) 57 | } 58 | 59 | return ret 60 | } 61 | 62 | func Map(filename string, contents string) []mr.KeyValue { 63 | 64 | kva := []mr.KeyValue{} 65 | kva = append(kva, mr.KeyValue{"a", "1"}) 66 | kva = append(kva, mr.KeyValue{"b", "1"}) 67 | kva = append(kva, mr.KeyValue{"c", "1"}) 68 | kva = append(kva, mr.KeyValue{"d", "1"}) 69 | kva = append(kva, mr.KeyValue{"e", "1"}) 70 | kva = append(kva, mr.KeyValue{"f", "1"}) 71 | kva = append(kva, mr.KeyValue{"g", "1"}) 72 | kva = append(kva, mr.KeyValue{"h", "1"}) 73 | kva = append(kva, mr.KeyValue{"i", "1"}) 74 | kva = append(kva, mr.KeyValue{"j", "1"}) 75 | return kva 76 | } 77 | 78 | func Reduce(key string, values []string) string { 79 | n := nparallel("reduce") 80 | 81 | val := fmt.Sprintf("%d", n) 82 | 83 | return val 84 | } 85 | -------------------------------------------------------------------------------- /shardkv/server_snapshot.go: -------------------------------------------------------------------------------- 1 | package shardkv 2 | 3 | import ( 4 | "6.824/labgob" 5 | "6.824/shardctrler" 6 | "bytes" 7 | "log" 8 | ) 9 | 10 | //保存快照 11 | func (kv *ShardKV) saveSnapshot(logIndex int) { 12 | //判断条件,满足一定的日志量才能进行持久化 13 | if kv.maxraftstate == -1 || kv.persister.RaftStateSize() < kv.maxraftstate { 14 | return 15 | } 16 | 17 | //生成快照数据 18 | w := new(bytes.Buffer) 19 | e := labgob.NewEncoder(w) 20 | if e.Encode(kv.data) != nil || 21 | e.Encode(kv.lastApplies) != nil || 22 | e.Encode(kv.inputShards) != nil || 23 | e.Encode(kv.outputShards) != nil || 24 | e.Encode(kv.config) != nil || 25 | e.Encode(kv.oldConfig) != nil || 26 | e.Encode(kv.meShards) != nil { 27 | panic("gen snapshot data encode err") 28 | } 29 | data := w.Bytes() 30 | kv.rf.Snapshot(logIndex, data) 31 | } 32 | 33 | //读取快照 34 | //两处调用:初始化阶段;收到Snapshot命令,即接收了leader的Snapshot 35 | func (kv *ShardKV) readPersist(isInit bool, snapshotTerm, snapshotIndex int, data []byte) { 36 | if data == nil || len(data) < 1 { 37 | return 38 | } 39 | //只要不是初始化调用,即如果收到一个Snapshot命令,就要执行该函数 40 | //不知道为什么,只要在ShardKV中调用该函数,就会导致测试一直阻塞,就算该函数为空也没办法通过,只能注释掉,将CondInstallSnapshot的逻辑写到InstallSnapshot RPC的处理代码中 41 | //if !isInit { 42 | // res := kv.rf.CondInstallSnapshot(snapshotTerm, snapshotIndex, data) 43 | // if !res { 44 | // log.Panicln("kv read persist err in CondInstallSnapshot!") 45 | // return 46 | // } 47 | //} 48 | //对数据进行同步 49 | r := bytes.NewBuffer(data) 50 | d := labgob.NewDecoder(r) 51 | var kvData [shardctrler.NShards]map[string]string 52 | var lastApplies [shardctrler.NShards]map[int64]int64 53 | var inputShards map[int]bool 54 | var outputShards map[int]map[int]MergeShardData 55 | var config shardctrler.Config 56 | var oldConfig shardctrler.Config 57 | var meShards map[int]bool 58 | 59 | if d.Decode(&kvData) != nil || 60 | d.Decode(&lastApplies) != nil || 61 | d.Decode(&inputShards) != nil || 62 | d.Decode(&outputShards) != nil || 63 | d.Decode(&config) != nil || 64 | d.Decode(&oldConfig) != nil || 65 | d.Decode(&meShards) != nil { 66 | log.Fatal("kv read persist err") 67 | } else { 68 | kv.data = kvData 69 | kv.lastApplies = lastApplies 70 | kv.inputShards = inputShards 71 | kv.outputShards = outputShards 72 | kv.config = config 73 | kv.oldConfig = oldConfig 74 | kv.meShards = meShards 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /mrapps/mtiming.go: 
-------------------------------------------------------------------------------- 1 | package main 2 | 3 | // 4 | // a MapReduce pseudo-application to test that workers 5 | // execute map tasks in parallel. 6 | // 7 | // go build -buildmode=plugin mtiming.go 8 | // 9 | 10 | import "6.824/mr" 11 | import "strings" 12 | import "fmt" 13 | import "os" 14 | import "syscall" 15 | import "time" 16 | import "sort" 17 | import "io/ioutil" 18 | 19 | func nparallel(phase string) int { 20 | // create a file so that other workers will see that 21 | // we're running at the same time as them. 22 | pid := os.Getpid() 23 | myfilename := fmt.Sprintf("mr-worker-%s-%d", phase, pid) 24 | err := ioutil.WriteFile(myfilename, []byte("x"), 0666) 25 | if err != nil { 26 | panic(err) 27 | } 28 | 29 | // are any other workers running? 30 | // find their PIDs by scanning directory for mr-worker-XXX files. 31 | dd, err := os.Open(".") 32 | if err != nil { 33 | panic(err) 34 | } 35 | names, err := dd.Readdirnames(1000000) 36 | if err != nil { 37 | panic(err) 38 | } 39 | ret := 0 40 | for _, name := range names { 41 | var xpid int 42 | pat := fmt.Sprintf("mr-worker-%s-%%d", phase) 43 | n, err := fmt.Sscanf(name, pat, &xpid) 44 | if n == 1 && err == nil { 45 | err := syscall.Kill(xpid, 0) 46 | if err == nil { 47 | // if err == nil, xpid is alive. 48 | ret += 1 49 | } 50 | } 51 | } 52 | dd.Close() 53 | 54 | time.Sleep(1 * time.Second) 55 | 56 | err = os.Remove(myfilename) 57 | if err != nil { 58 | panic(err) 59 | } 60 | 61 | return ret 62 | } 63 | 64 | func Map(filename string, contents string) []mr.KeyValue { 65 | t0 := time.Now() 66 | ts := float64(t0.Unix()) + (float64(t0.Nanosecond()) / 1000000000.0) 67 | pid := os.Getpid() 68 | 69 | n := nparallel("map") 70 | 71 | kva := []mr.KeyValue{} 72 | kva = append(kva, mr.KeyValue{ 73 | fmt.Sprintf("times-%v", pid), 74 | fmt.Sprintf("%.1f", ts)}) 75 | kva = append(kva, mr.KeyValue{ 76 | fmt.Sprintf("parallel-%v", pid), 77 | fmt.Sprintf("%d", n)}) 78 | return kva 79 | } 80 | 81 | func Reduce(key string, values []string) string { 82 | //n := nparallel("reduce") 83 | 84 | // sort values to ensure deterministic output. 85 | vv := make([]string, len(values)) 86 | copy(vv, values) 87 | sort.Strings(vv) 88 | 89 | val := strings.Join(vv, " ") 90 | return val 91 | } 92 | -------------------------------------------------------------------------------- /shardkv/server_op.go: -------------------------------------------------------------------------------- 1 | package shardkv 2 | 3 | import ( 4 | "time" 5 | ) 6 | 7 | type Op struct { 8 | // Your definitions here. 9 | // Field names must start with capital letters, 10 | // otherwise RPC will break. 11 | ReqId int64 //用来标识commandNotify 12 | CommandId int64 13 | ClientId int64 14 | Key string 15 | Value string 16 | Method string 17 | ConfigNum int 18 | } 19 | 20 | type CommandResult struct { 21 | Err Err 22 | Value string 23 | } 24 | 25 | func (kv *ShardKV) removeCh(reqId int64) { 26 | kv.lock("removeCh") 27 | if _, ok := kv.commandNotifyCh[reqId]; ok { 28 | delete(kv.commandNotifyCh, reqId) 29 | } 30 | kv.unlock("removeCh") 31 | } 32 | 33 | /* 34 | Get和PutAppend RPC的处理 35 | */ 36 | 37 | func (kv *ShardKV) Get(args *GetArgs, reply *GetReply) { 38 | // Your code here. 39 | res := kv.waitCommand(args.ClientId, args.CommandId, "Get", args.Key, "", args.ConfigNum) 40 | reply.Err = res.Err 41 | reply.Value = res.Value 42 | } 43 | 44 | func (kv *ShardKV) PutAppend(args *PutAppendArgs, reply *PutAppendReply) { 45 | // Your code here. 
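	// Put and Append funnel through the same waitCommand path as Get: the op is
	// proposed to Raft via Start() and the handler blocks until it is applied,
	// times out, or the server shuts down (see waitCommand below).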
46 | res := kv.waitCommand(args.ClientId, args.CommandId, args.Op, args.Key, args.Value, args.ConfigNum) 47 | reply.Err = res.Err 48 | } 49 | 50 | func (kv *ShardKV) waitCommand(clientId int64, commandId int64, method, key, value string, configNum int) (res CommandResult) { 51 | kv.log("wait cmd start,clientId:%d,commandId: %d,method: %s,key-value:%s %s,configNum %d", clientId, commandId, method, key, value, configNum) 52 | op := Op{ 53 | ReqId: nrand(), 54 | ClientId: clientId, 55 | CommandId: commandId, 56 | Method: method, 57 | Key: key, 58 | ConfigNum: configNum, 59 | Value: value, 60 | } 61 | index, term, isLeader := kv.rf.Start(op) 62 | if !isLeader { 63 | res.Err = ErrWrongLeader 64 | kv.log("wait cmd NOT LEADER.") 65 | return 66 | } 67 | kv.lock("waitCommand") 68 | ch := make(chan CommandResult, 1) 69 | kv.commandNotifyCh[op.ReqId] = ch 70 | kv.unlock("waitCommand") 71 | kv.log("wait cmd notify,index: %v,term: %v,op: %+v", index, term, op) 72 | t := time.NewTimer(WaitCmdTimeOut) 73 | defer t.Stop() 74 | 75 | select { 76 | case <-t.C: 77 | res.Err = ErrTimeOut 78 | case res = <-ch: 79 | case <-kv.stopCh: 80 | res.Err = ErrServer 81 | } 82 | 83 | kv.removeCh(op.ReqId) 84 | kv.log("wait cmd end,Op: %+v.res:%+v", op, res) 85 | return 86 | 87 | } 88 | -------------------------------------------------------------------------------- /porcupine/model.go: -------------------------------------------------------------------------------- 1 | package porcupine 2 | 3 | import "fmt" 4 | 5 | type Operation struct { 6 | ClientId int // optional, unless you want a visualization; zero-indexed 7 | Input interface{} 8 | Call int64 // invocation time 9 | Output interface{} 10 | Return int64 // response time 11 | } 12 | 13 | type EventKind bool 14 | 15 | const ( 16 | CallEvent EventKind = false 17 | ReturnEvent EventKind = true 18 | ) 19 | 20 | type Event struct { 21 | ClientId int // optional, unless you want a visualization; zero-indexed 22 | Kind EventKind 23 | Value interface{} 24 | Id int 25 | } 26 | 27 | type Model struct { 28 | // Partition functions, such that a history is linearizable if and only 29 | // if each partition is linearizable. If you don't want to implement 30 | // this, you can always use the `NoPartition` functions implemented 31 | // below. 32 | Partition func(history []Operation) [][]Operation 33 | PartitionEvent func(history []Event) [][]Event 34 | // Initial state of the system. 35 | Init func() interface{} 36 | // Step function for the system. Returns whether or not the system 37 | // could take this step with the given inputs and outputs and also 38 | // returns the new state. This should not mutate the existing state. 39 | Step func(state interface{}, input interface{}, output interface{}) (bool, interface{}) 40 | // Equality on states. If you are using a simple data type for states, 41 | // you can use the `ShallowEqual` function implemented below. 42 | Equal func(state1, state2 interface{}) bool 43 | // For visualization, describe an operation as a string. 44 | // For example, "Get('x') -> 'y'". 45 | DescribeOperation func(input interface{}, output interface{}) string 46 | // For visualization purposes, describe a state as a string. 
47 | // For example, "{'x' -> 'y', 'z' -> 'w'}" 48 | DescribeState func(state interface{}) string 49 | } 50 | 51 | func NoPartition(history []Operation) [][]Operation { 52 | return [][]Operation{history} 53 | } 54 | 55 | func NoPartitionEvent(history []Event) [][]Event { 56 | return [][]Event{history} 57 | } 58 | 59 | func ShallowEqual(state1, state2 interface{}) bool { 60 | return state1 == state2 61 | } 62 | 63 | func DefaultDescribeOperation(input interface{}, output interface{}) string { 64 | return fmt.Sprintf("%v -> %v", input, output) 65 | } 66 | 67 | func DefaultDescribeState(state interface{}) string { 68 | return fmt.Sprintf("%v", state) 69 | } 70 | 71 | type CheckResult string 72 | 73 | const ( 74 | Unknown CheckResult = "Unknown" // timed out 75 | Ok = "Ok" 76 | Illegal = "Illegal" 77 | ) 78 | -------------------------------------------------------------------------------- /shardctrler/client.go: -------------------------------------------------------------------------------- 1 | package shardctrler 2 | 3 | // 4 | // Shardctrler clerk. 5 | // 6 | 7 | import "6.824/labrpc" 8 | import "time" 9 | import "crypto/rand" 10 | import "math/big" 11 | 12 | type Clerk struct { 13 | servers []*labrpc.ClientEnd 14 | // Your data here. 15 | clientId int64 16 | } 17 | 18 | func nrand() int64 { 19 | max := big.NewInt(int64(1) << 62) 20 | bigx, _ := rand.Int(rand.Reader, max) 21 | x := bigx.Int64() 22 | return x 23 | } 24 | 25 | func MakeClerk(servers []*labrpc.ClientEnd) *Clerk { 26 | ck := new(Clerk) 27 | ck.servers = servers 28 | // Your code here. 29 | ck.clientId = nrand() 30 | return ck 31 | } 32 | 33 | func (ck *Clerk) Query(num int) Config { 34 | args := &QueryArgs{} 35 | // Your code here. 36 | args.Num = num 37 | args.ClientId = ck.clientId 38 | args.CommandId = nrand() 39 | for { 40 | // try each known server. 41 | for _, srv := range ck.servers { 42 | var reply QueryReply 43 | ok := srv.Call("ShardCtrler.Query", args, &reply) 44 | if ok && reply.WrongLeader == false { 45 | return reply.Config 46 | } 47 | } 48 | time.Sleep(100 * time.Millisecond) 49 | } 50 | } 51 | 52 | func (ck *Clerk) Join(servers map[int][]string) { 53 | args := &JoinArgs{} 54 | // Your code here. 55 | args.Servers = servers 56 | args.ClientId = ck.clientId 57 | args.CommandId = nrand() 58 | 59 | for { 60 | // try each known server. 61 | for _, srv := range ck.servers { 62 | var reply JoinReply 63 | ok := srv.Call("ShardCtrler.Join", args, &reply) 64 | if ok && reply.WrongLeader == false { 65 | return 66 | } 67 | } 68 | time.Sleep(100 * time.Millisecond) 69 | } 70 | } 71 | 72 | func (ck *Clerk) Leave(gids []int) { 73 | args := &LeaveArgs{} 74 | // Your code here. 75 | args.GIDs = gids 76 | args.ClientId = ck.clientId 77 | args.CommandId = nrand() 78 | 79 | for { 80 | // try each known server. 81 | for _, srv := range ck.servers { 82 | var reply LeaveReply 83 | ok := srv.Call("ShardCtrler.Leave", args, &reply) 84 | if ok && reply.WrongLeader == false { 85 | return 86 | } 87 | } 88 | time.Sleep(100 * time.Millisecond) 89 | } 90 | } 91 | 92 | func (ck *Clerk) Move(shard int, gid int) { 93 | args := &MoveArgs{} 94 | // Your code here. 95 | args.Shard = shard 96 | args.GID = gid 97 | args.ClientId = ck.clientId 98 | args.CommandId = nrand() 99 | 100 | for { 101 | // try each known server. 
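		// (same leader-hunting pattern as Query/Join/Leave above: ask every
		// server, and if none acknowledges as leader, back off 100ms and retry)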
102 | for _, srv := range ck.servers { 103 | var reply MoveReply 104 | ok := srv.Call("ShardCtrler.Move", args, &reply) 105 | if ok && reply.WrongLeader == false { 106 | return 107 | } 108 | } 109 | time.Sleep(100 * time.Millisecond) 110 | } 111 | } 112 | -------------------------------------------------------------------------------- /shardctrler/common.go: -------------------------------------------------------------------------------- 1 | package shardctrler 2 | 3 | import ( 4 | "6.824/labgob" 5 | ) 6 | 7 | // 8 | // Shard controler: assigns shards to replication groups. 9 | // 10 | // RPC interface: 11 | // Join(servers) -- add a set of groups (gid -> server-list mapping). 12 | // Leave(gids) -- delete a set of groups. 13 | // Move(shard, gid) -- hand off one shard from current owner to gid. 14 | // Query(num) -> fetch Config # num, or latest config if num==-1. 15 | // 16 | // A Config (configuration) describes a set of replica groups, and the 17 | // replica group responsible for each shard. Configs are numbered. Config 18 | // #0 is the initial configuration, with no groups and all shards 19 | // assigned to group 0 (the invalid group). 20 | // 21 | // You will need to add fields to the RPC argument structs. 22 | // 23 | 24 | type Err string 25 | 26 | // The number of shards. 27 | const NShards = 10 28 | 29 | //状态码 30 | const ( 31 | OK = "OK" 32 | ErrWrongLeader = "wrongLeader" 33 | ErrTimeout = "timeout" 34 | ErrServer = "ErrServer" 35 | ) 36 | 37 | //必须注册才能进行解码和编码 38 | func init() { 39 | labgob.Register(Config{}) 40 | labgob.Register(QueryArgs{}) 41 | labgob.Register(QueryReply{}) 42 | labgob.Register(JoinArgs{}) 43 | labgob.Register(JoinReply{}) 44 | labgob.Register(LeaveArgs{}) 45 | labgob.Register(MoveArgs{}) 46 | labgob.Register(LeaveReply{}) 47 | labgob.Register(MoveReply{}) 48 | } 49 | 50 | // A configuration -- an assignment of shards to groups. 51 | // Please don't change this. 52 | //保存配置信息 53 | type Config struct { 54 | Num int // config number,当前配置的编号 55 | Shards [NShards]int // shard -> gid,每一个分片到replica group id的映射 56 | Groups map[int][]string // gid -> servers[],每一个replica group包含哪些server 57 | } 58 | 59 | type ClientCommandId struct { 60 | ClientId int64 61 | CommandId int64 62 | } 63 | 64 | type JoinArgs struct { 65 | Servers map[int][]string // new GID -> servers mappings 66 | ClientCommandId 67 | } 68 | 69 | type JoinReply struct { 70 | WrongLeader bool 71 | Err Err 72 | } 73 | 74 | type LeaveArgs struct { 75 | GIDs []int 76 | ClientCommandId 77 | } 78 | 79 | type LeaveReply struct { 80 | WrongLeader bool 81 | Err Err 82 | } 83 | 84 | type MoveArgs struct { 85 | Shard int 86 | GID int 87 | ClientCommandId 88 | } 89 | 90 | type MoveReply struct { 91 | WrongLeader bool 92 | Err Err 93 | } 94 | 95 | type QueryArgs struct { 96 | Num int // desired config number 97 | ClientCommandId 98 | } 99 | 100 | type QueryReply struct { 101 | WrongLeader bool 102 | Err Err 103 | Config Config 104 | } 105 | 106 | func (c *Config) Copy() Config { 107 | config := Config{ 108 | Num: c.Num, 109 | Shards: c.Shards, 110 | Groups: make(map[int][]string), 111 | } 112 | for gid, s := range c.Groups { 113 | config.Groups[gid] = append([]string{}, s...) 114 | } 115 | return config 116 | } 117 | -------------------------------------------------------------------------------- /main/mrsequential.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | // 4 | // simple sequential MapReduce. 
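// (reads every input file, calls Map on each, sorts all intermediate
// key/value pairs by key, then calls Reduce once per distinct key and
// writes the results to mr-out-0; see main below)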
5 | // 6 | // go run mrsequential.go wc.so pg*.txt 7 | // 8 | 9 | import "fmt" 10 | import "6.824/mr" 11 | import "plugin" 12 | import "os" 13 | import "log" 14 | import "io/ioutil" 15 | import "sort" 16 | 17 | // for sorting by key. 18 | type ByKey []mr.KeyValue 19 | 20 | // for sorting by key. 21 | func (a ByKey) Len() int { return len(a) } 22 | func (a ByKey) Swap(i, j int) { a[i], a[j] = a[j], a[i] } 23 | func (a ByKey) Less(i, j int) bool { return a[i].Key < a[j].Key } 24 | 25 | func main() { 26 | if len(os.Args) < 3 { 27 | fmt.Fprintf(os.Stderr, "Usage: mrsequential xxx.so inputfiles...\n") 28 | os.Exit(1) 29 | } 30 | 31 | mapf, reducef := loadPlugin(os.Args[1]) 32 | 33 | // 34 | // read each input file, 35 | // pass it to Map, 36 | // accumulate the intermediate Map output. 37 | // 38 | intermediate := []mr.KeyValue{} 39 | for _, filename := range os.Args[2:] { 40 | file, err := os.Open(filename) 41 | if err != nil { 42 | log.Fatalf("cannot open %v", filename) 43 | } 44 | content, err := ioutil.ReadAll(file) 45 | if err != nil { 46 | log.Fatalf("cannot read %v", filename) 47 | } 48 | file.Close() 49 | kva := mapf(filename, string(content)) 50 | intermediate = append(intermediate, kva...) 51 | } 52 | 53 | // 54 | // a big difference from real MapReduce is that all the 55 | // intermediate data is in one place, intermediate[], 56 | // rather than being partitioned into NxM buckets. 57 | // 58 | 59 | sort.Sort(ByKey(intermediate)) 60 | 61 | oname := "mr-out-0" 62 | ofile, _ := os.Create(oname) 63 | 64 | // 65 | // call Reduce on each distinct key in intermediate[], 66 | // and print the result to mr-out-0. 67 | // 68 | i := 0 69 | for i < len(intermediate) { 70 | j := i + 1 71 | for j < len(intermediate) && intermediate[j].Key == intermediate[i].Key { 72 | j++ 73 | } 74 | values := []string{} 75 | for k := i; k < j; k++ { 76 | values = append(values, intermediate[k].Value) 77 | } 78 | output := reducef(intermediate[i].Key, values) 79 | 80 | // this is the correct format for each line of Reduce output. 81 | fmt.Fprintf(ofile, "%v %v\n", intermediate[i].Key, output) 82 | 83 | i = j 84 | } 85 | 86 | ofile.Close() 87 | } 88 | 89 | // 90 | // load the application Map and Reduce functions 91 | // from a plugin file, e.g. ../mrapps/wc.so 92 | // 93 | func loadPlugin(filename string) (func(string, string) []mr.KeyValue, func(string, []string) string) { 94 | p, err := plugin.Open(filename) 95 | if err != nil { 96 | log.Fatalf("cannot load plugin %v", filename) 97 | } 98 | xmapf, err := p.Lookup("Map") 99 | if err != nil { 100 | log.Fatalf("cannot find Map in %v", filename) 101 | } 102 | mapf := xmapf.(func(string, string) []mr.KeyValue) 103 | xreducef, err := p.Lookup("Reduce") 104 | if err != nil { 105 | log.Fatalf("cannot find Reduce in %v", filename) 106 | } 107 | reducef := xreducef.(func(string, []string) string) 108 | 109 | return mapf, reducef 110 | } 111 | -------------------------------------------------------------------------------- /shardkv/common.go: -------------------------------------------------------------------------------- 1 | package shardkv 2 | 3 | import "6.824/labgob" 4 | 5 | // 6 | // Sharded key/value server. 7 | // Lots of replica groups, each running Raft. 8 | // Shardctrler decides which group serves each shard. 9 | // Shardctrler may change shard assignment from time to time. 10 | // 11 | // You will have to modify these definitions. 
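// Illustrative note (the actual mapping lives in the client/server code, which
// is not shown in this file): a key is typically assigned to one of the
// shardctrler.NShards shards by its first byte, e.g.
//
//	func key2shard(key string) int {
//		shard := 0
//		if len(key) > 0 {
//			shard = int(key[0])
//		}
//		return shard % shardctrler.NShards
//	}
//
// and Config.Shards[shard] then names the replica group responsible for that key.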
12 | // 13 | 14 | //回复状态码 15 | const ( 16 | OK = "OK" 17 | ErrNoKey = "ErrNoKey" 18 | ErrWrongGroup = "ErrWrongGroup" 19 | ErrWrongLeader = "ErrWrongLeader" 20 | ErrTimeOut = "ErrTimeOut" 21 | ErrServer = "ErrServer" 22 | ) 23 | 24 | type Err string 25 | 26 | //主要是applyCh的处理中,ApplyMsg的Command是一个interface,因此要向labgob注册具体实现才能进行编解码 27 | func init() { 28 | //labgob.Register(PutAppendArgs{}) 29 | //labgob.Register(PutAppendReply{}) 30 | //labgob.Register(GetArgs{}) 31 | //labgob.Register(GetReply{}) 32 | //labgob.Register(FetchShardDataArgs{}) 33 | //labgob.Register(FetchShardDataReply{}) 34 | labgob.Register(CleanShardDataArgs{}) 35 | //labgob.Register(CleanShardDataReply{}) 36 | labgob.Register(MergeShardData{}) 37 | } 38 | 39 | // Put or Append 40 | type PutAppendArgs struct { 41 | // You'll have to add definitions here. 42 | Key string 43 | Value string 44 | Op string // "Put" or "Append" 45 | // You'll have to add definitions here. 46 | // Field names must start with capital letters, 47 | // otherwise RPC will break. 48 | ClientId int64 49 | CommandId int64 50 | ConfigNum int 51 | } 52 | 53 | type PutAppendReply struct { 54 | Err Err 55 | } 56 | 57 | func (c *PutAppendArgs) copy() PutAppendArgs { 58 | r := PutAppendArgs{ 59 | Key: c.Key, 60 | Value: c.Value, 61 | Op: c.Op, 62 | ClientId: c.ClientId, 63 | CommandId: c.CommandId, 64 | ConfigNum: c.ConfigNum, 65 | } 66 | return r 67 | } 68 | 69 | type GetArgs struct { 70 | Key string 71 | // You'll have to add definitions here. 72 | ClientId int64 73 | CommandId int64 74 | ConfigNum int 75 | } 76 | 77 | type GetReply struct { 78 | Err Err 79 | Value string 80 | } 81 | 82 | func (c *GetArgs) copy() GetArgs { 83 | r := GetArgs{ 84 | Key: c.Key, 85 | ClientId: c.ClientId, 86 | CommandId: c.CommandId, 87 | ConfigNum: c.ConfigNum, 88 | } 89 | return r 90 | } 91 | 92 | //用于向目标节点获取input shard 93 | type FetchShardDataArgs struct { 94 | ConfigNum int 95 | ShardNum int 96 | } 97 | 98 | type FetchShardDataReply struct { 99 | Success bool 100 | CommandIndexes map[int64]int64 101 | Data map[string]string 102 | } 103 | 104 | func (reply *FetchShardDataReply) Copy() FetchShardDataReply { 105 | res := FetchShardDataReply{ 106 | Success: reply.Success, 107 | Data: make(map[string]string), 108 | CommandIndexes: make(map[int64]int64), 109 | } 110 | for k, v := range reply.Data { 111 | res.Data[k] = v 112 | } 113 | for k, v := range reply.CommandIndexes { 114 | res.CommandIndexes[k] = v 115 | } 116 | return res 117 | } 118 | 119 | //用于请求目标节点清除指定的output shard 120 | type CleanShardDataArgs struct { 121 | ConfigNum int 122 | ShardNum int 123 | } 124 | 125 | type CleanShardDataReply struct { 126 | Success bool 127 | } 128 | 129 | //用于存储output shard的数据,以及充当input shard在apply的命令 130 | type MergeShardData struct { 131 | ConfigNum int 132 | ShardNum int 133 | CommandIndexes map[int64]int64 //当前shard的所有客户端的最后一条命令id 134 | Data map[string]string 135 | } 136 | -------------------------------------------------------------------------------- /main/test-mr-early.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | 4 | # 5 | # map-reduce tests 6 | # 7 | 8 | # comment this out to run the tests without the Go race detector. 9 | RACE=-race 10 | 11 | if [[ "$OSTYPE" = "darwin"* ]] 12 | then 13 | if go version | grep 'go1.17.[012345]' 14 | then 15 | # -race with plug-ins on x86 MacOS 12 with 16 | # go1.17 before 1.17.6 sometimes crash. 
17 | RACE= 18 | echo '*** Turning off -race since it may not work on a Mac' 19 | echo ' with ' `go version` 20 | fi 21 | fi 22 | 23 | TIMEOUT=timeout 24 | if timeout 2s sleep 1 > /dev/null 2>&1 25 | then 26 | : 27 | else 28 | if gtimeout 2s sleep 1 > /dev/null 2>&1 29 | then 30 | TIMEOUT=gtimeout 31 | else 32 | # no timeout command 33 | TIMEOUT= 34 | echo '*** Cannot find timeout command; proceeding without timeouts.' 35 | fi 36 | fi 37 | if [ "$TIMEOUT" != "" ] 38 | then 39 | TIMEOUT+=" -k 2s 180s " 40 | fi 41 | 42 | # run the test in a fresh sub-directory. 43 | rm -rf mr-tmp 44 | mkdir mr-tmp || exit 1 45 | cd mr-tmp || exit 1 46 | rm -f mr-* 47 | 48 | # make sure software is freshly built. 49 | (cd ../../mrapps && go clean) 50 | (cd .. && go clean) 51 | (cd ../../mrapps && go build $RACE -buildmode=plugin wc.go) || exit 1 52 | (cd ../../mrapps && go build $RACE -buildmode=plugin indexer.go) || exit 1 53 | (cd ../../mrapps && go build $RACE -buildmode=plugin mtiming.go) || exit 1 54 | (cd ../../mrapps && go build $RACE -buildmode=plugin rtiming.go) || exit 1 55 | (cd ../../mrapps && go build $RACE -buildmode=plugin jobcount.go) || exit 1 56 | (cd ../../mrapps && go build $RACE -buildmode=plugin early_exit.go) || exit 1 57 | (cd ../../mrapps && go build $RACE -buildmode=plugin crash.go) || exit 1 58 | (cd ../../mrapps && go build $RACE -buildmode=plugin nocrash.go) || exit 1 59 | (cd .. && go build $RACE mrcoordinator.go) || exit 1 60 | (cd .. && go build $RACE mrworker.go) || exit 1 61 | (cd .. && go build $RACE mrsequential.go) || exit 1 62 | 63 | failed_any=0 64 | 65 | ######################################################### 66 | 67 | ######################################################### 68 | # test whether any worker or coordinator exits before the 69 | # task has completed (i.e., all output files have been finalized) 70 | rm -f mr-* 71 | 72 | echo '***' Starting early exit test. 73 | 74 | DF=anydone$$ 75 | rm -f $DF 76 | 77 | ($TIMEOUT ../mrcoordinator ../pg*txt ; touch $DF) & 78 | 79 | # give the coordinator time to create the sockets. 80 | sleep 1 81 | 82 | # start multiple workers. 83 | ($TIMEOUT ../mrworker ../../mrapps/early_exit.so ; touch $DF) & 84 | sleep 1 85 | ($TIMEOUT ../mrworker ../../mrapps/early_exit.so ; touch $DF) & 86 | sleep 1 87 | ($TIMEOUT ../mrworker ../../mrapps/early_exit.so ; touch $DF) & 88 | 89 | # wait for any of the coord or workers to exit. 90 | # `jobs` ensures that any completed old processes from other tests 91 | # are not waited upon. 92 | jobs &> /dev/null 93 | if [[ "$OSTYPE" = "darwin"* ]] 94 | then 95 | # bash on the Mac doesn't have wait -n 96 | while [ ! -e $DF ] 97 | do 98 | sleep 0.2 99 | done 100 | else 101 | # the -n causes wait to wait for just one child process, 102 | # rather than waiting for all to finish. 103 | wait -n 104 | fi 105 | 106 | rm -f $DF 107 | 108 | # a process has exited. this means that the output should be finalized 109 | # otherwise, either a worker or the coordinator exited early 110 | sort mr-out* | grep . > mr-wc-all-initial 111 | echo 112 | # wait for remaining workers and coordinator to exit. 113 | wait 114 | 115 | # compare initial and final outputs 116 | sort mr-out* | grep . 
> mr-wc-all-final 117 | if cmp mr-wc-all-final mr-wc-all-initial 118 | then 119 | echo '---' early exit test: PASS 120 | else 121 | echo '---' output changed after first worker exited 122 | echo '---' early exit test: FAIL 123 | failed_any=1 124 | fi 125 | #rm -f mr-* -------------------------------------------------------------------------------- /raft/raft_snapshot.go: -------------------------------------------------------------------------------- 1 | package raft 2 | 3 | import "time" 4 | 5 | type InstallSnapshotArgs struct { 6 | Term int 7 | LeaderId int 8 | LastIncludedIndex int 9 | LastIncludedTerm int 10 | //Offset int 11 | Data []byte 12 | //Done bool 13 | } 14 | 15 | type InstallSnapshotReply struct { 16 | Term int 17 | } 18 | 19 | func (rf *Raft) InstallSnapshot(args *InstallSnapshotArgs, reply *InstallSnapshotReply) { 20 | rf.mu.Lock() 21 | defer rf.mu.Unlock() 22 | 23 | reply.Term = rf.currentTerm 24 | if rf.currentTerm > args.Term { 25 | return 26 | } 27 | 28 | if args.Term > rf.currentTerm || rf.role != Role_Follower { 29 | rf.changeRole(Role_Follower) 30 | rf.votedFor = -1 31 | rf.currentTerm = args.Term 32 | rf.resetElectionTimer() 33 | rf.persist() 34 | } 35 | 36 | //如果自身快照包含的最后一个日志>=leader快照包含的最后一个日志,就没必要接受了 37 | if rf.lastSnapshotIndex >= args.LastIncludedIndex { 38 | return 39 | } 40 | 41 | /********以下内容和CondInstallSnapshot的操作是相同的,因为不知道为什么在lab4B中只要调用CondInstallSnapshot函数就会陷入阻塞,因此将操作逻辑复制到这里一份,lab4中就没有调用CondInstallSnapshot函数了***********/ 42 | 43 | lastIncludedIndex := args.LastIncludedIndex 44 | lastIncludedTerm := args.LastIncludedTerm 45 | _, lastIndex := rf.getLastLogTermAndIndex() 46 | if lastIncludedIndex > lastIndex { 47 | rf.logs = make([]LogEntry, 1) 48 | } else { 49 | installLen := lastIncludedIndex - rf.lastSnapshotIndex 50 | rf.logs = rf.logs[installLen:] 51 | rf.logs[0].Command = nil 52 | } 53 | //0处是空日志,代表了快照日志的标记 54 | rf.logs[0].Term = lastIncludedTerm 55 | 56 | rf.lastSnapshotIndex, rf.lastSnapshotTerm = lastIncludedIndex, lastIncludedTerm 57 | rf.lastApplied, rf.commitIndex = lastIncludedIndex, lastIncludedIndex 58 | //保存快照和状态 59 | rf.persister.SaveStateAndSnapshot(rf.getPersistData(), args.Data) 60 | 61 | /***********************************/ 62 | 63 | //接收发来的快照,并提交一个命令处理 64 | rf.applyCh <- ApplyMsg{ 65 | SnapshotValid: true, 66 | Snapshot: args.Data, 67 | SnapshotTerm: args.LastIncludedTerm, 68 | SnapshotIndex: args.LastIncludedIndex, 69 | } 70 | 71 | } 72 | 73 | //向指定节点发送快照 74 | func (rf *Raft) sendInstallSnapshotToPeer(server int) { 75 | rf.mu.Lock() 76 | args := InstallSnapshotArgs{ 77 | Term: rf.currentTerm, 78 | LeaderId: rf.me, 79 | LastIncludedIndex: rf.lastSnapshotIndex, 80 | LastIncludedTerm: rf.lastSnapshotTerm, 81 | Data: rf.persister.ReadSnapshot(), 82 | } 83 | rf.mu.Unlock() 84 | 85 | timer := time.NewTimer(RPCTimeout) 86 | defer timer.Stop() 87 | DPrintf("%v role: %v, send snapshot to peer,%v,args = %+v,reply = %+v", rf.me, rf.role, server, args) 88 | 89 | for { 90 | timer.Stop() 91 | timer.Reset(RPCTimeout) 92 | 93 | ch := make(chan bool, 1) 94 | reply := &InstallSnapshotReply{} 95 | go func() { 96 | ok := rf.peers[server].Call("Raft.InstallSnapshot", &args, reply) 97 | if !ok { 98 | time.Sleep(time.Millisecond * 10) 99 | } 100 | ch <- ok 101 | }() 102 | 103 | select { 104 | case <-rf.stopCh: 105 | return 106 | case <-timer.C: 107 | DPrintf("%v role: %v, send snapshot to peer %v TIME OUT!!!", rf.me, rf.role, server) 108 | continue 109 | case ok := <-ch: 110 | if !ok { 111 | continue 112 | } 113 | } 114 | 115 | rf.mu.Lock() 116 | defer 
rf.mu.Unlock() 117 | if rf.role != Role_Leader || args.Term != rf.currentTerm { 118 | return 119 | } 120 | if reply.Term > rf.currentTerm { 121 | rf.changeRole(Role_Follower) 122 | rf.currentTerm = reply.Term 123 | rf.resetElectionTimer() 124 | rf.persist() 125 | return 126 | } 127 | 128 | if args.LastIncludedIndex > rf.matchIndex[server] { 129 | rf.matchIndex[server] = args.LastIncludedIndex 130 | } 131 | if args.LastIncludedIndex+1 > rf.nextIndex[server] { 132 | rf.nextIndex[server] = args.LastIncludedIndex + 1 133 | } 134 | return 135 | } 136 | } 137 | -------------------------------------------------------------------------------- /labgob/test_test.go: -------------------------------------------------------------------------------- 1 | package labgob 2 | 3 | import "testing" 4 | 5 | import "bytes" 6 | 7 | type T1 struct { 8 | T1int0 int 9 | T1int1 int 10 | T1string0 string 11 | T1string1 string 12 | } 13 | 14 | type T2 struct { 15 | T2slice []T1 16 | T2map map[int]*T1 17 | T2t3 interface{} 18 | } 19 | 20 | type T3 struct { 21 | T3int999 int 22 | } 23 | 24 | // 25 | // test that we didn't break GOB. 26 | // 27 | func TestGOB(t *testing.T) { 28 | e0 := errorCount 29 | 30 | w := new(bytes.Buffer) 31 | 32 | Register(T3{}) 33 | 34 | { 35 | x0 := 0 36 | x1 := 1 37 | t1 := T1{} 38 | t1.T1int1 = 1 39 | t1.T1string1 = "6.824" 40 | t2 := T2{} 41 | t2.T2slice = []T1{T1{}, t1} 42 | t2.T2map = map[int]*T1{} 43 | t2.T2map[99] = &T1{1, 2, "x", "y"} 44 | t2.T2t3 = T3{999} 45 | 46 | e := NewEncoder(w) 47 | e.Encode(x0) 48 | e.Encode(x1) 49 | e.Encode(t1) 50 | e.Encode(t2) 51 | } 52 | data := w.Bytes() 53 | 54 | { 55 | var x0 int 56 | var x1 int 57 | var t1 T1 58 | var t2 T2 59 | 60 | r := bytes.NewBuffer(data) 61 | d := NewDecoder(r) 62 | if d.Decode(&x0) != nil || 63 | d.Decode(&x1) != nil || 64 | d.Decode(&t1) != nil || 65 | d.Decode(&t2) != nil { 66 | t.Fatalf("Decode failed") 67 | } 68 | 69 | if x0 != 0 { 70 | t.Fatalf("wrong x0 %v\n", x0) 71 | } 72 | if x1 != 1 { 73 | t.Fatalf("wrong x1 %v\n", x1) 74 | } 75 | if t1.T1int0 != 0 { 76 | t.Fatalf("wrong t1.T1int0 %v\n", t1.T1int0) 77 | } 78 | if t1.T1int1 != 1 { 79 | t.Fatalf("wrong t1.T1int1 %v\n", t1.T1int1) 80 | } 81 | if t1.T1string0 != "" { 82 | t.Fatalf("wrong t1.T1string0 %v\n", t1.T1string0) 83 | } 84 | if t1.T1string1 != "6.824" { 85 | t.Fatalf("wrong t1.T1string1 %v\n", t1.T1string1) 86 | } 87 | if len(t2.T2slice) != 2 { 88 | t.Fatalf("wrong t2.T2slice len %v\n", len(t2.T2slice)) 89 | } 90 | if t2.T2slice[1].T1int1 != 1 { 91 | t.Fatalf("wrong slice value\n") 92 | } 93 | if len(t2.T2map) != 1 { 94 | t.Fatalf("wrong t2.T2map len %v\n", len(t2.T2map)) 95 | } 96 | if t2.T2map[99].T1string1 != "y" { 97 | t.Fatalf("wrong map value\n") 98 | } 99 | t3 := (t2.T2t3).(T3) 100 | if t3.T3int999 != 999 { 101 | t.Fatalf("wrong t2.T2t3.T3int999\n") 102 | } 103 | } 104 | 105 | if errorCount != e0 { 106 | t.Fatalf("there were errors, but should not have been") 107 | } 108 | } 109 | 110 | type T4 struct { 111 | Yes int 112 | no int 113 | } 114 | 115 | // 116 | // make sure we check capitalization 117 | // labgob prints one warning during this test. 
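// (the unexported field `no` of T4 above is what trips the check: labgob
// prints its "lower-case field ... will break your Raft" complaint and
// increments errorCount, which is exactly what this test asserts.)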
118 | // 119 | func TestCapital(t *testing.T) { 120 | e0 := errorCount 121 | 122 | v := []map[*T4]int{} 123 | 124 | w := new(bytes.Buffer) 125 | e := NewEncoder(w) 126 | e.Encode(v) 127 | data := w.Bytes() 128 | 129 | var v1 []map[T4]int 130 | r := bytes.NewBuffer(data) 131 | d := NewDecoder(r) 132 | d.Decode(&v1) 133 | 134 | if errorCount != e0+1 { 135 | t.Fatalf("failed to warn about lower-case field") 136 | } 137 | } 138 | 139 | // 140 | // check that we warn when someone sends a default value over 141 | // RPC but the target into which we're decoding holds a non-default 142 | // value, which GOB seems not to overwrite as you'd expect. 143 | // 144 | // labgob does not print a warning. 145 | // 146 | func TestDefault(t *testing.T) { 147 | e0 := errorCount 148 | 149 | type DD struct { 150 | X int 151 | } 152 | 153 | // send a default value... 154 | dd1 := DD{} 155 | 156 | w := new(bytes.Buffer) 157 | e := NewEncoder(w) 158 | e.Encode(dd1) 159 | data := w.Bytes() 160 | 161 | // and receive it into memory that already 162 | // holds non-default values. 163 | reply := DD{99} 164 | 165 | r := bytes.NewBuffer(data) 166 | d := NewDecoder(r) 167 | d.Decode(&reply) 168 | 169 | if errorCount != e0+1 { 170 | t.Fatalf("failed to warn about decoding into non-default value") 171 | } 172 | } 173 | -------------------------------------------------------------------------------- /shardkv/client.go: -------------------------------------------------------------------------------- 1 | package shardkv 2 | 3 | // 4 | // client code to talk to a sharded key/value service. 5 | // 6 | // the client first talks to the shardctrler to find out 7 | // the assignment of shards (keys) to groups, and then 8 | // talks to the group that holds the key's shard. 9 | // 10 | 11 | import ( 12 | "6.824/labrpc" 13 | ) 14 | import "crypto/rand" 15 | import "math/big" 16 | import "6.824/shardctrler" 17 | import "time" 18 | 19 | // 20 | // which shard is a key in? 21 | // please use this function, 22 | // and please do not change it. 23 | // 24 | func key2shard(key string) int { 25 | shard := 0 26 | if len(key) > 0 { 27 | shard = int(key[0]) 28 | } 29 | shard %= shardctrler.NShards 30 | return shard 31 | } 32 | 33 | func nrand() int64 { 34 | max := big.NewInt(int64(1) << 62) 35 | bigx, _ := rand.Int(rand.Reader, max) 36 | x := bigx.Int64() 37 | return x 38 | } 39 | 40 | type Clerk struct { 41 | sm *shardctrler.Clerk 42 | config shardctrler.Config 43 | make_end func(string) *labrpc.ClientEnd 44 | // You will have to modify this struct. 45 | clientId int64 46 | } 47 | 48 | // 49 | // the tester calls MakeClerk. 50 | // 51 | // ctrlers[] is needed to call shardctrler.MakeClerk(). 52 | // 53 | // make_end(servername) turns a server name from a 54 | // Config.Groups[gid][i] into a labrpc.ClientEnd on which you can 55 | // send RPCs. 56 | // 57 | func MakeClerk(ctrlers []*labrpc.ClientEnd, make_end func(string) *labrpc.ClientEnd) *Clerk { 58 | ck := new(Clerk) 59 | ck.sm = shardctrler.MakeClerk(ctrlers) 60 | ck.make_end = make_end 61 | // You'll have to add code here. 62 | ck.clientId = nrand() 63 | return ck 64 | } 65 | 66 | // 67 | // fetch the current value for a key. 68 | // returns "" if the key does not exist. 69 | // keeps trying forever in the face of all other errors. 70 | // You will have to modify this function. 
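// a worked example of key2shard above, assuming shardctrler.NShards is 10:
// for key "a", key[0] is 'a' == 97 and 97 % 10 == 7, so the Clerk reads
// config.Shards[7] to get the owning gid and config.Groups[gid] for that
// group's server names.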
71 | // 72 | func (ck *Clerk) Get(key string) string { 73 | args := GetArgs{} 74 | args.Key = key 75 | args.ClientId = ck.clientId 76 | args.CommandId = nrand() 77 | 78 | for { 79 | args.ConfigNum = ck.config.Num 80 | shard := key2shard(key) 81 | gid := ck.config.Shards[shard] 82 | if servers, ok := ck.config.Groups[gid]; ok { 83 | // try each server for the shard. 84 | for si := 0; si < len(servers); si++ { 85 | srv := ck.make_end(servers[si]) 86 | var reply GetReply 87 | ok := srv.Call("ShardKV.Get", &args, &reply) 88 | if ok && (reply.Err == OK || reply.Err == ErrNoKey) { 89 | return reply.Value 90 | } 91 | if ok && (reply.Err == ErrWrongGroup) { 92 | break 93 | } 94 | // ... not ok, or ErrWrongLeader 95 | } 96 | } 97 | time.Sleep(100 * time.Millisecond) 98 | // ask controler for the latest configuration. 99 | ck.config = ck.sm.Query(-1) 100 | } 101 | 102 | return "" 103 | } 104 | 105 | // 106 | // shared by Put and Append. 107 | // You will have to modify this function. 108 | // 109 | func (ck *Clerk) PutAppend(key string, value string, op string) { 110 | args := PutAppendArgs{} 111 | args.Key = key 112 | args.Value = value 113 | args.Op = op 114 | args.ClientId = ck.clientId 115 | args.CommandId = nrand() 116 | 117 | for { 118 | args.ConfigNum = ck.config.Num 119 | shard := key2shard(key) 120 | gid := ck.config.Shards[shard] 121 | if servers, ok := ck.config.Groups[gid]; ok { 122 | for si := 0; si < len(servers); si++ { 123 | srv := ck.make_end(servers[si]) 124 | var reply PutAppendReply 125 | ok := srv.Call("ShardKV.PutAppend", &args, &reply) 126 | if ok && reply.Err == OK { 127 | return 128 | } 129 | if ok && reply.Err == ErrWrongGroup { 130 | break 131 | } 132 | // ... not ok, or ErrWrongLeader 133 | } 134 | } 135 | time.Sleep(100 * time.Millisecond) 136 | // ask controler for the latest configuration. 137 | ck.config = ck.sm.Query(-1) 138 | } 139 | } 140 | 141 | func (ck *Clerk) Put(key string, value string) { 142 | ck.PutAppend(key, value, "Put") 143 | } 144 | func (ck *Clerk) Append(key string, value string) { 145 | ck.PutAppend(key, value, "Append") 146 | } 147 | -------------------------------------------------------------------------------- /labgob/labgob.go: -------------------------------------------------------------------------------- 1 | package labgob 2 | 3 | // 4 | // trying to send non-capitalized fields over RPC produces a range of 5 | // misbehavior, including both mysterious incorrect computation and 6 | // outright crashes. so this wrapper around Go's encoding/gob warns 7 | // about non-capitalized field names. 
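// usage mirrors encoding/gob; a minimal sketch:
//
//   w := new(bytes.Buffer)
//   e := labgob.NewEncoder(w)
//   e.Encode(&args)        // complains if args has a lower-case field
//   d := labgob.NewDecoder(bytes.NewBuffer(w.Bytes()))
//   d.Decode(&reply)       // also complains if reply already holds
//                          // non-default values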
8 | // 9 | 10 | import "encoding/gob" 11 | import "io" 12 | import "reflect" 13 | import "fmt" 14 | import "sync" 15 | import "unicode" 16 | import "unicode/utf8" 17 | 18 | var mu sync.Mutex 19 | var errorCount int // for TestCapital 20 | var checked map[reflect.Type]bool 21 | 22 | type LabEncoder struct { 23 | gob *gob.Encoder 24 | } 25 | 26 | func NewEncoder(w io.Writer) *LabEncoder { 27 | enc := &LabEncoder{} 28 | enc.gob = gob.NewEncoder(w) 29 | return enc 30 | } 31 | 32 | func (enc *LabEncoder) Encode(e interface{}) error { 33 | checkValue(e) 34 | return enc.gob.Encode(e) 35 | } 36 | 37 | func (enc *LabEncoder) EncodeValue(value reflect.Value) error { 38 | checkValue(value.Interface()) 39 | return enc.gob.EncodeValue(value) 40 | } 41 | 42 | type LabDecoder struct { 43 | gob *gob.Decoder 44 | } 45 | 46 | func NewDecoder(r io.Reader) *LabDecoder { 47 | dec := &LabDecoder{} 48 | dec.gob = gob.NewDecoder(r) 49 | return dec 50 | } 51 | 52 | func (dec *LabDecoder) Decode(e interface{}) error { 53 | checkValue(e) 54 | checkDefault(e) 55 | return dec.gob.Decode(e) 56 | } 57 | 58 | func Register(value interface{}) { 59 | checkValue(value) 60 | gob.Register(value) 61 | } 62 | 63 | func RegisterName(name string, value interface{}) { 64 | checkValue(value) 65 | gob.RegisterName(name, value) 66 | } 67 | 68 | func checkValue(value interface{}) { 69 | checkType(reflect.TypeOf(value)) 70 | } 71 | 72 | func checkType(t reflect.Type) { 73 | k := t.Kind() 74 | 75 | mu.Lock() 76 | // only complain once, and avoid recursion. 77 | if checked == nil { 78 | checked = map[reflect.Type]bool{} 79 | } 80 | if checked[t] { 81 | mu.Unlock() 82 | return 83 | } 84 | checked[t] = true 85 | mu.Unlock() 86 | 87 | switch k { 88 | case reflect.Struct: 89 | for i := 0; i < t.NumField(); i++ { 90 | f := t.Field(i) 91 | rune, _ := utf8.DecodeRuneInString(f.Name) 92 | if unicode.IsUpper(rune) == false { 93 | // ta da 94 | fmt.Printf("labgob error: lower-case field %v of %v in RPC or persist/snapshot will break your Raft\n", 95 | f.Name, t.Name()) 96 | mu.Lock() 97 | errorCount += 1 98 | mu.Unlock() 99 | } 100 | checkType(f.Type) 101 | } 102 | return 103 | case reflect.Slice, reflect.Array, reflect.Ptr: 104 | checkType(t.Elem()) 105 | return 106 | case reflect.Map: 107 | checkType(t.Elem()) 108 | checkType(t.Key()) 109 | return 110 | default: 111 | return 112 | } 113 | } 114 | 115 | // 116 | // warn if the value contains non-default values, 117 | // as it would if one sent an RPC but the reply 118 | // struct was already modified. if the RPC reply 119 | // contains default values, GOB won't overwrite 120 | // the non-default value. 121 | // 122 | func checkDefault(value interface{}) { 123 | if value == nil { 124 | return 125 | } 126 | checkDefault1(reflect.ValueOf(value), 1, "") 127 | } 128 | 129 | func checkDefault1(value reflect.Value, depth int, name string) { 130 | if depth > 3 { 131 | return 132 | } 133 | 134 | t := value.Type() 135 | k := t.Kind() 136 | 137 | switch k { 138 | case reflect.Struct: 139 | for i := 0; i < t.NumField(); i++ { 140 | vv := value.Field(i) 141 | name1 := t.Field(i).Name 142 | if name != "" { 143 | name1 = name + "." 
+ name1 144 | } 145 | checkDefault1(vv, depth+1, name1) 146 | } 147 | return 148 | case reflect.Ptr: 149 | if value.IsNil() { 150 | return 151 | } 152 | checkDefault1(value.Elem(), depth+1, name) 153 | return 154 | case reflect.Bool, 155 | reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64, 156 | reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64, 157 | reflect.Uintptr, reflect.Float32, reflect.Float64, 158 | reflect.String: 159 | if reflect.DeepEqual(reflect.Zero(t).Interface(), value.Interface()) == false { 160 | mu.Lock() 161 | if errorCount < 1 { 162 | what := name 163 | if what == "" { 164 | what = t.Name() 165 | } 166 | // this warning typically arises if code re-uses the same RPC reply 167 | // variable for multiple RPC calls, or if code restores persisted 168 | // state into variable that already have non-default values. 169 | fmt.Printf("labgob warning: Decoding into a non-default variable/field %v may not work\n", 170 | what) 171 | } 172 | errorCount += 1 173 | mu.Unlock() 174 | } 175 | return 176 | } 177 | } 178 | -------------------------------------------------------------------------------- /kvraft/client.go: -------------------------------------------------------------------------------- 1 | package kvraft 2 | 3 | import ( 4 | "6.824/labrpc" 5 | "log" 6 | "time" 7 | ) 8 | import "crypto/rand" 9 | import "math/big" 10 | 11 | const ( 12 | ChangeLeaderInterval = time.Millisecond * 20 13 | ) 14 | 15 | //客户端 16 | type Clerk struct { 17 | servers []*labrpc.ClientEnd 18 | // You will have to modify this struct. 19 | clientId int64 20 | leaderId int 21 | } 22 | 23 | //用于生成一个随机数,可以生成clientId和commandId 24 | func nrand() int64 { 25 | max := big.NewInt(int64(1) << 62) 26 | bigx, _ := rand.Int(rand.Reader, max) 27 | x := bigx.Int64() 28 | return x 29 | } 30 | 31 | //生成一个客户端 32 | func MakeClerk(servers []*labrpc.ClientEnd) *Clerk { 33 | ck := new(Clerk) 34 | ck.servers = servers 35 | ck.clientId = nrand() 36 | // You'll have to add code here. 37 | return ck 38 | } 39 | 40 | // 41 | // fetch the current value for a key. 42 | // returns "" if the key does not exist. 43 | // keeps trying forever in the face of all other errors. 44 | // 45 | // you can send an RPC with code like this: 46 | // ok := ck.servers[i].Call("KVServer.Get", &args, &reply) 47 | // 48 | // the types of args and reply (including whether they are pointers) 49 | // must match the declared types of the RPC handler function's 50 | // arguments. and reply must be passed as a pointer. 51 | // 52 | //根据key获取value 53 | func (ck *Clerk) Get(key string) string { 54 | // You will have to modify this function. 
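// the args carry ClientId plus a fresh CommandId from nrand(), so the
// server can tell a retransmitted request from a new one; the loop below
// reuses the same args and walks leaderId round-robin until some server
// answers with OK or ErrNoKey.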
55 | //DPrintf("%v client get key:%s.", ck.clientId, key) 56 | args := GetArgs{ 57 | Key: key, 58 | ClientId: ck.clientId, 59 | CommandId: nrand(), 60 | } 61 | leaderId := ck.leaderId 62 | for { 63 | reply := GetReply{} 64 | ok := ck.servers[leaderId].Call("KVServer.Get", &args, &reply) 65 | if !ok { 66 | //如果请求失败,等一段时间再请求,换一个节点再请求 67 | DPrintf("%v client get key %v from server %v,not ok.", ck.clientId, key, leaderId) 68 | time.Sleep(ChangeLeaderInterval) 69 | leaderId = (leaderId + 1) % len(ck.servers) 70 | continue 71 | } else if reply.Err != OK { 72 | DPrintf("%v client get key %v from server %v,reply err = %v!", ck.clientId, key, leaderId, reply.Err) 73 | } 74 | 75 | switch reply.Err { 76 | case OK: 77 | DPrintf("%v client get key %v from server %v,value: %v,OK.", ck.clientId, key, leaderId, reply.Value, leaderId) 78 | ck.leaderId = leaderId 79 | return reply.Value 80 | case ErrNoKey: 81 | DPrintf("%v client get key %v from server %v,NO KEY!", ck.clientId, key, leaderId) 82 | ck.leaderId = leaderId 83 | return "" 84 | case ErrTimeOut: 85 | continue 86 | default: 87 | time.Sleep(ChangeLeaderInterval) 88 | leaderId = (leaderId + 1) % len(ck.servers) 89 | continue 90 | } 91 | 92 | } 93 | } 94 | 95 | // 96 | // shared by Put and Append. 97 | // 98 | // you can send an RPC with code like this: 99 | // ok := ck.servers[i].Call("KVServer.PutAppend", &args, &reply) 100 | // 101 | // the types of args and reply (including whether they are pointers) 102 | // must match the declared types of the RPC handler function's 103 | // arguments. and reply must be passed as a pointer. 104 | // 105 | func (ck *Clerk) PutAppend(key string, value string, op string) { 106 | DPrintf("%v client PutAppend,key:%v,value:%v,op:%v", ck.clientId, key, value, op) 107 | // You will have to modify this function. 
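// note that a new PutAppendReply is allocated on every attempt below;
// reusing one reply across RPC calls is exactly the situation labgob's
// "decoding into a non-default variable" warning is about.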
108 | args := PutAppendArgs{ 109 | Key: key, 110 | Value: value, 111 | Op: op, 112 | ClientId: ck.clientId, 113 | CommandId: nrand(), 114 | } 115 | leaderId := ck.leaderId 116 | for { 117 | reply := PutAppendReply{} 118 | ok := ck.servers[leaderId].Call("KVServer.PutAppend", &args, &reply) 119 | if !ok { 120 | //可能当前请求的server不是leader,换一个server再访问 121 | DPrintf("%v client set key %v to %v to server %v,not ok.", ck.clientId, key, value, leaderId) 122 | time.Sleep(ChangeLeaderInterval) 123 | leaderId = (leaderId + 1) % len(ck.servers) 124 | continue 125 | } else if reply.Err != OK { 126 | DPrintf("%v client set key %v to %v to server %v,reply err = %v!", ck.clientId, key, value, leaderId, reply.Err) 127 | } 128 | 129 | switch reply.Err { 130 | case OK: 131 | DPrintf("%v client set key %v to %v to server %v,OK.", ck.clientId, key, value, leaderId) 132 | ck.leaderId = leaderId 133 | return 134 | case ErrNoKey: 135 | DPrintf("%v client set key %v to %v to server %v,NOKEY!", ck.clientId, key, value, leaderId) 136 | return 137 | case ErrTimeOut: 138 | continue 139 | case ErrWrongLeader: 140 | //换一个节点继续请求 141 | time.Sleep(ChangeLeaderInterval) 142 | leaderId = (leaderId + 1) % len(ck.servers) 143 | continue 144 | case ErrServer: 145 | //换一个节点继续请求 146 | time.Sleep(ChangeLeaderInterval) 147 | leaderId = (leaderId + 1) % len(ck.servers) 148 | continue 149 | default: 150 | log.Fatal("client rev unknown err", reply.Err) 151 | } 152 | } 153 | } 154 | 155 | func (ck *Clerk) Put(key string, value string) { 156 | ck.PutAppend(key, value, "Put") 157 | } 158 | func (ck *Clerk) Append(key string, value string) { 159 | ck.PutAppend(key, value, "Append") 160 | } 161 | -------------------------------------------------------------------------------- /raft/raft_vote.go: -------------------------------------------------------------------------------- 1 | package raft 2 | 3 | import ( 4 | "time" 5 | ) 6 | 7 | // 8 | // example RequestVote RPC arguments structure. 9 | // field names must start with capital letters! 10 | // 11 | type RequestVoteArgs struct { 12 | // Your data here (2A, 2B). 13 | Term int 14 | CandidateId int 15 | LastLogIndex int 16 | LastLogTerm int 17 | } 18 | 19 | // 20 | // example RequestVote RPC reply structure. 21 | // field names must start with capital letters! 22 | // 23 | type RequestVoteReply struct { 24 | // Your data here (2A). 25 | Term int 26 | VoteGranted bool 27 | } 28 | 29 | // 30 | // example RequestVote RPC handler. 31 | // 32 | func (rf *Raft) RequestVote(args *RequestVoteArgs, reply *RequestVoteReply) { 33 | // Your code here (2A, 2B). 
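// outline of the checks below: a stale args.Term is rejected outright; an
// equal term grants only if this peer has not voted yet or already voted
// for this candidate; a larger term makes this peer step down to follower
// and clear votedFor; finally the candidate's (LastLogTerm, LastLogIndex)
// must be at least as up-to-date as the local last entry before the vote
// is granted.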
34 | rf.mu.Lock() 35 | defer rf.mu.Unlock() 36 | 37 | //默认失败,返回 38 | lastLogTerm, lastLogIndex := rf.getLastLogTermAndIndex() 39 | reply.Term = rf.currentTerm 40 | reply.VoteGranted = false 41 | 42 | if rf.currentTerm > args.Term { 43 | return 44 | } else if rf.currentTerm == args.Term { 45 | if rf.role == Role_Leader { 46 | return 47 | } 48 | 49 | if args.CandidateId == rf.votedFor { 50 | reply.Term = args.Term 51 | reply.VoteGranted = true 52 | return 53 | } 54 | if rf.votedFor != -1 && args.CandidateId != rf.votedFor { 55 | return 56 | } 57 | 58 | //还有一种情况,没有投过票 59 | } 60 | 61 | if rf.currentTerm < args.Term { 62 | rf.currentTerm = args.Term 63 | rf.changeRole(Role_Follower) 64 | rf.votedFor = -1 65 | reply.Term = rf.currentTerm 66 | rf.persist() 67 | } 68 | 69 | //判断日志完整性 70 | if lastLogTerm > args.LastLogTerm || (lastLogTerm == args.LastLogTerm && lastLogIndex > args.LastLogIndex) { 71 | return 72 | } 73 | 74 | rf.votedFor = args.CandidateId 75 | reply.VoteGranted = true 76 | rf.changeRole(Role_Follower) 77 | rf.resetElectionTimer() 78 | rf.persist() 79 | DPrintf("%v, role:%v,voteFor: %v", rf.me, rf.role, rf.votedFor) 80 | } 81 | 82 | // 83 | // example code to send a RequestVote RPC to a server. 84 | // server is the index of the target server in rf.peers[]. 85 | // expects RPC arguments in args. 86 | // fills in *reply with RPC reply, so caller should 87 | // pass &reply. 88 | // the types of the args and reply passed to Call() must be 89 | // the same as the types of the arguments declared in the 90 | // handler function (including whether they are pointers). 91 | // 92 | // The labrpc package simulates a lossy network, in which servers 93 | // may be unreachable, and in which requests and replies may be lost. 94 | // Call() sends a request and waits for a reply. If a reply arrives 95 | // within a timeout interval, Call() returns true; otherwise 96 | // Call() returns false. Thus Call() may not return for a while. 97 | // A false return can be caused by a dead server, a live server that 98 | // can't be reached, a lost request, or a lost reply. 99 | // 100 | // Call() is guaranteed to return (perhaps after a delay) *except* if the 101 | // handler function on the server side does not return. Thus there 102 | // is no need to implement your own timeouts around Call(). 103 | // 104 | // look at the comments in ../labrpc/labrpc.go for more details. 105 | // 106 | // if you're having trouble getting RPC to work, check that you've 107 | // capitalized all field names in structs passed over RPC, and 108 | // that the caller passes the address of the reply struct with &, not 109 | // the struct itself. 
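//
// sendRequestVote below layers its own RPCTimeout on top of Call(): a
// goroutine retries the RPC up to 10 times while this peer is not killed,
// and the caller gives up once rpcTimer fires, so one unreachable server
// cannot stall startElection's vote counting for long.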
110 | // 111 | func (rf *Raft) sendRequestVote(server int, args *RequestVoteArgs, reply *RequestVoteReply) { 112 | if server < 0 || server > len(rf.peers) || server == rf.me { 113 | panic("server invalid in sendRequestVote!") 114 | } 115 | 116 | rpcTimer := time.NewTimer(RPCTimeout) 117 | defer rpcTimer.Stop() 118 | 119 | ch := make(chan bool, 1) 120 | go func() { 121 | for i := 0; i < 10 && !rf.killed(); i++ { 122 | ok := rf.peers[server].Call("Raft.RequestVote", args, reply) 123 | if !ok { 124 | continue 125 | } else { 126 | ch <- ok 127 | return 128 | } 129 | } 130 | }() 131 | 132 | select { 133 | case <-rpcTimer.C: 134 | DPrintf("%v role: %v, send request vote to peer %v TIME OUT!!!", rf.me, rf.role, server) 135 | return 136 | case <-ch: 137 | return 138 | } 139 | 140 | } 141 | 142 | func (rf *Raft) startElection() { 143 | rf.mu.Lock() 144 | rf.resetElectionTimer() 145 | if rf.role == Role_Leader { 146 | rf.mu.Unlock() 147 | return 148 | } 149 | 150 | rf.changeRole(Role_Candidate) 151 | DPrintf("%v role %v,start election,term: %v", rf.me, rf.role, rf.currentTerm) 152 | 153 | lastLogTerm, lastLogIndex := rf.getLastLogTermAndIndex() 154 | args := RequestVoteArgs{ 155 | CandidateId: rf.me, 156 | Term: rf.currentTerm, 157 | LastLogTerm: lastLogTerm, 158 | LastLogIndex: lastLogIndex, 159 | } 160 | rf.persist() 161 | rf.mu.Unlock() 162 | 163 | allCount := len(rf.peers) 164 | grantedCount := 1 165 | resCount := 1 166 | grantedChan := make(chan bool, len(rf.peers)-1) 167 | for i := 0; i < allCount; i++ { 168 | if i == rf.me { 169 | continue 170 | } 171 | //对每一个其他节点都要发送rpc 172 | go func(gch chan bool, index int) { 173 | reply := RequestVoteReply{} 174 | rf.sendRequestVote(index, &args, &reply) 175 | gch <- reply.VoteGranted 176 | if reply.Term > args.Term { 177 | rf.mu.Lock() 178 | if reply.Term > rf.currentTerm { 179 | //放弃选举 180 | rf.currentTerm = reply.Term 181 | rf.changeRole(Role_Follower) 182 | rf.votedFor = -1 183 | rf.resetElectionTimer() 184 | rf.persist() 185 | } 186 | rf.mu.Unlock() 187 | } 188 | }(grantedChan, i) 189 | 190 | } 191 | 192 | for rf.role == Role_Candidate { 193 | flag := <-grantedChan 194 | resCount++ 195 | if flag { 196 | grantedCount++ 197 | } 198 | DPrintf("vote: %v, allCount: %v, resCount: %v, grantedCount: %v", flag, allCount, resCount, grantedCount) 199 | 200 | if grantedCount > allCount/2 { 201 | //竞选成功 202 | rf.mu.Lock() 203 | DPrintf("before try change to leader,count:%d, args:%+v, currentTerm: %v, argsTerm: %v", grantedCount, args, rf.currentTerm, args.Term) 204 | if rf.role == Role_Candidate && rf.currentTerm == args.Term { 205 | rf.changeRole(Role_Leader) 206 | } 207 | if rf.role == Role_Leader { 208 | rf.resetAppendEntriesTimersZero() 209 | } 210 | rf.persist() 211 | rf.mu.Unlock() 212 | DPrintf("%v current role: %v", rf.me, rf.role) 213 | } else if resCount == allCount || resCount-grantedCount > allCount/2 { 214 | DPrintf("grant fail! 
grantedCount <= len/2:count:%d", grantedCount) 215 | return 216 | } 217 | } 218 | 219 | } 220 | -------------------------------------------------------------------------------- /shardkv/server_shard.go: -------------------------------------------------------------------------------- 1 | package shardkv 2 | 3 | import ( 4 | "6.824/shardctrler" 5 | "time" 6 | ) 7 | 8 | //判断是否存在指定config和指定shardId的output shard 9 | func (kv *ShardKV) OutputDataExist(configNum int, shardId int) bool { 10 | if _, ok := kv.outputShards[configNum]; ok { 11 | if _, ok = kv.outputShards[configNum][shardId]; ok { 12 | return true 13 | } 14 | } 15 | return false 16 | } 17 | 18 | /* 19 | RPC,针对output shard 20 | */ 21 | //请求获取shard 22 | func (kv *ShardKV) FetchShardData(args *FetchShardDataArgs, reply *FetchShardDataReply) { 23 | kv.log("get req fetchsharddata:args:%+v, reply:%+v", args, reply) 24 | defer kv.log("resp fetchsharddata:args:%+v, reply:%+v", args, reply) 25 | kv.lock("fetchShardData") 26 | defer kv.unlock("fetchShardData") 27 | 28 | //必须是过去的config 29 | if args.ConfigNum >= kv.config.Num { 30 | return 31 | } 32 | 33 | reply.Success = false 34 | if configData, ok := kv.outputShards[args.ConfigNum]; ok { 35 | if shardData, ok := configData[args.ShardNum]; ok { 36 | reply.Success = true 37 | reply.Data = make(map[string]string) 38 | reply.CommandIndexes = make(map[int64]int64) 39 | for k, v := range shardData.Data { 40 | reply.Data[k] = v 41 | } 42 | for k, v := range shardData.CommandIndexes { 43 | reply.CommandIndexes[k] = v 44 | } 45 | } 46 | } 47 | return 48 | 49 | } 50 | 51 | //请求清除shard 52 | func (kv *ShardKV) CleanShardData(args *CleanShardDataArgs, reply *CleanShardDataReply) { 53 | kv.log("get req CleanShardData:args:%+v, reply:%+v", args, reply) 54 | defer kv.log("resp CleanShardData:args:%+v, reply:%+v", args, reply) 55 | kv.lock("cleanShardData") 56 | 57 | //必须是过去的config 58 | if args.ConfigNum >= kv.config.Num { 59 | kv.unlock("cleanShardData") 60 | return 61 | } 62 | kv.unlock("cleanShardData") 63 | _, _, isLeader := kv.rf.Start(*args) 64 | if !isLeader { 65 | return 66 | } 67 | 68 | // 简单处理下。。。 69 | for i := 0; i < 10; i++ { 70 | kv.lock("cleanShardData") 71 | exist := kv.OutputDataExist(args.ConfigNum, args.ShardNum) 72 | kv.unlock("cleanShardData") 73 | if !exist { 74 | reply.Success = true 75 | return 76 | } 77 | time.Sleep(time.Millisecond * 20) 78 | } 79 | 80 | //采用下面这种方式获取start结果,其实会慢一些,还会出现锁的问题 81 | //kv.lock("CleanShardData") 82 | //ch := make(chan struct{}, 1) 83 | //kv.cleanOutputDataNotifyCh[fmt.Sprintf("%d%d", args.ConfigNum, args.ShardNum)] = ch 84 | //kv.unlock("CleanShardData") 85 | //t := time.NewTimer(WaitCmdTimeOut) 86 | //defer t.Stop() 87 | // 88 | //select { 89 | //case <-t.C: 90 | //case <-ch: 91 | //case <-kv.stopCh: 92 | //} 93 | // 94 | //kv.lock("removeCh") 95 | ////删除ch 96 | //if _, ok := kv.cleanOutputDataNotifyCh[fmt.Sprintf("%d%d", args.ConfigNum, args.ShardNum)]; ok { 97 | // delete(kv.cleanOutputDataNotifyCh, fmt.Sprintf("%d%d", args.ConfigNum, args.ShardNum)) 98 | //} 99 | ////判断是否还存在 100 | //exist := kv.OutputDataExist(args.ConfigNum, args.ShardNum) 101 | //kv.unlock("removeCh") 102 | //if !exist { 103 | // reply.Success = true 104 | //} 105 | return 106 | 107 | } 108 | 109 | /* 110 | 定时任务,请求input shard 111 | */ 112 | 113 | //定时获取shard 114 | func (kv *ShardKV) fetchShards() { 115 | for { 116 | select { 117 | case <-kv.stopCh: 118 | return 119 | case <-kv.pullShardsTimer.C: 120 | //判断是否有要input的shard 121 | _, isLeader := kv.rf.GetState() 122 | if isLeader { 123 | 
kv.lock("pullshards") 124 | for shardId, _ := range kv.inputShards { 125 | //注意要从上一个config中请求shard的源节点 126 | go kv.fetchShard(shardId, kv.oldConfig) 127 | } 128 | kv.unlock("pullshards") 129 | } 130 | kv.pullShardsTimer.Reset(PullShardsInterval) 131 | 132 | } 133 | } 134 | } 135 | 136 | //获取指定的shard 137 | func (kv *ShardKV) fetchShard(shardId int, config shardctrler.Config) { 138 | args := FetchShardDataArgs{ 139 | ConfigNum: config.Num, 140 | ShardNum: shardId, 141 | } 142 | 143 | t := time.NewTimer(CallPeerFetchShardDataTimeOut) 144 | defer t.Stop() 145 | 146 | for { 147 | //依次请求group中的每个节点,但只要获取一个就好了 148 | for _, s := range config.Groups[config.Shards[shardId]] { 149 | reply := FetchShardDataReply{} 150 | srv := kv.make_end(s) 151 | done := make(chan bool, 1) 152 | go func(args *FetchShardDataArgs, reply *FetchShardDataReply) { 153 | done <- srv.Call("ShardKV.FetchShardData", args, reply) 154 | }(&args, &reply) 155 | 156 | t.Reset(CallPeerFetchShardDataTimeOut) 157 | 158 | select { 159 | case <-kv.stopCh: 160 | return 161 | case <-t.C: 162 | case isDone := <-done: 163 | if isDone && reply.Success == true { 164 | kv.lock("pullShard") 165 | if _, ok := kv.inputShards[shardId]; ok && kv.config.Num == config.Num+1 { 166 | replyCopy := reply.Copy() 167 | mergeShardData := MergeShardData{ 168 | ConfigNum: args.ConfigNum, 169 | ShardNum: args.ShardNum, 170 | Data: replyCopy.Data, 171 | CommandIndexes: replyCopy.CommandIndexes, 172 | } 173 | kv.log("pullShard get data:%+v", mergeShardData) 174 | kv.unlock("pullShard") 175 | kv.rf.Start(mergeShardData) 176 | //不管是不是leader都返回 177 | return 178 | } else { 179 | kv.unlock("pullshard") 180 | } 181 | } 182 | } 183 | 184 | } 185 | } 186 | 187 | } 188 | 189 | /* 190 | 处理好input shard,请求源节点清除output shard 191 | */ 192 | 193 | //发送给shard源节点,可以删除shard数据了 194 | //一般在apply command中处理好input的shard,发送给源节点删除保存的shard数据 195 | func (kv *ShardKV) callPeerCleanShardData(config shardctrler.Config, shardId int) { 196 | args := CleanShardDataArgs{ 197 | ConfigNum: config.Num, 198 | ShardNum: shardId, 199 | } 200 | 201 | t := time.NewTimer(CallPeerCleanShardDataTimeOut) 202 | defer t.Stop() 203 | 204 | for { 205 | //因为并不知道哪一个节点是leader,因此群发吧 206 | for _, group := range config.Groups[config.Shards[shardId]] { 207 | reply := CleanShardDataReply{} 208 | srv := kv.make_end(group) 209 | done := make(chan bool, 1) 210 | 211 | go func(args *CleanShardDataArgs, reply *CleanShardDataReply) { 212 | done <- srv.Call("ShardKV.CleanShardData", args, reply) 213 | }(&args, &reply) 214 | 215 | t.Reset(CallPeerCleanShardDataTimeOut) 216 | 217 | select { 218 | case <-kv.stopCh: 219 | return 220 | case <-t.C: 221 | case isDone := <-done: 222 | if isDone && reply.Success == true { 223 | return 224 | } 225 | } 226 | 227 | } 228 | kv.lock("callPeerCleanShardData") 229 | if kv.config.Num != config.Num+1 || len(kv.inputShards) == 0 { 230 | kv.unlock("callPeerCleanShardData") 231 | break 232 | } 233 | kv.unlock("callPeerCleanShardData") 234 | } 235 | } 236 | -------------------------------------------------------------------------------- /mr/coordinator.go: -------------------------------------------------------------------------------- 1 | package mr 2 | 3 | import ( 4 | "errors" 5 | "log" 6 | "net" 7 | "net/http" 8 | "net/rpc" 9 | "os" 10 | "sync" 11 | "time" 12 | ) 13 | 14 | type TaskPhase int //任务阶段 15 | type TaskStatus int //任务状态 16 | 17 | //任务阶段 18 | const ( 19 | TaskPhase_Map TaskPhase = 0 20 | TaskPhase_Reduce TaskPhase = 1 21 | ) 22 | 23 | //任务状态 24 | const ( 25 | TaskStatus_New TaskStatus 
= 0 //还没有创建 26 | TaskStatus_Ready TaskStatus = 1 //进入队列 27 | TaskStatus_Running TaskStatus = 2 //已经分配,正在运行 28 | TaskStatus_Terminated TaskStatus = 3 //运行结束 29 | TaskStatus_Error TaskStatus = 4 //运行出错 30 | ) 31 | 32 | const ( 33 | ScheduleInterval = time.Millisecond * 500 //扫描任务状态的间隔时间 34 | MaxTaskRunningTime = time.Second * 5 //每个任务的最大执行时间,用于判断是否超时 35 | ) 36 | 37 | //任务 38 | type Task struct { 39 | FileName string //当前任务的文件名 40 | Phase TaskPhase //当前任务状态 41 | Seq int //当前的任务序列 42 | NMap int //map任务/file的数量 43 | NReduce int //reduce任务/分区的数量 44 | Alive bool //是否存活 45 | } 46 | 47 | //任务状态 48 | type TaskState struct { 49 | Status TaskStatus //任务状态 50 | WorkerId int //执行当前Task的workerid 51 | StartTime time.Time //任务开始执行的时间 52 | } 53 | 54 | type Coordinator struct { 55 | files []string //存储要处理的文件 56 | nReduce int //reduce/分区数量 57 | taskPhase TaskPhase //任务阶段 58 | taskStates []TaskState //任务的状态 59 | taskChan chan Task //任务队列 60 | workerSeq int //worker序列 61 | done bool //是否做完 62 | muLock sync.Mutex //互斥锁 63 | 64 | } 65 | 66 | //创建一个task 67 | func (c *Coordinator) NewOneTask(seq int) Task { 68 | task := Task{ 69 | FileName: "", 70 | Phase: c.taskPhase, 71 | NMap: len(c.files), 72 | NReduce: c.nReduce, 73 | Seq: seq, 74 | Alive: true, 75 | } 76 | 77 | DPrintf("m:%+v, taskseq:%d, lenfiles:%d, lents:%d", c, seq, len(c.files), len(c.taskStates)) 78 | 79 | if task.Phase == TaskPhase_Map { 80 | task.FileName = c.files[seq] 81 | } 82 | return task 83 | } 84 | 85 | //扫描任务状态并适当更新 86 | func (c *Coordinator) scanTaskState() { 87 | DPrintf("scanTaskState...") 88 | c.muLock.Lock() 89 | defer c.muLock.Unlock() 90 | 91 | //这里不能使用函数Done(),因为此时已经上锁 92 | if c.done { 93 | return 94 | } 95 | 96 | allDone := true 97 | //循环每个任务的状态 98 | for k, v := range c.taskStates { 99 | switch v.Status { 100 | case TaskStatus_New: 101 | allDone = false 102 | c.taskStates[k].Status = TaskStatus_Ready 103 | c.taskChan <- c.NewOneTask(k) 104 | case TaskStatus_Ready: 105 | allDone = false 106 | case TaskStatus_Running: 107 | allDone = false 108 | //超时重新分配该任务 109 | if time.Now().Sub(v.StartTime) > MaxTaskRunningTime { 110 | c.taskStates[k].Status = TaskStatus_Ready 111 | c.taskChan <- c.NewOneTask(k) 112 | } 113 | case TaskStatus_Terminated: 114 | case TaskStatus_Error: 115 | allDone = false 116 | c.taskStates[k].Status = TaskStatus_Ready 117 | c.taskChan <- c.NewOneTask(k) 118 | default: 119 | panic("t. status err in schedule") 120 | } 121 | } 122 | 123 | //如果当前任务完成了 124 | if allDone { 125 | if c.taskPhase == TaskPhase_Map { 126 | //进入Reduce阶段 127 | DPrintf("init ReduceTask") 128 | c.taskPhase = TaskPhase_Reduce 129 | c.taskStates = make([]TaskState, c.nReduce) 130 | } else { 131 | log.Println("finish all tasks!!!😊") 132 | c.done = true 133 | } 134 | } 135 | } 136 | 137 | //定时更新状态 138 | func (c *Coordinator) schedule() { 139 | for !c.Done() { 140 | c.scanTaskState() 141 | time.Sleep(ScheduleInterval) 142 | } 143 | } 144 | 145 | // Your code here -- RPC handlers for the worker to call. 
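//
// a hedged sketch of how a worker reaches these handlers over the unix
// socket (see the call() helper in mr/worker.go; workerId comes from an
// earlier Coordinator.RegWorker call):
//
//   args := TaskArgs{WorkerId: workerId}
//   reply := TaskReply{}
//   call("Coordinator.GetOneTask", &args, &reply) // reply.Task is a *Task
//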
146 | 147 | //处理Rpc请求:获取任务 148 | func (c *Coordinator) GetOneTask(args *TaskArgs, reply *TaskReply) error { 149 | task := <-c.taskChan 150 | reply.Task = &task 151 | 152 | if task.Alive { 153 | //修改状态 154 | c.muLock.Lock() 155 | if task.Phase != c.taskPhase { 156 | return errors.New("GetOneTask Task phase neq") 157 | } 158 | c.taskStates[task.Seq].WorkerId = args.WorkerId 159 | c.taskStates[task.Seq].Status = TaskStatus_Running 160 | c.taskStates[task.Seq].StartTime = time.Now() 161 | c.muLock.Unlock() 162 | } 163 | 164 | DPrintf("in get one Task, args:%+v, reply:%+v", args, reply) 165 | return nil 166 | } 167 | 168 | //处理Rpc请求:注册worker 169 | func (c *Coordinator) RegWorker(args *RegArgs, reply *RegReply) error { 170 | DPrintf("worker reg!") 171 | c.muLock.Lock() 172 | defer c.muLock.Unlock() 173 | c.workerSeq++ 174 | reply.WorkerId = c.workerSeq 175 | return nil 176 | } 177 | 178 | //处理Rpc请求:worker响应task完成情况 179 | func (c *Coordinator) ReportTask(args *ReportTaskArgs, reply *ReportTaskReply) error { 180 | c.muLock.Lock() 181 | defer c.muLock.Unlock() 182 | 183 | DPrintf("get report task: %+v, taskPhase: %+v", args, c.taskPhase) 184 | 185 | //如果发现阶段不同或者当前任务已经分配给了其它worker就不修改当前任务状态 186 | if c.taskPhase != args.Phase || c.taskStates[args.Seq].WorkerId != args.WorkerId { 187 | DPrintf("in report task,workerId=%v report a useless task=%v", args.WorkerId, args.Seq) 188 | return nil 189 | } 190 | 191 | if args.Done { 192 | c.taskStates[args.Seq].Status = TaskStatus_Terminated 193 | } else { 194 | c.taskStates[args.Seq].Status = TaskStatus_Error 195 | } 196 | 197 | go c.scanTaskState() 198 | return nil 199 | } 200 | 201 | // 202 | // an example RPC handler. 203 | // 204 | // the RPC argument and reply types are defined in rpc.go. 205 | // 206 | //func (c *Coordinator) Example(args *ExampleArgs, reply *ExampleReply) error { 207 | // reply.Y = args.X + 1 208 | // return nil 209 | //} 210 | 211 | // 212 | // start a thread that listens for RPCs from worker.go 213 | // 214 | func (c *Coordinator) server() { 215 | rpc.Register(c) // 注册 RPC 服务 216 | rpc.HandleHTTP() // 将 RPC 服务绑定到 HTTP 服务中去 217 | //l, e := net.Listen("tcp", ":1234") 218 | sockname := coordinatorSock() 219 | os.Remove(sockname) 220 | l, e := net.Listen("unix", sockname) 221 | if e != nil { 222 | log.Fatal("listen error:", e) 223 | } 224 | go http.Serve(l, nil) 225 | } 226 | 227 | // 228 | // main/mrcoordinator.go calls Done() periodically to find out 229 | // if the entire job has finished. 230 | //如果工作全部完成,返回true 231 | // 232 | func (c *Coordinator) Done() bool { 233 | c.muLock.Lock() 234 | defer c.muLock.Unlock() 235 | 236 | return c.done 237 | } 238 | 239 | // 240 | // create a Coordinator. 241 | // main/mrcoordinator.go calls this function. 242 | // nReduce is the number of reduce tasks to use. 
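// MakeCoordinator below sizes taskChan to the larger of len(files) and
// nReduce so that scanTaskState, which sends into the channel while holding
// muLock, cannot block in either phase; it then starts the schedule() loop
// and the RPC server before returning.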
243 | // 244 | func MakeCoordinator(files []string, nReduce int) *Coordinator { 245 | c := Coordinator{ 246 | files: files, 247 | nReduce: nReduce, 248 | taskPhase: TaskPhase_Map, 249 | taskStates: make([]TaskState, len(files)), 250 | workerSeq: 0, 251 | done: false, 252 | } 253 | if len(files) > nReduce { 254 | c.taskChan = make(chan Task, len(files)) 255 | } else { 256 | c.taskChan = make(chan Task, nReduce) 257 | } 258 | 259 | go c.schedule() 260 | c.server() 261 | DPrintf("master init") 262 | 263 | return &c 264 | } 265 | -------------------------------------------------------------------------------- /mr/worker.go: -------------------------------------------------------------------------------- 1 | package mr 2 | 3 | import ( 4 | "encoding/json" 5 | "errors" 6 | "fmt" 7 | "io/ioutil" 8 | "os" 9 | "strings" 10 | ) 11 | import "log" 12 | import "net/rpc" 13 | import "hash/fnv" 14 | 15 | // 16 | // Map functions return a slice of KeyValue. 17 | // 18 | type KeyValue struct { 19 | Key string 20 | Value string 21 | } 22 | 23 | type worker struct { 24 | worerId int 25 | mapF func(string, string) []KeyValue 26 | reduceF func(string, []string) string 27 | } 28 | 29 | // 30 | // main/mrworker.go calls this function. 31 | // 32 | func Worker(mapf func(string, string) []KeyValue, 33 | reducef func(string, []string) string) { 34 | 35 | // Your worker implementation here. 36 | worker := worker{ 37 | mapF: mapf, 38 | reduceF: reducef, 39 | } 40 | 41 | worker.register() 42 | worker.run() 43 | // uncomment to send the Example RPC to the coordinator. 44 | // CallExample() 45 | 46 | } 47 | 48 | func (w *worker) run() { 49 | DPrintf("run") 50 | for { 51 | task, err := w.getTask() 52 | if err != nil { 53 | DPrintf(err.Error()) 54 | continue 55 | } 56 | if !task.Alive { 57 | DPrintf("worker get task not alive, exit") 58 | return 59 | } 60 | w.doTask(*task) 61 | } 62 | } 63 | 64 | //开始做任务 65 | func (w *worker) doTask(task Task) { 66 | switch task.Phase { 67 | case TaskPhase_Map: 68 | w.doMapTask(task) 69 | case TaskPhase_Reduce: 70 | w.doReduceTask(task) 71 | default: 72 | panic(fmt.Sprintf("task phase err: %v", task.Phase)) 73 | } 74 | } 75 | 76 | // 77 | // use ihash(key) % NReduce to choose the reduce 78 | // task number for each KeyValue emitted by Map. 79 | // 80 | func ihash(key string) int { 81 | h := fnv.New32a() 82 | h.Write([]byte(key)) 83 | return int(h.Sum32() & 0x7fffffff) 84 | } 85 | 86 | //map任务时获取要输出的文件名 87 | func (w *worker) getReduceName(mapId, partitionId int) string { 88 | return fmt.Sprintf("mr-kv-%d-%d", mapId, partitionId) 89 | } 90 | 91 | //reduce任务时获取要输出的文件名 92 | func (w *worker) getMergeName(partitionId int) string { 93 | return fmt.Sprintf("mr-out-%d", partitionId) 94 | } 95 | 96 | //做map任务 97 | func (w *worker) doMapTask(task Task) { 98 | DPrintf("%v start read file %v", w.worerId, task.FileName) 99 | cont, err := ioutil.ReadFile(task.FileName) 100 | if err != nil { 101 | DPrintf("%v", err) 102 | w.reportTask(task, false) 103 | return 104 | } 105 | 106 | kvs := w.mapF(task.FileName, string(cont)) 107 | partions := make([][]KeyValue, task.NReduce) 108 | for _, kv := range kvs { 109 | pid := ihash(kv.Key) % task.NReduce 110 | partions[pid] = append(partions[pid], kv) 111 | } 112 | 113 | for k, v := range partions { 114 | fileName := w.getReduceName(task.Seq, k) 115 | file, err := os.Create(fileName) 116 | if err != nil { 117 | DPrintf("create file-%v fail in doMapTask. 
%v", fileName, err) 118 | w.reportTask(task, false) 119 | return 120 | } 121 | encoder := json.NewEncoder(file) 122 | for _, kv := range v { 123 | if err := encoder.Encode(&kv); err != nil { 124 | DPrintf("encode kvs to file-%v fail in doMapTask. %v", fileName, err) 125 | w.reportTask(task, false) 126 | } 127 | } 128 | if err := file.Close(); err != nil { 129 | DPrintf("close file-%v fail in doMapTask. %v", fileName, err) 130 | w.reportTask(task, false) 131 | } 132 | } 133 | w.reportTask(task, true) 134 | } 135 | 136 | //做reduce任务 137 | func (w *worker) doReduceTask(task Task) { 138 | maps := make(map[string][]string) 139 | 140 | for i := 0; i < task.NMap; i++ { 141 | fileName := w.getReduceName(i, task.Seq) 142 | file, err := os.Open(fileName) 143 | if err != nil { 144 | DPrintf("open file-%v fail in doReduceTask. %v", fileName, err) 145 | w.reportTask(task, false) 146 | return 147 | } 148 | decoder := json.NewDecoder(file) 149 | for { 150 | var kv KeyValue 151 | if err := decoder.Decode(&kv); err != nil { 152 | break 153 | } 154 | if _, ok := maps[kv.Key]; !ok { 155 | maps[kv.Key] = make([]string, 0) 156 | } 157 | maps[kv.Key] = append(maps[kv.Key], kv.Value) 158 | } 159 | } 160 | 161 | res := make([]string, 0) 162 | for k, v := range maps { 163 | len := w.reduceF(k, v) 164 | res = append(res, fmt.Sprintf("%v %v\n", k, len)) 165 | } 166 | 167 | fileName := w.getMergeName(task.Seq) 168 | if err := ioutil.WriteFile(fileName, []byte(strings.Join(res, "")), 0600); err != nil { 169 | DPrintf("write file-%v in doReduceTask. %v", fileName, err) 170 | w.reportTask(task, false) 171 | } 172 | 173 | w.reportTask(task, true) 174 | } 175 | 176 | // 177 | // example function to show how to make an RPC call to the coordinator. 178 | // 179 | // the RPC argument and reply types are defined in rpc.go. 180 | // 181 | //func CallExample() { 182 | // 183 | // // declare an argument structure. 184 | // args := ExampleArgs{} 185 | // 186 | // // fill in the argument(s). 187 | // args.X = 99 188 | // 189 | // // declare a reply structure. 190 | // reply := ExampleReply{} 191 | // 192 | // // send the RPC request, wait for the reply. 193 | // // the "Coordinator.Example" tells the 194 | // // receiving server that we'd like to call 195 | // // the Example() method of struct Coordinator. 196 | // ok := call("Coordinator.Example", &args, &reply) 197 | // if ok { 198 | // // reply.Y should be 100. 
199 | // fmt.Printf("reply.Y %v\n", reply.Y) 200 | // } else { 201 | // fmt.Printf("call failed!\n") 202 | // } 203 | //} 204 | 205 | //rpc请求:注册worker 206 | func (w *worker) register() { 207 | DPrintf("reg") 208 | args := &RegArgs{} 209 | reply := &RegReply{} 210 | 211 | if err := call("Coordinator.RegWorker", args, reply); !err { 212 | log.Fatal("worker register error!", err) 213 | } 214 | w.worerId = reply.WorkerId 215 | } 216 | 217 | //rpc请求:请求获取任务 218 | func (w *worker) getTask() (*Task, error) { 219 | args := TaskArgs{WorkerId: w.worerId} 220 | reply := TaskReply{} 221 | 222 | if err := call("Coordinator.GetOneTask", &args, &reply); !err { 223 | return nil, errors.New("worker getTask error!") 224 | } 225 | DPrintf("worker get task:%+v", reply.Task) 226 | return reply.Task, nil 227 | } 228 | 229 | //rpc请求:报告任务状态 230 | func (w *worker) reportTask(task Task, done bool) { 231 | args := ReportTaskArgs{ 232 | WorkerId: w.worerId, 233 | Phase: task.Phase, 234 | Seq: task.Seq, 235 | Done: done, 236 | } 237 | reply := ReportTaskReply{} 238 | if ok := call("Coordinator.ReportTask", &args, &reply); !ok { 239 | DPrintf("report task fail:%+v", args) 240 | } 241 | } 242 | 243 | // 244 | // send an RPC request to the coordinator, wait for the response. 245 | // usually returns true. 246 | // returns false if something goes wrong. 247 | // 248 | func call(rpcname string, args interface{}, reply interface{}) bool { 249 | // c, err := rpc.DialHTTP("tcp", "127.0.0.1"+":1234") 250 | sockname := coordinatorSock() 251 | conn, err := rpc.DialHTTP("unix", sockname) 252 | if err != nil { 253 | log.Fatal("dialing:", err) 254 | } 255 | defer conn.Close() 256 | 257 | err = conn.Call(rpcname, args, reply) //rpcname = 结构体名.方法名 258 | if err == nil { 259 | return true 260 | } 261 | 262 | fmt.Println(err) 263 | return false 264 | } 265 | -------------------------------------------------------------------------------- /shardkv/server.go: -------------------------------------------------------------------------------- 1 | package shardkv 2 | 3 | import ( 4 | "6.824/labrpc" 5 | "6.824/shardctrler" 6 | "fmt" 7 | "time" 8 | ) 9 | import "6.824/raft" 10 | import "sync" 11 | import "6.824/labgob" 12 | 13 | const ( 14 | PullConfigInterval = time.Millisecond * 100 15 | PullShardsInterval = time.Millisecond * 200 16 | WaitCmdTimeOut = time.Millisecond * 500 17 | CallPeerFetchShardDataTimeOut = time.Millisecond * 500 18 | CallPeerCleanShardDataTimeOut = time.Millisecond * 500 19 | MaxLockTime = time.Millisecond * 10 // debug 20 | ) 21 | 22 | type ShardKV struct { 23 | mu sync.Mutex 24 | me int 25 | rf *raft.Raft 26 | applyCh chan raft.ApplyMsg 27 | make_end func(string) *labrpc.ClientEnd 28 | gid int 29 | ctrlers []*labrpc.ClientEnd 30 | maxraftstate int // snapshot if log grows this big 31 | 32 | // Your definitions here. 
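// in brief: commandNotifyCh wakes a waiting RPC handler once its command is
// applied; lastApplies keeps a ClientId->CommandId map per shard for
// duplicate detection; data holds each shard's key/value map; config and
// oldConfig are the current and previous shardctrler configs; meShards is
// the set of shards this group currently owns; inputShards/outputShards
// track shards still migrating in or already handed off (outputShards keyed
// by config number); scc queries the shardctrler; the two timers drive the
// pullConfig and fetchShards loops.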
33 | stopCh chan struct{} 34 | commandNotifyCh map[int64]chan CommandResult //用于命令apply后的唤醒 35 | lastApplies [shardctrler.NShards]map[int64]int64 //k-v:ClientId-CommandId 36 | config shardctrler.Config //记录当前的config 37 | oldConfig shardctrler.Config //保存上一个config,进行shard迁移时,目标节点根据这个config来获取源节点,从而获取shard数据和请求清除shard数据 38 | meShards map[int]bool //记录自己分配到的shard 39 | data [shardctrler.NShards]map[string]string 40 | 41 | inputShards map[int]bool //当前这个config相较于上一个config新指派的shard,只有input为空了才能更新下一个config 42 | outputShards map[int]map[int]MergeShardData // configNum -> shard -> data。当某一个config,当前节点的shard移除,则记录当前config的所有移除shard的mergeShardData 43 | //cleanOutputDataNotifyCh map[string]chan struct{} //用来通知等待协程clean完成 44 | scc *shardctrler.Clerk //保存一个shardctrler的客户端,因为要向shardctrler发送query获取配置信息 45 | 46 | //持久化 47 | persister *raft.Persister 48 | 49 | //定时任务计时器 50 | pullConfigTimer *time.Timer //定期获取config 51 | pullShardsTimer *time.Timer //定期检查inputShard并请求数据 52 | 53 | //用于互斥锁 54 | lockStartTime time.Time 55 | lockEndTime time.Time 56 | lockMsg string 57 | } 58 | 59 | /* 60 | 通用函数 61 | */ 62 | 63 | //自定义锁 64 | func (kv *ShardKV) lock(msg string) { 65 | kv.mu.Lock() 66 | kv.lockStartTime = time.Now() 67 | kv.lockMsg = msg 68 | } 69 | 70 | func (kv *ShardKV) unlock(msg string) { 71 | kv.lockEndTime = time.Now() 72 | duration := kv.lockEndTime.Sub(kv.lockStartTime) 73 | kv.lockMsg = "" 74 | kv.mu.Unlock() 75 | if duration > MaxLockTime { 76 | kv.log("lock too long:%s:%s\n", msg, duration) 77 | } 78 | } 79 | 80 | func (kv *ShardKV) log(format string, value ...interface{}) { 81 | baseMsg := fmt.Sprintf("server me: %d, gid:%d, config:%+v, input:%+v.", 82 | kv.me, kv.gid, kv.config, kv.inputShards) 83 | DPrintf(baseMsg, format, value) 84 | } 85 | 86 | // 87 | // the tester calls Kill() when a ShardKV instance won't 88 | // be needed again. you are not required to do anything 89 | // in Kill(), but it might be convenient to (for example) 90 | // turn off debug output from this instance. 91 | // 92 | func (kv *ShardKV) Kill() { 93 | kv.rf.Kill() 94 | // Your code here, if desired. 95 | close(kv.stopCh) 96 | kv.log("kil kv") 97 | } 98 | 99 | /* 100 | 定时任务 101 | */ 102 | 103 | func (kv *ShardKV) pullConfig() { 104 | for { 105 | select { 106 | case <-kv.stopCh: 107 | return 108 | case <-kv.pullConfigTimer.C: 109 | //只有leader才能获取 110 | _, isLeader := kv.rf.GetState() 111 | if !isLeader { 112 | kv.pullConfigTimer.Reset(PullConfigInterval) 113 | break 114 | } 115 | kv.lock("pullconfig") 116 | lastNum := kv.config.Num 117 | kv.log("pull config,last: %d", lastNum) 118 | kv.unlock("pullconfig") 119 | 120 | config := kv.scc.Query(lastNum + 1) 121 | if config.Num == lastNum+1 { 122 | //找到新的config 123 | kv.log("pull config,new config:%+v", config) 124 | kv.lock("pullconfig") 125 | //这一个判断很关键,必须当前shard全部迁移完成才能获取下一个config 126 | if len(kv.inputShards) == 0 && kv.config.Num+1 == config.Num { 127 | kv.log("pull config,start config:%+v", config) 128 | kv.unlock("pullconfig") 129 | //请求该命令 130 | kv.rf.Start(config.Copy()) 131 | } else { 132 | kv.unlock("pullconfig") 133 | } 134 | } 135 | kv.pullConfigTimer.Reset(PullConfigInterval) 136 | } 137 | } 138 | } 139 | 140 | func (kv *ShardKV) ticker() { 141 | //处理applyCh 142 | go kv.handleApplyCh() 143 | //定时获取config信息 144 | go kv.pullConfig() 145 | //定时获取input shard(如果有的话) 146 | go kv.fetchShards() 147 | } 148 | 149 | /* 150 | 初始服务器 151 | */ 152 | 153 | // 154 | // servers[] contains the ports of the servers in this group. 
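// (the goroutines started from ticker() at the end of StartServer, namely
// handleApplyCh, pullConfig and fetchShards, are the long-running work
// mentioned below; pullConfig only ever asks the shardctrler for config
// Num+1, and only hands it to Raft once inputShards is empty, so
// configurations are adopted strictly one at a time.)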
155 | // 156 | // me is the index of the current server in servers[]. 157 | // 158 | // the k/v server should store snapshots through the underlying Raft 159 | // implementation, which should call persister.SaveStateAndSnapshot() to 160 | // atomically save the Raft state along with the snapshot. 161 | // 162 | // the k/v server should snapshot when Raft's saved state exceeds 163 | // maxraftstate bytes, in order to allow Raft to garbage-collect its 164 | // log. if maxraftstate is -1, you don't need to snapshot. 165 | // 166 | // gid is this group's GID, for interacting with the shardctrler. 167 | // 168 | // pass ctrlers[] to shardctrler.MakeClerk() so you can send 169 | // RPCs to the shardctrler. 170 | // 171 | // make_end(servername) turns a server name from a 172 | // Config.Groups[gid][i] into a labrpc.ClientEnd on which you can 173 | // send RPCs. You'll need this to send RPCs to other groups. 174 | // 175 | // look at client.go for examples of how to use ctrlers[] 176 | // and make_end() to send RPCs to the group owning a specific shard. 177 | // 178 | // StartServer() must return quickly, so it should start goroutines 179 | // for any long-running work. 180 | // 181 | func StartServer(servers []*labrpc.ClientEnd, me int, persister *raft.Persister, maxraftstate int, gid int, ctrlers []*labrpc.ClientEnd, make_end func(string) *labrpc.ClientEnd) *ShardKV { 182 | // call labgob.Register on structures you want 183 | // Go's RPC library to marshall/unmarshall. 184 | labgob.Register(Op{}) 185 | 186 | kv := new(ShardKV) 187 | kv.me = me 188 | kv.maxraftstate = maxraftstate 189 | kv.make_end = make_end 190 | kv.gid = gid 191 | kv.ctrlers = ctrlers 192 | 193 | // Your initialization code here. 194 | kv.persister = persister 195 | kv.scc = shardctrler.MakeClerk(kv.ctrlers) 196 | // Use something like this to talk to the shardctrler: 197 | // kv.mck = shardctrler.MakeClerk(kv.ctrlers) 198 | 199 | kv.applyCh = make(chan raft.ApplyMsg) 200 | kv.stopCh = make(chan struct{}) 201 | kv.rf = raft.Make(servers, me, persister, kv.applyCh) 202 | 203 | //初始化自身数据 204 | kv.data = [shardctrler.NShards]map[string]string{} 205 | for i, _ := range kv.data { 206 | kv.data[i] = make(map[string]string) 207 | } 208 | kv.lastApplies = [shardctrler.NShards]map[int64]int64{} 209 | for i, _ := range kv.lastApplies { 210 | kv.lastApplies[i] = make(map[int64]int64) 211 | } 212 | 213 | kv.inputShards = make(map[int]bool) 214 | kv.outputShards = make(map[int]map[int]MergeShardData) 215 | //kv.cleanOutputDataNotifyCh = make(map[string]chan struct{}) 216 | config := shardctrler.Config{ 217 | Num: 0, 218 | Shards: [shardctrler.NShards]int{}, 219 | Groups: map[int][]string{}, 220 | } 221 | kv.config = config 222 | kv.oldConfig = config 223 | 224 | //读取快照内容 225 | kv.readPersist(true, 0, 0, kv.persister.ReadSnapshot()) 226 | 227 | kv.commandNotifyCh = make(map[int64]chan CommandResult) 228 | //设置定时器 229 | kv.pullConfigTimer = time.NewTimer(PullConfigInterval) 230 | kv.pullShardsTimer = time.NewTimer(PullShardsInterval) 231 | 232 | kv.ticker() 233 | 234 | return kv 235 | } 236 | -------------------------------------------------------------------------------- /shardkv/server_apply.go: -------------------------------------------------------------------------------- 1 | package shardkv 2 | 3 | import ( 4 | "6.824/shardctrler" 5 | ) 6 | 7 | func (kv *ShardKV) notifyWaitCommand(reqId int64, err Err, value string) { 8 | if ch, ok := kv.commandNotifyCh[reqId]; ok { 9 | ch <- CommandResult{ 10 | Err: err, 11 | Value: value, 12 | } 13 
| } 14 | } 15 | 16 | func (kv *ShardKV) getValueByKey(key string) (err Err, value string) { 17 | if v, ok := kv.data[key2shard(key)][key]; ok { 18 | err = OK 19 | value = v 20 | } else { 21 | err = ErrNoKey 22 | value = "" 23 | } 24 | return 25 | } 26 | 27 | //判断能否执行客户端发来的命令 28 | func (kv *ShardKV) ProcessKeyReady(configNum int, key string) Err { 29 | //config不对 30 | if configNum == 0 || configNum != kv.config.Num { 31 | kv.log("process key ready err config.") 32 | return ErrWrongGroup 33 | } 34 | shardId := key2shard(key) 35 | //没有分配该shard 36 | if _, ok := kv.meShards[shardId]; !ok { 37 | kv.log("process key ready err shard.") 38 | return ErrWrongGroup 39 | } 40 | //正在迁移,这里有优化的空间,如果没有迁移完成,可以直接请求目标节点完成操作并返回,但是这样就太复杂了,这里简略了 41 | if _, ok := kv.inputShards[shardId]; ok { 42 | kv.log("process key ready err waitShard.") 43 | return ErrWrongGroup 44 | } 45 | return OK 46 | } 47 | 48 | //应用每一条命令 49 | func (kv *ShardKV) handleApplyCh() { 50 | for { 51 | select { 52 | case <-kv.stopCh: 53 | kv.log("get from stopCh,server-%v stop!", kv.me) 54 | return 55 | case cmd := <-kv.applyCh: 56 | //处理快照命令,读取快照的内容 57 | if cmd.SnapshotValid { 58 | kv.log("%v get install sn,%v %v", kv.me, cmd.SnapshotIndex, cmd.SnapshotTerm) 59 | kv.lock("waitApplyCh_sn") 60 | kv.readPersist(false, cmd.SnapshotTerm, cmd.SnapshotIndex, cmd.Snapshot) 61 | kv.unlock("waitApplyCh_sn") 62 | continue 63 | } 64 | //处理普通命令 65 | if !cmd.CommandValid { 66 | continue 67 | } 68 | cmdIdx := cmd.CommandIndex 69 | //处理不同的命令 70 | if op, ok := cmd.Command.(Op); ok { 71 | kv.handleOpCommand(cmdIdx, op) 72 | } else if config, ok := cmd.Command.(shardctrler.Config); ok { 73 | kv.handleConfigCommand(cmdIdx, config) 74 | } else if mergeData, ok := cmd.Command.(MergeShardData); ok { 75 | kv.handleMergeShardDataCommand(cmdIdx, mergeData) 76 | } else if cleanData, ok := cmd.Command.(CleanShardDataArgs); ok { 77 | kv.handleCleanShardDataCommand(cmdIdx, cleanData) 78 | } else { 79 | panic("apply command,NOT FOUND COMMDN!") 80 | } 81 | 82 | } 83 | 84 | } 85 | 86 | } 87 | 88 | //处理get、put、append命令 89 | func (kv *ShardKV) handleOpCommand(cmdIdx int, op Op) { 90 | kv.log("start apply command %v:%+v", cmdIdx, op) 91 | kv.lock("handleApplyCh") 92 | defer kv.unlock("handleApplyCh") 93 | shardId := key2shard(op.Key) 94 | if err := kv.ProcessKeyReady(op.ConfigNum, op.Key); err != OK { 95 | kv.notifyWaitCommand(op.ReqId, err, "") 96 | return 97 | } 98 | if op.Method == "Get" { 99 | //处理读 100 | e, v := kv.getValueByKey(op.Key) 101 | kv.notifyWaitCommand(op.ReqId, e, v) 102 | } else if op.Method == "Put" || op.Method == "Append" { 103 | //处理写 104 | //判断命令是否重复 105 | isRepeated := false 106 | if v, ok := kv.lastApplies[shardId][op.ClientId]; ok { 107 | if v == op.CommandId { 108 | isRepeated = true 109 | } 110 | } 111 | 112 | if !isRepeated { 113 | switch op.Method { 114 | case "Put": 115 | kv.data[shardId][op.Key] = op.Value 116 | kv.lastApplies[shardId][op.ClientId] = op.CommandId 117 | case "Append": 118 | e, v := kv.getValueByKey(op.Key) 119 | if e == ErrNoKey { 120 | //按put处理 121 | kv.data[shardId][op.Key] = op.Value 122 | kv.lastApplies[shardId][op.ClientId] = op.CommandId 123 | } else { 124 | //追加 125 | kv.data[shardId][op.Key] = v + op.Value 126 | kv.lastApplies[shardId][op.ClientId] = op.CommandId 127 | } 128 | default: 129 | panic("unknown method " + op.Method) 130 | } 131 | 132 | } 133 | //命令处理成功 134 | kv.notifyWaitCommand(op.ReqId, OK, "") 135 | } else { 136 | panic("unknown method " + op.Method) 137 | } 138 | 139 | kv.log("apply op: cmdId:%d, op: %+v, 
data:%v", cmdIdx, op, kv.data[shardId][op.Key]) 140 | //每应用一条命令,就判断是否进行持久化 141 | kv.saveSnapshot(cmdIdx) 142 | } 143 | 144 | //处理config命令,即更新config 145 | //主要是处理meshard、inputshard、outputshard 146 | func (kv *ShardKV) handleConfigCommand(cmdIdx int, config shardctrler.Config) { 147 | kv.log("start handle config %v:%+v", cmdIdx, config) 148 | kv.lock("handleApplyCh") 149 | defer kv.unlock("handleApplyCh") 150 | if config.Num <= kv.config.Num { 151 | kv.saveSnapshot(cmdIdx) 152 | return 153 | } 154 | 155 | if config.Num != kv.config.Num+1 { 156 | panic("applyConfig err") 157 | } 158 | 159 | oldConfig := kv.config.Copy() 160 | outputShards := make([]int, 0, shardctrler.NShards) 161 | inputShards := make([]int, 0, shardctrler.NShards) 162 | meShards := make([]int, 0, shardctrler.NShards) 163 | 164 | for i := 0; i < shardctrler.NShards; i++ { 165 | if config.Shards[i] == kv.gid { 166 | meShards = append(meShards, i) 167 | if oldConfig.Shards[i] != kv.gid { 168 | inputShards = append(inputShards, i) 169 | } 170 | } else { 171 | if oldConfig.Shards[i] == kv.gid { 172 | outputShards = append(outputShards, i) 173 | } 174 | } 175 | } 176 | 177 | //处理当前的shard 178 | kv.meShards = make(map[int]bool) 179 | for _, shardId := range meShards { 180 | kv.meShards[shardId] = true 181 | } 182 | 183 | //处理移出的shard 184 | //保存当前所处配置的所有移除的shard数据 185 | d := make(map[int]MergeShardData) 186 | for _, shardId := range outputShards { 187 | mergeShardData := MergeShardData{ 188 | ConfigNum: oldConfig.Num, 189 | ShardNum: shardId, 190 | Data: kv.data[shardId], 191 | CommandIndexes: kv.lastApplies[shardId], 192 | } 193 | d[shardId] = mergeShardData 194 | //初始化数据 195 | kv.data[shardId] = make(map[string]string) 196 | kv.lastApplies[shardId] = make(map[int64]int64) 197 | } 198 | kv.outputShards[oldConfig.Num] = d 199 | 200 | //处理移入的shard 201 | kv.inputShards = make(map[int]bool) 202 | if oldConfig.Num != 0 { 203 | for _, shardId := range inputShards { 204 | kv.inputShards[shardId] = true 205 | } 206 | } 207 | 208 | kv.config = config 209 | kv.oldConfig = oldConfig 210 | kv.log("apply op: cmdId:%d, config:%+v", cmdIdx, config) 211 | kv.saveSnapshot(cmdIdx) 212 | } 213 | 214 | //处理新的shard数据,即input shard 215 | func (kv *ShardKV) handleMergeShardDataCommand(cmdIdx int, data MergeShardData) { 216 | kv.log("start merge Shard Data %v:%+v", cmdIdx, data) 217 | kv.lock("handleApplyCh") 218 | defer kv.unlock("handleApplyCh") 219 | if kv.config.Num != data.ConfigNum+1 { 220 | return 221 | } 222 | 223 | if _, ok := kv.inputShards[data.ShardNum]; !ok { 224 | return 225 | } 226 | 227 | kv.data[data.ShardNum] = make(map[string]string) 228 | kv.lastApplies[data.ShardNum] = make(map[int64]int64) 229 | 230 | for k, v := range data.Data { 231 | kv.data[data.ShardNum][k] = v 232 | } 233 | for k, v := range data.CommandIndexes { 234 | kv.lastApplies[data.ShardNum][k] = v 235 | } 236 | delete(kv.inputShards, data.ShardNum) 237 | 238 | kv.log("apply op: cmdId:%d, mergeShardData:%+v", cmdIdx, data) 239 | kv.saveSnapshot(cmdIdx) 240 | go kv.callPeerCleanShardData(kv.oldConfig, data.ShardNum) 241 | } 242 | 243 | //处理已经迁移走的shard,即output shard 244 | func (kv *ShardKV) handleCleanShardDataCommand(cmdIdx int, data CleanShardDataArgs) { 245 | kv.log("start clean shard data %v:%+v", cmdIdx, data) 246 | kv.lock("handleApplyCh") 247 | defer kv.unlock("handleApplyCh") 248 | //如果要清除的shard确实是在outputShard中,且没有被清除,则需要清除 249 | if kv.OutputDataExist(data.ConfigNum, data.ShardNum) { 250 | delete(kv.outputShards[data.ConfigNum], data.ShardNum) 251 | } 252 | 253 | 
//通知等待协程 254 | //if ch, ok := kv.cleanOutputDataNotifyCh[fmt.Sprintf("%d%d", data.ConfigNum, data.ShardNum)]; ok { 255 | // ch <- struct{}{} 256 | //} 257 | 258 | kv.saveSnapshot(cmdIdx) 259 | } 260 | -------------------------------------------------------------------------------- /raft/raft_append_entries.go: -------------------------------------------------------------------------------- 1 | package raft 2 | 3 | import ( 4 | "time" 5 | ) 6 | 7 | type AppendEntriesArgs struct { 8 | Term int 9 | LeaderId int 10 | PrevLogIndex int 11 | PrevLogTerm int 12 | Entries []LogEntry 13 | LeaderCommit int 14 | } 15 | 16 | type AppendEntriesReply struct { 17 | Term int 18 | Success bool 19 | NextLogTerm int 20 | NextLogIndex int 21 | } 22 | 23 | //立马发送 24 | func (rf *Raft) resetAppendEntriesTimersZero() { 25 | for _, timer := range rf.appendEntriesTimers { 26 | timer.Stop() 27 | timer.Reset(0) 28 | } 29 | } 30 | 31 | func (rf *Raft) resetAppendEntriesTimerZero(peerId int) { 32 | rf.appendEntriesTimers[peerId].Stop() 33 | rf.appendEntriesTimers[peerId].Reset(0) 34 | } 35 | 36 | //重置单个timer 37 | func (rf *Raft) resetAppendEntriesTimer(peerId int) { 38 | rf.appendEntriesTimers[peerId].Stop() 39 | rf.appendEntriesTimers[peerId].Reset(HeartBeatInterval) 40 | } 41 | 42 | //判断当前raft的日志记录是否超过发送过来的日志记录 43 | func (rf *Raft) isOutOfArgsAppendEntries(args *AppendEntriesArgs) bool { 44 | argsLastLogIndex := args.PrevLogIndex + len(args.Entries) 45 | lastLogTerm, lastLogIndex := rf.getLastLogTermAndIndex() 46 | if lastLogTerm == args.Term && argsLastLogIndex < lastLogIndex { 47 | return true 48 | } 49 | return false 50 | } 51 | 52 | //获取当前存储位置的索引 53 | func (rf *Raft) getStoreIndexByLogIndex(logIndex int) int { 54 | storeIndex := logIndex - rf.lastSnapshotIndex 55 | if storeIndex < 0 { 56 | return -1 57 | } 58 | return storeIndex 59 | } 60 | 61 | //接收端处理rpc 62 | //主要进行三个处理: 63 | // 1. 判断任期 64 | // 2. 判断是否接收数据,success:数据全部接受,或者根本就没有数据 65 | // 3. 判断是否提交数据 66 | func (rf *Raft) AppendEntries(args *AppendEntriesArgs, reply *AppendEntriesReply) { 67 | rf.mu.Lock() 68 | DPrintf("%v receive a appendEntries: %+v", rf.me, args) 69 | reply.Term = rf.currentTerm 70 | if args.Term < rf.currentTerm { 71 | rf.mu.Unlock() 72 | return 73 | } 74 | rf.currentTerm = args.Term 75 | rf.changeRole(Role_Follower) 76 | rf.resetElectionTimer() 77 | 78 | _, lastLogIndex := rf.getLastLogTermAndIndex() 79 | //先判断两边,再判断刚好从快照开始,再判断中间的情况 80 | if args.PrevLogIndex < rf.lastSnapshotIndex { 81 | //1.要插入的前一个index小于快照index,几乎不会发生 82 | reply.Success = false 83 | reply.NextLogIndex = rf.lastSnapshotIndex + 1 84 | } else if args.PrevLogIndex > lastLogIndex { 85 | //2. 要插入的前一个index大于最后一个log的index,说明中间还有log 86 | reply.Success = false 87 | reply.NextLogIndex = lastLogIndex + 1 88 | } else if args.PrevLogIndex == rf.lastSnapshotIndex { 89 | //3. 要插入的前一个index刚好等于快照的index,说明可以全覆盖,但要判断是否是全覆盖 90 | if rf.isOutOfArgsAppendEntries(args) { 91 | reply.Success = false 92 | reply.NextLogIndex = 0 //=0代表着插入会导致乱序 93 | } else { 94 | reply.Success = true 95 | rf.logs = append(rf.logs[:1], args.Entries...) 96 | _, currentLogIndex := rf.getLastLogTermAndIndex() 97 | reply.NextLogIndex = currentLogIndex + 1 98 | } 99 | } else if args.PrevLogTerm == rf.logs[rf.getStoreIndexByLogIndex(args.PrevLogIndex)].Term { 100 | //4. 
中间的情况:索引处的两个term相同 101 | if rf.isOutOfArgsAppendEntries(args) { 102 | reply.Success = false 103 | reply.NextLogIndex = 0 104 | } else { 105 | reply.Success = true 106 | rf.logs = append(rf.logs[:rf.getStoreIndexByLogIndex(args.PrevLogIndex)+1], args.Entries...) 107 | _, currentLogIndex := rf.getLastLogTermAndIndex() 108 | reply.NextLogIndex = currentLogIndex + 1 109 | } 110 | } else { 111 | //5. 中间的情况:索引处的两个term不相同,跳过一个term 112 | term := rf.logs[rf.getStoreIndexByLogIndex(args.PrevLogIndex)].Term 113 | index := args.PrevLogIndex 114 | for index > rf.commitIndex && index > rf.lastSnapshotIndex && rf.logs[rf.getStoreIndexByLogIndex(index)].Term == term { 115 | index-- 116 | } 117 | reply.Success = false 118 | reply.NextLogIndex = index + 1 119 | } 120 | 121 | if reply.Success { 122 | DPrintf("%v current commit: %v, try to commit %v", rf.me, rf.commitIndex, args.LeaderCommit) 123 | if rf.commitIndex < args.LeaderCommit { 124 | rf.commitIndex = args.LeaderCommit 125 | rf.notifyApplyCh <- struct{}{} 126 | } 127 | } 128 | 129 | rf.persist() 130 | DPrintf("%v role: %v, get appendentries finish,args = %v,reply = %+v", rf.me, rf.role, *args, *reply) 131 | rf.mu.Unlock() 132 | 133 | } 134 | 135 | //获取要向指定节点发送的日志 136 | func (rf *Raft) getAppendLogs(peerId int) (prevLogIndex int, prevLogTerm int, logEntries []LogEntry) { 137 | nextIndex := rf.nextIndex[peerId] 138 | lastLogTerm, lastLogIndex := rf.getLastLogTermAndIndex() 139 | if nextIndex <= rf.lastSnapshotIndex || nextIndex > lastLogIndex { 140 | //没有要发送的log 141 | prevLogTerm = lastLogTerm 142 | prevLogIndex = lastLogIndex 143 | return 144 | } 145 | //这里一定要进行深拷贝,不然会和Snapshot()发生数据上的冲突 146 | //logEntries = rf.logs[nextIndex-rf.lastSnapshotIndex:] 147 | logEntries = make([]LogEntry, lastLogIndex-nextIndex+1) 148 | copy(logEntries, rf.logs[nextIndex-rf.lastSnapshotIndex:]) 149 | prevLogIndex = nextIndex - 1 150 | if prevLogIndex == rf.lastSnapshotIndex { 151 | prevLogTerm = rf.lastSnapshotTerm 152 | } else { 153 | prevLogTerm = rf.logs[prevLogIndex-rf.lastSnapshotIndex].Term 154 | } 155 | 156 | return 157 | } 158 | 159 | //尝试去提交日志 160 | //会依次判断,可以提交多个,但不能有间断 161 | func (rf *Raft) tryCommitLog() { 162 | _, lastLogIndex := rf.getLastLogTermAndIndex() 163 | hasCommit := false 164 | 165 | for i := rf.commitIndex + 1; i <= lastLogIndex; i++ { 166 | count := 0 167 | for _, m := range rf.matchIndex { 168 | if m >= i { 169 | count += 1 170 | //提交数达到多数派 171 | if count > len(rf.peers)/2 { 172 | rf.commitIndex = i 173 | hasCommit = true 174 | DPrintf("%v role: %v,commit index %v", rf.me, rf.role, i) 175 | break 176 | } 177 | } 178 | } 179 | if rf.commitIndex != i { 180 | break 181 | } 182 | } 183 | 184 | if hasCommit { 185 | rf.notifyApplyCh <- struct{}{} 186 | } 187 | } 188 | 189 | //发送端发送数据 190 | func (rf *Raft) sendAppendEntries(server int, args *AppendEntriesArgs, reply *AppendEntriesReply) { 191 | rpcTimer := time.NewTimer(RPCTimeout) 192 | defer rpcTimer.Stop() 193 | 194 | ch := make(chan bool, 1) 195 | go func() { 196 | //尝试10次 197 | for i := 0; i < 10 && !rf.killed(); i++ { 198 | ok := rf.peers[server].Call("Raft.AppendEntries", args, reply) 199 | if !ok { 200 | time.Sleep(time.Millisecond * 10) 201 | continue 202 | } else { 203 | ch <- ok 204 | return 205 | } 206 | } 207 | }() 208 | 209 | select { 210 | case <-rpcTimer.C: 211 | DPrintf("%v role: %v, send append entries to peer %v TIME OUT!!!", rf.me, rf.role, server) 212 | return 213 | case <-ch: 214 | return 215 | } 216 | } 217 | 218 | func (rf *Raft) sendAppendEntriesToPeer(peerId int) { 219 | if 
rf.killed() { 220 | return 221 | } 222 | 223 | rf.mu.Lock() 224 | if rf.role != Role_Leader { 225 | rf.resetAppendEntriesTimer(peerId) 226 | rf.mu.Unlock() 227 | return 228 | } 229 | DPrintf("%v send append entries to peer %v", rf.me, peerId) 230 | 231 | prevLogIndex, prevLogTerm, logEntries := rf.getAppendLogs(peerId) 232 | args := AppendEntriesArgs{ 233 | Term: rf.currentTerm, 234 | LeaderId: rf.me, 235 | PrevLogIndex: prevLogIndex, 236 | PrevLogTerm: prevLogTerm, 237 | Entries: logEntries, 238 | LeaderCommit: rf.commitIndex, 239 | } 240 | reply := AppendEntriesReply{} 241 | rf.resetAppendEntriesTimer(peerId) 242 | rf.mu.Unlock() 243 | 244 | rf.sendAppendEntries(peerId, &args, &reply) 245 | 246 | DPrintf("%v role: %v, send append entries to peer finish,%v,args = %+v,reply = %+v", rf.me, rf.role, peerId, args, reply) 247 | 248 | rf.mu.Lock() 249 | if reply.Term > rf.currentTerm { 250 | rf.changeRole(Role_Follower) 251 | rf.currentTerm = reply.Term 252 | rf.resetElectionTimer() 253 | rf.persist() 254 | rf.mu.Unlock() 255 | return 256 | } 257 | 258 | if rf.role != Role_Leader || rf.currentTerm != args.Term { 259 | rf.mu.Unlock() 260 | return 261 | } 262 | 263 | //响应:成功了,即:发送的数据全部接收了,或者根本没有数据 264 | if reply.Success { 265 | if reply.NextLogIndex > rf.nextIndex[peerId] { 266 | rf.nextIndex[peerId] = reply.NextLogIndex 267 | rf.matchIndex[peerId] = reply.NextLogIndex - 1 268 | } 269 | if len(args.Entries) > 0 && args.Entries[len(args.Entries)-1].Term == rf.currentTerm { 270 | //每个leader只能提交自己任期的日志 271 | rf.tryCommitLog() 272 | } 273 | rf.persist() 274 | rf.mu.Unlock() 275 | return 276 | } 277 | 278 | //响应:失败了,此时要修改nextIndex或者不做处理 279 | if reply.NextLogIndex != 0 { 280 | if reply.NextLogIndex > rf.lastSnapshotIndex { 281 | rf.nextIndex[peerId] = reply.NextLogIndex 282 | //为了一致性,立马发送 283 | rf.resetAppendEntriesTimerZero(peerId) 284 | } else { 285 | //发送快照 286 | go rf.sendInstallSnapshotToPeer(peerId) 287 | } 288 | rf.mu.Unlock() 289 | return 290 | } else { 291 | //reply.NextLogIndex = 0,此时如果插入会导致乱序,可以不进行处理 292 | } 293 | 294 | rf.mu.Unlock() 295 | return 296 | 297 | } 298 | -------------------------------------------------------------------------------- /main/test-mr.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # 4 | # map-reduce tests 5 | # 6 | 7 | # comment this out to run the tests without the Go race detector. 8 | RACE=-race 9 | 10 | if [[ "$OSTYPE" = "darwin"* ]] 11 | then 12 | if go version | grep 'go1.17.[012345]' 13 | then 14 | # -race with plug-ins on x86 MacOS 12 with 15 | # go1.17 before 1.17.6 sometimes crash. 16 | RACE= 17 | echo '*** Turning off -race since it may not work on a Mac' 18 | echo ' with ' `go version` 19 | fi 20 | fi 21 | 22 | TIMEOUT=timeout 23 | if timeout 2s sleep 1 > /dev/null 2>&1 24 | then 25 | : 26 | else 27 | if gtimeout 2s sleep 1 > /dev/null 2>&1 28 | then 29 | TIMEOUT=gtimeout 30 | else 31 | # no timeout command 32 | TIMEOUT= 33 | echo '*** Cannot find timeout command; proceeding without timeouts.' 34 | fi 35 | fi 36 | if [ "$TIMEOUT" != "" ] 37 | then 38 | TIMEOUT+=" -k 2s 180s " 39 | fi 40 | 41 | # run the test in a fresh sub-directory. 42 | rm -rf mr-tmp 43 | mkdir mr-tmp || exit 1 44 | cd mr-tmp || exit 1 45 | rm -f mr-* 46 | 47 | # make sure software is freshly built. 48 | (cd ../../mrapps && go clean) 49 | (cd .. 
&& go clean) 50 | (cd ../../mrapps && go build $RACE -buildmode=plugin wc.go) || exit 1 51 | (cd ../../mrapps && go build $RACE -buildmode=plugin indexer.go) || exit 1 52 | (cd ../../mrapps && go build $RACE -buildmode=plugin mtiming.go) || exit 1 53 | (cd ../../mrapps && go build $RACE -buildmode=plugin rtiming.go) || exit 1 54 | (cd ../../mrapps && go build $RACE -buildmode=plugin jobcount.go) || exit 1 55 | (cd ../../mrapps && go build $RACE -buildmode=plugin early_exit.go) || exit 1 56 | (cd ../../mrapps && go build $RACE -buildmode=plugin crash.go) || exit 1 57 | (cd ../../mrapps && go build $RACE -buildmode=plugin nocrash.go) || exit 1 58 | (cd .. && go build $RACE mrcoordinator.go) || exit 1 59 | (cd .. && go build $RACE mrworker.go) || exit 1 60 | (cd .. && go build $RACE mrsequential.go) || exit 1 61 | 62 | failed_any=0 63 | 64 | ######################################################### 65 | # first word-count 66 | 67 | # generate the correct output 68 | ../mrsequential ../../mrapps/wc.so ../pg*txt || exit 1 69 | sort mr-out-0 > mr-correct-wc.txt 70 | rm -f mr-out* 71 | 72 | echo '***' Starting wc test. 73 | 74 | $TIMEOUT ../mrcoordinator ../pg*txt & 75 | pid=$! 76 | 77 | # give the coordinator time to create the sockets. 78 | sleep 1 79 | 80 | # start multiple workers. 81 | $TIMEOUT ../mrworker ../../mrapps/wc.so & 82 | $TIMEOUT ../mrworker ../../mrapps/wc.so & 83 | $TIMEOUT ../mrworker ../../mrapps/wc.so & 84 | 85 | # wait for the coordinator to exit. 86 | wait $pid 87 | 88 | # since workers are required to exit when a job is completely finished, 89 | # and not before, that means the job has finished. 90 | sort mr-out* | grep . > mr-wc-all 91 | if cmp mr-wc-all mr-correct-wc.txt 92 | then 93 | echo '' wc test: PASS 94 | else 95 | echo '---' wc output is not the same as mr-correct-wc.txt 96 | echo '---' wc test: FAIL 97 | failed_any=1 98 | fi 99 | 100 | # wait for remaining workers and coordinator to exit. 101 | wait 102 | 103 | ######################################################### 104 | # now indexer 105 | rm -f mr-* 106 | 107 | # generate the correct output 108 | ../mrsequential ../../mrapps/indexer.so ../pg*txt || exit 1 109 | sort mr-out-0 > mr-correct-indexer.txt 110 | rm -f mr-out* 111 | 112 | echo '***' Starting indexer test. 113 | 114 | $TIMEOUT ../mrcoordinator ../pg*txt & 115 | sleep 1 116 | 117 | # start multiple workers 118 | $TIMEOUT ../mrworker ../../mrapps/indexer.so & 119 | $TIMEOUT ../mrworker ../../mrapps/indexer.so 120 | 121 | sort mr-out* | grep . > mr-indexer-all 122 | if cmp mr-indexer-all mr-correct-indexer.txt 123 | then 124 | echo '---' indexer test: PASS 125 | else 126 | echo '---' indexer output is not the same as mr-correct-indexer.txt 127 | echo '---' indexer test: FAIL 128 | failed_any=1 129 | fi 130 | 131 | wait 132 | 133 | ######################################################### 134 | echo '***' Starting map parallelism test. 
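# Note (inferred from the checks below): the mtiming plugin is assumed to write one
# 'times-' line per worker and a 'parallel ... N' line recording how many map tasks
# overlapped; the test expects exactly two workers and an overlap count of 2.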
135 | 136 | rm -f mr-* 137 | 138 | $TIMEOUT ../mrcoordinator ../pg*txt & 139 | sleep 1 140 | 141 | $TIMEOUT ../mrworker ../../mrapps/mtiming.so & 142 | $TIMEOUT ../mrworker ../../mrapps/mtiming.so 143 | 144 | NT=`cat mr-out* | grep '^times-' | wc -l | sed 's/ //g'` 145 | if [ "$NT" != "2" ] 146 | then 147 | echo '---' saw "$NT" workers rather than 2 148 | echo '---' map parallelism test: FAIL 149 | failed_any=1 150 | fi 151 | 152 | if cat mr-out* | grep '^parallel.* 2' > /dev/null 153 | then 154 | echo '---' map parallelism test: PASS 155 | else 156 | echo '---' map workers did not run in parallel 157 | echo '---' map parallelism test: FAIL 158 | failed_any=1 159 | fi 160 | 161 | wait 162 | 163 | 164 | ######################################################### 165 | echo '***' Starting reduce parallelism test. 166 | 167 | rm -f mr-* 168 | 169 | $TIMEOUT ../mrcoordinator ../pg*txt & 170 | sleep 1 171 | 172 | $TIMEOUT ../mrworker ../../mrapps/rtiming.so & 173 | $TIMEOUT ../mrworker ../../mrapps/rtiming.so 174 | 175 | NT=`cat mr-out* | grep '^[a-z] 2' | wc -l | sed 's/ //g'` 176 | if [ "$NT" -lt "2" ] 177 | then 178 | echo '---' too few parallel reduces. 179 | echo '---' reduce parallelism test: FAIL 180 | failed_any=1 181 | else 182 | echo '---' reduce parallelism test: PASS 183 | fi 184 | 185 | wait 186 | 187 | ######################################################### 188 | echo '***' Starting job count test. 189 | 190 | rm -f mr-* 191 | 192 | $TIMEOUT ../mrcoordinator ../pg*txt & 193 | sleep 1 194 | 195 | $TIMEOUT ../mrworker ../../mrapps/jobcount.so & 196 | $TIMEOUT ../mrworker ../../mrapps/jobcount.so 197 | $TIMEOUT ../mrworker ../../mrapps/jobcount.so & 198 | $TIMEOUT ../mrworker ../../mrapps/jobcount.so 199 | 200 | NT=`cat mr-out* | awk '{print $2}'` 201 | if [ "$NT" -eq "8" ] 202 | then 203 | echo '---' job count test: PASS 204 | else 205 | echo '---' map jobs ran incorrect number of times "($NT != 8)" 206 | echo '---' job count test: FAIL 207 | failed_any=1 208 | fi 209 | 210 | wait 211 | 212 | ######################################################### 213 | # test whether any worker or coordinator exits before the 214 | # task has completed (i.e., all output files have been finalized) 215 | rm -f mr-* 216 | 217 | echo '***' Starting early exit test. 218 | 219 | DF=anydone$$ 220 | rm -f $DF 221 | 222 | ($TIMEOUT ../mrcoordinator ../pg*txt ; touch $DF) & 223 | 224 | # give the coordinator time to create the sockets. 225 | sleep 1 226 | 227 | # start multiple workers. 228 | ($TIMEOUT ../mrworker ../../mrapps/early_exit.so ; touch $DF) & 229 | ($TIMEOUT ../mrworker ../../mrapps/early_exit.so ; touch $DF) & 230 | ($TIMEOUT ../mrworker ../../mrapps/early_exit.so ; touch $DF) & 231 | 232 | # wait for any of the coord or workers to exit. 233 | # `jobs` ensures that any completed old processes from other tests 234 | # are not waited upon. 235 | jobs &> /dev/null 236 | if [[ "$OSTYPE" = "darwin"* ]] 237 | then 238 | # bash on the Mac doesn't have wait -n 239 | while [ ! -e $DF ] 240 | do 241 | sleep 0.2 242 | done 243 | else 244 | # the -n causes wait to wait for just one child process, 245 | # rather than waiting for all to finish. 246 | wait -n 247 | fi 248 | 249 | rm -f $DF 250 | 251 | # a process has exited. this means that the output should be finalized 252 | # otherwise, either a worker or the coordinator exited early 253 | sort mr-out* | grep . > mr-wc-all-initial 254 | 255 | # wait for remaining workers and coordinator to exit. 
256 | wait 257 | 258 | # compare initial and final outputs 259 | sort mr-out* | grep . > mr-wc-all-final 260 | if cmp mr-wc-all-final mr-wc-all-initial 261 | then 262 | echo '---' early exit test: PASS 263 | else 264 | echo '---' output changed after first worker exited 265 | echo '---' early exit test: FAIL 266 | failed_any=1 267 | fi 268 | rm -f mr-* 269 | 270 | ######################################################### 271 | echo '***' Starting crash test. 272 | 273 | # generate the correct output 274 | ../mrsequential ../../mrapps/nocrash.so ../pg*txt || exit 1 275 | sort mr-out-0 > mr-correct-crash.txt 276 | rm -f mr-out* 277 | 278 | rm -f mr-done 279 | ($TIMEOUT ../mrcoordinator ../pg*txt ; touch mr-done ) & 280 | sleep 1 281 | 282 | # start multiple workers 283 | $TIMEOUT ../mrworker ../../mrapps/crash.so & 284 | 285 | # mimic rpc.go's coordinatorSock() 286 | SOCKNAME=/var/tmp/824-mr-`id -u` 287 | 288 | ( while [ -e $SOCKNAME -a ! -f mr-done ] 289 | do 290 | $TIMEOUT ../mrworker ../../mrapps/crash.so 291 | sleep 1 292 | done ) & 293 | 294 | ( while [ -e $SOCKNAME -a ! -f mr-done ] 295 | do 296 | $TIMEOUT ../mrworker ../../mrapps/crash.so 297 | sleep 1 298 | done ) & 299 | 300 | while [ -e $SOCKNAME -a ! -f mr-done ] 301 | do 302 | $TIMEOUT ../mrworker ../../mrapps/crash.so 303 | sleep 1 304 | done 305 | 306 | wait 307 | 308 | rm $SOCKNAME 309 | sort mr-out* | grep . > mr-crash-all 310 | if cmp mr-crash-all mr-correct-crash.txt 311 | then 312 | echo '---' crash test: PASS 313 | else 314 | echo '---' crash output is not the same as mr-correct-crash.txt 315 | echo '---' crash test: FAIL 316 | failed_any=1 317 | fi 318 | 319 | ######################################################### 320 | if [ $failed_any -eq 0 ]; then 321 | echo '***' PASSED ALL TESTS 322 | else 323 | echo '***' FAILED SOME TESTS 324 | exit 1 325 | fi 326 | -------------------------------------------------------------------------------- /shardctrler/config.go: -------------------------------------------------------------------------------- 1 | package shardctrler 2 | 3 | import "6.824/labrpc" 4 | import "6.824/raft" 5 | import "testing" 6 | import "os" 7 | 8 | // import "log" 9 | import crand "crypto/rand" 10 | import "math/rand" 11 | import "encoding/base64" 12 | import "sync" 13 | import "runtime" 14 | import "time" 15 | 16 | func randstring(n int) string { 17 | b := make([]byte, 2*n) 18 | crand.Read(b) 19 | s := base64.URLEncoding.EncodeToString(b) 20 | return s[0:n] 21 | } 22 | 23 | // Randomize server handles 24 | func random_handles(kvh []*labrpc.ClientEnd) []*labrpc.ClientEnd { 25 | sa := make([]*labrpc.ClientEnd, len(kvh)) 26 | copy(sa, kvh) 27 | for i := range sa { 28 | j := rand.Intn(i + 1) 29 | sa[i], sa[j] = sa[j], sa[i] 30 | } 31 | return sa 32 | } 33 | 34 | type config struct { 35 | mu sync.Mutex 36 | t *testing.T 37 | net *labrpc.Network 38 | n int 39 | servers []*ShardCtrler 40 | saved []*raft.Persister 41 | endnames [][]string // names of each server's sending ClientEnds 42 | clerks map[*Clerk][]string 43 | nextClientId int 44 | start time.Time // time at which make_config() was called 45 | } 46 | 47 | func (cfg *config) checkTimeout() { 48 | // enforce a two minute real-time limit on each test 49 | if !cfg.t.Failed() && time.Since(cfg.start) > 120*time.Second { 50 | cfg.t.Fatal("test took longer than 120 seconds") 51 | } 52 | } 53 | 54 | func (cfg *config) cleanup() { 55 | cfg.mu.Lock() 56 | defer cfg.mu.Unlock() 57 | for i := 0; i < len(cfg.servers); i++ { 58 | if cfg.servers[i] != nil { 59 | 
cfg.servers[i].Kill() 60 | } 61 | } 62 | cfg.net.Cleanup() 63 | cfg.checkTimeout() 64 | } 65 | 66 | // Maximum log size across all servers 67 | func (cfg *config) LogSize() int { 68 | logsize := 0 69 | for i := 0; i < cfg.n; i++ { 70 | n := cfg.saved[i].RaftStateSize() 71 | if n > logsize { 72 | logsize = n 73 | } 74 | } 75 | return logsize 76 | } 77 | 78 | // attach server i to servers listed in to 79 | // caller must hold cfg.mu 80 | func (cfg *config) connectUnlocked(i int, to []int) { 81 | // log.Printf("connect peer %d to %v\n", i, to) 82 | 83 | // outgoing socket files 84 | for j := 0; j < len(to); j++ { 85 | endname := cfg.endnames[i][to[j]] 86 | cfg.net.Enable(endname, true) 87 | } 88 | 89 | // incoming socket files 90 | for j := 0; j < len(to); j++ { 91 | endname := cfg.endnames[to[j]][i] 92 | cfg.net.Enable(endname, true) 93 | } 94 | } 95 | 96 | func (cfg *config) connect(i int, to []int) { 97 | cfg.mu.Lock() 98 | defer cfg.mu.Unlock() 99 | cfg.connectUnlocked(i, to) 100 | } 101 | 102 | // detach server i from the servers listed in from 103 | // caller must hold cfg.mu 104 | func (cfg *config) disconnectUnlocked(i int, from []int) { 105 | // log.Printf("disconnect peer %d from %v\n", i, from) 106 | 107 | // outgoing socket files 108 | for j := 0; j < len(from); j++ { 109 | if cfg.endnames[i] != nil { 110 | endname := cfg.endnames[i][from[j]] 111 | cfg.net.Enable(endname, false) 112 | } 113 | } 114 | 115 | // incoming socket files 116 | for j := 0; j < len(from); j++ { 117 | if cfg.endnames[j] != nil { 118 | endname := cfg.endnames[from[j]][i] 119 | cfg.net.Enable(endname, false) 120 | } 121 | } 122 | } 123 | 124 | func (cfg *config) disconnect(i int, from []int) { 125 | cfg.mu.Lock() 126 | defer cfg.mu.Unlock() 127 | cfg.disconnectUnlocked(i, from) 128 | } 129 | 130 | func (cfg *config) All() []int { 131 | all := make([]int, cfg.n) 132 | for i := 0; i < cfg.n; i++ { 133 | all[i] = i 134 | } 135 | return all 136 | } 137 | 138 | func (cfg *config) ConnectAll() { 139 | cfg.mu.Lock() 140 | defer cfg.mu.Unlock() 141 | for i := 0; i < cfg.n; i++ { 142 | cfg.connectUnlocked(i, cfg.All()) 143 | } 144 | } 145 | 146 | // Sets up 2 partitions with connectivity between servers in each partition. 147 | func (cfg *config) partition(p1 []int, p2 []int) { 148 | cfg.mu.Lock() 149 | defer cfg.mu.Unlock() 150 | // log.Printf("partition servers into: %v %v\n", p1, p2) 151 | for i := 0; i < len(p1); i++ { 152 | cfg.disconnectUnlocked(p1[i], p2) 153 | cfg.connectUnlocked(p1[i], p1) 154 | } 155 | for i := 0; i < len(p2); i++ { 156 | cfg.disconnectUnlocked(p2[i], p1) 157 | cfg.connectUnlocked(p2[i], p2) 158 | } 159 | } 160 | 161 | // Create a clerk with clerk specific server names. 162 | // Give it connections to all of the servers, but for 163 | // now enable only connections to servers in to[]. 164 | func (cfg *config) makeClient(to []int) *Clerk { 165 | cfg.mu.Lock() 166 | defer cfg.mu.Unlock() 167 | 168 | // a fresh set of ClientEnds. 
169 | ends := make([]*labrpc.ClientEnd, cfg.n) 170 | endnames := make([]string, cfg.n) 171 | for j := 0; j < cfg.n; j++ { 172 | endnames[j] = randstring(20) 173 | ends[j] = cfg.net.MakeEnd(endnames[j]) 174 | cfg.net.Connect(endnames[j], j) 175 | } 176 | 177 | ck := MakeClerk(random_handles(ends)) 178 | cfg.clerks[ck] = endnames 179 | cfg.nextClientId++ 180 | cfg.ConnectClientUnlocked(ck, to) 181 | return ck 182 | } 183 | 184 | func (cfg *config) deleteClient(ck *Clerk) { 185 | cfg.mu.Lock() 186 | defer cfg.mu.Unlock() 187 | 188 | v := cfg.clerks[ck] 189 | for i := 0; i < len(v); i++ { 190 | os.Remove(v[i]) 191 | } 192 | delete(cfg.clerks, ck) 193 | } 194 | 195 | // caller should hold cfg.mu 196 | func (cfg *config) ConnectClientUnlocked(ck *Clerk, to []int) { 197 | // log.Printf("ConnectClient %v to %v\n", ck, to) 198 | endnames := cfg.clerks[ck] 199 | for j := 0; j < len(to); j++ { 200 | s := endnames[to[j]] 201 | cfg.net.Enable(s, true) 202 | } 203 | } 204 | 205 | func (cfg *config) ConnectClient(ck *Clerk, to []int) { 206 | cfg.mu.Lock() 207 | defer cfg.mu.Unlock() 208 | cfg.ConnectClientUnlocked(ck, to) 209 | } 210 | 211 | // caller should hold cfg.mu 212 | func (cfg *config) DisconnectClientUnlocked(ck *Clerk, from []int) { 213 | // log.Printf("DisconnectClient %v from %v\n", ck, from) 214 | endnames := cfg.clerks[ck] 215 | for j := 0; j < len(from); j++ { 216 | s := endnames[from[j]] 217 | cfg.net.Enable(s, false) 218 | } 219 | } 220 | 221 | func (cfg *config) DisconnectClient(ck *Clerk, from []int) { 222 | cfg.mu.Lock() 223 | defer cfg.mu.Unlock() 224 | cfg.DisconnectClientUnlocked(ck, from) 225 | } 226 | 227 | // Shutdown a server by isolating it 228 | func (cfg *config) ShutdownServer(i int) { 229 | cfg.mu.Lock() 230 | defer cfg.mu.Unlock() 231 | 232 | cfg.disconnectUnlocked(i, cfg.All()) 233 | 234 | // disable client connections to the server. 235 | // it's important to do this before creating 236 | // the new Persister in saved[i], to avoid 237 | // the possibility of the server returning a 238 | // positive reply to an Append but persisting 239 | // the result in the superseded Persister. 240 | cfg.net.DeleteServer(i) 241 | 242 | // a fresh persister, in case old instance 243 | // continues to update the Persister. 244 | // but copy old persister's content so that we always 245 | // pass Make() the last persisted state. 246 | if cfg.saved[i] != nil { 247 | cfg.saved[i] = cfg.saved[i].Copy() 248 | } 249 | 250 | kv := cfg.servers[i] 251 | if kv != nil { 252 | cfg.mu.Unlock() 253 | kv.Kill() 254 | cfg.mu.Lock() 255 | cfg.servers[i] = nil 256 | } 257 | } 258 | 259 | // If restart servers, first call ShutdownServer 260 | func (cfg *config) StartServer(i int) { 261 | cfg.mu.Lock() 262 | 263 | // a fresh set of outgoing ClientEnd names. 264 | cfg.endnames[i] = make([]string, cfg.n) 265 | for j := 0; j < cfg.n; j++ { 266 | cfg.endnames[i][j] = randstring(20) 267 | } 268 | 269 | // a fresh set of ClientEnds. 270 | ends := make([]*labrpc.ClientEnd, cfg.n) 271 | for j := 0; j < cfg.n; j++ { 272 | ends[j] = cfg.net.MakeEnd(cfg.endnames[i][j]) 273 | cfg.net.Connect(cfg.endnames[i][j], j) 274 | } 275 | 276 | // a fresh persister, so old instance doesn't overwrite 277 | // new instance's persisted state. 278 | // give the fresh persister a copy of the old persister's 279 | // state, so that the spec is that we pass StartKVServer() 280 | // the last persisted state. 
281 | if cfg.saved[i] != nil { 282 | cfg.saved[i] = cfg.saved[i].Copy() 283 | } else { 284 | cfg.saved[i] = raft.MakePersister() 285 | } 286 | 287 | cfg.mu.Unlock() 288 | 289 | cfg.servers[i] = StartServer(ends, i, cfg.saved[i]) 290 | 291 | kvsvc := labrpc.MakeService(cfg.servers[i]) 292 | rfsvc := labrpc.MakeService(cfg.servers[i].rf) 293 | srv := labrpc.MakeServer() 294 | srv.AddService(kvsvc) 295 | srv.AddService(rfsvc) 296 | cfg.net.AddServer(i, srv) 297 | } 298 | 299 | func (cfg *config) Leader() (bool, int) { 300 | cfg.mu.Lock() 301 | defer cfg.mu.Unlock() 302 | 303 | for i := 0; i < cfg.n; i++ { 304 | if cfg.servers[i] != nil { 305 | _, is_leader := cfg.servers[i].rf.GetState() 306 | if is_leader { 307 | return true, i 308 | } 309 | } 310 | } 311 | return false, 0 312 | } 313 | 314 | // Partition servers into 2 groups and put current leader in minority 315 | func (cfg *config) make_partition() ([]int, []int) { 316 | _, l := cfg.Leader() 317 | p1 := make([]int, cfg.n/2+1) 318 | p2 := make([]int, cfg.n/2) 319 | j := 0 320 | for i := 0; i < cfg.n; i++ { 321 | if i != l { 322 | if j < len(p1) { 323 | p1[j] = i 324 | } else { 325 | p2[j-len(p1)] = i 326 | } 327 | j++ 328 | } 329 | } 330 | p2[len(p2)-1] = l 331 | return p1, p2 332 | } 333 | 334 | func make_config(t *testing.T, n int, unreliable bool) *config { 335 | runtime.GOMAXPROCS(4) 336 | cfg := &config{} 337 | cfg.t = t 338 | cfg.net = labrpc.MakeNetwork() 339 | cfg.n = n 340 | cfg.servers = make([]*ShardCtrler, cfg.n) 341 | cfg.saved = make([]*raft.Persister, cfg.n) 342 | cfg.endnames = make([][]string, cfg.n) 343 | cfg.clerks = make(map[*Clerk][]string) 344 | cfg.nextClientId = cfg.n + 1000 // client ids start 1000 above the highest serverid 345 | cfg.start = time.Now() 346 | 347 | // create a full set of KV servers. 348 | for i := 0; i < cfg.n; i++ { 349 | cfg.StartServer(i) 350 | } 351 | 352 | cfg.ConnectAll() 353 | 354 | cfg.net.Reliable(!unreliable) 355 | 356 | return cfg 357 | } 358 | -------------------------------------------------------------------------------- /kvraft/server.go: -------------------------------------------------------------------------------- 1 | package kvraft 2 | 3 | import ( 4 | "6.824/labgob" 5 | "6.824/labrpc" 6 | "6.824/raft" 7 | "bytes" 8 | "log" 9 | "sync" 10 | "sync/atomic" 11 | "time" 12 | ) 13 | 14 | const WaitCmdTimeOut = time.Millisecond * 500 // cmd执行超过这个时间,就返回timeout 15 | const MaxLockTime = time.Millisecond * 10 // debug 16 | 17 | type Op struct { 18 | // Your definitions here. 19 | // Field names must start with capital letters, 20 | // otherwise RPC will break. 21 | ReqId int64 //用来标识commandNotify 22 | CommandId int64 23 | ClientId int64 24 | Key string 25 | Value string 26 | Method string 27 | } 28 | 29 | type CommandResult struct { 30 | Err Err 31 | Value string 32 | } 33 | 34 | type KVServer struct { 35 | mu sync.Mutex 36 | me int 37 | rf *raft.Raft 38 | applyCh chan raft.ApplyMsg 39 | dead int32 // set by Kill() 40 | stopCh chan struct{} 41 | 42 | maxraftstate int // snapshot if log grows this big 43 | 44 | // Your definitions here. 
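	// Field roles, as inferred from their use elsewhere in this file:
	// commandNotifyCh maps a request id to the channel its RPC handler waits on in waitCmd();
	// lastApplies records the last applied CommandId per ClientId, for duplicate detection;
	// data is the in-memory key/value store; persister exposes Raft state size and snapshots;
	// lockStartTime/lockEndTime/lockMsg back the instrumented lock()/unlock() helpers.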
45 | commandNotifyCh map[int64]chan CommandResult 46 | lastApplies map[int64]int64 //k-v:ClientId-CommandId 47 | data map[string]string 48 | 49 | //持久化 50 | persister *raft.Persister 51 | 52 | //用于互斥锁 53 | lockStartTime time.Time 54 | lockEndTime time.Time 55 | lockMsg string 56 | } 57 | 58 | //自定义锁 59 | func (kv *KVServer) lock(msg string) { 60 | kv.mu.Lock() 61 | kv.lockStartTime = time.Now() 62 | kv.lockMsg = msg 63 | } 64 | 65 | func (kv *KVServer) unlock(msg string) { 66 | kv.lockEndTime = time.Now() 67 | duration := kv.lockEndTime.Sub(kv.lockStartTime) 68 | kv.lockMsg = "" 69 | kv.mu.Unlock() 70 | if duration > MaxLockTime { 71 | DPrintf("lock too long:%s:%s\n", msg, duration) 72 | } 73 | } 74 | 75 | func (kv *KVServer) removeCh(reqId int64) { 76 | kv.lock("removeCh") 77 | defer kv.unlock("removeCh") 78 | delete(kv.commandNotifyCh, reqId) 79 | } 80 | 81 | //调用start向raft请求命令 82 | func (kv *KVServer) waitCmd(op Op) (res CommandResult) { 83 | DPrintf("server %v wait cmd start,Op: %+v.\n", kv.me, op) 84 | 85 | //提交命令,其实这里的start要改,一个kv数据库get命令可以发生在所有节点上 86 | index, term, isLeader := kv.rf.Start(op) 87 | if !isLeader { 88 | res.Err = ErrWrongLeader 89 | return 90 | } 91 | 92 | kv.lock("waitCmd") 93 | ch := make(chan CommandResult, 1) 94 | kv.commandNotifyCh[op.ReqId] = ch 95 | kv.unlock("waitCmd") 96 | DPrintf("start cmd: index:%d, term:%d, op:%+v", index, term, op) 97 | 98 | t := time.NewTimer(WaitCmdTimeOut) 99 | defer t.Stop() 100 | select { 101 | case <-kv.stopCh: 102 | DPrintf("stop ch waitCmd") 103 | kv.removeCh(op.ReqId) 104 | res.Err = ErrServer 105 | return 106 | case res = <-ch: 107 | kv.removeCh(op.ReqId) 108 | return 109 | case <-t.C: 110 | kv.removeCh(op.ReqId) 111 | res.Err = ErrTimeOut 112 | return 113 | 114 | } 115 | } 116 | 117 | //处理Get rpc 118 | func (kv *KVServer) Get(args *GetArgs, reply *GetReply) { 119 | // Your code here. 120 | DPrintf("server %v in rpc Get,args: %+v", kv.me, args) 121 | 122 | _, isLeader := kv.rf.GetState() 123 | if !isLeader { 124 | reply.Err = ErrWrongLeader 125 | return 126 | } 127 | 128 | op := Op{ 129 | ReqId: nrand(), 130 | ClientId: args.ClientId, 131 | CommandId: args.CommandId, 132 | Key: args.Key, 133 | Method: "Get", 134 | } 135 | //等待命令执行 136 | res := kv.waitCmd(op) 137 | reply.Err = res.Err 138 | reply.Value = res.Value 139 | 140 | DPrintf("server %v in rpc Get,args:%+v,reply:%+v", kv.me, args, reply) 141 | } 142 | 143 | //处理Put rpc 144 | func (kv *KVServer) PutAppend(args *PutAppendArgs, reply *PutAppendReply) { 145 | // Your code here. 146 | DPrintf("server %v in rpc PutAppend,args: %+v", kv.me, args) 147 | _, isLeader := kv.rf.GetState() 148 | if !isLeader { 149 | reply.Err = ErrWrongLeader 150 | return 151 | } 152 | op := Op{ 153 | ReqId: nrand(), 154 | ClientId: args.ClientId, 155 | CommandId: args.CommandId, 156 | Key: args.Key, 157 | Value: args.Value, 158 | Method: args.Op, 159 | } 160 | //等待命令执行 161 | res := kv.waitCmd(op) 162 | reply.Err = res.Err 163 | 164 | DPrintf("server %v in rpc PutAppend,args:%+v,reply:%+v", kv.me, args, reply) 165 | } 166 | 167 | // 168 | // the tester calls Kill() when a KVServer instance won't 169 | // be needed again. for your convenience, we supply 170 | // code to set rf.dead (without needing a lock), 171 | // and a killed() method to test rf.dead in 172 | // long-running loops. you can also add your own 173 | // code to Kill(). you're not required to do anything 174 | // about this, but it may be convenient (for example) 175 | // to suppress debug output from a Kill()ed instance. 
176 | // 177 | func (kv *KVServer) Kill() { 178 | atomic.StoreInt32(&kv.dead, 1) 179 | kv.rf.Kill() 180 | close(kv.stopCh) 181 | // Your code here, if desired. 182 | } 183 | 184 | func (kv *KVServer) killed() bool { 185 | z := atomic.LoadInt32(&kv.dead) 186 | return z == 1 187 | } 188 | 189 | //保存快照 190 | func (kv *KVServer) saveSnapshot(logIndex int) { 191 | if kv.maxraftstate == -1 || kv.persister.RaftStateSize() < kv.maxraftstate { 192 | return 193 | } 194 | 195 | //生成快照数据 196 | w := new(bytes.Buffer) 197 | e := labgob.NewEncoder(w) 198 | if err := e.Encode(kv.data); err != nil { 199 | panic(err) 200 | } 201 | if err := e.Encode(kv.lastApplies); err != nil { 202 | panic(err) 203 | } 204 | data := w.Bytes() 205 | kv.rf.Snapshot(logIndex, data) 206 | } 207 | 208 | //读取快照 209 | //两处调用:初始化阶段;收到Snapshot命令,即接收了leader的Snapshot 210 | func (kv *KVServer) readPersist(isInit bool, snapshotTerm, snapshotIndex int, data []byte) { 211 | if data == nil || len(data) < 1 { 212 | return 213 | } 214 | //只要不是初始化调用,即如果收到一个Snapshot命令,就要执行该函数 215 | if !isInit { 216 | res := kv.rf.CondInstallSnapshot(snapshotTerm, snapshotIndex, data) 217 | if !res { 218 | log.Panicln("kv read persist err in CondInstallSnapshot!") 219 | return 220 | } 221 | } 222 | //对数据进行同步 223 | r := bytes.NewBuffer(data) 224 | d := labgob.NewDecoder(r) 225 | var kvData map[string]string 226 | var lastApplies map[int64]int64 227 | 228 | if d.Decode(&kvData) != nil || 229 | d.Decode(&lastApplies) != nil { 230 | log.Fatal("kv read persist err!") 231 | } else { 232 | kv.data = kvData 233 | kv.lastApplies = lastApplies 234 | } 235 | } 236 | 237 | func (kv *KVServer) getValueByKey(key string) (err Err, value string) { 238 | if v, ok := kv.data[key]; ok { 239 | err = OK 240 | value = v 241 | } else { 242 | err = ErrNoKey 243 | } 244 | return 245 | } 246 | 247 | func (kv *KVServer) notifyWaitCommand(reqId int64, err Err, value string) { 248 | if ch, ok := kv.commandNotifyCh[reqId]; ok { 249 | ch <- CommandResult{ 250 | Err: err, 251 | Value: value, 252 | } 253 | } 254 | } 255 | 256 | //应用每一条命令 257 | func (kv *KVServer) handleApplyCh() { 258 | for { 259 | select { 260 | case <-kv.stopCh: 261 | DPrintf("get from stopCh,server-%v stop!", kv.me) 262 | return 263 | case cmd := <-kv.applyCh: 264 | //处理快照命令,读取快照的内容 265 | if cmd.SnapshotValid { 266 | DPrintf("%v get install sn,%v %v", kv.me, cmd.SnapshotIndex, cmd.SnapshotTerm) 267 | kv.lock("waitApplyCh_sn") 268 | kv.readPersist(false, cmd.SnapshotTerm, cmd.SnapshotIndex, cmd.Snapshot) 269 | kv.unlock("waitApplyCh_sn") 270 | continue 271 | } 272 | //处理普通命令 273 | if !cmd.CommandValid { 274 | continue 275 | } 276 | cmdIdx := cmd.CommandIndex 277 | DPrintf("server %v start apply command %v:%+v", kv.me, cmdIdx, cmd.Command) 278 | op := cmd.Command.(Op) 279 | kv.lock("handleApplyCh") 280 | 281 | if op.Method == "Get" { 282 | //处理读 283 | e, v := kv.getValueByKey(op.Key) 284 | kv.notifyWaitCommand(op.ReqId, e, v) 285 | } else if op.Method == "Put" || op.Method == "Append" { 286 | //处理写 287 | //判断命令是否重复 288 | isRepeated := false 289 | if v, ok := kv.lastApplies[op.ClientId]; ok { 290 | if v == op.CommandId { 291 | isRepeated = true 292 | } 293 | } 294 | 295 | if !isRepeated { 296 | switch op.Method { 297 | case "Put": 298 | kv.data[op.Key] = op.Value 299 | kv.lastApplies[op.ClientId] = op.CommandId 300 | case "Append": 301 | e, v := kv.getValueByKey(op.Key) 302 | if e == ErrNoKey { 303 | //按put处理 304 | kv.data[op.Key] = op.Value 305 | kv.lastApplies[op.ClientId] = op.CommandId 306 | } else { 307 | //追加 308 | 
kv.data[op.Key] = v + op.Value 309 | kv.lastApplies[op.ClientId] = op.CommandId 310 | } 311 | default: 312 | kv.unlock("handleApplyCh") 313 | panic("unknown method " + op.Method) 314 | } 315 | 316 | } 317 | //命令处理成功 318 | kv.notifyWaitCommand(op.ReqId, OK, "") 319 | } else { 320 | kv.unlock("handleApplyCh") 321 | panic("unknown method " + op.Method) 322 | } 323 | 324 | DPrintf("apply op: cmdId:%d, op: %+v, data:%v", cmdIdx, op, kv.data[op.Key]) 325 | //每应用一条命令,就判断是否进行持久化 326 | kv.saveSnapshot(cmdIdx) 327 | 328 | kv.unlock("handleApplyCh") 329 | } 330 | 331 | } 332 | 333 | } 334 | 335 | // 336 | // servers[] contains the ports of the set of 337 | // servers that will cooperate via Raft to 338 | // form the fault-tolerant key/value service. 339 | // me is the index of the current server in servers[]. 340 | // the k/v server should store snapshots through the underlying Raft 341 | // implementation, which should call persister.SaveStateAndSnapshot() to 342 | // atomically save the Raft state along with the snapshot. 343 | // the k/v server should snapshot when Raft's saved state exceeds maxraftstate bytes, 344 | // in order to allow Raft to garbage-collect its log. if maxraftstate is -1, 345 | // you don't need to snapshot. 346 | // StartKVServer() must return quickly, so it should start goroutines 347 | // for any long-running work. 348 | // 349 | func StartKVServer(servers []*labrpc.ClientEnd, me int, persister *raft.Persister, maxraftstate int) *KVServer { 350 | // call labgob.Register on structures you want 351 | // Go's RPC library to marshall/unmarshall. 352 | labgob.Register(Op{}) 353 | 354 | kv := new(KVServer) 355 | kv.me = me 356 | kv.maxraftstate = maxraftstate 357 | kv.persister = persister 358 | 359 | // You may need initialization code here. 360 | kv.lastApplies = make(map[int64]int64) 361 | kv.data = make(map[string]string) 362 | 363 | kv.stopCh = make(chan struct{}) 364 | //读取快照 365 | kv.readPersist(true, 0, 0, kv.persister.ReadSnapshot()) 366 | 367 | kv.commandNotifyCh = make(map[int64]chan CommandResult) 368 | kv.applyCh = make(chan raft.ApplyMsg) 369 | kv.rf = raft.Make(servers, me, persister, kv.applyCh) 370 | 371 | go kv.handleApplyCh() 372 | 373 | return kv 374 | } 375 | -------------------------------------------------------------------------------- /shardctrler/test_test.go: -------------------------------------------------------------------------------- 1 | package shardctrler 2 | 3 | import ( 4 | "fmt" 5 | "sync" 6 | "testing" 7 | "time" 8 | ) 9 | 10 | // import "time" 11 | 12 | func check(t *testing.T, groups []int, ck *Clerk) { 13 | c := ck.Query(-1) 14 | if len(c.Groups) != len(groups) { 15 | t.Fatalf("wanted %v groups, got %v", len(groups), len(c.Groups)) 16 | } 17 | 18 | // are the groups as expected? 19 | for _, g := range groups { 20 | _, ok := c.Groups[g] 21 | if ok != true { 22 | t.Fatalf("missing group %v", g) 23 | } 24 | } 25 | 26 | // any un-allocated shards? 27 | if len(groups) > 0 { 28 | for s, g := range c.Shards { 29 | _, ok := c.Groups[g] 30 | if ok == false { 31 | t.Fatalf("shard %v -> invalid group %v", s, g) 32 | } 33 | } 34 | } 35 | 36 | // more or less balanced sharding? 
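	// "balanced" here means the most-loaded group owns at most one more shard than
	// the least-loaded group; the check below fails when max > min+1.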
37 | counts := map[int]int{} 38 | for _, g := range c.Shards { 39 | counts[g] += 1 40 | } 41 | min := 257 42 | max := 0 43 | for g, _ := range c.Groups { 44 | if counts[g] > max { 45 | max = counts[g] 46 | } 47 | if counts[g] < min { 48 | min = counts[g] 49 | } 50 | } 51 | if max > min+1 { 52 | t.Fatalf("max %v too much larger than min %v", max, min) 53 | } 54 | } 55 | 56 | func check_same_config(t *testing.T, c1 Config, c2 Config) { 57 | if c1.Num != c2.Num { 58 | t.Fatalf("Num wrong") 59 | } 60 | if c1.Shards != c2.Shards { 61 | t.Fatalf("Shards wrong") 62 | } 63 | if len(c1.Groups) != len(c2.Groups) { 64 | t.Fatalf("number of Groups is wrong") 65 | } 66 | for gid, sa := range c1.Groups { 67 | sa1, ok := c2.Groups[gid] 68 | if ok == false || len(sa1) != len(sa) { 69 | t.Fatalf("len(Groups) wrong") 70 | } 71 | if ok && len(sa1) == len(sa) { 72 | for j := 0; j < len(sa); j++ { 73 | if sa[j] != sa1[j] { 74 | t.Fatalf("Groups wrong") 75 | } 76 | } 77 | } 78 | } 79 | } 80 | 81 | func TestBasic(t *testing.T) { 82 | const nservers = 3 83 | cfg := make_config(t, nservers, false) 84 | defer cfg.cleanup() 85 | 86 | ck := cfg.makeClient(cfg.All()) 87 | 88 | fmt.Printf("Test: Basic leave/join ...\n") 89 | 90 | cfa := make([]Config, 6) 91 | cfa[0] = ck.Query(-1) 92 | 93 | check(t, []int{}, ck) 94 | 95 | var gid1 int = 1 96 | ck.Join(map[int][]string{gid1: []string{"x", "y", "z"}}) 97 | check(t, []int{gid1}, ck) 98 | cfa[1] = ck.Query(-1) 99 | 100 | var gid2 int = 2 101 | ck.Join(map[int][]string{gid2: []string{"a", "b", "c"}}) 102 | check(t, []int{gid1, gid2}, ck) 103 | cfa[2] = ck.Query(-1) 104 | 105 | cfx := ck.Query(-1) 106 | sa1 := cfx.Groups[gid1] 107 | if len(sa1) != 3 || sa1[0] != "x" || sa1[1] != "y" || sa1[2] != "z" { 108 | t.Fatalf("wrong servers for gid %v: %v\n", gid1, sa1) 109 | } 110 | sa2 := cfx.Groups[gid2] 111 | if len(sa2) != 3 || sa2[0] != "a" || sa2[1] != "b" || sa2[2] != "c" { 112 | t.Fatalf("wrong servers for gid %v: %v\n", gid2, sa2) 113 | } 114 | 115 | ck.Leave([]int{gid1}) 116 | check(t, []int{gid2}, ck) 117 | cfa[4] = ck.Query(-1) 118 | 119 | ck.Leave([]int{gid2}) 120 | cfa[5] = ck.Query(-1) 121 | 122 | fmt.Printf(" ... Passed\n") 123 | 124 | fmt.Printf("Test: Historical queries ...\n") 125 | 126 | for s := 0; s < nservers; s++ { 127 | cfg.ShutdownServer(s) 128 | for i := 0; i < len(cfa); i++ { 129 | c := ck.Query(cfa[i].Num) 130 | check_same_config(t, c, cfa[i]) 131 | } 132 | cfg.StartServer(s) 133 | cfg.ConnectAll() 134 | } 135 | 136 | fmt.Printf(" ... 
Passed\n") 137 | 138 | fmt.Printf("Test: Move ...\n") 139 | { 140 | var gid3 int = 503 141 | ck.Join(map[int][]string{gid3: []string{"3a", "3b", "3c"}}) 142 | var gid4 int = 504 143 | ck.Join(map[int][]string{gid4: []string{"4a", "4b", "4c"}}) 144 | for i := 0; i < NShards; i++ { 145 | cf := ck.Query(-1) 146 | if i < NShards/2 { 147 | ck.Move(i, gid3) 148 | if cf.Shards[i] != gid3 { 149 | cf1 := ck.Query(-1) 150 | if cf1.Num <= cf.Num { 151 | t.Fatalf("Move should increase Config.Num") 152 | } 153 | } 154 | } else { 155 | ck.Move(i, gid4) 156 | if cf.Shards[i] != gid4 { 157 | cf1 := ck.Query(-1) 158 | if cf1.Num <= cf.Num { 159 | t.Fatalf("Move should increase Config.Num") 160 | } 161 | } 162 | } 163 | } 164 | cf2 := ck.Query(-1) 165 | for i := 0; i < NShards; i++ { 166 | if i < NShards/2 { 167 | if cf2.Shards[i] != gid3 { 168 | t.Fatalf("expected shard %v on gid %v actually %v", 169 | i, gid3, cf2.Shards[i]) 170 | } 171 | } else { 172 | if cf2.Shards[i] != gid4 { 173 | t.Fatalf("expected shard %v on gid %v actually %v", 174 | i, gid4, cf2.Shards[i]) 175 | } 176 | } 177 | } 178 | ck.Leave([]int{gid3}) 179 | ck.Leave([]int{gid4}) 180 | } 181 | fmt.Printf(" ... Passed\n") 182 | 183 | fmt.Printf("Test: Concurrent leave/join ...\n") 184 | 185 | const npara = 10 186 | var cka [npara]*Clerk 187 | for i := 0; i < len(cka); i++ { 188 | cka[i] = cfg.makeClient(cfg.All()) 189 | } 190 | gids := make([]int, npara) 191 | ch := make(chan bool) 192 | for xi := 0; xi < npara; xi++ { 193 | gids[xi] = int((xi * 10) + 100) 194 | go func(i int) { 195 | defer func() { ch <- true }() 196 | var gid int = gids[i] 197 | var sid1 = fmt.Sprintf("s%da", gid) 198 | var sid2 = fmt.Sprintf("s%db", gid) 199 | cka[i].Join(map[int][]string{gid + 1000: []string{sid1}}) 200 | cka[i].Join(map[int][]string{gid: []string{sid2}}) 201 | cka[i].Leave([]int{gid + 1000}) 202 | }(xi) 203 | } 204 | for i := 0; i < npara; i++ { 205 | <-ch 206 | } 207 | check(t, gids, ck) 208 | 209 | fmt.Printf(" ... Passed\n") 210 | 211 | fmt.Printf("Test: Minimal transfers after joins ...\n") 212 | 213 | c1 := ck.Query(-1) 214 | for i := 0; i < 5; i++ { 215 | var gid = int(npara + 1 + i) 216 | ck.Join(map[int][]string{gid: []string{ 217 | fmt.Sprintf("%da", gid), 218 | fmt.Sprintf("%db", gid), 219 | fmt.Sprintf("%db", gid)}}) 220 | } 221 | c2 := ck.Query(-1) 222 | for i := int(1); i <= npara; i++ { 223 | for j := 0; j < len(c1.Shards); j++ { 224 | if c2.Shards[j] == i { 225 | if c1.Shards[j] != i { 226 | t.Fatalf("non-minimal transfer after Join()s") 227 | } 228 | } 229 | } 230 | } 231 | 232 | fmt.Printf(" ... Passed\n") 233 | 234 | fmt.Printf("Test: Minimal transfers after leaves ...\n") 235 | 236 | for i := 0; i < 5; i++ { 237 | ck.Leave([]int{int(npara + 1 + i)}) 238 | } 239 | c3 := ck.Query(-1) 240 | for i := int(1); i <= npara; i++ { 241 | for j := 0; j < len(c1.Shards); j++ { 242 | if c2.Shards[j] == i { 243 | if c3.Shards[j] != i { 244 | t.Fatalf("non-minimal transfer after Leave()s") 245 | } 246 | } 247 | } 248 | } 249 | 250 | fmt.Printf(" ... 
Passed\n") 251 | } 252 | 253 | func TestMulti(t *testing.T) { 254 | const nservers = 3 255 | cfg := make_config(t, nservers, false) 256 | defer cfg.cleanup() 257 | 258 | ck := cfg.makeClient(cfg.All()) 259 | 260 | fmt.Printf("Test: Multi-group join/leave ...\n") 261 | 262 | cfa := make([]Config, 6) 263 | cfa[0] = ck.Query(-1) 264 | 265 | check(t, []int{}, ck) 266 | 267 | var gid1 int = 1 268 | var gid2 int = 2 269 | ck.Join(map[int][]string{ 270 | gid1: []string{"x", "y", "z"}, 271 | gid2: []string{"a", "b", "c"}, 272 | }) 273 | check(t, []int{gid1, gid2}, ck) 274 | cfa[1] = ck.Query(-1) 275 | 276 | var gid3 int = 3 277 | ck.Join(map[int][]string{gid3: []string{"j", "k", "l"}}) 278 | check(t, []int{gid1, gid2, gid3}, ck) 279 | cfa[2] = ck.Query(-1) 280 | 281 | cfx := ck.Query(-1) 282 | sa1 := cfx.Groups[gid1] 283 | if len(sa1) != 3 || sa1[0] != "x" || sa1[1] != "y" || sa1[2] != "z" { 284 | t.Fatalf("wrong servers for gid %v: %v\n", gid1, sa1) 285 | } 286 | sa2 := cfx.Groups[gid2] 287 | if len(sa2) != 3 || sa2[0] != "a" || sa2[1] != "b" || sa2[2] != "c" { 288 | t.Fatalf("wrong servers for gid %v: %v\n", gid2, sa2) 289 | } 290 | sa3 := cfx.Groups[gid3] 291 | if len(sa3) != 3 || sa3[0] != "j" || sa3[1] != "k" || sa3[2] != "l" { 292 | t.Fatalf("wrong servers for gid %v: %v\n", gid3, sa3) 293 | } 294 | 295 | ck.Leave([]int{gid1, gid3}) 296 | check(t, []int{gid2}, ck) 297 | cfa[3] = ck.Query(-1) 298 | 299 | cfx = ck.Query(-1) 300 | sa2 = cfx.Groups[gid2] 301 | if len(sa2) != 3 || sa2[0] != "a" || sa2[1] != "b" || sa2[2] != "c" { 302 | t.Fatalf("wrong servers for gid %v: %v\n", gid2, sa2) 303 | } 304 | 305 | ck.Leave([]int{gid2}) 306 | 307 | fmt.Printf(" ... Passed\n") 308 | 309 | fmt.Printf("Test: Concurrent multi leave/join ...\n") 310 | 311 | const npara = 10 312 | var cka [npara]*Clerk 313 | for i := 0; i < len(cka); i++ { 314 | cka[i] = cfg.makeClient(cfg.All()) 315 | } 316 | gids := make([]int, npara) 317 | var wg sync.WaitGroup 318 | for xi := 0; xi < npara; xi++ { 319 | wg.Add(1) 320 | gids[xi] = int(xi + 1000) 321 | go func(i int) { 322 | defer wg.Done() 323 | var gid int = gids[i] 324 | cka[i].Join(map[int][]string{ 325 | gid: []string{ 326 | fmt.Sprintf("%da", gid), 327 | fmt.Sprintf("%db", gid), 328 | fmt.Sprintf("%dc", gid)}, 329 | gid + 1000: []string{fmt.Sprintf("%da", gid+1000)}, 330 | gid + 2000: []string{fmt.Sprintf("%da", gid+2000)}, 331 | }) 332 | cka[i].Leave([]int{gid + 1000, gid + 2000}) 333 | }(xi) 334 | } 335 | wg.Wait() 336 | check(t, gids, ck) 337 | 338 | fmt.Printf(" ... Passed\n") 339 | 340 | fmt.Printf("Test: Minimal transfers after multijoins ...\n") 341 | 342 | c1 := ck.Query(-1) 343 | m := make(map[int][]string) 344 | for i := 0; i < 5; i++ { 345 | var gid = npara + 1 + i 346 | m[gid] = []string{fmt.Sprintf("%da", gid), fmt.Sprintf("%db", gid)} 347 | } 348 | ck.Join(m) 349 | c2 := ck.Query(-1) 350 | for i := int(1); i <= npara; i++ { 351 | for j := 0; j < len(c1.Shards); j++ { 352 | if c2.Shards[j] == i { 353 | if c1.Shards[j] != i { 354 | t.Fatalf("non-minimal transfer after Join()s") 355 | } 356 | } 357 | } 358 | } 359 | 360 | fmt.Printf(" ... 
Passed\n") 361 | 362 | fmt.Printf("Test: Minimal transfers after multileaves ...\n") 363 | 364 | var l []int 365 | for i := 0; i < 5; i++ { 366 | l = append(l, npara+1+i) 367 | } 368 | ck.Leave(l) 369 | c3 := ck.Query(-1) 370 | for i := int(1); i <= npara; i++ { 371 | for j := 0; j < len(c1.Shards); j++ { 372 | if c2.Shards[j] == i { 373 | if c3.Shards[j] != i { 374 | t.Fatalf("non-minimal transfer after Leave()s") 375 | } 376 | } 377 | } 378 | } 379 | 380 | fmt.Printf(" ... Passed\n") 381 | 382 | fmt.Printf("Test: Check Same config on servers ...\n") 383 | 384 | isLeader, leader := cfg.Leader() 385 | if !isLeader { 386 | t.Fatalf("Leader not found") 387 | } 388 | c := ck.Query(-1) // Config leader claims 389 | 390 | cfg.ShutdownServer(leader) 391 | 392 | attempts := 0 393 | for isLeader, leader = cfg.Leader(); isLeader; time.Sleep(1 * time.Second) { 394 | if attempts++; attempts >= 3 { 395 | t.Fatalf("Leader not found") 396 | } 397 | } 398 | 399 | c1 = ck.Query(-1) 400 | check_same_config(t, c, c1) 401 | 402 | fmt.Printf(" ... Passed\n") 403 | } 404 | -------------------------------------------------------------------------------- /porcupine/checker.go: -------------------------------------------------------------------------------- 1 | package porcupine 2 | 3 | import ( 4 | "sort" 5 | "sync/atomic" 6 | "time" 7 | ) 8 | 9 | type entryKind bool 10 | 11 | const ( 12 | callEntry entryKind = false 13 | returnEntry = true 14 | ) 15 | 16 | type entry struct { 17 | kind entryKind 18 | value interface{} 19 | id int 20 | time int64 21 | clientId int 22 | } 23 | 24 | type linearizationInfo struct { 25 | history [][]entry // for each partition, a list of entries 26 | partialLinearizations [][][]int // for each partition, a set of histories (list of ids) 27 | } 28 | 29 | type byTime []entry 30 | 31 | func (a byTime) Len() int { 32 | return len(a) 33 | } 34 | 35 | func (a byTime) Swap(i, j int) { 36 | a[i], a[j] = a[j], a[i] 37 | } 38 | 39 | func (a byTime) Less(i, j int) bool { 40 | if a[i].time != a[j].time { 41 | return a[i].time < a[j].time 42 | } 43 | // if the timestamps are the same, we need to make sure we order calls 44 | // before returns 45 | return a[i].kind == callEntry && a[j].kind == returnEntry 46 | } 47 | 48 | func makeEntries(history []Operation) []entry { 49 | var entries []entry = nil 50 | id := 0 51 | for _, elem := range history { 52 | entries = append(entries, entry{ 53 | callEntry, elem.Input, id, elem.Call, elem.ClientId}) 54 | entries = append(entries, entry{ 55 | returnEntry, elem.Output, id, elem.Return, elem.ClientId}) 56 | id++ 57 | } 58 | sort.Sort(byTime(entries)) 59 | return entries 60 | } 61 | 62 | type node struct { 63 | value interface{} 64 | match *node // call if match is nil, otherwise return 65 | id int 66 | next *node 67 | prev *node 68 | } 69 | 70 | func insertBefore(n *node, mark *node) *node { 71 | if mark != nil { 72 | beforeMark := mark.prev 73 | mark.prev = n 74 | n.next = mark 75 | if beforeMark != nil { 76 | n.prev = beforeMark 77 | beforeMark.next = n 78 | } 79 | } 80 | return n 81 | } 82 | 83 | func length(n *node) int { 84 | l := 0 85 | for n != nil { 86 | n = n.next 87 | l++ 88 | } 89 | return l 90 | } 91 | 92 | func renumber(events []Event) []Event { 93 | var e []Event 94 | m := make(map[int]int) // renumbering 95 | id := 0 96 | for _, v := range events { 97 | if r, ok := m[v.Id]; ok { 98 | e = append(e, Event{v.ClientId, v.Kind, v.Value, r}) 99 | } else { 100 | e = append(e, Event{v.ClientId, v.Kind, v.Value, id}) 101 | m[v.Id] = id 102 | id++ 103 
| } 104 | } 105 | return e 106 | } 107 | 108 | func convertEntries(events []Event) []entry { 109 | var entries []entry 110 | for i, elem := range events { 111 | kind := callEntry 112 | if elem.Kind == ReturnEvent { 113 | kind = returnEntry 114 | } 115 | // use index as "time" 116 | entries = append(entries, entry{kind, elem.Value, elem.Id, int64(i), elem.ClientId}) 117 | } 118 | return entries 119 | } 120 | 121 | func makeLinkedEntries(entries []entry) *node { 122 | var root *node = nil 123 | match := make(map[int]*node) 124 | for i := len(entries) - 1; i >= 0; i-- { 125 | elem := entries[i] 126 | if elem.kind == returnEntry { 127 | entry := &node{value: elem.value, match: nil, id: elem.id} 128 | match[elem.id] = entry 129 | insertBefore(entry, root) 130 | root = entry 131 | } else { 132 | entry := &node{value: elem.value, match: match[elem.id], id: elem.id} 133 | insertBefore(entry, root) 134 | root = entry 135 | } 136 | } 137 | return root 138 | } 139 | 140 | type cacheEntry struct { 141 | linearized bitset 142 | state interface{} 143 | } 144 | 145 | func cacheContains(model Model, cache map[uint64][]cacheEntry, entry cacheEntry) bool { 146 | for _, elem := range cache[entry.linearized.hash()] { 147 | if entry.linearized.equals(elem.linearized) && model.Equal(entry.state, elem.state) { 148 | return true 149 | } 150 | } 151 | return false 152 | } 153 | 154 | type callsEntry struct { 155 | entry *node 156 | state interface{} 157 | } 158 | 159 | func lift(entry *node) { 160 | entry.prev.next = entry.next 161 | entry.next.prev = entry.prev 162 | match := entry.match 163 | match.prev.next = match.next 164 | if match.next != nil { 165 | match.next.prev = match.prev 166 | } 167 | } 168 | 169 | func unlift(entry *node) { 170 | match := entry.match 171 | match.prev.next = match 172 | if match.next != nil { 173 | match.next.prev = match 174 | } 175 | entry.prev.next = entry 176 | entry.next.prev = entry 177 | } 178 | 179 | func checkSingle(model Model, history []entry, computePartial bool, kill *int32) (bool, []*[]int) { 180 | entry := makeLinkedEntries(history) 181 | n := length(entry) / 2 182 | linearized := newBitset(uint(n)) 183 | cache := make(map[uint64][]cacheEntry) // map from hash to cache entry 184 | var calls []callsEntry 185 | // longest linearizable prefix that includes the given entry 186 | longest := make([]*[]int, n) 187 | 188 | state := model.Init() 189 | headEntry := insertBefore(&node{value: nil, match: nil, id: -1}, entry) 190 | for headEntry.next != nil { 191 | if atomic.LoadInt32(kill) != 0 { 192 | return false, longest 193 | } 194 | if entry.match != nil { 195 | matching := entry.match // the return entry 196 | ok, newState := model.Step(state, entry.value, matching.value) 197 | if ok { 198 | newLinearized := linearized.clone().set(uint(entry.id)) 199 | newCacheEntry := cacheEntry{newLinearized, newState} 200 | if !cacheContains(model, cache, newCacheEntry) { 201 | hash := newLinearized.hash() 202 | cache[hash] = append(cache[hash], newCacheEntry) 203 | calls = append(calls, callsEntry{entry, state}) 204 | state = newState 205 | linearized.set(uint(entry.id)) 206 | lift(entry) 207 | entry = headEntry.next 208 | } else { 209 | entry = entry.next 210 | } 211 | } else { 212 | entry = entry.next 213 | } 214 | } else { 215 | if len(calls) == 0 { 216 | return false, longest 217 | } 218 | // longest 219 | if computePartial { 220 | callsLen := len(calls) 221 | var seq []int = nil 222 | for _, v := range calls { 223 | if longest[v.entry.id] == nil || callsLen > 
len(*longest[v.entry.id]) { 224 | // create seq lazily 225 | if seq == nil { 226 | seq = make([]int, len(calls)) 227 | for i, v := range calls { 228 | seq[i] = v.entry.id 229 | } 230 | } 231 | longest[v.entry.id] = &seq 232 | } 233 | } 234 | } 235 | callsTop := calls[len(calls)-1] 236 | entry = callsTop.entry 237 | state = callsTop.state 238 | linearized.clear(uint(entry.id)) 239 | calls = calls[:len(calls)-1] 240 | unlift(entry) 241 | entry = entry.next 242 | } 243 | } 244 | // longest linearization is the complete linearization, which is calls 245 | seq := make([]int, len(calls)) 246 | for i, v := range calls { 247 | seq[i] = v.entry.id 248 | } 249 | for i := 0; i < n; i++ { 250 | longest[i] = &seq 251 | } 252 | return true, longest 253 | } 254 | 255 | func fillDefault(model Model) Model { 256 | if model.Partition == nil { 257 | model.Partition = NoPartition 258 | } 259 | if model.PartitionEvent == nil { 260 | model.PartitionEvent = NoPartitionEvent 261 | } 262 | if model.Equal == nil { 263 | model.Equal = ShallowEqual 264 | } 265 | if model.DescribeOperation == nil { 266 | model.DescribeOperation = DefaultDescribeOperation 267 | } 268 | if model.DescribeState == nil { 269 | model.DescribeState = DefaultDescribeState 270 | } 271 | return model 272 | } 273 | 274 | func checkParallel(model Model, history [][]entry, computeInfo bool, timeout time.Duration) (CheckResult, linearizationInfo) { 275 | ok := true 276 | timedOut := false 277 | results := make(chan bool, len(history)) 278 | longest := make([][]*[]int, len(history)) 279 | kill := int32(0) 280 | for i, subhistory := range history { 281 | go func(i int, subhistory []entry) { 282 | ok, l := checkSingle(model, subhistory, computeInfo, &kill) 283 | longest[i] = l 284 | results <- ok 285 | }(i, subhistory) 286 | } 287 | var timeoutChan <-chan time.Time 288 | if timeout > 0 { 289 | timeoutChan = time.After(timeout) 290 | } 291 | count := 0 292 | loop: 293 | for { 294 | select { 295 | case result := <-results: 296 | count++ 297 | ok = ok && result 298 | if !ok && !computeInfo { 299 | atomic.StoreInt32(&kill, 1) 300 | break loop 301 | } 302 | if count >= len(history) { 303 | break loop 304 | } 305 | case <-timeoutChan: 306 | timedOut = true 307 | atomic.StoreInt32(&kill, 1) 308 | break loop // if we time out, we might get a false positive 309 | } 310 | } 311 | var info linearizationInfo 312 | if computeInfo { 313 | // make sure we've waited for all goroutines to finish, 314 | // otherwise we might race on access to longest[] 315 | for count < len(history) { 316 | <-results 317 | count++ 318 | } 319 | // return longest linearizable prefixes that include each history element 320 | partialLinearizations := make([][][]int, len(history)) 321 | for i := 0; i < len(history); i++ { 322 | var partials [][]int 323 | // turn longest into a set of unique linearizations 324 | set := make(map[*[]int]struct{}) 325 | for _, v := range longest[i] { 326 | if v != nil { 327 | set[v] = struct{}{} 328 | } 329 | } 330 | for k := range set { 331 | arr := make([]int, len(*k)) 332 | for i, v := range *k { 333 | arr[i] = v 334 | } 335 | partials = append(partials, arr) 336 | } 337 | partialLinearizations[i] = partials 338 | } 339 | info.history = history 340 | info.partialLinearizations = partialLinearizations 341 | } 342 | var result CheckResult 343 | if !ok { 344 | result = Illegal 345 | } else { 346 | if timedOut { 347 | result = Unknown 348 | } else { 349 | result = Ok 350 | } 351 | } 352 | return result, info 353 | } 354 | 355 | func checkEvents(model Model, 
history []Event, verbose bool, timeout time.Duration) (CheckResult, linearizationInfo) { 356 | model = fillDefault(model) 357 | partitions := model.PartitionEvent(history) 358 | l := make([][]entry, len(partitions)) 359 | for i, subhistory := range partitions { 360 | l[i] = convertEntries(renumber(subhistory)) 361 | } 362 | return checkParallel(model, l, verbose, timeout) 363 | } 364 | 365 | func checkOperations(model Model, history []Operation, verbose bool, timeout time.Duration) (CheckResult, linearizationInfo) { 366 | model = fillDefault(model) 367 | partitions := model.Partition(history) 368 | l := make([][]entry, len(partitions)) 369 | for i, subhistory := range partitions { 370 | l[i] = makeEntries(subhistory) 371 | } 372 | return checkParallel(model, l, verbose, timeout) 373 | } 374 | -------------------------------------------------------------------------------- /shardkv/config.go: -------------------------------------------------------------------------------- 1 | package shardkv 2 | 3 | import "6.824/shardctrler" 4 | import "6.824/labrpc" 5 | import "testing" 6 | import "os" 7 | 8 | // import "log" 9 | import crand "crypto/rand" 10 | import "math/big" 11 | import "math/rand" 12 | import "encoding/base64" 13 | import "sync" 14 | import "runtime" 15 | import "6.824/raft" 16 | import "strconv" 17 | import "fmt" 18 | import "time" 19 | 20 | func randstring(n int) string { 21 | b := make([]byte, 2*n) 22 | crand.Read(b) 23 | s := base64.URLEncoding.EncodeToString(b) 24 | return s[0:n] 25 | } 26 | 27 | func makeSeed() int64 { 28 | max := big.NewInt(int64(1) << 62) 29 | bigx, _ := crand.Int(crand.Reader, max) 30 | x := bigx.Int64() 31 | return x 32 | } 33 | 34 | // Randomize server handles 35 | func random_handles(kvh []*labrpc.ClientEnd) []*labrpc.ClientEnd { 36 | sa := make([]*labrpc.ClientEnd, len(kvh)) 37 | copy(sa, kvh) 38 | for i := range sa { 39 | j := rand.Intn(i + 1) 40 | sa[i], sa[j] = sa[j], sa[i] 41 | } 42 | return sa 43 | } 44 | 45 | type group struct { 46 | gid int 47 | servers []*ShardKV 48 | saved []*raft.Persister 49 | endnames [][]string 50 | mendnames [][]string 51 | } 52 | 53 | type config struct { 54 | mu sync.Mutex 55 | t *testing.T 56 | net *labrpc.Network 57 | start time.Time // time at which make_config() was called 58 | 59 | nctrlers int 60 | ctrlerservers []*shardctrler.ShardCtrler 61 | mck *shardctrler.Clerk 62 | 63 | ngroups int 64 | n int // servers per k/v group 65 | groups []*group 66 | 67 | clerks map[*Clerk][]string 68 | nextClientId int 69 | maxraftstate int 70 | } 71 | 72 | func (cfg *config) checkTimeout() { 73 | // enforce a two minute real-time limit on each test 74 | if !cfg.t.Failed() && time.Since(cfg.start) > 120*time.Second { 75 | cfg.t.Fatal("test took longer than 120 seconds") 76 | } 77 | } 78 | 79 | func (cfg *config) cleanup() { 80 | for gi := 0; gi < cfg.ngroups; gi++ { 81 | cfg.ShutdownGroup(gi) 82 | } 83 | for i := 0; i < cfg.nctrlers; i++ { 84 | cfg.ctrlerservers[i].Kill() 85 | } 86 | cfg.net.Cleanup() 87 | cfg.checkTimeout() 88 | } 89 | 90 | // check that no server's log is too big. 
91 | func (cfg *config) checklogs() { 92 | for gi := 0; gi < cfg.ngroups; gi++ { 93 | for i := 0; i < cfg.n; i++ { 94 | raft := cfg.groups[gi].saved[i].RaftStateSize() 95 | snap := len(cfg.groups[gi].saved[i].ReadSnapshot()) 96 | if cfg.maxraftstate >= 0 && raft > 8*cfg.maxraftstate { 97 | cfg.t.Fatalf("persister.RaftStateSize() %v, but maxraftstate %v", 98 | raft, cfg.maxraftstate) 99 | } 100 | if cfg.maxraftstate < 0 && snap > 0 { 101 | cfg.t.Fatalf("maxraftstate is -1, but snapshot is non-empty!") 102 | } 103 | } 104 | } 105 | } 106 | 107 | // controler server name for labrpc. 108 | func (cfg *config) ctrlername(i int) string { 109 | return "ctrler" + strconv.Itoa(i) 110 | } 111 | 112 | // shard server name for labrpc. 113 | // i'th server of group gid. 114 | func (cfg *config) servername(gid int, i int) string { 115 | return "server-" + strconv.Itoa(gid) + "-" + strconv.Itoa(i) 116 | } 117 | 118 | func (cfg *config) makeClient() *Clerk { 119 | cfg.mu.Lock() 120 | defer cfg.mu.Unlock() 121 | 122 | // ClientEnds to talk to controler service. 123 | ends := make([]*labrpc.ClientEnd, cfg.nctrlers) 124 | endnames := make([]string, cfg.n) 125 | for j := 0; j < cfg.nctrlers; j++ { 126 | endnames[j] = randstring(20) 127 | ends[j] = cfg.net.MakeEnd(endnames[j]) 128 | cfg.net.Connect(endnames[j], cfg.ctrlername(j)) 129 | cfg.net.Enable(endnames[j], true) 130 | } 131 | 132 | ck := MakeClerk(ends, func(servername string) *labrpc.ClientEnd { 133 | name := randstring(20) 134 | end := cfg.net.MakeEnd(name) 135 | cfg.net.Connect(name, servername) 136 | cfg.net.Enable(name, true) 137 | return end 138 | }) 139 | cfg.clerks[ck] = endnames 140 | cfg.nextClientId++ 141 | return ck 142 | } 143 | 144 | func (cfg *config) deleteClient(ck *Clerk) { 145 | cfg.mu.Lock() 146 | defer cfg.mu.Unlock() 147 | 148 | v := cfg.clerks[ck] 149 | for i := 0; i < len(v); i++ { 150 | os.Remove(v[i]) 151 | } 152 | delete(cfg.clerks, ck) 153 | } 154 | 155 | // Shutdown i'th server of gi'th group, by isolating it 156 | func (cfg *config) ShutdownServer(gi int, i int) { 157 | cfg.mu.Lock() 158 | defer cfg.mu.Unlock() 159 | 160 | gg := cfg.groups[gi] 161 | 162 | // prevent this server from sending 163 | for j := 0; j < len(gg.servers); j++ { 164 | name := gg.endnames[i][j] 165 | cfg.net.Enable(name, false) 166 | } 167 | for j := 0; j < len(gg.mendnames[i]); j++ { 168 | name := gg.mendnames[i][j] 169 | cfg.net.Enable(name, false) 170 | } 171 | 172 | // disable client connections to the server. 173 | // it's important to do this before creating 174 | // the new Persister in saved[i], to avoid 175 | // the possibility of the server returning a 176 | // positive reply to an Append but persisting 177 | // the result in the superseded Persister. 178 | cfg.net.DeleteServer(cfg.servername(gg.gid, i)) 179 | 180 | // a fresh persister, in case old instance 181 | // continues to update the Persister. 182 | // but copy old persister's content so that we always 183 | // pass Make() the last persisted state. 
184 | if gg.saved[i] != nil { 185 | gg.saved[i] = gg.saved[i].Copy() 186 | } 187 | 188 | kv := gg.servers[i] 189 | if kv != nil { 190 | cfg.mu.Unlock() 191 | kv.Kill() 192 | cfg.mu.Lock() 193 | gg.servers[i] = nil 194 | } 195 | } 196 | 197 | func (cfg *config) ShutdownGroup(gi int) { 198 | for i := 0; i < cfg.n; i++ { 199 | cfg.ShutdownServer(gi, i) 200 | } 201 | } 202 | 203 | // start i'th server in gi'th group 204 | func (cfg *config) StartServer(gi int, i int) { 205 | cfg.mu.Lock() 206 | 207 | gg := cfg.groups[gi] 208 | 209 | // a fresh set of outgoing ClientEnd names 210 | // to talk to other servers in this group. 211 | gg.endnames[i] = make([]string, cfg.n) 212 | for j := 0; j < cfg.n; j++ { 213 | gg.endnames[i][j] = randstring(20) 214 | } 215 | 216 | // and the connections to other servers in this group. 217 | ends := make([]*labrpc.ClientEnd, cfg.n) 218 | for j := 0; j < cfg.n; j++ { 219 | ends[j] = cfg.net.MakeEnd(gg.endnames[i][j]) 220 | cfg.net.Connect(gg.endnames[i][j], cfg.servername(gg.gid, j)) 221 | cfg.net.Enable(gg.endnames[i][j], true) 222 | } 223 | 224 | // ends to talk to shardctrler service 225 | mends := make([]*labrpc.ClientEnd, cfg.nctrlers) 226 | gg.mendnames[i] = make([]string, cfg.nctrlers) 227 | for j := 0; j < cfg.nctrlers; j++ { 228 | gg.mendnames[i][j] = randstring(20) 229 | mends[j] = cfg.net.MakeEnd(gg.mendnames[i][j]) 230 | cfg.net.Connect(gg.mendnames[i][j], cfg.ctrlername(j)) 231 | cfg.net.Enable(gg.mendnames[i][j], true) 232 | } 233 | 234 | // a fresh persister, so old instance doesn't overwrite 235 | // new instance's persisted state. 236 | // give the fresh persister a copy of the old persister's 237 | // state, so that the spec is that we pass StartKVServer() 238 | // the last persisted state. 239 | if gg.saved[i] != nil { 240 | gg.saved[i] = gg.saved[i].Copy() 241 | } else { 242 | gg.saved[i] = raft.MakePersister() 243 | } 244 | cfg.mu.Unlock() 245 | 246 | gg.servers[i] = StartServer(ends, i, gg.saved[i], cfg.maxraftstate, 247 | gg.gid, mends, 248 | func(servername string) *labrpc.ClientEnd { 249 | name := randstring(20) 250 | end := cfg.net.MakeEnd(name) 251 | cfg.net.Connect(name, servername) 252 | cfg.net.Enable(name, true) 253 | return end 254 | }) 255 | 256 | kvsvc := labrpc.MakeService(gg.servers[i]) 257 | rfsvc := labrpc.MakeService(gg.servers[i].rf) 258 | srv := labrpc.MakeServer() 259 | srv.AddService(kvsvc) 260 | srv.AddService(rfsvc) 261 | cfg.net.AddServer(cfg.servername(gg.gid, i), srv) 262 | } 263 | 264 | func (cfg *config) StartGroup(gi int) { 265 | for i := 0; i < cfg.n; i++ { 266 | cfg.StartServer(gi, i) 267 | } 268 | } 269 | 270 | func (cfg *config) StartCtrlerserver(i int) { 271 | // ClientEnds to talk to other controler replicas. 272 | ends := make([]*labrpc.ClientEnd, cfg.nctrlers) 273 | for j := 0; j < cfg.nctrlers; j++ { 274 | endname := randstring(20) 275 | ends[j] = cfg.net.MakeEnd(endname) 276 | cfg.net.Connect(endname, cfg.ctrlername(j)) 277 | cfg.net.Enable(endname, true) 278 | } 279 | 280 | p := raft.MakePersister() 281 | 282 | cfg.ctrlerservers[i] = shardctrler.StartServer(ends, i, p) 283 | 284 | msvc := labrpc.MakeService(cfg.ctrlerservers[i]) 285 | rfsvc := labrpc.MakeService(cfg.ctrlerservers[i].Raft()) 286 | srv := labrpc.MakeServer() 287 | srv.AddService(msvc) 288 | srv.AddService(rfsvc) 289 | cfg.net.AddServer(cfg.ctrlername(i), srv) 290 | } 291 | 292 | func (cfg *config) shardclerk() *shardctrler.Clerk { 293 | // ClientEnds to talk to ctrler service. 
294 | ends := make([]*labrpc.ClientEnd, cfg.nctrlers) 295 | for j := 0; j < cfg.nctrlers; j++ { 296 | name := randstring(20) 297 | ends[j] = cfg.net.MakeEnd(name) 298 | cfg.net.Connect(name, cfg.ctrlername(j)) 299 | cfg.net.Enable(name, true) 300 | } 301 | 302 | return shardctrler.MakeClerk(ends) 303 | } 304 | 305 | // tell the shardctrler that a group is joining. 306 | func (cfg *config) join(gi int) { 307 | cfg.joinm([]int{gi}) 308 | } 309 | 310 | func (cfg *config) joinm(gis []int) { 311 | m := make(map[int][]string, len(gis)) 312 | for _, g := range gis { 313 | gid := cfg.groups[g].gid 314 | servernames := make([]string, cfg.n) 315 | for i := 0; i < cfg.n; i++ { 316 | servernames[i] = cfg.servername(gid, i) 317 | } 318 | m[gid] = servernames 319 | } 320 | cfg.mck.Join(m) 321 | } 322 | 323 | // tell the shardctrler that a group is leaving. 324 | func (cfg *config) leave(gi int) { 325 | cfg.leavem([]int{gi}) 326 | } 327 | 328 | func (cfg *config) leavem(gis []int) { 329 | gids := make([]int, 0, len(gis)) 330 | for _, g := range gis { 331 | gids = append(gids, cfg.groups[g].gid) 332 | } 333 | cfg.mck.Leave(gids) 334 | } 335 | 336 | var ncpu_once sync.Once 337 | 338 | func make_config(t *testing.T, n int, unreliable bool, maxraftstate int) *config { 339 | ncpu_once.Do(func() { 340 | if runtime.NumCPU() < 2 { 341 | fmt.Printf("warning: only one CPU, which may conceal locking bugs\n") 342 | } 343 | rand.Seed(makeSeed()) 344 | }) 345 | runtime.GOMAXPROCS(4) 346 | cfg := &config{} 347 | cfg.t = t 348 | cfg.maxraftstate = maxraftstate 349 | cfg.net = labrpc.MakeNetwork() 350 | cfg.start = time.Now() 351 | 352 | // controler 353 | cfg.nctrlers = 3 354 | cfg.ctrlerservers = make([]*shardctrler.ShardCtrler, cfg.nctrlers) 355 | for i := 0; i < cfg.nctrlers; i++ { 356 | cfg.StartCtrlerserver(i) 357 | } 358 | cfg.mck = cfg.shardclerk() 359 | 360 | cfg.ngroups = 3 361 | cfg.groups = make([]*group, cfg.ngroups) 362 | cfg.n = n 363 | for gi := 0; gi < cfg.ngroups; gi++ { 364 | gg := &group{} 365 | cfg.groups[gi] = gg 366 | gg.gid = 100 + gi 367 | gg.servers = make([]*ShardKV, cfg.n) 368 | gg.saved = make([]*raft.Persister, cfg.n) 369 | gg.endnames = make([][]string, cfg.n) 370 | gg.mendnames = make([][]string, cfg.nctrlers) 371 | for i := 0; i < cfg.n; i++ { 372 | cfg.StartServer(gi, i) 373 | } 374 | } 375 | 376 | cfg.clerks = make(map[*Clerk][]string) 377 | cfg.nextClientId = cfg.n + 1000 // client ids start 1000 above the highest serverid 378 | 379 | cfg.net.Reliable(!unreliable) 380 | 381 | return cfg 382 | } 383 | -------------------------------------------------------------------------------- /kvraft/config.go: -------------------------------------------------------------------------------- 1 | package kvraft 2 | 3 | import "6.824/labrpc" 4 | import "testing" 5 | import "os" 6 | 7 | // import "log" 8 | import crand "crypto/rand" 9 | import "math/big" 10 | import "math/rand" 11 | import "encoding/base64" 12 | import "sync" 13 | import "runtime" 14 | import "6.824/raft" 15 | import "fmt" 16 | import "time" 17 | import "sync/atomic" 18 | 19 | func randstring(n int) string { 20 | b := make([]byte, 2*n) 21 | crand.Read(b) 22 | s := base64.URLEncoding.EncodeToString(b) 23 | return s[0:n] 24 | } 25 | 26 | func makeSeed() int64 { 27 | max := big.NewInt(int64(1) << 62) 28 | bigx, _ := crand.Int(crand.Reader, max) 29 | x := bigx.Int64() 30 | return x 31 | } 32 | 33 | // Randomize server handles 34 | func random_handles(kvh []*labrpc.ClientEnd) []*labrpc.ClientEnd { 35 | sa := make([]*labrpc.ClientEnd, 
len(kvh)) 36 | copy(sa, kvh) 37 | for i := range sa { 38 | j := rand.Intn(i + 1) 39 | sa[i], sa[j] = sa[j], sa[i] 40 | } 41 | return sa 42 | } 43 | 44 | type config struct { 45 | mu sync.Mutex 46 | t *testing.T 47 | net *labrpc.Network 48 | n int 49 | kvservers []*KVServer 50 | saved []*raft.Persister 51 | endnames [][]string // names of each server's sending ClientEnds 52 | clerks map[*Clerk][]string 53 | nextClientId int 54 | maxraftstate int 55 | start time.Time // time at which make_config() was called 56 | // begin()/end() statistics 57 | t0 time.Time // time at which test_test.go called cfg.begin() 58 | rpcs0 int // rpcTotal() at start of test 59 | ops int32 // number of clerk get/put/append method calls 60 | } 61 | 62 | func (cfg *config) checkTimeout() { 63 | // enforce a two minute real-time limit on each test 64 | if !cfg.t.Failed() && time.Since(cfg.start) > 120*time.Second { 65 | cfg.t.Fatal("test took longer than 120 seconds") 66 | } 67 | } 68 | 69 | func (cfg *config) cleanup() { 70 | cfg.mu.Lock() 71 | defer cfg.mu.Unlock() 72 | for i := 0; i < len(cfg.kvservers); i++ { 73 | if cfg.kvservers[i] != nil { 74 | cfg.kvservers[i].Kill() 75 | } 76 | } 77 | cfg.net.Cleanup() 78 | cfg.checkTimeout() 79 | } 80 | 81 | // Maximum log size across all servers 82 | func (cfg *config) LogSize() int { 83 | logsize := 0 84 | for i := 0; i < cfg.n; i++ { 85 | n := cfg.saved[i].RaftStateSize() 86 | if n > logsize { 87 | logsize = n 88 | } 89 | } 90 | return logsize 91 | } 92 | 93 | // Maximum snapshot size across all servers 94 | func (cfg *config) SnapshotSize() int { 95 | snapshotsize := 0 96 | for i := 0; i < cfg.n; i++ { 97 | n := cfg.saved[i].SnapshotSize() 98 | if n > snapshotsize { 99 | snapshotsize = n 100 | } 101 | } 102 | return snapshotsize 103 | } 104 | 105 | // attach server i to servers listed in to 106 | // caller must hold cfg.mu 107 | func (cfg *config) connectUnlocked(i int, to []int) { 108 | // log.Printf("connect peer %d to %v\n", i, to) 109 | 110 | // outgoing socket files 111 | for j := 0; j < len(to); j++ { 112 | endname := cfg.endnames[i][to[j]] 113 | cfg.net.Enable(endname, true) 114 | } 115 | 116 | // incoming socket files 117 | for j := 0; j < len(to); j++ { 118 | endname := cfg.endnames[to[j]][i] 119 | cfg.net.Enable(endname, true) 120 | } 121 | } 122 | 123 | func (cfg *config) connect(i int, to []int) { 124 | cfg.mu.Lock() 125 | defer cfg.mu.Unlock() 126 | cfg.connectUnlocked(i, to) 127 | } 128 | 129 | // detach server i from the servers listed in from 130 | // caller must hold cfg.mu 131 | func (cfg *config) disconnectUnlocked(i int, from []int) { 132 | // log.Printf("disconnect peer %d from %v\n", i, from) 133 | 134 | // outgoing socket files 135 | for j := 0; j < len(from); j++ { 136 | if cfg.endnames[i] != nil { 137 | endname := cfg.endnames[i][from[j]] 138 | cfg.net.Enable(endname, false) 139 | } 140 | } 141 | 142 | // incoming socket files 143 | for j := 0; j < len(from); j++ { 144 | if cfg.endnames[j] != nil { 145 | endname := cfg.endnames[from[j]][i] 146 | cfg.net.Enable(endname, false) 147 | } 148 | } 149 | } 150 | 151 | func (cfg *config) disconnect(i int, from []int) { 152 | cfg.mu.Lock() 153 | defer cfg.mu.Unlock() 154 | cfg.disconnectUnlocked(i, from) 155 | } 156 | 157 | func (cfg *config) All() []int { 158 | all := make([]int, cfg.n) 159 | for i := 0; i < cfg.n; i++ { 160 | all[i] = i 161 | } 162 | return all 163 | } 164 | 165 | func (cfg *config) ConnectAll() { 166 | cfg.mu.Lock() 167 | defer cfg.mu.Unlock() 168 | for i := 0; i < cfg.n; i++ { 169 | 
cfg.connectUnlocked(i, cfg.All()) 170 | } 171 | } 172 | 173 | // Sets up 2 partitions with connectivity between servers in each partition. 174 | func (cfg *config) partition(p1 []int, p2 []int) { 175 | cfg.mu.Lock() 176 | defer cfg.mu.Unlock() 177 | // log.Printf("partition servers into: %v %v\n", p1, p2) 178 | for i := 0; i < len(p1); i++ { 179 | cfg.disconnectUnlocked(p1[i], p2) 180 | cfg.connectUnlocked(p1[i], p1) 181 | } 182 | for i := 0; i < len(p2); i++ { 183 | cfg.disconnectUnlocked(p2[i], p1) 184 | cfg.connectUnlocked(p2[i], p2) 185 | } 186 | } 187 | 188 | // Create a clerk with clerk specific server names. 189 | // Give it connections to all of the servers, but for 190 | // now enable only connections to servers in to[]. 191 | func (cfg *config) makeClient(to []int) *Clerk { 192 | cfg.mu.Lock() 193 | defer cfg.mu.Unlock() 194 | 195 | // a fresh set of ClientEnds. 196 | ends := make([]*labrpc.ClientEnd, cfg.n) 197 | endnames := make([]string, cfg.n) 198 | for j := 0; j < cfg.n; j++ { 199 | endnames[j] = randstring(20) 200 | ends[j] = cfg.net.MakeEnd(endnames[j]) 201 | cfg.net.Connect(endnames[j], j) 202 | } 203 | 204 | ck := MakeClerk(random_handles(ends)) 205 | cfg.clerks[ck] = endnames 206 | cfg.nextClientId++ 207 | cfg.ConnectClientUnlocked(ck, to) 208 | return ck 209 | } 210 | 211 | func (cfg *config) deleteClient(ck *Clerk) { 212 | cfg.mu.Lock() 213 | defer cfg.mu.Unlock() 214 | 215 | v := cfg.clerks[ck] 216 | for i := 0; i < len(v); i++ { 217 | os.Remove(v[i]) 218 | } 219 | delete(cfg.clerks, ck) 220 | } 221 | 222 | // caller should hold cfg.mu 223 | func (cfg *config) ConnectClientUnlocked(ck *Clerk, to []int) { 224 | // log.Printf("ConnectClient %v to %v\n", ck, to) 225 | endnames := cfg.clerks[ck] 226 | for j := 0; j < len(to); j++ { 227 | s := endnames[to[j]] 228 | cfg.net.Enable(s, true) 229 | } 230 | } 231 | 232 | func (cfg *config) ConnectClient(ck *Clerk, to []int) { 233 | cfg.mu.Lock() 234 | defer cfg.mu.Unlock() 235 | cfg.ConnectClientUnlocked(ck, to) 236 | } 237 | 238 | // caller should hold cfg.mu 239 | func (cfg *config) DisconnectClientUnlocked(ck *Clerk, from []int) { 240 | // log.Printf("DisconnectClient %v from %v\n", ck, from) 241 | endnames := cfg.clerks[ck] 242 | for j := 0; j < len(from); j++ { 243 | s := endnames[from[j]] 244 | cfg.net.Enable(s, false) 245 | } 246 | } 247 | 248 | func (cfg *config) DisconnectClient(ck *Clerk, from []int) { 249 | cfg.mu.Lock() 250 | defer cfg.mu.Unlock() 251 | cfg.DisconnectClientUnlocked(ck, from) 252 | } 253 | 254 | // Shutdown a server by isolating it 255 | func (cfg *config) ShutdownServer(i int) { 256 | cfg.mu.Lock() 257 | defer cfg.mu.Unlock() 258 | 259 | cfg.disconnectUnlocked(i, cfg.All()) 260 | 261 | // disable client connections to the server. 262 | // it's important to do this before creating 263 | // the new Persister in saved[i], to avoid 264 | // the possibility of the server returning a 265 | // positive reply to an Append but persisting 266 | // the result in the superseded Persister. 267 | cfg.net.DeleteServer(i) 268 | 269 | // a fresh persister, in case old instance 270 | // continues to update the Persister. 271 | // but copy old persister's content so that we always 272 | // pass Make() the last persisted state. 
273 | if cfg.saved[i] != nil { 274 | cfg.saved[i] = cfg.saved[i].Copy() 275 | } 276 | 277 | kv := cfg.kvservers[i] 278 | if kv != nil { 279 | cfg.mu.Unlock() 280 | kv.Kill() 281 | cfg.mu.Lock() 282 | cfg.kvservers[i] = nil 283 | } 284 | } 285 | 286 | // If restart servers, first call ShutdownServer 287 | func (cfg *config) StartServer(i int) { 288 | cfg.mu.Lock() 289 | 290 | // a fresh set of outgoing ClientEnd names. 291 | cfg.endnames[i] = make([]string, cfg.n) 292 | for j := 0; j < cfg.n; j++ { 293 | cfg.endnames[i][j] = randstring(20) 294 | } 295 | 296 | // a fresh set of ClientEnds. 297 | ends := make([]*labrpc.ClientEnd, cfg.n) 298 | for j := 0; j < cfg.n; j++ { 299 | ends[j] = cfg.net.MakeEnd(cfg.endnames[i][j]) 300 | cfg.net.Connect(cfg.endnames[i][j], j) 301 | } 302 | 303 | // a fresh persister, so old instance doesn't overwrite 304 | // new instance's persisted state. 305 | // give the fresh persister a copy of the old persister's 306 | // state, so that the spec is that we pass StartKVServer() 307 | // the last persisted state. 308 | if cfg.saved[i] != nil { 309 | cfg.saved[i] = cfg.saved[i].Copy() 310 | } else { 311 | cfg.saved[i] = raft.MakePersister() 312 | } 313 | cfg.mu.Unlock() 314 | 315 | cfg.kvservers[i] = StartKVServer(ends, i, cfg.saved[i], cfg.maxraftstate) 316 | 317 | kvsvc := labrpc.MakeService(cfg.kvservers[i]) 318 | rfsvc := labrpc.MakeService(cfg.kvservers[i].rf) 319 | srv := labrpc.MakeServer() 320 | srv.AddService(kvsvc) 321 | srv.AddService(rfsvc) 322 | cfg.net.AddServer(i, srv) 323 | } 324 | 325 | func (cfg *config) Leader() (bool, int) { 326 | cfg.mu.Lock() 327 | defer cfg.mu.Unlock() 328 | 329 | for i := 0; i < cfg.n; i++ { 330 | _, is_leader := cfg.kvservers[i].rf.GetState() 331 | if is_leader { 332 | return true, i 333 | } 334 | } 335 | return false, 0 336 | } 337 | 338 | // Partition servers into 2 groups and put current leader in minority 339 | func (cfg *config) make_partition() ([]int, []int) { 340 | _, l := cfg.Leader() 341 | p1 := make([]int, cfg.n/2+1) 342 | p2 := make([]int, cfg.n/2) 343 | j := 0 344 | for i := 0; i < cfg.n; i++ { 345 | if i != l { 346 | if j < len(p1) { 347 | p1[j] = i 348 | } else { 349 | p2[j-len(p1)] = i 350 | } 351 | j++ 352 | } 353 | } 354 | p2[len(p2)-1] = l 355 | return p1, p2 356 | } 357 | 358 | var ncpu_once sync.Once 359 | 360 | func make_config(t *testing.T, n int, unreliable bool, maxraftstate int) *config { 361 | ncpu_once.Do(func() { 362 | if runtime.NumCPU() < 2 { 363 | fmt.Printf("warning: only one CPU, which may conceal locking bugs\n") 364 | } 365 | rand.Seed(makeSeed()) 366 | }) 367 | runtime.GOMAXPROCS(4) 368 | cfg := &config{} 369 | cfg.t = t 370 | cfg.net = labrpc.MakeNetwork() 371 | cfg.n = n 372 | cfg.kvservers = make([]*KVServer, cfg.n) 373 | cfg.saved = make([]*raft.Persister, cfg.n) 374 | cfg.endnames = make([][]string, cfg.n) 375 | cfg.clerks = make(map[*Clerk][]string) 376 | cfg.nextClientId = cfg.n + 1000 // client ids start 1000 above the highest serverid 377 | cfg.maxraftstate = maxraftstate 378 | cfg.start = time.Now() 379 | 380 | // create a full set of KV servers. 381 | for i := 0; i < cfg.n; i++ { 382 | cfg.StartServer(i) 383 | } 384 | 385 | cfg.ConnectAll() 386 | 387 | cfg.net.Reliable(!unreliable) 388 | 389 | return cfg 390 | } 391 | 392 | func (cfg *config) rpcTotal() int { 393 | return cfg.net.GetTotalCount() 394 | } 395 | 396 | // start a Test. 397 | // print the Test message. 398 | // e.g. 
cfg.begin("Test (2B): RPC counts aren't too high") 399 | func (cfg *config) begin(description string) { 400 | fmt.Printf("%s ...\n", description) 401 | cfg.t0 = time.Now() 402 | cfg.rpcs0 = cfg.rpcTotal() 403 | atomic.StoreInt32(&cfg.ops, 0) 404 | } 405 | 406 | func (cfg *config) op() { 407 | atomic.AddInt32(&cfg.ops, 1) 408 | } 409 | 410 | // end a Test -- the fact that we got here means there 411 | // was no failure. 412 | // print the Passed message, 413 | // and some performance numbers. 414 | func (cfg *config) end() { 415 | cfg.checkTimeout() 416 | if cfg.t.Failed() == false { 417 | t := time.Since(cfg.t0).Seconds() // real time 418 | npeers := cfg.n // number of Raft peers 419 | nrpc := cfg.rpcTotal() - cfg.rpcs0 // number of RPC sends 420 | ops := atomic.LoadInt32(&cfg.ops) // number of clerk get/put/append calls 421 | 422 | fmt.Printf(" ... Passed --") 423 | fmt.Printf(" %4.1f %d %5d %4d\n", t, npeers, nrpc, ops) 424 | } 425 | } 426 | -------------------------------------------------------------------------------- /shardctrler/server.go: -------------------------------------------------------------------------------- 1 | package shardctrler 2 | 3 | import ( 4 | "6.824/labgob" 5 | "6.824/raft" 6 | "sort" 7 | "time" 8 | ) 9 | import "6.824/labrpc" 10 | import "sync" 11 | 12 | const WaitCmdTimeOut = time.Millisecond * 500 // cmd执行超过这个时间,就返回timeout 13 | const MaxLockTime = time.Millisecond * 10 // debug 14 | 15 | type ShardCtrler struct { 16 | mu sync.Mutex 17 | me int 18 | rf *raft.Raft 19 | applyCh chan raft.ApplyMsg 20 | 21 | // Your data here. 22 | stopCh chan struct{} 23 | commandNotifyCh map[int64]chan CommandResult 24 | lastApplies map[int64]int64 //k-v:ClientId-CommandId 25 | 26 | configs []Config // indexed by config num 27 | 28 | //用于互斥锁 29 | lockStartTime time.Time 30 | lockEndTime time.Time 31 | lockMsg string 32 | } 33 | 34 | type CommandResult struct { 35 | Err Err 36 | Config Config 37 | } 38 | 39 | type Op struct { 40 | // Your definitions here. 41 | // Field names must start with capital letters, 42 | // otherwise RPC will break. 43 | ReqId int64 //用来标识commandNotify 44 | CommandId int64 45 | ClientId int64 46 | Args interface{} 47 | Method string 48 | } 49 | 50 | //自定义锁 51 | func (sc *ShardCtrler) lock(msg string) { 52 | sc.mu.Lock() 53 | sc.lockStartTime = time.Now() 54 | sc.lockMsg = msg 55 | } 56 | 57 | func (sc *ShardCtrler) unlock(msg string) { 58 | sc.lockEndTime = time.Now() 59 | duration := sc.lockEndTime.Sub(sc.lockStartTime) 60 | sc.lockMsg = "" 61 | sc.mu.Unlock() 62 | if duration > MaxLockTime { 63 | DPrintf("lock too long:%s:%s\n", msg, duration) 64 | } 65 | } 66 | 67 | // 68 | // the tester calls Kill() when a ShardCtrler instance won't 69 | // be needed again. you are not required to do anything 70 | // in Kill(), but it might be convenient to (for example) 71 | // turn off debug output from this instance. 72 | // 73 | func (sc *ShardCtrler) Kill() { 74 | sc.rf.Kill() 75 | close(sc.stopCh) 76 | // Your code here, if desired. 
77 | } 78 | 79 | func (sc *ShardCtrler) removeCh(reqId int64) { 80 | sc.lock("removeCh") 81 | defer sc.unlock("removeCh") 82 | delete(sc.commandNotifyCh, reqId) 83 | } 84 | 85 | func (sc *ShardCtrler) getConfigByIndex(idx int) Config { 86 | if idx < 0 || idx >= len(sc.configs) { 87 | // new configs are built by modifying an existing one, and the embedded map means a deep copy is required 88 | return sc.configs[len(sc.configs)-1].Copy() 89 | } 90 | return sc.configs[idx].Copy() 91 | } 92 | 93 | // needed by shardkv tester 94 | func (sc *ShardCtrler) Raft() *raft.Raft { 95 | return sc.rf 96 | } 97 | 98 | /* 99 | rpc 100 | */ 101 | 102 | func (sc *ShardCtrler) Join(args *JoinArgs, reply *JoinReply) { 103 | // Your code here. 104 | res := sc.waitCommand(args.ClientId, args.CommandId, "Join", *args) 105 | if res.Err == ErrWrongLeader { 106 | reply.WrongLeader = true 107 | } 108 | reply.Err = res.Err 109 | } 110 | 111 | func (sc *ShardCtrler) Leave(args *LeaveArgs, reply *LeaveReply) { 112 | res := sc.waitCommand(args.ClientId, args.CommandId, "Leave", *args) 113 | if res.Err == ErrWrongLeader { 114 | reply.WrongLeader = true 115 | } 116 | reply.Err = res.Err 117 | } 118 | 119 | func (sc *ShardCtrler) Move(args *MoveArgs, reply *MoveReply) { 120 | res := sc.waitCommand(args.ClientId, args.CommandId, "Move", *args) 121 | if res.Err == ErrWrongLeader { 122 | reply.WrongLeader = true 123 | } 124 | reply.Err = res.Err 125 | } 126 | 127 | func (sc *ShardCtrler) Query(args *QueryArgs, reply *QueryReply) { 128 | // Your code here. 129 | DPrintf("server %v query:args %+v", sc.me, args) 130 | 131 | // a query for a config that already exists can return immediately, since existing configs never change; 132 | // a query for -1 must go through handleApplyCh and be applied in command order, otherwise the answer may be stale. 133 | sc.lock("query") 134 | if args.Num >= 0 && args.Num < len(sc.configs) { 135 | reply.Err = OK 136 | reply.WrongLeader = false 137 | reply.Config = sc.getConfigByIndex(args.Num) 138 | sc.unlock("query") 139 | return 140 | } 141 | sc.unlock("query") 142 | res := sc.waitCommand(args.ClientId, args.CommandId, "Query", *args) 143 | if res.Err == ErrWrongLeader { 144 | reply.WrongLeader = true 145 | } 146 | reply.Err = res.Err 147 | reply.Config = res.Config 148 | } 149 | 150 | func (sc *ShardCtrler) waitCommand(clientId int64, commandId int64, method string, args interface{}) (res CommandResult) { 151 | DPrintf("server %v wait cmd start,clientId:%v,commandId: %v,method: %s,args: %+v", sc.me, clientId, commandId, method, args) 152 | op := Op{ 153 | ReqId: nrand(), 154 | ClientId: clientId, 155 | CommandId: commandId, 156 | Method: method, 157 | Args: args, 158 | } 159 | index, term, isLeader := sc.rf.Start(op) 160 | if !isLeader { 161 | res.Err = ErrWrongLeader 162 | DPrintf("server %v wait cmd NOT LEADER.", sc.me) 163 | return 164 | } 165 | sc.lock("waitCommand") 166 | ch := make(chan CommandResult, 1) 167 | sc.commandNotifyCh[op.ReqId] = ch 168 | sc.unlock("waitCommand") 169 | DPrintf("server %v wait cmd notify,index: %v,term: %v,op: %+v", sc.me, index, term, op) 170 | 171 | t := time.NewTimer(WaitCmdTimeOut) 172 | defer t.Stop() 173 | 174 | select { 175 | case <-t.C: 176 | res.Err = ErrTimeout 177 | case res = <-ch: 178 | case <-sc.stopCh: 179 | res.Err = ErrServer 180 | } 181 | 182 | sc.removeCh(op.ReqId) 183 | DPrintf("server %v wait cmd end,Op: %+v.", sc.me, op) 184 | return 185 | 186 | } 187 | 188 | /* 189 | configuration rebalancing 190 | */ 191 | 192 | // rebalance the shard assignment in a configuration 193 | // the strategy is to disturb the current assignment as little as possible 194 | func (sc *ShardCtrler) adjustConfig(conf *Config) { 195 | // handle the three cases separately 196 | if len(conf.Groups) == 0 { 197 | conf.Shards = [NShards]int{} 198 | } else if len(conf.Groups) == 1 { 199 | for gid, _ := range 
conf.Groups { 200 | for i, _ := range conf.Shards { 201 | conf.Shards[i] = gid 202 | } 203 | } 204 | } else if len(conf.Groups) <= NShards { 205 | // fewer groups than shards, so some groups may be assigned one or more extra shards 206 | avgShardsCount := NShards / len(conf.Groups) 207 | otherShardsCount := NShards - avgShardsCount*len(conf.Groups) 208 | isTryAgain := true 209 | 210 | for isTryAgain { 211 | isTryAgain = false 212 | DPrintf("adjust config,%+v", conf) 213 | // collect all gids 214 | var gids []int 215 | for gid, _ := range conf.Groups { 216 | gids = append(gids, gid) 217 | } 218 | sort.Ints(gids) 219 | // examine each group in turn 220 | for _, gid := range gids { 221 | count := 0 222 | for _, val := range conf.Shards { 223 | if val == gid { 224 | count++ 225 | } 226 | } 227 | 228 | // decide whether this group's assignment needs to change 229 | if count == avgShardsCount { 230 | // no change needed 231 | continue 232 | } else if count > avgShardsCount && otherShardsCount == 0 { 233 | // release the excess shards (set them back to 0) 234 | temp := 0 235 | for k, v := range conf.Shards { 236 | if gid == v { 237 | if temp < avgShardsCount { 238 | temp += 1 239 | } else { 240 | conf.Shards[k] = 0 241 | } 242 | } 243 | } 244 | } else if count > avgShardsCount && otherShardsCount > 0 { 245 | // see whether the surplus shards can all stay with this group 246 | // if they are not fully allocated, the next pass will look again 247 | // if the surplus quota is used up and the group still has too many, set the excess to 0 248 | temp := 0 249 | for k, v := range conf.Shards { 250 | if gid == v { 251 | if temp < avgShardsCount { 252 | temp += 1 253 | } else if temp == avgShardsCount && otherShardsCount != 0 { 254 | otherShardsCount -= 1 255 | } else { 256 | conf.Shards[k] = 0 257 | } 258 | } 259 | } 260 | 261 | } else { 262 | // count < avgShardsCount 263 | for k, v := range conf.Shards { 264 | if v == 0 && count < avgShardsCount { 265 | conf.Shards[k] = gid 266 | count += 1 267 | } 268 | if count == avgShardsCount { 269 | break 270 | } 271 | } 272 | // because of the adjustment order, an earlier group may not have had enough free shards to take; run another pass 273 | if count < avgShardsCount { 274 | DPrintf("adjust config try again.") 275 | isTryAgain = true 276 | continue 277 | } 278 | } 279 | } 280 | 281 | // after adjusting, every group may have reached the average while some shards are still unassigned 282 | // hand those out round-robin 283 | cur := 0 284 | for k, v := range conf.Shards { 285 | // still unassigned 286 | if v == 0 { 287 | conf.Shards[k] = gids[cur] 288 | cur += 1 289 | cur %= len(conf.Groups) 290 | } 291 | } 292 | 293 | } 294 | } else { 295 | // more groups than shards: each group gets at most one shard, and some groups get none 296 | 297 | gidsFlag := make(map[int]int) 298 | emptyShards := make([]int, 0, NShards) 299 | for k, gid := range conf.Shards { 300 | if gid == 0 { 301 | emptyShards = append(emptyShards, k) 302 | continue 303 | } 304 | if _, ok := gidsFlag[gid]; ok { 305 | conf.Shards[k] = 0 306 | emptyShards = append(emptyShards, k) 307 | } else { 308 | gidsFlag[gid] = 1 309 | } 310 | } 311 | if len(emptyShards) > 0 { 312 | var gids []int 313 | for k, _ := range conf.Groups { 314 | gids = append(gids, k) 315 | } 316 | sort.Ints(gids) 317 | temp := 0 318 | for _, gid := range gids { 319 | if _, ok := gidsFlag[gid]; !ok { 320 | conf.Shards[emptyShards[temp]] = gid 321 | temp += 1 322 | } 323 | if temp >= len(emptyShards) { 324 | break 325 | } 326 | } 327 | 328 | } 329 | } 330 | } 331 | 332 | /* 333 | applyCh handling 334 | */ 335 | 336 | func (sc *ShardCtrler) handleJoinCommand(args JoinArgs) { 337 | conf := sc.getConfigByIndex(-1) 338 | conf.Num += 1 339 | 340 | // add the new groups 341 | for k, v := range args.Servers { 342 | conf.Groups[k] = v 343 | } 344 | 345 | sc.adjustConfig(&conf) 346 | sc.configs = append(sc.configs, conf) 347 | } 348 | 349 | func (sc *ShardCtrler) handleLeaveCommand(args LeaveArgs) { 350 | conf := sc.getConfigByIndex(-1) 351 | 
conf.Num += 1 352 | 353 | // remove the departing groups and reset the shards they owned 354 | for _, gid := range args.GIDs { 355 | delete(conf.Groups, gid) 356 | for i, v := range conf.Shards { 357 | if v == gid { 358 | conf.Shards[i] = 0 359 | } 360 | } 361 | } 362 | 363 | sc.adjustConfig(&conf) 364 | sc.configs = append(sc.configs, conf) 365 | } 366 | 367 | func (sc *ShardCtrler) handleMoveCommand(args MoveArgs) { 368 | conf := sc.getConfigByIndex(-1) 369 | conf.Num += 1 370 | conf.Shards[args.Shard] = args.GID 371 | sc.configs = append(sc.configs, conf) 372 | } 373 | 374 | func (sc *ShardCtrler) notifyWaitCommand(reqId int64, err Err, conf Config) { 375 | if ch, ok := sc.commandNotifyCh[reqId]; ok { 376 | ch <- CommandResult{ 377 | Err: err, 378 | Config: conf, 379 | } 380 | } 381 | } 382 | 383 | // process messages from applyCh 384 | func (sc *ShardCtrler) handleApplyCh() { 385 | for { 386 | select { 387 | case <-sc.stopCh: 388 | DPrintf("get from stopCh,server-%v stop!", sc.me) 389 | return 390 | case cmd := <-sc.applyCh: 391 | // snapshot messages: the shardctrler keeps no snapshot, so skip them 392 | if cmd.SnapshotValid { 393 | continue 394 | } 395 | // ordinary commands 396 | if !cmd.CommandValid { 397 | continue 398 | } 399 | cmdIdx := cmd.CommandIndex 400 | DPrintf("server %v start apply command %v:%+v", sc.me, cmdIdx, cmd.Command) 401 | op := cmd.Command.(Op) 402 | sc.lock("handleApplyCh") 403 | 404 | if op.Method == "Query" { 405 | // handle reads 406 | conf := sc.getConfigByIndex(op.Args.(QueryArgs).Num) 407 | sc.notifyWaitCommand(op.ReqId, OK, conf) 408 | } else { 409 | // handle the write commands 410 | // check whether the command is a duplicate 411 | isRepeated := false 412 | if v, ok := sc.lastApplies[op.ClientId]; ok { 413 | if v == op.CommandId { 414 | isRepeated = true 415 | } 416 | } 417 | if !isRepeated { 418 | switch op.Method { 419 | case "Join": 420 | sc.handleJoinCommand(op.Args.(JoinArgs)) 421 | case "Leave": 422 | sc.handleLeaveCommand(op.Args.(LeaveArgs)) 423 | case "Move": 424 | sc.handleMoveCommand(op.Args.(MoveArgs)) 425 | default: 426 | panic("unknown method") 427 | } 428 | } 429 | sc.lastApplies[op.ClientId] = op.CommandId 430 | sc.notifyWaitCommand(op.ReqId, OK, Config{}) 431 | } 432 | 433 | DPrintf("apply op: cmdId:%d, op: %+v", cmdIdx, op) 434 | sc.unlock("handleApplyCh") 435 | } 436 | } 437 | } 438 | 439 | /* 440 | initialization 441 | */ 442 | 443 | // 444 | // servers[] contains the ports of the set of 445 | // servers that will cooperate via Raft to 446 | // form the fault-tolerant shardctrler service. 447 | // me is the index of the current server in servers[]. 448 | // 449 | func StartServer(servers []*labrpc.ClientEnd, me int, persister *raft.Persister) *ShardCtrler { 450 | labgob.Register(Op{}) 451 | 452 | sc := new(ShardCtrler) 453 | sc.me = me 454 | 455 | sc.configs = make([]Config, 1) 456 | sc.configs[0].Groups = map[int][]string{} 457 | 458 | sc.applyCh = make(chan raft.ApplyMsg) 459 | sc.rf = raft.Make(servers, me, persister, sc.applyCh) 460 | 461 | // Your code here. 462 | sc.stopCh = make(chan struct{}) 463 | sc.commandNotifyCh = make(map[int64]chan CommandResult) 464 | sc.lastApplies = make(map[int64]int64) 465 | 466 | go sc.handleApplyCh() 467 | 468 | return sc 469 | } 470 | --------------------------------------------------------------------------------
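A note on the rebalancing code above: after a Join or Leave, adjustConfig aims to leave no shard unassigned and to give every replica group either NShards/G or NShards/G+1 shards (G = number of groups, integer division), while moving as few shards as possible. The helper below is a minimal test-style sketch of how that invariant could be checked from a client. It is not part of the repository; it assumes only the Clerk.Query API, the Config Num/Shards/Groups fields, and the NShards constant that appear in the code above, and it is written as if it lived in the shardctrler package (or a _test file there).

package shardctrler

import "fmt"

// checkBalanced is a hypothetical test helper, not part of the lab code.
// It queries the latest configuration and verifies the invariant that
// adjustConfig is meant to establish after a Join or Leave: no shard is
// left unassigned, and every group owns either NShards/G or NShards/G+1
// shards. When there are more groups than shards this degenerates to 0 or
// 1, and groups owning zero shards simply do not appear in counts.
func checkBalanced(ck *Clerk) error {
	conf := ck.Query(-1) // latest configuration
	if len(conf.Groups) == 0 {
		return nil // with no groups, every shard is expected to stay 0
	}
	counts := make(map[int]int)
	for shard, gid := range conf.Shards {
		if gid == 0 {
			return fmt.Errorf("config %d: shard %d is unassigned", conf.Num, shard)
		}
		counts[gid]++
	}
	lo := NShards / len(conf.Groups)
	hi := lo
	if NShards%len(conf.Groups) != 0 {
		hi = lo + 1
	}
	for gid, c := range counts {
		if c < lo || c > hi {
			return fmt.Errorf("config %d: gid %d owns %d shards, want %d..%d",
				conf.Num, gid, c, lo, hi)
		}
	}
	return nil
}

Note that a Move can deliberately break this balance (handleMoveCommand does not call adjustConfig), so such a check only makes sense after Join/Leave operations. Separately, StartServer above registers only Op{} with labgob; since Op.Args is an interface{}, the concrete argument types (JoinArgs, LeaveArgs, MoveArgs, QueryArgs) presumably also need to be registered with labgob so that Raft can encode and persist Op values, most likely in shardctrler/common.go or client.go, which this excerpt does not show.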