├── .idea ├── .gitignore ├── 6.824.iml ├── deployment.xml ├── leetcode │ └── statistics.xml ├── modules.xml ├── sshConfigs.xml ├── vcs.xml └── webServers.xml ├── Makefile ├── README.md └── src ├── .gitignore ├── go.mod ├── go.sum ├── kvraft ├── client.go ├── common.go ├── config.go ├── dstest.py ├── server.go └── test_test.go ├── labgob ├── labgob.go └── test_test.go ├── labrpc ├── labrpc.go └── test_test.go ├── main ├── diskvd.go ├── lockc.go ├── lockd.go ├── mrcoordinator.go ├── mrsequential.go ├── mrworker.go ├── pbc.go ├── pbd.go ├── pg-being_ernest.txt ├── pg-dorian_gray.txt ├── pg-frankenstein.txt ├── pg-grimm.txt ├── pg-huckleberry_finn.txt ├── pg-metamorphosis.txt ├── pg-sherlock_holmes.txt ├── pg-tom_sawyer.txt ├── test-mr-many.sh ├── test-mr.sh └── viewd.go ├── models └── kv.go ├── mr ├── coordinator.go ├── rpc.go └── worker.go ├── mrapps ├── crash.go ├── early_exit.go ├── indexer.go ├── jobcount.go ├── mtiming.go ├── nocrash.go ├── rtiming.go └── wc.go ├── porcupine ├── bitset.go ├── checker.go ├── model.go ├── porcupine.go └── visualization.go ├── raft ├── config.go ├── dstest.py ├── persister.go ├── raft.go ├── test_test.go └── util.go ├── shardctrler ├── client.go ├── common.go ├── config.go ├── server.go └── test_test.go └── shardkv ├── client.go ├── common.go ├── config.go ├── server.go └── test_test.go /.idea/.gitignore: -------------------------------------------------------------------------------- 1 | # Default ignored files 2 | /shelf/ 3 | /workspace.xml 4 | # Editor-based HTTP Client requests 5 | /httpRequests/ 6 | # Datasource local storage ignored files 7 | /dataSources/ 8 | /dataSources.local.xml 9 | -------------------------------------------------------------------------------- /.idea/6.824.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /.idea/deployment.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 15 | -------------------------------------------------------------------------------- /.idea/leetcode/statistics.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 19 | 20 | -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /.idea/sshConfigs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /.idea/webServers.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 13 | 14 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # This is the Makefile helping you submit the labs. 
2 | # Just create 6.824/api.key with your API key in it, 3 | # and submit your lab with the following command: 4 | # $ make [lab1|lab2a|lab2b|lab2c|lab2d|lab3a|lab3b|lab4a|lab4b] 5 | 6 | LABS=" lab1 lab2a lab2b lab2c lab2d lab3a lab3b lab4a lab4b " 7 | 8 | %: check-% 9 | @echo "Preparing $@-handin.tar.gz" 10 | @if echo $(LABS) | grep -q " $@ " ; then \ 11 | echo "Tarring up your submission..." ; \ 12 | COPYFILE_DISABLE=1 tar cvzf $@-handin.tar.gz \ 13 | "--exclude=src/main/pg-*.txt" \ 14 | "--exclude=src/main/diskvd" \ 15 | "--exclude=src/mapreduce/824-mrinput-*.txt" \ 16 | "--exclude=src/main/mr-*" \ 17 | "--exclude=mrtmp.*" \ 18 | "--exclude=src/main/diff.out" \ 19 | "--exclude=src/main/mrcoordinator" \ 20 | "--exclude=src/main/mrsequential" \ 21 | "--exclude=src/main/mrworker" \ 22 | "--exclude=*.so" \ 23 | Makefile src; \ 24 | if ! test -e api.key ; then \ 25 | echo "Missing $(PWD)/api.key. Please create the file with your key in it or submit the $@-handin.tar.gz via the web interface."; \ 26 | else \ 27 | echo "Are you sure you want to submit $@? Enter 'yes' to continue:"; \ 28 | read line; \ 29 | if test "$$line" != "yes" ; then echo "Giving up submission"; exit; fi; \ 30 | if test `stat -c "%s" "$@-handin.tar.gz" 2>/dev/null || stat -f "%z" "$@-handin.tar.gz"` -ge 20971520 ; then echo "File exceeds 20MB."; exit; fi; \ 31 | cat api.key | tr -d '\n' > .api.key.trimmed ; \ 32 | curl --silent --fail --show-error -F file=@$@-handin.tar.gz -F "key=<.api.key.trimmed" \ 33 | https://6824.scripts.mit.edu/2022/handin.py/upload > /dev/null || { \ 34 | echo ; \ 35 | echo "Submit seems to have failed."; \ 36 | echo "Please upload the tarball manually on the submission website."; } \ 37 | fi; \ 38 | else \ 39 | echo "Bad target $@. Usage: make [$(LABS)]"; \ 40 | fi 41 | 42 | .PHONY: check-% 43 | check-%: 44 | @echo "Checking that your submission builds correctly..." 45 | @./.check-build git://g.csail.mit.edu/6.824-golabs-2022 $(patsubst check-%,%,$@) 46 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # MIT6.824-2022 2 | 课程链接:https://pdos.csail.mit.edu/6.824/
3 | 4 | ### Completed Labs:
5 | * Lab 1 : 2022/06/6->2022/06/12 [6天] 6 | * 7 | * Lab2A: 2022/06/12->2022/09/13 [10天] 8 | * Lab2B: 2022/09/13->2022/09/19 [7天] 9 | * Lab2C: 2022/09/20->2022/10/09 [19天] 10 | * Lab2D: 2022/10/11->2022/10/17 [7天] 11 | * 12 | * Lab3A: 2022/10/18->2022/10/22 [5天] 13 | * Lab3B: 2022/10/27->2022/11/24 [10天] -------------------------------------------------------------------------------- /src/.gitignore: -------------------------------------------------------------------------------- 1 | *.*/ 2 | main/mr-tmp/ 3 | mrtmp.* 4 | 824-mrinput-*.txt 5 | /main/diff.out 6 | /mapreduce/x.txt 7 | /pbservice/x.txt 8 | /kvpaxos/x.txt 9 | *.so 10 | /main/mrcoordinator 11 | /main/mrsequential 12 | /main/mrworker 13 | -------------------------------------------------------------------------------- /src/go.mod: -------------------------------------------------------------------------------- 1 | module mit6.824 2 | 3 | go 1.15 4 | -------------------------------------------------------------------------------- /src/go.sum: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Vacant2333/MIT6.824-2022/937eb6e6d400c7ee7cdc6f9e978bb54687e596a4/src/go.sum -------------------------------------------------------------------------------- /src/kvraft/client.go: -------------------------------------------------------------------------------- 1 | package kvraft 2 | 3 | import ( 4 | "mit6.824/labrpc" 5 | "sync" 6 | "time" 7 | ) 8 | 9 | type task struct { 10 | index RequestId // 对于当前Client的任务的Index 11 | op string // 任务类型 12 | key string // Get/PutAppend参数 13 | value string // PutAppend参数 14 | resultCh chan string // 传Get的返回值和Block住Get/PutAppend方法 15 | } 16 | 17 | type Clerk struct { 18 | servers []*labrpc.ClientEnd 19 | taskMu sync.Mutex 20 | taskQueue chan task // 任务队列 21 | clientTag ClientId // Client的唯一标识 22 | taskIndex RequestId // 最后一条任务的下标(包括未完成的任务) 23 | leaderIndex int // 上一次成功完成任务的Leader的Index,没有的话为-1 24 | } 25 | 26 | func MakeClerk(servers []*labrpc.ClientEnd) *Clerk { 27 | ck := &Clerk{ 28 | servers: servers, 29 | taskQueue: make(chan task), 30 | clientTag: nRand(), 31 | leaderIndex: -1, 32 | } 33 | go ck.doTasks() 34 | return ck 35 | } 36 | 37 | // 持续通过ck.taskQueue接受新的任务 38 | func (ck *Clerk) doTasks() { 39 | for { 40 | currentTask := <-ck.taskQueue 41 | DPrintf("C[%v] start a task:[%v]\n", ck.clientTag, currentTask) 42 | var args interface{} 43 | // 根据任务类型设置args 44 | if currentTask.op == "Get" { 45 | // Get task 46 | args = &GetArgs{ 47 | Key: currentTask.key, 48 | TaskIndex: currentTask.index, 49 | ClientTag: ck.clientTag, 50 | } 51 | } else { 52 | // Put/Append task 53 | args = &PutAppendArgs{ 54 | Key: currentTask.key, 55 | Value: currentTask.value, 56 | Op: currentTask.op, 57 | TaskIndex: currentTask.index, 58 | ClientTag: ck.clientTag, 59 | } 60 | } 61 | for { 62 | if err, value := ck.startTask(currentTask.op, args); err != ErrNoLeader { 63 | // 任务完成,Err不一定是OK,也可能是ErrNoKey 64 | DPrintf("C[%v] success a task:[%v]\n", ck.clientTag, currentTask) 65 | // 如果是Get会传回value,如果是Put/Append会传回"",让Append请求完成 66 | currentTask.resultCh <- value 67 | break 68 | } 69 | time.Sleep(clientNoLeaderSleepTime) 70 | } 71 | } 72 | } 73 | 74 | // 并行的向所有Servers发送某个Task 75 | func (ck *Clerk) startTask(op string, args interface{}) (Err, string) { 76 | // 所有的Reply发送到该Ch 77 | replyCh := make(chan interface{}, len(ck.servers)) 78 | // 当前Reply的Server 79 | serverCh := make(chan int, len(ck.servers)) 80 | // 初始化Reply 81 | replies := make([]interface{}, len(ck.servers)) 82 | for 
index := range replies { 83 | if op == "Get" { 84 | replies[index] = &GetReply{} 85 | } else { 86 | replies[index] = &PutAppendReply{} 87 | } 88 | } 89 | // 向某个Server提交Task 90 | askServer := func(server int) { 91 | if op == "Get" { 92 | ck.servers[server].Call("KVServer.Get", args, replies[server]) 93 | } else { 94 | ck.servers[server].Call("KVServer.PutAppend", args, replies[server]) 95 | } 96 | replyCh <- replies[server] 97 | serverCh <- server 98 | } 99 | // 会收到的Reply的数量 100 | replyCount := len(ck.servers) 101 | if ck.leaderIndex != -1 { 102 | // 优先发给上一次保存的Leader 103 | go askServer(ck.leaderIndex) 104 | replyCount = 1 105 | } else { 106 | // 没有保存leaderIndex,从所有服务器拿结果 107 | for server := 0; server < len(ck.servers); server++ { 108 | go askServer(server) 109 | } 110 | } 111 | // 持续检查replyCh,如果有可用的reply则直接返回 112 | timeOut := time.After(clientDoTaskTimeOut) 113 | for ; replyCount > 0; replyCount-- { 114 | var reply interface{} 115 | select { 116 | case reply = <-replyCh: 117 | // 拿到了reply 118 | case <-timeOut: 119 | // 任务超时 120 | DPrintf("C[%v] task[%v] timeout,leaderIndex[%v]\n", ck.clientTag, args, ck.leaderIndex) 121 | ck.leaderIndex = -1 122 | return ErrNoLeader, "" 123 | } 124 | server := <-serverCh 125 | // 如果Reply不为空则返回对应的数据给ch 126 | if op == "Get" && reply != nil { 127 | // Get 128 | getReply := reply.(*GetReply) 129 | if getReply.Err == OK || getReply.Err == ErrNoKey { 130 | ck.leaderIndex = server 131 | return getReply.Err, getReply.Value 132 | } 133 | } else if reply != nil { 134 | // Put/Append 135 | putAppendReply := reply.(*PutAppendReply) 136 | if putAppendReply.Err == OK { 137 | ck.leaderIndex = server 138 | return putAppendReply.Err, "" 139 | } 140 | } 141 | } 142 | // 没有可用的Leader或是保存的leaderIndex失效 143 | ck.leaderIndex = -1 144 | return ErrNoLeader, "" 145 | } 146 | 147 | // 添加任务,返回任务结果的chan 148 | func (ck *Clerk) addTask(op string, key string, value string) chan string { 149 | resultCh := make(chan string) 150 | ck.taskMu.Lock() 151 | ck.taskQueue <- task{ 152 | index: ck.taskIndex + 1, 153 | op: op, 154 | key: key, 155 | value: value, 156 | resultCh: resultCh, 157 | } 158 | ck.taskIndex++ 159 | ck.taskMu.Unlock() 160 | return resultCh 161 | } 162 | 163 | func (ck *Clerk) Get(key string) string { 164 | return <-ck.addTask("Get", key, "") 165 | } 166 | 167 | func (ck *Clerk) PutAppend(key string, value string, op string) { 168 | <-ck.addTask(op, key, value) 169 | } 170 | 171 | func (ck *Clerk) Put(key string, value string) { 172 | ck.PutAppend(key, value, "Put") 173 | } 174 | func (ck *Clerk) Append(key string, value string) { 175 | ck.PutAppend(key, value, "Append") 176 | } 177 | -------------------------------------------------------------------------------- /src/kvraft/common.go: -------------------------------------------------------------------------------- 1 | package kvraft 2 | 3 | import ( 4 | "crypto/rand" 5 | "log" 6 | "math/big" 7 | "time" 8 | ) 9 | 10 | const ( 11 | Debug = false 12 | 13 | OK = "OK" 14 | ErrNoKey = "ErrNoKey" 15 | ErrWrongLeader = "ErrWrongLeader" 16 | ErrNoLeader = "ErrNoLeader" 17 | // Client任务超时 18 | clientDoTaskTimeOut = 800 * time.Millisecond 19 | // Client没找到Leader的等待时间 20 | clientNoLeaderSleepTime = 65 * time.Millisecond 21 | // 当Raft的ReadStateSize大于(该值*maxRaftState)时开始Snapshot 22 | serverSnapshotStatePercent = 0.9 23 | ) 24 | 25 | type Err string 26 | 27 | // ClientId 每个Client的唯一Tag 28 | type ClientId int64 29 | 30 | // RequestId 每个Task的Index 31 | type RequestId int64 32 | 33 | type PutAppendArgs struct { 34 | Key string 35 | Value 
string 36 | Op string // Put/Append 37 | ClientTag ClientId 38 | TaskIndex RequestId 39 | } 40 | 41 | type PutAppendReply struct { 42 | Err Err 43 | } 44 | 45 | type GetArgs struct { 46 | Key string 47 | ClientTag ClientId 48 | TaskIndex RequestId 49 | } 50 | 51 | type GetReply struct { 52 | Err Err 53 | Value string 54 | } 55 | 56 | func DPrintf(format string, a ...interface{}) { 57 | if Debug { 58 | log.Printf(format, a...) 59 | } 60 | } 61 | 62 | // 获得一个int64的随机数(Client的Tag) 63 | func nRand() ClientId { 64 | max := big.NewInt(int64(1) << 62) 65 | bigX, _ := rand.Int(rand.Reader, max) 66 | return ClientId(bigX.Int64()) 67 | } 68 | -------------------------------------------------------------------------------- /src/kvraft/config.go: -------------------------------------------------------------------------------- 1 | package kvraft 2 | 3 | import "mit6.824/labrpc" 4 | import "testing" 5 | import "os" 6 | 7 | // import "log" 8 | import crand "crypto/rand" 9 | import "math/big" 10 | import "math/rand" 11 | import "encoding/base64" 12 | import "sync" 13 | import "runtime" 14 | import "mit6.824/raft" 15 | import "fmt" 16 | import "time" 17 | import "sync/atomic" 18 | 19 | func randstring(n int) string { 20 | b := make([]byte, 2*n) 21 | crand.Read(b) 22 | s := base64.URLEncoding.EncodeToString(b) 23 | return s[0:n] 24 | } 25 | 26 | func makeSeed() int64 { 27 | max := big.NewInt(int64(1) << 62) 28 | bigx, _ := crand.Int(crand.Reader, max) 29 | x := bigx.Int64() 30 | return x 31 | } 32 | 33 | // Randomize server handles 34 | func random_handles(kvh []*labrpc.ClientEnd) []*labrpc.ClientEnd { 35 | sa := make([]*labrpc.ClientEnd, len(kvh)) 36 | copy(sa, kvh) 37 | for i := range sa { 38 | j := rand.Intn(i + 1) 39 | sa[i], sa[j] = sa[j], sa[i] 40 | } 41 | return sa 42 | } 43 | 44 | type config struct { 45 | mu sync.Mutex 46 | t *testing.T 47 | net *labrpc.Network 48 | n int 49 | kvservers []*KVServer 50 | saved []*raft.Persister 51 | endnames [][]string // names of each server's sending ClientEnds 52 | clerks map[*Clerk][]string 53 | nextClientId int 54 | maxraftstate int 55 | start time.Time // time at which make_config() was called 56 | // begin()/end() statistics 57 | t0 time.Time // time at which test_test.go called cfg.begin() 58 | rpcs0 int // rpcTotal() at start of test 59 | ops int32 // number of clerk get/put/append method calls 60 | } 61 | 62 | func (cfg *config) checkTimeout() { 63 | // enforce a two minute real-time limit on each test 64 | if !cfg.t.Failed() && time.Since(cfg.start) > 120*time.Second { 65 | cfg.t.Fatal("test took longer than 120 seconds") 66 | } 67 | } 68 | 69 | func (cfg *config) cleanup() { 70 | cfg.mu.Lock() 71 | defer cfg.mu.Unlock() 72 | for i := 0; i < len(cfg.kvservers); i++ { 73 | if cfg.kvservers[i] != nil { 74 | cfg.kvservers[i].Kill() 75 | } 76 | } 77 | cfg.net.Cleanup() 78 | cfg.checkTimeout() 79 | } 80 | 81 | // Maximum log size across all servers 82 | func (cfg *config) LogSize() int { 83 | logsize := 0 84 | for i := 0; i < cfg.n; i++ { 85 | n := cfg.saved[i].RaftStateSize() 86 | if n > logsize { 87 | logsize = n 88 | } 89 | } 90 | return logsize 91 | } 92 | 93 | // Maximum snapshot size across all servers 94 | func (cfg *config) SnapshotSize() int { 95 | snapshotsize := 0 96 | for i := 0; i < cfg.n; i++ { 97 | n := cfg.saved[i].SnapshotSize() 98 | if n > snapshotsize { 99 | snapshotsize = n 100 | } 101 | } 102 | return snapshotsize 103 | } 104 | 105 | // attach server i to servers listed in to 106 | // caller must hold cfg.mu 107 | func (cfg *config) 
connectUnlocked(i int, to []int) { 108 | // log.Printf("connect peer %d to %v\n", i, to) 109 | 110 | // outgoing socket files 111 | for j := 0; j < len(to); j++ { 112 | endname := cfg.endnames[i][to[j]] 113 | cfg.net.Enable(endname, true) 114 | } 115 | 116 | // incoming socket files 117 | for j := 0; j < len(to); j++ { 118 | endname := cfg.endnames[to[j]][i] 119 | cfg.net.Enable(endname, true) 120 | } 121 | } 122 | 123 | func (cfg *config) connect(i int, to []int) { 124 | cfg.mu.Lock() 125 | defer cfg.mu.Unlock() 126 | cfg.connectUnlocked(i, to) 127 | } 128 | 129 | // detach server i from the servers listed in from 130 | // caller must hold cfg.mu 131 | func (cfg *config) disconnectUnlocked(i int, from []int) { 132 | // log.Printf("disconnect peer %d from %v\n", i, from) 133 | 134 | // outgoing socket files 135 | for j := 0; j < len(from); j++ { 136 | if cfg.endnames[i] != nil { 137 | endname := cfg.endnames[i][from[j]] 138 | cfg.net.Enable(endname, false) 139 | } 140 | } 141 | 142 | // incoming socket files 143 | for j := 0; j < len(from); j++ { 144 | if cfg.endnames[j] != nil { 145 | endname := cfg.endnames[from[j]][i] 146 | cfg.net.Enable(endname, false) 147 | } 148 | } 149 | } 150 | 151 | func (cfg *config) disconnect(i int, from []int) { 152 | cfg.mu.Lock() 153 | defer cfg.mu.Unlock() 154 | cfg.disconnectUnlocked(i, from) 155 | } 156 | 157 | func (cfg *config) All() []int { 158 | all := make([]int, cfg.n) 159 | for i := 0; i < cfg.n; i++ { 160 | all[i] = i 161 | } 162 | return all 163 | } 164 | 165 | func (cfg *config) ConnectAll() { 166 | cfg.mu.Lock() 167 | defer cfg.mu.Unlock() 168 | for i := 0; i < cfg.n; i++ { 169 | cfg.connectUnlocked(i, cfg.All()) 170 | } 171 | } 172 | 173 | // Sets up 2 partitions with connectivity between servers in each partition. 174 | func (cfg *config) partition(p1 []int, p2 []int) { 175 | cfg.mu.Lock() 176 | defer cfg.mu.Unlock() 177 | // log.Printf("partition servers into: %v %v\n", p1, p2) 178 | for i := 0; i < len(p1); i++ { 179 | cfg.disconnectUnlocked(p1[i], p2) 180 | cfg.connectUnlocked(p1[i], p1) 181 | } 182 | for i := 0; i < len(p2); i++ { 183 | cfg.disconnectUnlocked(p2[i], p1) 184 | cfg.connectUnlocked(p2[i], p2) 185 | } 186 | } 187 | 188 | // Create a clerk with clerk specific server names. 189 | // Give it connections to all of the servers, but for 190 | // now enable only connections to servers in to[]. 191 | func (cfg *config) makeClient(to []int) *Clerk { 192 | cfg.mu.Lock() 193 | defer cfg.mu.Unlock() 194 | 195 | // a fresh set of ClientEnds. 
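// (each clerk gets its own randomly named ends, one per server, so the test
// harness can enable or disable this clerk's connections independently of
// other clerks via ConnectClient/DisconnectClient)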
196 | ends := make([]*labrpc.ClientEnd, cfg.n) 197 | endnames := make([]string, cfg.n) 198 | for j := 0; j < cfg.n; j++ { 199 | endnames[j] = randstring(20) 200 | ends[j] = cfg.net.MakeEnd(endnames[j]) 201 | cfg.net.Connect(endnames[j], j) 202 | } 203 | 204 | ck := MakeClerk(random_handles(ends)) 205 | cfg.clerks[ck] = endnames 206 | cfg.nextClientId++ 207 | cfg.ConnectClientUnlocked(ck, to) 208 | return ck 209 | } 210 | 211 | func (cfg *config) deleteClient(ck *Clerk) { 212 | cfg.mu.Lock() 213 | defer cfg.mu.Unlock() 214 | 215 | v := cfg.clerks[ck] 216 | for i := 0; i < len(v); i++ { 217 | os.Remove(v[i]) 218 | } 219 | delete(cfg.clerks, ck) 220 | } 221 | 222 | // caller should hold cfg.mu 223 | func (cfg *config) ConnectClientUnlocked(ck *Clerk, to []int) { 224 | // log.Printf("ConnectClient %v to %v\n", ck, to) 225 | endnames := cfg.clerks[ck] 226 | for j := 0; j < len(to); j++ { 227 | s := endnames[to[j]] 228 | cfg.net.Enable(s, true) 229 | } 230 | } 231 | 232 | func (cfg *config) ConnectClient(ck *Clerk, to []int) { 233 | cfg.mu.Lock() 234 | defer cfg.mu.Unlock() 235 | cfg.ConnectClientUnlocked(ck, to) 236 | } 237 | 238 | // caller should hold cfg.mu 239 | func (cfg *config) DisconnectClientUnlocked(ck *Clerk, from []int) { 240 | // log.Printf("DisconnectClient %v from %v\n", ck, from) 241 | endnames := cfg.clerks[ck] 242 | for j := 0; j < len(from); j++ { 243 | s := endnames[from[j]] 244 | cfg.net.Enable(s, false) 245 | } 246 | } 247 | 248 | func (cfg *config) DisconnectClient(ck *Clerk, from []int) { 249 | cfg.mu.Lock() 250 | defer cfg.mu.Unlock() 251 | cfg.DisconnectClientUnlocked(ck, from) 252 | } 253 | 254 | // Shutdown a server by isolating it 255 | func (cfg *config) ShutdownServer(i int) { 256 | cfg.mu.Lock() 257 | defer cfg.mu.Unlock() 258 | 259 | cfg.disconnectUnlocked(i, cfg.All()) 260 | 261 | // disable client connections to the server. 262 | // it's important to do this before creating 263 | // the new Persister in saved[i], to avoid 264 | // the possibility of the server returning a 265 | // positive reply to an Append but persisting 266 | // the result in the superseded Persister. 267 | cfg.net.DeleteServer(i) 268 | 269 | // a fresh persister, in case old instance 270 | // continues to update the Persister. 271 | // but copy old persister's content so that we always 272 | // pass Make() the last persisted state. 273 | if cfg.saved[i] != nil { 274 | cfg.saved[i] = cfg.saved[i].Copy() 275 | } 276 | 277 | kv := cfg.kvservers[i] 278 | if kv != nil { 279 | cfg.mu.Unlock() 280 | kv.Kill() 281 | cfg.mu.Lock() 282 | cfg.kvservers[i] = nil 283 | } 284 | } 285 | 286 | // If restart servers, first call ShutdownServer 287 | func (cfg *config) StartServer(i int) { 288 | cfg.mu.Lock() 289 | 290 | // a fresh set of outgoing ClientEnd names. 291 | cfg.endnames[i] = make([]string, cfg.n) 292 | for j := 0; j < cfg.n; j++ { 293 | cfg.endnames[i][j] = randstring(20) 294 | } 295 | 296 | // a fresh set of ClientEnds. 297 | ends := make([]*labrpc.ClientEnd, cfg.n) 298 | for j := 0; j < cfg.n; j++ { 299 | ends[j] = cfg.net.MakeEnd(cfg.endnames[i][j]) 300 | cfg.net.Connect(cfg.endnames[i][j], j) 301 | } 302 | 303 | // a fresh persister, so old instance doesn't overwrite 304 | // new instance's persisted state. 305 | // give the fresh persister a copy of the old persister's 306 | // state, so that the spec is that we pass StartKVServer() 307 | // the last persisted state. 
308 | if cfg.saved[i] != nil { 309 | cfg.saved[i] = cfg.saved[i].Copy() 310 | } else { 311 | cfg.saved[i] = raft.MakePersister() 312 | } 313 | cfg.mu.Unlock() 314 | 315 | cfg.kvservers[i] = StartKVServer(ends, i, cfg.saved[i], cfg.maxraftstate) 316 | 317 | kvsvc := labrpc.MakeService(cfg.kvservers[i]) 318 | rfsvc := labrpc.MakeService(cfg.kvservers[i].rf) 319 | srv := labrpc.MakeServer() 320 | srv.AddService(kvsvc) 321 | srv.AddService(rfsvc) 322 | cfg.net.AddServer(i, srv) 323 | } 324 | 325 | func (cfg *config) Leader() (bool, int) { 326 | cfg.mu.Lock() 327 | defer cfg.mu.Unlock() 328 | 329 | for i := 0; i < cfg.n; i++ { 330 | _, is_leader := cfg.kvservers[i].rf.GetState() 331 | if is_leader { 332 | return true, i 333 | } 334 | } 335 | return false, 0 336 | } 337 | 338 | // Partition servers into 2 groups and put current leader in minority 339 | func (cfg *config) make_partition() ([]int, []int) { 340 | _, l := cfg.Leader() 341 | p1 := make([]int, cfg.n/2+1) 342 | p2 := make([]int, cfg.n/2) 343 | j := 0 344 | for i := 0; i < cfg.n; i++ { 345 | if i != l { 346 | if j < len(p1) { 347 | p1[j] = i 348 | } else { 349 | p2[j-len(p1)] = i 350 | } 351 | j++ 352 | } 353 | } 354 | p2[len(p2)-1] = l 355 | return p1, p2 356 | } 357 | 358 | var ncpu_once sync.Once 359 | 360 | func make_config(t *testing.T, n int, unreliable bool, maxraftstate int) *config { 361 | ncpu_once.Do(func() { 362 | if runtime.NumCPU() < 2 { 363 | fmt.Printf("warning: only one CPU, which may conceal locking bugs\n") 364 | } 365 | rand.Seed(makeSeed()) 366 | }) 367 | runtime.GOMAXPROCS(4) 368 | cfg := &config{} 369 | cfg.t = t 370 | cfg.net = labrpc.MakeNetwork() 371 | cfg.n = n 372 | cfg.kvservers = make([]*KVServer, cfg.n) 373 | cfg.saved = make([]*raft.Persister, cfg.n) 374 | cfg.endnames = make([][]string, cfg.n) 375 | cfg.clerks = make(map[*Clerk][]string) 376 | cfg.nextClientId = cfg.n + 1000 // client ids start 1000 above the highest serverid 377 | cfg.maxraftstate = maxraftstate 378 | cfg.start = time.Now() 379 | 380 | // create a full set of KV servers. 381 | for i := 0; i < cfg.n; i++ { 382 | cfg.StartServer(i) 383 | } 384 | 385 | cfg.ConnectAll() 386 | 387 | cfg.net.Reliable(!unreliable) 388 | 389 | return cfg 390 | } 391 | 392 | func (cfg *config) rpcTotal() int { 393 | return cfg.net.GetTotalCount() 394 | } 395 | 396 | // start a Test. 397 | // print the Test message. 398 | // e.g. cfg.begin("Test (2B): RPC counts aren't too high") 399 | func (cfg *config) begin(description string) { 400 | fmt.Printf("%s ...\n", description) 401 | cfg.t0 = time.Now() 402 | cfg.rpcs0 = cfg.rpcTotal() 403 | atomic.StoreInt32(&cfg.ops, 0) 404 | } 405 | 406 | func (cfg *config) op() { 407 | atomic.AddInt32(&cfg.ops, 1) 408 | } 409 | 410 | // end a Test -- the fact that we got here means there 411 | // was no failure. 412 | // print the Passed message, 413 | // and some performance numbers. 414 | func (cfg *config) end() { 415 | cfg.checkTimeout() 416 | if cfg.t.Failed() == false { 417 | t := time.Since(cfg.t0).Seconds() // real time 418 | npeers := cfg.n // number of Raft peers 419 | nrpc := cfg.rpcTotal() - cfg.rpcs0 // number of RPC sends 420 | ops := atomic.LoadInt32(&cfg.ops) // number of clerk get/put/append calls 421 | 422 | fmt.Printf(" ... 
Passed --") 423 | fmt.Printf(" %4.1f %d %5d %4d\n", t, npeers, nrpc, ops) 424 | } 425 | } 426 | -------------------------------------------------------------------------------- /src/kvraft/dstest.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import itertools 4 | import math 5 | import signal 6 | import subprocess 7 | import tempfile 8 | import shutil 9 | import time 10 | import os 11 | import sys 12 | import datetime 13 | from collections import defaultdict 14 | from concurrent.futures import ThreadPoolExecutor, wait, FIRST_COMPLETED 15 | from dataclasses import dataclass 16 | from pathlib import Path 17 | from typing import List, Optional, Dict, DefaultDict, Tuple 18 | 19 | import typer 20 | import rich 21 | from rich import print 22 | from rich.table import Table 23 | from rich.progress import ( 24 | Progress, 25 | TimeElapsedColumn, 26 | TimeRemainingColumn, 27 | TextColumn, 28 | BarColumn, 29 | SpinnerColumn, 30 | ) 31 | from rich.live import Live 32 | from rich.panel import Panel 33 | from rich.traceback import install 34 | 35 | install(show_locals=True) 36 | 37 | 38 | @dataclass 39 | class StatsMeter: 40 | """ 41 | Auxiliary class to keep track of online stats including: count, mean, variance 42 | Uses Welford's algorithm to compute sample mean and sample variance incrementally. 43 | https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#On-line_algorithm 44 | """ 45 | 46 | n: int = 0 47 | mean: float = 0.0 48 | S: float = 0.0 49 | 50 | def add(self, datum): 51 | self.n += 1 52 | delta = datum - self.mean 53 | # Mk = Mk-1+ (xk – Mk-1)/k 54 | self.mean += delta / self.n 55 | # Sk = Sk-1 + (xk – Mk-1)*(xk – Mk). 56 | self.S += delta * (datum - self.mean) 57 | 58 | @property 59 | def variance(self): 60 | return self.S / self.n 61 | 62 | @property 63 | def std(self): 64 | return math.sqrt(self.variance) 65 | 66 | 67 | def print_results(results: Dict[str, Dict[str, StatsMeter]], timing=False): 68 | table = Table(show_header=True, header_style="bold") 69 | table.add_column("Test") 70 | table.add_column("Failed", justify="right") 71 | table.add_column("Total", justify="right") 72 | if not timing: 73 | table.add_column("Time", justify="right") 74 | else: 75 | table.add_column("Real Time", justify="right") 76 | table.add_column("User Time", justify="right") 77 | table.add_column("System Time", justify="right") 78 | 79 | for test, stats in results.items(): 80 | if stats["completed"].n == 0: 81 | continue 82 | color = "green" if stats["failed"].n == 0 else "red" 83 | row = [ 84 | f"[{color}]{test}[/{color}]", 85 | str(stats["failed"].n), 86 | str(stats["completed"].n), 87 | ] 88 | if not timing: 89 | row.append(f"{stats['time'].mean:.2f} ± {stats['time'].std:.2f}") 90 | else: 91 | row.extend( 92 | [ 93 | f"{stats['real_time'].mean:.2f} ± {stats['real_time'].std:.2f}", 94 | f"{stats['user_time'].mean:.2f} ± {stats['user_time'].std:.2f}", 95 | f"{stats['system_time'].mean:.2f} ± {stats['system_time'].std:.2f}", 96 | ] 97 | ) 98 | table.add_row(*row) 99 | 100 | print(table) 101 | 102 | 103 | def run_test(test: str, race: bool, timing: bool): 104 | test_cmd = ["go", "test", f"-run={test}"] 105 | if race: 106 | test_cmd.append("-race") 107 | if timing: 108 | test_cmd = ["time"] + test_cmd 109 | f, path = tempfile.mkstemp() 110 | start = time.time() 111 | proc = subprocess.run(test_cmd, stdout=f, stderr=f) 112 | runtime = time.time() - start 113 | os.close(f) 114 | return test, path, proc.returncode, runtime 115 | 116 |
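# Usage sketch (not part of the original script): with Python 3.7+ and the
# `typer` and `rich` packages installed, something like
#   python dstest.py 2A 2B -n 20 -p 4 -r -o logs
# should run the 2A and 2B test groups 20 times each on 4 workers with the
# race detector enabled, copying the log of any failed run into logs/.
#
# last_line() below returns only the final line of a log file by seeking
# backwards from EOF; run_tests() uses it to parse the real/user/sys summary
# that `time` prints when --timing is given.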
117 | def last_line(file: str) -> str: 118 | with open(file, "rb") as f: 119 | f.seek(-2, os.SEEK_END) 120 | while f.read(1) != b"\n": 121 | f.seek(-2, os.SEEK_CUR) 122 | line = f.readline().decode() 123 | return line 124 | 125 | 126 | # fmt: off 127 | def run_tests( 128 | tests: List[str], 129 | sequential: bool = typer.Option(False, '--sequential', '-s', help='Run all test of each group in order'), 130 | workers: int = typer.Option(1, '--workers', '-p', help='Number of parallel tasks'), 131 | iterations: int = typer.Option(10, '--iter', '-n', help='Number of iterations to run'), 132 | output: Optional[Path] = typer.Option(None, '--output', '-o', help='Output path to use'), 133 | verbose: int = typer.Option(0, '--verbose', '-v', help='Verbosity level', count=True), 134 | archive: bool = typer.Option(False, '--archive', '-a', help='Save all logs intead of only failed ones'), 135 | race: bool = typer.Option(False, '--race/--no-race', '-r/-R', help='Run with race checker'), 136 | loop: bool = typer.Option(False, '--loop', '-l', help='Run continuously'), 137 | growth: int = typer.Option(10, '--growth', '-g', help='Growth ratio of iterations when using --loop'), 138 | timing: bool = typer.Option(False, '--timing', '-t', help='Report timing, only works on macOS'), 139 | # fmt: on 140 | ): 141 | 142 | if output is None: 143 | timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") 144 | output = Path(timestamp) 145 | 146 | if race: 147 | print("[yellow]Running with the race detector\n[/yellow]") 148 | 149 | if verbose > 0: 150 | print(f"[yellow] Verbosity level set to {verbose}[/yellow]") 151 | os.environ['VERBOSE'] = str(verbose) 152 | 153 | while True: 154 | 155 | total = iterations * len(tests) 156 | completed = 0 157 | 158 | results = {test: defaultdict(StatsMeter) for test in tests} 159 | 160 | if sequential: 161 | test_instances = itertools.chain.from_iterable(itertools.repeat(test, iterations) for test in tests) 162 | else: 163 | test_instances = itertools.chain.from_iterable(itertools.repeat(tests, iterations)) 164 | test_instances = iter(test_instances) 165 | 166 | total_progress = Progress( 167 | "[progress.description]{task.description}", 168 | BarColumn(), 169 | TimeRemainingColumn(), 170 | "[progress.percentage]{task.percentage:>3.0f}%", 171 | TimeElapsedColumn(), 172 | ) 173 | total_task = total_progress.add_task("[yellow]Tests[/yellow]", total=total) 174 | 175 | task_progress = Progress( 176 | "[progress.description]{task.description}", 177 | SpinnerColumn(), 178 | BarColumn(), 179 | "{task.completed}/{task.total}", 180 | ) 181 | tasks = {test: task_progress.add_task(test, total=iterations) for test in tests} 182 | 183 | progress_table = Table.grid() 184 | progress_table.add_row(total_progress) 185 | progress_table.add_row(Panel.fit(task_progress)) 186 | 187 | with Live(progress_table, transient=True) as live: 188 | 189 | def handler(_, frame): 190 | live.stop() 191 | print('\n') 192 | print_results(results) 193 | sys.exit(1) 194 | 195 | signal.signal(signal.SIGINT, handler) 196 | 197 | with ThreadPoolExecutor(max_workers=workers) as executor: 198 | 199 | futures = [] 200 | while completed < total: 201 | n = len(futures) 202 | if n < workers: 203 | for test in itertools.islice(test_instances, workers-n): 204 | futures.append(executor.submit(run_test, test, race, timing)) 205 | 206 | done, not_done = wait(futures, return_when=FIRST_COMPLETED) 207 | 208 | for future in done: 209 | test, path, rc, runtime = future.result() 210 | 211 | results[test]['completed'].add(1) 212 | 
results[test]['time'].add(runtime) 213 | task_progress.update(tasks[test], advance=1) 214 | dest = (output / f"{test}_{completed}.log").as_posix() 215 | if rc != 0: 216 | print(f"Failed test {test} - {dest}") 217 | task_progress.update(tasks[test], description=f"[red]{test}[/red]") 218 | results[test]['failed'].add(1) 219 | else: 220 | if results[test]['completed'].n == iterations and results[test]['failed'].n == 0: 221 | task_progress.update(tasks[test], description=f"[green]{test}[/green]") 222 | 223 | if rc != 0 or archive: 224 | output.mkdir(exist_ok=True, parents=True) 225 | shutil.copy(path, dest) 226 | 227 | if timing: 228 | line = last_line(path) 229 | real, _, user, _, system, _ = line.replace(' '*8, '').split(' ') 230 | results[test]['real_time'].add(float(real)) 231 | results[test]['user_time'].add(float(user)) 232 | results[test]['system_time'].add(float(system)) 233 | 234 | os.remove(path) 235 | 236 | completed += 1 237 | total_progress.update(total_task, advance=1) 238 | 239 | futures = list(not_done) 240 | 241 | print_results(results, timing) 242 | 243 | if loop: 244 | iterations *= growth 245 | print(f"[yellow]Increasing iterations to {iterations}[/yellow]") 246 | else: 247 | break 248 | 249 | 250 | if __name__ == "__main__": 251 | typer.run(run_tests) 252 | -------------------------------------------------------------------------------- /src/kvraft/server.go: -------------------------------------------------------------------------------- 1 | package kvraft 2 | 3 | import ( 4 | "bytes" 5 | "mit6.824/labgob" 6 | "mit6.824/labrpc" 7 | "mit6.824/raft" 8 | "sync" 9 | "sync/atomic" 10 | ) 11 | 12 | type Op struct { 13 | Type string // 任务类型 14 | Key string 15 | Value string 16 | ClientId ClientId // 任务的Client 17 | RequestId RequestId // 对应的Client的这条任务的下标 18 | } 19 | 20 | type KVServer struct { 21 | mu sync.Mutex 22 | me int 23 | rf *raft.Raft // 该状态机对应的Raft 24 | applyCh chan raft.ApplyMsg // Raft apply的Logs 25 | dead int32 // 同Raft的dead 26 | // 3A 27 | kv map[string]string // (持久化)Key/Value数据库 28 | clientLastTaskIndex map[ClientId]RequestId // (持久化)每个客户端已完成的最后一个任务的下标 29 | taskTerm map[int]int // 已完成的任务(用于校验完成的Index对应的任务是不是自己发布的任务) 30 | doneCond map[int]*sync.Cond // Client发送到该Server的任务,任务完成后通知Cond回复Client 31 | // 3B 32 | persister *raft.Persister 33 | maxRaftState int // 当Raft的RaftStateSize接近该值时进行Snapshot 34 | } 35 | 36 | func (kv *KVServer) Get(args *GetArgs, reply *GetReply) { 37 | kv.mu.Lock() 38 | done, isLeader, index, term := kv.startOp("Get", args.Key, "", args.ClientTag, args.TaskIndex) 39 | if done { 40 | // 任务已完成过 41 | reply.Err = OK 42 | reply.Value = kv.kv[args.Key] 43 | } else if !isLeader { 44 | // 不是Leader 45 | reply.Err = ErrWrongLeader 46 | } else { 47 | cond := kv.doneCond[index] 48 | kv.mu.Unlock() 49 | cond.L.Lock() 50 | // 等待任务完成,推送至cond唤醒 51 | cond.Wait() 52 | cond.L.Unlock() 53 | kv.mu.Lock() 54 | // 任务对应的Index已经被Apply,检查完成的任务是否是自己发布的那个 55 | if term == kv.taskTerm[index] { 56 | // 完成的任务和自己发布的任务相同 57 | if value, ok := kv.kv[args.Key]; ok { 58 | reply.Err = OK 59 | reply.Value = value 60 | } else { 61 | reply.Err = ErrNoKey 62 | } 63 | } else { 64 | // 任务被其他Server处理了 65 | reply.Err = ErrWrongLeader 66 | } 67 | delete(kv.taskTerm, index) 68 | delete(kv.doneCond, index) 69 | } 70 | kv.mu.Unlock() 71 | } 72 | 73 | func (kv *KVServer) PutAppend(args *PutAppendArgs, reply *PutAppendReply) { 74 | kv.mu.Lock() 75 | done, isLeader, index, term := kv.startOp(args.Op, args.Key, args.Value, args.ClientTag, args.TaskIndex) 76 | if done { 77 | // 任务已完成过 78 | reply.Err = 
OK 79 | } else if !isLeader { 80 | // 不是Leader 81 | reply.Err = ErrWrongLeader 82 | } else { 83 | cond := kv.doneCond[index] 84 | kv.mu.Unlock() 85 | cond.L.Lock() 86 | // 等待任务完成,推送至cond唤醒 87 | cond.Wait() 88 | cond.L.Unlock() 89 | kv.mu.Lock() 90 | if term == kv.taskTerm[index] { 91 | // 完成的任务和自己发布的任务相同 92 | reply.Err = OK 93 | } else { 94 | // 任务被其他Server处理了 95 | reply.Err = ErrWrongLeader 96 | } 97 | delete(kv.taskTerm, index) 98 | delete(kv.doneCond, index) 99 | } 100 | kv.mu.Unlock() 101 | } 102 | 103 | // 开始执行一条Log,Lock使用(已完成过该任务,是否为Leader,新任务的index) 104 | func (kv *KVServer) startOp(op string, key string, value string, clientTag ClientId, clientTaskIndex RequestId) (bool, bool, int, int) { 105 | if kv.getClientLastIndex(clientTag) >= clientTaskIndex { 106 | // 这个任务已经完成过,直接返回 107 | return true, false, 0, 0 108 | } 109 | // 要求Raft开始一次提交 110 | index, term, isLeader := kv.rf.Start(Op{ 111 | Type: op, 112 | Key: key, 113 | Value: value, 114 | ClientId: clientTag, 115 | RequestId: clientTaskIndex, 116 | }) 117 | if !isLeader { 118 | // 不是Leader 119 | return false, false, 0, 0 120 | } 121 | if _, ok := kv.doneCond[index]; !ok { 122 | // 没有执行过这条Log,存入一个cond,完成该任务后通过这个cond通知所有goroutine 123 | kv.doneCond[index] = &sync.Cond{L: &sync.Mutex{}} 124 | } 125 | return false, true, index, term 126 | } 127 | 128 | // 持续接受来自Raft的Log 129 | func (kv *KVServer) applier() { 130 | for kv.killed() == false { 131 | // 接受一条被Apply的Log 132 | msg := <-kv.applyCh 133 | kv.mu.Lock() 134 | if msg.CommandValid { 135 | // Command Log,解析Log中的command 136 | command, _ := msg.Command.(Op) 137 | // 检查任务是否已完成过(一个任务/Log可能会发送多次,因为前几次可能因为某种原因没有及时提交) 138 | // 最后一条已完成的任务的Index必须小于当前任务才算没有完成过,因为线性一致性 139 | DPrintf("S[%v] apply %v[%v, %v] index[%v]\n", kv.me, command.Type, command.Key, command.Value, msg.CommandIndex) 140 | if command.Type != "Get" && kv.getClientLastIndex(command.ClientId) < command.RequestId { 141 | // 如果是第一次完成该任务/Log,才保存到KV中 142 | if command.Type == "Put" { 143 | // Put 144 | kv.kv[command.Key] = command.Value 145 | } else { 146 | // Append 147 | if _, ok := kv.kv[command.Key]; ok { 148 | kv.kv[command.Key] += command.Value 149 | } else { 150 | kv.kv[command.Key] = command.Value 151 | } 152 | } 153 | // 该任务的Index比之前存的任务Index大,更新 154 | kv.clientLastTaskIndex[command.ClientId] = command.RequestId 155 | } 156 | if cond, ok := kv.doneCond[msg.CommandIndex]; ok { 157 | // 这个任务被给到过自己,保存它的Term,用来校验 158 | kv.taskTerm[msg.CommandIndex] = msg.CommandTerm 159 | // 通知所有在等待该任务的goroutine 160 | cond.Broadcast() 161 | } 162 | // 检查是否需要Snapshot 163 | if kv.maxRaftState != -1 && float64(kv.persister.RaftStateSize()) > float64(kv.maxRaftState)*serverSnapshotStatePercent { 164 | // Raft状态的大小接近阈值,要求Raft进行Snapshot 165 | kv.saveSnapshot(msg.CommandIndex) 166 | } 167 | } else if msg.SnapshotValid { 168 | // Snapshot Log,只有在Leader发给该Server的InstallSnapshot种才会走到这里,这表明该Server的Logs过于老旧 169 | if kv.rf.CondInstallSnapshot(msg.SnapshotTerm, msg.SnapshotIndex, msg.Snapshot) { 170 | kv.readSnapshot(msg.Snapshot) 171 | } 172 | } 173 | kv.mu.Unlock() 174 | } 175 | } 176 | 177 | // 通过ClientTag获得该Client完成的最后一条任务的下标,0则没有完成 178 | func (kv *KVServer) getClientLastIndex(client ClientId) RequestId { 179 | if last, ok := kv.clientLastTaskIndex[client]; ok { 180 | return last 181 | } 182 | return 0 183 | } 184 | 185 | func StartKVServer(servers []*labrpc.ClientEnd, me int, persister *raft.Persister, maxRaftState int) *KVServer { 186 | labgob.Register(Op{}) 187 | applyCh := make(chan raft.ApplyMsg) 188 | kv := &KVServer{ 189 | me: me, 190 | rf: 
raft.Make(servers, me, persister, applyCh), 191 | applyCh: applyCh, 192 | maxRaftState: maxRaftState, 193 | kv: make(map[string]string), 194 | clientLastTaskIndex: make(map[ClientId]RequestId), 195 | taskTerm: make(map[int]int), 196 | doneCond: make(map[int]*sync.Cond), 197 | persister: persister, 198 | } 199 | kv.readSnapshot(kv.persister.ReadSnapshot()) 200 | go kv.applier() 201 | return kv 202 | } 203 | 204 | // 保存Snapshot(被动快照) 205 | func (kv *KVServer) saveSnapshot(lastIndex int) { 206 | writer := new(bytes.Buffer) 207 | encoder := labgob.NewEncoder(writer) 208 | if encoder.Encode(kv.kv) == nil && 209 | encoder.Encode(kv.clientLastTaskIndex) == nil { 210 | kv.rf.Snapshot(lastIndex, writer.Bytes()) 211 | DPrintf("S[%v] save snapshot(%v) size[%v]\n", kv.me, lastIndex, len(writer.Bytes())) 212 | } 213 | } 214 | 215 | // 读取Snapshot 216 | func (kv *KVServer) readSnapshot(data []byte) { 217 | if data == nil || len(data) < 1 { 218 | return 219 | } 220 | decoder := labgob.NewDecoder(bytes.NewBuffer(data)) 221 | var kvMap map[string]string 222 | var clientLastTaskIndex map[ClientId]RequestId 223 | if decoder.Decode(&kvMap) == nil && 224 | decoder.Decode(&clientLastTaskIndex) == nil { 225 | kv.kv = kvMap 226 | kv.clientLastTaskIndex = clientLastTaskIndex 227 | DPrintf("S[%v] readSnapshot size[%v]\n", kv.me, len(data)) 228 | } 229 | } 230 | 231 | func (kv *KVServer) Kill() { 232 | atomic.StoreInt32(&kv.dead, 1) 233 | DPrintf("S[%v] killed\n", kv.me) 234 | kv.rf.Kill() 235 | } 236 | 237 | func (kv *KVServer) killed() bool { 238 | z := atomic.LoadInt32(&kv.dead) 239 | return z == 1 240 | } 241 | -------------------------------------------------------------------------------- /src/labgob/labgob.go: -------------------------------------------------------------------------------- 1 | package labgob 2 | 3 | // 4 | // trying to send non-capitalized fields over RPC produces a range of 5 | // misbehavior, including both mysterious incorrect computation and 6 | // outright crashes. so this wrapper around Go's encoding/gob warns 7 | // about non-capitalized field names. 
8 | // 9 | 10 | import "encoding/gob" 11 | import "io" 12 | import "reflect" 13 | import "fmt" 14 | import "sync" 15 | import "unicode" 16 | import "unicode/utf8" 17 | 18 | var mu sync.Mutex 19 | var errorCount int // for TestCapital 20 | var checked map[reflect.Type]bool 21 | 22 | type LabEncoder struct { 23 | gob *gob.Encoder 24 | } 25 | 26 | func NewEncoder(w io.Writer) *LabEncoder { 27 | enc := &LabEncoder{} 28 | enc.gob = gob.NewEncoder(w) 29 | return enc 30 | } 31 | 32 | func (enc *LabEncoder) Encode(e interface{}) error { 33 | checkValue(e) 34 | return enc.gob.Encode(e) 35 | } 36 | 37 | func (enc *LabEncoder) EncodeValue(value reflect.Value) error { 38 | checkValue(value.Interface()) 39 | return enc.gob.EncodeValue(value) 40 | } 41 | 42 | type LabDecoder struct { 43 | gob *gob.Decoder 44 | } 45 | 46 | func NewDecoder(r io.Reader) *LabDecoder { 47 | dec := &LabDecoder{} 48 | dec.gob = gob.NewDecoder(r) 49 | return dec 50 | } 51 | 52 | func (dec *LabDecoder) Decode(e interface{}) error { 53 | checkValue(e) 54 | checkDefault(e) 55 | return dec.gob.Decode(e) 56 | } 57 | 58 | func Register(value interface{}) { 59 | checkValue(value) 60 | gob.Register(value) 61 | } 62 | 63 | func RegisterName(name string, value interface{}) { 64 | checkValue(value) 65 | gob.RegisterName(name, value) 66 | } 67 | 68 | func checkValue(value interface{}) { 69 | checkType(reflect.TypeOf(value)) 70 | } 71 | 72 | func checkType(t reflect.Type) { 73 | k := t.Kind() 74 | 75 | mu.Lock() 76 | // only complain once, and avoid recursion. 77 | if checked == nil { 78 | checked = map[reflect.Type]bool{} 79 | } 80 | if checked[t] { 81 | mu.Unlock() 82 | return 83 | } 84 | checked[t] = true 85 | mu.Unlock() 86 | 87 | switch k { 88 | case reflect.Struct: 89 | for i := 0; i < t.NumField(); i++ { 90 | f := t.Field(i) 91 | rune, _ := utf8.DecodeRuneInString(f.Name) 92 | if unicode.IsUpper(rune) == false { 93 | // ta da 94 | fmt.Printf("labgob error: lower-case field %v of %v in RPC or persist/snapshot will break your Raft\n", 95 | f.Name, t.Name()) 96 | mu.Lock() 97 | errorCount += 1 98 | mu.Unlock() 99 | } 100 | checkType(f.Type) 101 | } 102 | return 103 | case reflect.Slice, reflect.Array, reflect.Ptr: 104 | checkType(t.Elem()) 105 | return 106 | case reflect.Map: 107 | checkType(t.Elem()) 108 | checkType(t.Key()) 109 | return 110 | default: 111 | return 112 | } 113 | } 114 | 115 | // 116 | // warn if the value contains non-default values, 117 | // as it would if one sent an RPC but the reply 118 | // struct was already modified. if the RPC reply 119 | // contains default values, GOB won't overwrite 120 | // the non-default value. 121 | // 122 | func checkDefault(value interface{}) { 123 | if value == nil { 124 | return 125 | } 126 | checkDefault1(reflect.ValueOf(value), 1, "") 127 | } 128 | 129 | func checkDefault1(value reflect.Value, depth int, name string) { 130 | if depth > 3 { 131 | return 132 | } 133 | 134 | t := value.Type() 135 | k := t.Kind() 136 | 137 | switch k { 138 | case reflect.Struct: 139 | for i := 0; i < t.NumField(); i++ { 140 | vv := value.Field(i) 141 | name1 := t.Field(i).Name 142 | if name != "" { 143 | name1 = name + "." 
+ name1 144 | } 145 | checkDefault1(vv, depth+1, name1) 146 | } 147 | return 148 | case reflect.Ptr: 149 | if value.IsNil() { 150 | return 151 | } 152 | checkDefault1(value.Elem(), depth+1, name) 153 | return 154 | case reflect.Bool, 155 | reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64, 156 | reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64, 157 | reflect.Uintptr, reflect.Float32, reflect.Float64, 158 | reflect.String: 159 | if reflect.DeepEqual(reflect.Zero(t).Interface(), value.Interface()) == false { 160 | mu.Lock() 161 | if errorCount < 1 { 162 | what := name 163 | if what == "" { 164 | what = t.Name() 165 | } 166 | // this warning typically arises if code re-uses the same RPC reply 167 | // variable for multiple RPC calls, or if code restores persisted 168 | // state into variable that already have non-default values. 169 | fmt.Printf("labgob warning: Decoding into a non-default variable/field %v may not work\n", 170 | what) 171 | } 172 | errorCount += 1 173 | mu.Unlock() 174 | } 175 | return 176 | } 177 | } 178 | -------------------------------------------------------------------------------- /src/labgob/test_test.go: -------------------------------------------------------------------------------- 1 | package labgob 2 | 3 | import "testing" 4 | 5 | import "bytes" 6 | 7 | type T1 struct { 8 | T1int0 int 9 | T1int1 int 10 | T1string0 string 11 | T1string1 string 12 | } 13 | 14 | type T2 struct { 15 | T2slice []T1 16 | T2map map[int]*T1 17 | T2t3 interface{} 18 | } 19 | 20 | type T3 struct { 21 | T3int999 int 22 | } 23 | 24 | // 25 | // test that we didn't break GOB. 26 | // 27 | func TestGOB(t *testing.T) { 28 | e0 := errorCount 29 | 30 | w := new(bytes.Buffer) 31 | 32 | Register(T3{}) 33 | 34 | { 35 | x0 := 0 36 | x1 := 1 37 | t1 := T1{} 38 | t1.T1int1 = 1 39 | t1.T1string1 = "6.824" 40 | t2 := T2{} 41 | t2.T2slice = []T1{T1{}, t1} 42 | t2.T2map = map[int]*T1{} 43 | t2.T2map[99] = &T1{1, 2, "x", "y"} 44 | t2.T2t3 = T3{999} 45 | 46 | e := NewEncoder(w) 47 | e.Encode(x0) 48 | e.Encode(x1) 49 | e.Encode(t1) 50 | e.Encode(t2) 51 | } 52 | data := w.Bytes() 53 | 54 | { 55 | var x0 int 56 | var x1 int 57 | var t1 T1 58 | var t2 T2 59 | 60 | r := bytes.NewBuffer(data) 61 | d := NewDecoder(r) 62 | if d.Decode(&x0) != nil || 63 | d.Decode(&x1) != nil || 64 | d.Decode(&t1) != nil || 65 | d.Decode(&t2) != nil { 66 | t.Fatalf("Decode failed") 67 | } 68 | 69 | if x0 != 0 { 70 | t.Fatalf("wrong x0 %v\n", x0) 71 | } 72 | if x1 != 1 { 73 | t.Fatalf("wrong x1 %v\n", x1) 74 | } 75 | if t1.T1int0 != 0 { 76 | t.Fatalf("wrong t1.T1int0 %v\n", t1.T1int0) 77 | } 78 | if t1.T1int1 != 1 { 79 | t.Fatalf("wrong t1.T1int1 %v\n", t1.T1int1) 80 | } 81 | if t1.T1string0 != "" { 82 | t.Fatalf("wrong t1.T1string0 %v\n", t1.T1string0) 83 | } 84 | if t1.T1string1 != "6.824" { 85 | t.Fatalf("wrong t1.T1string1 %v\n", t1.T1string1) 86 | } 87 | if len(t2.T2slice) != 2 { 88 | t.Fatalf("wrong t2.T2slice len %v\n", len(t2.T2slice)) 89 | } 90 | if t2.T2slice[1].T1int1 != 1 { 91 | t.Fatalf("wrong slice value\n") 92 | } 93 | if len(t2.T2map) != 1 { 94 | t.Fatalf("wrong t2.T2map len %v\n", len(t2.T2map)) 95 | } 96 | if t2.T2map[99].T1string1 != "y" { 97 | t.Fatalf("wrong map value\n") 98 | } 99 | t3 := (t2.T2t3).(T3) 100 | if t3.T3int999 != 999 { 101 | t.Fatalf("wrong t2.T2t3.T3int999\n") 102 | } 103 | } 104 | 105 | if errorCount != e0 { 106 | t.Fatalf("there were errors, but should not have been") 107 | } 108 | } 109 | 110 | type T4 struct { 111 | Yes int 112 | no int 113 | } 
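// T4's unexported field `no` is what triggers the single capitalization
// warning that TestCapital below checks for.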
114 | 115 | // 116 | // make sure we check capitalization 117 | // labgob prints one warning during this test. 118 | // 119 | func TestCapital(t *testing.T) { 120 | e0 := errorCount 121 | 122 | v := []map[*T4]int{} 123 | 124 | w := new(bytes.Buffer) 125 | e := NewEncoder(w) 126 | e.Encode(v) 127 | data := w.Bytes() 128 | 129 | var v1 []map[T4]int 130 | r := bytes.NewBuffer(data) 131 | d := NewDecoder(r) 132 | d.Decode(&v1) 133 | 134 | if errorCount != e0+1 { 135 | t.Fatalf("failed to warn about lower-case field") 136 | } 137 | } 138 | 139 | // 140 | // check that we warn when someone sends a default value over 141 | // RPC but the target into which we're decoding holds a non-default 142 | // value, which GOB seems not to overwrite as you'd expect. 143 | // 144 | // labgob does not print a warning. 145 | // 146 | func TestDefault(t *testing.T) { 147 | e0 := errorCount 148 | 149 | type DD struct { 150 | X int 151 | } 152 | 153 | // send a default value... 154 | dd1 := DD{} 155 | 156 | w := new(bytes.Buffer) 157 | e := NewEncoder(w) 158 | e.Encode(dd1) 159 | data := w.Bytes() 160 | 161 | // and receive it into memory that already 162 | // holds non-default values. 163 | reply := DD{99} 164 | 165 | r := bytes.NewBuffer(data) 166 | d := NewDecoder(r) 167 | d.Decode(&reply) 168 | 169 | if errorCount != e0+1 { 170 | t.Fatalf("failed to warn about decoding into non-default value") 171 | } 172 | } 173 | -------------------------------------------------------------------------------- /src/labrpc/labrpc.go: -------------------------------------------------------------------------------- 1 | package labrpc 2 | 3 | // 4 | // channel-based RPC, for 824 labs. 5 | // 6 | // simulates a network that can lose requests, lose replies, 7 | // delay messages, and entirely disconnect particular hosts. 8 | // 9 | // we will use the original labrpc.go to test your code for grading. 10 | // so, while you can modify this code to help you debug, please 11 | // test against the original before submitting. 12 | // 13 | // adapted from Go net/rpc/server.go. 14 | // 15 | // sends labgob-encoded values to ensure that RPCs 16 | // don't include references to program objects. 17 | // 18 | // net := MakeNetwork() -- holds network, clients, servers. 19 | // end := net.MakeEnd(endname) -- create a client end-point, to talk to one server. 20 | // net.AddServer(servername, server) -- adds a named server to network. 21 | // net.DeleteServer(servername) -- eliminate the named server. 22 | // net.Connect(endname, servername) -- connect a client to a server. 23 | // net.Enable(endname, enabled) -- enable/disable a client. 24 | // net.Reliable(bool) -- false means drop/delay messages 25 | // 26 | // end.Call("Raft.AppendEntries", &args, &reply) -- send an RPC, wait for reply. 27 | // the "Raft" is the name of the server struct to be called. 28 | // the "AppendEntries" is the name of the method to be called. 29 | // Call() returns true to indicate that the server executed the request 30 | // and the reply is valid. 31 | // Call() returns false if the network lost the request or reply 32 | // or the server is down. 33 | // It is OK to have multiple Call()s in progress at the same time on the 34 | // same ClientEnd. 35 | // Concurrent calls to Call() may be delivered to the server out of order, 36 | // since the network may re-order messages. 37 | // Call() is guaranteed to return (perhaps after a delay) *except* if the 38 | // handler function on the server side does not return. 
39 | // the server RPC handler function must declare its args and reply arguments 40 | // as pointers, so that their types exactly match the types of the arguments 41 | // to Call(). 42 | // 43 | // srv := MakeServer() 44 | // srv.AddService(svc) -- a server can have multiple services, e.g. Raft and k/v 45 | // pass srv to net.AddServer() 46 | // 47 | // svc := MakeService(receiverObject) -- obj's methods will handle RPCs 48 | // much like Go's rpcs.Register() 49 | // pass svc to srv.AddService() 50 | // 51 | 52 | import "mit6.824/labgob" 53 | import "bytes" 54 | import "reflect" 55 | import "sync" 56 | import "log" 57 | import "strings" 58 | import "math/rand" 59 | import "time" 60 | import "sync/atomic" 61 | 62 | type reqMsg struct { 63 | endname interface{} // name of sending ClientEnd 64 | svcMeth string // e.g. "Raft.AppendEntries" 65 | argsType reflect.Type 66 | args []byte 67 | replyCh chan replyMsg 68 | } 69 | 70 | type replyMsg struct { 71 | ok bool 72 | reply []byte 73 | } 74 | 75 | type ClientEnd struct { 76 | endname interface{} // this end-point's name 77 | ch chan reqMsg // copy of Network.endCh 78 | done chan struct{} // closed when Network is cleaned up 79 | } 80 | 81 | // send an RPC, wait for the reply. 82 | // the return value indicates success; false means that 83 | // no reply was received from the server. 84 | func (e *ClientEnd) Call(svcMeth string, args interface{}, reply interface{}) bool { 85 | req := reqMsg{} 86 | req.endname = e.endname 87 | req.svcMeth = svcMeth 88 | req.argsType = reflect.TypeOf(args) 89 | req.replyCh = make(chan replyMsg) 90 | 91 | qb := new(bytes.Buffer) 92 | qe := labgob.NewEncoder(qb) 93 | if err := qe.Encode(args); err != nil { 94 | panic(err) 95 | } 96 | req.args = qb.Bytes() 97 | 98 | // 99 | // send the request. 100 | // 101 | select { 102 | case e.ch <- req: 103 | // the request has been sent. 104 | case <-e.done: 105 | // entire Network has been destroyed. 106 | return false 107 | } 108 | 109 | // 110 | // wait for the reply. 
111 | // 112 | rep := <-req.replyCh 113 | if rep.ok { 114 | rb := bytes.NewBuffer(rep.reply) 115 | rd := labgob.NewDecoder(rb) 116 | if err := rd.Decode(reply); err != nil { 117 | log.Fatalf("ClientEnd.Call(): decode reply: %v\n", err) 118 | } 119 | return true 120 | } else { 121 | return false 122 | } 123 | } 124 | 125 | type Network struct { 126 | mu sync.Mutex 127 | reliable bool 128 | longDelays bool // pause a long time on send on disabled connection 129 | longReordering bool // sometimes delay replies a long time 130 | ends map[interface{}]*ClientEnd // ends, by name 131 | enabled map[interface{}]bool // by end name 132 | servers map[interface{}]*Server // servers, by name 133 | connections map[interface{}]interface{} // endname -> servername 134 | endCh chan reqMsg 135 | done chan struct{} // closed when Network is cleaned up 136 | count int32 // total RPC count, for statistics 137 | bytes int64 // total bytes send, for statistics 138 | } 139 | 140 | func MakeNetwork() *Network { 141 | rn := &Network{} 142 | rn.reliable = true 143 | rn.ends = map[interface{}]*ClientEnd{} 144 | rn.enabled = map[interface{}]bool{} 145 | rn.servers = map[interface{}]*Server{} 146 | rn.connections = map[interface{}](interface{}){} 147 | rn.endCh = make(chan reqMsg) 148 | rn.done = make(chan struct{}) 149 | 150 | // single goroutine to handle all ClientEnd.Call()s 151 | go func() { 152 | for { 153 | select { 154 | case xreq := <-rn.endCh: 155 | atomic.AddInt32(&rn.count, 1) 156 | atomic.AddInt64(&rn.bytes, int64(len(xreq.args))) 157 | go rn.processReq(xreq) 158 | case <-rn.done: 159 | return 160 | } 161 | } 162 | }() 163 | 164 | return rn 165 | } 166 | 167 | func (rn *Network) Cleanup() { 168 | close(rn.done) 169 | } 170 | 171 | func (rn *Network) Reliable(yes bool) { 172 | rn.mu.Lock() 173 | defer rn.mu.Unlock() 174 | 175 | rn.reliable = yes 176 | } 177 | 178 | func (rn *Network) LongReordering(yes bool) { 179 | rn.mu.Lock() 180 | defer rn.mu.Unlock() 181 | 182 | rn.longReordering = yes 183 | } 184 | 185 | func (rn *Network) LongDelays(yes bool) { 186 | rn.mu.Lock() 187 | defer rn.mu.Unlock() 188 | 189 | rn.longDelays = yes 190 | } 191 | 192 | func (rn *Network) readEndnameInfo(endname interface{}) (enabled bool, 193 | servername interface{}, server *Server, reliable bool, longreordering bool, 194 | ) { 195 | rn.mu.Lock() 196 | defer rn.mu.Unlock() 197 | 198 | enabled = rn.enabled[endname] 199 | servername = rn.connections[endname] 200 | if servername != nil { 201 | server = rn.servers[servername] 202 | } 203 | reliable = rn.reliable 204 | longreordering = rn.longReordering 205 | return 206 | } 207 | 208 | func (rn *Network) isServerDead(endname interface{}, servername interface{}, server *Server) bool { 209 | rn.mu.Lock() 210 | defer rn.mu.Unlock() 211 | 212 | if rn.enabled[endname] == false || rn.servers[servername] != server { 213 | return true 214 | } 215 | return false 216 | } 217 | 218 | func (rn *Network) processReq(req reqMsg) { 219 | enabled, servername, server, reliable, longreordering := rn.readEndnameInfo(req.endname) 220 | 221 | if enabled && servername != nil && server != nil { 222 | if reliable == false { 223 | // short delay 224 | ms := (rand.Int() % 27) 225 | time.Sleep(time.Duration(ms) * time.Millisecond) 226 | } 227 | 228 | if reliable == false && (rand.Int()%1000) < 100 { 229 | // drop the request, return as if timeout 230 | req.replyCh <- replyMsg{false, nil} 231 | return 232 | } 233 | 234 | // execute the request (call the RPC handler). 
235 | // in a separate thread so that we can periodically check 236 | // if the server has been killed and the RPC should get a 237 | // failure reply. 238 | ech := make(chan replyMsg) 239 | go func() { 240 | r := server.dispatch(req) 241 | ech <- r 242 | }() 243 | 244 | // wait for handler to return, 245 | // but stop waiting if DeleteServer() has been called, 246 | // and return an error. 247 | var reply replyMsg 248 | replyOK := false 249 | serverDead := false 250 | for replyOK == false && serverDead == false { 251 | select { 252 | case reply = <-ech: 253 | replyOK = true 254 | case <-time.After(100 * time.Millisecond): 255 | serverDead = rn.isServerDead(req.endname, servername, server) 256 | if serverDead { 257 | go func() { 258 | <-ech // drain channel to let the goroutine created earlier terminate 259 | }() 260 | } 261 | } 262 | } 263 | 264 | // do not reply if DeleteServer() has been called, i.e. 265 | // the server has been killed. this is needed to avoid 266 | // situation in which a client gets a positive reply 267 | // to an Append, but the server persisted the update 268 | // into the old Persister. config.go is careful to call 269 | // DeleteServer() before superseding the Persister. 270 | serverDead = rn.isServerDead(req.endname, servername, server) 271 | 272 | if replyOK == false || serverDead == true { 273 | // server was killed while we were waiting; return error. 274 | req.replyCh <- replyMsg{false, nil} 275 | } else if reliable == false && (rand.Int()%1000) < 100 { 276 | // drop the reply, return as if timeout 277 | req.replyCh <- replyMsg{false, nil} 278 | } else if longreordering == true && rand.Intn(900) < 600 { 279 | // delay the response for a while 280 | ms := 200 + rand.Intn(1+rand.Intn(2000)) 281 | // Russ points out that this timer arrangement will decrease 282 | // the number of goroutines, so that the race 283 | // detector is less likely to get upset. 284 | time.AfterFunc(time.Duration(ms)*time.Millisecond, func() { 285 | atomic.AddInt64(&rn.bytes, int64(len(reply.reply))) 286 | req.replyCh <- reply 287 | }) 288 | } else { 289 | atomic.AddInt64(&rn.bytes, int64(len(reply.reply))) 290 | req.replyCh <- reply 291 | } 292 | } else { 293 | // simulate no reply and eventual timeout. 294 | ms := 0 295 | if rn.longDelays { 296 | // let Raft tests check that leader doesn't send 297 | // RPCs synchronously. 298 | ms = (rand.Int() % 7000) 299 | } else { 300 | // many kv tests require the client to try each 301 | // server in fairly rapid succession. 302 | ms = (rand.Int() % 100) 303 | } 304 | time.AfterFunc(time.Duration(ms)*time.Millisecond, func() { 305 | req.replyCh <- replyMsg{false, nil} 306 | }) 307 | } 308 | 309 | } 310 | 311 | // create a client end-point. 312 | // start the thread that listens and delivers. 
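// for reference, the tests in test_test.go wire these pieces together
// roughly like this (the names "server99", "end1-99", JunkServer and
// Handler2 are taken from that file; this is only an illustrative sketch):
//
//   rn := MakeNetwork()
//   rs := MakeServer()
//   rs.AddService(MakeService(&JunkServer{}))
//   rn.AddServer("server99", rs)
//   e := rn.MakeEnd("end1-99")
//   rn.Connect("end1-99", "server99")
//   rn.Enable("end1-99", true)
//   reply := ""
//   e.Call("JunkServer.Handler2", 111, &reply)
//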
313 | func (rn *Network) MakeEnd(endname interface{}) *ClientEnd { 314 | rn.mu.Lock() 315 | defer rn.mu.Unlock() 316 | 317 | if _, ok := rn.ends[endname]; ok { 318 | log.Fatalf("MakeEnd: %v already exists\n", endname) 319 | } 320 | 321 | e := &ClientEnd{} 322 | e.endname = endname 323 | e.ch = rn.endCh 324 | e.done = rn.done 325 | rn.ends[endname] = e 326 | rn.enabled[endname] = false 327 | rn.connections[endname] = nil 328 | 329 | return e 330 | } 331 | 332 | func (rn *Network) AddServer(servername interface{}, rs *Server) { 333 | rn.mu.Lock() 334 | defer rn.mu.Unlock() 335 | 336 | rn.servers[servername] = rs 337 | } 338 | 339 | func (rn *Network) DeleteServer(servername interface{}) { 340 | rn.mu.Lock() 341 | defer rn.mu.Unlock() 342 | 343 | rn.servers[servername] = nil 344 | } 345 | 346 | // connect a ClientEnd to a server. 347 | // a ClientEnd can only be connected once in its lifetime. 348 | func (rn *Network) Connect(endname interface{}, servername interface{}) { 349 | rn.mu.Lock() 350 | defer rn.mu.Unlock() 351 | 352 | rn.connections[endname] = servername 353 | } 354 | 355 | // enable/disable a ClientEnd. 356 | func (rn *Network) Enable(endname interface{}, enabled bool) { 357 | rn.mu.Lock() 358 | defer rn.mu.Unlock() 359 | 360 | rn.enabled[endname] = enabled 361 | } 362 | 363 | // get a server's count of incoming RPCs. 364 | func (rn *Network) GetCount(servername interface{}) int { 365 | rn.mu.Lock() 366 | defer rn.mu.Unlock() 367 | 368 | svr := rn.servers[servername] 369 | return svr.GetCount() 370 | } 371 | 372 | func (rn *Network) GetTotalCount() int { 373 | x := atomic.LoadInt32(&rn.count) 374 | return int(x) 375 | } 376 | 377 | func (rn *Network) GetTotalBytes() int64 { 378 | x := atomic.LoadInt64(&rn.bytes) 379 | return x 380 | } 381 | 382 | // 383 | // a server is a collection of services, all sharing 384 | // the same rpc dispatcher. so that e.g. both a Raft 385 | // and a k/v server can listen to the same rpc endpoint. 386 | // 387 | type Server struct { 388 | mu sync.Mutex 389 | services map[string]*Service 390 | count int // incoming RPCs 391 | } 392 | 393 | func MakeServer() *Server { 394 | rs := &Server{} 395 | rs.services = map[string]*Service{} 396 | return rs 397 | } 398 | 399 | func (rs *Server) AddService(svc *Service) { 400 | rs.mu.Lock() 401 | defer rs.mu.Unlock() 402 | rs.services[svc.name] = svc 403 | } 404 | 405 | func (rs *Server) dispatch(req reqMsg) replyMsg { 406 | rs.mu.Lock() 407 | 408 | rs.count += 1 409 | 410 | // split Raft.AppendEntries into service and method 411 | dot := strings.LastIndex(req.svcMeth, ".") 412 | serviceName := req.svcMeth[:dot] 413 | methodName := req.svcMeth[dot+1:] 414 | 415 | service, ok := rs.services[serviceName] 416 | 417 | rs.mu.Unlock() 418 | 419 | if ok { 420 | return service.dispatch(methodName, req) 421 | } else { 422 | choices := []string{} 423 | for k, _ := range rs.services { 424 | choices = append(choices, k) 425 | } 426 | log.Fatalf("labrpc.Server.dispatch(): unknown service %v in %v.%v; expecting one of %v\n", 427 | serviceName, serviceName, methodName, choices) 428 | return replyMsg{false, nil} 429 | } 430 | } 431 | 432 | func (rs *Server) GetCount() int { 433 | rs.mu.Lock() 434 | defer rs.mu.Unlock() 435 | return rs.count 436 | } 437 | 438 | // an object with methods that can be called via RPC. 439 | // a single server may have more than one Service. 
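// for example, a method that passes the reflection checks in MakeService
// below takes exactly two arguments, the second a pointer, and returns
// nothing, like the handlers in test_test.go:
//
//   func (js *JunkServer) Handler2(args int, reply *string) { ... }
//
// (JunkServer and Handler2 are names from test_test.go, shown here only
// as an illustration of the expected handler shape.)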
440 | type Service struct { 441 | name string 442 | rcvr reflect.Value 443 | typ reflect.Type 444 | methods map[string]reflect.Method 445 | } 446 | 447 | func MakeService(rcvr interface{}) *Service { 448 | svc := &Service{} 449 | svc.typ = reflect.TypeOf(rcvr) 450 | svc.rcvr = reflect.ValueOf(rcvr) 451 | svc.name = reflect.Indirect(svc.rcvr).Type().Name() 452 | svc.methods = map[string]reflect.Method{} 453 | 454 | for m := 0; m < svc.typ.NumMethod(); m++ { 455 | method := svc.typ.Method(m) 456 | mtype := method.Type 457 | mname := method.Name 458 | 459 | //fmt.Printf("%v pp %v ni %v 1k %v 2k %v no %v\n", 460 | // mname, method.PkgPath, mtype.NumIn(), mtype.In(1).Kind(), mtype.In(2).Kind(), mtype.NumOut()) 461 | 462 | if method.PkgPath != "" || // capitalized? 463 | mtype.NumIn() != 3 || 464 | //mtype.In(1).Kind() != reflect.Ptr || 465 | mtype.In(2).Kind() != reflect.Ptr || 466 | mtype.NumOut() != 0 { 467 | // the method is not suitable for a handler 468 | //fmt.Printf("bad method: %v\n", mname) 469 | } else { 470 | // the method looks like a handler 471 | svc.methods[mname] = method 472 | } 473 | } 474 | 475 | return svc 476 | } 477 | 478 | func (svc *Service) dispatch(methname string, req reqMsg) replyMsg { 479 | if method, ok := svc.methods[methname]; ok { 480 | // prepare space into which to read the argument. 481 | // the Value's type will be a pointer to req.argsType. 482 | args := reflect.New(req.argsType) 483 | 484 | // decode the argument. 485 | ab := bytes.NewBuffer(req.args) 486 | ad := labgob.NewDecoder(ab) 487 | ad.Decode(args.Interface()) 488 | 489 | // allocate space for the reply. 490 | replyType := method.Type.In(2) 491 | replyType = replyType.Elem() 492 | replyv := reflect.New(replyType) 493 | 494 | // call the method. 495 | function := method.Func 496 | function.Call([]reflect.Value{svc.rcvr, args.Elem(), replyv}) 497 | 498 | // encode the reply. 
499 | rb := new(bytes.Buffer) 500 | re := labgob.NewEncoder(rb) 501 | re.EncodeValue(replyv) 502 | 503 | return replyMsg{true, rb.Bytes()} 504 | } else { 505 | choices := []string{} 506 | for k, _ := range svc.methods { 507 | choices = append(choices, k) 508 | } 509 | log.Fatalf("labrpc.Service.dispatch(): unknown method %v in %v; expecting one of %v\n", 510 | methname, req.svcMeth, choices) 511 | return replyMsg{false, nil} 512 | } 513 | } 514 | -------------------------------------------------------------------------------- /src/labrpc/test_test.go: -------------------------------------------------------------------------------- 1 | package labrpc 2 | 3 | import "testing" 4 | import "strconv" 5 | import "sync" 6 | import "runtime" 7 | import "time" 8 | import "fmt" 9 | 10 | type JunkArgs struct { 11 | X int 12 | } 13 | type JunkReply struct { 14 | X string 15 | } 16 | 17 | type JunkServer struct { 18 | mu sync.Mutex 19 | log1 []string 20 | log2 []int 21 | } 22 | 23 | func (js *JunkServer) Handler1(args string, reply *int) { 24 | js.mu.Lock() 25 | defer js.mu.Unlock() 26 | js.log1 = append(js.log1, args) 27 | *reply, _ = strconv.Atoi(args) 28 | } 29 | 30 | func (js *JunkServer) Handler2(args int, reply *string) { 31 | js.mu.Lock() 32 | defer js.mu.Unlock() 33 | js.log2 = append(js.log2, args) 34 | *reply = "handler2-" + strconv.Itoa(args) 35 | } 36 | 37 | func (js *JunkServer) Handler3(args int, reply *int) { 38 | js.mu.Lock() 39 | defer js.mu.Unlock() 40 | time.Sleep(20 * time.Second) 41 | *reply = -args 42 | } 43 | 44 | // args is a pointer 45 | func (js *JunkServer) Handler4(args *JunkArgs, reply *JunkReply) { 46 | reply.X = "pointer" 47 | } 48 | 49 | // args is a not pointer 50 | func (js *JunkServer) Handler5(args JunkArgs, reply *JunkReply) { 51 | reply.X = "no pointer" 52 | } 53 | 54 | func (js *JunkServer) Handler6(args string, reply *int) { 55 | js.mu.Lock() 56 | defer js.mu.Unlock() 57 | *reply = len(args) 58 | } 59 | 60 | func (js *JunkServer) Handler7(args int, reply *string) { 61 | js.mu.Lock() 62 | defer js.mu.Unlock() 63 | *reply = "" 64 | for i := 0; i < args; i++ { 65 | *reply = *reply + "y" 66 | } 67 | } 68 | 69 | func TestBasic(t *testing.T) { 70 | runtime.GOMAXPROCS(4) 71 | 72 | rn := MakeNetwork() 73 | defer rn.Cleanup() 74 | 75 | e := rn.MakeEnd("end1-99") 76 | 77 | js := &JunkServer{} 78 | svc := MakeService(js) 79 | 80 | rs := MakeServer() 81 | rs.AddService(svc) 82 | rn.AddServer("server99", rs) 83 | 84 | rn.Connect("end1-99", "server99") 85 | rn.Enable("end1-99", true) 86 | 87 | { 88 | reply := "" 89 | e.Call("JunkServer.Handler2", 111, &reply) 90 | if reply != "handler2-111" { 91 | t.Fatalf("wrong reply from Handler2") 92 | } 93 | } 94 | 95 | { 96 | reply := 0 97 | e.Call("JunkServer.Handler1", "9099", &reply) 98 | if reply != 9099 { 99 | t.Fatalf("wrong reply from Handler1") 100 | } 101 | } 102 | } 103 | 104 | func TestTypes(t *testing.T) { 105 | runtime.GOMAXPROCS(4) 106 | 107 | rn := MakeNetwork() 108 | defer rn.Cleanup() 109 | 110 | e := rn.MakeEnd("end1-99") 111 | 112 | js := &JunkServer{} 113 | svc := MakeService(js) 114 | 115 | rs := MakeServer() 116 | rs.AddService(svc) 117 | rn.AddServer("server99", rs) 118 | 119 | rn.Connect("end1-99", "server99") 120 | rn.Enable("end1-99", true) 121 | 122 | { 123 | var args JunkArgs 124 | var reply JunkReply 125 | // args must match type (pointer or not) of handler. 
126 | e.Call("JunkServer.Handler4", &args, &reply) 127 | if reply.X != "pointer" { 128 | t.Fatalf("wrong reply from Handler4") 129 | } 130 | } 131 | 132 | { 133 | var args JunkArgs 134 | var reply JunkReply 135 | // args must match type (pointer or not) of handler. 136 | e.Call("JunkServer.Handler5", args, &reply) 137 | if reply.X != "no pointer" { 138 | t.Fatalf("wrong reply from Handler5") 139 | } 140 | } 141 | } 142 | 143 | // 144 | // does net.Enable(endname, false) really disconnect a client? 145 | // 146 | func TestDisconnect(t *testing.T) { 147 | runtime.GOMAXPROCS(4) 148 | 149 | rn := MakeNetwork() 150 | defer rn.Cleanup() 151 | 152 | e := rn.MakeEnd("end1-99") 153 | 154 | js := &JunkServer{} 155 | svc := MakeService(js) 156 | 157 | rs := MakeServer() 158 | rs.AddService(svc) 159 | rn.AddServer("server99", rs) 160 | 161 | rn.Connect("end1-99", "server99") 162 | 163 | { 164 | reply := "" 165 | e.Call("JunkServer.Handler2", 111, &reply) 166 | if reply != "" { 167 | t.Fatalf("unexpected reply from Handler2") 168 | } 169 | } 170 | 171 | rn.Enable("end1-99", true) 172 | 173 | { 174 | reply := 0 175 | e.Call("JunkServer.Handler1", "9099", &reply) 176 | if reply != 9099 { 177 | t.Fatalf("wrong reply from Handler1") 178 | } 179 | } 180 | } 181 | 182 | // 183 | // test net.GetCount() 184 | // 185 | func TestCounts(t *testing.T) { 186 | runtime.GOMAXPROCS(4) 187 | 188 | rn := MakeNetwork() 189 | defer rn.Cleanup() 190 | 191 | e := rn.MakeEnd("end1-99") 192 | 193 | js := &JunkServer{} 194 | svc := MakeService(js) 195 | 196 | rs := MakeServer() 197 | rs.AddService(svc) 198 | rn.AddServer(99, rs) 199 | 200 | rn.Connect("end1-99", 99) 201 | rn.Enable("end1-99", true) 202 | 203 | for i := 0; i < 17; i++ { 204 | reply := "" 205 | e.Call("JunkServer.Handler2", i, &reply) 206 | wanted := "handler2-" + strconv.Itoa(i) 207 | if reply != wanted { 208 | t.Fatalf("wrong reply %v from Handler1, expecting %v", reply, wanted) 209 | } 210 | } 211 | 212 | n := rn.GetCount(99) 213 | if n != 17 { 214 | t.Fatalf("wrong GetCount() %v, expected 17\n", n) 215 | } 216 | } 217 | 218 | // 219 | // test net.GetTotalBytes() 220 | // 221 | func TestBytes(t *testing.T) { 222 | runtime.GOMAXPROCS(4) 223 | 224 | rn := MakeNetwork() 225 | defer rn.Cleanup() 226 | 227 | e := rn.MakeEnd("end1-99") 228 | 229 | js := &JunkServer{} 230 | svc := MakeService(js) 231 | 232 | rs := MakeServer() 233 | rs.AddService(svc) 234 | rn.AddServer(99, rs) 235 | 236 | rn.Connect("end1-99", 99) 237 | rn.Enable("end1-99", true) 238 | 239 | for i := 0; i < 17; i++ { 240 | args := "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx" 241 | args = args + args 242 | args = args + args 243 | reply := 0 244 | e.Call("JunkServer.Handler6", args, &reply) 245 | wanted := len(args) 246 | if reply != wanted { 247 | t.Fatalf("wrong reply %v from Handler6, expecting %v", reply, wanted) 248 | } 249 | } 250 | 251 | n := rn.GetTotalBytes() 252 | if n < 4828 || n > 6000 { 253 | t.Fatalf("wrong GetTotalBytes() %v, expected about 5000\n", n) 254 | } 255 | 256 | for i := 0; i < 17; i++ { 257 | args := 107 258 | reply := "" 259 | e.Call("JunkServer.Handler7", args, &reply) 260 | wanted := args 261 | if len(reply) != wanted { 262 | t.Fatalf("wrong reply len=%v from Handler6, expecting %v", len(reply), wanted) 263 | } 264 | } 265 | 266 | nn := rn.GetTotalBytes() - n 267 | if nn < 1800 || nn > 2500 { 268 | t.Fatalf("wrong GetTotalBytes() %v, expected about 2000\n", nn) 269 | } 270 | } 271 | 272 | // 273 | // test RPCs from concurrent ClientEnds 274 | 
// 275 | func TestConcurrentMany(t *testing.T) { 276 | runtime.GOMAXPROCS(4) 277 | 278 | rn := MakeNetwork() 279 | defer rn.Cleanup() 280 | 281 | js := &JunkServer{} 282 | svc := MakeService(js) 283 | 284 | rs := MakeServer() 285 | rs.AddService(svc) 286 | rn.AddServer(1000, rs) 287 | 288 | ch := make(chan int) 289 | 290 | nclients := 20 291 | nrpcs := 10 292 | for ii := 0; ii < nclients; ii++ { 293 | go func(i int) { 294 | n := 0 295 | defer func() { ch <- n }() 296 | 297 | e := rn.MakeEnd(i) 298 | rn.Connect(i, 1000) 299 | rn.Enable(i, true) 300 | 301 | for j := 0; j < nrpcs; j++ { 302 | arg := i*100 + j 303 | reply := "" 304 | e.Call("JunkServer.Handler2", arg, &reply) 305 | wanted := "handler2-" + strconv.Itoa(arg) 306 | if reply != wanted { 307 | t.Fatalf("wrong reply %v from Handler1, expecting %v", reply, wanted) 308 | } 309 | n += 1 310 | } 311 | }(ii) 312 | } 313 | 314 | total := 0 315 | for ii := 0; ii < nclients; ii++ { 316 | x := <-ch 317 | total += x 318 | } 319 | 320 | if total != nclients*nrpcs { 321 | t.Fatalf("wrong number of RPCs completed, got %v, expected %v", total, nclients*nrpcs) 322 | } 323 | 324 | n := rn.GetCount(1000) 325 | if n != total { 326 | t.Fatalf("wrong GetCount() %v, expected %v\n", n, total) 327 | } 328 | } 329 | 330 | // 331 | // test unreliable 332 | // 333 | func TestUnreliable(t *testing.T) { 334 | runtime.GOMAXPROCS(4) 335 | 336 | rn := MakeNetwork() 337 | defer rn.Cleanup() 338 | rn.Reliable(false) 339 | 340 | js := &JunkServer{} 341 | svc := MakeService(js) 342 | 343 | rs := MakeServer() 344 | rs.AddService(svc) 345 | rn.AddServer(1000, rs) 346 | 347 | ch := make(chan int) 348 | 349 | nclients := 300 350 | for ii := 0; ii < nclients; ii++ { 351 | go func(i int) { 352 | n := 0 353 | defer func() { ch <- n }() 354 | 355 | e := rn.MakeEnd(i) 356 | rn.Connect(i, 1000) 357 | rn.Enable(i, true) 358 | 359 | arg := i * 100 360 | reply := "" 361 | ok := e.Call("JunkServer.Handler2", arg, &reply) 362 | if ok { 363 | wanted := "handler2-" + strconv.Itoa(arg) 364 | if reply != wanted { 365 | t.Fatalf("wrong reply %v from Handler1, expecting %v", reply, wanted) 366 | } 367 | n += 1 368 | } 369 | }(ii) 370 | } 371 | 372 | total := 0 373 | for ii := 0; ii < nclients; ii++ { 374 | x := <-ch 375 | total += x 376 | } 377 | 378 | if total == nclients || total == 0 { 379 | t.Fatalf("all RPCs succeeded despite unreliable") 380 | } 381 | } 382 | 383 | // 384 | // test concurrent RPCs from a single ClientEnd 385 | // 386 | func TestConcurrentOne(t *testing.T) { 387 | runtime.GOMAXPROCS(4) 388 | 389 | rn := MakeNetwork() 390 | defer rn.Cleanup() 391 | 392 | js := &JunkServer{} 393 | svc := MakeService(js) 394 | 395 | rs := MakeServer() 396 | rs.AddService(svc) 397 | rn.AddServer(1000, rs) 398 | 399 | e := rn.MakeEnd("c") 400 | rn.Connect("c", 1000) 401 | rn.Enable("c", true) 402 | 403 | ch := make(chan int) 404 | 405 | nrpcs := 20 406 | for ii := 0; ii < nrpcs; ii++ { 407 | go func(i int) { 408 | n := 0 409 | defer func() { ch <- n }() 410 | 411 | arg := 100 + i 412 | reply := "" 413 | e.Call("JunkServer.Handler2", arg, &reply) 414 | wanted := "handler2-" + strconv.Itoa(arg) 415 | if reply != wanted { 416 | t.Fatalf("wrong reply %v from Handler2, expecting %v", reply, wanted) 417 | } 418 | n += 1 419 | }(ii) 420 | } 421 | 422 | total := 0 423 | for ii := 0; ii < nrpcs; ii++ { 424 | x := <-ch 425 | total += x 426 | } 427 | 428 | if total != nrpcs { 429 | t.Fatalf("wrong number of RPCs completed, got %v, expected %v", total, nrpcs) 430 | } 431 | 432 | js.mu.Lock() 433 | 
defer js.mu.Unlock() 434 | if len(js.log2) != nrpcs { 435 | t.Fatalf("wrong number of RPCs delivered") 436 | } 437 | 438 | n := rn.GetCount(1000) 439 | if n != total { 440 | t.Fatalf("wrong GetCount() %v, expected %v\n", n, total) 441 | } 442 | } 443 | 444 | // 445 | // regression: an RPC that's delayed during Enabled=false 446 | // should not delay subsequent RPCs (e.g. after Enabled=true). 447 | // 448 | func TestRegression1(t *testing.T) { 449 | runtime.GOMAXPROCS(4) 450 | 451 | rn := MakeNetwork() 452 | defer rn.Cleanup() 453 | 454 | js := &JunkServer{} 455 | svc := MakeService(js) 456 | 457 | rs := MakeServer() 458 | rs.AddService(svc) 459 | rn.AddServer(1000, rs) 460 | 461 | e := rn.MakeEnd("c") 462 | rn.Connect("c", 1000) 463 | 464 | // start some RPCs while the ClientEnd is disabled. 465 | // they'll be delayed. 466 | rn.Enable("c", false) 467 | ch := make(chan bool) 468 | nrpcs := 20 469 | for ii := 0; ii < nrpcs; ii++ { 470 | go func(i int) { 471 | ok := false 472 | defer func() { ch <- ok }() 473 | 474 | arg := 100 + i 475 | reply := "" 476 | // this call ought to return false. 477 | e.Call("JunkServer.Handler2", arg, &reply) 478 | ok = true 479 | }(ii) 480 | } 481 | 482 | time.Sleep(100 * time.Millisecond) 483 | 484 | // now enable the ClientEnd and check that an RPC completes quickly. 485 | t0 := time.Now() 486 | rn.Enable("c", true) 487 | { 488 | arg := 99 489 | reply := "" 490 | e.Call("JunkServer.Handler2", arg, &reply) 491 | wanted := "handler2-" + strconv.Itoa(arg) 492 | if reply != wanted { 493 | t.Fatalf("wrong reply %v from Handler2, expecting %v", reply, wanted) 494 | } 495 | } 496 | dur := time.Since(t0).Seconds() 497 | 498 | if dur > 0.03 { 499 | t.Fatalf("RPC took too long (%v) after Enable", dur) 500 | } 501 | 502 | for ii := 0; ii < nrpcs; ii++ { 503 | <-ch 504 | } 505 | 506 | js.mu.Lock() 507 | defer js.mu.Unlock() 508 | if len(js.log2) != 1 { 509 | t.Fatalf("wrong number (%v) of RPCs delivered, expected 1", len(js.log2)) 510 | } 511 | 512 | n := rn.GetCount(1000) 513 | if n != 1 { 514 | t.Fatalf("wrong GetCount() %v, expected %v\n", n, 1) 515 | } 516 | } 517 | 518 | // 519 | // if an RPC is stuck in a server, and the server 520 | // is killed with DeleteServer(), does the RPC 521 | // get un-stuck? 
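// (mechanism under test: processReq() in labrpc.go polls isServerDead()
// every 100 ms while the handler is running, and once DeleteServer() has
// removed the server it answers the pending Call() with ok=false.)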
522 | // 523 | func TestKilled(t *testing.T) { 524 | runtime.GOMAXPROCS(4) 525 | 526 | rn := MakeNetwork() 527 | defer rn.Cleanup() 528 | 529 | e := rn.MakeEnd("end1-99") 530 | 531 | js := &JunkServer{} 532 | svc := MakeService(js) 533 | 534 | rs := MakeServer() 535 | rs.AddService(svc) 536 | rn.AddServer("server99", rs) 537 | 538 | rn.Connect("end1-99", "server99") 539 | rn.Enable("end1-99", true) 540 | 541 | doneCh := make(chan bool) 542 | go func() { 543 | reply := 0 544 | ok := e.Call("JunkServer.Handler3", 99, &reply) 545 | doneCh <- ok 546 | }() 547 | 548 | time.Sleep(1000 * time.Millisecond) 549 | 550 | select { 551 | case <-doneCh: 552 | t.Fatalf("Handler3 should not have returned yet") 553 | case <-time.After(100 * time.Millisecond): 554 | } 555 | 556 | rn.DeleteServer("server99") 557 | 558 | select { 559 | case x := <-doneCh: 560 | if x != false { 561 | t.Fatalf("Handler3 returned successfully despite DeleteServer()") 562 | } 563 | case <-time.After(100 * time.Millisecond): 564 | t.Fatalf("Handler3 should return after DeleteServer()") 565 | } 566 | } 567 | 568 | func TestBenchmark(t *testing.T) { 569 | runtime.GOMAXPROCS(4) 570 | 571 | rn := MakeNetwork() 572 | defer rn.Cleanup() 573 | 574 | e := rn.MakeEnd("end1-99") 575 | 576 | js := &JunkServer{} 577 | svc := MakeService(js) 578 | 579 | rs := MakeServer() 580 | rs.AddService(svc) 581 | rn.AddServer("server99", rs) 582 | 583 | rn.Connect("end1-99", "server99") 584 | rn.Enable("end1-99", true) 585 | 586 | t0 := time.Now() 587 | n := 100000 588 | for iters := 0; iters < n; iters++ { 589 | reply := "" 590 | e.Call("JunkServer.Handler2", 111, &reply) 591 | if reply != "handler2-111" { 592 | t.Fatalf("wrong reply from Handler2") 593 | } 594 | } 595 | fmt.Printf("%v for %v\n", time.Since(t0), n) 596 | // march 2016, rtm laptop, 22 microseconds per RPC 597 | } 598 | -------------------------------------------------------------------------------- /src/main/diskvd.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | // 4 | // start a diskvd server. it's a member of some replica 5 | // group, which has other members, and it needs to know 6 | // how to talk to the members of the shardmaster service. 7 | // used by ../diskv/test_test.go 8 | // 9 | // arguments: 10 | // -g groupid 11 | // -m masterport1 -m masterport2 ... 12 | // -s replicaport1 -s replicaport2 ... 13 | // -i my-index-in-server-port-list 14 | // -u unreliable 15 | // -d directory 16 | // -r restart 17 | 18 | import "time" 19 | import "6.824/diskv" 20 | import "os" 21 | import "fmt" 22 | import "strconv" 23 | import "runtime" 24 | 25 | func usage() { 26 | fmt.Printf("Usage: diskvd -g gid -m master... -s server... 
-i my-index -d dir\n") 27 | os.Exit(1) 28 | } 29 | 30 | func main() { 31 | var gid int64 = -1 // my replica group ID 32 | masters := []string{} // ports of shardmasters 33 | replicas := []string{} // ports of servers in my replica group 34 | me := -1 // my index in replicas[] 35 | unreliable := false 36 | dir := "" // store persistent data here 37 | restart := false 38 | 39 | for i := 1; i+1 < len(os.Args); i += 2 { 40 | a0 := os.Args[i] 41 | a1 := os.Args[i+1] 42 | if a0 == "-g" { 43 | gid, _ = strconv.ParseInt(a1, 10, 64) 44 | } else if a0 == "-m" { 45 | masters = append(masters, a1) 46 | } else if a0 == "-s" { 47 | replicas = append(replicas, a1) 48 | } else if a0 == "-i" { 49 | me, _ = strconv.Atoi(a1) 50 | } else if a0 == "-u" { 51 | unreliable, _ = strconv.ParseBool(a1) 52 | } else if a0 == "-d" { 53 | dir = a1 54 | } else if a0 == "-r" { 55 | restart, _ = strconv.ParseBool(a1) 56 | } else { 57 | usage() 58 | } 59 | } 60 | 61 | if gid < 0 || me < 0 || len(masters) < 1 || me >= len(replicas) || dir == "" { 62 | usage() 63 | } 64 | 65 | runtime.GOMAXPROCS(4) 66 | 67 | srv := diskv.StartServer(gid, masters, replicas, me, dir, restart) 68 | srv.Setunreliable(unreliable) 69 | 70 | // for safety, force quit after 10 minutes. 71 | time.Sleep(10 * 60 * time.Second) 72 | mep, _ := os.FindProcess(os.Getpid()) 73 | mep.Kill() 74 | } 75 | -------------------------------------------------------------------------------- /src/main/lockc.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | // 4 | // see comments in lockd.go 5 | // 6 | 7 | import "6.824/lockservice" 8 | import "os" 9 | import "fmt" 10 | 11 | func usage() { 12 | fmt.Printf("Usage: lockc -l|-u primaryport backupport lockname\n") 13 | os.Exit(1) 14 | } 15 | 16 | func main() { 17 | if len(os.Args) == 5 { 18 | ck := lockservice.MakeClerk(os.Args[2], os.Args[3]) 19 | var ok bool 20 | if os.Args[1] == "-l" { 21 | ok = ck.Lock(os.Args[4]) 22 | } else if os.Args[1] == "-u" { 23 | ok = ck.Unlock(os.Args[4]) 24 | } else { 25 | usage() 26 | } 27 | fmt.Printf("reply: %v\n", ok) 28 | } else { 29 | usage() 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /src/main/lockd.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | // export GOPATH=~/6.824 4 | // go build lockd.go 5 | // go build lockc.go 6 | // ./lockd -p a b & 7 | // ./lockd -b a b & 8 | // ./lockc -l a b lx 9 | // ./lockc -u a b lx 10 | // 11 | // on Athena, use /tmp/myname-a and /tmp/myname-b 12 | // instead of a and b. 13 | 14 | import "time" 15 | import "6.824/lockservice" 16 | import "os" 17 | import "fmt" 18 | 19 | func main() { 20 | if len(os.Args) == 4 && os.Args[1] == "-p" { 21 | lockservice.StartServer(os.Args[2], os.Args[3], true) 22 | } else if len(os.Args) == 4 && os.Args[1] == "-b" { 23 | lockservice.StartServer(os.Args[2], os.Args[3], false) 24 | } else { 25 | fmt.Printf("Usage: lockd -p|-b primaryport backupport\n") 26 | os.Exit(1) 27 | } 28 | for { 29 | time.Sleep(100 * time.Second) 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /src/main/mrcoordinator.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | // 4 | // start the coordinator process, which is implemented 5 | // in ../mr/coordinator.go 6 | // 7 | // go run mrcoordinator.go pg*.txt 8 | // 9 | // Please do not change this file. 
10 | // 11 | 12 | import "mit6.824/mr" 13 | import "time" 14 | import "os" 15 | import "fmt" 16 | 17 | func main() { 18 | if len(os.Args) < 2 { 19 | fmt.Fprintf(os.Stderr, "Usage: mrcoordinator inputfiles...\n") 20 | os.Exit(1) 21 | } 22 | 23 | m := mr.MakeCoordinator(os.Args[1:], 10) 24 | for m.Done() == false { 25 | time.Sleep(time.Second) 26 | } 27 | 28 | time.Sleep(time.Second) 29 | } 30 | -------------------------------------------------------------------------------- /src/main/mrsequential.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | // 4 | // simple sequential MapReduce. 5 | // 6 | // go run mrsequential.go wc.so pg*.txt 7 | // 8 | 9 | import "fmt" 10 | import "mit6.824/mr" 11 | import "plugin" 12 | import "os" 13 | import "log" 14 | import "io/ioutil" 15 | import "sort" 16 | 17 | // for sorting by key. 18 | type ByKey []mr.KeyValue 19 | 20 | // for sorting by key. 21 | func (a ByKey) Len() int { return len(a) } 22 | func (a ByKey) Swap(i, j int) { a[i], a[j] = a[j], a[i] } 23 | func (a ByKey) Less(i, j int) bool { return a[i].Key < a[j].Key } 24 | 25 | func main() { 26 | if len(os.Args) < 3 { 27 | fmt.Fprintf(os.Stderr, "Usage: mrsequential xxx.so inputfiles...\n") 28 | os.Exit(1) 29 | } 30 | // load the Map and Reduce functions from the plugin 31 | mapf, reducef := loadPlugin(os.Args[1]) 32 | 33 | // 34 | // read each input file, 35 | // pass it to Map, 36 | // accumulate the intermediate Map output. 37 | // 38 | // walk the input file names, read each file, and collect the Map output into intermediate 39 | var intermediate []mr.KeyValue 40 | for _, filename := range os.Args[2:] { 41 | //fmt.Println(filename) 42 | file, err := os.Open(filename) 43 | if err != nil { 44 | log.Fatalf("cannot open %v", filename) 45 | } 46 | content, err := ioutil.ReadAll(file) 47 | if err != nil { 48 | log.Fatalf("cannot read %v", filename) 49 | } 50 | file.Close() 51 | kva := mapf(filename, string(content)) 52 | intermediate = append(intermediate, kva...) 53 | } 54 | 55 | // 56 | // a big difference from real MapReduce is that all the 57 | // intermediate data is in one place, intermediate[], 58 | // rather than being partitioned into NxM buckets. 59 | // 60 | 61 | sort.Sort(ByKey(intermediate)) 62 | 63 | oname := "mr-out-0" 64 | ofile, _ := os.Create(oname) 65 | 66 | // 67 | // call Reduce on each distinct key in intermediate[], 68 | // and print the result to mr-out-0. 69 | // 70 | i := 0 71 | for i < len(intermediate) { 72 | j := i + 1 73 | for j < len(intermediate) && intermediate[j].Key == intermediate[i].Key { 74 | j++ 75 | } 76 | var values []string 77 | for k := i; k < j; k++ { 78 | values = append(values, intermediate[k].Value) 79 | } 80 | output := reducef(intermediate[i].Key, values) 81 | 82 | // this is the correct format for each line of Reduce output. 83 | fmt.Fprintf(ofile, "%v %v\n", intermediate[i].Key, output) 84 | 85 | i = j 86 | } 87 | 88 | ofile.Close() 89 | } 90 | 91 | // 92 | // load the application Map and Reduce functions 93 | // from a plugin file, e.g.
../mrapps/wc.so 94 | // 95 | // load the Map and Reduce functions from the plugin and return them 96 | func loadPlugin(filename string) (func(string, string) []mr.KeyValue, func(string, []string) string) { 97 | p, err := plugin.Open(filename) 98 | if err != nil { 99 | log.Fatalf("cannot load plugin %v", filename) 100 | } 101 | xmapf, err := p.Lookup("Map") 102 | if err != nil { 103 | log.Fatalf("cannot find Map in %v", filename) 104 | } 105 | mapf := xmapf.(func(string, string) []mr.KeyValue) 106 | xreducef, err := p.Lookup("Reduce") 107 | if err != nil { 108 | log.Fatalf("cannot find Reduce in %v", filename) 109 | } 110 | reducef := xreducef.(func(string, []string) string) 111 | 112 | return mapf, reducef 113 | } 114 | -------------------------------------------------------------------------------- /src/main/mrworker.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | // 4 | // start a worker process, which is implemented 5 | // in ../mr/worker.go. typically there will be 6 | // multiple worker processes, talking to one coordinator. 7 | // 8 | // go run mrworker.go wc.so 9 | // 10 | // Please do not change this file. 11 | // 12 | 13 | import "mit6.824/mr" 14 | import "plugin" 15 | import "os" 16 | import "fmt" 17 | import "log" 18 | 19 | func main() { 20 | if len(os.Args) != 2 { 21 | fmt.Fprintf(os.Stderr, "Usage: mrworker xxx.so\n") 22 | os.Exit(1) 23 | } 24 | 25 | mapf, reducef := loadPlugin(os.Args[1]) 26 | 27 | mr.Worker(mapf, reducef) 28 | } 29 | 30 | // 31 | // load the application Map and Reduce functions 32 | // from a plugin file, e.g. ../mrapps/wc.so 33 | // 34 | func loadPlugin(filename string) (func(string, string) []mr.KeyValue, func(string, []string) string) { 35 | p, err := plugin.Open(filename) 36 | if err != nil { 37 | log.Fatalf("cannot load plugin %v", filename) 38 | } 39 | xmapf, err := p.Lookup("Map") 40 | if err != nil { 41 | log.Fatalf("cannot find Map in %v", filename) 42 | } 43 | mapf := xmapf.(func(string, string) []mr.KeyValue) 44 | xreducef, err := p.Lookup("Reduce") 45 | if err != nil { 46 | log.Fatalf("cannot find Reduce in %v", filename) 47 | } 48 | reducef := xreducef.(func(string, []string) string) 49 | 50 | return mapf, reducef 51 | } 52 | -------------------------------------------------------------------------------- /src/main/pbc.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | // 4 | // pbservice client application 5 | // 6 | // export GOPATH=~/6.824 7 | // go build viewd.go 8 | // go build pbd.go 9 | // go build pbc.go 10 | // ./viewd /tmp/rtm-v & 11 | // ./pbd /tmp/rtm-v /tmp/rtm-1 & 12 | // ./pbd /tmp/rtm-v /tmp/rtm-2 & 13 | // ./pbc /tmp/rtm-v key1 value1 14 | // ./pbc /tmp/rtm-v key1 15 | // 16 | // change "rtm" to your user name. 17 | // start the pbd programs in separate windows and kill 18 | // and restart them to exercise fault tolerance.
19 | // 20 | 21 | import "6.824/pbservice" 22 | import "os" 23 | import "fmt" 24 | 25 | func usage() { 26 | fmt.Printf("Usage: pbc viewport key\n") 27 | fmt.Printf(" pbc viewport key value\n") 28 | os.Exit(1) 29 | } 30 | 31 | func main() { 32 | if len(os.Args) == 3 { 33 | // get 34 | ck := pbservice.MakeClerk(os.Args[1], "") 35 | v := ck.Get(os.Args[2]) 36 | fmt.Printf("%v\n", v) 37 | } else if len(os.Args) == 4 { 38 | // put 39 | ck := pbservice.MakeClerk(os.Args[1], "") 40 | ck.Put(os.Args[2], os.Args[3]) 41 | } else { 42 | usage() 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /src/main/pbd.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | // 4 | // see directions in pbc.go 5 | // 6 | 7 | import "time" 8 | import "6.824/pbservice" 9 | import "os" 10 | import "fmt" 11 | 12 | func main() { 13 | if len(os.Args) != 3 { 14 | fmt.Printf("Usage: pbd viewport myport\n") 15 | os.Exit(1) 16 | } 17 | 18 | pbservice.StartServer(os.Args[1], os.Args[2]) 19 | 20 | for { 21 | time.Sleep(100 * time.Second) 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /src/main/test-mr-many.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | if [ $# -ne 1 ]; then 4 | echo "Usage: $0 numTrials" 5 | exit 1 6 | fi 7 | 8 | trap 'kill -INT -$pid; exit 1' INT 9 | 10 | # Note: because the socketID is based on the current userID, 11 | # ./test-mr.sh cannot be run in parallel 12 | runs=$1 13 | chmod +x test-mr.sh 14 | 15 | for i in $(seq 1 $runs); do 16 | timeout -k 2s 900s ./test-mr.sh & 17 | pid=$! 18 | if ! wait $pid; then 19 | echo '***' FAILED TESTS IN TRIAL $i 20 | exit 1 21 | fi 22 | done 23 | echo '***' PASSED ALL $i TESTING TRIALS 24 | -------------------------------------------------------------------------------- /src/main/test-mr.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # 4 | # map-reduce tests 5 | # 6 | 7 | # comment this out to run the tests without the Go race detector. 8 | RACE=-race 9 | 10 | if [[ "$OSTYPE" = "darwin"* ]] 11 | then 12 | if go version | grep 'go1.17.[012345]' 13 | then 14 | # -race with plug-ins on x86 MacOS 12 with 15 | # go1.17 before 1.17.6 sometimes crash. 16 | RACE= 17 | echo '*** Turning off -race since it may not work on a Mac' 18 | echo ' with ' `go version` 19 | fi 20 | fi 21 | 22 | TIMEOUT=timeout 23 | if timeout 2s sleep 1 > /dev/null 2>&1 24 | then 25 | : 26 | else 27 | if gtimeout 2s sleep 1 > /dev/null 2>&1 28 | then 29 | TIMEOUT=gtimeout 30 | else 31 | # no timeout command 32 | TIMEOUT= 33 | # echo '*** Cannot find timeout command; proceeding without timeouts.' 34 | fi 35 | fi 36 | if [ "$TIMEOUT" != "" ] 37 | then 38 | TIMEOUT+=" -k 2s 180s " 39 | fi 40 | 41 | # run the test in a fresh sub-directory. 42 | rm -rf mr-tmp 43 | mkdir mr-tmp || exit 1 44 | cd mr-tmp || exit 1 45 | # this command seems unnecessary :) 46 | # rm -f mr-* 47 | 48 | # make sure software is freshly built. 49 | (cd ../../mrapps && go clean) 50 | (cd ..
&& go clean) 51 | (cd ../../mrapps && go build $RACE -buildmode=plugin wc.go) || exit 1 52 | (cd ../../mrapps && go build $RACE -buildmode=plugin indexer.go) || exit 1 53 | (cd ../../mrapps && go build $RACE -buildmode=plugin mtiming.go) || exit 1 54 | (cd ../../mrapps && go build $RACE -buildmode=plugin rtiming.go) || exit 1 55 | (cd ../../mrapps && go build $RACE -buildmode=plugin jobcount.go) || exit 1 56 | (cd ../../mrapps && go build $RACE -buildmode=plugin early_exit.go) || exit 1 57 | (cd ../../mrapps && go build $RACE -buildmode=plugin crash.go) || exit 1 58 | (cd ../../mrapps && go build $RACE -buildmode=plugin nocrash.go) || exit 1 59 | (cd .. && go build $RACE mrcoordinator.go) || exit 1 60 | (cd .. && go build $RACE mrworker.go) || exit 1 61 | (cd .. && go build $RACE mrsequential.go) || exit 1 62 | 63 | failed_any=0 64 | 65 | ######################################################### 66 | # first word-count 67 | 68 | # generate the correct output 69 | ../mrsequential ../../mrapps/wc.so ../pg*txt || exit 1 70 | sort mr-out-0 > mr-correct-wc.txt 71 | rm -f mr-out* 72 | 73 | echo '***' Starting wc test. 74 | 75 | $TIMEOUT ../mrcoordinator ../pg*txt & 76 | pid=$! 77 | 78 | # give the coordinator time to create the sockets. 79 | sleep 1 80 | 81 | # start multiple workers. 82 | $TIMEOUT ../mrworker ../../mrapps/wc.so & 83 | $TIMEOUT ../mrworker ../../mrapps/wc.so & 84 | $TIMEOUT ../mrworker ../../mrapps/wc.so & 85 | 86 | # wait for the coordinator to exit. 87 | wait $pid 88 | 89 | # since workers are required to exit when a job is completely finished, 90 | # and not before, that means the job has finished. 91 | sort mr-out* | grep . > mr-wc-all 92 | if cmp mr-wc-all mr-correct-wc.txt 93 | then 94 | echo '---' wc test: PASS 95 | else 96 | echo '---' wc output is not the same as mr-correct-wc.txt 97 | echo '---' wc test: FAIL 98 | failed_any=1 99 | fi 100 | 101 | # wait for remaining workers and coordinator to exit. 102 | wait 103 | 104 | # breakpoint: only test wc 105 | # exit 1 106 | 107 | ######################################################### 108 | # now indexer 109 | rm -f mr-* 110 | 111 | # generate the correct output 112 | ../mrsequential ../../mrapps/indexer.so ../pg*txt || exit 1 113 | sort mr-out-0 > mr-correct-indexer.txt 114 | rm -f mr-out* 115 | 116 | echo '***' Starting indexer test. 117 | 118 | $TIMEOUT ../mrcoordinator ../pg*txt & 119 | sleep 1 120 | 121 | # start multiple workers 122 | $TIMEOUT ../mrworker ../../mrapps/indexer.so & 123 | $TIMEOUT ../mrworker ../../mrapps/indexer.so 124 | 125 | sort mr-out* | grep . > mr-indexer-all 126 | if cmp mr-indexer-all mr-correct-indexer.txt 127 | then 128 | echo '---' indexer test: PASS 129 | else 130 | echo '---' indexer output is not the same as mr-correct-indexer.txt 131 | echo '---' indexer test: FAIL 132 | failed_any=1 133 | fi 134 | 135 | wait 136 | 137 | ######################################################### 138 | echo '***' Starting map parallelism test.
139 | 140 | rm -f mr-* 141 | 142 | $TIMEOUT ../mrcoordinator ../pg*txt & 143 | sleep 1 144 | 145 | $TIMEOUT ../mrworker ../../mrapps/mtiming.so & 146 | $TIMEOUT ../mrworker ../../mrapps/mtiming.so 147 | 148 | NT=`cat mr-out* | grep '^times-' | wc -l | sed 's/ //g'` 149 | if [ "$NT" != "2" ] 150 | then 151 | echo '---' saw "$NT" workers rather than 2 152 | echo '---' map parallelism test: FAIL 153 | failed_any=1 154 | fi 155 | 156 | if cat mr-out* | grep '^parallel.* 2' > /dev/null 157 | then 158 | echo '---' map parallelism test: PASS 159 | else 160 | echo '---' map workers did not run in parallel 161 | echo '---' map parallelism test: FAIL 162 | failed_any=1 163 | fi 164 | 165 | wait 166 | 167 | 168 | ######################################################### 169 | echo '***' Starting reduce parallelism test. 170 | 171 | rm -f mr-* 172 | 173 | $TIMEOUT ../mrcoordinator ../pg*txt & 174 | sleep 1 175 | 176 | $TIMEOUT ../mrworker ../../mrapps/rtiming.so & 177 | $TIMEOUT ../mrworker ../../mrapps/rtiming.so 178 | 179 | NT=`cat mr-out* | grep '^[a-z] 2' | wc -l | sed 's/ //g'` 180 | if [ "$NT" -lt "2" ] 181 | then 182 | echo '---' too few parallel reduces. 183 | echo '---' reduce parallelism test: FAIL 184 | failed_any=1 185 | else 186 | echo '---' reduce parallelism test: PASS 187 | fi 188 | 189 | wait 190 | 191 | ######################################################### 192 | echo '***' Starting job count test. 193 | 194 | rm -f mr-* 195 | 196 | $TIMEOUT ../mrcoordinator ../pg*txt & 197 | sleep 1 198 | 199 | $TIMEOUT ../mrworker ../../mrapps/jobcount.so & 200 | $TIMEOUT ../mrworker ../../mrapps/jobcount.so 201 | $TIMEOUT ../mrworker ../../mrapps/jobcount.so & 202 | $TIMEOUT ../mrworker ../../mrapps/jobcount.so 203 | 204 | NT=`cat mr-out* | awk '{print $2}'` 205 | if [ "$NT" -eq "8" ] 206 | then 207 | echo '---' job count test: PASS 208 | else 209 | echo '---' map jobs ran incorrect number of times "($NT != 8)" 210 | echo '---' job count test: FAIL 211 | failed_any=1 212 | fi 213 | 214 | wait 215 | 216 | ######################################################### 217 | # test whether any worker or coordinator exits before the 218 | # task has completed (i.e., all output files have been finalized) 219 | rm -f mr-* 220 | 221 | echo '***' Starting early exit test. 222 | 223 | DF=anydone$$ 224 | rm -f $DF 225 | 226 | ($TIMEOUT ../mrcoordinator ../pg*txt ; touch $DF) & 227 | 228 | # give the coordinator time to create the sockets. 229 | sleep 1 230 | 231 | # start multiple workers. 232 | ($TIMEOUT ../mrworker ../../mrapps/early_exit.so ; touch $DF) & 233 | ($TIMEOUT ../mrworker ../../mrapps/early_exit.so ; touch $DF) & 234 | ($TIMEOUT ../mrworker ../../mrapps/early_exit.so ; touch $DF) & 235 | 236 | # wait for any of the coord or workers to exit. 237 | # `jobs` ensures that any completed old processes from other tests 238 | # are not waited upon. 239 | jobs &> /dev/null 240 | if [[ "$OSTYPE" = "darwin"* ]] 241 | then 242 | # bash on the Mac doesn't have wait -n 243 | while [ ! -e $DF ] 244 | do 245 | sleep 0.2 246 | done 247 | else 248 | # the -n causes wait to wait for just one child process, 249 | # rather than waiting for all to finish. 250 | wait -n 251 | fi 252 | 253 | rm -f $DF 254 | 255 | # a process has exited. this means that the output should be finalized 256 | # otherwise, either a worker or the coordinator exited early 257 | sort mr-out* | grep . > mr-wc-all-initial 258 | 259 | # wait for remaining workers and coordinator to exit. 
260 | wait 261 | 262 | # compare initial and final outputs 263 | sort mr-out* | grep . > mr-wc-all-final 264 | if cmp mr-wc-all-final mr-wc-all-initial 265 | then 266 | echo '---' early exit test: PASS 267 | else 268 | echo '---' output changed after first worker exited 269 | echo '---' early exit test: FAIL 270 | failed_any=1 271 | fi 272 | rm -f mr-* 273 | 274 | ######################################################### 275 | echo '***' Starting crash test. 276 | 277 | # generate the correct output 278 | ../mrsequential ../../mrapps/nocrash.so ../pg*txt || exit 1 279 | sort mr-out-0 > mr-correct-crash.txt 280 | rm -f mr-out* 281 | 282 | rm -f mr-done 283 | ($TIMEOUT ../mrcoordinator ../pg*txt ; touch mr-done ) & 284 | sleep 1 285 | 286 | # start multiple workers 287 | $TIMEOUT ../mrworker ../../mrapps/crash.so & 288 | 289 | # mimic rpc.go's coordinatorSock() 290 | SOCKNAME=/var/tmp/824-mr-`id -u` 291 | 292 | ( while [ -e $SOCKNAME -a ! -f mr-done ] 293 | do 294 | $TIMEOUT ../mrworker ../../mrapps/crash.so 295 | sleep 1 296 | done ) & 297 | 298 | ( while [ -e $SOCKNAME -a ! -f mr-done ] 299 | do 300 | $TIMEOUT ../mrworker ../../mrapps/crash.so 301 | sleep 1 302 | done ) & 303 | 304 | while [ -e $SOCKNAME -a ! -f mr-done ] 305 | do 306 | $TIMEOUT ../mrworker ../../mrapps/crash.so 307 | sleep 1 308 | done 309 | 310 | wait 311 | 312 | rm $SOCKNAME 313 | sort mr-out* | grep . > mr-crash-all 314 | if cmp mr-crash-all mr-correct-crash.txt 315 | then 316 | echo '---' crash test: PASS 317 | else 318 | echo '---' crash output is not the same as mr-correct-crash.txt 319 | echo '---' crash test: FAIL 320 | failed_any=1 321 | fi 322 | 323 | ######################################################### 324 | if [ $failed_any -eq 0 ]; then 325 | echo '***' PASSED ALL TESTS 326 | else 327 | echo '***' FAILED SOME TESTS 328 | exit 1 329 | fi 330 | -------------------------------------------------------------------------------- /src/main/viewd.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | // 4 | // see directions in pbc.go 5 | // 6 | 7 | import "time" 8 | import "6.824/viewservice" 9 | import "os" 10 | import "fmt" 11 | 12 | func main() { 13 | if len(os.Args) != 2 { 14 | fmt.Printf("Usage: viewd port\n") 15 | os.Exit(1) 16 | } 17 | 18 | viewservice.StartServer(os.Args[1]) 19 | 20 | for { 21 | time.Sleep(100 * time.Second) 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /src/models/kv.go: -------------------------------------------------------------------------------- 1 | package models 2 | 3 | import "mit6.824/porcupine" 4 | import "fmt" 5 | import "sort" 6 | 7 | type KvInput struct { 8 | Op uint8 // 0 => get, 1 => put, 2 => append 9 | Key string 10 | Value string 11 | } 12 | 13 | type KvOutput struct { 14 | Value string 15 | } 16 | 17 | var KvModel = porcupine.Model{ 18 | Partition: func(history []porcupine.Operation) [][]porcupine.Operation { 19 | m := make(map[string][]porcupine.Operation) 20 | for _, v := range history { 21 | key := v.Input.(KvInput).Key 22 | m[key] = append(m[key], v) 23 | } 24 | keys := make([]string, 0, len(m)) 25 | for k := range m { 26 | keys = append(keys, k) 27 | } 28 | sort.Strings(keys) 29 | ret := make([][]porcupine.Operation, 0, len(keys)) 30 | for _, k := range keys { 31 | ret = append(ret, m[k]) 32 | } 33 | return ret 34 | }, 35 | Init: func() interface{} { 36 | // note: we are modeling a single key's value here; 37 | // we're partitioning by key, so this is okay 
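// worked example of Step below, with illustrative values: starting from
// state "", put("x") moves the state to "x", a following append("y")
// moves it to "xy", and a later get is only accepted as linearizable if
// it returned "xy".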
38 | return "" 39 | }, 40 | Step: func(state, input, output interface{}) (bool, interface{}) { 41 | inp := input.(KvInput) 42 | out := output.(KvOutput) 43 | st := state.(string) 44 | if inp.Op == 0 { 45 | // get 46 | return out.Value == st, state 47 | } else if inp.Op == 1 { 48 | // put 49 | return true, inp.Value 50 | } else { 51 | // append 52 | return true, (st + inp.Value) 53 | } 54 | }, 55 | DescribeOperation: func(input, output interface{}) string { 56 | inp := input.(KvInput) 57 | out := output.(KvOutput) 58 | switch inp.Op { 59 | case 0: 60 | return fmt.Sprintf("get('%s') -> '%s'", inp.Key, out.Value) 61 | case 1: 62 | return fmt.Sprintf("put('%s', '%s')", inp.Key, inp.Value) 63 | case 2: 64 | return fmt.Sprintf("append('%s', '%s')", inp.Key, inp.Value) 65 | default: 66 | return "" 67 | } 68 | }, 69 | } 70 | -------------------------------------------------------------------------------- /src/mr/coordinator.go: -------------------------------------------------------------------------------- 1 | package mr 2 | 3 | import ( 4 | "log" 5 | "sync" 6 | "time" 7 | ) 8 | import "net" 9 | import "os" 10 | import "net/rpc" 11 | import "net/http" 12 | 13 | type Coordinator struct { 14 | // mutex protecting the coordinator's state 15 | lock sync.Mutex 16 | // number of reduce tasks 17 | reduceCount int 18 | // IDs of the registered worker processes 19 | workers []int 20 | // current phase: 0 = running Map tasks, 1 = running Reduce tasks, 2 = waiting for all workers to exit 21 | status int 22 | 23 | // map tasks 24 | mapTasks map[string]*mapTask 25 | // number of completed map tasks 26 | mapTaskDoneCount int 27 | 28 | // reduce tasks 29 | reduceTasks []*reduceTask 30 | // number of completed reduce tasks 31 | reduceTaskDoneCount int 32 | } 33 | 34 | // state of a single reduce task 35 | type reduceTask struct { 36 | id int 37 | working bool 38 | done bool 39 | workerID int 40 | } 41 | 42 | // state of a single map task 43 | type mapTask struct { 44 | id int 45 | name string 46 | working bool 47 | done bool 48 | workerID int 49 | } 50 | 51 | // Your code here -- RPC handlers for the worker to call.
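// (the handlers below follow the shape Go's net/rpc expects for the
// rpc.Register(c) call in server(): an exported method on *Coordinator
// taking an args pointer and a reply pointer and returning error --
// see Fuck, TaskDone, WorkerExit and RegisterWorker.)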
52 | /* 53 | 1. hand out Map tasks to workers; a worker calls back "task ok" when it finishes (with a 10s timer) 54 | 2. after all Maps finish, start Reduce; all the values for each key are passed to a worker 55 | 3. write the mr-out-x files, sorting all the intermediate data first 56 | 4. exit once all workers have been shut down 57 | */ 58 | 59 | // Fuck is the RPC handler a worker calls to fetch a task 60 | func (c *Coordinator) Fuck(args *FuckArgs, reply *FuckReply) error { 61 | c.lock.Lock() 62 | defer c.lock.Unlock() 63 | // check the current phase 64 | if c.status == 0 { 65 | // map phase not finished; find an unassigned map task for this worker 66 | mapName := "" 67 | for _, task := range c.mapTasks { 68 | if task.working == false && task.done == false { 69 | // found a map task to hand out 70 | mapName = task.name 71 | task.workerID = args.WorkerID 72 | task.working = true 73 | break 74 | } 75 | } 76 | if mapName == "" { 77 | // no runnable map task right now; return 0 so the worker waits a while 78 | reply.TaskType = 0 79 | return nil 80 | } 81 | // data sent back to the worker 82 | reply.MapID = c.mapTasks[mapName].id 83 | reply.MapName = mapName 84 | reply.TaskType = 1 85 | reply.ReduceCount = c.reduceCount 86 | go c.checkTaskTimeOut(1, mapName, 0) 87 | } else if c.status == 1 { 88 | // map phase done; handing out reduce tasks 89 | reduceID := -1 90 | for i, task := range c.reduceTasks { 91 | if task.working == false && task.done == false { 92 | // found a reduce task to hand out 93 | reduceID = i 94 | task.workerID = args.WorkerID 95 | task.working = true 96 | break 97 | } 98 | } 99 | if reduceID == -1 { 100 | // no runnable reduce task either; also return 0 101 | reply.TaskType = 0 102 | return nil 103 | } 104 | reply.TaskType = 2 105 | reply.ReduceID = reduceID 106 | reply.MapTaskCount = c.mapTaskDoneCount 107 | go c.checkTaskTimeOut(2, "", reduceID) 108 | } else if c.status == 2 { 109 | // everything is done; tell the worker to exit 110 | reply.Exit = true 111 | } 112 | return nil 113 | } 114 | 115 | // WorkerExit is called back by a worker when it exits 116 | func (c *Coordinator) WorkerExit(args *WorkerExitArgs, n *None) error { 117 | c.lock.Lock() 118 | log.Printf("Worker[%v] exit!", args.WorkerID) 119 | c.deleteWorker(args.WorkerID) 120 | c.lock.Unlock() 121 | return nil 122 | } 123 | 124 | // remove a worker; the caller must already hold the lock 125 | func (c *Coordinator) deleteWorker(workerID int) { 126 | // find this worker in workers 127 | workerKey := -1 128 | for i, worker := range c.workers { 129 | if worker == workerID { 130 | workerKey = i 131 | break 132 | } 133 | } 134 | // make sure the worker is known; it might be a leftover worker from an earlier run 135 | if workerKey == -1 { 136 | log.Printf("Worker[%v] exit error! it's not my worker!", workerID) 137 | } else { 138 | // delete this worker 139 | c.workers = append(c.workers[:workerKey], c.workers[workerKey+1:]...)
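// (append(c.workers[:workerKey], c.workers[workerKey+1:]...) is the
// standard Go idiom for removing the element at index workerKey from a
// slice in place.)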
140 | } 141 | } 142 | 143 | // checkTaskTimeOut checks whether an assigned task has timed out 144 | func (c *Coordinator) checkTaskTimeOut(taskType int, mapName string, reduceID int) { 145 | time.Sleep(10 * time.Second) 146 | c.lock.Lock() 147 | if taskType == 1 { 148 | // check the map task 149 | if c.mapTasks[mapName].done == false { 150 | log.Printf("Map task[%v] dead, worker[%v] dead!:", mapName, c.mapTasks[mapName].workerID) 151 | c.mapTasks[mapName].working = false 152 | c.deleteWorker(c.mapTasks[mapName].workerID) 153 | } 154 | } else if taskType == 2 { 155 | // check the reduce task 156 | if c.reduceTasks[reduceID].done == false { 157 | log.Printf("Reduce task[%v] dead, worker[%v] dead!", reduceID, c.reduceTasks[reduceID].workerID) 158 | c.reduceTasks[reduceID].working = false 159 | c.deleteWorker(c.reduceTasks[reduceID].workerID) 160 | } 161 | } 162 | c.lock.Unlock() 163 | } 164 | 165 | // TaskDone is called back by a worker when it finishes a task 166 | func (c *Coordinator) TaskDone(args *TaskDoneArgs, n *None) error { 167 | c.lock.Lock() 168 | if args.TaskType == 1 { 169 | // a map task finished 170 | c.mapTasks[args.MapName].done = true 171 | c.mapTasks[args.MapName].working = false 172 | c.mapTaskDoneCount++ 173 | log.Printf("Map task[%v] done", args.MapName) 174 | if c.mapTaskDoneCount == len(c.mapTasks) { 175 | // all map tasks are done 176 | c.status = 1 177 | log.Println("All map tasks done!") 178 | } 179 | } else if args.TaskType == 2 { 180 | // a reduce task finished 181 | c.reduceTasks[args.ReduceID].done = true 182 | c.reduceTasks[args.ReduceID].working = false 183 | c.reduceTaskDoneCount++ 184 | log.Printf("Reduce Task[%v] done", args.ReduceID) 185 | if c.reduceTaskDoneCount == len(c.reduceTasks) { 186 | // all reduce tasks are done 187 | c.status = 2 188 | log.Printf("All reduce tasks done!") 189 | } 190 | } 191 | c.lock.Unlock() 192 | return nil 193 | } 194 | 195 | // RegisterWorker is called by a worker to register with the master; it returns an id 196 | func (c *Coordinator) RegisterWorker(n *None, workerID *int) error { 197 | c.lock.Lock() 198 | *workerID = len(c.workers) 199 | c.workers = append(c.workers, *workerID) 200 | log.Printf("Worker[%v] register to master now!", *workerID) 201 | c.lock.Unlock() 202 | return nil 203 | } 204 | 205 | // code end 206 | 207 | // 208 | // start a thread that listens for RPCs from worker.go 209 | // 210 | func (c *Coordinator) server() { 211 | rpc.Register(c) 212 | rpc.HandleHTTP() 213 | sockname := coordinatorSock() 214 | os.Remove(sockname) 215 | l, e := net.Listen("unix", sockname) 216 | if e != nil { 217 | log.Fatal("listen error:", e) 218 | } 219 | go http.Serve(l, nil) 220 | } 221 | 222 | // Done 223 | // main/mrcoordinator.go calls Done() periodically to find out 224 | // if the entire job has finished. 225 | // 226 | // mrcoordinator uses this to check whether all tasks have finished 227 | func (c *Coordinator) Done() bool { 228 | ret := false 229 | // Your code here. 230 | c.lock.Lock() 231 | if c.status == 2 && len(c.workers) == 0 { 232 | log.Printf("Master done now!") 233 | ret = true 234 | } 235 | c.lock.Unlock() 236 | // code end 237 | return ret 238 | } 239 | 240 | // MakeCoordinator 241 | // create a Coordinator. 242 | // main/mrcoordinator.go calls this function. 243 | // nReduce is the number of reduce tasks to use. 244 | // 245 | func MakeCoordinator(files []string, nReduce int) *Coordinator { 246 | c := Coordinator{} 247 | // Your code here.
248 | // initialize the coordinator 249 | c.reduceCount = nReduce 250 | c.workers = make([]int, 0) 251 | c.status = 0 252 | // initialize the map tasks 253 | c.mapTasks = make(map[string]*mapTask) 254 | for i, fileName := range files { 255 | c.mapTasks[fileName] = &mapTask{i, fileName, false, false, 0} 256 | } 257 | // initialize the reduce tasks 258 | c.reduceTasks = make([]*reduceTask, nReduce) 259 | for i := 0; i < nReduce; i++ { 260 | c.reduceTasks[i] = &reduceTask{i, false, false, 0} 261 | } 262 | // code end 263 | c.server() 264 | return &c 265 | } 266 | -------------------------------------------------------------------------------- /src/mr/rpc.go: -------------------------------------------------------------------------------- 1 | package mr 2 | 3 | // 4 | // RPC definitions. 5 | // 6 | // remember to capitalize all names. 7 | // 8 | 9 | import ( 10 | "os" 11 | ) 12 | import "strconv" 13 | 14 | // 15 | // example to show how to declare the arguments 16 | // and reply for an RPC. 17 | // 18 | 19 | // Add your RPC definitions here. 20 | 21 | type FuckArgs struct { 22 | WorkerID int 23 | } 24 | 25 | type FuckReply struct { 26 | // task type: [1] map task, [2] reduce task 27 | TaskType int 28 | // number of reduce tasks 29 | ReduceCount int 30 | // input file name of the map task 31 | MapName string 32 | // map task ID 33 | MapID int 34 | // reduce task ID 35 | ReduceID int 36 | // used by reduce tasks: how many map tasks there were 37 | MapTaskCount int 38 | // whether the worker should exit 39 | Exit bool 40 | } 41 | 42 | // TaskDoneArgs reports a completed task 43 | type TaskDoneArgs struct { 44 | // task type: [1] map task, [2] reduce task 45 | TaskType int 46 | // input file name of the map task 47 | MapName string 48 | // reduce task ID 49 | ReduceID int 50 | } 51 | 52 | // WorkerExitArgs is the request a worker sends when it exits 53 | type WorkerExitArgs struct { 54 | WorkerID int 55 | } 56 | 57 | // None is an empty placeholder struct 58 | type None struct{} 59 | 60 | // intermediate file name format: mr-MapID-ReduceID 61 | var interFileName = "mr-%v-%v" 62 | 63 | // output file name format: mr-out-ReduceID 64 | var outFileName = "mr-out-%v" 65 | 66 | // code end 67 | 68 | // Cook up a unique-ish UNIX-domain socket name 69 | // in /var/tmp, for the coordinator. 70 | // Can't use the current directory since 71 | // Athena AFS doesn't support UNIX-domain sockets. 72 | func coordinatorSock() string { 73 | s := "/var/tmp/824-mr-" 74 | s += strconv.Itoa(os.Getuid()) 75 | return s 76 | } 77 | -------------------------------------------------------------------------------- /src/mr/worker.go: -------------------------------------------------------------------------------- 1 | package mr 2 | 3 | import ( 4 | "encoding/json" 5 | "fmt" 6 | "io/ioutil" 7 | "os" 8 | "sort" 9 | ) 10 | import "log" 11 | import "net/rpc" 12 | import "hash/fnv" 13 | 14 | // KeyValue 15 | // Map functions return a slice of KeyValue. 16 | // 17 | type KeyValue struct { 18 | Key string 19 | Value string 20 | } 21 | 22 | // ByKey for sorting by key. 23 | type ByKey []KeyValue 24 | 25 | // Len for sorting by key. 26 | func (a ByKey) Len() int { return len(a) } 27 | func (a ByKey) Swap(i, j int) { a[i], a[j] = a[j], a[i] } 28 | func (a ByKey) Less(i, j int) bool { return a[i].Key < a[j].Key } 29 | 30 | // 31 | // use ihash(key) % NReduce to choose the reduce 32 | // task number for each KeyValue emitted by Map. 33 | // 34 | func ihash(key string) int { 35 | h := fnv.New32a() 36 | h.Write([]byte(key)) 37 | return int(h.Sum32() & 0x7fffffff) 38 | } 39 | 40 | // Worker 41 | // main/mrworker.go calls this function.
42 | // 43 | func Worker(mapf func(string, string) []KeyValue, reducef func(string, []string) string) { 44 | workerID := register() 45 | args := FuckArgs{workerID} 46 | for { 47 | reply := FuckReply{} 48 | ok := call("Coordinator.Fuck", &args, &reply) 49 | if !ok { 50 | log.Fatalf("Worker get task fail!") 51 | } 52 | if reply.Exit == false { 53 | if reply.TaskType == 0 { 54 | // no task available right now; take a short break 55 | // time.Sleep(time.Millisecond) 56 | } else if reply.TaskType == 1 { 57 | // map task 58 | mapResult := mapf(reply.MapName, readFile(reply.MapName)) 59 | // split the output by ReduceCount 60 | reduceContent := make([][]KeyValue, reply.ReduceCount) 61 | // store each kv pair into its bucket 62 | for _, kv := range mapResult { 63 | key := ihash(kv.Key) % reply.ReduceCount 64 | reduceContent[key] = append(reduceContent[key], kv) 65 | } 66 | // write the intermediate files, one per reduce task 67 | for i, content := range reduceContent { 68 | fileName := fmt.Sprintf(interFileName, reply.MapID, i) 69 | f, _ := os.Create(fileName) 70 | enc := json.NewEncoder(f) 71 | for _, line := range content { 72 | enc.Encode(&line) 73 | } 74 | f.Close() 75 | } 76 | // report map task completion 77 | taskDone(1, reply.MapName, 0) 78 | } else if reply.TaskType == 2 { 79 | // reduce task: pass all values that share a key to the reduce function, then write the output file 80 | // first read every intermediate file for this reduce ID, then sort and group 81 | inter := make([]KeyValue, 0) 82 | for i := 0; i < reply.MapTaskCount; i++ { 83 | // read all intermediate files for this reduce ID 84 | fileName := fmt.Sprintf(interFileName, i, reply.ReduceID) 85 | reduceF, _ := os.Open(fileName) 86 | dec := json.NewDecoder(reduceF) 87 | for { 88 | var kv KeyValue 89 | if err := dec.Decode(&kv); err != nil { 90 | break 91 | } 92 | inter = append(inter, kv) 93 | } 94 | reduceF.Close() 95 | } 96 | // all kv pairs are in; sort them 97 | sort.Sort(ByKey(inter)) 98 | // group identical keys and write the output file 99 | fileName := fmt.Sprintf(outFileName, reply.ReduceID) 100 | outF, _ := os.Create(fileName) 101 | i := 0 102 | for i < len(inter) { 103 | j := i + 1 104 | for j < len(inter) && inter[j].Key == inter[i].Key { 105 | j++ 106 | } 107 | var values []string 108 | for k := i; k < j; k++ { 109 | values = append(values, inter[k].Value) 110 | } 111 | output := reducef(inter[i].Key, values) 112 | fmt.Fprintf(outF, "%v %v\n", inter[i].Key, output) 113 | i = j 114 | } 115 | outF.Close() 116 | // report reduce task completion 117 | taskDone(2, "", reply.ReduceID) 118 | } 119 | } else { 120 | workerExit(workerID) 121 | os.Exit(1) 122 | } 123 | } 124 | } 125 | 126 | // tell the master that this worker is exiting 127 | func workerExit(workerID int) { 128 | args := WorkerExitArgs{workerID} 129 | ok := call("Coordinator.WorkerExit", &args, &None{}) 130 | if !ok { 131 | log.Fatalf("Worker[%v] exit fail!", workerID) 132 | } 133 | } 134 | 135 | // report task completion, [1]: map, [2]: reduce 136 | func taskDone(taskType int, mapName string, reduceID int) { 137 | args := TaskDoneArgs{taskType, mapName, reduceID} 138 | ok := call("Coordinator.TaskDone", &args, &None{}) 139 | if !ok { 140 | log.Fatalf("Worker taskDone fail!") 141 | } 142 | } 143 | 144 | // register this worker with the master and return the id the master assigns 145 | func register() int { 146 | var workerID int 147 | ok := call("Coordinator.RegisterWorker", &None{}, &workerID) 148 | if !ok { 149 | log.Fatalf("Worker register to master fail!") 150 | } 151 | return workerID 152 | } 153 | 154 | // read a file and return its contents 155 | func readFile(fileName string) string { 156 | file, err := os.Open(fileName) 157 | if err != nil { 158 | log.Fatalf("Master cannot open %v", fileName) 159 | } 160 | content, err := ioutil.ReadAll(file) 161 | if err != nil { 162 | log.Fatalf("Master cannot read %v", fileName) 163 | } 164 | err = file.Close() 165 | if err != nil { 166 | 
log.Fatalf("Master cannot close %v", fileName) 167 | } 168 | return string(content) 169 | } 170 | 171 | // 172 | // send an RPC request to the coordinator, wait for the response. 173 | // usually returns true. 174 | // returns false if something goes wrong. 175 | // 176 | func call(rpcname string, args interface{}, reply interface{}) bool { 177 | sockname := coordinatorSock() 178 | c, err := rpc.DialHTTP("unix", sockname) 179 | if err != nil { 180 | log.Fatal("dialing:", err) 181 | } 182 | defer c.Close() 183 | err = c.Call(rpcname, args, reply) 184 | if err == nil { 185 | return true 186 | } 187 | fmt.Println(err) 188 | return false 189 | } 190 | -------------------------------------------------------------------------------- /src/mrapps/crash.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | // 4 | // a MapReduce pseudo-application that sometimes crashes, 5 | // and sometimes takes a long time, 6 | // to test MapReduce's ability to recover. 7 | // 8 | // go build -buildmode=plugin crash.go 9 | // 10 | 11 | import "mit6.824/mr" 12 | import crand "crypto/rand" 13 | import "math/big" 14 | import "strings" 15 | import "os" 16 | import "sort" 17 | import "strconv" 18 | import "time" 19 | 20 | func maybeCrash() { 21 | max := big.NewInt(1000) 22 | rr, _ := crand.Int(crand.Reader, max) 23 | if rr.Int64() < 330 { 24 | // crash! 25 | os.Exit(1) 26 | } else if rr.Int64() < 660 { 27 | // delay for a while. 28 | maxms := big.NewInt(10 * 1000) 29 | ms, _ := crand.Int(crand.Reader, maxms) 30 | time.Sleep(time.Duration(ms.Int64()) * time.Millisecond) 31 | } 32 | } 33 | 34 | func Map(filename string, contents string) []mr.KeyValue { 35 | maybeCrash() 36 | 37 | kva := []mr.KeyValue{} 38 | kva = append(kva, mr.KeyValue{"a", filename}) 39 | kva = append(kva, mr.KeyValue{"b", strconv.Itoa(len(filename))}) 40 | kva = append(kva, mr.KeyValue{"c", strconv.Itoa(len(contents))}) 41 | kva = append(kva, mr.KeyValue{"d", "xyzzy"}) 42 | return kva 43 | } 44 | 45 | func Reduce(key string, values []string) string { 46 | maybeCrash() 47 | 48 | // sort values to ensure deterministic output. 49 | vv := make([]string, len(values)) 50 | copy(vv, values) 51 | sort.Strings(vv) 52 | 53 | val := strings.Join(vv, " ") 54 | return val 55 | } 56 | -------------------------------------------------------------------------------- /src/mrapps/early_exit.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | // 4 | // a word-count application "plugin" for MapReduce. 5 | // 6 | // go build -buildmode=plugin wc_long.go 7 | // 8 | 9 | import ( 10 | "strconv" 11 | "strings" 12 | "time" 13 | 14 | "mit6.824/mr" 15 | ) 16 | 17 | // 18 | // The map function is called once for each file of input. 19 | // This map function just returns 1 for each file 20 | // 21 | func Map(filename string, contents string) []mr.KeyValue { 22 | kva := []mr.KeyValue{} 23 | kva = append(kva, mr.KeyValue{filename, "1"}) 24 | return kva 25 | } 26 | 27 | // 28 | // The reduce function is called once for each key generated by the 29 | // map tasks, with a list of all the values created for that key by 30 | // any map task. 
31 | // 32 | func Reduce(key string, values []string) string { 33 | // some reduce tasks sleep for a long time; potentially seeing if 34 | // a worker will accidentally exit early 35 | if strings.Contains(key, "sherlock") || strings.Contains(key, "tom") { 36 | time.Sleep(time.Duration(3 * time.Second)) 37 | } 38 | // return the number of occurrences of this file. 39 | return strconv.Itoa(len(values)) 40 | } 41 | -------------------------------------------------------------------------------- /src/mrapps/indexer.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | // 4 | // an indexing application "plugin" for MapReduce. 5 | // 6 | // go build -buildmode=plugin indexer.go 7 | // 8 | 9 | import "fmt" 10 | import "mit6.824/mr" 11 | 12 | import "strings" 13 | import "unicode" 14 | import "sort" 15 | 16 | // Map The mapping function is called once for each piece of the input. 17 | // In this framework, the key is the name of the file that is being processed, 18 | // and the value is the file's contents. The return value should be a slice of 19 | // key/value pairs, each represented by a mr.KeyValue. 20 | func Map(document string, value string) (res []mr.KeyValue) { 21 | m := make(map[string]bool) 22 | words := strings.FieldsFunc(value, func(x rune) bool { return !unicode.IsLetter(x) }) 23 | for _, w := range words { 24 | m[w] = true 25 | } 26 | for w := range m { 27 | kv := mr.KeyValue{w, document} 28 | res = append(res, kv) 29 | } 30 | return 31 | } 32 | 33 | // Reduce The reduce function is called once for each key generated by Map, with a 34 | // list of that key's string value (merged across all inputs). The return value 35 | // should be a single output value for that key. 36 | func Reduce(key string, values []string) string { 37 | sort.Strings(values) 38 | return fmt.Sprintf("%d %s", len(values), strings.Join(values, ",")) 39 | } 40 | -------------------------------------------------------------------------------- /src/mrapps/jobcount.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | // 4 | // a MapReduce pseudo-application that counts the number of times map/reduce 5 | // tasks are run, to test whether jobs are assigned multiple times even when 6 | // there is no failure.
7 | // 8 | // go build -buildmode=plugin crash.go 9 | // 10 | 11 | import "mit6.824/mr" 12 | import "math/rand" 13 | import "strings" 14 | import "strconv" 15 | import "time" 16 | import "fmt" 17 | import "os" 18 | import "io/ioutil" 19 | 20 | var count int 21 | 22 | func Map(filename string, contents string) []mr.KeyValue { 23 | me := os.Getpid() 24 | f := fmt.Sprintf("mr-worker-jobcount-%d-%d", me, count) 25 | count++ 26 | err := ioutil.WriteFile(f, []byte("x"), 0666) 27 | if err != nil { 28 | panic(err) 29 | } 30 | time.Sleep(time.Duration(2000+rand.Intn(3000)) * time.Millisecond) 31 | return []mr.KeyValue{mr.KeyValue{"a", "x"}} 32 | } 33 | 34 | func Reduce(key string, values []string) string { 35 | files, err := ioutil.ReadDir(".") 36 | if err != nil { 37 | panic(err) 38 | } 39 | invocations := 0 40 | for _, f := range files { 41 | if strings.HasPrefix(f.Name(), "mr-worker-jobcount") { 42 | invocations++ 43 | } 44 | } 45 | return strconv.Itoa(invocations) 46 | } 47 | -------------------------------------------------------------------------------- /src/mrapps/mtiming.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | // 4 | // a MapReduce pseudo-application to test that workers 5 | // execute map tasks in parallel. 6 | // 7 | // go build -buildmode=plugin mtiming.go 8 | // 9 | 10 | import "mit6.824/mr" 11 | import "strings" 12 | import "fmt" 13 | import "os" 14 | import "syscall" 15 | import "time" 16 | import "sort" 17 | import "io/ioutil" 18 | 19 | func nparallel(phase string) int { 20 | // create a file so that other workers will see that 21 | // we're running at the same time as them. 22 | pid := os.Getpid() 23 | myfilename := fmt.Sprintf("mr-worker-%s-%d", phase, pid) 24 | err := ioutil.WriteFile(myfilename, []byte("x"), 0666) 25 | if err != nil { 26 | panic(err) 27 | } 28 | 29 | // are any other workers running? 30 | // find their PIDs by scanning directory for mr-worker-XXX files. 31 | dd, err := os.Open(".") 32 | if err != nil { 33 | panic(err) 34 | } 35 | names, err := dd.Readdirnames(1000000) 36 | if err != nil { 37 | panic(err) 38 | } 39 | ret := 0 40 | for _, name := range names { 41 | var xpid int 42 | pat := fmt.Sprintf("mr-worker-%s-%%d", phase) 43 | n, err := fmt.Sscanf(name, pat, &xpid) 44 | if n == 1 && err == nil { 45 | err := syscall.Kill(xpid, 0) 46 | if err == nil { 47 | // if err == nil, xpid is alive. 48 | ret += 1 49 | } 50 | } 51 | } 52 | dd.Close() 53 | 54 | time.Sleep(1 * time.Second) 55 | 56 | err = os.Remove(myfilename) 57 | if err != nil { 58 | panic(err) 59 | } 60 | 61 | return ret 62 | } 63 | 64 | func Map(filename string, contents string) []mr.KeyValue { 65 | t0 := time.Now() 66 | ts := float64(t0.Unix()) + (float64(t0.Nanosecond()) / 1000000000.0) 67 | pid := os.Getpid() 68 | 69 | n := nparallel("map") 70 | 71 | kva := []mr.KeyValue{} 72 | kva = append(kva, mr.KeyValue{ 73 | fmt.Sprintf("times-%v", pid), 74 | fmt.Sprintf("%.1f", ts)}) 75 | kva = append(kva, mr.KeyValue{ 76 | fmt.Sprintf("parallel-%v", pid), 77 | fmt.Sprintf("%d", n)}) 78 | return kva 79 | } 80 | 81 | func Reduce(key string, values []string) string { 82 | //n := nparallel("reduce") 83 | 84 | // sort values to ensure deterministic output. 
85 | vv := make([]string, len(values)) 86 | copy(vv, values) 87 | sort.Strings(vv) 88 | 89 | val := strings.Join(vv, " ") 90 | return val 91 | } 92 | -------------------------------------------------------------------------------- /src/mrapps/nocrash.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | // 4 | // same as crash.go but doesn't actually crash. 5 | // 6 | // go build -buildmode=plugin nocrash.go 7 | // 8 | 9 | import "mit6.824/mr" 10 | import crand "crypto/rand" 11 | import "math/big" 12 | import "strings" 13 | import "os" 14 | import "sort" 15 | import "strconv" 16 | 17 | func maybeCrash() { 18 | max := big.NewInt(1000) 19 | rr, _ := crand.Int(crand.Reader, max) 20 | if false && rr.Int64() < 500 { 21 | // crash! 22 | os.Exit(1) 23 | } 24 | } 25 | 26 | func Map(filename string, contents string) []mr.KeyValue { 27 | maybeCrash() 28 | 29 | kva := []mr.KeyValue{} 30 | kva = append(kva, mr.KeyValue{"a", filename}) 31 | kva = append(kva, mr.KeyValue{"b", strconv.Itoa(len(filename))}) 32 | kva = append(kva, mr.KeyValue{"c", strconv.Itoa(len(contents))}) 33 | kva = append(kva, mr.KeyValue{"d", "xyzzy"}) 34 | return kva 35 | } 36 | 37 | func Reduce(key string, values []string) string { 38 | maybeCrash() 39 | 40 | // sort values to ensure deterministic output. 41 | vv := make([]string, len(values)) 42 | copy(vv, values) 43 | sort.Strings(vv) 44 | 45 | val := strings.Join(vv, " ") 46 | return val 47 | } 48 | -------------------------------------------------------------------------------- /src/mrapps/rtiming.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | // 4 | // a MapReduce pseudo-application to test that workers 5 | // execute reduce tasks in parallel. 6 | // 7 | // go build -buildmode=plugin rtiming.go 8 | // 9 | 10 | import "mit6.824/mr" 11 | import "fmt" 12 | import "os" 13 | import "syscall" 14 | import "time" 15 | import "io/ioutil" 16 | 17 | func nparallel(phase string) int { 18 | // create a file so that other workers will see that 19 | // we're running at the same time as them. 20 | pid := os.Getpid() 21 | myfilename := fmt.Sprintf("mr-worker-%s-%d", phase, pid) 22 | err := ioutil.WriteFile(myfilename, []byte("x"), 0666) 23 | if err != nil { 24 | panic(err) 25 | } 26 | 27 | // are any other workers running? 28 | // find their PIDs by scanning directory for mr-worker-XXX files. 29 | dd, err := os.Open(".") 30 | if err != nil { 31 | panic(err) 32 | } 33 | names, err := dd.Readdirnames(1000000) 34 | if err != nil { 35 | panic(err) 36 | } 37 | ret := 0 38 | for _, name := range names { 39 | var xpid int 40 | pat := fmt.Sprintf("mr-worker-%s-%%d", phase) 41 | n, err := fmt.Sscanf(name, pat, &xpid) 42 | if n == 1 && err == nil { 43 | err := syscall.Kill(xpid, 0) 44 | if err == nil { 45 | // if err == nil, xpid is alive. 
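// (syscall.Kill with signal 0 delivers no signal at all; it only reports whether the process exists, which makes it a cheap liveness probe here.)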
46 | ret += 1 47 | } 48 | } 49 | } 50 | dd.Close() 51 | 52 | time.Sleep(1 * time.Second) 53 | 54 | err = os.Remove(myfilename) 55 | if err != nil { 56 | panic(err) 57 | } 58 | 59 | return ret 60 | } 61 | 62 | func Map(filename string, contents string) []mr.KeyValue { 63 | 64 | kva := []mr.KeyValue{} 65 | kva = append(kva, mr.KeyValue{"a", "1"}) 66 | kva = append(kva, mr.KeyValue{"b", "1"}) 67 | kva = append(kva, mr.KeyValue{"c", "1"}) 68 | kva = append(kva, mr.KeyValue{"d", "1"}) 69 | kva = append(kva, mr.KeyValue{"e", "1"}) 70 | kva = append(kva, mr.KeyValue{"f", "1"}) 71 | kva = append(kva, mr.KeyValue{"g", "1"}) 72 | kva = append(kva, mr.KeyValue{"h", "1"}) 73 | kva = append(kva, mr.KeyValue{"i", "1"}) 74 | kva = append(kva, mr.KeyValue{"j", "1"}) 75 | return kva 76 | } 77 | 78 | func Reduce(key string, values []string) string { 79 | n := nparallel("reduce") 80 | 81 | val := fmt.Sprintf("%d", n) 82 | 83 | return val 84 | } 85 | -------------------------------------------------------------------------------- /src/mrapps/wc.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | // 4 | // a word-count application "plugin" for MapReduce. 5 | // 6 | // go build -buildmode=plugin wc.go 7 | // 8 | 9 | import "mit6.824/mr" 10 | import "unicode" 11 | import "strings" 12 | import "strconv" 13 | 14 | // Map 15 | // The map function is called once for each file of input. The first 16 | // argument is the name of the input file, and the second is the 17 | // file's complete contents. You should ignore the input file name, 18 | // and look only at the contents argument. The return value is a slice 19 | // of key/value pairs. 20 | // 21 | // Map processes the raw input data 22 | func Map(filename string, contents string) []mr.KeyValue { 23 | // function to detect word separators. 24 | ff := func(r rune) bool { return !unicode.IsLetter(r) } 25 | 26 | // split contents into an array of words. 27 | words := strings.FieldsFunc(contents, ff) 28 | 29 | kva := []mr.KeyValue{} 30 | for _, w := range words { 31 | kv := mr.KeyValue{w, "1"} 32 | kva = append(kva, kv) 33 | } 34 | return kva 35 | } 36 | 37 | // Reduce 38 | // The reduce function is called once for each key generated by the 39 | // map tasks, with a list of all the values created for that key by 40 | // any map task. 41 | // 42 | // Reduce processes the grouped data 43 | func Reduce(key string, values []string) string { 44 | // return the number of occurrences of this word. 45 | return strconv.Itoa(len(values)) 46 | } 47 | -------------------------------------------------------------------------------- /src/porcupine/bitset.go: -------------------------------------------------------------------------------- 1 | package porcupine 2 | 3 | import "math/bits" 4 | 5 | type bitset []uint64 6 | 7 | // data layout: 8 | // bits 0-63 are in data[0], the next are in data[1], etc.
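// for example, position 70 lands in data[1] at bit offset 6, since bitsetIndex below computes (70/64, 70%64) = (1, 6).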
9 | 10 | func newBitset(bits uint) bitset { 11 | extra := uint(0) 12 | if bits%64 != 0 { 13 | extra = 1 14 | } 15 | chunks := bits/64 + extra 16 | return bitset(make([]uint64, chunks)) 17 | } 18 | 19 | func (b bitset) clone() bitset { 20 | dataCopy := make([]uint64, len(b)) 21 | copy(dataCopy, b) 22 | return bitset(dataCopy) 23 | } 24 | 25 | func bitsetIndex(pos uint) (uint, uint) { 26 | return pos / 64, pos % 64 27 | } 28 | 29 | func (b bitset) set(pos uint) bitset { 30 | major, minor := bitsetIndex(pos) 31 | b[major] |= (1 << minor) 32 | return b 33 | } 34 | 35 | func (b bitset) clear(pos uint) bitset { 36 | major, minor := bitsetIndex(pos) 37 | b[major] &^= (1 << minor) 38 | return b 39 | } 40 | 41 | func (b bitset) get(pos uint) bool { 42 | major, minor := bitsetIndex(pos) 43 | return b[major]&(1<= 0; i-- { 125 | elem := entries[i] 126 | if elem.kind == returnEntry { 127 | entry := &node{value: elem.value, match: nil, id: elem.id} 128 | match[elem.id] = entry 129 | insertBefore(entry, root) 130 | root = entry 131 | } else { 132 | entry := &node{value: elem.value, match: match[elem.id], id: elem.id} 133 | insertBefore(entry, root) 134 | root = entry 135 | } 136 | } 137 | return root 138 | } 139 | 140 | type cacheEntry struct { 141 | linearized bitset 142 | state interface{} 143 | } 144 | 145 | func cacheContains(model Model, cache map[uint64][]cacheEntry, entry cacheEntry) bool { 146 | for _, elem := range cache[entry.linearized.hash()] { 147 | if entry.linearized.equals(elem.linearized) && model.Equal(entry.state, elem.state) { 148 | return true 149 | } 150 | } 151 | return false 152 | } 153 | 154 | type callsEntry struct { 155 | entry *node 156 | state interface{} 157 | } 158 | 159 | func lift(entry *node) { 160 | entry.prev.next = entry.next 161 | entry.next.prev = entry.prev 162 | match := entry.match 163 | match.prev.next = match.next 164 | if match.next != nil { 165 | match.next.prev = match.prev 166 | } 167 | } 168 | 169 | func unlift(entry *node) { 170 | match := entry.match 171 | match.prev.next = match 172 | if match.next != nil { 173 | match.next.prev = match 174 | } 175 | entry.prev.next = entry 176 | entry.next.prev = entry 177 | } 178 | 179 | func checkSingle(model Model, history []entry, computePartial bool, kill *int32) (bool, []*[]int) { 180 | entry := makeLinkedEntries(history) 181 | n := length(entry) / 2 182 | linearized := newBitset(uint(n)) 183 | cache := make(map[uint64][]cacheEntry) // map from hash to cache entry 184 | var calls []callsEntry 185 | // longest linearizable prefix that includes the given entry 186 | longest := make([]*[]int, n) 187 | 188 | state := model.Init() 189 | headEntry := insertBefore(&node{value: nil, match: nil, id: -1}, entry) 190 | for headEntry.next != nil { 191 | if atomic.LoadInt32(kill) != 0 { 192 | return false, longest 193 | } 194 | if entry.match != nil { 195 | matching := entry.match // the return entry 196 | ok, newState := model.Step(state, entry.value, matching.value) 197 | if ok { 198 | newLinearized := linearized.clone().set(uint(entry.id)) 199 | newCacheEntry := cacheEntry{newLinearized, newState} 200 | if !cacheContains(model, cache, newCacheEntry) { 201 | hash := newLinearized.hash() 202 | cache[hash] = append(cache[hash], newCacheEntry) 203 | calls = append(calls, callsEntry{entry, state}) 204 | state = newState 205 | linearized.set(uint(entry.id)) 206 | lift(entry) 207 | entry = headEntry.next 208 | } else { 209 | entry = entry.next 210 | } 211 | } else { 212 | entry = entry.next 213 | } 214 | } else { 215 | if 
len(calls) == 0 { 216 | return false, longest 217 | } 218 | // longest 219 | if computePartial { 220 | callsLen := len(calls) 221 | var seq []int = nil 222 | for _, v := range calls { 223 | if longest[v.entry.id] == nil || callsLen > len(*longest[v.entry.id]) { 224 | // create seq lazily 225 | if seq == nil { 226 | seq = make([]int, len(calls)) 227 | for i, v := range calls { 228 | seq[i] = v.entry.id 229 | } 230 | } 231 | longest[v.entry.id] = &seq 232 | } 233 | } 234 | } 235 | callsTop := calls[len(calls)-1] 236 | entry = callsTop.entry 237 | state = callsTop.state 238 | linearized.clear(uint(entry.id)) 239 | calls = calls[:len(calls)-1] 240 | unlift(entry) 241 | entry = entry.next 242 | } 243 | } 244 | // longest linearization is the complete linearization, which is calls 245 | seq := make([]int, len(calls)) 246 | for i, v := range calls { 247 | seq[i] = v.entry.id 248 | } 249 | for i := 0; i < n; i++ { 250 | longest[i] = &seq 251 | } 252 | return true, longest 253 | } 254 | 255 | func fillDefault(model Model) Model { 256 | if model.Partition == nil { 257 | model.Partition = NoPartition 258 | } 259 | if model.PartitionEvent == nil { 260 | model.PartitionEvent = NoPartitionEvent 261 | } 262 | if model.Equal == nil { 263 | model.Equal = ShallowEqual 264 | } 265 | if model.DescribeOperation == nil { 266 | model.DescribeOperation = DefaultDescribeOperation 267 | } 268 | if model.DescribeState == nil { 269 | model.DescribeState = DefaultDescribeState 270 | } 271 | return model 272 | } 273 | 274 | func checkParallel(model Model, history [][]entry, computeInfo bool, timeout time.Duration) (CheckResult, linearizationInfo) { 275 | ok := true 276 | timedOut := false 277 | results := make(chan bool, len(history)) 278 | longest := make([][]*[]int, len(history)) 279 | kill := int32(0) 280 | for i, subhistory := range history { 281 | go func(i int, subhistory []entry) { 282 | ok, l := checkSingle(model, subhistory, computeInfo, &kill) 283 | longest[i] = l 284 | results <- ok 285 | }(i, subhistory) 286 | } 287 | var timeoutChan <-chan time.Time 288 | if timeout > 0 { 289 | timeoutChan = time.After(timeout) 290 | } 291 | count := 0 292 | loop: 293 | for { 294 | select { 295 | case result := <-results: 296 | count++ 297 | ok = ok && result 298 | if !ok && !computeInfo { 299 | atomic.StoreInt32(&kill, 1) 300 | break loop 301 | } 302 | if count >= len(history) { 303 | break loop 304 | } 305 | case <-timeoutChan: 306 | timedOut = true 307 | atomic.StoreInt32(&kill, 1) 308 | break loop // if we time out, we might get a false positive 309 | } 310 | } 311 | var info linearizationInfo 312 | if computeInfo { 313 | // make sure we've waited for all goroutines to finish, 314 | // otherwise we might race on access to longest[] 315 | for count < len(history) { 316 | <-results 317 | count++ 318 | } 319 | // return longest linearizable prefixes that include each history element 320 | partialLinearizations := make([][][]int, len(history)) 321 | for i := 0; i < len(history); i++ { 322 | var partials [][]int 323 | // turn longest into a set of unique linearizations 324 | set := make(map[*[]int]struct{}) 325 | for _, v := range longest[i] { 326 | if v != nil { 327 | set[v] = struct{}{} 328 | } 329 | } 330 | for k := range set { 331 | arr := make([]int, len(*k)) 332 | for i, v := range *k { 333 | arr[i] = v 334 | } 335 | partials = append(partials, arr) 336 | } 337 | partialLinearizations[i] = partials 338 | } 339 | info.history = history 340 | info.partialLinearizations = partialLinearizations 341 | } 342 | var result 
CheckResult 343 | if !ok { 344 | result = Illegal 345 | } else { 346 | if timedOut { 347 | result = Unknown 348 | } else { 349 | result = Ok 350 | } 351 | } 352 | return result, info 353 | } 354 | 355 | func checkEvents(model Model, history []Event, verbose bool, timeout time.Duration) (CheckResult, linearizationInfo) { 356 | model = fillDefault(model) 357 | partitions := model.PartitionEvent(history) 358 | l := make([][]entry, len(partitions)) 359 | for i, subhistory := range partitions { 360 | l[i] = convertEntries(renumber(subhistory)) 361 | } 362 | return checkParallel(model, l, verbose, timeout) 363 | } 364 | 365 | func checkOperations(model Model, history []Operation, verbose bool, timeout time.Duration) (CheckResult, linearizationInfo) { 366 | model = fillDefault(model) 367 | partitions := model.Partition(history) 368 | l := make([][]entry, len(partitions)) 369 | for i, subhistory := range partitions { 370 | l[i] = makeEntries(subhistory) 371 | } 372 | return checkParallel(model, l, verbose, timeout) 373 | } 374 | -------------------------------------------------------------------------------- /src/porcupine/model.go: -------------------------------------------------------------------------------- 1 | package porcupine 2 | 3 | import "fmt" 4 | 5 | type Operation struct { 6 | ClientId int // optional, unless you want a visualization; zero-indexed 7 | Input interface{} 8 | Call int64 // invocation time 9 | Output interface{} 10 | Return int64 // response time 11 | } 12 | 13 | type EventKind bool 14 | 15 | const ( 16 | CallEvent EventKind = false 17 | ReturnEvent EventKind = true 18 | ) 19 | 20 | type Event struct { 21 | ClientId int // optional, unless you want a visualization; zero-indexed 22 | Kind EventKind 23 | Value interface{} 24 | Id int 25 | } 26 | 27 | type Model struct { 28 | // Partition functions, such that a history is linearizable if and only 29 | // if each partition is linearizable. If you don't want to implement 30 | // this, you can always use the `NoPartition` functions implemented 31 | // below. 32 | Partition func(history []Operation) [][]Operation 33 | PartitionEvent func(history []Event) [][]Event 34 | // Initial state of the system. 35 | Init func() interface{} 36 | // Step function for the system. Returns whether or not the system 37 | // could take this step with the given inputs and outputs and also 38 | // returns the new state. This should not mutate the existing state. 39 | Step func(state interface{}, input interface{}, output interface{}) (bool, interface{}) 40 | // Equality on states. If you are using a simple data type for states, 41 | // you can use the `ShallowEqual` function implemented below. 42 | Equal func(state1, state2 interface{}) bool 43 | // For visualization, describe an operation as a string. 44 | // For example, "Get('x') -> 'y'". 45 | DescribeOperation func(input interface{}, output interface{}) string 46 | // For visualization purposes, describe a state as a string. 
47 | // For example, "{'x' -> 'y', 'z' -> 'w'}" 48 | DescribeState func(state interface{}) string 49 | } 50 | 51 | func NoPartition(history []Operation) [][]Operation { 52 | return [][]Operation{history} 53 | } 54 | 55 | func NoPartitionEvent(history []Event) [][]Event { 56 | return [][]Event{history} 57 | } 58 | 59 | func ShallowEqual(state1, state2 interface{}) bool { 60 | return state1 == state2 61 | } 62 | 63 | func DefaultDescribeOperation(input interface{}, output interface{}) string { 64 | return fmt.Sprintf("%v -> %v", input, output) 65 | } 66 | 67 | func DefaultDescribeState(state interface{}) string { 68 | return fmt.Sprintf("%v", state) 69 | } 70 | 71 | type CheckResult string 72 | 73 | const ( 74 | Unknown CheckResult = "Unknown" // timed out 75 | Ok = "Ok" 76 | Illegal = "Illegal" 77 | ) 78 | -------------------------------------------------------------------------------- /src/porcupine/porcupine.go: -------------------------------------------------------------------------------- 1 | package porcupine 2 | 3 | import "time" 4 | 5 | func CheckOperations(model Model, history []Operation) bool { 6 | res, _ := checkOperations(model, history, false, 0) 7 | return res == Ok 8 | } 9 | 10 | // timeout = 0 means no timeout 11 | // if this operation times out, then a false positive is possible 12 | func CheckOperationsTimeout(model Model, history []Operation, timeout time.Duration) CheckResult { 13 | res, _ := checkOperations(model, history, false, timeout) 14 | return res 15 | } 16 | 17 | // timeout = 0 means no timeout 18 | // if this operation times out, then a false positive is possible 19 | func CheckOperationsVerbose(model Model, history []Operation, timeout time.Duration) (CheckResult, linearizationInfo) { 20 | return checkOperations(model, history, true, timeout) 21 | } 22 | 23 | func CheckEvents(model Model, history []Event) bool { 24 | res, _ := checkEvents(model, history, false, 0) 25 | return res == Ok 26 | } 27 | 28 | // timeout = 0 means no timeout 29 | // if this operation times out, then a false positive is possible 30 | func CheckEventsTimeout(model Model, history []Event, timeout time.Duration) CheckResult { 31 | res, _ := checkEvents(model, history, false, timeout) 32 | return res 33 | } 34 | 35 | // timeout = 0 means no timeout 36 | // if this operation times out, then a false positive is possible 37 | func CheckEventsVerbose(model Model, history []Event, timeout time.Duration) (CheckResult, linearizationInfo) { 38 | return checkEvents(model, history, true, timeout) 39 | } 40 | -------------------------------------------------------------------------------- /src/raft/config.go: -------------------------------------------------------------------------------- 1 | package raft 2 | 3 | // 4 | // support for Raft tester. 5 | // 6 | // we will use the original config.go to test your code for grading. 7 | // so, while you can modify this code to help you debug, please 8 | // test with the original before submitting. 
9 | // 10 | 11 | import "mit6.824/labgob" 12 | import "mit6.824/labrpc" 13 | import "bytes" 14 | import "log" 15 | import "sync" 16 | import "sync/atomic" 17 | import "testing" 18 | import "runtime" 19 | import "math/rand" 20 | import crand "crypto/rand" 21 | import "math/big" 22 | import "encoding/base64" 23 | import "time" 24 | import "fmt" 25 | 26 | func randstring(n int) string { 27 | b := make([]byte, 2*n) 28 | crand.Read(b) 29 | s := base64.URLEncoding.EncodeToString(b) 30 | return s[0:n] 31 | } 32 | 33 | func makeSeed() int64 { 34 | max := big.NewInt(int64(1) << 62) 35 | bigx, _ := crand.Int(crand.Reader, max) 36 | x := bigx.Int64() 37 | return x 38 | } 39 | 40 | type config struct { 41 | mu sync.Mutex 42 | t *testing.T 43 | finished int32 44 | net *labrpc.Network 45 | n int 46 | rafts []*Raft 47 | applyErr []string // from apply channel readers 48 | connected []bool // whether each server is on the net 49 | saved []*Persister 50 | endnames [][]string // the port file names each sends to 51 | logs []map[int]interface{} // copy of each server's committed entries 52 | lastApplied []int 53 | start time.Time // time at which make_config() was called 54 | // begin()/end() statistics 55 | t0 time.Time // time at which test_test.go called cfg.begin() 56 | rpcs0 int // rpcTotal() at start of test 57 | cmds0 int // number of agreements 58 | bytes0 int64 59 | maxIndex int 60 | maxIndex0 int 61 | } 62 | 63 | var ncpu_once sync.Once 64 | 65 | func make_config(t *testing.T, n int, unreliable bool, snapshot bool) *config { 66 | ncpu_once.Do(func() { 67 | if runtime.NumCPU() < 2 { 68 | fmt.Printf("warning: only one CPU, which may conceal locking bugs\n") 69 | } 70 | rand.Seed(makeSeed()) 71 | }) 72 | runtime.GOMAXPROCS(4) 73 | cfg := &config{} 74 | cfg.t = t 75 | cfg.net = labrpc.MakeNetwork() 76 | cfg.n = n 77 | cfg.applyErr = make([]string, cfg.n) 78 | cfg.rafts = make([]*Raft, cfg.n) 79 | cfg.connected = make([]bool, cfg.n) 80 | cfg.saved = make([]*Persister, cfg.n) 81 | cfg.endnames = make([][]string, cfg.n) 82 | cfg.logs = make([]map[int]interface{}, cfg.n) 83 | cfg.lastApplied = make([]int, cfg.n) 84 | cfg.start = time.Now() 85 | 86 | cfg.setunreliable(unreliable) 87 | 88 | cfg.net.LongDelays(true) 89 | 90 | applier := cfg.applier 91 | if snapshot { 92 | applier = cfg.applierSnap 93 | } 94 | // create a full set of Rafts. 95 | for i := 0; i < cfg.n; i++ { 96 | cfg.logs[i] = map[int]interface{}{} 97 | cfg.start1(i, applier) 98 | } 99 | 100 | // connect everyone 101 | for i := 0; i < cfg.n; i++ { 102 | cfg.connect(i) 103 | } 104 | 105 | return cfg 106 | } 107 | 108 | // shut down a Raft server but save its persistent state. 109 | func (cfg *config) crash1(i int) { 110 | cfg.disconnect(i) 111 | cfg.net.DeleteServer(i) // disable client connections to the server. 112 | 113 | cfg.mu.Lock() 114 | defer cfg.mu.Unlock() 115 | 116 | // a fresh persister, in case old instance 117 | // continues to update the Persister. 118 | // but copy old persister's content so that we always 119 | // pass Make() the last persisted state. 
120 | if cfg.saved[i] != nil { 121 | cfg.saved[i] = cfg.saved[i].Copy() 122 | } 123 | 124 | rf := cfg.rafts[i] 125 | if rf != nil { 126 | cfg.mu.Unlock() 127 | rf.Kill() 128 | cfg.mu.Lock() 129 | cfg.rafts[i] = nil 130 | } 131 | 132 | if cfg.saved[i] != nil { 133 | raftlog := cfg.saved[i].ReadRaftState() 134 | snapshot := cfg.saved[i].ReadSnapshot() 135 | cfg.saved[i] = &Persister{} 136 | cfg.saved[i].SaveStateAndSnapshot(raftlog, snapshot) 137 | } 138 | } 139 | 140 | func (cfg *config) checkLogs(i int, m ApplyMsg) (string, bool) { 141 | err_msg := "" 142 | v := m.Command 143 | for j := 0; j < len(cfg.logs); j++ { 144 | if old, oldok := cfg.logs[j][m.CommandIndex]; oldok && old != v { 145 | log.Printf("%v: log %v; server %v\n", i, cfg.logs[i], cfg.logs[j]) 146 | // some server has already committed a different value for this entry! 147 | err_msg = fmt.Sprintf("commit index=%v server=%v %v != server=%v %v", 148 | m.CommandIndex, i, m.Command, j, old) 149 | } 150 | } 151 | _, prevok := cfg.logs[i][m.CommandIndex-1] 152 | cfg.logs[i][m.CommandIndex] = v 153 | if m.CommandIndex > cfg.maxIndex { 154 | cfg.maxIndex = m.CommandIndex 155 | } 156 | return err_msg, prevok 157 | } 158 | 159 | // applier reads message from apply ch and checks that they match the log 160 | // contents 161 | func (cfg *config) applier(i int, applyCh chan ApplyMsg) { 162 | for m := range applyCh { 163 | if m.CommandValid == false { 164 | // ignore other types of ApplyMsg 165 | } else { 166 | cfg.mu.Lock() 167 | err_msg, prevok := cfg.checkLogs(i, m) 168 | cfg.mu.Unlock() 169 | if m.CommandIndex > 1 && prevok == false { 170 | err_msg = fmt.Sprintf("server %v apply out of order %v", i, m.CommandIndex) 171 | } 172 | if err_msg != "" { 173 | log.Fatalf("apply error: %v", err_msg) 174 | cfg.applyErr[i] = err_msg 175 | // keep reading after error so that Raft doesn't block 176 | // holding locks... 177 | } 178 | } 179 | } 180 | } 181 | 182 | // returns "" or error string 183 | func (cfg *config) ingestSnap(i int, snapshot []byte, index int) string { 184 | if snapshot == nil { 185 | log.Fatalf("nil snapshot") 186 | return "nil snapshot" 187 | } 188 | r := bytes.NewBuffer(snapshot) 189 | d := labgob.NewDecoder(r) 190 | var lastIncludedIndex int 191 | var xlog []interface{} 192 | if d.Decode(&lastIncludedIndex) != nil || 193 | d.Decode(&xlog) != nil { 194 | log.Fatalf("snapshot decode error") 195 | return "snapshot Decode() error" 196 | } 197 | if index != -1 && index != lastIncludedIndex { 198 | err := fmt.Sprintf("server %v snapshot doesn't match m.SnapshotIndex", i) 199 | return err 200 | } 201 | cfg.logs[i] = map[int]interface{}{} 202 | for j := 0; j < len(xlog); j++ { 203 | cfg.logs[i][j] = xlog[j] 204 | } 205 | cfg.lastApplied[i] = lastIncludedIndex 206 | return "" 207 | } 208 | 209 | const SnapShotInterval = 10 210 | 211 | // periodically snapshot raft state 212 | func (cfg *config) applierSnap(i int, applyCh chan ApplyMsg) { 213 | cfg.mu.Lock() 214 | rf := cfg.rafts[i] 215 | cfg.mu.Unlock() 216 | if rf == nil { 217 | return // ??? 
218 | } 219 | 220 | for m := range applyCh { 221 | err_msg := "" 222 | if m.SnapshotValid { 223 | if rf.CondInstallSnapshot(m.SnapshotTerm, m.SnapshotIndex, m.Snapshot) { 224 | cfg.mu.Lock() 225 | err_msg = cfg.ingestSnap(i, m.Snapshot, m.SnapshotIndex) 226 | cfg.mu.Unlock() 227 | } 228 | } else if m.CommandValid { 229 | if m.CommandIndex != cfg.lastApplied[i]+1 { 230 | err_msg = fmt.Sprintf("server %v apply out of order, expected index %v, got %v", i, cfg.lastApplied[i]+1, m.CommandIndex) 231 | } 232 | 233 | if err_msg == "" { 234 | cfg.mu.Lock() 235 | var prevok bool 236 | err_msg, prevok = cfg.checkLogs(i, m) 237 | cfg.mu.Unlock() 238 | if m.CommandIndex > 1 && prevok == false { 239 | err_msg = fmt.Sprintf("server %v apply out of order %v", i, m.CommandIndex) 240 | } 241 | } 242 | 243 | cfg.mu.Lock() 244 | cfg.lastApplied[i] = m.CommandIndex 245 | cfg.mu.Unlock() 246 | 247 | if (m.CommandIndex+1)%SnapShotInterval == 0 { 248 | w := new(bytes.Buffer) 249 | e := labgob.NewEncoder(w) 250 | e.Encode(m.CommandIndex) 251 | var xlog []interface{} 252 | for j := 0; j <= m.CommandIndex; j++ { 253 | xlog = append(xlog, cfg.logs[i][j]) 254 | } 255 | e.Encode(xlog) 256 | rf.Snapshot(m.CommandIndex, w.Bytes()) 257 | } 258 | } else { 259 | // Ignore other types of ApplyMsg. 260 | } 261 | if err_msg != "" { 262 | log.Fatalf("apply error: %v", err_msg) 263 | cfg.applyErr[i] = err_msg 264 | // keep reading after error so that Raft doesn't block 265 | // holding locks... 266 | } 267 | } 268 | } 269 | 270 | // 271 | // start or re-start a Raft. 272 | // if one already exists, "kill" it first. 273 | // allocate new outgoing port file names, and a new 274 | // state persister, to isolate previous instance of 275 | // this server. since we cannot really kill it. 276 | // 277 | func (cfg *config) start1(i int, applier func(int, chan ApplyMsg)) { 278 | cfg.crash1(i) 279 | 280 | // a fresh set of outgoing ClientEnd names. 281 | // so that old crashed instance's ClientEnds can't send. 282 | cfg.endnames[i] = make([]string, cfg.n) 283 | for j := 0; j < cfg.n; j++ { 284 | cfg.endnames[i][j] = randstring(20) 285 | } 286 | 287 | // a fresh set of ClientEnds. 288 | ends := make([]*labrpc.ClientEnd, cfg.n) 289 | for j := 0; j < cfg.n; j++ { 290 | ends[j] = cfg.net.MakeEnd(cfg.endnames[i][j]) 291 | cfg.net.Connect(cfg.endnames[i][j], j) 292 | } 293 | 294 | cfg.mu.Lock() 295 | 296 | cfg.lastApplied[i] = 0 297 | 298 | // a fresh persister, so old instance doesn't overwrite 299 | // new instance's persisted state. 300 | // but copy old persister's content so that we always 301 | // pass Make() the last persisted state. 302 | if cfg.saved[i] != nil { 303 | cfg.saved[i] = cfg.saved[i].Copy() 304 | 305 | snapshot := cfg.saved[i].ReadSnapshot() 306 | if snapshot != nil && len(snapshot) > 0 { 307 | // mimic KV server and process snapshot now. 308 | // ideally Raft should send it up on applyCh... 
309 | err := cfg.ingestSnap(i, snapshot, -1) 310 | if err != "" { 311 | cfg.t.Fatal(err) 312 | } 313 | } 314 | } else { 315 | cfg.saved[i] = MakePersister() 316 | } 317 | 318 | cfg.mu.Unlock() 319 | 320 | applyCh := make(chan ApplyMsg) 321 | 322 | rf := Make(ends, i, cfg.saved[i], applyCh) 323 | 324 | cfg.mu.Lock() 325 | cfg.rafts[i] = rf 326 | cfg.mu.Unlock() 327 | 328 | go applier(i, applyCh) 329 | 330 | svc := labrpc.MakeService(rf) 331 | srv := labrpc.MakeServer() 332 | srv.AddService(svc) 333 | cfg.net.AddServer(i, srv) 334 | } 335 | 336 | func (cfg *config) checkTimeout() { 337 | // enforce a two minute real-time limit on each test 338 | if !cfg.t.Failed() && time.Since(cfg.start) > 120*time.Second { 339 | cfg.t.Fatal("test took longer than 120 seconds") 340 | } 341 | } 342 | 343 | func (cfg *config) checkFinished() bool { 344 | z := atomic.LoadInt32(&cfg.finished) 345 | return z != 0 346 | } 347 | 348 | func (cfg *config) cleanup() { 349 | atomic.StoreInt32(&cfg.finished, 1) 350 | for i := 0; i < len(cfg.rafts); i++ { 351 | if cfg.rafts[i] != nil { 352 | cfg.rafts[i].Kill() 353 | } 354 | } 355 | cfg.net.Cleanup() 356 | cfg.checkTimeout() 357 | } 358 | 359 | // attach server i to the net. 360 | func (cfg *config) connect(i int) { 361 | DPrintf("Test: connect(%d)\n", i) 362 | 363 | cfg.connected[i] = true 364 | 365 | // outgoing ClientEnds 366 | for j := 0; j < cfg.n; j++ { 367 | if cfg.connected[j] { 368 | endname := cfg.endnames[i][j] 369 | cfg.net.Enable(endname, true) 370 | } 371 | } 372 | 373 | // incoming ClientEnds 374 | for j := 0; j < cfg.n; j++ { 375 | if cfg.connected[j] { 376 | endname := cfg.endnames[j][i] 377 | cfg.net.Enable(endname, true) 378 | } 379 | } 380 | } 381 | 382 | // detach server i from the net. 383 | func (cfg *config) disconnect(i int) { 384 | DPrintf("Test: disconnect(%d)\n", i) 385 | 386 | cfg.connected[i] = false 387 | 388 | // outgoing ClientEnds 389 | for j := 0; j < cfg.n; j++ { 390 | if cfg.endnames[i] != nil { 391 | endname := cfg.endnames[i][j] 392 | cfg.net.Enable(endname, false) 393 | } 394 | } 395 | 396 | // incoming ClientEnds 397 | for j := 0; j < cfg.n; j++ { 398 | if cfg.endnames[j] != nil { 399 | endname := cfg.endnames[j][i] 400 | cfg.net.Enable(endname, false) 401 | } 402 | } 403 | } 404 | 405 | func (cfg *config) rpcCount(server int) int { 406 | return cfg.net.GetCount(server) 407 | } 408 | 409 | func (cfg *config) rpcTotal() int { 410 | return cfg.net.GetTotalCount() 411 | } 412 | 413 | func (cfg *config) setunreliable(unrel bool) { 414 | cfg.net.Reliable(!unrel) 415 | } 416 | 417 | func (cfg *config) bytesTotal() int64 { 418 | return cfg.net.GetTotalBytes() 419 | } 420 | 421 | func (cfg *config) setlongreordering(longrel bool) { 422 | cfg.net.LongReordering(longrel) 423 | } 424 | 425 | // 426 | // check that one of the connected servers thinks 427 | // it is the leader, and that no other connected 428 | // server thinks otherwise. 429 | // 430 | // try a few times in case re-elections are needed. 
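// (each attempt below sleeps 450-550ms before polling, so ten attempts give an election roughly five seconds to settle.)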
431 | // 432 | func (cfg *config) checkOneLeader() int { 433 | for iters := 0; iters < 10; iters++ { 434 | ms := 450 + (rand.Int63() % 100) 435 | time.Sleep(time.Duration(ms) * time.Millisecond) 436 | 437 | leaders := make(map[int][]int) 438 | for i := 0; i < cfg.n; i++ { 439 | if cfg.connected[i] { 440 | if term, leader := cfg.rafts[i].GetState(); leader { 441 | leaders[term] = append(leaders[term], i) 442 | } 443 | } 444 | } 445 | 446 | lastTermWithLeader := -1 447 | for term, leaders := range leaders { 448 | if len(leaders) > 1 { 449 | cfg.t.Fatalf("term %d has %d (>1) leaders", term, len(leaders)) 450 | } 451 | if term > lastTermWithLeader { 452 | lastTermWithLeader = term 453 | } 454 | } 455 | 456 | if len(leaders) != 0 { 457 | return leaders[lastTermWithLeader][0] 458 | } 459 | } 460 | cfg.t.Fatalf("expected one leader, got none") 461 | return -1 462 | } 463 | 464 | // check that everyone agrees on the term. 465 | func (cfg *config) checkTerms() int { 466 | term := -1 467 | for i := 0; i < cfg.n; i++ { 468 | if cfg.connected[i] { 469 | xterm, _ := cfg.rafts[i].GetState() 470 | if term == -1 { 471 | term = xterm 472 | } else if term != xterm { 473 | cfg.t.Fatalf("servers disagree on term") 474 | } 475 | } 476 | } 477 | return term 478 | } 479 | 480 | // 481 | // check that none of the connected servers 482 | // thinks it is the leader. 483 | // 484 | func (cfg *config) checkNoLeader() { 485 | for i := 0; i < cfg.n; i++ { 486 | if cfg.connected[i] { 487 | _, is_leader := cfg.rafts[i].GetState() 488 | if is_leader { 489 | cfg.t.Fatalf("expected no leader among connected servers, but %v claims to be leader", i) 490 | } 491 | } 492 | } 493 | } 494 | 495 | // how many servers think a log entry is committed? 496 | func (cfg *config) nCommitted(index int) (int, interface{}) { 497 | count := 0 498 | var cmd interface{} = nil 499 | for i := 0; i < len(cfg.rafts); i++ { 500 | if cfg.applyErr[i] != "" { 501 | cfg.t.Fatal(cfg.applyErr[i]) 502 | } 503 | 504 | cfg.mu.Lock() 505 | cmd1, ok := cfg.logs[i][index] 506 | cfg.mu.Unlock() 507 | 508 | if ok { 509 | if count > 0 && cmd != cmd1 { 510 | cfg.t.Fatalf("committed values do not match: index %v, %v, %v", 511 | index, cmd, cmd1) 512 | } 513 | count += 1 514 | cmd = cmd1 515 | } 516 | } 517 | return count, cmd 518 | } 519 | 520 | // wait for at least n servers to commit. 521 | // but don't wait forever. 522 | func (cfg *config) wait(index int, n int, startTerm int) interface{} { 523 | to := 10 * time.Millisecond 524 | for iters := 0; iters < 30; iters++ { 525 | nd, _ := cfg.nCommitted(index) 526 | if nd >= n { 527 | break 528 | } 529 | time.Sleep(to) 530 | if to < time.Second { 531 | to *= 2 532 | } 533 | if startTerm > -1 { 534 | for _, r := range cfg.rafts { 535 | if t, _ := r.GetState(); t > startTerm { 536 | // someone has moved on 537 | // can no longer guarantee that we'll "win" 538 | return -1 539 | } 540 | } 541 | } 542 | } 543 | nd, cmd := cfg.nCommitted(index) 544 | if nd < n { 545 | cfg.t.Fatalf("only %d decided for index %d; wanted %d", 546 | nd, index, n) 547 | } 548 | return cmd 549 | } 550 | 551 | // do a complete agreement. 552 | // it might choose the wrong leader initially, 553 | // and have to re-submit after giving up. 554 | // entirely gives up after about 10 seconds. 555 | // indirectly checks that the servers agree on the 556 | // same value, since nCommitted() checks this, 557 | // as do the threads that read from applyCh. 558 | // returns index. 
559 | // if retry==true, may submit the command multiple 560 | // times, in case a leader fails just after Start(). 561 | // if retry==false, calls Start() only once, in order 562 | // to simplify the early Lab 2B tests. 563 | func (cfg *config) one(cmd interface{}, expectedServers int, retry bool) int { 564 | t0 := time.Now() 565 | starts := 0 566 | for time.Since(t0).Seconds() < 10 && cfg.checkFinished() == false { 567 | // try all the servers, maybe one is the leader. 568 | index := -1 569 | for si := 0; si < cfg.n; si++ { 570 | starts = (starts + 1) % cfg.n 571 | var rf *Raft 572 | cfg.mu.Lock() 573 | if cfg.connected[starts] { 574 | rf = cfg.rafts[starts] 575 | } 576 | cfg.mu.Unlock() 577 | if rf != nil { 578 | index1, _, ok := rf.Start(cmd) 579 | if ok { 580 | index = index1 581 | break 582 | } 583 | } 584 | } 585 | 586 | if index != -1 { 587 | // somebody claimed to be the leader and to have 588 | // submitted our command; wait a while for agreement. 589 | t1 := time.Now() 590 | for time.Since(t1).Seconds() < 2 { 591 | nd, cmd1 := cfg.nCommitted(index) 592 | if nd > 0 && nd >= expectedServers { 593 | // committed 594 | if cmd1 == cmd { 595 | // and it was the command we submitted. 596 | return index 597 | } 598 | } 599 | time.Sleep(20 * time.Millisecond) 600 | } 601 | if retry == false { 602 | cfg.t.Fatalf("one(%v) failed to reach agreement", cmd) 603 | } 604 | } else { 605 | time.Sleep(50 * time.Millisecond) 606 | } 607 | } 608 | if cfg.checkFinished() == false { 609 | cfg.t.Fatalf("one(%v) failed to reach agreement", cmd) 610 | } 611 | return -1 612 | } 613 | 614 | // start a Test. 615 | // print the Test message. 616 | // e.g. cfg.begin("Test (2B): RPC counts aren't too high") 617 | func (cfg *config) begin(description string) { 618 | fmt.Printf("%s ...\n", description) 619 | cfg.t0 = time.Now() 620 | cfg.rpcs0 = cfg.rpcTotal() 621 | cfg.bytes0 = cfg.bytesTotal() 622 | cfg.cmds0 = 0 623 | cfg.maxIndex0 = cfg.maxIndex 624 | } 625 | 626 | // end a Test -- the fact that we got here means there 627 | // was no failure. 628 | // print the Passed message, 629 | // and some performance numbers. 630 | func (cfg *config) end() { 631 | cfg.checkTimeout() 632 | if cfg.t.Failed() == false { 633 | cfg.mu.Lock() 634 | t := time.Since(cfg.t0).Seconds() // real time 635 | npeers := cfg.n // number of Raft peers 636 | nrpc := cfg.rpcTotal() - cfg.rpcs0 // number of RPC sends 637 | nbytes := cfg.bytesTotal() - cfg.bytes0 // number of bytes 638 | ncmds := cfg.maxIndex - cfg.maxIndex0 // number of Raft agreements reported 639 | cfg.mu.Unlock() 640 | 641 | fmt.Printf(" ... 
Passed --") 642 | fmt.Printf(" %4.1f %d %4d %7d %4d\n", t, npeers, nrpc, nbytes, ncmds) 643 | } 644 | } 645 | 646 | // Maximum log size across all servers 647 | func (cfg *config) LogSize() int { 648 | logsize := 0 649 | for i := 0; i < cfg.n; i++ { 650 | n := cfg.saved[i].RaftStateSize() 651 | if n > logsize { 652 | logsize = n 653 | } 654 | } 655 | return logsize 656 | } 657 | -------------------------------------------------------------------------------- /src/raft/dstest.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import itertools 4 | import math 5 | import signal 6 | import subprocess 7 | import tempfile 8 | import shutil 9 | import time 10 | import os 11 | import sys 12 | import datetime 13 | from collections import defaultdict 14 | from concurrent.futures import ThreadPoolExecutor, wait, FIRST_COMPLETED 15 | from dataclasses import dataclass 16 | from pathlib import Path 17 | from typing import List, Optional, Dict, DefaultDict, Tuple 18 | 19 | import typer 20 | import rich 21 | from rich import print 22 | from rich.table import Table 23 | from rich.progress import ( 24 | Progress, 25 | TimeElapsedColumn, 26 | TimeRemainingColumn, 27 | TextColumn, 28 | BarColumn, 29 | SpinnerColumn, 30 | ) 31 | from rich.live import Live 32 | from rich.panel import Panel 33 | from rich.traceback import install 34 | 35 | install(show_locals=True) 36 | 37 | 38 | @dataclass 39 | class StatsMeter: 40 | """ 41 | Auxiliary classs to keep track of online stats including: count, mean, variance 42 | Uses Welford's algorithm to compute sample mean and sample variance incrementally. 43 | https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#On-line_algorithm 44 | """ 45 | 46 | n: int = 0 47 | mean: float = 0.0 48 | S: float = 0.0 49 | 50 | def add(self, datum): 51 | self.n += 1 52 | delta = datum - self.mean 53 | # Mk = Mk-1+ (xk – Mk-1)/k 54 | self.mean += delta / self.n 55 | # Sk = Sk-1 + (xk – Mk-1)*(xk – Mk). 
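# e.g. adding 1, 2, 3 gives mean=2 and S=2, so variance = S/n = 2/3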
56 | self.S += delta * (datum - self.mean) 57 | 58 | @property 59 | def variance(self): 60 | return self.S / self.n 61 | 62 | @property 63 | def std(self): 64 | return math.sqrt(self.variance) 65 | 66 | 67 | def print_results(results: Dict[str, Dict[str, StatsMeter]], timing=False): 68 | table = Table(show_header=True, header_style="bold") 69 | table.add_column("Test") 70 | table.add_column("Failed", justify="right") 71 | table.add_column("Total", justify="right") 72 | if not timing: 73 | table.add_column("Time", justify="right") 74 | else: 75 | table.add_column("Real Time", justify="right") 76 | table.add_column("User Time", justify="right") 77 | table.add_column("System Time", justify="right") 78 | 79 | for test, stats in results.items(): 80 | if stats["completed"].n == 0: 81 | continue 82 | color = "green" if stats["failed"].n == 0 else "red" 83 | row = [ 84 | f"[{color}]{test}[/{color}]", 85 | str(stats["failed"].n), 86 | str(stats["completed"].n), 87 | ] 88 | if not timing: 89 | row.append(f"{stats['time'].mean:.2f} ± {stats['time'].std:.2f}") 90 | else: 91 | row.extend( 92 | [ 93 | f"{stats['real_time'].mean:.2f} ± {stats['real_time'].std:.2f}", 94 | f"{stats['user_time'].mean:.2f} ± {stats['user_time'].std:.2f}", 95 | f"{stats['system_time'].mean:.2f} ± {stats['system_time'].std:.2f}", 96 | ] 97 | ) 98 | table.add_row(*row) 99 | 100 | print(table) 101 | 102 | 103 | def run_test(test: str, race: bool, timing: bool): 104 | test_cmd = ["go", "test", f"-run={test}"] 105 | if race: 106 | test_cmd.append("-race") 107 | if timing: 108 | test_cmd = ["time"] + test_cmd 109 | f, path = tempfile.mkstemp() 110 | start = time.time() 111 | proc = subprocess.run(test_cmd, stdout=f, stderr=f) 112 | runtime = time.time() - start 113 | os.close(f) 114 | return test, path, proc.returncode, runtime 115 | 116 | 117 | def last_line(file: str) -> str: 118 | with open(file, "rb") as f: 119 | f.seek(-2, os.SEEK_END) 120 | while f.read(1) != b"\n": 121 | f.seek(-2, os.SEEK_CUR) 122 | line = f.readline().decode() 123 | return line 124 | 125 | 126 | # fmt: off 127 | def run_tests( 128 | tests: List[str], 129 | sequential: bool = typer.Option(False, '--sequential', '-s', help='Run all tests of each group in order'), 130 | workers: int = typer.Option(1, '--workers', '-p', help='Number of parallel tasks'), 131 | iterations: int = typer.Option(10, '--iter', '-n', help='Number of iterations to run'), 132 | output: Optional[Path] = typer.Option(None, '--output', '-o', help='Output path to use'), 133 | verbose: int = typer.Option(0, '--verbose', '-v', help='Verbosity level', count=True), 134 | archive: bool = typer.Option(False, '--archive', '-a', help='Save all logs instead of only failed ones'), 135 | race: bool = typer.Option(False, '--race/--no-race', '-r/-R', help='Run with race checker'), 136 | loop: bool = typer.Option(False, '--loop', '-l', help='Run continuously'), 137 | growth: int = typer.Option(10, '--growth', '-g', help='Growth ratio of iterations when using --loop'), 138 | timing: bool = typer.Option(False, '--timing', '-t', help='Report timing, only works on macOS'), 139 | # fmt: on 140 | ): 141 | 142 | if output is None: 143 | timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") 144 | output = Path(timestamp) 145 | 146 | if race: 147 | print("[yellow]Running with the race detector\n[/yellow]") 148 | 149 | if verbose > 0: 150 | print(f"[yellow] Verbosity level set to {verbose}[/yellow]") 151 | os.environ['VERBOSE'] = str(verbose) 152 | 153 | while True: 154 | 155 | total = iterations * len(tests) 156 
| completed = 0 157 | 158 | results = {test: defaultdict(StatsMeter) for test in tests} 159 | 160 | if sequential: 161 | test_instances = itertools.chain.from_iterable(itertools.repeat(test, iterations) for test in tests) 162 | else: 163 | test_instances = itertools.chain.from_iterable(itertools.repeat(tests, iterations)) 164 | test_instances = iter(test_instances) 165 | 166 | total_progress = Progress( 167 | "[progress.description]{task.description}", 168 | BarColumn(), 169 | TimeRemainingColumn(), 170 | "[progress.percentage]{task.percentage:>3.0f}%", 171 | TimeElapsedColumn(), 172 | ) 173 | total_task = total_progress.add_task("[yellow]Tests[/yellow]", total=total) 174 | 175 | task_progress = Progress( 176 | "[progress.description]{task.description}", 177 | SpinnerColumn(), 178 | BarColumn(), 179 | "{task.completed}/{task.total}", 180 | ) 181 | tasks = {test: task_progress.add_task(test, total=iterations) for test in tests} 182 | 183 | progress_table = Table.grid() 184 | progress_table.add_row(total_progress) 185 | progress_table.add_row(Panel.fit(task_progress)) 186 | 187 | with Live(progress_table, transient=True) as live: 188 | 189 | def handler(_, frame): 190 | live.stop() 191 | print('\n') 192 | print_results(results) 193 | sys.exit(1) 194 | 195 | signal.signal(signal.SIGINT, handler) 196 | 197 | with ThreadPoolExecutor(max_workers=workers) as executor: 198 | 199 | futures = [] 200 | while completed < total: 201 | n = len(futures) 202 | if n < workers: 203 | for test in itertools.islice(test_instances, workers-n): 204 | futures.append(executor.submit(run_test, test, race, timing)) 205 | 206 | done, not_done = wait(futures, return_when=FIRST_COMPLETED) 207 | 208 | for future in done: 209 | test, path, rc, runtime = future.result() 210 | 211 | results[test]['completed'].add(1) 212 | results[test]['time'].add(runtime) 213 | task_progress.update(tasks[test], advance=1) 214 | dest = (output / f"{test}_{completed}.log").as_posix() 215 | if rc != 0: 216 | print(f"Failed test {test} - {dest}") 217 | task_progress.update(tasks[test], description=f"[red]{test}[/red]") 218 | results[test]['failed'].add(1) 219 | else: 220 | if results[test]['completed'].n == iterations and results[test]['failed'].n == 0: 221 | task_progress.update(tasks[test], description=f"[green]{test}[/green]") 222 | 223 | if rc != 0 or archive: 224 | output.mkdir(exist_ok=True, parents=True) 225 | shutil.copy(path, dest) 226 | 227 | if timing: 228 | line = last_line(path) 229 | real, _, user, _, system, _ = line.replace(' '*8, '').split(' ') 230 | results[test]['real_time'].add(float(real)) 231 | results[test]['user_time'].add(float(user)) 232 | results[test]['system_time'].add(float(system)) 233 | 234 | os.remove(path) 235 | 236 | completed += 1 237 | total_progress.update(total_task, advance=1) 238 | 239 | futures = list(not_done) 240 | 241 | print_results(results, timing) 242 | 243 | if loop: 244 | iterations *= growth 245 | print(f"[yellow]Increasing iterations to {iterations}[/yellow]") 246 | else: 247 | break 248 | 249 | 250 | if __name__ == "__main__": 251 | typer.run(run_tests) 252 | -------------------------------------------------------------------------------- /src/raft/persister.go: -------------------------------------------------------------------------------- 1 | package raft 2 | 3 | // 4 | // support for Raft and kvraft to save persistent 5 | // Raft state (log &c) and k/v server snapshots. 6 | // 7 | // we will use the original persister.go to test your code for grading. 
8 | // so, while you can modify this code to help you debug, please
9 | // test with the original before submitting.
10 | //
11 | 
12 | import "sync"
13 | 
14 | type Persister struct {
15 | 	mu        sync.Mutex
16 | 	raftstate []byte
17 | 	snapshot  []byte
18 | }
19 | 
20 | func MakePersister() *Persister {
21 | 	return &Persister{}
22 | }
23 | 
24 | func clone(orig []byte) []byte {
25 | 	x := make([]byte, len(orig))
26 | 	copy(x, orig)
27 | 	return x
28 | }
29 | 
30 | func (ps *Persister) Copy() *Persister {
31 | 	ps.mu.Lock()
32 | 	defer ps.mu.Unlock()
33 | 	np := MakePersister()
34 | 	np.raftstate = ps.raftstate
35 | 	np.snapshot = ps.snapshot
36 | 	return np
37 | }
38 | 
39 | func (ps *Persister) SaveRaftState(state []byte) {
40 | 	ps.mu.Lock()
41 | 	defer ps.mu.Unlock()
42 | 	ps.raftstate = clone(state)
43 | }
44 | 
45 | func (ps *Persister) ReadRaftState() []byte {
46 | 	ps.mu.Lock()
47 | 	defer ps.mu.Unlock()
48 | 	return clone(ps.raftstate)
49 | }
50 | 
51 | func (ps *Persister) RaftStateSize() int {
52 | 	ps.mu.Lock()
53 | 	defer ps.mu.Unlock()
54 | 	return len(ps.raftstate)
55 | }
56 | 
57 | // Save both Raft state and K/V snapshot as a single atomic action,
58 | // to help avoid them getting out of sync.
59 | func (ps *Persister) SaveStateAndSnapshot(state []byte, snapshot []byte) {
60 | 	ps.mu.Lock()
61 | 	defer ps.mu.Unlock()
62 | 	ps.raftstate = clone(state)
63 | 	ps.snapshot = clone(snapshot)
64 | }
65 | 
66 | func (ps *Persister) ReadSnapshot() []byte {
67 | 	ps.mu.Lock()
68 | 	defer ps.mu.Unlock()
69 | 	return clone(ps.snapshot)
70 | }
71 | 
72 | func (ps *Persister) SnapshotSize() int {
73 | 	ps.mu.Lock()
74 | 	defer ps.mu.Unlock()
75 | 	return len(ps.snapshot)
76 | }
77 | 
--------------------------------------------------------------------------------
/src/raft/util.go:
--------------------------------------------------------------------------------
1 | package raft
2 | 
3 | import (
4 | 	"log"
5 | 	"math/rand"
6 | 	"time"
7 | )
8 | 
9 | func DPrintf(format string, a ...interface{}) {
10 | 	if Debug {
11 | 		log.Printf(format, a...)
12 | 	}
13 | }
14 | 
15 | func min(a int, b int) int {
16 | 	if a <= b {
17 | 		return a
18 | 	}
19 | 	return b
20 | }
21 | 
22 | func max(a int, b int) int {
23 | 	if a >= b {
24 | 		return a
25 | 	}
26 | 	return b
27 | }
28 | 
29 | // returns a random election timeout
30 | func getRandElectionTimeOut() time.Duration {
31 | 	return time.Duration((rand.Int()%(electionTimeOutMax-electionTimeOutMin))+electionTimeOutMin) * time.Millisecond
32 | }
33 | 
--------------------------------------------------------------------------------
/src/shardctrler/client.go:
--------------------------------------------------------------------------------
1 | package shardctrler
2 | 
3 | //
4 | // Shardctrler clerk.
5 | //
6 | 
7 | import "mit6.824/labrpc"
8 | import "time"
9 | import "crypto/rand"
10 | import "math/big"
11 | 
12 | type Clerk struct {
13 | 	servers []*labrpc.ClientEnd
14 | 	// Your data here.
15 | }
16 | 
17 | func nrand() int64 {
18 | 	max := big.NewInt(int64(1) << 62)
19 | 	bigx, _ := rand.Int(rand.Reader, max)
20 | 	x := bigx.Int64()
21 | 	return x
22 | }
23 | 
24 | func MakeClerk(servers []*labrpc.ClientEnd) *Clerk {
25 | 	ck := new(Clerk)
26 | 	ck.servers = servers
27 | 	// Your code here.
28 | 	return ck
29 | }
30 | 
31 | func (ck *Clerk) Query(num int) Config {
32 | 	args := &QueryArgs{}
33 | 	// Your code here.
34 | 	args.Num = num
35 | 	for {
36 | 		// try each known server.
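		// A common refinement, not part of this skeleton, is to remember the
		// index of the last server that answered as leader and try it first,
		// and to tag every request with a client id and sequence number so the
		// service can filter duplicate retries, e.g. (hypothetical fields):
		//
		//	args.ClientId = ck.clientId
		//	args.SeqNum = ck.seqNum
		//
		// Neither field exists yet; they would have to be added to the RPC
		// argument structs in common.go and to the Clerk struct above.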
37 | for _, srv := range ck.servers { 38 | var reply QueryReply 39 | ok := srv.Call("ShardCtrler.Query", args, &reply) 40 | if ok && reply.WrongLeader == false { 41 | return reply.Config 42 | } 43 | } 44 | time.Sleep(100 * time.Millisecond) 45 | } 46 | } 47 | 48 | func (ck *Clerk) Join(servers map[int][]string) { 49 | args := &JoinArgs{} 50 | // Your code here. 51 | args.Servers = servers 52 | 53 | for { 54 | // try each known server. 55 | for _, srv := range ck.servers { 56 | var reply JoinReply 57 | ok := srv.Call("ShardCtrler.Join", args, &reply) 58 | if ok && reply.WrongLeader == false { 59 | return 60 | } 61 | } 62 | time.Sleep(100 * time.Millisecond) 63 | } 64 | } 65 | 66 | func (ck *Clerk) Leave(gids []int) { 67 | args := &LeaveArgs{} 68 | // Your code here. 69 | args.GIDs = gids 70 | 71 | for { 72 | // try each known server. 73 | for _, srv := range ck.servers { 74 | var reply LeaveReply 75 | ok := srv.Call("ShardCtrler.Leave", args, &reply) 76 | if ok && reply.WrongLeader == false { 77 | return 78 | } 79 | } 80 | time.Sleep(100 * time.Millisecond) 81 | } 82 | } 83 | 84 | func (ck *Clerk) Move(shard int, gid int) { 85 | args := &MoveArgs{} 86 | // Your code here. 87 | args.Shard = shard 88 | args.GID = gid 89 | 90 | for { 91 | // try each known server. 92 | for _, srv := range ck.servers { 93 | var reply MoveReply 94 | ok := srv.Call("ShardCtrler.Move", args, &reply) 95 | if ok && reply.WrongLeader == false { 96 | return 97 | } 98 | } 99 | time.Sleep(100 * time.Millisecond) 100 | } 101 | } 102 | -------------------------------------------------------------------------------- /src/shardctrler/common.go: -------------------------------------------------------------------------------- 1 | package shardctrler 2 | 3 | // 4 | // Shard controler: assigns shards to replication groups. 5 | // 6 | // RPC interface: 7 | // Join(servers) -- add a set of groups (gid -> server-list mapping). 8 | // Leave(gids) -- delete a set of groups. 9 | // Move(shard, gid) -- hand off one shard from current owner to gid. 10 | // Query(num) -> fetch Config # num, or latest config if num==-1. 11 | // 12 | // A Config (configuration) describes a set of replica groups, and the 13 | // replica group responsible for each shard. Configs are numbered. Config 14 | // #0 is the initial configuration, with no groups and all shards 15 | // assigned to group 0 (the invalid group). 16 | // 17 | // You will need to add fields to the RPC argument structs. 18 | // 19 | 20 | // The number of shards. 21 | const NShards = 10 22 | 23 | // A configuration -- an assignment of shards to groups. 24 | // Please don't change this. 
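// As an illustration only (these values are made up, not produced by the
// skeleton), a configuration after a single Join of group 1 could look like:
//
//	Config{
//		Num:    1,
//		Shards: [NShards]int{1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
//		Groups: map[int][]string{1: {"server-1-0", "server-1-1", "server-1-2"}},
//	}
//
// i.e. all NShards shards are served by group 1; the three server names shown
// here are hypothetical.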
25 | type Config struct { 26 | Num int // config number 27 | Shards [NShards]int // shard -> gid 28 | Groups map[int][]string // gid -> servers[] 29 | } 30 | 31 | const ( 32 | OK = "OK" 33 | ) 34 | 35 | type Err string 36 | 37 | type JoinArgs struct { 38 | Servers map[int][]string // new GID -> servers mappings 39 | } 40 | 41 | type JoinReply struct { 42 | WrongLeader bool 43 | Err Err 44 | } 45 | 46 | type LeaveArgs struct { 47 | GIDs []int 48 | } 49 | 50 | type LeaveReply struct { 51 | WrongLeader bool 52 | Err Err 53 | } 54 | 55 | type MoveArgs struct { 56 | Shard int 57 | GID int 58 | } 59 | 60 | type MoveReply struct { 61 | WrongLeader bool 62 | Err Err 63 | } 64 | 65 | type QueryArgs struct { 66 | Num int // desired config number 67 | } 68 | 69 | type QueryReply struct { 70 | WrongLeader bool 71 | Err Err 72 | Config Config 73 | } 74 | -------------------------------------------------------------------------------- /src/shardctrler/config.go: -------------------------------------------------------------------------------- 1 | package shardctrler 2 | 3 | import "mit6.824/labrpc" 4 | import "mit6.824/raft" 5 | import "testing" 6 | import "os" 7 | 8 | // import "log" 9 | import crand "crypto/rand" 10 | import "math/rand" 11 | import "encoding/base64" 12 | import "sync" 13 | import "runtime" 14 | import "time" 15 | 16 | func randstring(n int) string { 17 | b := make([]byte, 2*n) 18 | crand.Read(b) 19 | s := base64.URLEncoding.EncodeToString(b) 20 | return s[0:n] 21 | } 22 | 23 | // Randomize server handles 24 | func random_handles(kvh []*labrpc.ClientEnd) []*labrpc.ClientEnd { 25 | sa := make([]*labrpc.ClientEnd, len(kvh)) 26 | copy(sa, kvh) 27 | for i := range sa { 28 | j := rand.Intn(i + 1) 29 | sa[i], sa[j] = sa[j], sa[i] 30 | } 31 | return sa 32 | } 33 | 34 | type config struct { 35 | mu sync.Mutex 36 | t *testing.T 37 | net *labrpc.Network 38 | n int 39 | servers []*ShardCtrler 40 | saved []*raft.Persister 41 | endnames [][]string // names of each server's sending ClientEnds 42 | clerks map[*Clerk][]string 43 | nextClientId int 44 | start time.Time // time at which make_config() was called 45 | } 46 | 47 | func (cfg *config) checkTimeout() { 48 | // enforce a two minute real-time limit on each test 49 | if !cfg.t.Failed() && time.Since(cfg.start) > 120*time.Second { 50 | cfg.t.Fatal("test took longer than 120 seconds") 51 | } 52 | } 53 | 54 | func (cfg *config) cleanup() { 55 | cfg.mu.Lock() 56 | defer cfg.mu.Unlock() 57 | for i := 0; i < len(cfg.servers); i++ { 58 | if cfg.servers[i] != nil { 59 | cfg.servers[i].Kill() 60 | } 61 | } 62 | cfg.net.Cleanup() 63 | cfg.checkTimeout() 64 | } 65 | 66 | // Maximum log size across all servers 67 | func (cfg *config) LogSize() int { 68 | logsize := 0 69 | for i := 0; i < cfg.n; i++ { 70 | n := cfg.saved[i].RaftStateSize() 71 | if n > logsize { 72 | logsize = n 73 | } 74 | } 75 | return logsize 76 | } 77 | 78 | // attach server i to servers listed in to 79 | // caller must hold cfg.mu 80 | func (cfg *config) connectUnlocked(i int, to []int) { 81 | // log.Printf("connect peer %d to %v\n", i, to) 82 | 83 | // outgoing socket files 84 | for j := 0; j < len(to); j++ { 85 | endname := cfg.endnames[i][to[j]] 86 | cfg.net.Enable(endname, true) 87 | } 88 | 89 | // incoming socket files 90 | for j := 0; j < len(to); j++ { 91 | endname := cfg.endnames[to[j]][i] 92 | cfg.net.Enable(endname, true) 93 | } 94 | } 95 | 96 | func (cfg *config) connect(i int, to []int) { 97 | cfg.mu.Lock() 98 | defer cfg.mu.Unlock() 99 | cfg.connectUnlocked(i, to) 100 | } 101 
| 102 | // detach server i from the servers listed in from 103 | // caller must hold cfg.mu 104 | func (cfg *config) disconnectUnlocked(i int, from []int) { 105 | // log.Printf("disconnect peer %d from %v\n", i, from) 106 | 107 | // outgoing socket files 108 | for j := 0; j < len(from); j++ { 109 | if cfg.endnames[i] != nil { 110 | endname := cfg.endnames[i][from[j]] 111 | cfg.net.Enable(endname, false) 112 | } 113 | } 114 | 115 | // incoming socket files 116 | for j := 0; j < len(from); j++ { 117 | if cfg.endnames[j] != nil { 118 | endname := cfg.endnames[from[j]][i] 119 | cfg.net.Enable(endname, false) 120 | } 121 | } 122 | } 123 | 124 | func (cfg *config) disconnect(i int, from []int) { 125 | cfg.mu.Lock() 126 | defer cfg.mu.Unlock() 127 | cfg.disconnectUnlocked(i, from) 128 | } 129 | 130 | func (cfg *config) All() []int { 131 | all := make([]int, cfg.n) 132 | for i := 0; i < cfg.n; i++ { 133 | all[i] = i 134 | } 135 | return all 136 | } 137 | 138 | func (cfg *config) ConnectAll() { 139 | cfg.mu.Lock() 140 | defer cfg.mu.Unlock() 141 | for i := 0; i < cfg.n; i++ { 142 | cfg.connectUnlocked(i, cfg.All()) 143 | } 144 | } 145 | 146 | // Sets up 2 partitions with connectivity between servers in each partition. 147 | func (cfg *config) partition(p1 []int, p2 []int) { 148 | cfg.mu.Lock() 149 | defer cfg.mu.Unlock() 150 | // log.Printf("partition servers into: %v %v\n", p1, p2) 151 | for i := 0; i < len(p1); i++ { 152 | cfg.disconnectUnlocked(p1[i], p2) 153 | cfg.connectUnlocked(p1[i], p1) 154 | } 155 | for i := 0; i < len(p2); i++ { 156 | cfg.disconnectUnlocked(p2[i], p1) 157 | cfg.connectUnlocked(p2[i], p2) 158 | } 159 | } 160 | 161 | // Create a clerk with clerk specific server names. 162 | // Give it connections to all of the servers, but for 163 | // now enable only connections to servers in to[]. 164 | func (cfg *config) makeClient(to []int) *Clerk { 165 | cfg.mu.Lock() 166 | defer cfg.mu.Unlock() 167 | 168 | // a fresh set of ClientEnds. 
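	// The endpoint names are random strings rather than fixed ids so that each
	// clerk gets its own private set of endpoints; ConnectClientUnlocked and
	// DisconnectClientUnlocked below can then enable or disable this one
	// clerk's connectivity without affecting other clients or the servers.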
169 | ends := make([]*labrpc.ClientEnd, cfg.n) 170 | endnames := make([]string, cfg.n) 171 | for j := 0; j < cfg.n; j++ { 172 | endnames[j] = randstring(20) 173 | ends[j] = cfg.net.MakeEnd(endnames[j]) 174 | cfg.net.Connect(endnames[j], j) 175 | } 176 | 177 | ck := MakeClerk(random_handles(ends)) 178 | cfg.clerks[ck] = endnames 179 | cfg.nextClientId++ 180 | cfg.ConnectClientUnlocked(ck, to) 181 | return ck 182 | } 183 | 184 | func (cfg *config) deleteClient(ck *Clerk) { 185 | cfg.mu.Lock() 186 | defer cfg.mu.Unlock() 187 | 188 | v := cfg.clerks[ck] 189 | for i := 0; i < len(v); i++ { 190 | os.Remove(v[i]) 191 | } 192 | delete(cfg.clerks, ck) 193 | } 194 | 195 | // caller should hold cfg.mu 196 | func (cfg *config) ConnectClientUnlocked(ck *Clerk, to []int) { 197 | // log.Printf("ConnectClient %v to %v\n", ck, to) 198 | endnames := cfg.clerks[ck] 199 | for j := 0; j < len(to); j++ { 200 | s := endnames[to[j]] 201 | cfg.net.Enable(s, true) 202 | } 203 | } 204 | 205 | func (cfg *config) ConnectClient(ck *Clerk, to []int) { 206 | cfg.mu.Lock() 207 | defer cfg.mu.Unlock() 208 | cfg.ConnectClientUnlocked(ck, to) 209 | } 210 | 211 | // caller should hold cfg.mu 212 | func (cfg *config) DisconnectClientUnlocked(ck *Clerk, from []int) { 213 | // log.Printf("DisconnectClient %v from %v\n", ck, from) 214 | endnames := cfg.clerks[ck] 215 | for j := 0; j < len(from); j++ { 216 | s := endnames[from[j]] 217 | cfg.net.Enable(s, false) 218 | } 219 | } 220 | 221 | func (cfg *config) DisconnectClient(ck *Clerk, from []int) { 222 | cfg.mu.Lock() 223 | defer cfg.mu.Unlock() 224 | cfg.DisconnectClientUnlocked(ck, from) 225 | } 226 | 227 | // Shutdown a server by isolating it 228 | func (cfg *config) ShutdownServer(i int) { 229 | cfg.mu.Lock() 230 | defer cfg.mu.Unlock() 231 | 232 | cfg.disconnectUnlocked(i, cfg.All()) 233 | 234 | // disable client connections to the server. 235 | // it's important to do this before creating 236 | // the new Persister in saved[i], to avoid 237 | // the possibility of the server returning a 238 | // positive reply to an Append but persisting 239 | // the result in the superseded Persister. 240 | cfg.net.DeleteServer(i) 241 | 242 | // a fresh persister, in case old instance 243 | // continues to update the Persister. 244 | // but copy old persister's content so that we always 245 | // pass Make() the last persisted state. 246 | if cfg.saved[i] != nil { 247 | cfg.saved[i] = cfg.saved[i].Copy() 248 | } 249 | 250 | kv := cfg.servers[i] 251 | if kv != nil { 252 | cfg.mu.Unlock() 253 | kv.Kill() 254 | cfg.mu.Lock() 255 | cfg.servers[i] = nil 256 | } 257 | } 258 | 259 | // If restart servers, first call ShutdownServer 260 | func (cfg *config) StartServer(i int) { 261 | cfg.mu.Lock() 262 | 263 | // a fresh set of outgoing ClientEnd names. 264 | cfg.endnames[i] = make([]string, cfg.n) 265 | for j := 0; j < cfg.n; j++ { 266 | cfg.endnames[i][j] = randstring(20) 267 | } 268 | 269 | // a fresh set of ClientEnds. 270 | ends := make([]*labrpc.ClientEnd, cfg.n) 271 | for j := 0; j < cfg.n; j++ { 272 | ends[j] = cfg.net.MakeEnd(cfg.endnames[i][j]) 273 | cfg.net.Connect(cfg.endnames[i][j], j) 274 | } 275 | 276 | // a fresh persister, so old instance doesn't overwrite 277 | // new instance's persisted state. 278 | // give the fresh persister a copy of the old persister's 279 | // state, so that the spec is that we pass StartKVServer() 280 | // the last persisted state. 
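	// In other words, a restart keeps whatever Raft state and snapshot were
	// persisted before the shutdown (via Copy), which models a crashed server
	// re-reading its disk, while ensuring the old in-memory instance can no
	// longer write into the persister handed to the new instance below.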
281 | if cfg.saved[i] != nil { 282 | cfg.saved[i] = cfg.saved[i].Copy() 283 | } else { 284 | cfg.saved[i] = raft.MakePersister() 285 | } 286 | 287 | cfg.mu.Unlock() 288 | 289 | cfg.servers[i] = StartServer(ends, i, cfg.saved[i]) 290 | 291 | kvsvc := labrpc.MakeService(cfg.servers[i]) 292 | rfsvc := labrpc.MakeService(cfg.servers[i].rf) 293 | srv := labrpc.MakeServer() 294 | srv.AddService(kvsvc) 295 | srv.AddService(rfsvc) 296 | cfg.net.AddServer(i, srv) 297 | } 298 | 299 | func (cfg *config) Leader() (bool, int) { 300 | cfg.mu.Lock() 301 | defer cfg.mu.Unlock() 302 | 303 | for i := 0; i < cfg.n; i++ { 304 | if cfg.servers[i] != nil { 305 | _, is_leader := cfg.servers[i].rf.GetState() 306 | if is_leader { 307 | return true, i 308 | } 309 | } 310 | } 311 | return false, 0 312 | } 313 | 314 | // Partition servers into 2 groups and put current leader in minority 315 | func (cfg *config) make_partition() ([]int, []int) { 316 | _, l := cfg.Leader() 317 | p1 := make([]int, cfg.n/2+1) 318 | p2 := make([]int, cfg.n/2) 319 | j := 0 320 | for i := 0; i < cfg.n; i++ { 321 | if i != l { 322 | if j < len(p1) { 323 | p1[j] = i 324 | } else { 325 | p2[j-len(p1)] = i 326 | } 327 | j++ 328 | } 329 | } 330 | p2[len(p2)-1] = l 331 | return p1, p2 332 | } 333 | 334 | func make_config(t *testing.T, n int, unreliable bool) *config { 335 | runtime.GOMAXPROCS(4) 336 | cfg := &config{} 337 | cfg.t = t 338 | cfg.net = labrpc.MakeNetwork() 339 | cfg.n = n 340 | cfg.servers = make([]*ShardCtrler, cfg.n) 341 | cfg.saved = make([]*raft.Persister, cfg.n) 342 | cfg.endnames = make([][]string, cfg.n) 343 | cfg.clerks = make(map[*Clerk][]string) 344 | cfg.nextClientId = cfg.n + 1000 // client ids start 1000 above the highest serverid 345 | cfg.start = time.Now() 346 | 347 | // create a full set of KV servers. 348 | for i := 0; i < cfg.n; i++ { 349 | cfg.StartServer(i) 350 | } 351 | 352 | cfg.ConnectAll() 353 | 354 | cfg.net.Reliable(!unreliable) 355 | 356 | return cfg 357 | } 358 | -------------------------------------------------------------------------------- /src/shardctrler/server.go: -------------------------------------------------------------------------------- 1 | package shardctrler 2 | 3 | import "mit6.824/raft" 4 | import "mit6.824/labrpc" 5 | import "sync" 6 | import "mit6.824/labgob" 7 | 8 | type ShardCtrler struct { 9 | mu sync.Mutex 10 | me int 11 | rf *raft.Raft 12 | applyCh chan raft.ApplyMsg 13 | 14 | // Your data here. 15 | 16 | configs []Config // indexed by config num 17 | } 18 | 19 | type Op struct { 20 | // Your data here. 21 | } 22 | 23 | func (sc *ShardCtrler) Join(args *JoinArgs, reply *JoinReply) { 24 | // Your code here. 25 | } 26 | 27 | func (sc *ShardCtrler) Leave(args *LeaveArgs, reply *LeaveReply) { 28 | // Your code here. 29 | } 30 | 31 | func (sc *ShardCtrler) Move(args *MoveArgs, reply *MoveReply) { 32 | // Your code here. 33 | } 34 | 35 | func (sc *ShardCtrler) Query(args *QueryArgs, reply *QueryReply) { 36 | // Your code here. 37 | } 38 | 39 | // the tester calls Kill() when a ShardCtrler instance won't 40 | // be needed again. you are not required to do anything 41 | // in Kill(), but it might be convenient to (for example) 42 | // turn off debug output from this instance. 43 | func (sc *ShardCtrler) Kill() { 44 | sc.rf.Kill() 45 | // Your code here, if desired. 
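	// One optional pattern (a sketch, not required by the tests): keep an
	// atomic flag that long-running goroutines can poll before doing work,
	// e.g.
	//
	//	atomic.StoreInt32(&sc.dead, 1) // sc.dead would be a new int32 field
	//
	// together with a matching killed() helper, mirroring the pattern used in
	// the Raft lab code.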
46 | } 47 | 48 | // needed by shardkv tester 49 | func (sc *ShardCtrler) Raft() *raft.Raft { 50 | return sc.rf 51 | } 52 | 53 | // servers[] contains the ports of the set of 54 | // servers that will cooperate via Raft to 55 | // form the fault-tolerant shardctrler service. 56 | // me is the index of the current server in servers[]. 57 | func StartServer(servers []*labrpc.ClientEnd, me int, persister *raft.Persister) *ShardCtrler { 58 | sc := new(ShardCtrler) 59 | sc.me = me 60 | 61 | sc.configs = make([]Config, 1) 62 | sc.configs[0].Groups = map[int][]string{} 63 | 64 | labgob.Register(Op{}) 65 | sc.applyCh = make(chan raft.ApplyMsg) 66 | sc.rf = raft.Make(servers, me, persister, sc.applyCh) 67 | 68 | // Your code here. 69 | 70 | return sc 71 | } 72 | -------------------------------------------------------------------------------- /src/shardctrler/test_test.go: -------------------------------------------------------------------------------- 1 | package shardctrler 2 | 3 | import ( 4 | "fmt" 5 | "sync" 6 | "testing" 7 | "time" 8 | ) 9 | 10 | // import "time" 11 | 12 | func check(t *testing.T, groups []int, ck *Clerk) { 13 | c := ck.Query(-1) 14 | if len(c.Groups) != len(groups) { 15 | t.Fatalf("wanted %v groups, got %v", len(groups), len(c.Groups)) 16 | } 17 | 18 | // are the groups as expected? 19 | for _, g := range groups { 20 | _, ok := c.Groups[g] 21 | if ok != true { 22 | t.Fatalf("missing group %v", g) 23 | } 24 | } 25 | 26 | // any un-allocated shards? 27 | if len(groups) > 0 { 28 | for s, g := range c.Shards { 29 | _, ok := c.Groups[g] 30 | if ok == false { 31 | t.Fatalf("shard %v -> invalid group %v", s, g) 32 | } 33 | } 34 | } 35 | 36 | // more or less balanced sharding? 37 | counts := map[int]int{} 38 | for _, g := range c.Shards { 39 | counts[g] += 1 40 | } 41 | min := 257 42 | max := 0 43 | for g, _ := range c.Groups { 44 | if counts[g] > max { 45 | max = counts[g] 46 | } 47 | if counts[g] < min { 48 | min = counts[g] 49 | } 50 | } 51 | if max > min+1 { 52 | t.Fatalf("max %v too much larger than min %v", max, min) 53 | } 54 | } 55 | 56 | func check_same_config(t *testing.T, c1 Config, c2 Config) { 57 | if c1.Num != c2.Num { 58 | t.Fatalf("Num wrong") 59 | } 60 | if c1.Shards != c2.Shards { 61 | t.Fatalf("Shards wrong") 62 | } 63 | if len(c1.Groups) != len(c2.Groups) { 64 | t.Fatalf("number of Groups is wrong") 65 | } 66 | for gid, sa := range c1.Groups { 67 | sa1, ok := c2.Groups[gid] 68 | if ok == false || len(sa1) != len(sa) { 69 | t.Fatalf("len(Groups) wrong") 70 | } 71 | if ok && len(sa1) == len(sa) { 72 | for j := 0; j < len(sa); j++ { 73 | if sa[j] != sa1[j] { 74 | t.Fatalf("Groups wrong") 75 | } 76 | } 77 | } 78 | } 79 | } 80 | 81 | func TestBasic(t *testing.T) { 82 | const nservers = 3 83 | cfg := make_config(t, nservers, false) 84 | defer cfg.cleanup() 85 | 86 | ck := cfg.makeClient(cfg.All()) 87 | 88 | fmt.Printf("Test: Basic leave/join ...\n") 89 | 90 | cfa := make([]Config, 6) 91 | cfa[0] = ck.Query(-1) 92 | 93 | check(t, []int{}, ck) 94 | 95 | var gid1 int = 1 96 | ck.Join(map[int][]string{gid1: []string{"x", "y", "z"}}) 97 | check(t, []int{gid1}, ck) 98 | cfa[1] = ck.Query(-1) 99 | 100 | var gid2 int = 2 101 | ck.Join(map[int][]string{gid2: []string{"a", "b", "c"}}) 102 | check(t, []int{gid1, gid2}, ck) 103 | cfa[2] = ck.Query(-1) 104 | 105 | cfx := ck.Query(-1) 106 | sa1 := cfx.Groups[gid1] 107 | if len(sa1) != 3 || sa1[0] != "x" || sa1[1] != "y" || sa1[2] != "z" { 108 | t.Fatalf("wrong servers for gid %v: %v\n", gid1, sa1) 109 | } 110 | sa2 := cfx.Groups[gid2] 
111 | if len(sa2) != 3 || sa2[0] != "a" || sa2[1] != "b" || sa2[2] != "c" { 112 | t.Fatalf("wrong servers for gid %v: %v\n", gid2, sa2) 113 | } 114 | 115 | ck.Leave([]int{gid1}) 116 | check(t, []int{gid2}, ck) 117 | cfa[4] = ck.Query(-1) 118 | 119 | ck.Leave([]int{gid2}) 120 | cfa[5] = ck.Query(-1) 121 | 122 | fmt.Printf(" ... Passed\n") 123 | 124 | fmt.Printf("Test: Historical queries ...\n") 125 | 126 | for s := 0; s < nservers; s++ { 127 | cfg.ShutdownServer(s) 128 | for i := 0; i < len(cfa); i++ { 129 | c := ck.Query(cfa[i].Num) 130 | check_same_config(t, c, cfa[i]) 131 | } 132 | cfg.StartServer(s) 133 | cfg.ConnectAll() 134 | } 135 | 136 | fmt.Printf(" ... Passed\n") 137 | 138 | fmt.Printf("Test: Move ...\n") 139 | { 140 | var gid3 int = 503 141 | ck.Join(map[int][]string{gid3: []string{"3a", "3b", "3c"}}) 142 | var gid4 int = 504 143 | ck.Join(map[int][]string{gid4: []string{"4a", "4b", "4c"}}) 144 | for i := 0; i < NShards; i++ { 145 | cf := ck.Query(-1) 146 | if i < NShards/2 { 147 | ck.Move(i, gid3) 148 | if cf.Shards[i] != gid3 { 149 | cf1 := ck.Query(-1) 150 | if cf1.Num <= cf.Num { 151 | t.Fatalf("Move should increase Config.Num") 152 | } 153 | } 154 | } else { 155 | ck.Move(i, gid4) 156 | if cf.Shards[i] != gid4 { 157 | cf1 := ck.Query(-1) 158 | if cf1.Num <= cf.Num { 159 | t.Fatalf("Move should increase Config.Num") 160 | } 161 | } 162 | } 163 | } 164 | cf2 := ck.Query(-1) 165 | for i := 0; i < NShards; i++ { 166 | if i < NShards/2 { 167 | if cf2.Shards[i] != gid3 { 168 | t.Fatalf("expected shard %v on gid %v actually %v", 169 | i, gid3, cf2.Shards[i]) 170 | } 171 | } else { 172 | if cf2.Shards[i] != gid4 { 173 | t.Fatalf("expected shard %v on gid %v actually %v", 174 | i, gid4, cf2.Shards[i]) 175 | } 176 | } 177 | } 178 | ck.Leave([]int{gid3}) 179 | ck.Leave([]int{gid4}) 180 | } 181 | fmt.Printf(" ... Passed\n") 182 | 183 | fmt.Printf("Test: Concurrent leave/join ...\n") 184 | 185 | const npara = 10 186 | var cka [npara]*Clerk 187 | for i := 0; i < len(cka); i++ { 188 | cka[i] = cfg.makeClient(cfg.All()) 189 | } 190 | gids := make([]int, npara) 191 | ch := make(chan bool) 192 | for xi := 0; xi < npara; xi++ { 193 | gids[xi] = int((xi * 10) + 100) 194 | go func(i int) { 195 | defer func() { ch <- true }() 196 | var gid int = gids[i] 197 | var sid1 = fmt.Sprintf("s%da", gid) 198 | var sid2 = fmt.Sprintf("s%db", gid) 199 | cka[i].Join(map[int][]string{gid + 1000: []string{sid1}}) 200 | cka[i].Join(map[int][]string{gid: []string{sid2}}) 201 | cka[i].Leave([]int{gid + 1000}) 202 | }(xi) 203 | } 204 | for i := 0; i < npara; i++ { 205 | <-ch 206 | } 207 | check(t, gids, ck) 208 | 209 | fmt.Printf(" ... Passed\n") 210 | 211 | fmt.Printf("Test: Minimal transfers after joins ...\n") 212 | 213 | c1 := ck.Query(-1) 214 | for i := 0; i < 5; i++ { 215 | var gid = int(npara + 1 + i) 216 | ck.Join(map[int][]string{gid: []string{ 217 | fmt.Sprintf("%da", gid), 218 | fmt.Sprintf("%db", gid), 219 | fmt.Sprintf("%db", gid)}}) 220 | } 221 | c2 := ck.Query(-1) 222 | for i := int(1); i <= npara; i++ { 223 | for j := 0; j < len(c1.Shards); j++ { 224 | if c2.Shards[j] == i { 225 | if c1.Shards[j] != i { 226 | t.Fatalf("non-minimal transfer after Join()s") 227 | } 228 | } 229 | } 230 | } 231 | 232 | fmt.Printf(" ... 
Passed\n") 233 | 234 | fmt.Printf("Test: Minimal transfers after leaves ...\n") 235 | 236 | for i := 0; i < 5; i++ { 237 | ck.Leave([]int{int(npara + 1 + i)}) 238 | } 239 | c3 := ck.Query(-1) 240 | for i := int(1); i <= npara; i++ { 241 | for j := 0; j < len(c1.Shards); j++ { 242 | if c2.Shards[j] == i { 243 | if c3.Shards[j] != i { 244 | t.Fatalf("non-minimal transfer after Leave()s") 245 | } 246 | } 247 | } 248 | } 249 | 250 | fmt.Printf(" ... Passed\n") 251 | } 252 | 253 | func TestMulti(t *testing.T) { 254 | const nservers = 3 255 | cfg := make_config(t, nservers, false) 256 | defer cfg.cleanup() 257 | 258 | ck := cfg.makeClient(cfg.All()) 259 | 260 | fmt.Printf("Test: Multi-group join/leave ...\n") 261 | 262 | cfa := make([]Config, 6) 263 | cfa[0] = ck.Query(-1) 264 | 265 | check(t, []int{}, ck) 266 | 267 | var gid1 int = 1 268 | var gid2 int = 2 269 | ck.Join(map[int][]string{ 270 | gid1: []string{"x", "y", "z"}, 271 | gid2: []string{"a", "b", "c"}, 272 | }) 273 | check(t, []int{gid1, gid2}, ck) 274 | cfa[1] = ck.Query(-1) 275 | 276 | var gid3 int = 3 277 | ck.Join(map[int][]string{gid3: []string{"j", "k", "l"}}) 278 | check(t, []int{gid1, gid2, gid3}, ck) 279 | cfa[2] = ck.Query(-1) 280 | 281 | cfx := ck.Query(-1) 282 | sa1 := cfx.Groups[gid1] 283 | if len(sa1) != 3 || sa1[0] != "x" || sa1[1] != "y" || sa1[2] != "z" { 284 | t.Fatalf("wrong servers for gid %v: %v\n", gid1, sa1) 285 | } 286 | sa2 := cfx.Groups[gid2] 287 | if len(sa2) != 3 || sa2[0] != "a" || sa2[1] != "b" || sa2[2] != "c" { 288 | t.Fatalf("wrong servers for gid %v: %v\n", gid2, sa2) 289 | } 290 | sa3 := cfx.Groups[gid3] 291 | if len(sa3) != 3 || sa3[0] != "j" || sa3[1] != "k" || sa3[2] != "l" { 292 | t.Fatalf("wrong servers for gid %v: %v\n", gid3, sa3) 293 | } 294 | 295 | ck.Leave([]int{gid1, gid3}) 296 | check(t, []int{gid2}, ck) 297 | cfa[3] = ck.Query(-1) 298 | 299 | cfx = ck.Query(-1) 300 | sa2 = cfx.Groups[gid2] 301 | if len(sa2) != 3 || sa2[0] != "a" || sa2[1] != "b" || sa2[2] != "c" { 302 | t.Fatalf("wrong servers for gid %v: %v\n", gid2, sa2) 303 | } 304 | 305 | ck.Leave([]int{gid2}) 306 | 307 | fmt.Printf(" ... Passed\n") 308 | 309 | fmt.Printf("Test: Concurrent multi leave/join ...\n") 310 | 311 | const npara = 10 312 | var cka [npara]*Clerk 313 | for i := 0; i < len(cka); i++ { 314 | cka[i] = cfg.makeClient(cfg.All()) 315 | } 316 | gids := make([]int, npara) 317 | var wg sync.WaitGroup 318 | for xi := 0; xi < npara; xi++ { 319 | wg.Add(1) 320 | gids[xi] = int(xi + 1000) 321 | go func(i int) { 322 | defer wg.Done() 323 | var gid int = gids[i] 324 | cka[i].Join(map[int][]string{ 325 | gid: []string{ 326 | fmt.Sprintf("%da", gid), 327 | fmt.Sprintf("%db", gid), 328 | fmt.Sprintf("%dc", gid)}, 329 | gid + 1000: []string{fmt.Sprintf("%da", gid+1000)}, 330 | gid + 2000: []string{fmt.Sprintf("%da", gid+2000)}, 331 | }) 332 | cka[i].Leave([]int{gid + 1000, gid + 2000}) 333 | }(xi) 334 | } 335 | wg.Wait() 336 | check(t, gids, ck) 337 | 338 | fmt.Printf(" ... 
Passed\n") 339 | 340 | fmt.Printf("Test: Minimal transfers after multijoins ...\n") 341 | 342 | c1 := ck.Query(-1) 343 | m := make(map[int][]string) 344 | for i := 0; i < 5; i++ { 345 | var gid = npara + 1 + i 346 | m[gid] = []string{fmt.Sprintf("%da", gid), fmt.Sprintf("%db", gid)} 347 | } 348 | ck.Join(m) 349 | c2 := ck.Query(-1) 350 | for i := int(1); i <= npara; i++ { 351 | for j := 0; j < len(c1.Shards); j++ { 352 | if c2.Shards[j] == i { 353 | if c1.Shards[j] != i { 354 | t.Fatalf("non-minimal transfer after Join()s") 355 | } 356 | } 357 | } 358 | } 359 | 360 | fmt.Printf(" ... Passed\n") 361 | 362 | fmt.Printf("Test: Minimal transfers after multileaves ...\n") 363 | 364 | var l []int 365 | for i := 0; i < 5; i++ { 366 | l = append(l, npara+1+i) 367 | } 368 | ck.Leave(l) 369 | c3 := ck.Query(-1) 370 | for i := int(1); i <= npara; i++ { 371 | for j := 0; j < len(c1.Shards); j++ { 372 | if c2.Shards[j] == i { 373 | if c3.Shards[j] != i { 374 | t.Fatalf("non-minimal transfer after Leave()s") 375 | } 376 | } 377 | } 378 | } 379 | 380 | fmt.Printf(" ... Passed\n") 381 | 382 | fmt.Printf("Test: Check Same config on servers ...\n") 383 | 384 | isLeader, leader := cfg.Leader() 385 | if !isLeader { 386 | t.Fatalf("Leader not found") 387 | } 388 | c := ck.Query(-1) // Config leader claims 389 | 390 | cfg.ShutdownServer(leader) 391 | 392 | attempts := 0 393 | for isLeader, leader = cfg.Leader(); isLeader; time.Sleep(1 * time.Second) { 394 | if attempts++; attempts >= 3 { 395 | t.Fatalf("Leader not found") 396 | } 397 | } 398 | 399 | c1 = ck.Query(-1) 400 | check_same_config(t, c, c1) 401 | 402 | fmt.Printf(" ... Passed\n") 403 | } 404 | -------------------------------------------------------------------------------- /src/shardkv/client.go: -------------------------------------------------------------------------------- 1 | package shardkv 2 | 3 | // 4 | // client code to talk to a sharded key/value service. 5 | // 6 | // the client first talks to the shardctrler to find out 7 | // the assignment of shards (keys) to groups, and then 8 | // talks to the group that holds the key's shard. 9 | // 10 | 11 | import "mit6.824/labrpc" 12 | import "crypto/rand" 13 | import "math/big" 14 | import "mit6.824/shardctrler" 15 | import "time" 16 | 17 | // which shard is a key in? 18 | // please use this function, 19 | // and please do not change it. 20 | func key2shard(key string) int { 21 | shard := 0 22 | if len(key) > 0 { 23 | shard = int(key[0]) 24 | } 25 | shard %= shardctrler.NShards 26 | return shard 27 | } 28 | 29 | func nrand() int64 { 30 | max := big.NewInt(int64(1) << 62) 31 | bigx, _ := rand.Int(rand.Reader, max) 32 | x := bigx.Int64() 33 | return x 34 | } 35 | 36 | type Clerk struct { 37 | sm *shardctrler.Clerk 38 | config shardctrler.Config 39 | make_end func(string) *labrpc.ClientEnd 40 | // You will have to modify this struct. 41 | } 42 | 43 | // the tester calls MakeClerk. 44 | // 45 | // ctrlers[] is needed to call shardctrler.MakeClerk(). 46 | // 47 | // make_end(servername) turns a server name from a 48 | // Config.Groups[gid][i] into a labrpc.ClientEnd on which you can 49 | // send RPCs. 50 | func MakeClerk(ctrlers []*labrpc.ClientEnd, make_end func(string) *labrpc.ClientEnd) *Clerk { 51 | ck := new(Clerk) 52 | ck.sm = shardctrler.MakeClerk(ctrlers) 53 | ck.make_end = make_end 54 | // You'll have to add code here. 55 | return ck 56 | } 57 | 58 | // fetch the current value for a key. 59 | // returns "" if the key does not exist. 60 | // keeps trying forever in the face of all other errors. 
61 | // You will have to modify this function. 62 | func (ck *Clerk) Get(key string) string { 63 | args := GetArgs{} 64 | args.Key = key 65 | 66 | for { 67 | shard := key2shard(key) 68 | gid := ck.config.Shards[shard] 69 | if servers, ok := ck.config.Groups[gid]; ok { 70 | // try each server for the shard. 71 | for si := 0; si < len(servers); si++ { 72 | srv := ck.make_end(servers[si]) 73 | var reply GetReply 74 | ok := srv.Call("ShardKV.Get", &args, &reply) 75 | if ok && (reply.Err == OK || reply.Err == ErrNoKey) { 76 | return reply.Value 77 | } 78 | if ok && (reply.Err == ErrWrongGroup) { 79 | break 80 | } 81 | // ... not ok, or ErrWrongLeader 82 | } 83 | } 84 | time.Sleep(100 * time.Millisecond) 85 | // ask controler for the latest configuration. 86 | ck.config = ck.sm.Query(-1) 87 | } 88 | 89 | return "" 90 | } 91 | 92 | // shared by Put and Append. 93 | // You will have to modify this function. 94 | func (ck *Clerk) PutAppend(key string, value string, op string) { 95 | args := PutAppendArgs{} 96 | args.Key = key 97 | args.Value = value 98 | args.Op = op 99 | 100 | for { 101 | shard := key2shard(key) 102 | gid := ck.config.Shards[shard] 103 | if servers, ok := ck.config.Groups[gid]; ok { 104 | for si := 0; si < len(servers); si++ { 105 | srv := ck.make_end(servers[si]) 106 | var reply PutAppendReply 107 | ok := srv.Call("ShardKV.PutAppend", &args, &reply) 108 | if ok && reply.Err == OK { 109 | return 110 | } 111 | if ok && reply.Err == ErrWrongGroup { 112 | break 113 | } 114 | // ... not ok, or ErrWrongLeader 115 | } 116 | } 117 | time.Sleep(100 * time.Millisecond) 118 | // ask controler for the latest configuration. 119 | ck.config = ck.sm.Query(-1) 120 | } 121 | } 122 | 123 | func (ck *Clerk) Put(key string, value string) { 124 | ck.PutAppend(key, value, "Put") 125 | } 126 | func (ck *Clerk) Append(key string, value string) { 127 | ck.PutAppend(key, value, "Append") 128 | } 129 | -------------------------------------------------------------------------------- /src/shardkv/common.go: -------------------------------------------------------------------------------- 1 | package shardkv 2 | 3 | // 4 | // Sharded key/value server. 5 | // Lots of replica groups, each running Raft. 6 | // Shardctrler decides which group serves each shard. 7 | // Shardctrler may change shard assignment from time to time. 8 | // 9 | // You will have to modify these definitions. 10 | // 11 | 12 | const ( 13 | OK = "OK" 14 | ErrNoKey = "ErrNoKey" 15 | ErrWrongGroup = "ErrWrongGroup" 16 | ErrWrongLeader = "ErrWrongLeader" 17 | ) 18 | 19 | type Err string 20 | 21 | // Put or Append 22 | type PutAppendArgs struct { 23 | // You'll have to add definitions here. 24 | Key string 25 | Value string 26 | Op string // "Put" or "Append" 27 | // You'll have to add definitions here. 28 | // Field names must start with capital letters, 29 | // otherwise RPC will break. 30 | } 31 | 32 | type PutAppendReply struct { 33 | Err Err 34 | } 35 | 36 | type GetArgs struct { 37 | Key string 38 | // You'll have to add definitions here. 
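	// For example (hypothetical names, not part of the skeleton), duplicate
	// detection is usually built on fields like:
	//
	//	ClientId int64 // which clerk sent the request
	//	SeqNum   int64 // per-clerk sequence number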
39 | } 40 | 41 | type GetReply struct { 42 | Err Err 43 | Value string 44 | } 45 | -------------------------------------------------------------------------------- /src/shardkv/config.go: -------------------------------------------------------------------------------- 1 | package shardkv 2 | 3 | import "mit6.824/shardctrler" 4 | import "mit6.824/labrpc" 5 | import "testing" 6 | import "os" 7 | 8 | // import "log" 9 | import crand "crypto/rand" 10 | import "math/big" 11 | import "math/rand" 12 | import "encoding/base64" 13 | import "sync" 14 | import "runtime" 15 | import "mit6.824/raft" 16 | import "strconv" 17 | import "fmt" 18 | import "time" 19 | 20 | func randstring(n int) string { 21 | b := make([]byte, 2*n) 22 | crand.Read(b) 23 | s := base64.URLEncoding.EncodeToString(b) 24 | return s[0:n] 25 | } 26 | 27 | func makeSeed() int64 { 28 | max := big.NewInt(int64(1) << 62) 29 | bigx, _ := crand.Int(crand.Reader, max) 30 | x := bigx.Int64() 31 | return x 32 | } 33 | 34 | // Randomize server handles 35 | func random_handles(kvh []*labrpc.ClientEnd) []*labrpc.ClientEnd { 36 | sa := make([]*labrpc.ClientEnd, len(kvh)) 37 | copy(sa, kvh) 38 | for i := range sa { 39 | j := rand.Intn(i + 1) 40 | sa[i], sa[j] = sa[j], sa[i] 41 | } 42 | return sa 43 | } 44 | 45 | type group struct { 46 | gid int 47 | servers []*ShardKV 48 | saved []*raft.Persister 49 | endnames [][]string 50 | mendnames [][]string 51 | } 52 | 53 | type config struct { 54 | mu sync.Mutex 55 | t *testing.T 56 | net *labrpc.Network 57 | start time.Time // time at which make_config() was called 58 | 59 | nctrlers int 60 | ctrlerservers []*shardctrler.ShardCtrler 61 | mck *shardctrler.Clerk 62 | 63 | ngroups int 64 | n int // servers per k/v group 65 | groups []*group 66 | 67 | clerks map[*Clerk][]string 68 | nextClientId int 69 | maxraftstate int 70 | } 71 | 72 | func (cfg *config) checkTimeout() { 73 | // enforce a two minute real-time limit on each test 74 | if !cfg.t.Failed() && time.Since(cfg.start) > 120*time.Second { 75 | cfg.t.Fatal("test took longer than 120 seconds") 76 | } 77 | } 78 | 79 | func (cfg *config) cleanup() { 80 | for gi := 0; gi < cfg.ngroups; gi++ { 81 | cfg.ShutdownGroup(gi) 82 | } 83 | for i := 0; i < cfg.nctrlers; i++ { 84 | cfg.ctrlerservers[i].Kill() 85 | } 86 | cfg.net.Cleanup() 87 | cfg.checkTimeout() 88 | } 89 | 90 | // check that no server's log is too big. 91 | func (cfg *config) checklogs() { 92 | for gi := 0; gi < cfg.ngroups; gi++ { 93 | for i := 0; i < cfg.n; i++ { 94 | raft := cfg.groups[gi].saved[i].RaftStateSize() 95 | snap := len(cfg.groups[gi].saved[i].ReadSnapshot()) 96 | if cfg.maxraftstate >= 0 && raft > 8*cfg.maxraftstate { 97 | cfg.t.Fatalf("persister.RaftStateSize() %v, but maxraftstate %v", 98 | raft, cfg.maxraftstate) 99 | } 100 | if cfg.maxraftstate < 0 && snap > 0 { 101 | cfg.t.Fatalf("maxraftstate is -1, but snapshot is non-empty!") 102 | } 103 | } 104 | } 105 | } 106 | 107 | // controler server name for labrpc. 108 | func (cfg *config) ctrlername(i int) string { 109 | return "ctrler" + strconv.Itoa(i) 110 | } 111 | 112 | // shard server name for labrpc. 113 | // i'th server of group gid. 114 | func (cfg *config) servername(gid int, i int) string { 115 | return "server-" + strconv.Itoa(gid) + "-" + strconv.Itoa(i) 116 | } 117 | 118 | func (cfg *config) makeClient() *Clerk { 119 | cfg.mu.Lock() 120 | defer cfg.mu.Unlock() 121 | 122 | // ClientEnds to talk to controler service. 
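	// Only the shardctrler endpoints are created eagerly here; endpoints for
	// the k/v replica groups are created on demand by the make_end closure
	// passed to MakeClerk below, one per server name listed in a
	// Config.Groups entry.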
123 | ends := make([]*labrpc.ClientEnd, cfg.nctrlers) 124 | endnames := make([]string, cfg.n) 125 | for j := 0; j < cfg.nctrlers; j++ { 126 | endnames[j] = randstring(20) 127 | ends[j] = cfg.net.MakeEnd(endnames[j]) 128 | cfg.net.Connect(endnames[j], cfg.ctrlername(j)) 129 | cfg.net.Enable(endnames[j], true) 130 | } 131 | 132 | ck := MakeClerk(ends, func(servername string) *labrpc.ClientEnd { 133 | name := randstring(20) 134 | end := cfg.net.MakeEnd(name) 135 | cfg.net.Connect(name, servername) 136 | cfg.net.Enable(name, true) 137 | return end 138 | }) 139 | cfg.clerks[ck] = endnames 140 | cfg.nextClientId++ 141 | return ck 142 | } 143 | 144 | func (cfg *config) deleteClient(ck *Clerk) { 145 | cfg.mu.Lock() 146 | defer cfg.mu.Unlock() 147 | 148 | v := cfg.clerks[ck] 149 | for i := 0; i < len(v); i++ { 150 | os.Remove(v[i]) 151 | } 152 | delete(cfg.clerks, ck) 153 | } 154 | 155 | // Shutdown i'th server of gi'th group, by isolating it 156 | func (cfg *config) ShutdownServer(gi int, i int) { 157 | cfg.mu.Lock() 158 | defer cfg.mu.Unlock() 159 | 160 | gg := cfg.groups[gi] 161 | 162 | // prevent this server from sending 163 | for j := 0; j < len(gg.servers); j++ { 164 | name := gg.endnames[i][j] 165 | cfg.net.Enable(name, false) 166 | } 167 | for j := 0; j < len(gg.mendnames[i]); j++ { 168 | name := gg.mendnames[i][j] 169 | cfg.net.Enable(name, false) 170 | } 171 | 172 | // disable client connections to the server. 173 | // it's important to do this before creating 174 | // the new Persister in saved[i], to avoid 175 | // the possibility of the server returning a 176 | // positive reply to an Append but persisting 177 | // the result in the superseded Persister. 178 | cfg.net.DeleteServer(cfg.servername(gg.gid, i)) 179 | 180 | // a fresh persister, in case old instance 181 | // continues to update the Persister. 182 | // but copy old persister's content so that we always 183 | // pass Make() the last persisted state. 184 | if gg.saved[i] != nil { 185 | gg.saved[i] = gg.saved[i].Copy() 186 | } 187 | 188 | kv := gg.servers[i] 189 | if kv != nil { 190 | cfg.mu.Unlock() 191 | kv.Kill() 192 | cfg.mu.Lock() 193 | gg.servers[i] = nil 194 | } 195 | } 196 | 197 | func (cfg *config) ShutdownGroup(gi int) { 198 | for i := 0; i < cfg.n; i++ { 199 | cfg.ShutdownServer(gi, i) 200 | } 201 | } 202 | 203 | // start i'th server in gi'th group 204 | func (cfg *config) StartServer(gi int, i int) { 205 | cfg.mu.Lock() 206 | 207 | gg := cfg.groups[gi] 208 | 209 | // a fresh set of outgoing ClientEnd names 210 | // to talk to other servers in this group. 211 | gg.endnames[i] = make([]string, cfg.n) 212 | for j := 0; j < cfg.n; j++ { 213 | gg.endnames[i][j] = randstring(20) 214 | } 215 | 216 | // and the connections to other servers in this group. 217 | ends := make([]*labrpc.ClientEnd, cfg.n) 218 | for j := 0; j < cfg.n; j++ { 219 | ends[j] = cfg.net.MakeEnd(gg.endnames[i][j]) 220 | cfg.net.Connect(gg.endnames[i][j], cfg.servername(gg.gid, j)) 221 | cfg.net.Enable(gg.endnames[i][j], true) 222 | } 223 | 224 | // ends to talk to shardctrler service 225 | mends := make([]*labrpc.ClientEnd, cfg.nctrlers) 226 | gg.mendnames[i] = make([]string, cfg.nctrlers) 227 | for j := 0; j < cfg.nctrlers; j++ { 228 | gg.mendnames[i][j] = randstring(20) 229 | mends[j] = cfg.net.MakeEnd(gg.mendnames[i][j]) 230 | cfg.net.Connect(gg.mendnames[i][j], cfg.ctrlername(j)) 231 | cfg.net.Enable(gg.mendnames[i][j], true) 232 | } 233 | 234 | // a fresh persister, so old instance doesn't overwrite 235 | // new instance's persisted state. 
236 | // give the fresh persister a copy of the old persister's 237 | // state, so that the spec is that we pass StartKVServer() 238 | // the last persisted state. 239 | if gg.saved[i] != nil { 240 | gg.saved[i] = gg.saved[i].Copy() 241 | } else { 242 | gg.saved[i] = raft.MakePersister() 243 | } 244 | cfg.mu.Unlock() 245 | 246 | gg.servers[i] = StartServer(ends, i, gg.saved[i], cfg.maxraftstate, 247 | gg.gid, mends, 248 | func(servername string) *labrpc.ClientEnd { 249 | name := randstring(20) 250 | end := cfg.net.MakeEnd(name) 251 | cfg.net.Connect(name, servername) 252 | cfg.net.Enable(name, true) 253 | return end 254 | }) 255 | 256 | kvsvc := labrpc.MakeService(gg.servers[i]) 257 | rfsvc := labrpc.MakeService(gg.servers[i].rf) 258 | srv := labrpc.MakeServer() 259 | srv.AddService(kvsvc) 260 | srv.AddService(rfsvc) 261 | cfg.net.AddServer(cfg.servername(gg.gid, i), srv) 262 | } 263 | 264 | func (cfg *config) StartGroup(gi int) { 265 | for i := 0; i < cfg.n; i++ { 266 | cfg.StartServer(gi, i) 267 | } 268 | } 269 | 270 | func (cfg *config) StartCtrlerserver(i int) { 271 | // ClientEnds to talk to other controler replicas. 272 | ends := make([]*labrpc.ClientEnd, cfg.nctrlers) 273 | for j := 0; j < cfg.nctrlers; j++ { 274 | endname := randstring(20) 275 | ends[j] = cfg.net.MakeEnd(endname) 276 | cfg.net.Connect(endname, cfg.ctrlername(j)) 277 | cfg.net.Enable(endname, true) 278 | } 279 | 280 | p := raft.MakePersister() 281 | 282 | cfg.ctrlerservers[i] = shardctrler.StartServer(ends, i, p) 283 | 284 | msvc := labrpc.MakeService(cfg.ctrlerservers[i]) 285 | rfsvc := labrpc.MakeService(cfg.ctrlerservers[i].Raft()) 286 | srv := labrpc.MakeServer() 287 | srv.AddService(msvc) 288 | srv.AddService(rfsvc) 289 | cfg.net.AddServer(cfg.ctrlername(i), srv) 290 | } 291 | 292 | func (cfg *config) shardclerk() *shardctrler.Clerk { 293 | // ClientEnds to talk to ctrler service. 294 | ends := make([]*labrpc.ClientEnd, cfg.nctrlers) 295 | for j := 0; j < cfg.nctrlers; j++ { 296 | name := randstring(20) 297 | ends[j] = cfg.net.MakeEnd(name) 298 | cfg.net.Connect(name, cfg.ctrlername(j)) 299 | cfg.net.Enable(name, true) 300 | } 301 | 302 | return shardctrler.MakeClerk(ends) 303 | } 304 | 305 | // tell the shardctrler that a group is joining. 306 | func (cfg *config) join(gi int) { 307 | cfg.joinm([]int{gi}) 308 | } 309 | 310 | func (cfg *config) joinm(gis []int) { 311 | m := make(map[int][]string, len(gis)) 312 | for _, g := range gis { 313 | gid := cfg.groups[g].gid 314 | servernames := make([]string, cfg.n) 315 | for i := 0; i < cfg.n; i++ { 316 | servernames[i] = cfg.servername(gid, i) 317 | } 318 | m[gid] = servernames 319 | } 320 | cfg.mck.Join(m) 321 | } 322 | 323 | // tell the shardctrler that a group is leaving. 
324 | func (cfg *config) leave(gi int) { 325 | cfg.leavem([]int{gi}) 326 | } 327 | 328 | func (cfg *config) leavem(gis []int) { 329 | gids := make([]int, 0, len(gis)) 330 | for _, g := range gis { 331 | gids = append(gids, cfg.groups[g].gid) 332 | } 333 | cfg.mck.Leave(gids) 334 | } 335 | 336 | var ncpu_once sync.Once 337 | 338 | func make_config(t *testing.T, n int, unreliable bool, maxraftstate int) *config { 339 | ncpu_once.Do(func() { 340 | if runtime.NumCPU() < 2 { 341 | fmt.Printf("warning: only one CPU, which may conceal locking bugs\n") 342 | } 343 | rand.Seed(makeSeed()) 344 | }) 345 | runtime.GOMAXPROCS(4) 346 | cfg := &config{} 347 | cfg.t = t 348 | cfg.maxraftstate = maxraftstate 349 | cfg.net = labrpc.MakeNetwork() 350 | cfg.start = time.Now() 351 | 352 | // controler 353 | cfg.nctrlers = 3 354 | cfg.ctrlerservers = make([]*shardctrler.ShardCtrler, cfg.nctrlers) 355 | for i := 0; i < cfg.nctrlers; i++ { 356 | cfg.StartCtrlerserver(i) 357 | } 358 | cfg.mck = cfg.shardclerk() 359 | 360 | cfg.ngroups = 3 361 | cfg.groups = make([]*group, cfg.ngroups) 362 | cfg.n = n 363 | for gi := 0; gi < cfg.ngroups; gi++ { 364 | gg := &group{} 365 | cfg.groups[gi] = gg 366 | gg.gid = 100 + gi 367 | gg.servers = make([]*ShardKV, cfg.n) 368 | gg.saved = make([]*raft.Persister, cfg.n) 369 | gg.endnames = make([][]string, cfg.n) 370 | gg.mendnames = make([][]string, cfg.nctrlers) 371 | for i := 0; i < cfg.n; i++ { 372 | cfg.StartServer(gi, i) 373 | } 374 | } 375 | 376 | cfg.clerks = make(map[*Clerk][]string) 377 | cfg.nextClientId = cfg.n + 1000 // client ids start 1000 above the highest serverid 378 | 379 | cfg.net.Reliable(!unreliable) 380 | 381 | return cfg 382 | } 383 | -------------------------------------------------------------------------------- /src/shardkv/server.go: -------------------------------------------------------------------------------- 1 | package shardkv 2 | 3 | import "mit6.824/labrpc" 4 | import "mit6.824/raft" 5 | import "sync" 6 | import "mit6.824/labgob" 7 | 8 | type Op struct { 9 | // Your definitions here. 10 | // Field names must start with capital letters, 11 | // otherwise RPC will break. 12 | } 13 | 14 | type ShardKV struct { 15 | mu sync.Mutex 16 | me int 17 | rf *raft.Raft 18 | applyCh chan raft.ApplyMsg 19 | make_end func(string) *labrpc.ClientEnd 20 | gid int 21 | ctrlers []*labrpc.ClientEnd 22 | maxraftstate int // snapshot if log grows this big 23 | 24 | // Your definitions here. 25 | } 26 | 27 | func (kv *ShardKV) Get(args *GetArgs, reply *GetReply) { 28 | // Your code here. 29 | } 30 | 31 | func (kv *ShardKV) PutAppend(args *PutAppendArgs, reply *PutAppendReply) { 32 | // Your code here. 33 | } 34 | 35 | // the tester calls Kill() when a ShardKV instance won't 36 | // be needed again. you are not required to do anything 37 | // in Kill(), but it might be convenient to (for example) 38 | // turn off debug output from this instance. 39 | func (kv *ShardKV) Kill() { 40 | kv.rf.Kill() 41 | // Your code here, if desired. 42 | } 43 | 44 | // servers[] contains the ports of the servers in this group. 45 | // 46 | // me is the index of the current server in servers[]. 47 | // 48 | // the k/v server should store snapshots through the underlying Raft 49 | // implementation, which should call persister.SaveStateAndSnapshot() to 50 | // atomically save the Raft state along with the snapshot. 51 | // 52 | // the k/v server should snapshot when Raft's saved state exceeds 53 | // maxraftstate bytes, in order to allow Raft to garbage-collect its 54 | // log. 
if maxraftstate is -1, you don't need to snapshot. 55 | // 56 | // gid is this group's GID, for interacting with the shardctrler. 57 | // 58 | // pass ctrlers[] to shardctrler.MakeClerk() so you can send 59 | // RPCs to the shardctrler. 60 | // 61 | // make_end(servername) turns a server name from a 62 | // Config.Groups[gid][i] into a labrpc.ClientEnd on which you can 63 | // send RPCs. You'll need this to send RPCs to other groups. 64 | // 65 | // look at client.go for examples of how to use ctrlers[] 66 | // and make_end() to send RPCs to the group owning a specific shard. 67 | // 68 | // StartServer() must return quickly, so it should start goroutines 69 | // for any long-running work. 70 | func StartServer(servers []*labrpc.ClientEnd, me int, persister *raft.Persister, maxraftstate int, gid int, ctrlers []*labrpc.ClientEnd, make_end func(string) *labrpc.ClientEnd) *ShardKV { 71 | // call labgob.Register on structures you want 72 | // Go's RPC library to marshall/unmarshall. 73 | labgob.Register(Op{}) 74 | 75 | kv := new(ShardKV) 76 | kv.me = me 77 | kv.maxraftstate = maxraftstate 78 | kv.make_end = make_end 79 | kv.gid = gid 80 | kv.ctrlers = ctrlers 81 | 82 | // Your initialization code here. 83 | 84 | // Use something like this to talk to the shardctrler: 85 | // kv.mck = shardctrler.MakeClerk(kv.ctrlers) 86 | 87 | kv.applyCh = make(chan raft.ApplyMsg) 88 | kv.rf = raft.Make(servers, me, persister, kv.applyCh) 89 | 90 | return kv 91 | } 92 | --------------------------------------------------------------------------------