├── .gitignore ├── .vscode └── tasks.json ├── LICENSE ├── benchmark ├── binary-serialize │ ├── binary-serialize.md │ └── binary-serialize_test.go ├── generate │ ├── generate-data.go │ └── generate_test.go ├── multi-raft │ ├── client │ │ ├── client.go │ │ └── main │ │ │ └── main.go │ ├── raft-server-benchmark.md │ ├── raft_client │ │ ├── main │ │ │ └── main.go │ │ └── raft_client.go │ ├── raft_server │ │ ├── main │ │ │ └── main.go │ │ └── raft_server.go │ ├── server │ │ ├── main │ │ │ └── main.go │ │ └── server.go │ └── simple-server-benchmark.md └── thrift-serialize │ ├── thrift-serialize.md │ └── thrift-serialize_test.go ├── config └── config.go ├── experiment ├── ondisk │ ├── db.go │ ├── engine │ │ ├── engine.go │ │ └── mraft_router.go │ ├── fsm.go │ ├── main │ │ └── app.go │ ├── metrics.go │ ├── ondisk.go │ └── raftd │ │ └── mraft.go ├── simpleondisk │ ├── db.go │ ├── fsm.go │ ├── httpengine │ │ └── engine.go │ ├── main │ │ └── main.go │ ├── ondisk.go │ └── test │ │ └── test.go └── store │ ├── kv.go │ └── kvstore.go ├── go.mod ├── go.sum ├── gossip ├── config.go ├── coordinate │ ├── client.go │ ├── client_test.go │ ├── config.go │ ├── coordinate.go │ ├── coordinate_test.go │ ├── performance_test.go │ ├── phantom.go │ └── util_test.go ├── delegate.go ├── event.go ├── gossip.go ├── gossip_test.go ├── message.go └── ping_delegate.go ├── logger └── zaplog.go ├── productready ├── README.md ├── config │ └── config.go ├── engine.go ├── httpd │ └── handle.go ├── ilogger │ └── logger.go ├── main │ └── app.go ├── router.go ├── storage │ ├── command.go │ ├── config.go │ ├── del.go │ ├── event.go │ ├── get.go │ ├── op.go │ ├── put.go │ ├── sm.go │ ├── storage.go │ └── store │ │ ├── pebbledb.go │ │ ├── store.go │ │ └── utils.go └── utils │ └── utils.go ├── readme.md └── test ├── metrics └── main.go ├── serialize └── serialize.go └── test.go /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | # Created by 
https://www.gitignore.io/api/windows,osx,linux,code 3 | 4 | ### Code ### 5 | # Visual Studio Code - https://code.visualstudio.com/ 6 | /vendor 7 | .settings/ 8 | tsconfig.json 9 | jsconfig.json 10 | idl/ 11 | rpc/ 12 | productready/cmd/main/data 13 | productready/cmd/main/main 14 | 15 | ### Linux ### 16 | *~ 17 | 18 | # temporary files which can be created if a process still has a handle open of a deleted file 19 | .fuse_hidden* 20 | 21 | # KDE directory preferences 22 | .directory 23 | 24 | # Linux trash folder which might appear on any partition or disk 25 | .Trash-* 26 | 27 | # .nfs files are created when an open file is removed but is still being accessed 28 | .nfs* 29 | 30 | ### OSX ### 31 | *.DS_Store 32 | .AppleDouble 33 | .LSOverride 34 | 35 | # Icon must end with two \r 36 | Icon 37 | 38 | # Thumbnails 39 | ._* 40 | 41 | # Files that might appear in the root of a volume 42 | .DocumentRevisions-V100 43 | .fseventsd 44 | .Spotlight-V100 45 | .TemporaryItems 46 | .Trashes 47 | .VolumeIcon.icns 48 | .com.apple.timemachine.donotpresent 49 | 50 | # Directories potentially created on remote AFP share 51 | .AppleDB 52 | .AppleDesktop 53 | Network Trash Folder 54 | Temporary Items 55 | .apdisk 56 | 57 | ### Windows ### 58 | # Windows thumbnail cache files 59 | Thumbs.db 60 | ehthumbs.db 61 | ehthumbs_vista.db 62 | 63 | # Folder config file 64 | Desktop.ini 65 | 66 | # Recycle Bin used on file shares 67 | $RECYCLE.BIN/ 68 | 69 | # Windows Installer files 70 | *.cab 71 | *.msi 72 | *.msm 73 | *.msp 74 | 75 | # Windows shortcuts 76 | *.lnk 77 | 78 | 79 | # End of https://www.gitignore.io/api/windows,osx,linux,code 80 | mraft 81 | simpleondisk/test/test 82 | -------------------------------------------------------------------------------- /.vscode/tasks.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": "2.0.0", 3 | "command": "go", 4 | "type": "shell", 5 | "presentation" : { 6 | "reveal": "always" 7 | }, 8 | 
"options":{ 9 | "cwd": "${fileDirname}" 10 | }, 11 | "problemMatcher":[], 12 | "tasks": [ 13 | { 14 | "label": "run", 15 | "options": { 16 | "env": { 17 | "CGO_CFLAGS": "-I/usr/local/include/rocksdb", 18 | "CGO_LDFLAGS":"-L/usr/local/lib -lrocksdb -lstdc++ -lm -lz -lbz2 -lsnappy -llz4", 19 | "http_proxy": "", 20 | "https_proxy": "", 21 | "all_proxy": "" 22 | } 23 | }, 24 | "osx": { 25 | "args": [ 26 | "run", 27 | "${workspaceRoot}/app.go" 28 | ] 29 | } 30 | }, 31 | { 32 | "label": "build", 33 | "options": { 34 | "cwd": "${fileDirname}", 35 | "env": { 36 | "CGO_CFLAGS": "-I/usr/local/include/rocksdb", 37 | "CGO_LDFLAGS":"-L/usr/local/lib -lrocksdb -lstdc++ -lm -lz -lbz2 -lsnappy -llz4", 38 | "http_proxy": "", 39 | "https_proxy": "", 40 | "all_proxy": "" 41 | } 42 | }, 43 | "args":[ 44 | "build", 45 | "-v", 46 | //"-x", 47 | "." 48 | ] 49 | } 50 | ] 51 | } -------------------------------------------------------------------------------- /benchmark/binary-serialize/binary-serialize.md: -------------------------------------------------------------------------------- 1 | go test -bench=. -benchmem 2 | 3 |
4 | 5 | ``` 6 | goos: darwin 7 | goarch: amd64 8 | pkg: github.com/xkeyideal/mraft/benchmark/binary-serialize 9 | BenchmarkBinaryEncode-4 10000000 214 ns/op 149 B/op 3 allocs/op 10 | BenchmarkBinaryDecode-4 50000000 29.9 ns/op 8 B/op 1 allocs/op 11 | PASS 12 | ok github.com/xkeyideal/mraft/benchmark/binary-serialize 4.059s 13 | ``` -------------------------------------------------------------------------------- /benchmark/binary-serialize/binary-serialize_test.go: -------------------------------------------------------------------------------- 1 | package serialize 2 | 3 | import ( 4 | "bytes" 5 | "encoding/binary" 6 | "errors" 7 | "io" 8 | "testing" 9 | ) 10 | 11 | func encode(key, val []byte, w io.Writer) error { 12 | dataSize := make([]byte, 8) 13 | keySize := make([]byte, 8) 14 | valSize := make([]byte, 8) 15 | 16 | kl := len(key) 17 | vl := len(val) 18 | 19 | binary.LittleEndian.PutUint64(dataSize, uint64(kl+vl+8+8)) 20 | if _, err := w.Write(dataSize); err != nil { 21 | return err 22 | } 23 | 24 | binary.LittleEndian.PutUint64(keySize, uint64(kl)) 25 | if _, err := w.Write(keySize); err != nil { 26 | return err 27 | } 28 | 29 | if _, err := w.Write(key); err != nil { 30 | return err 31 | } 32 | 33 | binary.LittleEndian.PutUint64(valSize, uint64(vl)) 34 | if _, err := w.Write(valSize); err != nil { 35 | return err 36 | } 37 | 38 | if _, err := w.Write(val); err != nil { 39 | return err 40 | } 41 | 42 | return nil 43 | } 44 | 45 | func decode(r io.Reader) ([]byte, []byte, error) { 46 | sz := make([]byte, 8) 47 | if _, err := io.ReadFull(r, sz); err != nil { 48 | return nil, nil, err 49 | } 50 | dataSize := binary.LittleEndian.Uint64(sz) 51 | data := make([]byte, dataSize) 52 | if _, err := io.ReadFull(r, data); err != nil { 53 | return nil, nil, err 54 | } 55 | 56 | kl := binary.LittleEndian.Uint64(data[:8]) 57 | key := data[8 : kl+8] 58 | vl := binary.LittleEndian.Uint64(data[kl+8 : kl+16]) 59 | val := data[kl+16:] 60 | if uint64(len(val)) != vl { 61 | return 
nil, nil, errors.New("size isn't equal") 62 | } 63 | 64 | return key, val, nil 65 | } 66 | 67 | func TestBinarySerialize(t *testing.T) { 68 | key := []byte("multi-raft-key") 69 | val := []byte("multi-raft-value") 70 | 71 | buf := &bytes.Buffer{} 72 | err := encode(key, val, buf) 73 | if err != nil { 74 | t.Fatalf("binary marshal fatal, %+v", err) 75 | return 76 | } 77 | 78 | key1, val1, err := decode(buf) 79 | if err != nil { 80 | t.Fatalf("binary unmarshal fatal, %+v", err) 81 | return 82 | } 83 | 84 | if !bytes.Equal(key1, key) { 85 | t.Fatalf("binary unmarshal expected %v, got %v", key, key1) 86 | return 87 | } 88 | 89 | if !bytes.Equal(val1, val) { 90 | t.Fatalf("binary unmarshal expected %v, got %v", val, val1) 91 | return 92 | } 93 | } 94 | 95 | func BenchmarkBinaryEncode(b *testing.B) { 96 | key := []byte("multi-raft-key") 97 | val := []byte("multi-raft-value") 98 | 99 | buf := &bytes.Buffer{} 100 | b.ResetTimer() 101 | for i := 0; i < b.N; i++ { 102 | encode(key, val, buf) 103 | } 104 | } 105 | 106 | func BenchmarkBinaryDecode(b *testing.B) { 107 | key := []byte("multi-raft-key") 108 | val := []byte("multi-raft-value") 109 | 110 | buf := &bytes.Buffer{} 111 | err := encode(key, val, buf) 112 | if err != nil { 113 | b.Fatalf("binary marshal fatal, %+v", err) 114 | return 115 | } 116 | 117 | b.ResetTimer() 118 | for i := 0; i < b.N; i++ { 119 | decode(buf) 120 | } 121 | } 122 | -------------------------------------------------------------------------------- /benchmark/generate/generate-data.go: -------------------------------------------------------------------------------- 1 | package generate 2 | 3 | import ( 4 | "crypto/rand" 5 | "io" 6 | mrand "math/rand" 7 | "time" 8 | 9 | "github.com/xkeyideal/mraft/experiment/store" 10 | ) 11 | 12 | var idChars = []byte("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789") 13 | 14 | const idLen = 20 15 | 16 | func GenerateData() *store.RaftAttribute { 17 | mrand.Seed(time.Now().UnixNano()) 18 | 19 | attr := 
&store.RaftAttribute{ 20 | AttrID: uint64(mrand.Int31n(1000000) + 1000000), 21 | AttrName: randomId(), 22 | Ages: []int32{}, 23 | Locations: make(map[string]string), 24 | Timestamp: time.Now().UnixNano(), 25 | } 26 | 27 | l := mrand.Intn(100) + 1 28 | for i := 0; i < l; i++ { 29 | attr.Ages = append(attr.Ages, mrand.Int31n(200)+1) 30 | } 31 | 32 | n := mrand.Intn(50) + 1 33 | for i := 0; i < n; i++ { 34 | attr.Locations[randomId()] = randomId() 35 | } 36 | 37 | return attr 38 | } 39 | 40 | // randomId returns a new random id string. 41 | func randomId() string { 42 | b := randomBytesMod(idLen, byte(len(idChars))) 43 | for i, c := range b { 44 | b[i] = idChars[c] 45 | } 46 | return string(b) 47 | } 48 | 49 | func randomBytes(length int) (b []byte) { 50 | b = make([]byte, length) 51 | io.ReadFull(rand.Reader, b) 52 | return 53 | } 54 | 55 | func randomBytesMod(length int, mod byte) (b []byte) { 56 | maxrb := 255 - byte(256%int(mod)) 57 | b = make([]byte, length) 58 | i := 0 59 | for { 60 | r := randomBytes(length + (length / 4)) 61 | for _, c := range r { 62 | if c > maxrb { 63 | continue 64 | } 65 | b[i] = c % mod 66 | i++ 67 | if i == length { 68 | return b 69 | } 70 | } 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /benchmark/generate/generate_test.go: -------------------------------------------------------------------------------- 1 | package generate 2 | 3 | import "testing" 4 | 5 | func TestGenerateAttr(t *testing.T) { 6 | attr := GenerateData() 7 | t.Logf("%+v", attr) 8 | } 9 | 10 | func BenchmarkGenerateAttr(b *testing.B) { 11 | for i := 0; i < b.N; i++ { 12 | GenerateData() 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /benchmark/multi-raft/client/client.go: -------------------------------------------------------------------------------- 1 | package client 2 | 3 | import ( 4 | "bufio" 5 | "encoding/binary" 6 | "fmt" 7 | "io" 8 | "net" 9 | "sync" 10 | 
"time" 11 | 12 | "github.com/xkeyideal/mraft/experiment/store" 13 | ) 14 | 15 | const defaultBufferSize = 5 * 1024 16 | 17 | type SimpleClient struct { 18 | mu *sync.Mutex 19 | message chan *store.RaftAttribute 20 | conn net.Conn 21 | reader *bufio.Reader 22 | writer *bufio.Writer 23 | 24 | recv chan struct{} 25 | } 26 | 27 | func NewSimpleClient(address string, recv chan struct{}) (*SimpleClient, error) { 28 | conn, err := net.Dial("tcp", address) 29 | if err != nil { 30 | return nil, err 31 | } 32 | 33 | sc := &SimpleClient{ 34 | mu: &sync.Mutex{}, 35 | message: make(chan *store.RaftAttribute, 1000), 36 | conn: conn, 37 | writer: bufio.NewWriterSize(conn, defaultBufferSize), 38 | reader: bufio.NewReaderSize(conn, defaultBufferSize), 39 | 40 | recv: recv, 41 | } 42 | 43 | go sc.handleSend() 44 | go sc.handleRecv() 45 | 46 | return sc, nil 47 | } 48 | 49 | // Flush writes all buffered data to the underlying TCP connection 50 | func (sc *SimpleClient) Flush() error { 51 | return sc.writer.Flush() 52 | } 53 | 54 | func (sc *SimpleClient) Write(b []byte) (int, error) { 55 | return sc.writer.Write(b) 56 | } 57 | 58 | func (sc *SimpleClient) writeCommand(attr *store.RaftAttribute) error { 59 | sc.mu.Lock() 60 | 61 | _, err := attr.WriteTo2(sc) 62 | if err != nil { 63 | fmt.Println("Write failed,", err.Error()) 64 | sc.mu.Unlock() 65 | return err 66 | } 67 | 68 | sc.conn.SetWriteDeadline(time.Now().Add(3 * time.Second)) 69 | 70 | sc.Flush() 71 | 72 | sc.mu.Unlock() 73 | 74 | return nil 75 | } 76 | 77 | func (sc *SimpleClient) SendMessage(attr *store.RaftAttribute) { 78 | sc.message <- attr 79 | } 80 | 81 | func (sc *SimpleClient) handleSend() { 82 | 83 | for { 84 | select { 85 | case msg := <-sc.message: 86 | err := sc.writeCommand(msg) 87 | if err != nil { 88 | fmt.Println("Error to send message because of ", err.Error()) 89 | break 90 | } 91 | } 92 | } 93 | } 94 | 95 | func (sc *SimpleClient) handleRecv() { 96 | var err error 97 | sz := make([]byte, 8) 98 | 99 | for { 
100 | sc.conn.SetReadDeadline(time.Now().Add(3 * time.Second)) 101 | _, err = sc.reader.Read(sz) 102 | fmt.Println("Read:", err) 103 | if err != nil { 104 | if err == io.EOF { 105 | err = nil 106 | } else { 107 | err = fmt.Errorf("failed to read datasize - %s", err) 108 | } 109 | break 110 | } 111 | 112 | dataSize := binary.LittleEndian.Uint64(sz) 113 | body := make([]byte, dataSize) 114 | 115 | _, err = io.ReadFull(sc.reader, body) 116 | if err != nil { 117 | err = fmt.Errorf("failed to read databody - %s", err) 118 | break 119 | } 120 | 121 | sc.recv <- struct{}{} 122 | 123 | // fmt.Println(string(body)) 124 | } 125 | } 126 | 127 | func (sc *SimpleClient) Stop() { 128 | sc.conn.Close() 129 | } 130 | -------------------------------------------------------------------------------- /benchmark/multi-raft/client/main/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "sync" 6 | "time" 7 | 8 | "github.com/xkeyideal/mraft/benchmark/generate" 9 | "github.com/xkeyideal/mraft/benchmark/multi-raft/client" 10 | "github.com/xkeyideal/mraft/experiment/store" 11 | ) 12 | 13 | type TestClient struct { 14 | client *client.SimpleClient 15 | send chan *store.RaftAttribute 16 | recv chan struct{} 17 | 18 | exitchan chan struct{} 19 | wg sync.WaitGroup 20 | } 21 | 22 | func (tc *TestClient) Gen(n int) { 23 | for i := 0; i < n; i++ { 24 | attr := generate.GenerateData() 25 | tc.send <- attr 26 | } 27 | } 28 | 29 | func (tc *TestClient) Send() { 30 | for { 31 | select { 32 | case <-tc.exitchan: 33 | return 34 | case attr := <-tc.send: 35 | tc.client.SendMessage(attr) 36 | } 37 | } 38 | } 39 | 40 | func (tc *TestClient) Recv() { 41 | for { 42 | select { 43 | case <-tc.exitchan: 44 | return 45 | case <-tc.recv: 46 | tc.wg.Done() 47 | } 48 | } 49 | } 50 | 51 | func (tc *TestClient) Stop() { 52 | close(tc.exitchan) 53 | tc.client.Stop() 54 | } 55 | 56 | func makeClient(connections, taskNum int, wg 
*sync.WaitGroup) { 57 | 58 | cwg := &sync.WaitGroup{} 59 | cwg.Add(connections) 60 | 61 | for i := 0; i < connections; i++ { 62 | go func(cwg *sync.WaitGroup) { 63 | tc := &TestClient{ 64 | send: make(chan *store.RaftAttribute, 1000), 65 | recv: make(chan struct{}, 1000), 66 | exitchan: make(chan struct{}), 67 | wg: sync.WaitGroup{}, 68 | } 69 | 70 | client, err := client.NewSimpleClient("10.101.44.4:25701", tc.recv) 71 | if err != nil { 72 | panic(err) 73 | } 74 | 75 | tc.client = client 76 | 77 | tc.wg.Add(taskNum) 78 | 79 | go tc.Gen(taskNum) 80 | go tc.Send() 81 | go tc.Recv() 82 | 83 | tc.wg.Wait() 84 | 85 | cwg.Done() 86 | }(cwg) 87 | } 88 | 89 | cwg.Wait() 90 | 91 | wg.Done() 92 | } 93 | 94 | func main() { 95 | st := time.Now() 96 | 97 | g, c, n := 1, 1, 1 98 | 99 | wg := &sync.WaitGroup{} 100 | 101 | wg.Add(g) 102 | 103 | for i := 0; i < g; i++ { 104 | go makeClient(c, n, wg) 105 | } 106 | 107 | wg.Wait() 108 | 109 | ed := time.Now() 110 | op := float64(ed.UnixNano()-st.UnixNano()) / float64(n*1000) 111 | 112 | fmt.Printf("线程数:%d, 每个线程连接数:%d, 请求次数:%d, 平均耗时:%.1f us/op\n", g, c, n, op) 113 | } 114 | -------------------------------------------------------------------------------- /benchmark/multi-raft/raft-server-benchmark.md: -------------------------------------------------------------------------------- 1 | ## multi-raft 压测结果 2 | 3 | ```json 4 | 总次数: 6000, 错误数: 0, 线程数: 6, 每个线程连接数: 1, 请求次数: 1000 5 | 最小值: 663us, 最大值: 203700us, 中间值: 6182.6us 6 | 75百分位: 3939.0us, 90百分位: 5404.0us, 95百分位: 11294.0us, 99百分位: 93239.6us 7 | ``` 8 | 9 | ### 压测程序Server端 10 | 11 | [server端主程序](https://github.com/xkeyideal/mraft/blob/master/benchmark/multi-raft/raft_server/raft_server.go)注意修改ip地址 12 | [server端启动程序](https://github.com/xkeyideal/mraft/blob/master/benchmark/multi-raft/raft_server/main/main.go) 注意启动参数,同时注意修改raft的存储目录地址 13 | 14 | 第一个参数为raft的NodeID,第二个参数为应用程序的TCP端口号 15 | 16 | ```go 17 | go run main.go 10000 25700 18 | go run main.go 10001 25800 19 | go run main.go 10002 25900 
20 | ``` 21 | 22 | server端总计配置了10个cluster 23 | 24 | ### 压测程序Client端 25 | 26 | [client端主程序](https://github.com/xkeyideal/mraft/blob/master/benchmark/multi-raft/raft_client/raft_client.go) 27 | [client端启动程序](https://github.com/xkeyideal/mraft/blob/master/benchmark/multi-raft/raft_client/main/main.go) 注意ip地址和端口号要与server对应 28 | 29 | ### 压测环境 30 | 31 | 机器采用的是开发环境的机器,操作系统macOS High Sierra,Darwin Kernel Version 18.6.0 root:xnu-4903.261.4~2/RELEASE_X86_64 x86_64 i386 iMac14,2 Darwin 32 | 33 | CPU:3.29 GHz Intel Core i5 34 | 35 | 内存:20 GB 1600 MHz DDR3 36 | 37 | 磁盘:256GB SATA SSD -------------------------------------------------------------------------------- /benchmark/multi-raft/raft_client/main/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "sync" 6 | "time" 7 | 8 | "github.com/xkeyideal/mraft/benchmark/generate" 9 | "github.com/xkeyideal/mraft/benchmark/multi-raft/raft_client" 10 | "github.com/xkeyideal/mraft/experiment/store" 11 | 12 | "github.com/rcrowley/go-metrics" 13 | ) 14 | 15 | var histogram metrics.Histogram 16 | var counter metrics.Counter 17 | 18 | var servers = []string{"10.101.44.4:25700", "10.101.44.4:25800", "10.101.44.4:25900"} 19 | 20 | type TestClient struct { 21 | client *raft_client.RaftSimpleClient 22 | send chan *store.RaftAttribute 23 | query chan *store.ReadArgument 24 | 25 | exitchan chan struct{} 26 | wg sync.WaitGroup 27 | } 28 | 29 | func (tc *TestClient) Gen(n int) { 30 | for i := 0; i < n; i++ { 31 | attr := generate.GenerateData() 32 | tc.send <- attr 33 | } 34 | } 35 | 36 | func (tc *TestClient) SendWriteCommand() { 37 | for { 38 | select { 39 | case <-tc.exitchan: 40 | return 41 | case attr := <-tc.send: 42 | st := time.Now().UnixNano() 43 | _, err := tc.client.PublishCommand(attr) 44 | if err != nil { 45 | counter.Inc(1) 46 | } 47 | 48 | x := time.Now().UnixNano() - st 49 | histogram.Update(x) 50 | 51 | // n := rand.Intn(10) 52 | // 
fmt.Println("random_n:", n) 53 | // if n < 10 { // 30%入查询 54 | // tc.query <- &store.ReadArgument{ 55 | // Key: fmt.Sprintf("%d_%s", attr.AttrID, attr.AttrName), 56 | // HashKey: attr.AttrID, 57 | // Sync: true, 58 | // } 59 | // } else { 60 | tc.wg.Done() 61 | // } 62 | } 63 | } 64 | } 65 | 66 | func (tc *TestClient) SendQueryCommand() { 67 | for { 68 | select { 69 | case <-tc.exitchan: 70 | return 71 | case arg := <-tc.query: 72 | attr, err := tc.client.PublishCommand(arg) 73 | fmt.Println(attr, err) 74 | tc.wg.Done() 75 | } 76 | } 77 | } 78 | 79 | func (tc *TestClient) Stop() { 80 | close(tc.exitchan) 81 | tc.client.Stop() 82 | } 83 | 84 | func makeClient(index, connections, taskNum int, wg *sync.WaitGroup) { 85 | 86 | cwg := &sync.WaitGroup{} 87 | cwg.Add(connections) 88 | 89 | for i := 0; i < connections; i++ { 90 | go func(index int, cwg *sync.WaitGroup) { 91 | tc := &TestClient{ 92 | send: make(chan *store.RaftAttribute, 100), 93 | query: make(chan *store.ReadArgument, 30), 94 | exitchan: make(chan struct{}), 95 | wg: sync.WaitGroup{}, 96 | } 97 | 98 | client, err := raft_client.NewRaftSimpleClient(servers[index%3]) 99 | if err != nil { 100 | panic(err) 101 | } 102 | 103 | tc.client = client 104 | 105 | tc.wg.Add(taskNum) 106 | 107 | go tc.Gen(taskNum) 108 | go tc.SendWriteCommand() 109 | go tc.SendQueryCommand() 110 | 111 | tc.wg.Wait() 112 | 113 | cwg.Done() 114 | }(index, cwg) 115 | } 116 | 117 | cwg.Wait() 118 | 119 | wg.Done() 120 | } 121 | 122 | func main() { 123 | g, c, n := 6, 1, 1000 124 | 125 | s := metrics.NewExpDecaySample(10240, 0.015) // or metrics.NewUniformSample(1028) 126 | histogram = metrics.NewHistogram(s) 127 | 128 | counter = metrics.NewCounter() 129 | 130 | wg := &sync.WaitGroup{} 131 | 132 | wg.Add(g) 133 | 134 | for i := 0; i < g; i++ { 135 | go makeClient(i, c, n, wg) 136 | } 137 | 138 | wg.Wait() 139 | 140 | fmt.Printf("总次数: %d, 错误数: %d, 线程数: %d, 每个线程连接数: %d, 请求次数: %d\n", histogram.Count(), counter.Count(), g, c, n) 141 | 
fmt.Printf("最小值: %dus, 最大值: %dus, 中间值: %.1fus\n", histogram.Min()/1e3, histogram.Max()/1e3, histogram.Mean()/1e3) 142 | fmt.Printf("75百分位: %.1fus, 90百分位: %.1fus, 95百分位: %.1fus, 99百分位: %.1fus\n", histogram.Percentile(0.75)/1e3, histogram.Percentile(0.9)/1e3, histogram.Percentile(0.95)/1e3, histogram.Percentile(0.99)/1e3) 143 | } 144 | -------------------------------------------------------------------------------- /benchmark/multi-raft/raft_client/raft_client.go: -------------------------------------------------------------------------------- 1 | package raft_client 2 | 3 | import ( 4 | "bufio" 5 | "encoding/binary" 6 | "errors" 7 | "fmt" 8 | "io" 9 | "net" 10 | "sync" 11 | "time" 12 | 13 | "github.com/xkeyideal/mraft/experiment/store" 14 | ) 15 | 16 | const defaultBufferSize = 5 * 1024 17 | 18 | var ErrStopped = errors.New("stopped") 19 | 20 | type ProducerTransaction struct { 21 | cmd interface{} 22 | doneChan chan *ProducerTransaction 23 | Resp *store.RaftAttribute 24 | Error error 25 | } 26 | 27 | type RaftSimpleClient struct { 28 | mu *sync.Mutex 29 | 30 | conn net.Conn 31 | 32 | reader *bufio.Reader 33 | writer *bufio.Writer 34 | 35 | responseChan chan []byte 36 | 37 | transactionChan chan *ProducerTransaction 38 | transactions []*ProducerTransaction 39 | 40 | exitChan chan struct{} 41 | } 42 | 43 | func NewRaftSimpleClient(address string) (*RaftSimpleClient, error) { 44 | conn, err := net.Dial("tcp", address) 45 | if err != nil { 46 | return nil, err 47 | } 48 | 49 | sc := &RaftSimpleClient{ 50 | mu: &sync.Mutex{}, 51 | conn: conn, 52 | writer: bufio.NewWriterSize(conn, defaultBufferSize), 53 | reader: bufio.NewReaderSize(conn, defaultBufferSize), 54 | responseChan: make(chan []byte, 10), 55 | transactionChan: make(chan *ProducerTransaction), 56 | exitChan: make(chan struct{}), 57 | } 58 | 59 | go sc.handleSend() 60 | go sc.handleRecv() 61 | 62 | return sc, nil 63 | } 64 | 65 | // Flush writes all buffered data to the underlying TCP connection 66 | func (sc 
*RaftSimpleClient) Flush() error { 67 | return sc.writer.Flush() 68 | } 69 | 70 | func (sc *RaftSimpleClient) Write(b []byte) (int, error) { 71 | return sc.writer.Write(b) 72 | } 73 | 74 | func (sc *RaftSimpleClient) readCommand(arg *store.ReadArgument) error { 75 | sc.mu.Lock() 76 | 77 | _, err := arg.WriteTo(store.CommandRead, sc) 78 | if err != nil { 79 | sc.mu.Unlock() 80 | return err 81 | } 82 | 83 | sc.conn.SetWriteDeadline(time.Now().Add(3 * time.Second)) 84 | 85 | sc.Flush() 86 | 87 | sc.mu.Unlock() 88 | 89 | return nil 90 | } 91 | 92 | func (sc *RaftSimpleClient) writeCommand(attr *store.RaftAttribute) error { 93 | sc.mu.Lock() 94 | 95 | _, err := attr.WriteTo(store.CommandUpsert, sc) 96 | if err != nil { 97 | sc.mu.Unlock() 98 | return err 99 | } 100 | 101 | sc.conn.SetWriteDeadline(time.Now().Add(3 * time.Second)) 102 | 103 | sc.Flush() 104 | 105 | sc.mu.Unlock() 106 | 107 | return nil 108 | } 109 | 110 | func (sc *RaftSimpleClient) PublishCommand(cmd interface{}) (*store.RaftAttribute, error) { 111 | doneChan := make(chan *ProducerTransaction) 112 | err := sc.sendCommandAsync(cmd, doneChan) 113 | if err != nil { 114 | close(doneChan) 115 | return nil, err 116 | } 117 | 118 | // 阻塞 119 | t := <-doneChan 120 | return t.Resp, t.Error 121 | } 122 | 123 | func (sc *RaftSimpleClient) sendCommandAsync(cmd interface{}, doneChan chan *ProducerTransaction) error { 124 | t := &ProducerTransaction{ 125 | cmd: cmd, 126 | doneChan: doneChan, 127 | } 128 | 129 | select { 130 | case sc.transactionChan <- t: 131 | case <-sc.exitChan: 132 | return ErrStopped 133 | } 134 | 135 | return nil 136 | } 137 | 138 | func (sc *RaftSimpleClient) popTransaction(data []byte) { 139 | t := sc.transactions[0] 140 | sc.transactions = sc.transactions[1:] 141 | 142 | cmdSize := binary.LittleEndian.Uint32(data[:4]) 143 | cmd := string(data[4 : 4+cmdSize]) 144 | errSignal := string(data[4+cmdSize : 4+cmdSize+1]) 145 | switch cmd { 146 | case store.CommandUpsert: 147 | if errSignal == "0" { 
148 | t.Error = errors.New(string(data[4+cmdSize+1:])) 149 | } 150 | case store.CommandRead: 151 | if errSignal == "0" { 152 | t.Error = errors.New(string(data[4+cmdSize+1:])) 153 | } else { 154 | attr := &store.RaftAttribute{} 155 | t.Error = attr.Unmarshal(data[4+cmdSize+1:]) 156 | t.Resp = attr 157 | } 158 | } 159 | 160 | t.doneChan <- t 161 | } 162 | 163 | func (sc *RaftSimpleClient) handleSend() { 164 | for { 165 | select { 166 | case t := <-sc.transactionChan: 167 | sc.transactions = append(sc.transactions, t) 168 | switch t.cmd.(type) { 169 | case *store.ReadArgument: 170 | sc.readCommand(t.cmd.(*store.ReadArgument)) 171 | case *store.RaftAttribute: 172 | sc.writeCommand(t.cmd.(*store.RaftAttribute)) 173 | } 174 | case data := <-sc.responseChan: 175 | sc.popTransaction(data) 176 | case <-sc.exitChan: 177 | return 178 | } 179 | } 180 | } 181 | 182 | func (sc *RaftSimpleClient) handleRecv() { 183 | var err error 184 | sz := make([]byte, 8) 185 | 186 | for { 187 | sc.conn.SetReadDeadline(time.Now().Add(6 * time.Second)) 188 | _, err = sc.reader.Read(sz) 189 | if err != nil { 190 | if err == io.EOF { 191 | err = nil 192 | } else { 193 | err = fmt.Errorf("failed to read datasize - %s", err) 194 | } 195 | break 196 | } 197 | 198 | dataSize := binary.LittleEndian.Uint64(sz) 199 | body := make([]byte, dataSize) 200 | 201 | _, err = io.ReadFull(sc.reader, body) 202 | if err != nil { 203 | err = fmt.Errorf("failed to read databody - %s", err) 204 | break 205 | } 206 | 207 | sc.responseChan <- body 208 | } 209 | } 210 | 211 | func (sc *RaftSimpleClient) Stop() { 212 | close(sc.exitChan) 213 | sc.conn.Close() 214 | } 215 | -------------------------------------------------------------------------------- /benchmark/multi-raft/raft_server/main/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "os" 6 | "os/signal" 7 | "strconv" 8 | "syscall" 9 | 10 | 
"github.com/xkeyideal/mraft/benchmark/multi-raft/raft_server" 11 | ) 12 | 13 | func main() { 14 | nodeID, err := strconv.ParseUint(os.Args[1], 10, 64) 15 | if err != nil { 16 | fmt.Println(err.Error()) 17 | os.Exit(1) 18 | } 19 | 20 | port := os.Args[2] 21 | 22 | // nodeID: 10000, 10001, 10002 23 | // port: 25700, 25800, 25900 24 | server, err := raft_server.NewRaftSimpleServer(fmt.Sprintf("10.101.44.4:%s", port), nodeID) 25 | if err != nil { 26 | panic(err) 27 | } 28 | 29 | signals := make(chan os.Signal, 1) 30 | signal.Notify(signals, syscall.SIGINT, syscall.SIGTERM, syscall.SIGKILL) 31 | <-signals 32 | 33 | server.Stop() 34 | } 35 | -------------------------------------------------------------------------------- /benchmark/multi-raft/raft_server/raft_server.go: -------------------------------------------------------------------------------- 1 | package raft_server 2 | 3 | import ( 4 | "bufio" 5 | "encoding/binary" 6 | "fmt" 7 | "io" 8 | "net" 9 | "runtime" 10 | "strings" 11 | "sync" 12 | "time" 13 | 14 | "github.com/xkeyideal/mraft/experiment/ondisk" 15 | "github.com/xkeyideal/mraft/experiment/store" 16 | ) 17 | 18 | const defaultBufferSize = 5 * 1024 19 | 20 | var ( 21 | raftDataDir = "/Users/xkey/raftlab/mraft-server-ondisk" 22 | raftNodePeers = map[uint64]string{ 23 | 10000: "10.101.44.4:54000", 24 | 10001: "10.101.44.4:54100", 25 | 10002: "10.101.44.4:54200", 26 | } 27 | // raftClusterIDs = []uint64{254000, 254100, 254200} 28 | ) 29 | 30 | type clientConn struct { 31 | writeLock sync.RWMutex 32 | net.Conn 33 | 34 | Reader *bufio.Reader 35 | Writer *bufio.Writer 36 | } 37 | 38 | func newClientConn(conn net.Conn) *clientConn { 39 | return &clientConn{ 40 | Conn: conn, 41 | Reader: bufio.NewReaderSize(conn, defaultBufferSize), 42 | Writer: bufio.NewWriterSize(conn, defaultBufferSize), 43 | } 44 | } 45 | 46 | type RaftSimpleServer struct { 47 | writelock *sync.Mutex 48 | 49 | nh *ondisk.OnDiskRaft 50 | 51 | tcpListener net.Listener 52 | } 53 | 54 | func 
NewRaftSimpleServer(address string, nodeID uint64) (*RaftSimpleServer, error) { 55 | l, err := net.Listen("tcp", address) 56 | if err != nil { 57 | return nil, err 58 | } 59 | 60 | raftClusterIDs := []uint64{} 61 | var clusterID uint64 = 250000 62 | var i uint64 63 | for i = 0; i < 10; i++ { 64 | raftClusterIDs = append(raftClusterIDs, clusterID+i) 65 | } 66 | 67 | ss := &RaftSimpleServer{ 68 | writelock: &sync.Mutex{}, 69 | nh: ondisk.NewOnDiskRaft(raftNodePeers, raftClusterIDs), 70 | tcpListener: l, 71 | } 72 | 73 | ss.nh.Start(raftDataDir, nodeID, "", false) 74 | 75 | go ss.RaftTCPServer() 76 | 77 | return ss, nil 78 | } 79 | 80 | func (ss *RaftSimpleServer) RaftTCPServer() error { 81 | for { 82 | conn, err := ss.tcpListener.Accept() 83 | if err != nil { 84 | if nerr, ok := err.(net.Error); ok && nerr.Temporary() { 85 | runtime.Gosched() 86 | continue 87 | } 88 | // theres no direct way to detect this error because it is not exposed 89 | if !strings.Contains(err.Error(), "use of closed network connection") { 90 | return fmt.Errorf("listener.Accept() error - %s", err) 91 | } 92 | break 93 | } 94 | 95 | go ss.handle(conn) 96 | } 97 | 98 | return nil 99 | } 100 | 101 | func (ss *RaftSimpleServer) handle(conn net.Conn) error { 102 | 103 | var err error 104 | 105 | client := newClientConn(conn) 106 | 107 | sz := make([]byte, 8) 108 | 109 | for { 110 | client.SetReadDeadline(time.Now().Add(3 * time.Second)) 111 | 112 | _, err = client.Reader.Read(sz) 113 | if err != nil { 114 | if err == io.EOF { 115 | err = nil 116 | } else { 117 | err = fmt.Errorf("failed to read datasize - %s", err) 118 | } 119 | break 120 | } 121 | 122 | dataSize := binary.LittleEndian.Uint64(sz) 123 | body := make([]byte, dataSize) 124 | 125 | _, err = io.ReadFull(client.Reader, body) 126 | if err != nil { 127 | err = fmt.Errorf("failed to read databody - %s", err) 128 | break 129 | } 130 | 131 | cmdSize := binary.LittleEndian.Uint32(body[:4]) 132 | 133 | command := string(body[4 : 4+cmdSize]) 
134 | 135 | data := body[4+cmdSize:] 136 | 137 | attr, err := ss.execCommand(command, data) 138 | 139 | ss.sendResponse(client, command, attr, err) 140 | } 141 | 142 | return err 143 | } 144 | 145 | func (ss *RaftSimpleServer) execCommand(command string, data []byte) (*store.RaftAttribute, error) { 146 | switch command { 147 | case store.CommandRead: 148 | arg := &store.ReadArgument{} 149 | err := arg.Unmarshal(data) 150 | 151 | if err != nil { 152 | return nil, err 153 | } 154 | 155 | if arg.Sync { 156 | return ss.nh.SyncRead(arg.Key, arg.HashKey) 157 | } 158 | return ss.nh.ReadLocal(arg.Key, arg.HashKey) 159 | case store.CommandUpsert: 160 | attr := &store.RaftAttribute{} 161 | err := attr.Unmarshal(data) 162 | if err != nil { 163 | return nil, err 164 | } 165 | 166 | cmd, _ := attr.GenerateCommand(store.CommandUpsert) 167 | return nil, ss.nh.AdvanceWrite(cmd) 168 | } 169 | 170 | return nil, nil 171 | } 172 | 173 | func (ss *RaftSimpleServer) sendResponse(client *clientConn, command string, attr *store.RaftAttribute, err error) (int, error) { 174 | client.writeLock.Lock() 175 | 176 | var e error 177 | var n int 178 | 179 | if err != nil { 180 | n, e = sendFramedResponse(client.Writer, command, []byte("0"), []byte(err.Error())) 181 | } else { 182 | b, _ := attr.Marshal() 183 | n, e = sendFramedResponse(client.Writer, command, []byte("1"), b) 184 | } 185 | 186 | e = client.SetWriteDeadline(time.Now().Add(3 * time.Second)) 187 | 188 | e = client.Writer.Flush() 189 | 190 | client.writeLock.Unlock() 191 | 192 | return n, e 193 | } 194 | 195 | func (ss *RaftSimpleServer) Stop() { 196 | ss.tcpListener.Close() 197 | } 198 | 199 | func sendFramedResponse(w io.Writer, command string, errSignal, b []byte) (int, error) { 200 | 201 | dataSize := make([]byte, 8) 202 | 203 | l := len(b) + 4 + len(command) + 1 204 | 205 | binary.LittleEndian.PutUint64(dataSize, uint64(l)) 206 | if _, err := w.Write(dataSize); err != nil { 207 | return 0, err 208 | } 209 | 210 | cmdSize := 
make([]byte, 4) 211 | binary.LittleEndian.PutUint32(cmdSize, uint32(len(command))) 212 | if _, err := w.Write(cmdSize); err != nil { 213 | return 0, err 214 | } 215 | 216 | if _, err := w.Write([]byte(command)); err != nil { 217 | return 0, err 218 | } 219 | 220 | if _, err := w.Write(errSignal); err != nil { 221 | return 0, err 222 | } 223 | 224 | if _, err := w.Write(b); err != nil { 225 | return 0, err 226 | } 227 | 228 | return l + 8, nil 229 | } 230 | -------------------------------------------------------------------------------- /benchmark/multi-raft/server/main/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "os" 5 | "os/signal" 6 | "syscall" 7 | 8 | "github.com/xkeyideal/mraft/benchmark/multi-raft/server" 9 | ) 10 | 11 | func main() { 12 | server, err := server.NewSimpleServer("10.101.44.4:25701") 13 | if err != nil { 14 | panic(err) 15 | } 16 | 17 | signals := make(chan os.Signal, 1) 18 | signal.Notify(signals, syscall.SIGINT, syscall.SIGTERM, syscall.SIGKILL) 19 | <-signals 20 | 21 | server.Stop() 22 | } 23 | -------------------------------------------------------------------------------- /benchmark/multi-raft/server/server.go: -------------------------------------------------------------------------------- 1 | package server 2 | 3 | import ( 4 | "bufio" 5 | "bytes" 6 | "encoding/binary" 7 | "fmt" 8 | "io" 9 | "net" 10 | "runtime" 11 | "strings" 12 | "sync" 13 | "time" 14 | 15 | "github.com/xkeyideal/mraft/experiment/store" 16 | ) 17 | 18 | const defaultBufferSize = 5 * 1024 19 | 20 | type SimpleServer struct { 21 | writelock *sync.Mutex 22 | 23 | tcpListener net.Listener 24 | } 25 | 26 | func NewSimpleServer(address string) (*SimpleServer, error) { 27 | l, err := net.Listen("tcp", address) 28 | if err != nil { 29 | return nil, err 30 | } 31 | 32 | ss := &SimpleServer{ 33 | writelock: &sync.Mutex{}, 34 | tcpListener: l, 35 | } 36 | 37 | go ss.TCPServer() 38 | 39 | return ss, 
nil 40 | } 41 | 42 | func (ss *SimpleServer) TCPServer() error { 43 | for { 44 | conn, err := ss.tcpListener.Accept() 45 | if err != nil { 46 | if nerr, ok := err.(net.Error); ok && nerr.Temporary() { 47 | runtime.Gosched() 48 | continue 49 | } 50 | // theres no direct way to detect this error because it is not exposed 51 | if !strings.Contains(err.Error(), "use of closed network connection") { 52 | return fmt.Errorf("listener.Accept() error - %s", err) 53 | } 54 | fmt.Println(err) 55 | break 56 | } 57 | 58 | go ss.handle(conn) 59 | } 60 | 61 | return nil 62 | } 63 | 64 | func (ss *SimpleServer) handle(conn net.Conn) error { 65 | 66 | var err error 67 | 68 | reader := bufio.NewReaderSize(conn, defaultBufferSize) 69 | writer := bufio.NewWriterSize(conn, defaultBufferSize) 70 | 71 | sz := make([]byte, 8) 72 | 73 | for { 74 | conn.SetReadDeadline(time.Now().Add(3 * time.Second)) 75 | 76 | _, err = reader.Read(sz) 77 | if err != nil { 78 | if err == io.EOF { 79 | err = nil 80 | } else { 81 | err = fmt.Errorf("failed to read datasize - %s", err) 82 | } 83 | break 84 | } 85 | 86 | dataSize := binary.LittleEndian.Uint64(sz) 87 | body := make([]byte, dataSize) 88 | 89 | _, err = io.ReadFull(reader, body) 90 | if err != nil { 91 | err = fmt.Errorf("failed to read databody - %s", err) 92 | break 93 | } 94 | 95 | attr := &store.RaftAttribute{} 96 | err = attr.Unmarshal(body) 97 | if err != nil { 98 | err = fmt.Errorf("failed to unmarshal databody - %s", err) 99 | break 100 | } 101 | 102 | ss.writelock.Lock() 103 | 104 | wd, e := encode([]byte("done")) 105 | if e != nil { 106 | fmt.Println(e) 107 | } 108 | 109 | writer.Write(wd) 110 | //fmt.Println("send:", n, e) 111 | 112 | conn.SetWriteDeadline(time.Now().Add(3 * time.Second)) 113 | writer.Flush() 114 | 115 | ss.writelock.Unlock() 116 | } 117 | 118 | return err 119 | } 120 | 121 | func (ss *SimpleServer) Stop() { 122 | ss.tcpListener.Close() 123 | } 124 | 125 | func encode(b []byte) ([]byte, error) { 126 | buf := 
&bytes.Buffer{} 127 | 128 | dataSize := make([]byte, 8) 129 | 130 | l := len(b) 131 | 132 | binary.LittleEndian.PutUint64(dataSize, uint64(l)) 133 | if _, err := buf.Write(dataSize); err != nil { 134 | return nil, err 135 | } 136 | 137 | if _, err := buf.Write(b); err != nil { 138 | return nil, err 139 | } 140 | 141 | return buf.Bytes(), nil 142 | } 143 | -------------------------------------------------------------------------------- /benchmark/multi-raft/simple-server-benchmark.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | TCPServer的压测是multi-group raft压测的前提, multi-group raft的网络协议和数据格式均与simple-server服务一致, 下述是simple-server的简单压测数据 4 | 5 | ``` 6 | 线程数:5, 每个线程连接数:5, 请求次数:1000, 平均耗时:2062.8 us/op 7 | 线程数:1, 每个线程连接数:1, 请求次数:1000, 平均耗时:99.3 us/op 8 | 线程数:10, 每个线程连接数:1, 请求次数:1000, 平均耗时:812.8 us/op 9 | 线程数:10, 每个线程连接数:4, 请求次数:1000, 平均耗时:3441.0 us/op 10 | ``` -------------------------------------------------------------------------------- /benchmark/thrift-serialize/thrift-serialize.md: -------------------------------------------------------------------------------- 1 | go test -bench=. -benchmem 2 | 3 |
4 | 5 | ``` 6 | goos: darwin 7 | goarch: amd64 8 | pkg: github.com/xkeyideal/mraft/benchmark/thrift-serialize 9 | BenchmarkMarshalByThrift-4 3000000 413 ns/op 208 B/op 6 allocs/op 10 | BenchmarkUnmarshalByThrift-4 3000000 418 ns/op 152 B/op 5 allocs/op 11 | PASS 12 | ok github.com/xkeyideal/mraft/benchmark/thrift-serialize 3.343s 13 | ``` -------------------------------------------------------------------------------- /benchmark/thrift-serialize/thrift-serialize_test.go: -------------------------------------------------------------------------------- 1 | package serialize 2 | 3 | import ( 4 | "reflect" 5 | "testing" 6 | 7 | "github.com/xkeyideal/mraft/experiment/store" 8 | ) 9 | 10 | func TestMarshalByThrift(t *testing.T) { 11 | cmd := store.NewCommand("command", "key", "value", 10000) 12 | b, err := cmd.Marshal() 13 | if err != nil { 14 | t.Fatalf("thrift marshal fatal, %+v", err) 15 | return 16 | } 17 | 18 | cmd2 := &store.Command{} 19 | err = cmd2.Unmarshal(b) 20 | if err != nil { 21 | t.Fatalf("thrift unmarshal fatal, %+v", err) 22 | return 23 | } 24 | 25 | if !reflect.DeepEqual(cmd, cmd2) { 26 | t.Fatalf("thrift unmarshal expected %v, got %v", cmd, cmd2) 27 | } 28 | 29 | t.Logf("%+v, %+v", cmd, cmd2) 30 | } 31 | 32 | func BenchmarkMarshalByThrift(b *testing.B) { 33 | cmd := store.NewCommand("command", "key", "value", 10000) 34 | 35 | for i := 0; i < b.N; i++ { 36 | cmd.Marshal() 37 | } 38 | } 39 | 40 | func BenchmarkUnmarshalByThrift(b *testing.B) { 41 | cmd := store.NewCommand("command", "key", "value", 10000) 42 | bs, err := cmd.Marshal() 43 | if err != nil { 44 | b.Fatalf("thrift marshal fatal, %+v", err) 45 | return 46 | } 47 | 48 | cmd2 := &store.Command{} 49 | b.ResetTimer() 50 | for i := 0; i < b.N; i++ { 51 | cmd2.Unmarshal(bs) 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /config/config.go: -------------------------------------------------------------------------------- 1 | package config 2 | 3 | 
// OnDiskRaftConfig bundles everything needed to boot the experimental
// on-disk multi-raft node.
type OnDiskRaftConfig struct {
	// RaftDataDir is the root directory for raft data on this machine.
	RaftDataDir string

	// RaftNodePeers maps node ID to the raft bind address of each initial member.
	RaftNodePeers map[uint64]string

	// RaftClusterIDs lists the raft groups (shards) this deployment runs.
	RaftClusterIDs []uint64
}

// NewOnDiskRaftConfig returns the hard-coded development topology used by
// the experiment: a local data directory, three initial peers, and three
// raft group IDs.
func NewOnDiskRaftConfig() *OnDiskRaftConfig {
	peers := map[uint64]string{
		10000: "10.181.20.34:11000",
		10001: "10.181.20.34:11100",
		10002: "10.181.20.34:11200",
		//10004: "10.181.20.34:11400",
	}

	return &OnDiskRaftConfig{
		RaftDataDir:    "/Users/xkey/test/mraft-ondisk1",
		RaftNodePeers:  peers,
		RaftClusterIDs: []uint64{14000, 14100, 14200},
	}
}
err := io.ReadAll(f) 55 | if err != nil { 56 | return "", err 57 | } 58 | if len(data) <= 8 { 59 | return "", errors.New("corrupted content") 60 | } 61 | crc := data[:8] 62 | content := data[8:] 63 | h := md5.New() 64 | if _, err := h.Write(content); err != nil { 65 | return "", err 66 | } 67 | if !bytes.Equal(crc, h.Sum(nil)[:8]) { 68 | return "", errors.New("corrupted content with not matched crc") 69 | } 70 | return string(content), nil 71 | } 72 | 73 | func cleanupNodeDataDir(dir string) error { 74 | os.RemoveAll(filepath.Join(dir, updatingDBFilename)) 75 | 76 | dbdir, err := getCurrentDBDirName(dir) 77 | if err != nil { 78 | return err 79 | } 80 | 81 | files, err := ioutil.ReadDir(dir) 82 | if err != nil { 83 | return err 84 | } 85 | 86 | for _, fi := range files { 87 | if !fi.IsDir() { 88 | continue 89 | } 90 | //fmt.Printf("dbdir %s, fi.name %s, dir %s\n", dbdir, fi.Name(), dir) 91 | toDelete := filepath.Join(dir, fi.Name()) 92 | if toDelete != dbdir { 93 | //fmt.Printf("removing %s\n", toDelete) 94 | if err := os.RemoveAll(toDelete); err != nil { 95 | return err 96 | } 97 | } 98 | } 99 | 100 | return nil 101 | } 102 | 103 | func replaceCurrentDBFile(dir string) error { 104 | fp := filepath.Join(dir, currentDBFilename) 105 | tmpFp := filepath.Join(dir, updatingDBFilename) 106 | if err := os.Rename(tmpFp, fp); err != nil { 107 | return err 108 | } 109 | return SyncDir(dir) 110 | } 111 | 112 | func saveCurrentDBDirName(dir string, dbdir string) error { 113 | h := md5.New() 114 | if _, err := h.Write([]byte(dbdir)); err != nil { 115 | return err 116 | } 117 | 118 | fp := filepath.Join(dir, updatingDBFilename) 119 | f, err := os.Create(fp) 120 | if err != nil { 121 | return err 122 | } 123 | 124 | defer func() { 125 | f.Close() 126 | SyncDir(dir) 127 | }() 128 | 129 | if _, err := f.Write(h.Sum(nil)[:8]); err != nil { 130 | return err 131 | } 132 | if _, err := f.Write([]byte(dbdir)); err != nil { 133 | return err 134 | } 135 | 136 | if err := f.Sync(); err != 
nil { 137 | return err 138 | } 139 | 140 | return nil 141 | } 142 | 143 | const ( 144 | // DefaultFileMode is the default file mode for files generated by 145 | // Dragonboat. 146 | DefaultFileMode = 0640 147 | defaultDirFileMode = 0750 148 | deleteFilename = "DELETED.dragonboat" 149 | ) 150 | 151 | // Exist returns whether the specified filesystem entry exists. 152 | func Exist(name string) (bool, error) { 153 | _, err := os.Stat(name) 154 | if err != nil && os.IsNotExist(err) { 155 | return false, nil 156 | } 157 | if err != nil { 158 | return false, err 159 | } 160 | return true, nil 161 | } 162 | 163 | // MkdirAll creates the specified dir along with any necessary parents. 164 | func MkdirAll(dir string) error { 165 | exist, err := Exist(dir) 166 | if err != nil { 167 | return err 168 | } 169 | if exist { 170 | return nil 171 | } 172 | parent := filepath.Dir(dir) 173 | exist, err = Exist(parent) 174 | if err != nil { 175 | return err 176 | } 177 | if !exist { 178 | if err := MkdirAll(parent); err != nil { 179 | return err 180 | } 181 | } 182 | return Mkdir(dir) 183 | } 184 | 185 | // Mkdir creates the specified dir. 186 | func Mkdir(dir string) error { 187 | if err := os.Mkdir(dir, defaultDirFileMode); err != nil { 188 | return err 189 | } 190 | return SyncDir(filepath.Dir(dir)) 191 | } 192 | 193 | // SyncDir calls fsync on the specified directory. 
// SyncDir calls fsync on the specified directory so recently created,
// renamed, or deleted entries inside it become durable. It is a no-op on
// Windows and panics if dir is not a directory.
func SyncDir(dir string) (err error) {
	if runtime.GOOS == "windows" {
		return nil
	}

	info, statErr := os.Stat(dir)
	if statErr != nil {
		return statErr
	}
	if !info.IsDir() {
		panic("not a dir")
	}

	df, openErr := os.Open(filepath.Clean(dir))
	if openErr != nil {
		return openErr
	}
	defer func() {
		// Surface the Close error only when Sync itself succeeded.
		if cerr := df.Close(); err == nil {
			err = cerr
		}
	}()

	err = df.Sync()
	return err
}
raftd.NewMRaftHandle(nh), 58 | } 59 | 60 | engine.registerMraftRouter(router) 61 | 62 | return engine 63 | } 64 | 65 | func (engine *Engine) Start() { 66 | join := false 67 | nodeAddr := "" 68 | if engine.nodeID == 10003 { 69 | join = true 70 | nodeAddr = "10.181.20.34:11300" 71 | } else if engine.nodeID == 10004 { 72 | join = true 73 | nodeAddr = "10.181.20.34:11400" 74 | } else if engine.nodeID == 10005 { 75 | join = true 76 | nodeAddr = "10.181.20.34:11500" 77 | } 78 | 79 | engine.nh.Start(engine.raftDataDir, engine.nodeID, nodeAddr, join) 80 | 81 | // 等待raft集群ready 82 | for { 83 | if engine.nh.ClusterAllReady() { 84 | break 85 | } 86 | time.Sleep(2 * time.Second) 87 | } 88 | 89 | log.Println("cluster all ready") 90 | 91 | if err := engine.server.ListenAndServe(); err != nil { 92 | panic(err.Error()) 93 | } 94 | } 95 | 96 | func (engine *Engine) Stop() { 97 | if engine.server != nil { 98 | if err := engine.server.Shutdown(context.Background()); err != nil { 99 | fmt.Println("Server Shutdown: ", err) 100 | } 101 | } 102 | 103 | engine.nh.Stop() 104 | } 105 | -------------------------------------------------------------------------------- /experiment/ondisk/engine/mraft_router.go: -------------------------------------------------------------------------------- 1 | package engine 2 | 3 | import "github.com/gin-gonic/gin" 4 | 5 | func (engine *Engine) registerMraftRouter(router *gin.Engine) { 6 | group := router.Group(engine.prefix) 7 | { 8 | group.GET("/info", engine.mraftHandle.Info) 9 | group.GET("/metrics", engine.mraftHandle.RaftMetrics) 10 | 11 | group.GET("/key", engine.mraftHandle.Query) 12 | group.POST("/key", engine.mraftHandle.Upsert) 13 | group.DELETE("/key", engine.mraftHandle.Delete) 14 | 15 | group.GET("/join", engine.mraftHandle.JoinNode) 16 | group.GET("/del", engine.mraftHandle.DelNode) 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /experiment/ondisk/fsm.go: 
-------------------------------------------------------------------------------- 1 | package ondisk 2 | 3 | import ( 4 | "encoding/binary" 5 | "errors" 6 | "fmt" 7 | "io" 8 | "os" 9 | "sync/atomic" 10 | 11 | "github.com/cockroachdb/pebble" 12 | "github.com/xkeyideal/mraft/experiment/store" 13 | 14 | sm "github.com/lni/dragonboat/v3/statemachine" 15 | ) 16 | 17 | const ( 18 | appliedIndexKey = "disk_kv_applied_index" 19 | endSignal = "mraft-end-signal" 20 | ) 21 | 22 | type DiskKV struct { 23 | clusterID uint64 24 | nodeID uint64 25 | 26 | dbIndex uint32 27 | stores []*store.Store 28 | lastApplied uint64 29 | } 30 | 31 | func NewDiskKV(cluserID uint64, nodeID uint64) sm.IOnDiskStateMachine { 32 | return &DiskKV{ 33 | clusterID: cluserID, 34 | nodeID: nodeID, 35 | stores: make([]*store.Store, 2), 36 | } 37 | } 38 | 39 | func (d *DiskKV) queryAppliedIndex() (uint64, error) { 40 | idx := atomic.LoadUint32(&d.dbIndex) 41 | 42 | return d.stores[idx].LookupAppliedIndex([]byte(appliedIndexKey)) 43 | } 44 | 45 | func (d *DiskKV) Open(stopc <-chan struct{}) (uint64, error) { 46 | select { 47 | case <-stopc: 48 | return 0, sm.ErrOpenStopped 49 | default: 50 | dir := getNodeDBDirName(d.clusterID, d.nodeID) 51 | if err := createNodeDataDir(dir); err != nil { 52 | return 0, nil 53 | } 54 | 55 | var dbdir string 56 | if !isNewRun(dir) { 57 | if err := cleanupNodeDataDir(dir); err != nil { 58 | return 0, err 59 | } 60 | var err error 61 | dbdir, err = getCurrentDBDirName(dir) 62 | if err != nil { 63 | return 0, err 64 | } 65 | if _, err := os.Stat(dbdir); err != nil { 66 | if os.IsNotExist(err) { 67 | return 0, err 68 | } 69 | } 70 | } else { 71 | dbdir = getNewRandomDBDirName(dir) 72 | if err := saveCurrentDBDirName(dir, dbdir); err != nil { 73 | return 0, err 74 | } 75 | if err := replaceCurrentDBFile(dir); err != nil { 76 | return 0, err 77 | } 78 | } 79 | 80 | store, err := store.NewStore(dbdir) 81 | if err != nil { 82 | return 0, err 83 | } 84 | 85 | d.dbIndex = 0 86 | 87 | 
// Update applies a batch of raft log entries to the active pebble store and
// reports a per-entry Result back to dragonboat. Entries at or below
// lastApplied are skipped (replay after restart); the highest applied index
// is persisted in the same batch as the data so both advance atomically.
// Calls to Update are concurrency-safe with Lookup and SaveSnapshot.
func (d *DiskKV) Update(ents []sm.Entry) ([]sm.Entry, error) {

	if len(ents) == 0 {
		return ents, nil
	}

	// The active store slot may be swapped by RecoverFromSnapshot; read it
	// atomically and use the same store for the whole batch.
	dbIndex := atomic.LoadUint32(&d.dbIndex)
	db := d.stores[dbIndex]

	batch := db.Batch()
	defer batch.Close()

	for index, entry := range ents {
		if entry.Index <= d.lastApplied {
			continue
		}

		cmd := &store.Command{}
		err := cmd.Unmarshal(entry.Cmd)
		if err != nil {
			// NOTE(review): a corrupt command is silently skipped and its
			// Result stays zero — presumably deliberate best-effort; confirm.
			continue
		}

		switch cmd.Cmd {
		case store.CommandDelete:
			batch.Delete([]byte(cmd.Key), db.GetWo())
		case store.CommandUpsert:
			batch.Set([]byte(cmd.Key), []byte(cmd.Val), db.GetWo())
		default:
		}

		// Result echoes the command length back to the proposer.
		ents[index].Result = sm.Result{Value: uint64(len(ents[index].Cmd))}
	}

	// Record the last entry's index in the same batch as the data writes.
	idx := fmt.Sprintf("%d", ents[len(ents)-1].Index)
	batch.Set([]byte(appliedIndexKey), []byte(idx), db.GetWo())

	if err := db.Write(batch); err != nil {
		return nil, err
	}

	d.lastApplied = ents[len(ents)-1].Index

	return ents, nil
}
diskKVCtx struct { 165 | store *store.Store 166 | snapshot *pebble.Snapshot 167 | } 168 | 169 | func (d *DiskKV) PrepareSnapshot() (interface{}, error) { 170 | dbIndex := atomic.LoadUint32(&d.dbIndex) 171 | store := d.stores[dbIndex] 172 | 173 | return &diskKVCtx{ 174 | store: store, 175 | snapshot: store.NewSnapshot(), 176 | }, nil 177 | } 178 | 179 | func (d *DiskKV) saveToWriter(store *store.Store, snapshot *pebble.Snapshot, w io.Writer) error { 180 | iter := snapshot.NewIter(store.GetRo()) 181 | defer iter.Close() 182 | 183 | keySize := make([]byte, 4) 184 | valSize := make([]byte, 4) 185 | for iter.First(); iter.Valid(); iter.Next() { 186 | key := iter.Key() 187 | val := iter.Value() 188 | 189 | kl := len(key) 190 | vl := len(val) 191 | 192 | binary.LittleEndian.PutUint32(keySize, uint32(kl)) 193 | if _, err := w.Write(keySize); err != nil { 194 | return err 195 | } 196 | 197 | if _, err := w.Write(key); err != nil { 198 | return err 199 | } 200 | 201 | binary.LittleEndian.PutUint32(valSize, uint32(vl)) 202 | if _, err := w.Write(valSize); err != nil { 203 | return err 204 | } 205 | 206 | if _, err := w.Write(val); err != nil { 207 | return err 208 | } 209 | } 210 | 211 | return nil 212 | } 213 | 214 | func (d *DiskKV) SaveSnapshot(ctx interface{}, w io.Writer, done <-chan struct{}) error { 215 | select { 216 | case <-done: 217 | return sm.ErrSnapshotStopped 218 | default: 219 | ctxdata := ctx.(*diskKVCtx) 220 | 221 | store := ctxdata.store 222 | ss := ctxdata.snapshot 223 | defer ss.Close() 224 | 225 | return d.saveToWriter(store, ss, w) 226 | } 227 | } 228 | 229 | // RecoverFromSnapshot 执行时,sm 的其他接口不会被同时执行 230 | func (d *DiskKV) RecoverFromSnapshot(reader io.Reader, done <-chan struct{}) error { 231 | dir := getNodeDBDirName(d.clusterID, d.nodeID) 232 | dbdir := getNewRandomDBDirName(dir) 233 | oldDirName, err := getCurrentDBDirName(dir) 234 | if err != nil { 235 | return err 236 | } 237 | 238 | store, err := store.NewStore(dbdir) 239 | if err != nil { 240 | 
return err 241 | } 242 | 243 | sz := make([]byte, 4) 244 | for { 245 | if isStop(done) { 246 | return sm.ErrSnapshotStopped 247 | } 248 | 249 | // 先读key 250 | _, err := io.ReadFull(reader, sz) // key size 251 | if err == io.EOF { 252 | break 253 | } 254 | 255 | if err != nil { 256 | return err 257 | } 258 | 259 | toRead := binary.LittleEndian.Uint64(sz) 260 | kdata := make([]byte, toRead) 261 | _, err = io.ReadFull(reader, kdata) // key data 262 | if err == io.EOF { 263 | break 264 | } 265 | if err != nil { 266 | return err 267 | } 268 | 269 | // 再读val 270 | _, err = io.ReadFull(reader, sz) // val size 271 | if err == io.EOF { 272 | break 273 | } 274 | if err != nil { 275 | return err 276 | } 277 | 278 | toRead = binary.LittleEndian.Uint64(sz) 279 | vdata := make([]byte, toRead) 280 | _, err = io.ReadFull(reader, vdata) // val data 281 | if err == io.EOF { 282 | break 283 | } 284 | if err != nil { 285 | return err 286 | } 287 | 288 | store.SetKv(kdata, vdata) 289 | } 290 | 291 | store.Flush() // db 刷盘 292 | 293 | if err := saveCurrentDBDirName(dir, dbdir); err != nil { 294 | return err 295 | } 296 | if err := replaceCurrentDBFile(dir); err != nil { 297 | return err 298 | } 299 | 300 | oldDbIndex := atomic.LoadUint32(&d.dbIndex) 301 | newDbIndex := 1 - oldDbIndex 302 | atomic.StoreUint32(&d.dbIndex, newDbIndex) 303 | d.stores[newDbIndex] = store 304 | 305 | newLastApplied, err := d.queryAppliedIndex() 306 | if err != nil { 307 | return err 308 | } 309 | 310 | d.stores[oldDbIndex].Close() 311 | 312 | d.lastApplied = newLastApplied 313 | 314 | return os.RemoveAll(oldDirName) 315 | } 316 | 317 | func (d *DiskKV) Close() error { 318 | for i := 0; i < 2; i++ { 319 | if d.stores[i] != nil { 320 | d.stores[i].Close() 321 | } 322 | } 323 | 324 | return nil 325 | } 326 | 327 | func (d *DiskKV) Sync() error { 328 | return nil 329 | } 330 | 331 | func isStop(ch <-chan struct{}) bool { 332 | select { 333 | case <-ch: 334 | return true 335 | default: 336 | return false 337 | } 
338 | } 339 | -------------------------------------------------------------------------------- /experiment/ondisk/main/app.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "os" 6 | "os/signal" 7 | "strconv" 8 | "syscall" 9 | 10 | "github.com/xkeyideal/mraft/experiment/ondisk/engine" 11 | ) 12 | 13 | // CGO_CFLAGS="-I/usr/local/include/rocksdb" CGO_LDFLAGS="-L/usr/local/lib -lrocksdb -lstdc++ -lm -lz -lbz2 -lsnappy -llz4" go run app.go 10000 9800 14 | func main() { 15 | if len(os.Args) <= 2 { 16 | fmt.Println("input arg $1 nodeID, arg $2 port") 17 | os.Exit(1) 18 | } 19 | 20 | nodeID, err := strconv.ParseUint(os.Args[1], 10, 64) 21 | if err != nil { 22 | fmt.Println(err.Error()) 23 | os.Exit(1) 24 | } 25 | 26 | port := os.Args[2] 27 | 28 | engine := engine.NewEngine(nodeID, port) 29 | 30 | go engine.Start() 31 | 32 | signals := make(chan os.Signal, 1) 33 | signal.Notify(signals, syscall.SIGINT, syscall.SIGTERM, syscall.SIGKILL) 34 | <-signals 35 | 36 | engine.Stop() 37 | } 38 | -------------------------------------------------------------------------------- /experiment/ondisk/metrics.go: -------------------------------------------------------------------------------- 1 | package ondisk 2 | 3 | import "github.com/rcrowley/go-metrics" 4 | 5 | type ondiskMetrics struct { 6 | total metrics.Counter 7 | err metrics.Counter 8 | } 9 | 10 | func newOndiskMetrics() *ondiskMetrics { 11 | return &ondiskMetrics{ 12 | total: metrics.NewCounter(), 13 | err: metrics.NewCounter(), 14 | } 15 | } 16 | 17 | func (m *ondiskMetrics) add(delta int64, err bool) { 18 | m.total.Inc(delta) 19 | 20 | if err { 21 | m.err.Inc(delta) 22 | } 23 | } 24 | 25 | func (m *ondiskMetrics) clear() { 26 | m.total.Clear() 27 | m.err.Clear() 28 | } 29 | -------------------------------------------------------------------------------- /experiment/ondisk/raftd/mraft.go: 
-------------------------------------------------------------------------------- 1 | package raftd 2 | 3 | import ( 4 | "encoding/json" 5 | "io/ioutil" 6 | "net/http" 7 | "strconv" 8 | 9 | "github.com/xkeyideal/mraft/experiment/ondisk" 10 | "github.com/xkeyideal/mraft/experiment/store" 11 | 12 | "github.com/gin-gonic/gin" 13 | ) 14 | 15 | type MRaftHandle struct { 16 | raft *ondisk.OnDiskRaft 17 | } 18 | 19 | func NewMRaftHandle(raft *ondisk.OnDiskRaft) *MRaftHandle { 20 | return &MRaftHandle{ 21 | raft: raft, 22 | } 23 | } 24 | 25 | func (mh *MRaftHandle) Info(c *gin.Context) { 26 | SetStrResp(http.StatusOK, 0, "", mh.raft.Info(), c) 27 | } 28 | 29 | func (mh *MRaftHandle) Query(c *gin.Context) { 30 | key := c.Query("key") 31 | sync := c.Query("sync") 32 | hashKey, err := strconv.ParseUint(c.Query("hashKey"), 10, 64) 33 | if err != nil { 34 | SetStrResp(http.StatusBadRequest, -1, err.Error(), "", c) 35 | return 36 | } 37 | 38 | if sync == "true" { 39 | val, err := mh.raft.SyncRead(key, hashKey) 40 | if err != nil { 41 | SetStrResp(http.StatusBadRequest, -1, err.Error(), "", c) 42 | return 43 | } 44 | SetStrResp(http.StatusOK, 0, "", val, c) 45 | return 46 | } 47 | 48 | val, err := mh.raft.ReadLocal(key, hashKey) 49 | if err != nil { 50 | SetStrResp(http.StatusBadRequest, -1, err.Error(), "", c) 51 | return 52 | } 53 | 54 | SetStrResp(http.StatusOK, 0, "", val, c) 55 | } 56 | 57 | func (mh *MRaftHandle) Upsert(c *gin.Context) { 58 | bytes, err := ioutil.ReadAll(c.Request.Body) 59 | if err != nil { 60 | SetStrResp(http.StatusBadRequest, -1, err.Error(), "", c) 61 | return 62 | } 63 | 64 | attr := &store.RaftAttribute{} 65 | err = json.Unmarshal(bytes, attr) 66 | if err != nil { 67 | SetStrResp(http.StatusBadRequest, -1, err.Error(), "", c) 68 | return 69 | } 70 | 71 | cmd, err := attr.GenerateCommand(store.CommandUpsert) 72 | if err != nil { 73 | SetStrResp(http.StatusBadRequest, -1, err.Error(), "", c) 74 | return 75 | } 76 | 77 | mh.raft.Write(cmd) 78 | 79 | 
SetStrResp(http.StatusOK, 0, "", "OK", c) 80 | } 81 | 82 | func (mh *MRaftHandle) Delete(c *gin.Context) { 83 | bytes, err := ioutil.ReadAll(c.Request.Body) 84 | if err != nil { 85 | SetStrResp(http.StatusBadRequest, -1, err.Error(), "", c) 86 | return 87 | } 88 | 89 | attr := &store.RaftAttribute{} 90 | err = json.Unmarshal(bytes, attr) 91 | if err != nil { 92 | SetStrResp(http.StatusBadRequest, -1, err.Error(), "", c) 93 | return 94 | } 95 | 96 | cmd, err := attr.GenerateCommand(store.CommandDelete) 97 | if err != nil { 98 | SetStrResp(http.StatusBadRequest, -1, err.Error(), "", c) 99 | return 100 | } 101 | 102 | mh.raft.Write(cmd) 103 | 104 | SetStrResp(http.StatusOK, 0, "", "OK", c) 105 | } 106 | 107 | func (mh *MRaftHandle) JoinNode(c *gin.Context) { 108 | nodeID, err := strconv.ParseUint(c.Query("nodeID"), 10, 64) 109 | if err != nil { 110 | SetStrResp(http.StatusBadRequest, -1, err.Error(), "", c) 111 | return 112 | } 113 | 114 | nodeAddr := c.Query("nodeAddr") 115 | 116 | err = mh.raft.RaftAddNode(nodeID, nodeAddr) 117 | if err != nil { 118 | SetStrResp(http.StatusBadRequest, -1, err.Error(), "", c) 119 | return 120 | } 121 | SetStrResp(http.StatusOK, 0, "", "OK", c) 122 | } 123 | 124 | func (mh *MRaftHandle) DelNode(c *gin.Context) { 125 | nodeID, err := strconv.ParseUint(c.Query("nodeID"), 10, 64) 126 | if err != nil { 127 | SetStrResp(http.StatusBadRequest, -1, err.Error(), "", c) 128 | return 129 | } 130 | 131 | err = mh.raft.RaftRemoveNode(nodeID) 132 | if err != nil { 133 | SetStrResp(http.StatusBadRequest, -1, err.Error(), "", c) 134 | return 135 | } 136 | SetStrResp(http.StatusOK, 0, "", "OK", c) 137 | } 138 | 139 | func (mh *MRaftHandle) RaftMetrics(c *gin.Context) { 140 | SetStrResp(http.StatusOK, 0, "", mh.raft.MetricsInfo(), c) 141 | } 142 | 143 | func SetStrResp(httpCode, code int, msg string, result interface{}, c *gin.Context) { 144 | 145 | m := msg 146 | 147 | if code == 0 { 148 | c.JSON(httpCode, gin.H{ 149 | "code": code, 150 | "msg": m, 
151 | "result": result, 152 | }) 153 | } else { 154 | c.JSON(httpCode, gin.H{ 155 | "code": code, 156 | "msg": m, 157 | }) 158 | } 159 | } 160 | -------------------------------------------------------------------------------- /experiment/simpleondisk/db.go: -------------------------------------------------------------------------------- 1 | package simpleondisk 2 | 3 | import ( 4 | "bytes" 5 | "crypto/md5" 6 | "errors" 7 | "fmt" 8 | "io/ioutil" 9 | "math/rand" 10 | "os" 11 | "path/filepath" 12 | "runtime" 13 | "time" 14 | ) 15 | 16 | const ( 17 | mraftDBDirName string = "/Volumes/ST1000/mraft-simplerocksdb" 18 | currentDBFilename string = "current" 19 | updatingDBFilename string = "current.updating" 20 | ) 21 | 22 | func isNewRun(dir string) bool { 23 | fp := filepath.Join(dir, currentDBFilename) 24 | if _, err := os.Stat(fp); os.IsNotExist(err) { 25 | return true 26 | } 27 | return false 28 | } 29 | 30 | func getNodeDBDirName(clusterID uint64, nodeID uint64) string { 31 | return filepath.Join(mraftDBDirName, fmt.Sprintf("%d_%d", clusterID, nodeID)) 32 | } 33 | 34 | func getNewRandomDBDirName(dir string) string { 35 | part := "%d_%d" 36 | rn := rand.Uint64() 37 | ct := time.Now().UnixNano() 38 | return filepath.Join(dir, fmt.Sprintf(part, rn, ct)) 39 | } 40 | 41 | func createNodeDataDir(dir string) error { 42 | return os.MkdirAll(dir, 0755) 43 | } 44 | 45 | func getCurrentDBDirName(dir string) (string, error) { 46 | fp := filepath.Join(dir, currentDBFilename) 47 | f, err := os.OpenFile(fp, os.O_RDONLY, 0755) 48 | if err != nil { 49 | return "", err 50 | } 51 | defer f.Close() 52 | 53 | data, err := ioutil.ReadAll(f) 54 | if err != nil { 55 | return "", err 56 | } 57 | if len(data) <= 8 { 58 | return "", errors.New("corrupted content") 59 | } 60 | crc := data[:8] 61 | content := data[8:] 62 | h := md5.New() 63 | if _, err := h.Write(content); err != nil { 64 | return "", err 65 | } 66 | if !bytes.Equal(crc, h.Sum(nil)[:8]) { 67 | return "", errors.New("corrupted 
content with not matched crc") 68 | } 69 | return string(content), nil 70 | } 71 | 72 | func cleanupNodeDataDir(dir string) error { 73 | os.RemoveAll(filepath.Join(dir, updatingDBFilename)) 74 | 75 | dbdir, err := getCurrentDBDirName(dir) 76 | if err != nil { 77 | return err 78 | } 79 | 80 | files, err := ioutil.ReadDir(dir) 81 | if err != nil { 82 | return err 83 | } 84 | 85 | for _, fi := range files { 86 | if !fi.IsDir() { 87 | continue 88 | } 89 | //fmt.Printf("dbdir %s, fi.name %s, dir %s\n", dbdir, fi.Name(), dir) 90 | toDelete := filepath.Join(dir, fi.Name()) 91 | if toDelete != dbdir { 92 | //fmt.Printf("removing %s\n", toDelete) 93 | if err := os.RemoveAll(toDelete); err != nil { 94 | return err 95 | } 96 | } 97 | } 98 | 99 | return nil 100 | } 101 | 102 | func replaceCurrentDBFile(dir string) error { 103 | fp := filepath.Join(dir, currentDBFilename) 104 | tmpFp := filepath.Join(dir, updatingDBFilename) 105 | if err := os.Rename(tmpFp, fp); err != nil { 106 | return err 107 | } 108 | return SyncDir(dir) 109 | } 110 | 111 | func saveCurrentDBDirName(dir string, dbdir string) error { 112 | h := md5.New() 113 | if _, err := h.Write([]byte(dbdir)); err != nil { 114 | return err 115 | } 116 | 117 | fp := filepath.Join(dir, updatingDBFilename) 118 | f, err := os.Create(fp) 119 | if err != nil { 120 | return err 121 | } 122 | 123 | defer func() { 124 | f.Close() 125 | SyncDir(dir) 126 | }() 127 | 128 | if _, err := f.Write(h.Sum(nil)[:8]); err != nil { 129 | return err 130 | } 131 | if _, err := f.Write([]byte(dbdir)); err != nil { 132 | return err 133 | } 134 | 135 | if err := f.Sync(); err != nil { 136 | return err 137 | } 138 | 139 | return nil 140 | } 141 | 142 | const ( 143 | // DefaultFileMode is the default file mode for files generated by 144 | // Dragonboat. 145 | DefaultFileMode = 0640 146 | defaultDirFileMode = 0750 147 | deleteFilename = "DELETED.dragonboat" 148 | ) 149 | 150 | // Exist returns whether the specified filesystem entry exists. 
151 | func Exist(name string) (bool, error) { 152 | _, err := os.Stat(name) 153 | if err != nil && os.IsNotExist(err) { 154 | return false, nil 155 | } 156 | if err != nil { 157 | return false, err 158 | } 159 | return true, nil 160 | } 161 | 162 | // MkdirAll creates the specified dir along with any necessary parents. 163 | func MkdirAll(dir string) error { 164 | exist, err := Exist(dir) 165 | if err != nil { 166 | return err 167 | } 168 | if exist { 169 | return nil 170 | } 171 | parent := filepath.Dir(dir) 172 | exist, err = Exist(parent) 173 | if err != nil { 174 | return err 175 | } 176 | if !exist { 177 | if err := MkdirAll(parent); err != nil { 178 | return err 179 | } 180 | } 181 | return Mkdir(dir) 182 | } 183 | 184 | // Mkdir creates the specified dir. 185 | func Mkdir(dir string) error { 186 | if err := os.Mkdir(dir, defaultDirFileMode); err != nil { 187 | return err 188 | } 189 | return SyncDir(filepath.Dir(dir)) 190 | } 191 | 192 | // SyncDir calls fsync on the specified directory. 
193 | func SyncDir(dir string) (err error) { 194 | if runtime.GOOS == "windows" { 195 | return nil 196 | } 197 | fileInfo, err := os.Stat(dir) 198 | if err != nil { 199 | return err 200 | } 201 | if !fileInfo.IsDir() { 202 | panic("not a dir") 203 | } 204 | df, err := os.Open(filepath.Clean(dir)) 205 | if err != nil { 206 | return err 207 | } 208 | defer func() { 209 | if cerr := df.Close(); err == nil { 210 | err = cerr 211 | } 212 | }() 213 | return df.Sync() 214 | } 215 | -------------------------------------------------------------------------------- /experiment/simpleondisk/fsm.go: -------------------------------------------------------------------------------- 1 | package simpleondisk 2 | 3 | import ( 4 | "encoding/binary" 5 | "encoding/json" 6 | "errors" 7 | "fmt" 8 | "io" 9 | "os" 10 | "strconv" 11 | "sync/atomic" 12 | 13 | "github.com/cockroachdb/pebble" 14 | "github.com/xkeyideal/mraft/experiment/store" 15 | 16 | sm "github.com/lni/dragonboat/v3/statemachine" 17 | ) 18 | 19 | const ( 20 | appliedIndexKey = "disk_kv_applied_index" 21 | endSignal = "mraft-end-signal" 22 | ) 23 | 24 | type kv struct { 25 | Key string `json:"key"` 26 | Val int `json:"val"` 27 | } 28 | 29 | type SimpleDiskKV struct { 30 | clusterID uint64 31 | nodeID uint64 32 | 33 | dbIndex uint32 34 | stores []*store.Store 35 | lastApplied uint64 36 | } 37 | 38 | func NewSimpleDiskKV(cluserID uint64, nodeID uint64) sm.IOnDiskStateMachine { 39 | return &SimpleDiskKV{ 40 | clusterID: cluserID, 41 | nodeID: nodeID, 42 | stores: make([]*store.Store, 2), 43 | } 44 | } 45 | 46 | func (d *SimpleDiskKV) queryAppliedIndex() (uint64, error) { 47 | idx := atomic.LoadUint32(&d.dbIndex) 48 | 49 | return d.stores[idx].LookupAppliedIndex([]byte(appliedIndexKey)) 50 | } 51 | 52 | func (d *SimpleDiskKV) Open(stopc <-chan struct{}) (uint64, error) { 53 | select { 54 | case <-stopc: 55 | return 0, sm.ErrOpenStopped 56 | default: 57 | dir := getNodeDBDirName(d.clusterID, d.nodeID) 58 | if err := 
createNodeDataDir(dir); err != nil { 59 | return 0, nil 60 | } 61 | 62 | var dbdir string 63 | if !isNewRun(dir) { 64 | if err := cleanupNodeDataDir(dir); err != nil { 65 | return 0, err 66 | } 67 | var err error 68 | dbdir, err = getCurrentDBDirName(dir) 69 | if err != nil { 70 | return 0, err 71 | } 72 | if _, err := os.Stat(dbdir); err != nil { 73 | if os.IsNotExist(err) { 74 | return 0, err 75 | } 76 | } 77 | } else { 78 | dbdir = getNewRandomDBDirName(dir) 79 | if err := saveCurrentDBDirName(dir, dbdir); err != nil { 80 | return 0, err 81 | } 82 | if err := replaceCurrentDBFile(dir); err != nil { 83 | return 0, err 84 | } 85 | } 86 | 87 | store, err := store.NewStore(dbdir) 88 | if err != nil { 89 | return 0, err 90 | } 91 | 92 | d.dbIndex = 0 93 | 94 | d.stores[d.dbIndex] = store 95 | appliedIndex, err := d.queryAppliedIndex() 96 | if err != nil { 97 | return 0, err 98 | } 99 | 100 | d.lastApplied = appliedIndex 101 | 102 | return appliedIndex, nil 103 | } 104 | } 105 | 106 | // Update 与 LookUp, SaveSnapshot的调用是并发安全的 107 | func (d *SimpleDiskKV) Update(ents []sm.Entry) ([]sm.Entry, error) { 108 | 109 | fmt.Println("SimpleDiskKV Entry length: ", len(ents)) 110 | 111 | if len(ents) == 0 { 112 | return ents, nil 113 | } 114 | 115 | dbIndex := atomic.LoadUint32(&d.dbIndex) 116 | 117 | for index, entry := range ents { 118 | if entry.Index <= d.lastApplied { 119 | continue 120 | } 121 | 122 | data := &kv{} 123 | json.Unmarshal(entry.Cmd, data) 124 | 125 | oldVal, err := d.NALookup([]byte(data.Key)) 126 | 127 | if err != nil { 128 | d.stores[dbIndex].SetKv([]byte(data.Key), []byte(strconv.Itoa(data.Val))) 129 | } else { 130 | v, err := strconv.ParseInt(string(oldVal), 10, 32) 131 | if err != nil { 132 | fmt.Printf("%s ParseInt %s", string(oldVal), err.Error()) 133 | continue 134 | } 135 | 136 | d.stores[dbIndex].SetKv([]byte(data.Key), []byte(strconv.Itoa(data.Val+int(v)))) 137 | } 138 | 139 | ents[index].Result = sm.Result{Value: uint64(len(ents[index].Cmd))} 140 | 
} 141 | 142 | idx := fmt.Sprintf("%d", ents[len(ents)-1].Index) 143 | d.stores[dbIndex].SetKv([]byte(appliedIndexKey), []byte(idx)) 144 | 145 | d.lastApplied = ents[len(ents)-1].Index 146 | 147 | return ents, nil 148 | } 149 | 150 | // Lookup 与 Update and RecoverFromSnapshot 是并发安全的 151 | func (d *SimpleDiskKV) Lookup(key interface{}) (interface{}, error) { 152 | dbIndex := atomic.LoadUint32(&d.dbIndex) 153 | if d.stores[dbIndex] != nil { 154 | v, err := d.stores[dbIndex].NALookup(key.([]byte)) 155 | return v, err 156 | } 157 | return nil, errors.New("db is nil") 158 | } 159 | 160 | func (d *SimpleDiskKV) NALookup(key []byte) ([]byte, error) { 161 | dbIndex := atomic.LoadUint32(&d.dbIndex) 162 | if d.stores[dbIndex] != nil { 163 | return d.stores[dbIndex].NALookup(key) 164 | } 165 | return nil, errors.New("db is nil") 166 | } 167 | 168 | type diskKVCtx struct { 169 | store *store.Store 170 | snapshot *pebble.Snapshot 171 | } 172 | 173 | func (d *SimpleDiskKV) PrepareSnapshot() (interface{}, error) { 174 | dbIndex := atomic.LoadUint32(&d.dbIndex) 175 | store := d.stores[dbIndex] 176 | 177 | return &diskKVCtx{ 178 | store: store, 179 | snapshot: store.NewSnapshot(), 180 | }, nil 181 | } 182 | 183 | func (d *SimpleDiskKV) saveToWriter(store *store.Store, snapshot *pebble.Snapshot, w io.Writer) error { 184 | iter := snapshot.NewIter(store.GetRo()) 185 | defer iter.Close() 186 | 187 | keySize := make([]byte, 4) 188 | valSize := make([]byte, 4) 189 | for iter.First(); iter.Valid(); iter.Next() { 190 | key := iter.Key() 191 | val := iter.Value() 192 | 193 | kl := len(key) 194 | vl := len(val) 195 | 196 | binary.LittleEndian.PutUint32(keySize, uint32(kl)) 197 | if _, err := w.Write(keySize); err != nil { 198 | return err 199 | } 200 | 201 | if _, err := w.Write(key); err != nil { 202 | return err 203 | } 204 | 205 | binary.LittleEndian.PutUint32(valSize, uint32(vl)) 206 | if _, err := w.Write(valSize); err != nil { 207 | return err 208 | } 209 | 210 | if _, err := 
w.Write(val); err != nil { 211 | return err 212 | } 213 | } 214 | 215 | return nil 216 | } 217 | 218 | func (d *SimpleDiskKV) SaveSnapshot(ctx interface{}, w io.Writer, done <-chan struct{}) error { 219 | select { 220 | case <-done: 221 | return sm.ErrSnapshotStopped 222 | default: 223 | ctxdata := ctx.(*diskKVCtx) 224 | 225 | store := ctxdata.store 226 | ss := ctxdata.snapshot 227 | 228 | defer ss.Close() 229 | 230 | return d.saveToWriter(store, ss, w) 231 | } 232 | } 233 | 234 | // RecoverFromSnapshot 执行时,sm 的其他接口不会被同时执行 235 | func (d *SimpleDiskKV) RecoverFromSnapshot(reader io.Reader, done <-chan struct{}) error { 236 | dir := getNodeDBDirName(d.clusterID, d.nodeID) 237 | dbdir := getNewRandomDBDirName(dir) 238 | oldDirName, err := getCurrentDBDirName(dir) 239 | if err != nil { 240 | return err 241 | } 242 | 243 | store, err := store.NewStore(dbdir) 244 | if err != nil { 245 | return err 246 | } 247 | 248 | sz := make([]byte, 4) 249 | for { 250 | if isStop(done) { 251 | return sm.ErrSnapshotStopped 252 | } 253 | 254 | // 先读key 255 | _, err := io.ReadFull(reader, sz) // key size 256 | if err == io.EOF { 257 | break 258 | } 259 | 260 | if err != nil { 261 | return err 262 | } 263 | 264 | toRead := binary.LittleEndian.Uint64(sz) 265 | kdata := make([]byte, toRead) 266 | _, err = io.ReadFull(reader, kdata) // key data 267 | if err == io.EOF { 268 | break 269 | } 270 | if err != nil { 271 | return err 272 | } 273 | 274 | // 再读val 275 | _, err = io.ReadFull(reader, sz) // val size 276 | if err == io.EOF { 277 | break 278 | } 279 | if err != nil { 280 | return err 281 | } 282 | 283 | toRead = binary.LittleEndian.Uint64(sz) 284 | vdata := make([]byte, toRead) 285 | _, err = io.ReadFull(reader, vdata) // val data 286 | if err == io.EOF { 287 | break 288 | } 289 | if err != nil { 290 | return err 291 | } 292 | 293 | store.SetKv(kdata, vdata) 294 | } 295 | 296 | store.Flush() // db 刷盘 297 | 298 | if err := saveCurrentDBDirName(dir, dbdir); err != nil { 299 | return err 
300 | } 301 | if err := replaceCurrentDBFile(dir); err != nil { 302 | return err 303 | } 304 | 305 | oldDbIndex := atomic.LoadUint32(&d.dbIndex) 306 | newDbIndex := 1 - oldDbIndex 307 | atomic.StoreUint32(&d.dbIndex, newDbIndex) 308 | d.stores[newDbIndex] = store 309 | 310 | newLastApplied, err := d.queryAppliedIndex() 311 | if err != nil { 312 | return err 313 | } 314 | 315 | d.stores[oldDbIndex].Close() 316 | 317 | d.lastApplied = newLastApplied 318 | 319 | return os.RemoveAll(oldDirName) 320 | } 321 | 322 | func (d *SimpleDiskKV) Close() error { 323 | for i := 0; i < 2; i++ { 324 | if d.stores[i] != nil { 325 | d.stores[i].Close() 326 | } 327 | } 328 | 329 | return nil 330 | } 331 | 332 | func (d *SimpleDiskKV) Sync() error { 333 | return nil 334 | } 335 | 336 | func isStop(ch <-chan struct{}) bool { 337 | select { 338 | case <-ch: 339 | return true 340 | default: 341 | return false 342 | } 343 | } 344 | -------------------------------------------------------------------------------- /experiment/simpleondisk/httpengine/engine.go: -------------------------------------------------------------------------------- 1 | package httpengine 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "net/http" 7 | "strconv" 8 | "time" 9 | 10 | "github.com/gin-gonic/gin" 11 | "github.com/xkeyideal/mraft/experiment/simpleondisk" 12 | ) 13 | 14 | var ( 15 | RaftDataDir = "/Volumes/ST1000/mraft-simpleondisk" 16 | RaftNodePeers = map[uint64]string{ 17 | 10000: "10.101.44.4:34000", 18 | 10001: "10.101.44.4:34100", 19 | 10002: "10.101.44.4:34200", 20 | } 21 | RaftClusterIDs = []uint64{234000, 234100, 234200} 22 | ) 23 | 24 | type Engine struct { 25 | nodeID uint64 26 | raftDataDir string 27 | 28 | server *http.Server 29 | router *gin.Engine 30 | 31 | nh *simpleondisk.SimpleOnDiskRaft 32 | } 33 | 34 | func NewEngine(nodeID uint64, port string) *Engine { 35 | 36 | router := gin.New() 37 | router.Use(gin.Recovery()) 38 | 39 | nh := simpleondisk.NewSimpleOnDiskRaft(RaftNodePeers, RaftClusterIDs) 
40 | 41 | engine := &Engine{ 42 | nodeID: nodeID, 43 | raftDataDir: RaftDataDir, 44 | router: router, 45 | server: &http.Server{ 46 | Addr: fmt.Sprintf("0.0.0.0:%s", port), //"9080" 47 | Handler: router, 48 | ReadTimeout: 20 * time.Second, 49 | WriteTimeout: 40 * time.Second, 50 | }, 51 | nh: nh, 52 | } 53 | 54 | engine.router.GET("/msimpleraft/key", engine.Query) 55 | engine.router.POST("/msimpleraft/key", engine.Upsert) 56 | 57 | return engine 58 | } 59 | 60 | func (engine *Engine) Start() { 61 | 62 | engine.nh.Start(engine.raftDataDir, engine.nodeID, "", false) 63 | 64 | if err := engine.server.ListenAndServe(); err != nil { 65 | panic(err.Error()) 66 | } 67 | } 68 | 69 | func (engine *Engine) Stop() { 70 | if engine.server != nil { 71 | if err := engine.server.Shutdown(context.Background()); err != nil { 72 | fmt.Println("Server Shutdown: ", err) 73 | } 74 | } 75 | 76 | engine.nh.Stop() 77 | } 78 | 79 | func (engine *Engine) Query(c *gin.Context) { 80 | key := c.Query("key") 81 | hashKey, err := strconv.ParseUint(c.Query("hashKey"), 10, 64) 82 | if err != nil { 83 | SetStrResp(http.StatusBadRequest, -1, err.Error(), "", c) 84 | return 85 | } 86 | 87 | val, err := engine.nh.SyncRead(key, hashKey) 88 | if err != nil { 89 | SetStrResp(http.StatusBadRequest, -1, err.Error(), "", c) 90 | return 91 | } 92 | SetStrResp(http.StatusOK, 0, "", string(val), c) 93 | } 94 | 95 | func (engine *Engine) Upsert(c *gin.Context) { 96 | key := c.Query("key") 97 | hashKey, err := strconv.ParseUint(c.Query("hashKey"), 10, 64) 98 | if err != nil { 99 | SetStrResp(http.StatusBadRequest, -1, err.Error(), "", c) 100 | return 101 | } 102 | 103 | val, err := strconv.ParseUint(c.Query("val"), 10, 64) 104 | if err != nil { 105 | SetStrResp(http.StatusBadRequest, -1, err.Error(), "", c) 106 | return 107 | } 108 | 109 | engine.nh.Write(key, hashKey, int(val)) 110 | 111 | SetStrResp(http.StatusOK, 0, "", "OK", c) 112 | } 113 | 114 | func SetStrResp(httpCode, code int, msg string, result 
interface{}, c *gin.Context) { 115 | 116 | m := msg 117 | 118 | if code == 0 { 119 | c.JSON(httpCode, gin.H{ 120 | "code": code, 121 | "msg": m, 122 | "result": result, 123 | }) 124 | } else { 125 | c.JSON(httpCode, gin.H{ 126 | "code": code, 127 | "msg": m, 128 | }) 129 | } 130 | } 131 | -------------------------------------------------------------------------------- /experiment/simpleondisk/main/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "os" 6 | "os/signal" 7 | "strconv" 8 | "syscall" 9 | 10 | "github.com/xkeyideal/mraft/experiment/simpleondisk/httpengine" 11 | ) 12 | 13 | // CGO_CFLAGS="-I/usr/local/include/rocksdb" CGO_LDFLAGS="-L/usr/local/lib -lrocksdb -lstdc++ -lm -lz -lbz2 -lsnappy -llz4" go run app.go 10000 9800 14 | func main() { 15 | if len(os.Args) <= 2 { 16 | fmt.Println("input arg $1 nodeID, arg $2 port") 17 | os.Exit(1) 18 | } 19 | 20 | nodeID, err := strconv.ParseUint(os.Args[1], 10, 64) 21 | if err != nil { 22 | fmt.Println(err.Error()) 23 | os.Exit(1) 24 | } 25 | 26 | port := os.Args[2] 27 | 28 | engine := httpengine.NewEngine(nodeID, port) 29 | 30 | go engine.Start() 31 | 32 | signals := make(chan os.Signal, 1) 33 | signal.Notify(signals, syscall.SIGINT, syscall.SIGTERM, syscall.SIGKILL) 34 | <-signals 35 | 36 | engine.Stop() 37 | } 38 | -------------------------------------------------------------------------------- /experiment/simpleondisk/ondisk.go: -------------------------------------------------------------------------------- 1 | package simpleondisk 2 | 3 | import ( 4 | "context" 5 | "encoding/json" 6 | "fmt" 7 | "path/filepath" 8 | "sync" 9 | "time" 10 | 11 | "github.com/lni/dragonboat/v3" 12 | "github.com/lni/dragonboat/v3/client" 13 | "github.com/lni/dragonboat/v3/config" 14 | "github.com/lni/dragonboat/v3/logger" 15 | ) 16 | 17 | type SimpleOnDiskRaft struct { 18 | RaftNodePeers map[uint64]string // mraft节点地址 19 | RaftClusterIDs []uint64 20 
| 21 | nodehost *dragonboat.NodeHost 22 | clusterSession map[uint64]*client.Session 23 | lock sync.RWMutex 24 | } 25 | 26 | func NewSimpleOnDiskRaft(peers map[uint64]string, clusterIDs []uint64) *SimpleOnDiskRaft { 27 | 28 | dr := &SimpleOnDiskRaft{ 29 | RaftNodePeers: peers, 30 | RaftClusterIDs: clusterIDs, 31 | clusterSession: make(map[uint64]*client.Session), 32 | lock: sync.RWMutex{}, 33 | } 34 | 35 | return dr 36 | } 37 | 38 | func (disk *SimpleOnDiskRaft) Start(raftDataDir string, nodeID uint64, nodeAddr string, join bool) error { 39 | 40 | datadir := filepath.Join(raftDataDir, fmt.Sprintf("node%d", nodeID)) 41 | 42 | logger.GetLogger("raft").SetLevel(logger.ERROR) 43 | logger.GetLogger("rsm").SetLevel(logger.WARNING) 44 | logger.GetLogger("transport").SetLevel(logger.WARNING) 45 | logger.GetLogger("grpc").SetLevel(logger.WARNING) 46 | logger.GetLogger("dragonboat").SetLevel(logger.WARNING) 47 | logger.GetLogger("logdb").SetLevel(logger.WARNING) 48 | 49 | raftAddress := disk.RaftNodePeers[nodeID] 50 | peers := disk.RaftNodePeers 51 | if join { 52 | raftAddress = nodeAddr 53 | peers = make(map[uint64]string) 54 | } 55 | 56 | nhc := config.NodeHostConfig{ 57 | DeploymentID: 20, 58 | WALDir: datadir, 59 | NodeHostDir: datadir, 60 | RTTMillisecond: 100, 61 | RaftAddress: raftAddress, 62 | } 63 | 64 | nh, err := dragonboat.NewNodeHost(nhc) 65 | if err != nil { 66 | return err 67 | } 68 | 69 | disk.nodehost = nh 70 | 71 | for _, clusterID := range disk.RaftClusterIDs { 72 | rc := config.Config{ 73 | NodeID: nodeID, 74 | ClusterID: clusterID, 75 | ElectionRTT: 10, 76 | HeartbeatRTT: 1, 77 | CheckQuorum: true, 78 | SnapshotEntries: 1000, 79 | CompactionOverhead: 100, 80 | } 81 | 82 | if err := nh.StartOnDiskCluster(peers, join, NewSimpleDiskKV, rc); err != nil { 83 | panic(err) 84 | } 85 | 86 | disk.clusterSession[clusterID] = disk.nodehost.GetNoOPSession(clusterID) 87 | } 88 | 89 | return nil 90 | } 91 | 92 | func (disk *SimpleOnDiskRaft) Write(key string, hashKey 
uint64, value int) error { 93 | idx := hashKey % uint64(len(disk.RaftClusterIDs)) 94 | clusterID := disk.RaftClusterIDs[idx] 95 | cs := disk.clusterSession[clusterID] 96 | ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second) 97 | 98 | d := kv{key, value} 99 | b, _ := json.Marshal(d) 100 | 101 | _, err := disk.nodehost.SyncPropose(ctx, cs, b) 102 | 103 | cancel() 104 | return err 105 | } 106 | 107 | // SyncRead 线性读 108 | func (disk *SimpleOnDiskRaft) SyncRead(key string, hashKey uint64) ([]byte, error) { 109 | idx := hashKey % uint64(len(disk.RaftClusterIDs)) 110 | clusterID := disk.RaftClusterIDs[idx] 111 | ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second) 112 | result, err := disk.nodehost.SyncRead(ctx, clusterID, []byte(key)) 113 | cancel() 114 | 115 | if err != nil { 116 | return nil, err 117 | } 118 | 119 | return result.([]byte), nil 120 | } 121 | 122 | // ReadLocal 读本地 123 | func (disk *SimpleOnDiskRaft) ReadLocal(key string, hashKey uint64) ([]byte, error) { 124 | idx := hashKey % uint64(len(disk.RaftClusterIDs)) 125 | clusterID := disk.RaftClusterIDs[idx] 126 | result, err := disk.nodehost.StaleRead(clusterID, []byte(key)) 127 | 128 | if err != nil { 129 | return nil, err 130 | } 131 | 132 | return result.([]byte), nil 133 | } 134 | 135 | func (disk *SimpleOnDiskRaft) Stop() { 136 | disk.nodehost.Stop() 137 | 138 | disk.clusterSession = make(map[uint64]*client.Session) 139 | } 140 | 141 | // Info 查询NodeHostInfo 142 | func (disk *SimpleOnDiskRaft) Info() *dragonboat.NodeHostInfo { 143 | return disk.nodehost.GetNodeHostInfo(dragonboat.NodeHostInfoOption{SkipLogInfo: false}) 144 | } 145 | -------------------------------------------------------------------------------- /experiment/simpleondisk/test/test.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "strconv" 6 | "sync" 7 | "time" 8 | 9 | "github.com/xkeyideal/gokit/httpkit" 10 | ) 11 | 
12 | const ( 13 | HttpConnTimeout = 3 * time.Second 14 | HttpReadWriteTimeout = 5 * time.Second 15 | HttpRetry = 0 16 | HttpRetryInterval = 2 * time.Second 17 | ) 18 | 19 | func doHttp(addr string, val int, wg *sync.WaitGroup) { 20 | 21 | defer wg.Done() 22 | 23 | client := httpkit.NewHttpClient(HttpReadWriteTimeout, HttpRetry, HttpRetryInterval, HttpConnTimeout, nil) 24 | 25 | client = client.SetParam("key", "simple_4").SetParam("hashKey", "123").SetParam("val", strconv.Itoa(val)) 26 | 27 | resp, err := client.Post(fmt.Sprintf("http://%s/msimpleraft/key", addr)) 28 | 29 | if err != nil { 30 | fmt.Println(addr, err) 31 | return 32 | } 33 | 34 | if resp.StatusCode != 200 { 35 | fmt.Println(resp.StatusCode) 36 | } 37 | } 38 | 39 | func doQuery(addr string) { 40 | client := httpkit.NewHttpClient(HttpReadWriteTimeout, HttpRetry, HttpRetryInterval, HttpConnTimeout, nil) 41 | 42 | client = client.SetParam("key", "simple_4").SetParam("hashKey", "123") 43 | 44 | resp, err := client.Get(fmt.Sprintf("http://%s/msimpleraft/key", addr)) 45 | 46 | if err != nil { 47 | fmt.Println(addr, err) 48 | return 49 | } 50 | 51 | if resp.StatusCode != 200 { 52 | fmt.Println(resp.StatusCode) 53 | return 54 | } 55 | 56 | fmt.Println(string(resp.Body)) 57 | } 58 | 59 | func main() { 60 | addrs := []string{"10.101.44.4:10100", "10.101.44.4:10101", "10.101.44.4:10102"} 61 | 62 | n := 10 63 | 64 | wg := &sync.WaitGroup{} 65 | wg.Add(n) 66 | 67 | for i := 0; i < n; i++ { 68 | go doHttp(addrs[i%3], i, wg) 69 | } 70 | 71 | wg.Wait() 72 | 73 | doQuery(addrs[0]) 74 | } 75 | -------------------------------------------------------------------------------- /experiment/store/kv.go: -------------------------------------------------------------------------------- 1 | package store 2 | 3 | import ( 4 | "encoding/binary" 5 | "fmt" 6 | "io" 7 | 8 | thrifter "github.com/thrift-iterator/go" 9 | ) 10 | 11 | const ( 12 | CommandRead = "read" 13 | CommandDelete = "delete" 14 | CommandUpsert = "upsert" 15 | ) 16 | 
17 | type Command struct { 18 | Cmd string `thrift:"Cmd,1"` 19 | HashKey uint64 `thrift:"HashKey,2"` 20 | Key string `thrift:"Key,3"` 21 | Val string `thrift:"Val,4"` 22 | } 23 | 24 | func NewCommand(cmd, key, val string, hashKey uint64) *Command { 25 | return &Command{ 26 | Cmd: cmd, 27 | HashKey: hashKey, 28 | Key: key, 29 | Val: val, 30 | } 31 | } 32 | 33 | func (cmd *Command) Unmarshal(b []byte) error { 34 | return thrifter.Unmarshal(b, cmd) 35 | } 36 | 37 | func (cmd *Command) Marshal() ([]byte, error) { 38 | return thrifter.Marshal(cmd) 39 | } 40 | 41 | type RaftAttribute struct { 42 | AttrID uint64 `thrift:"AttrID,1" db:"AttrID" json:"AttrID"` 43 | AttrName string `thrift:"AttrName,2" db:"AttrName" json:"AttrName"` 44 | Ages []int32 `thrift:"Ages,3" db:"Ages" json:"Ages"` 45 | Locations map[string]string `thrift:"Locations,4" db:"Locations" json:"Locations"` 46 | Timestamp int64 `thrift:"Timestamp,5" db:"Timestamp" json:"Timestamp"` 47 | } 48 | 49 | func (attr *RaftAttribute) Marshal() ([]byte, error) { 50 | if attr == nil { 51 | return []byte{}, nil 52 | } 53 | return thrifter.Marshal(attr) 54 | } 55 | 56 | func (attr *RaftAttribute) GenerateCommand(cmd string) (*Command, error) { 57 | b, err := attr.Marshal() 58 | if err != nil { 59 | return nil, err 60 | } 61 | 62 | return NewCommand(cmd, fmt.Sprintf("%d_%s", attr.AttrID, attr.AttrName), string(b), attr.AttrID), nil 63 | } 64 | 65 | func (attr *RaftAttribute) Unmarshal(b []byte) error { 66 | return thrifter.Unmarshal(b, attr) 67 | } 68 | 69 | func (attr *RaftAttribute) WriteTo(command string, w io.Writer) (int64, error) { 70 | dataSize := make([]byte, 8) 71 | 72 | b, err := attr.Marshal() 73 | if err != nil { 74 | return 0, err 75 | } 76 | 77 | l := len(b) + 4 + len(command) 78 | 79 | binary.LittleEndian.PutUint64(dataSize, uint64(l)) 80 | if _, err := w.Write(dataSize); err != nil { 81 | return 0, err 82 | } 83 | 84 | cmdSize := make([]byte, 4) 85 | binary.LittleEndian.PutUint32(cmdSize, 
uint32(len(command))) 86 | if _, err := w.Write(cmdSize); err != nil { 87 | return 0, err 88 | } 89 | 90 | if _, err := w.Write([]byte(command)); err != nil { 91 | return 0, err 92 | } 93 | 94 | if _, err := w.Write(b); err != nil { 95 | return 0, err 96 | } 97 | 98 | return int64(8 + l), nil 99 | } 100 | 101 | func (attr *RaftAttribute) WriteTo2(w io.Writer) (int64, error) { 102 | dataSize := make([]byte, 8) 103 | 104 | b, err := attr.Marshal() 105 | if err != nil { 106 | return 0, err 107 | } 108 | 109 | l := len(b) 110 | 111 | binary.LittleEndian.PutUint64(dataSize, uint64(l)) 112 | if _, err := w.Write(dataSize); err != nil { 113 | return 0, err 114 | } 115 | 116 | if _, err := w.Write(b); err != nil { 117 | return 0, err 118 | } 119 | 120 | return int64(8 + l), nil 121 | } 122 | 123 | type ReadArgument struct { 124 | Key string `thrift:"Key,1"` 125 | HashKey uint64 `thrift:"HashKey,2"` 126 | Sync bool `thrift:"Sync,3"` 127 | } 128 | 129 | func (arg *ReadArgument) Marshal() ([]byte, error) { 130 | return thrifter.Marshal(arg) 131 | } 132 | 133 | func (arg *ReadArgument) Unmarshal(b []byte) error { 134 | return thrifter.Unmarshal(b, arg) 135 | } 136 | 137 | func (arg *ReadArgument) WriteTo(command string, w io.Writer) (int64, error) { 138 | dataSize := make([]byte, 8) 139 | 140 | b, err := arg.Marshal() 141 | if err != nil { 142 | return 0, err 143 | } 144 | 145 | l := 4 + len(command) + len(b) 146 | 147 | binary.LittleEndian.PutUint64(dataSize, uint64(l)) 148 | if _, err := w.Write(dataSize); err != nil { 149 | return 0, err 150 | } 151 | 152 | cmdSize := make([]byte, 4) 153 | binary.LittleEndian.PutUint32(cmdSize, uint32(len(command))) 154 | if _, err := w.Write(cmdSize); err != nil { 155 | return 0, err 156 | } 157 | 158 | if _, err := w.Write([]byte(command)); err != nil { 159 | return 0, err 160 | } 161 | 162 | if _, err := w.Write(b); err != nil { 163 | return 0, err 164 | } 165 | 166 | return int64(8 + l), nil 167 | } 168 | 
-------------------------------------------------------------------------------- /experiment/store/kvstore.go: -------------------------------------------------------------------------------- 1 | package store 2 | 3 | import ( 4 | "os" 5 | "path/filepath" 6 | "strconv" 7 | "sync" 8 | 9 | "github.com/cockroachdb/pebble" 10 | "go.uber.org/atomic" 11 | ) 12 | 13 | type Store struct { 14 | mu sync.RWMutex 15 | db *pebble.DB 16 | ro *pebble.IterOptions 17 | wo *pebble.WriteOptions 18 | syncwo *pebble.WriteOptions 19 | closed *atomic.Bool 20 | } 21 | 22 | func NewStore(dbdir string) (*Store, error) { 23 | cfg := getDefaultPebbleDBConfig() 24 | 25 | db, err := openPebbleDB(cfg, dbdir) 26 | if err != nil { 27 | return nil, err 28 | } 29 | 30 | return &Store{ 31 | db: db, 32 | ro: &pebble.IterOptions{}, 33 | wo: &pebble.WriteOptions{Sync: false}, 34 | syncwo: &pebble.WriteOptions{Sync: true}, 35 | closed: atomic.NewBool(false), 36 | }, nil 37 | } 38 | 39 | func (db *Store) LookupAppliedIndex(key []byte) (uint64, error) { 40 | if db.closed.Load() { 41 | return 0, pebble.ErrClosed 42 | } 43 | 44 | db.mu.RLock() 45 | defer db.mu.RUnlock() 46 | 47 | val, closer, err := db.db.Get(key) 48 | if err != nil { 49 | return 0, err 50 | } 51 | 52 | // 这里需要copy 53 | data := make([]byte, len(val)) 54 | copy(data, val) 55 | 56 | if err := closer.Close(); err != nil { 57 | return 0, err 58 | } 59 | 60 | return strconv.ParseUint(string(data), 10, 64) 61 | } 62 | 63 | func (db *Store) Lookup(key []byte) (*RaftAttribute, error) { 64 | if db.closed.Load() { 65 | return nil, pebble.ErrClosed 66 | } 67 | 68 | db.mu.RLock() 69 | defer db.mu.RUnlock() 70 | 71 | val, closer, err := db.db.Get(key) 72 | if err != nil { 73 | return nil, err 74 | } 75 | 76 | // 这里需要copy 77 | data := make([]byte, len(val)) 78 | copy(data, val) 79 | 80 | if err := closer.Close(); err != nil { 81 | return nil, err 82 | } 83 | 84 | attr := &RaftAttribute{} 85 | err = attr.Unmarshal(data) 86 | 87 | return attr, err 88 | } 89 
| 90 | func (db *Store) NALookup(key []byte) ([]byte, error) { 91 | db.mu.RLock() 92 | defer db.mu.RUnlock() 93 | 94 | val, closer, err := db.db.Get(key) 95 | if err != nil { 96 | return nil, err 97 | } 98 | 99 | // 这里需要copy 100 | data := make([]byte, len(val)) 101 | copy(data, val) 102 | 103 | if err := closer.Close(); err != nil { 104 | return nil, err 105 | } 106 | 107 | return data, err 108 | } 109 | 110 | func (db *Store) Batch() *pebble.Batch { 111 | return db.db.NewBatch() 112 | } 113 | 114 | func (db *Store) GetWo() *pebble.WriteOptions { 115 | return db.wo 116 | } 117 | 118 | func (db *Store) GetRo() *pebble.IterOptions { 119 | return db.ro 120 | } 121 | 122 | func (db *Store) Write(b *pebble.Batch) error { 123 | return b.Commit(db.wo) 124 | } 125 | 126 | func (db *Store) SetKv(key, val []byte) { 127 | db.db.Set(key, val, db.wo) 128 | } 129 | 130 | func (db *Store) Flush() { 131 | db.db.Flush() 132 | } 133 | 134 | func (db *Store) NewSnapshot() *pebble.Snapshot { 135 | db.mu.RLock() 136 | defer db.mu.RUnlock() 137 | 138 | return db.db.NewSnapshot() 139 | } 140 | 141 | func (db *Store) ReleaseSnapshot(snap *pebble.Snapshot) { 142 | snap.Close() 143 | } 144 | 145 | func (db *Store) GetIterator() *pebble.Iterator { 146 | return db.db.NewIter(db.ro) 147 | } 148 | 149 | func (db *Store) Close() error { 150 | if db == nil { 151 | return nil 152 | } 153 | 154 | db.mu.Lock() 155 | defer db.mu.Unlock() 156 | 157 | db.closed.Store(true) // set pebbledb closed 158 | 159 | if db.db != nil { 160 | db.db.Flush() 161 | db.db.Close() 162 | db.db = nil 163 | } 164 | 165 | return nil 166 | } 167 | 168 | type PebbleDBConfig struct { 169 | KVLRUCacheSize int64 170 | KVWriteBufferSize int 171 | KVMaxWriteBufferNumber int 172 | KVLevel0FileNumCompactionTrigger int 173 | KVLevel0StopWritesTrigger int 174 | KVMaxBytesForLevelBase int64 175 | KVTargetFileSizeBase int64 176 | KVTargetFileSizeMultiplier int64 177 | KVNumOfLevels int 178 | KVMaxOpenFiles int 179 | 
KVMaxConcurrentCompactions int 180 | KVBlockSize int 181 | KVMaxManifestFileSize int64 182 | KVBytesPerSync int 183 | KVWALBytesPerSync int 184 | } 185 | 186 | func getDefaultPebbleDBConfig() PebbleDBConfig { 187 | return PebbleDBConfig{ 188 | KVLRUCacheSize: 128 * 1024 * 1024, // 128MB 189 | KVWriteBufferSize: 32 * 1024 * 1024, // 32MB 190 | KVMaxWriteBufferNumber: 4, 191 | KVLevel0FileNumCompactionTrigger: 1, 192 | KVLevel0StopWritesTrigger: 24, 193 | KVMaxBytesForLevelBase: 512 * 1024 * 1024, // 512MB 194 | KVTargetFileSizeBase: 128 * 1024 * 1024, // 128MB 195 | KVTargetFileSizeMultiplier: 1, 196 | KVNumOfLevels: 7, 197 | KVMaxOpenFiles: 102400, 198 | KVMaxConcurrentCompactions: 8, 199 | KVBlockSize: 64 * 1024, // 64KB 200 | KVMaxManifestFileSize: 128 * 1024 * 1024, // 128MB 201 | KVBytesPerSync: 2 * 1024 * 1024, // 2MB 202 | KVWALBytesPerSync: 2 * 1024 * 1024, // 2MB 203 | } 204 | } 205 | 206 | func openPebbleDB(config PebbleDBConfig, dir string) (*pebble.DB, error) { 207 | blockSize := config.KVBlockSize 208 | levelSizeMultiplier := config.KVTargetFileSizeMultiplier 209 | sz := config.KVTargetFileSizeBase 210 | lopts := make([]pebble.LevelOptions, 0) 211 | 212 | for l := 0; l < config.KVNumOfLevels; l++ { 213 | opt := pebble.LevelOptions{ 214 | Compression: pebble.DefaultCompression, 215 | BlockSize: blockSize, 216 | TargetFileSize: sz, 217 | } 218 | sz = sz * levelSizeMultiplier 219 | lopts = append(lopts, opt) 220 | } 221 | 222 | dataPath := filepath.Join(dir, "data") 223 | if err := os.MkdirAll(dataPath, os.ModePerm); err != nil { 224 | return nil, err 225 | } 226 | 227 | walPath := filepath.Join(dir, "wal") 228 | if err := os.MkdirAll(walPath, os.ModePerm); err != nil { 229 | return nil, err 230 | } 231 | 232 | cache := pebble.NewCache(config.KVLRUCacheSize) 233 | opts := &pebble.Options{ 234 | BytesPerSync: config.KVBytesPerSync, 235 | Levels: lopts, 236 | MaxManifestFileSize: config.KVMaxManifestFileSize, 237 | MemTableSize: config.KVWriteBufferSize, 238 
| MemTableStopWritesThreshold: config.KVMaxWriteBufferNumber, 239 | LBaseMaxBytes: config.KVMaxBytesForLevelBase, 240 | L0CompactionThreshold: config.KVLevel0FileNumCompactionTrigger, 241 | L0StopWritesThreshold: config.KVLevel0StopWritesTrigger, 242 | Cache: cache, 243 | WALDir: walPath, 244 | MaxOpenFiles: config.KVMaxOpenFiles, 245 | MaxConcurrentCompactions: config.KVMaxConcurrentCompactions, 246 | WALBytesPerSync: config.KVWALBytesPerSync, 247 | } 248 | 249 | db, err := pebble.Open(dataPath, opts) 250 | if err != nil { 251 | return nil, err 252 | } 253 | cache.Unref() 254 | 255 | return db, nil 256 | } 257 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/xkeyideal/mraft 2 | 3 | go 1.21.0 4 | 5 | require ( 6 | github.com/cockroachdb/pebble v0.0.0-20210331181633-27fc006b8bfb 7 | github.com/gin-gonic/gin v1.9.1 8 | github.com/lni/dragonboat/v3 v3.3.7 9 | github.com/rcrowley/go-metrics v0.0.0-20201227073835-cf1acfcdf475 10 | github.com/spf13/cobra v1.7.0 11 | github.com/thrift-iterator/go v0.0.0-20190402154806-9b5a67519118 12 | github.com/ugorji/go/codec v1.2.11 13 | github.com/xkeyideal/gokit v1.4.2 14 | go.uber.org/atomic v1.11.0 15 | go.uber.org/zap v1.25.0 16 | gopkg.in/natefinch/lumberjack.v2 v2.2.1 17 | ) 18 | 19 | require ( 20 | github.com/VictoriaMetrics/metrics v1.6.2 // indirect 21 | github.com/armon/go-metrics v0.0.0-20180917152333-f0300d1749da // indirect 22 | github.com/bytedance/sonic v1.9.1 // indirect 23 | github.com/chenzhuoyu/base64x v0.0.0-20221115062448-fe3a3abad311 // indirect 24 | github.com/cockroachdb/errors v1.8.1 // indirect 25 | github.com/cockroachdb/logtags v0.0.0-20190617123548-eb05cc24525f // indirect 26 | github.com/cockroachdb/redact v1.0.8 // indirect 27 | github.com/cockroachdb/sentry-go v0.6.1-cockroachdb.2 // indirect 28 | github.com/gabriel-vasile/mimetype v1.4.2 // indirect 29 | 
github.com/gin-contrib/sse v0.1.0 // indirect 30 | github.com/go-playground/locales v0.14.1 // indirect 31 | github.com/go-playground/universal-translator v0.18.1 // indirect 32 | github.com/go-playground/validator/v10 v10.14.0 // indirect 33 | github.com/goccy/go-json v0.10.2 // indirect 34 | github.com/gogo/protobuf v1.3.2 // indirect 35 | github.com/golang/protobuf v1.5.2 // indirect 36 | github.com/golang/snappy v0.0.4 // indirect 37 | github.com/google/btree v1.0.0 // indirect 38 | github.com/hashicorp/errwrap v1.0.0 // indirect 39 | github.com/hashicorp/go-immutable-radix v1.0.0 // indirect 40 | github.com/hashicorp/go-msgpack v0.5.3 // indirect 41 | github.com/hashicorp/go-multierror v1.0.0 // indirect 42 | github.com/hashicorp/go-sockaddr v1.0.0 // indirect 43 | github.com/hashicorp/golang-lru v0.5.1 // indirect 44 | github.com/hashicorp/memberlist v0.2.2 // indirect 45 | github.com/inconshreveable/mousetrap v1.1.0 // indirect 46 | github.com/json-iterator/go v1.1.12 // indirect 47 | github.com/juju/ratelimit v1.0.2-0.20191002062651-f60b32039441 // indirect 48 | github.com/klauspost/cpuid/v2 v2.2.4 // indirect 49 | github.com/kr/pretty v0.2.1 // indirect 50 | github.com/kr/text v0.2.0 // indirect 51 | github.com/leodido/go-urn v1.2.4 // indirect 52 | github.com/lni/goutils v1.3.0 // indirect 53 | github.com/mattn/go-isatty v0.0.19 // indirect 54 | github.com/miekg/dns v1.1.26 // indirect 55 | github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect 56 | github.com/modern-go/reflect2 v1.0.2 // indirect 57 | github.com/pelletier/go-toml/v2 v2.0.8 // indirect 58 | github.com/pkg/errors v0.9.1 // indirect 59 | github.com/sean-/seed v0.0.0-20170313163322-e2103e2c3529 // indirect 60 | github.com/spf13/pflag v1.0.5 // indirect 61 | github.com/twitchyliquid64/golang-asm v0.15.1 // indirect 62 | github.com/v2pro/plz v0.0.0-20221028024117-e5f9aec5b631 // indirect 63 | github.com/v2pro/quokka v0.0.0-20171201153428-382cb39c6ee6 // indirect 64 | 
github.com/v2pro/wombat v0.0.0-20180402055224-a56dbdcddef2 // indirect 65 | github.com/valyala/fastrand v1.0.0 // indirect 66 | github.com/valyala/histogram v1.0.1 // indirect 67 | go.uber.org/multierr v1.10.0 // indirect 68 | golang.org/x/arch v0.3.0 // indirect 69 | golang.org/x/crypto v0.9.0 // indirect 70 | golang.org/x/exp v0.0.0-20200513190911-00229845015e // indirect 71 | golang.org/x/net v0.10.0 // indirect 72 | golang.org/x/sync v0.0.0-20201207232520-09787c993a3a // indirect 73 | golang.org/x/sys v0.8.0 // indirect 74 | golang.org/x/text v0.9.0 // indirect 75 | google.golang.org/protobuf v1.30.0 // indirect 76 | gopkg.in/yaml.v3 v3.0.1 // indirect 77 | ) 78 | -------------------------------------------------------------------------------- /gossip/config.go: -------------------------------------------------------------------------------- 1 | package gossip 2 | 3 | import ( 4 | "errors" 5 | "net" 6 | "strconv" 7 | 8 | "github.com/lni/goutils/stringutil" 9 | "go.uber.org/zap/zapcore" 10 | ) 11 | 12 | type GossipOptions struct { 13 | Name string 14 | MoveToGrpcAddr string 15 | RubikGrpcAddr string 16 | GossipNodes int 17 | LogDir string 18 | LogLevel zapcore.Level 19 | DisableCoordinates bool 20 | } 21 | 22 | type GossipConfig struct { 23 | // GossipProbeInterval define the probe interval used by the gossip 24 | // service in tests. 25 | // GossipProbeInterval time.Duration `json:"gossipProbeInterval"` 26 | // BindAddress is the address for the gossip service to bind to and listen on. 27 | // Both UDP and TCP ports are used by the gossip service. The local gossip 28 | // service should be able to receive gossip service related messages by 29 | // binding to and listening on this address. BindAddress is usually in the 30 | // format of IP:Port, Hostname:Port or DNS Name:Port. 31 | BindAddress string `json:"-"` 32 | BindPort uint16 `json:"bindPort"` 33 | // AdvertiseAddress is the address to advertise to other NodeHost instances 34 | // used for NAT traversal. 
Gossip services running on remote NodeHost 35 | // instances will use AdvertiseAddress to exchange gossip service related 36 | // messages. AdvertiseAddress is in the format of IP:Port. 37 | // AdvertiseAddress string 38 | // Seed is a list of AdvertiseAddress of remote NodeHost instances. Local 39 | // NodeHost instance will try to contact all of them to bootstrap the gossip 40 | // service. At least one reachable NodeHost instance is required to 41 | // successfully bootstrap the gossip service. Each seed address is in the 42 | // format of IP:Port, Hostname:Port or DNS Name:Port. 43 | // 44 | // It is ok to include seed addresses that are temporarily unreachable, e.g. 45 | // when launching the first NodeHost instance in your deployment, you can 46 | // include AdvertiseAddresses from other NodeHost instances that you plan to 47 | // launch shortly afterwards. 48 | Seeds []string `json:"seeds"` 49 | 50 | // 用于当cluster的数据在gossip内更新后,存储到文件中的接口 51 | clusterCallback ClusterCallback `json:"-"` 52 | } 53 | 54 | // IsEmpty returns a boolean flag indicating whether the GossipConfig instance 55 | // is empty. 56 | func (g *GossipConfig) IsEmpty() bool { 57 | return len(g.BindAddress) == 0 && len(g.Seeds) == 0 58 | } 59 | 60 | func (g *GossipConfig) SetClusterCallback(fn ClusterCallback) { 61 | g.clusterCallback = fn 62 | } 63 | 64 | // Validate validates the GossipConfig instance. 
65 | func (g *GossipConfig) Validate() error { 66 | if len(g.BindAddress) > 0 && !stringutil.IsValidAddress(g.BindAddress) { 67 | return errors.New("invalid GossipConfig.BindAddress") 68 | } else if len(g.BindAddress) == 0 { 69 | return errors.New("BindAddress not set") 70 | } 71 | 72 | if g.clusterCallback == nil { 73 | return errors.New("clusterCallback not set") 74 | } 75 | 76 | // if len(g.AdvertiseAddress) > 0 && !isValidAdvertiseAddress(g.AdvertiseAddress) { 77 | // return errors.New("invalid GossipConfig.AdvertiseAddress") 78 | // } 79 | if len(g.Seeds) == 0 { 80 | return errors.New("seed nodes not set") 81 | } 82 | count := 0 83 | for _, v := range g.Seeds { 84 | if v != g.BindAddress /*&& v != g.AdvertiseAddress*/ { 85 | count++ 86 | } 87 | if !stringutil.IsValidAddress(v) { 88 | return errors.New("invalid GossipConfig.Seed value") 89 | } 90 | } 91 | if count == 0 { 92 | return errors.New("no valid seed node") 93 | } 94 | return nil 95 | } 96 | 97 | func isValidAdvertiseAddress(addr string) bool { 98 | host, sp, err := net.SplitHostPort(addr) 99 | if err != nil { 100 | return false 101 | } 102 | port, err := strconv.ParseUint(sp, 10, 16) 103 | if err != nil { 104 | return false 105 | } 106 | if port > 65535 { 107 | return false 108 | } 109 | // the memberlist package doesn't allow hostname or DNS name to be used in 110 | // advertise address 111 | return stringutil.IPV4Regex.MatchString(host) 112 | } 113 | 114 | func parseAddress(addr string) (string, int, error) { 115 | host, sp, err := net.SplitHostPort(addr) 116 | if err != nil { 117 | return "", 0, err 118 | } 119 | port, err := strconv.ParseUint(sp, 10, 16) 120 | if err != nil { 121 | return "", 0, err 122 | } 123 | return host, int(port), nil 124 | } 125 | -------------------------------------------------------------------------------- /gossip/coordinate/client.go: -------------------------------------------------------------------------------- 1 | package coordinate 2 | 3 | import ( 4 | "fmt" 5 | 
"math" 6 | "sort" 7 | "sync" 8 | "time" 9 | ) 10 | 11 | // Client manages the estimated network coordinate for a given node, and adjusts 12 | // it as the node observes round trip times and estimated coordinates from other 13 | // nodes. The core algorithm is based on Vivaldi, see the documentation for Config 14 | // for more details. 15 | type Client struct { 16 | // coord is the current estimate of the client's network coordinate. 17 | coord *Coordinate 18 | 19 | // origin is a coordinate sitting at the origin. 20 | origin *Coordinate 21 | 22 | // config contains the tuning parameters that govern the performance of 23 | // the algorithm. 24 | config *Config 25 | 26 | // adjustmentIndex is the current index into the adjustmentSamples slice. 27 | adjustmentIndex uint 28 | 29 | // adjustment is used to store samples for the adjustment calculation. 30 | adjustmentSamples []float64 31 | 32 | // latencyFilterSamples is used to store the last several RTT samples, 33 | // keyed by node name. We will use the config's LatencyFilterSamples 34 | // value to determine how many samples we keep, per node. 35 | latencyFilterSamples map[string][]float64 36 | 37 | // stats is used to record events that occur when updating coordinates. 38 | stats ClientStats 39 | 40 | // mutex enables safe concurrent access to the client. 41 | mutex sync.RWMutex 42 | } 43 | 44 | // ClientStats is used to record events that occur when updating coordinates. 45 | type ClientStats struct { 46 | // Resets is incremented any time we reset our local coordinate because 47 | // our calculations have resulted in an invalid state. 48 | Resets int 49 | } 50 | 51 | // NewClient creates a new Client and verifies the configuration is valid. 
52 | func NewClient(config *Config) (*Client, error) { 53 | if !(config.Dimensionality > 0) { 54 | return nil, fmt.Errorf("dimensionality must be >0") 55 | } 56 | 57 | return &Client{ 58 | coord: NewCoordinate(config), 59 | origin: NewCoordinate(config), 60 | config: config, 61 | adjustmentIndex: 0, 62 | adjustmentSamples: make([]float64, config.AdjustmentWindowSize), 63 | latencyFilterSamples: make(map[string][]float64), 64 | }, nil 65 | } 66 | 67 | // GetCoordinate returns a copy of the coordinate for this client. 68 | func (c *Client) GetCoordinate() *Coordinate { 69 | c.mutex.RLock() 70 | defer c.mutex.RUnlock() 71 | 72 | return c.coord.Clone() 73 | } 74 | 75 | // SetCoordinate forces the client's coordinate to a known state. 76 | func (c *Client) SetCoordinate(coord *Coordinate) error { 77 | c.mutex.Lock() 78 | defer c.mutex.Unlock() 79 | 80 | if err := c.checkCoordinate(coord); err != nil { 81 | return err 82 | } 83 | 84 | c.coord = coord.Clone() 85 | return nil 86 | } 87 | 88 | // ForgetNode removes any client state for the given node. 89 | func (c *Client) ForgetNode(node string) { 90 | c.mutex.Lock() 91 | defer c.mutex.Unlock() 92 | 93 | delete(c.latencyFilterSamples, node) 94 | } 95 | 96 | // Stats returns a copy of stats for the client. 97 | func (c *Client) Stats() ClientStats { 98 | c.mutex.Lock() 99 | defer c.mutex.Unlock() 100 | 101 | return c.stats 102 | } 103 | 104 | // checkCoordinate returns an error if the coordinate isn't compatible with 105 | // this client, or if the coordinate itself isn't valid. This assumes the mutex 106 | // has been locked already. 
107 | func (c *Client) checkCoordinate(coord *Coordinate) error { 108 | if !c.coord.IsCompatibleWith(coord) { 109 | return fmt.Errorf("dimensions aren't compatible") 110 | } 111 | 112 | if !coord.IsValid() { 113 | return fmt.Errorf("coordinate is invalid") 114 | } 115 | 116 | return nil 117 | } 118 | 119 | // latencyFilter applies a simple moving median filter with a new sample for 120 | // a node. This assumes that the mutex has been locked already. 121 | func (c *Client) latencyFilter(node string, rttSeconds float64) float64 { 122 | samples, ok := c.latencyFilterSamples[node] 123 | if !ok { 124 | samples = make([]float64, 0, c.config.LatencyFilterSize) 125 | } 126 | 127 | // Add the new sample and trim the list, if needed. 128 | samples = append(samples, rttSeconds) 129 | if len(samples) > int(c.config.LatencyFilterSize) { 130 | samples = samples[1:] 131 | } 132 | c.latencyFilterSamples[node] = samples 133 | 134 | // Sort a copy of the samples and return the median. 135 | sorted := make([]float64, len(samples)) 136 | copy(sorted, samples) 137 | sort.Float64s(sorted) 138 | return sorted[len(sorted)/2] 139 | } 140 | 141 | // updateVivialdi updates the Vivaldi portion of the client's coordinate. This 142 | // assumes that the mutex has been locked already. 
143 | func (c *Client) updateVivaldi(other *Coordinate, rttSeconds float64) { 144 | const zeroThreshold = 1.0e-6 145 | 146 | dist := c.coord.DistanceTo(other).Seconds() 147 | if rttSeconds < zeroThreshold { 148 | rttSeconds = zeroThreshold 149 | } 150 | wrongness := math.Abs(dist-rttSeconds) / rttSeconds 151 | 152 | totalError := c.coord.Error + other.Error 153 | if totalError < zeroThreshold { 154 | totalError = zeroThreshold 155 | } 156 | weight := c.coord.Error / totalError 157 | 158 | c.coord.Error = c.config.VivaldiCE*weight*wrongness + c.coord.Error*(1.0-c.config.VivaldiCE*weight) 159 | if c.coord.Error > c.config.VivaldiErrorMax { 160 | c.coord.Error = c.config.VivaldiErrorMax 161 | } 162 | 163 | delta := c.config.VivaldiCC * weight 164 | force := delta * (rttSeconds - dist) 165 | c.coord = c.coord.ApplyForce(c.config, force, other) 166 | } 167 | 168 | // updateAdjustment updates the adjustment portion of the client's coordinate, if 169 | // the feature is enabled. This assumes that the mutex has been locked already. 170 | func (c *Client) updateAdjustment(other *Coordinate, rttSeconds float64) { 171 | if c.config.AdjustmentWindowSize == 0 { 172 | return 173 | } 174 | 175 | // Note that the existing adjustment factors don't figure in to this 176 | // calculation so we use the raw distance here. 177 | dist := c.coord.rawDistanceTo(other) 178 | c.adjustmentSamples[c.adjustmentIndex] = rttSeconds - dist 179 | c.adjustmentIndex = (c.adjustmentIndex + 1) % c.config.AdjustmentWindowSize 180 | 181 | sum := 0.0 182 | for _, sample := range c.adjustmentSamples { 183 | sum += sample 184 | } 185 | c.coord.Adjustment = sum / (2.0 * float64(c.config.AdjustmentWindowSize)) 186 | } 187 | 188 | // updateGravity applies a small amount of gravity to pull coordinates towards 189 | // the center of the coordinate system to combat drift. This assumes that the 190 | // mutex is locked already. 
191 | func (c *Client) updateGravity() { 192 | dist := c.origin.DistanceTo(c.coord).Seconds() 193 | force := -1.0 * math.Pow(dist/c.config.GravityRho, 2.0) 194 | c.coord = c.coord.ApplyForce(c.config, force, c.origin) 195 | } 196 | 197 | // Update takes other, a coordinate for another node, and rtt, a round trip 198 | // time observation for a ping to that node, and updates the estimated position of 199 | // the client's coordinate. Returns the updated coordinate. 200 | func (c *Client) Update(node string, other *Coordinate, rtt time.Duration) (*Coordinate, error) { 201 | c.mutex.Lock() 202 | defer c.mutex.Unlock() 203 | 204 | if err := c.checkCoordinate(other); err != nil { 205 | return nil, err 206 | } 207 | 208 | // The code down below can handle zero RTTs, which we have seen in 209 | // https://github.com/hashicorp/consul/issues/3789, presumably in 210 | // environments with coarse-grained monotonic clocks (we are still 211 | // trying to pin this down). In any event, this is ok from a code PoV 212 | // so we don't need to alert operators with spammy messages. We did 213 | // add a counter so this is still observable, though. 214 | const maxRTT = 10 * time.Second 215 | if rtt < 0 || rtt > maxRTT { 216 | return nil, fmt.Errorf("round trip time not in valid range, duration %v is not a positive value less than %v ", rtt, maxRTT) 217 | } 218 | if rtt == 0 { 219 | //metrics.IncrCounter([]string{"serf", "coordinate", "zero-rtt"}, 1) 220 | } 221 | 222 | rttSeconds := c.latencyFilter(node, rtt.Seconds()) 223 | c.updateVivaldi(other, rttSeconds) 224 | c.updateAdjustment(other, rttSeconds) 225 | c.updateGravity() 226 | if !c.coord.IsValid() { 227 | c.stats.Resets++ 228 | c.coord = NewCoordinate(c.config) 229 | } 230 | 231 | return c.coord.Clone(), nil 232 | } 233 | 234 | // DistanceTo returns the estimated RTT from the client's coordinate to other, the 235 | // coordinate for another node. 
236 | func (c *Client) DistanceTo(other *Coordinate) time.Duration { 237 | c.mutex.RLock() 238 | defer c.mutex.RUnlock() 239 | 240 | return c.coord.DistanceTo(other) 241 | } 242 | -------------------------------------------------------------------------------- /gossip/coordinate/client_test.go: -------------------------------------------------------------------------------- 1 | package coordinate 2 | 3 | import ( 4 | "fmt" 5 | "math" 6 | "reflect" 7 | "strings" 8 | "testing" 9 | "time" 10 | ) 11 | 12 | func TestClient_NewClient(t *testing.T) { 13 | config := DefaultConfig() 14 | 15 | config.Dimensionality = 0 16 | client, err := NewClient(config) 17 | if err == nil || !strings.Contains(err.Error(), "dimensionality") { 18 | t.Fatal(err) 19 | } 20 | 21 | config.Dimensionality = 7 22 | client, err = NewClient(config) 23 | if err != nil { 24 | t.Fatal(err) 25 | } 26 | 27 | origin := NewCoordinate(config) 28 | if !reflect.DeepEqual(client.GetCoordinate(), origin) { 29 | t.Fatalf("fresh client should be located at the origin") 30 | } 31 | } 32 | 33 | func TestClient_Update(t *testing.T) { 34 | config := DefaultConfig() 35 | config.Dimensionality = 3 36 | 37 | client, err := NewClient(config) 38 | if err != nil { 39 | t.Fatal(err) 40 | } 41 | 42 | // Make sure the Euclidean part of our coordinate is what we expect. 43 | c := client.GetCoordinate() 44 | verifyEqualVectors(t, c.Vec, []float64{0.0, 0.0, 0.0}) 45 | 46 | // Place a node right above the client and observe an RTT longer than the 47 | // client expects, given its distance. 48 | other := NewCoordinate(config) 49 | other.Vec[2] = 0.001 50 | rtt := time.Duration(2.0 * other.Vec[2] * secondsToNanoseconds) 51 | c, err = client.Update("node", other, rtt) 52 | if err != nil { 53 | t.Fatalf("err: %v", err) 54 | } 55 | 56 | // The client should have scooted down to get away from it. 
57 | if !(c.Vec[2] < 0.0) { 58 | t.Fatalf("client z coordinate %9.6f should be < 0.0", c.Vec[2]) 59 | } 60 | 61 | // Set the coordinate to a known state. 62 | c.Vec[2] = 99.0 63 | client.SetCoordinate(c) 64 | c = client.GetCoordinate() 65 | verifyEqualFloats(t, c.Vec[2], 99.0) 66 | } 67 | 68 | func TestClient_InvalidInPingValues(t *testing.T) { 69 | config := DefaultConfig() 70 | config.Dimensionality = 3 71 | 72 | client, err := NewClient(config) 73 | if err != nil { 74 | t.Fatal(err) 75 | } 76 | 77 | // Place another node 78 | other := NewCoordinate(config) 79 | other.Vec[2] = 0.001 80 | dist := client.DistanceTo(other) 81 | 82 | // Update with a series of invalid ping periods, should return an error and estimated rtt remains unchanged 83 | pings := []int{1<<63 - 1, -35, 11} 84 | 85 | for _, ping := range pings { 86 | expectedErr := fmt.Errorf("round trip time not in valid range, duration %v is not a positive value less than %v", ping, 10*time.Second) 87 | _, err = client.Update("node", other, time.Duration(ping*secondsToNanoseconds)) 88 | if err == nil { 89 | t.Fatalf("Unexpected error, wanted %v but got %v", expectedErr, err) 90 | } 91 | 92 | dist_new := client.DistanceTo(other) 93 | if dist_new != dist { 94 | t.Fatalf("distance estimate %v not equal to %v", dist_new, dist) 95 | } 96 | } 97 | 98 | } 99 | 100 | func TestClient_DistanceTo(t *testing.T) { 101 | config := DefaultConfig() 102 | config.Dimensionality = 3 103 | config.HeightMin = 0 104 | 105 | client, err := NewClient(config) 106 | if err != nil { 107 | t.Fatal(err) 108 | } 109 | 110 | // Fiddle a raw coordinate to put it a specific number of seconds away. 
111 | other := NewCoordinate(config) 112 | other.Vec[2] = 12.345 113 | expected := time.Duration(other.Vec[2] * secondsToNanoseconds) 114 | dist := client.DistanceTo(other) 115 | if dist != expected { 116 | t.Fatalf("distance doesn't match %9.6f != %9.6f", dist.Seconds(), expected.Seconds()) 117 | } 118 | } 119 | 120 | func TestClient_latencyFilter(t *testing.T) { 121 | config := DefaultConfig() 122 | config.LatencyFilterSize = 3 123 | 124 | client, err := NewClient(config) 125 | if err != nil { 126 | t.Fatal(err) 127 | } 128 | 129 | // Make sure we get the median, and that things age properly. 130 | verifyEqualFloats(t, client.latencyFilter("alice", 0.201), 0.201) 131 | verifyEqualFloats(t, client.latencyFilter("alice", 0.200), 0.201) 132 | verifyEqualFloats(t, client.latencyFilter("alice", 0.207), 0.201) 133 | 134 | // This glitch will get median-ed out and never seen by Vivaldi. 135 | verifyEqualFloats(t, client.latencyFilter("alice", 1.9), 0.207) 136 | verifyEqualFloats(t, client.latencyFilter("alice", 0.203), 0.207) 137 | verifyEqualFloats(t, client.latencyFilter("alice", 0.199), 0.203) 138 | verifyEqualFloats(t, client.latencyFilter("alice", 0.211), 0.203) 139 | 140 | // Make sure different nodes are not coupled. 141 | verifyEqualFloats(t, client.latencyFilter("bob", 0.310), 0.310) 142 | 143 | // Make sure we don't leak coordinates for nodes that leave. 144 | client.ForgetNode("alice") 145 | verifyEqualFloats(t, client.latencyFilter("alice", 0.888), 0.888) 146 | } 147 | 148 | func TestClient_NaN_Defense(t *testing.T) { 149 | config := DefaultConfig() 150 | config.Dimensionality = 3 151 | 152 | client, err := NewClient(config) 153 | if err != nil { 154 | t.Fatal(err) 155 | } 156 | 157 | // Block a bad coordinate from coming in. 
158 | other := NewCoordinate(config) 159 | other.Vec[0] = math.NaN() 160 | if other.IsValid() { 161 | t.Fatalf("bad: %#v", *other) 162 | } 163 | rtt := 250 * time.Millisecond 164 | c, err := client.Update("node", other, rtt) 165 | if err == nil || !strings.Contains(err.Error(), "coordinate is invalid") { 166 | t.Fatalf("err: %v", err) 167 | } 168 | if c := client.GetCoordinate(); !c.IsValid() { 169 | t.Fatalf("bad: %#v", *c) 170 | } 171 | 172 | // Block setting an invalid coordinate directly. 173 | err = client.SetCoordinate(other) 174 | if err == nil || !strings.Contains(err.Error(), "coordinate is invalid") { 175 | t.Fatalf("err: %v", err) 176 | } 177 | if c := client.GetCoordinate(); !c.IsValid() { 178 | t.Fatalf("bad: %#v", *c) 179 | } 180 | 181 | // Block an incompatible coordinate. 182 | other.Vec = make([]float64, 2*len(other.Vec)) 183 | c, err = client.Update("node", other, rtt) 184 | if err == nil || !strings.Contains(err.Error(), "dimensions aren't compatible") { 185 | t.Fatalf("err: %v", err) 186 | } 187 | if c := client.GetCoordinate(); !c.IsValid() { 188 | t.Fatalf("bad: %#v", *c) 189 | } 190 | 191 | // Block setting an incompatible coordinate directly. 192 | err = client.SetCoordinate(other) 193 | if err == nil || !strings.Contains(err.Error(), "dimensions aren't compatible") { 194 | t.Fatalf("err: %v", err) 195 | } 196 | if c := client.GetCoordinate(); !c.IsValid() { 197 | t.Fatalf("bad: %#v", *c) 198 | } 199 | 200 | // Poison the internal state and make sure we reset on an update. 
201 | client.coord.Vec[0] = math.NaN() 202 | other = NewCoordinate(config) 203 | c, err = client.Update("node", other, rtt) 204 | if err != nil { 205 | t.Fatalf("err: %v", err) 206 | } 207 | if !c.IsValid() { 208 | t.Fatalf("bad: %#v", *c) 209 | } 210 | if got, want := client.Stats().Resets, 1; got != want { 211 | t.Fatalf("got %d want %d", got, want) 212 | } 213 | } 214 | -------------------------------------------------------------------------------- /gossip/coordinate/config.go: -------------------------------------------------------------------------------- 1 | package coordinate 2 | 3 | // Config is used to set the parameters of the Vivaldi-based coordinate mapping 4 | // algorithm. 5 | // 6 | // The following references are called out at various points in the documentation 7 | // here: 8 | // 9 | // [1] Dabek, Frank, et al. "Vivaldi: A decentralized network coordinate system." 10 | // ACM SIGCOMM Computer Communication Review. Vol. 34. No. 4. ACM, 2004. 11 | // [2] Ledlie, Jonathan, Paul Gardner, and Margo I. Seltzer. "Network Coordinates 12 | // in the Wild." NSDI. Vol. 7. 2007. 13 | // [3] Lee, Sanghwan, et al. "On suitability of Euclidean embedding for 14 | // host-based network coordinate systems." Networking, IEEE/ACM Transactions 15 | // on 18.1 (2010): 27-40. 16 | type Config struct { 17 | // The dimensionality of the coordinate system. As discussed in [2], more 18 | // dimensions improves the accuracy of the estimates up to a point. Per [2] 19 | // we chose 8 dimensions plus a non-Euclidean height. 20 | Dimensionality uint 21 | 22 | // VivaldiErrorMax is the default error value when a node hasn't yet made 23 | // any observations. It also serves as an upper limit on the error value in 24 | // case observations cause the error value to increase without bound. 25 | VivaldiErrorMax float64 26 | 27 | // VivaldiCE is a tuning factor that controls the maximum impact an 28 | // observation can have on a node's confidence. See [1] for more details. 
29 | VivaldiCE float64 30 | 31 | // VivaldiCC is a tuning factor that controls the maximum impact an 32 | // observation can have on a node's coordinate. See [1] for more details. 33 | VivaldiCC float64 34 | 35 | // AdjustmentWindowSize is a tuning factor that determines how many samples 36 | // we retain to calculate the adjustment factor as discussed in [3]. Setting 37 | // this to zero disables this feature. 38 | AdjustmentWindowSize uint 39 | 40 | // HeightMin is the minimum value of the height parameter. Since this 41 | // always must be positive, it will introduce a small amount error, so 42 | // the chosen value should be relatively small compared to "normal" 43 | // coordinates. 44 | HeightMin float64 45 | 46 | // LatencyFilterSamples is the maximum number of samples that are retained 47 | // per node, in order to compute a median. The intent is to ride out blips 48 | // but still keep the delay low, since our time to probe any given node is 49 | // pretty infrequent. See [2] for more details. 50 | LatencyFilterSize uint 51 | 52 | // GravityRho is a tuning factor that sets how much gravity has an effect 53 | // to try to re-center coordinates. See [2] for more details. 54 | GravityRho float64 55 | } 56 | 57 | // DefaultConfig returns a Config that has some default values suitable for 58 | // basic testing of the algorithm, but not tuned to any particular type of cluster. 
59 | func DefaultConfig() *Config { 60 | return &Config{ 61 | Dimensionality: 8, 62 | VivaldiErrorMax: 1.5, 63 | VivaldiCE: 0.25, 64 | VivaldiCC: 0.25, 65 | AdjustmentWindowSize: 20, 66 | HeightMin: 10.0e-6, 67 | LatencyFilterSize: 3, 68 | GravityRho: 150.0, 69 | } 70 | } 71 | -------------------------------------------------------------------------------- /gossip/coordinate/coordinate.go: -------------------------------------------------------------------------------- 1 | package coordinate 2 | 3 | import ( 4 | "math" 5 | "math/rand" 6 | "time" 7 | ) 8 | 9 | // Coordinate is a specialized structure for holding network coordinates for the 10 | // Vivaldi-based coordinate mapping algorithm. All of the fields should be public 11 | // to enable this to be serialized. All values in here are in units of seconds. 12 | type Coordinate struct { 13 | // Vec is the Euclidean portion of the coordinate. This is used along 14 | // with the other fields to provide an overall distance estimate. The 15 | // units here are seconds. 16 | Vec []float64 17 | 18 | // Err reflects the confidence in the given coordinate and is updated 19 | // dynamically by the Vivaldi Client. This is dimensionless. 20 | Error float64 21 | 22 | // Adjustment is a distance offset computed based on a calculation over 23 | // observations from all other nodes over a fixed window and is updated 24 | // dynamically by the Vivaldi Client. The units here are seconds. 25 | Adjustment float64 26 | 27 | // Height is a distance offset that accounts for non-Euclidean effects 28 | // which model the access links from nodes to the core Internet. The access 29 | // links are usually set by bandwidth and congestion, and the core links 30 | // usually follow distance based on geography. 31 | Height float64 32 | } 33 | 34 | const ( 35 | // secondsToNanoseconds is used to convert float seconds to nanoseconds. 
36 | secondsToNanoseconds = 1.0e9 37 | 38 | // zeroThreshold is used to decide if two coordinates are on top of each 39 | // other. 40 | zeroThreshold = 1.0e-6 41 | ) 42 | 43 | // ErrDimensionalityConflict will be panic-d if you try to perform operations 44 | // with incompatible dimensions. 45 | type DimensionalityConflictError struct{} 46 | 47 | // Adds the error interface. 48 | func (e DimensionalityConflictError) Error() string { 49 | return "coordinate dimensionality does not match" 50 | } 51 | 52 | // NewCoordinate creates a new coordinate at the origin, using the given config 53 | // to supply key initial values. 54 | func NewCoordinate(config *Config) *Coordinate { 55 | return &Coordinate{ 56 | Vec: make([]float64, config.Dimensionality), 57 | Error: config.VivaldiErrorMax, 58 | Adjustment: 0.0, 59 | Height: config.HeightMin, 60 | } 61 | } 62 | 63 | // Clone creates an independent copy of this coordinate. 64 | func (c *Coordinate) Clone() *Coordinate { 65 | vec := make([]float64, len(c.Vec)) 66 | copy(vec, c.Vec) 67 | return &Coordinate{ 68 | Vec: vec, 69 | Error: c.Error, 70 | Adjustment: c.Adjustment, 71 | Height: c.Height, 72 | } 73 | } 74 | 75 | // componentIsValid returns false if a floating point value is a NaN or an 76 | // infinity. 77 | func componentIsValid(f float64) bool { 78 | return !math.IsInf(f, 0) && !math.IsNaN(f) 79 | } 80 | 81 | // IsValid returns false if any component of a coordinate isn't valid, per the 82 | // componentIsValid() helper above. 83 | func (c *Coordinate) IsValid() bool { 84 | for i := range c.Vec { 85 | if !componentIsValid(c.Vec[i]) { 86 | return false 87 | } 88 | } 89 | 90 | return componentIsValid(c.Error) && 91 | componentIsValid(c.Adjustment) && 92 | componentIsValid(c.Height) 93 | } 94 | 95 | // IsCompatibleWith checks to see if the two coordinates are compatible 96 | // dimensionally. If this returns true then you are guaranteed to not get 97 | // any runtime errors operating on them. 
98 | func (c *Coordinate) IsCompatibleWith(other *Coordinate) bool { 99 | return len(c.Vec) == len(other.Vec) 100 | } 101 | 102 | // ApplyForce returns the result of applying the force from the direction of the 103 | // other coordinate. 104 | func (c *Coordinate) ApplyForce(config *Config, force float64, other *Coordinate) *Coordinate { 105 | if !c.IsCompatibleWith(other) { 106 | panic(DimensionalityConflictError{}) 107 | } 108 | 109 | ret := c.Clone() 110 | unit, mag := unitVectorAt(c.Vec, other.Vec) 111 | ret.Vec = add(ret.Vec, mul(unit, force)) 112 | if mag > zeroThreshold { 113 | ret.Height = (ret.Height+other.Height)*force/mag + ret.Height 114 | ret.Height = math.Max(ret.Height, config.HeightMin) 115 | } 116 | return ret 117 | } 118 | 119 | // DistanceTo returns the distance between this coordinate and the other 120 | // coordinate, including adjustments. 121 | func (c *Coordinate) DistanceTo(other *Coordinate) time.Duration { 122 | if !c.IsCompatibleWith(other) { 123 | panic(DimensionalityConflictError{}) 124 | } 125 | 126 | dist := c.rawDistanceTo(other) 127 | adjustedDist := dist + c.Adjustment + other.Adjustment 128 | if adjustedDist > 0.0 { 129 | dist = adjustedDist 130 | } 131 | return time.Duration(dist * secondsToNanoseconds) 132 | } 133 | 134 | // rawDistanceTo returns the Vivaldi distance between this coordinate and the 135 | // other coordinate in seconds, not including adjustments. This assumes the 136 | // dimensions have already been checked to be compatible. 137 | func (c *Coordinate) rawDistanceTo(other *Coordinate) float64 { 138 | return magnitude(diff(c.Vec, other.Vec)) + c.Height + other.Height 139 | } 140 | 141 | // add returns the sum of vec1 and vec2. This assumes the dimensions have 142 | // already been checked to be compatible. 
143 | func add(vec1 []float64, vec2 []float64) []float64 { 144 | ret := make([]float64, len(vec1)) 145 | for i := range ret { 146 | ret[i] = vec1[i] + vec2[i] 147 | } 148 | return ret 149 | } 150 | 151 | // diff returns the difference between the vec1 and vec2. This assumes the 152 | // dimensions have already been checked to be compatible. 153 | func diff(vec1 []float64, vec2 []float64) []float64 { 154 | ret := make([]float64, len(vec1)) 155 | for i := range ret { 156 | ret[i] = vec1[i] - vec2[i] 157 | } 158 | return ret 159 | } 160 | 161 | // mul returns vec multiplied by a scalar factor. 162 | func mul(vec []float64, factor float64) []float64 { 163 | ret := make([]float64, len(vec)) 164 | for i := range vec { 165 | ret[i] = vec[i] * factor 166 | } 167 | return ret 168 | } 169 | 170 | // magnitude computes the magnitude of the vec. 171 | func magnitude(vec []float64) float64 { 172 | sum := 0.0 173 | for i := range vec { 174 | sum += vec[i] * vec[i] 175 | } 176 | return math.Sqrt(sum) 177 | } 178 | 179 | // unitVectorAt returns a unit vector pointing at vec1 from vec2. If the two 180 | // positions are the same then a random unit vector is returned. We also return 181 | // the distance between the points for use in the later height calculation. 182 | func unitVectorAt(vec1 []float64, vec2 []float64) ([]float64, float64) { 183 | ret := diff(vec1, vec2) 184 | 185 | // If the coordinates aren't on top of each other we can normalize. 186 | if mag := magnitude(ret); mag > zeroThreshold { 187 | return mul(ret, 1.0/mag), mag 188 | } 189 | 190 | // Otherwise, just return a random unit vector. 191 | for i := range ret { 192 | ret[i] = rand.Float64() - 0.5 193 | } 194 | if mag := magnitude(ret); mag > zeroThreshold { 195 | return mul(ret, 1.0/mag), 0.0 196 | } 197 | 198 | // And finally just give up and make a unit vector along the first 199 | // dimension. This should be exceedingly rare. 
200 | ret = make([]float64, len(ret)) 201 | ret[0] = 1.0 202 | return ret, 0.0 203 | } 204 | -------------------------------------------------------------------------------- /gossip/coordinate/performance_test.go: -------------------------------------------------------------------------------- 1 | package coordinate 2 | 3 | import ( 4 | "math" 5 | "testing" 6 | "time" 7 | ) 8 | 9 | func TestPerformance_Line(t *testing.T) { 10 | const spacing = 10 * time.Millisecond 11 | const nodes, cycles = 10, 1000 12 | config := DefaultConfig() 13 | clients, err := GenerateClients(nodes, config) 14 | if err != nil { 15 | t.Fatal(err) 16 | } 17 | truth := GenerateLine(nodes, spacing) 18 | Simulate(clients, truth, cycles) 19 | stats := Evaluate(clients, truth) 20 | if stats.ErrorAvg > 0.0018 || stats.ErrorMax > 0.0092 { 21 | t.Fatalf("performance stats are out of spec: %v", stats) 22 | } 23 | } 24 | 25 | func TestPerformance_Grid(t *testing.T) { 26 | const spacing = 10 * time.Millisecond 27 | const nodes, cycles = 25, 1000 28 | config := DefaultConfig() 29 | clients, err := GenerateClients(nodes, config) 30 | if err != nil { 31 | t.Fatal(err) 32 | } 33 | truth := GenerateGrid(nodes, spacing) 34 | Simulate(clients, truth, cycles) 35 | stats := Evaluate(clients, truth) 36 | if stats.ErrorAvg > 0.0015 || stats.ErrorMax > 0.022 { 37 | t.Fatalf("performance stats are out of spec: %v", stats) 38 | } 39 | } 40 | 41 | func TestPerformance_Split(t *testing.T) { 42 | const lan, wan = 1 * time.Millisecond, 10 * time.Millisecond 43 | const nodes, cycles = 25, 1000 44 | config := DefaultConfig() 45 | clients, err := GenerateClients(nodes, config) 46 | if err != nil { 47 | t.Fatal(err) 48 | } 49 | truth := GenerateSplit(nodes, lan, wan) 50 | Simulate(clients, truth, cycles) 51 | stats := Evaluate(clients, truth) 52 | if stats.ErrorAvg > 0.000060 || stats.ErrorMax > 0.00048 { 53 | t.Fatalf("performance stats are out of spec: %v", stats) 54 | } 55 | } 56 | 57 | func TestPerformance_Height(t 
*testing.T) { 58 | const radius = 100 * time.Millisecond 59 | const nodes, cycles = 25, 1000 60 | 61 | // Constrain us to two dimensions so that we can just exactly represent 62 | // the circle. 63 | config := DefaultConfig() 64 | config.Dimensionality = 2 65 | clients, err := GenerateClients(nodes, config) 66 | if err != nil { 67 | t.Fatal(err) 68 | } 69 | 70 | // Generate truth where the first coordinate is in the "middle" because 71 | // it's equidistant from all the nodes, but it will have an extra radius 72 | // added to the distance, so it should come out above all the others. 73 | truth := GenerateCircle(nodes, radius) 74 | Simulate(clients, truth, cycles) 75 | 76 | // Make sure the height looks reasonable with the regular nodes all in a 77 | // plane, and the center node up above. 78 | for i := range clients { 79 | coord := clients[i].GetCoordinate() 80 | if i == 0 { 81 | if coord.Height < 0.97*radius.Seconds() { 82 | t.Fatalf("height is out of spec: %9.6f", coord.Height) 83 | } 84 | } else { 85 | if coord.Height > 0.03*radius.Seconds() { 86 | t.Fatalf("height is out of spec: %9.6f", coord.Height) 87 | } 88 | } 89 | } 90 | stats := Evaluate(clients, truth) 91 | if stats.ErrorAvg > 0.0025 || stats.ErrorMax > 0.064 { 92 | t.Fatalf("performance stats are out of spec: %v", stats) 93 | } 94 | } 95 | 96 | func TestPerformance_Drift(t *testing.T) { 97 | const dist = 500 * time.Millisecond 98 | const nodes = 4 99 | config := DefaultConfig() 100 | config.Dimensionality = 2 101 | clients, err := GenerateClients(nodes, config) 102 | if err != nil { 103 | t.Fatal(err) 104 | } 105 | 106 | // Do some icky surgery on the clients to put them into a square, up in 107 | // the first quadrant. 
108 | 	clients[0].coord.Vec = []float64{0.0, 0.0} 109 | 	clients[1].coord.Vec = []float64{0.0, dist.Seconds()} 110 | 	clients[2].coord.Vec = []float64{dist.Seconds(), dist.Seconds()} 111 | 	// Node 3 sits at the bottom-right corner of the square (see the diagram
112 | 	// below), so its y component must be 0, not dist; the previous value
113 | 	// duplicated node 2's position and broke the square layout.
114 | 	clients[3].coord.Vec = []float64{dist.Seconds(), 0.0} 115 | 116 | 	// Make a corresponding truth matrix. The nodes are laid out like this 117 | 	// so the distances are all equal, except for the diagonal: 118 | 	// 119 | 	// (1) <- dist -> (2) 120 | 	// 121 | 	// | <- dist    | 122 | 	// |            | 123 | 	// |    dist -> | 124 | 	// 125 | 	// (0) <- dist -> (3) 126 | 	// 127 | 	truth := make([][]time.Duration, nodes) 128 | 	for i := range truth { 129 | 		truth[i] = make([]time.Duration, nodes) 130 | 	} 131 | 	for i := 0; i < nodes; i++ { 132 | 		for j := i + 1; j < nodes; j++ { 133 | 			rtt := dist 134 | 			if (i%2 == 0) && (j%2 == 0) { 135 | 				rtt = time.Duration(math.Sqrt2 * float64(rtt)) 136 | 			} 137 | 			truth[i][j], truth[j][i] = rtt, rtt 138 | 		} 139 | 	} 140 | 141 | 	calcCenterError := func() float64 { 142 | 		min, max := clients[0].GetCoordinate(), clients[0].GetCoordinate() 143 | 		for i := 1; i < nodes; i++ { 144 | 			coord := clients[i].GetCoordinate() 145 | 			for j, v := range coord.Vec { 146 | 				min.Vec[j] = math.Min(min.Vec[j], v) 147 | 				max.Vec[j] = math.Max(max.Vec[j], v) 148 | 			} 149 | 		} 150 | 151 | 		mid := make([]float64, config.Dimensionality) 152 | 		for i := range mid { 153 | 			mid[i] = min.Vec[i] + (max.Vec[i]-min.Vec[i])/2 154 | 		} 155 | 		return magnitude(mid) 156 | 	} 157 | 158 | 	// Let the simulation run for a while to stabilize, then snap a baseline 159 | 	// for the center error. 
162 | 	Simulate(clients, truth, 10000) 163 | 	if error := calcCenterError(); error > 0.8*baseline { 164 | 		t.Fatalf("drift performance out of spec: %9.6f -> %9.6f", baseline, error) 165 | 	} 166 | } 167 | 168 | func TestPerformance_Random(t *testing.T) { 169 | 	const mean, deviation = 100 * time.Millisecond, 10 * time.Millisecond 170 | 	const nodes, cycles = 25, 1000 171 | 	config := DefaultConfig() 172 | 	clients, err := GenerateClients(nodes, config) 173 | 	if err != nil { 174 | 		t.Fatal(err) 175 | 	} 176 | 	truth := GenerateRandom(nodes, mean, deviation) 177 | 	Simulate(clients, truth, cycles) 178 | 	stats := Evaluate(clients, truth) 179 | 	if stats.ErrorAvg > 0.075 || stats.ErrorMax > 0.33 { 180 | 		t.Fatalf("performance stats are out of spec: %v", stats) 181 | 	} 182 | } 183 | -------------------------------------------------------------------------------- /gossip/coordinate/phantom.go: -------------------------------------------------------------------------------- 1 | package coordinate 2 | 3 | import ( 4 | 	"fmt" 5 | 	"math" 6 | 	"math/rand" 7 | 	"time" 8 | ) 9 | 10 | // GenerateClients returns a slice with nodes number of clients, all with the 11 | // given config. 12 | func GenerateClients(nodes int, config *Config) ([]*Client, error) { 13 | 	clients := make([]*Client, nodes) 14 | 	for i := range clients { 15 | 		client, err := NewClient(config) 16 | 		if err != nil { 17 | 			return nil, err 18 | 		} 19 | 20 | 		clients[i] = client 21 | 	} 22 | 	return clients, nil 23 | } 24 | 25 | // GenerateLine returns a truth matrix as if all the nodes are in a straight line 26 | // with the given spacing between them. 
27 | func GenerateLine(nodes int, spacing time.Duration) [][]time.Duration { 28 | truth := make([][]time.Duration, nodes) 29 | for i := range truth { 30 | truth[i] = make([]time.Duration, nodes) 31 | } 32 | 33 | for i := 0; i < nodes; i++ { 34 | for j := i + 1; j < nodes; j++ { 35 | rtt := time.Duration(j-i) * spacing 36 | truth[i][j], truth[j][i] = rtt, rtt 37 | } 38 | } 39 | return truth 40 | } 41 | 42 | // GenerateGrid returns a truth matrix as if all the nodes are in a two dimensional 43 | // grid with the given spacing between them. 44 | func GenerateGrid(nodes int, spacing time.Duration) [][]time.Duration { 45 | truth := make([][]time.Duration, nodes) 46 | for i := range truth { 47 | truth[i] = make([]time.Duration, nodes) 48 | } 49 | 50 | n := int(math.Sqrt(float64(nodes))) 51 | for i := 0; i < nodes; i++ { 52 | for j := i + 1; j < nodes; j++ { 53 | x1, y1 := float64(i%n), float64(i/n) 54 | x2, y2 := float64(j%n), float64(j/n) 55 | dx, dy := x2-x1, y2-y1 56 | dist := math.Sqrt(dx*dx + dy*dy) 57 | rtt := time.Duration(dist * float64(spacing)) 58 | truth[i][j], truth[j][i] = rtt, rtt 59 | } 60 | } 61 | return truth 62 | } 63 | 64 | // GenerateSplit returns a truth matrix as if half the nodes are close together in 65 | // one location and half the nodes are close together in another. The lan factor 66 | // is used to separate the nodes locally and the wan factor represents the split 67 | // between the two sides. 
68 | func GenerateSplit(nodes int, lan time.Duration, wan time.Duration) [][]time.Duration { 69 | truth := make([][]time.Duration, nodes) 70 | for i := range truth { 71 | truth[i] = make([]time.Duration, nodes) 72 | } 73 | 74 | split := nodes / 2 75 | for i := 0; i < nodes; i++ { 76 | for j := i + 1; j < nodes; j++ { 77 | rtt := lan 78 | if (i <= split && j > split) || (i > split && j <= split) { 79 | rtt += wan 80 | } 81 | truth[i][j], truth[j][i] = rtt, rtt 82 | } 83 | } 84 | return truth 85 | } 86 | 87 | // GenerateCircle returns a truth matrix for a set of nodes, evenly distributed 88 | // around a circle with the given radius. The first node is at the "center" of the 89 | // circle because it's equidistant from all the other nodes, but we place it at 90 | // double the radius, so it should show up above all the other nodes in height. 91 | func GenerateCircle(nodes int, radius time.Duration) [][]time.Duration { 92 | truth := make([][]time.Duration, nodes) 93 | for i := range truth { 94 | truth[i] = make([]time.Duration, nodes) 95 | } 96 | 97 | for i := 0; i < nodes; i++ { 98 | for j := i + 1; j < nodes; j++ { 99 | var rtt time.Duration 100 | if i == 0 { 101 | rtt = 2 * radius 102 | } else { 103 | t1 := 2.0 * math.Pi * float64(i) / float64(nodes) 104 | x1, y1 := math.Cos(t1), math.Sin(t1) 105 | t2 := 2.0 * math.Pi * float64(j) / float64(nodes) 106 | x2, y2 := math.Cos(t2), math.Sin(t2) 107 | dx, dy := x2-x1, y2-y1 108 | dist := math.Sqrt(dx*dx + dy*dy) 109 | rtt = time.Duration(dist * float64(radius)) 110 | } 111 | truth[i][j], truth[j][i] = rtt, rtt 112 | } 113 | } 114 | return truth 115 | } 116 | 117 | // GenerateRandom returns a truth matrix for a set of nodes with normally 118 | // distributed delays, with the given mean and deviation. The RNG is re-seeded 119 | // so you always get the same matrix for a given size. 
120 | func GenerateRandom(nodes int, mean time.Duration, deviation time.Duration) [][]time.Duration { 121 | rand.Seed(1) 122 | 123 | truth := make([][]time.Duration, nodes) 124 | for i := range truth { 125 | truth[i] = make([]time.Duration, nodes) 126 | } 127 | 128 | for i := 0; i < nodes; i++ { 129 | for j := i + 1; j < nodes; j++ { 130 | rttSeconds := rand.NormFloat64()*deviation.Seconds() + mean.Seconds() 131 | rtt := time.Duration(rttSeconds * secondsToNanoseconds) 132 | truth[i][j], truth[j][i] = rtt, rtt 133 | } 134 | } 135 | return truth 136 | } 137 | 138 | // Simulate runs the given number of cycles using the given list of clients and 139 | // truth matrix. On each cycle, each client will pick a random node and observe 140 | // the truth RTT, updating its coordinate estimate. The RNG is re-seeded for 141 | // each simulation run to get deterministic results (for this algorithm and the 142 | // underlying algorithm which will use random numbers for position vectors when 143 | // starting out with everything at the origin). 144 | func Simulate(clients []*Client, truth [][]time.Duration, cycles int) { 145 | rand.Seed(1) 146 | 147 | nodes := len(clients) 148 | for cycle := 0; cycle < cycles; cycle++ { 149 | for i := range clients { 150 | if j := rand.Intn(nodes); j != i { 151 | c := clients[j].GetCoordinate() 152 | rtt := truth[i][j] 153 | node := fmt.Sprintf("node_%d", j) 154 | clients[i].Update(node, c, rtt) 155 | } 156 | } 157 | } 158 | } 159 | 160 | // Stats is returned from the Evaluate function with a summary of the algorithm 161 | // performance. 162 | type Stats struct { 163 | ErrorMax float64 164 | ErrorAvg float64 165 | } 166 | 167 | // Evaluate uses the coordinates of the given clients to calculate estimated 168 | // distances and compares them with the given truth matrix, returning summary 169 | // stats. 
170 | func Evaluate(clients []*Client, truth [][]time.Duration) (stats Stats) { 171 | nodes := len(clients) 172 | count := 0 173 | for i := 0; i < nodes; i++ { 174 | for j := i + 1; j < nodes; j++ { 175 | est := clients[i].DistanceTo(clients[j].GetCoordinate()).Seconds() 176 | actual := truth[i][j].Seconds() 177 | error := math.Abs(est-actual) / actual 178 | stats.ErrorMax = math.Max(stats.ErrorMax, error) 179 | stats.ErrorAvg += error 180 | count += 1 181 | } 182 | } 183 | 184 | stats.ErrorAvg /= float64(count) 185 | fmt.Printf("Error avg=%9.6f max=%9.6f\n", stats.ErrorAvg, stats.ErrorMax) 186 | return 187 | } 188 | -------------------------------------------------------------------------------- /gossip/coordinate/util_test.go: -------------------------------------------------------------------------------- 1 | package coordinate 2 | 3 | import ( 4 | "math" 5 | "testing" 6 | ) 7 | 8 | // verifyEqualFloats will compare f1 and f2 and fail if they are not 9 | // "equal" within a threshold. 10 | func verifyEqualFloats(t *testing.T, f1 float64, f2 float64) { 11 | const zeroThreshold = 1.0e-6 12 | if math.Abs(f1-f2) > zeroThreshold { 13 | t.Fatalf("equal assertion fail, %9.6f != %9.6f", f1, f2) 14 | } 15 | } 16 | 17 | // verifyEqualVectors will compare vec1 and vec2 and fail if they are not 18 | // "equal" within a threshold. 
19 | func verifyEqualVectors(t *testing.T, vec1 []float64, vec2 []float64) { 20 | if len(vec1) != len(vec2) { 21 | t.Fatalf("vector length mismatch, %d != %d", len(vec1), len(vec2)) 22 | } 23 | 24 | for i := range vec1 { 25 | verifyEqualFloats(t, vec1[i], vec2[i]) 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /gossip/event.go: -------------------------------------------------------------------------------- 1 | package gossip 2 | 3 | import ( 4 | "sync" 5 | 6 | "github.com/hashicorp/memberlist" 7 | "github.com/lni/goutils/syncutil" 8 | "go.uber.org/zap" 9 | ) 10 | 11 | type eventDelegate struct { 12 | g *GossipManager 13 | memberlist.ChannelEventDelegate 14 | ch chan memberlist.NodeEvent 15 | stopper *syncutil.Stopper 16 | nodes sync.Map 17 | } 18 | 19 | type aliveInstance struct { 20 | mu sync.RWMutex 21 | 22 | // moveTo grpcaddr映射rubik对外提供服务grpcaddr 23 | moveToRubik map[string]string 24 | 25 | moveToInstances map[string]bool 26 | rubikInstances map[string]bool 27 | } 28 | 29 | func newAliveInstance() *aliveInstance { 30 | return &aliveInstance{ 31 | moveToRubik: make(map[string]string), 32 | moveToInstances: make(map[string]bool), 33 | rubikInstances: make(map[string]bool), 34 | } 35 | } 36 | 37 | func (ai *aliveInstance) updateInstance(meta *Meta, alive bool) { 38 | if meta == nil { 39 | return 40 | } 41 | 42 | ai.mu.Lock() 43 | if alive { 44 | ai.moveToRubik[meta.MoveToGrpcAddr] = meta.RubikGrpcAddr 45 | } else { 46 | delete(ai.moveToRubik, meta.MoveToGrpcAddr) 47 | } 48 | ai.moveToInstances[meta.MoveToGrpcAddr] = alive 49 | ai.rubikInstances[meta.RubikGrpcAddr] = alive 50 | ai.mu.Unlock() 51 | } 52 | 53 | func (ai *aliveInstance) getMoveToInstances() map[string]bool { 54 | ai.mu.RLock() 55 | defer ai.mu.RUnlock() 56 | 57 | return ai.moveToInstances 58 | } 59 | 60 | func (ai *aliveInstance) getRubikInstances() map[string]bool { 61 | ai.mu.RLock() 62 | defer ai.mu.RUnlock() 63 | 64 | return ai.rubikInstances 
65 | } 66 | 67 | func (ai *aliveInstance) getMoveToRubik() map[string]string { 68 | ai.mu.RLock() 69 | defer ai.mu.RUnlock() 70 | 71 | return ai.moveToRubik 72 | } 73 | 74 | func newEventDelegate(s *syncutil.Stopper, g *GossipManager) *eventDelegate { 75 | ch := make(chan memberlist.NodeEvent, 10) 76 | ed := &eventDelegate{ 77 | g: g, 78 | stopper: s, 79 | ch: ch, 80 | ChannelEventDelegate: memberlist.ChannelEventDelegate{Ch: ch}, 81 | } 82 | return ed 83 | } 84 | 85 | func (d *eventDelegate) decodeMeta(e memberlist.NodeEvent, fields []zap.Field) *Meta { 86 | if len(e.Node.Meta) == 0 || messageType(e.Node.Meta[0]) != tagMagicByte { 87 | d.g.log.Warn("[multiraft] [self-gossip-user] [eventdelegate] [metaType]", 88 | append(fields, 89 | zap.Int("event", int(e.Event)), 90 | zap.String("nodename", e.Node.Name), 91 | zap.String("nodeaddress", e.Node.Address()), 92 | zap.String("meta", string(e.Node.Meta)), 93 | zap.String("err", "meta messageType error"), 94 | )...) 95 | return nil 96 | } 97 | 98 | meta := &Meta{} 99 | if err := decodeMessage(e.Node.Meta[1:], &meta); err != nil { 100 | d.g.log.Warn("[multiraft] [self-gossip-user] [eventdelegate] [metaDecode]", 101 | append(fields, 102 | zap.Int("event", int(e.Event)), 103 | zap.String("nodename", e.Node.Name), 104 | zap.String("nodeaddress", e.Node.Address()), 105 | zap.String("meta", string(e.Node.Meta)), 106 | zap.Error(err), 107 | )...) 
108 | return nil 109 | } 110 | 111 | return meta 112 | } 113 | 114 | func (d *eventDelegate) start() { 115 | localNode := d.g.list.LocalNode() 116 | fields := []zap.Field{ 117 | zap.String("name", localNode.Name), 118 | zap.String("address", localNode.Address()), 119 | } 120 | 121 | d.stopper.RunWorker(func() { 122 | for { 123 | select { 124 | case <-d.stopper.ShouldStop(): 125 | return 126 | case e := <-d.ch: 127 | meta := d.decodeMeta(e, fields) 128 | if e.Event == memberlist.NodeJoin || e.Event == memberlist.NodeUpdate { 129 | d.g.log.Info("[multiraft] [self-gossip-user] [eventdelegate] [update]", 130 | append(fields, 131 | zap.Int("event", int(e.Event)), 132 | zap.String("nodename", e.Node.Name), 133 | zap.String("nodeaddress", e.Node.Address()), 134 | zap.String("meta", string(e.Node.Meta)), 135 | )..., 136 | ) 137 | d.nodes.Store(e.Node.Name, string(e.Node.Meta)) 138 | d.g.aliveInstance.updateInstance(meta, true) 139 | } else if e.Event == memberlist.NodeLeave { 140 | d.g.log.Info("[multiraft] [self-gossip-user] [eventdelegate] [delete]", 141 | append(fields, 142 | zap.Int("event", int(e.Event)), 143 | zap.String("nodename", e.Node.Name), 144 | zap.String("nodeaddress", e.Node.Address()), 145 | zap.String("meta", string(e.Node.Meta)), 146 | )..., 147 | ) 148 | d.nodes.Delete(e.Node.Name) 149 | d.g.aliveInstance.updateInstance(meta, false) 150 | } 151 | } 152 | } 153 | }) 154 | } 155 | -------------------------------------------------------------------------------- /gossip/message.go: -------------------------------------------------------------------------------- 1 | package gossip 2 | 3 | import ( 4 | "bytes" 5 | "compress/gzip" 6 | "encoding/json" 7 | "io" 8 | 9 | "github.com/hashicorp/go-msgpack/codec" 10 | "github.com/hashicorp/memberlist" 11 | ) 12 | 13 | // messageType are the types of gossip messages will send along 14 | // memberlist. 
15 | type messageType uint8 16 | 17 | const ( 18 | messageClusterType messageType = iota 19 | messageMembershipType 20 | messagePushPullType 21 | ) 22 | 23 | type TargetClusterId struct { 24 | GrpcAddr string `json:"grpcAddr"` 25 | ClusterIds []uint64 `json:"clusterIds"` 26 | } 27 | 28 | type RaftClusterMessage struct { 29 | Revision int64 `json:"revision"` 30 | 31 | // 机器ID对应的MoveTo的GRPC地址 32 | // key: 当raft以nodehostid=true的方式起的时候是机器ID,以固定地址方式起是raftAddr 33 | Targets map[string]TargetClusterId `json:"targets"` 34 | 35 | // 每个raft cluster对应的机器ID|raftAddr 36 | Clusters map[uint64][]string `json:"clusters"` 37 | 38 | // 每个raft cluster的initial members 39 | // key: clusterId, key: nodeId, val: raftAddr或nodeHostID 40 | InitialMembers map[uint64]map[uint64]string `json:"initial_members"` 41 | Join map[uint64]map[uint64]bool `json:"join"` 42 | } 43 | 44 | func (rm *RaftClusterMessage) String() string { 45 | b, _ := json.Marshal(rm) 46 | return string(b) 47 | } 48 | 49 | type MemberInfo struct { 50 | ClusterId uint64 51 | ConfigChangeId uint64 52 | Nodes map[uint64]string 53 | Observers map[uint64]string 54 | LeaderId uint64 55 | LeaderValid bool 56 | } 57 | 58 | func (mi *MemberInfo) String() string { 59 | b, _ := json.Marshal(mi) 60 | return string(b) 61 | } 62 | 63 | type RaftMembershipMessage struct { 64 | // key: clusterId 65 | MemberInfos map[uint64]*MemberInfo 66 | } 67 | 68 | func (rm *RaftMembershipMessage) String() string { 69 | b, _ := json.Marshal(rm) 70 | return string(b) 71 | } 72 | 73 | type PushPullMessage struct { 74 | Cluster *RaftClusterMessage 75 | Membership *RaftMembershipMessage 76 | } 77 | 78 | func (pp *PushPullMessage) String() string { 79 | b, _ := json.Marshal(pp) 80 | return string(b) 81 | } 82 | 83 | func decodeMessage(buf []byte, out interface{}) error { 84 | bbuf, err := GZipDecode(buf) 85 | if err != nil { 86 | return err 87 | } 88 | 89 | var handle codec.MsgpackHandle 90 | return codec.NewDecoder(bytes.NewReader(bbuf), &handle).Decode(out) 
91 | } 92 | 93 | func encodeMessage(t messageType, msg interface{}) ([]byte, error) { 94 | buf := bytes.NewBuffer(nil) 95 | 96 | handle := codec.MsgpackHandle{} 97 | encoder := codec.NewEncoder(buf, &handle) 98 | err := encoder.Encode(msg) 99 | if err != nil { 100 | return nil, err 101 | } 102 | 103 | gbuf, err := GZipEncode(buf.Bytes()) 104 | if err != nil { 105 | return nil, err 106 | } 107 | 108 | return append([]byte{uint8(t)}, gbuf...), nil 109 | } 110 | 111 | type broadcast struct { 112 | msg []byte 113 | notify chan<- struct{} 114 | } 115 | 116 | func newBroadcast(msg []byte) *broadcast { 117 | return &broadcast{ 118 | msg: msg, 119 | notify: make(chan struct{}), 120 | } 121 | } 122 | 123 | func (b *broadcast) Invalidates(other memberlist.Broadcast) bool { 124 | return false 125 | } 126 | 127 | func (b *broadcast) Message() []byte { 128 | return b.msg 129 | } 130 | 131 | func (b *broadcast) Finished() { 132 | if b.notify != nil { 133 | close(b.notify) 134 | } 135 | } 136 | 137 | func GZipEncode(content []byte) ([]byte, error) { 138 | var buffer bytes.Buffer 139 | writer := gzip.NewWriter(&buffer) 140 | if _, err := writer.Write(content); err != nil { 141 | return nil, err 142 | } 143 | 144 | if err := writer.Flush(); err != nil { 145 | return nil, err 146 | } 147 | 148 | if err := writer.Close(); err != nil { 149 | return nil, err 150 | } 151 | 152 | return buffer.Bytes(), nil 153 | } 154 | 155 | func GZipDecode(buf []byte) ([]byte, error) { 156 | reader, err := gzip.NewReader(bytes.NewReader(buf)) 157 | if err != nil { 158 | return nil, err 159 | } 160 | defer reader.Close() 161 | 162 | return io.ReadAll(reader) 163 | } 164 | -------------------------------------------------------------------------------- /gossip/ping_delegate.go: -------------------------------------------------------------------------------- 1 | package gossip 2 | 3 | import ( 4 | "bytes" 5 | "time" 6 | 7 | "github.com/xkeyideal/mraft/gossip/coordinate" 8 | 9 | 
"github.com/hashicorp/go-msgpack/codec" 10 | "github.com/hashicorp/memberlist" 11 | "go.uber.org/zap" 12 | ) 13 | 14 | // pingDelegate is notified when memberlist successfully completes a direct ping 15 | // of a peer node. We use this to update our estimated network coordinate, as 16 | // well as cache the coordinate of the peer. 17 | type pingDelegate struct { 18 | g *GossipManager 19 | } 20 | 21 | const ( 22 | // PingVersion is an internal version for the ping message, above the normal 23 | // versioning we get from the protocol version. This enables small updates 24 | // to the ping message without a full protocol bump. 25 | PingVersion = 1 26 | ) 27 | 28 | // AckPayload is called to produce a payload to send back in response to a ping 29 | // request. 30 | func (p *pingDelegate) AckPayload() []byte { 31 | var buf bytes.Buffer 32 | 33 | // The first byte is the version number, forming a simple header. 34 | version := []byte{PingVersion} 35 | buf.Write(version) 36 | 37 | // The rest of the message is the serialized coordinate. 38 | enc := codec.NewEncoder(&buf, &codec.MsgpackHandle{}) 39 | if err := enc.Encode(p.g.coordClient.GetCoordinate()); err != nil { 40 | p.g.log.Error("[multiraft] [self-gossip-user] [ping-delegate] [AckPayload] [encode]", zap.Error(err)) 41 | } 42 | return buf.Bytes() 43 | } 44 | 45 | // NotifyPingComplete is called when this node successfully completes a direct ping 46 | // of a peer node. 47 | func (p *pingDelegate) NotifyPingComplete(other *memberlist.Node, rtt time.Duration, payload []byte) { 48 | if payload == nil || len(payload) == 0 { 49 | return 50 | } 51 | 52 | // Verify ping version in the header. 53 | version := payload[0] 54 | if version != PingVersion { 55 | p.g.log.Error("[multiraft] [self-gossip-user] [ping-delegate] [NotifyPingComplete] [version]", zap.Uint8("version", version)) 56 | return 57 | } 58 | 59 | // Process the remainder of the message as a coordinate. 
60 | r := bytes.NewReader(payload[1:]) 61 | dec := codec.NewDecoder(r, &codec.MsgpackHandle{}) 62 | var coord coordinate.Coordinate 63 | if err := dec.Decode(&coord); err != nil { 64 | p.g.log.Error("[multiraft] [self-gossip-user] [ping-delegate] [NotifyPingComplete] [decode]", zap.Error(err)) 65 | return 66 | } 67 | 68 | // Apply the update. 69 | before := p.g.coordClient.GetCoordinate() 70 | after, err := p.g.coordClient.Update(other.Name, &coord, rtt) 71 | if err != nil { 72 | //metrics.IncrCounter([]string{"serf", "coordinate", "rejected"}, 1) 73 | p.g.log.Error("[multiraft] [self-gossip-user] [ping-delegate] [NotifyPingComplete] [rejected]", zap.Error(err)) 74 | return 75 | } 76 | 77 | // Publish some metrics to give us an idea of how much we are 78 | // adjusting each time we update. 79 | d := float32(before.DistanceTo(after).Seconds() * 1.0e3) 80 | //metrics.AddSample([]string{"serf", "coordinate", "adjustment-ms"}, d) 81 | if d >= 100.0 { 82 | p.g.log.Warn("[multiraft] [self-gossip-user] [ping-delegate] [NotifyPingComplete] [DistanceTo]", 83 | zap.String("src", p.g.opts.Name), 84 | zap.String("dest", other.Name), 85 | zap.Int64("rtt", int64(rtt)), 86 | zap.Float32("adjustment-ms", d), 87 | ) 88 | } else { 89 | p.g.log.Info("[multiraft] [self-gossip-user] [ping-delegate] [NotifyPingComplete] [DistanceTo]", 90 | zap.String("src", p.g.opts.Name), 91 | zap.String("dest", other.Name), 92 | zap.Int64("rtt", int64(rtt)), 93 | zap.Float32("adjustment-ms", d), 94 | ) 95 | } 96 | 97 | // Cache the coordinate for the other node, and add our own 98 | // to the cache as well since it just got updated. This lets 99 | // users call GetCachedCoordinate with our node name, which is 100 | // more friendly. 
101 | p.g.coordCacheLock.Lock() 102 | p.g.coordCache[other.Name] = &coord 103 | p.g.coordCache[p.g.opts.Name] = p.g.coordClient.GetCoordinate() 104 | p.g.coordCacheLock.Unlock() 105 | } 106 | -------------------------------------------------------------------------------- /logger/zaplog.go: -------------------------------------------------------------------------------- 1 | package logger 2 | 3 | import ( 4 | "os" 5 | "time" 6 | 7 | "go.uber.org/zap" 8 | "go.uber.org/zap/zapcore" 9 | "gopkg.in/natefinch/lumberjack.v2" 10 | ) 11 | 12 | var ( 13 | cstLocal *time.Location 14 | err error 15 | ) 16 | 17 | func init() { 18 | cstLocal, err = time.LoadLocation("Asia/Shanghai") 19 | if err != nil { 20 | panic(err) 21 | } 22 | } 23 | 24 | func encodeTimeLayout(t time.Time, layout string, enc zapcore.PrimitiveArrayEncoder) { 25 | type appendTimeEncoder interface { 26 | AppendTimeLayout(time.Time, string) 27 | } 28 | 29 | if enc, ok := enc.(appendTimeEncoder); ok { 30 | enc.AppendTimeLayout(t, layout) 31 | return 32 | } 33 | 34 | enc.AppendString(t.Format(layout)) 35 | } 36 | 37 | func CSTTimeEncoder(t time.Time, enc zapcore.PrimitiveArrayEncoder) { 38 | encodeTimeLayout(t.In(cstLocal), "2006-01-02 15:04:05.000", enc) 39 | } 40 | 41 | func NewLogger(logFilename string, level zapcore.Level, stdout bool) *zap.Logger { 42 | encoderConfig := zapcore.EncoderConfig{ 43 | TimeKey: "time", 44 | LevelKey: "level", 45 | NameKey: "logger", 46 | CallerKey: "caller", 47 | MessageKey: "msg", 48 | StacktraceKey: "stacktrace", 49 | LineEnding: zapcore.DefaultLineEnding, 50 | EncodeLevel: zapcore.CapitalLevelEncoder, // 小写编码器 51 | EncodeTime: CSTTimeEncoder, 52 | EncodeDuration: zapcore.SecondsDurationEncoder, 53 | EncodeCaller: zapcore.FullCallerEncoder, // 全路径编码器 54 | } 55 | 56 | hook := lumberjack.Logger{ 57 | Filename: logFilename, // 日志文件路径 58 | MaxSize: 512, // 每个日志文件保存的最大尺寸 单位:M 59 | MaxBackups: 300, // 日志文件最多保存多少个备份 60 | MaxAge: 30, // 文件最多保存多少天 61 | Compress: true, // 是否压缩 62 | } 63 | 
64 | // 设置日志级别 65 | atomicLevel := zap.NewAtomicLevel() 66 | atomicLevel.SetLevel(level) 67 | 68 | writeSyncer := []zapcore.WriteSyncer{zapcore.AddSync(&hook)} 69 | if stdout { 70 | writeSyncer = append(writeSyncer, zapcore.Lock(os.Stdout)) 71 | } 72 | 73 | core := zapcore.NewCore( 74 | zapcore.NewConsoleEncoder(encoderConfig), // 编码器配置 75 | zapcore.NewMultiWriteSyncer(writeSyncer...), // 打印到控制台和文件 76 | atomicLevel, // 日志级别 77 | ) 78 | 79 | logger := zap.New(core) 80 | 81 | return logger 82 | } 83 | -------------------------------------------------------------------------------- /productready/README.md: -------------------------------------------------------------------------------- 1 | ### 启动方式 2 | 3 | 程序启动入口: productready/main/app.go httpPort, raftPort 4 | 5 | ### 启动配置项 6 | 7 | ```go 8 | type DynamicConfig struct { 9 | // raft数据存储目录 10 | RaftDir string `json:"raftDir"` 11 | 12 | // 日志存储目录 13 | LogDir string `json:"logDir"` 14 | 15 | // 每个raft节点的Id,一旦生成且加入集群,再次启动后,不能变动 16 | NodeId uint64 `json:"nodeId"` 17 | 18 | // key: clusterId, key:nodeId 19 | Join map[uint64]map[uint64]bool `json:"join"` 20 | 21 | // key: clusterId, key:nodeId 22 | // val: 根据dragonboat的启动方式决定 23 | // gossip方式: NodeHostId 24 | // 常规方式: raftAddr 25 | InitialMembers map[uint64]map[uint64]string `json:"initial_members"` 26 | 27 | // 本机的地址 28 | IP string `json:"ip"` 29 | 30 | // raft port 31 | RaftPort uint16 `json:"raft_port"` 32 | 33 | // http port 34 | HttpPort uint16 `json:"http_port"` 35 | } 36 | ``` 37 | 38 | ### dragonboat raft的启动方式 39 | 40 | 该raft框架的启动方式有两种 41 | 42 | 1. 采用常规的ip:port方式,此方式限制了重启后ip:port不能发生改变 43 | 2. 采用gossip方式启动,此方式只需要保证raft config里的 44 | ```go 45 | Expert: config.ExpertConfig{ 46 | TestNodeHostID: nodeId, 47 | }, 48 | ``` 49 | TestNodeHostID不变即可,可以实现重启后的ip改动,但数据文件和raft文件不能丢失,本质上仅支持机器换ip 50 | 51 | 本示例采用的是第一种方式. 
52 | 53 | raft集群里的每台集群管理的`clusterIds`不一定必须完全一致,即每台机器没必要存储全量的数据,可以写一个管理端程序来管理clusterId的每台机器分配情况。 54 | 如果每台机器不存全量的`clusterIds`,那么业务请求的key到来,可能该机器并不存在此key的value,解决办法可以采用该机器找到该key实际存在在哪来机器, 55 | 然后帮助业务完成请求并返回。 56 | 57 | ### 本示例目前无法直接启动 58 | 59 | 由于需要配置`DynamicConfig` 里的 `Join` 和 `InitialMembers` 字段后才能顺利启动,本人使用的真实生产环境是将启动配置写入服务器文件,每次重启时读取该文件获取上述必备的启动数据;如果不存在本地文件,则等待管理端推送该份配置。 60 | 61 | 因此想启动,可以先研究一下代码,然后将 `Join` 和 `InitialMembers` 字段写死在`productready/main/app.go`文件的配置里,然后尝试启动 62 | 63 | 刚开始第一步,将所有的`clusterId`和`nodeId`全部以原始节点启动,目前在`productready/engine.go`里系统是将所有`clusterIds`写死的`clusterIds = []uint64{0, 1, 2}`,真实的生产环境`clusterIds`的个数也是预先配置好的,均存储在管理端程序中。 64 | 65 | 如果节点是以`join`的方式加入,那么先启动该节点,然后调用集群接口,将此节点加入进集群,然后重新生成 `Join` 和 `InitialMembers` 字段,推送给新节点,等待即可;该份新配置也应该推送给集群中原本以存在的机器,本人真实使用的情况是对此份配置加上版本号来进行控制的。 66 | 67 | **根据[dragonboat](https://github.com/lni/dragonboat/blob/master/docs/overview.CHS.md)节点启动的文档,当一个节点重启时,不论该节点是一个初始节点还是后续通过成员变更添加的节点,均无需再次提供初始成员信息,也不再需要设置join参数为true**。 68 | 69 | 配置解释: 70 | 71 | 1. raft集群中每个节点ID(NodeID),采用该节点IP+port生成的48位uint64整型值 72 | 2. 若未配置raftPort,则控制中心采用默认的raftPort端口 `13890`启动 73 | 3. 样例提供的http端口采用raftPort的整型值加1作为HttpPort,无需用户配置 74 | 4. `join`字段,每个clusterId下对应存在哪些nodeId,且这些nodeId是集群的原始节点或后加入的节点 75 | 5. `initial_members`字段,告知集群每个clusterId下对应存在哪些nodeId,每个nodeId的raftAddr 76 | 77 | ### 节点加入raft集群 78 | 79 | 根据上述配置的说明,当节点主动加入raft集群时节点无需配置上述配置里的参数 `native` 和 `raftPeers`, 80 | 以在集群里的节点不能加入,**之前被删除的节点不能再次加入集群** 81 | 82 | 1. 调用样例加入节点的http接口,通知控制中心的raft集群,有新的节点需要加入集群, 待接口返回加入集群成功 83 | 2. 启动待加入集群的节点,此时原raft集群会自动寻址该新节点并同步数据 84 | 3. 
新节点成功加入集群后,请立即去webapi的控制面板里修改raft集群的节点数据 85 | 86 | ### NodeHostConfig 87 | 88 | ``` 89 | config.NodeHostConfig{ 90 | // DeploymentID用于确定两个NodeHost实例是否属于同一部署,并因此允许彼此通信。 91 | // 通过将上下文消息发送到不相关的Raft节点,这有助于防止意外配置错误的NodeHost实例导致数据损坏错误。 92 | // 对于特定的基于Dragonboat的应用程序,可以在所有生产NodeHost实例上将DeploymentID设置为相同的uint64值, 93 | // 然后在登台和开发环境中使用不同的DeploymentID值。 对于不同的基于Dragonboat的应用程序,也建议使用不同的DeploymentID值。 94 | // 如果未设置,则默认值0将用作部署ID,从而允许所有具有部署ID 0的NodeHost实例相互通信。 95 | DeploymentID: deploymentId, 96 | 97 | // WALDir是用于存储所有Raft日志的WAL的目录,这仅用于存储Raft日志的WAL,它的大小通常很小, 98 | // 每个NodeHost的64GB通常绰绰有余。如果不设置,则所有内容会存储在NodeHostDir中 99 | WALDir: raftDir, 100 | 101 | // NodeHostDir存储所有需要存储的信息 102 | NodeHostDir: raftDir, 103 | 104 | // RTTMillisecond定义了两个NodeHost实例之间的平均往返时间(RTT),以毫秒为单位 105 | // 这样的RTT间隔在内部用作逻辑时钟滴答,raft的心跳和选举间隔都根据有多少这样的RTT间隔来定义 106 | // 请注意,RTTMillisecond是两个NodeHost实例之间的组合延迟,包括由网络传输引起的所有延迟,NodeHost排队和处理引起的所有延迟。 107 | // 例如,在满载时,我们用于基准测试的两个NodeHost实例之间的平均往返时间最多为500微秒,而它们之间的ping时间为100微秒。 108 | // 当您的环境中的RTTMillisecond小于1百万时,请将其设置为1。 109 | RTTMillisecond: 200, 110 | 111 | //当前节点对外的IP和端口,其他raft节点需要通过这个信息获得 112 | RaftAddress: addr, 113 | 114 | // ListenAddress是Raft RPC模块用于侦听Raft消息和快照的IP:端口地址。 115 | // 如果未设置ListenAddress字段,则Raft RPC模块将使用RaftAddress。 116 | // 如果将0.0.0.0指定为ListenAddress的IP,则Dragonboat将侦听所有接口上的指定端口。 117 | // 指定主机名或域名后,它将首先在本地解析为IP地址,而Dragonboat会侦听所有解析的IP地址。 118 | // 一般不指定这个,和RaftAddress保持一致就好了,收发就用一个端口,没有必要分开 119 | ListenAddress: listenAddr, 120 | 121 | //是否使用TLS进行安全认证,整个程序都是部署在内网中,可以认为是安全的,就不打开这个了 122 | MutualTLS: false, 123 | 124 | //当配置了TLS时,需要指定CA文件的地址 125 | //当配置了TLS时,需要指定CertFile的地址 126 | //CertFile string 127 | //当配置了TLS时,需要指定KeyFile的地址 128 | //KeyFile string 129 | //MaxReceiveQueueSize是每个接收队列的最大大小(以字节为单位)。 一旦达到最大大小,将删除更多复制消息以限制内存使用。 设置为0时,表示队列大小不受限制。 130 | //暂时先设置为128M 131 | MaxSendQueueSize: 128 * 1024 * 1024, 132 | 133 | // EnableMetrics确定是否应启用Prometheus格式的健康度量。 134 | EnableMetrics: false, 135 | 136 | 
//MaxSnapshotSendBytesPerSecond定义了NodeHost实例管理的所有Raft群集每秒可发送多少快照数据。默认值0表示没有为快照流设置限制。 137 | //每秒最多传输256M数据 138 | MaxSnapshotSendBytesPerSecond: 256 * 1024 * 1024, 139 | 140 | // MaxSnapshotRecvBytesPerSecond定义可以存储多少快照数据由NodeHost实例管理的所有Raft群集每秒收到一次。默认值0表示接收快照数据没有限制。 141 | //目前不限制接受的大小,由发送端决定 142 | MaxSnapshotRecvBytesPerSecond: 0, 143 | } 144 | ``` 145 | 146 | ### RaftConfig 147 | 148 | ``` 149 | config.Config{ 150 | //当前节点的ID 151 | NodeID: nodeId, 152 | 153 | //当前节点的分片ID,如果当前raft是多组的,那么这个地方是指定当前组的ID 154 | ClusterID: clusterId, 155 | 156 | //领导节点是否应定期检查非领导者节点的状态,并在其不再具有法定人数时退出成为跟随者节点 157 | //当有5台机器,挂了3台,法定人数不够,则主节点退出,不再是主节点了,所有的写操作和同步读操作应该都不能执行了 158 | //各个节点只能读取本地的数据 159 | CheckQuorum: false, 160 | 161 | // ElectionRTT是两次选举之间的消息RTT的最小数量。 消息RTT由NodeHostConfig.RTTMillisecond定义。 162 | // Raft论文建议其幅度大于HeartbeatRTT(因为是先发现不健康,才会进行选举),即两个心跳之间的间隔。 163 | // 在Raft中,选举之间的实际间隔被随机分配在ElectionRTT和2 * ElectionRTT之间。例如,假设NodeHostConfig.RTTMillisecond为100毫秒, 164 | // 要将选举间隔设置为1秒,则应该将ElectionRTT设置为10。启用CheckQuorum后,ElectionRTT还将定义检查领导者定额的时间间隔。 165 | // 这个值是个比例,具体的RTT时间大小是RTTMillisecond*ElectionRTT,当需要选举主节点时,各个节点的随机间隔在ElectionRTT和2 * ElectionRTT, 166 | // 当CheckQuorum为true,主也会每隔这个时间检查下从机数据是否符合法定人数 167 | ElectionRTT: 60, 168 | 169 | // HeartbeatRTT是两次心跳之间的消息RTT数。 消息RTT由NodeHostConfig.RTTMillisecond定义。 Raft论文建议心跳间隔应接近节点之间的平均RTT。 170 | // 例如,假设NodeHostConfig.RTTMillisecond为100毫秒,要将心跳间隔设置为每200毫秒,则应将HeartbeatRTT设置为2。 171 | HeartbeatRTT: 6, 172 | 173 | // SnapshotEntries定义应自动对状态机进行快照的频率,可以将SnapshotEntries设置为0以禁用此类自动快照。 174 | // 当SnapshotEntries设置为N时,意味着大约每N条Raft日志创建一个快照。这也意味着向跟踪者发送N个日志条目比发送快照要昂贵。 175 | // 生成快照后,可以压缩新快照覆盖的Raft日志条目。这涉及两个步骤,冗余日志条目首先被标记为已删除,然后在稍后发布 LogDB 压缩时将其从基础存储中物理删除。 176 | // 有关在生成快照后实际删除和压缩哪些日志条目的详细信息,请参见CompactionOverhead,通过将SnapshotEntries字段设置为0禁用自动快照后, 177 | // 用户仍然可以使用NodeHost的RequestSnapshot或SyncRequestSnapshot方法手动请求快照。 178 | SnapshotEntries: 25 * 10000 * 10, 179 | 180 | // CompactionOverhead定义每次Raft日志压缩后要保留的最新条目数。 181 | // 
假设当前的日志为10000,开始创建快照,那么快照创建完成后,<=10000的日志都会被清理, 182 | // 如果想获得9000这样的日志,那么就得先完全加载快照,再从快照中读取,如果设置了CompactionOverhead为3000, 183 | // 那么就算创建了快照,我们仍然能获得10000-7000之间的日志记录,只有小于7000的,才需要重新加载日志获取 184 | CompactionOverhead: 25 * 10000, 185 | 186 | //确定是否使用ChangeID的顺序强制执行Raft成员资格更改。 187 | OrderedConfigChange: true, 188 | 189 | // MaxInMemLogSize是允许在每个Raft节点上的Raft日志存储在内存中的目标大小(以字节为单位)。 内存中的筏日志是尚未应用的日志。 190 | // MaxInMemLogSize是为防止内存无限增长而实现的目标值,并非用于精确限制确切的内存使用量。 191 | // 当MaxInMemLogSize为0时,目标设置为math.MaxUint64。 设置MaxInMemLogSize并达到目标后,客户端尝试提出新建议时将返回错误。 192 | // 建议将MaxInMemLogSize大于要使用的最大建议。 193 | //内存中未应用的日志大小,暂定为256M,超过256M的大小后会返回错误 194 | MaxInMemLogSize: 256 * 1024 * 1024, 195 | 196 | // SnapshotCompressionType是用于压缩生成的快照数据的压缩类型。 默认情况下不使用压缩。 197 | // 快照数据本身由rocksdb生成,采用了LZ4压缩,所以这边就不再继续压缩了 198 | SnapshotCompressionType: config.NoCompression, 199 | 200 | // EntryCompressionType是用于压缩用户日志。 使用Snappy时,允许的最大建议有效负载大致限制为3.42GB。 201 | EntryCompressionType: config.Snappy, 202 | 203 | // DisableAutoCompactions禁用用于回收Raft条目存储空间的自动压缩。 204 | // 默认情况下,每次捕获快照时都会执行压缩,这有助于以较高的IO开销为代价,尽快回收磁盘空间。 205 | // 用户可以禁用此类自动压缩,并在必要时使用NodeHost.RequestCompaction手动请求此类压缩。 206 | DisableAutoCompactions: false, 207 | 208 | // IsObserver指示当前节点是否是Observer节点,(观察者节点通常用于允许新节点加入群集并追赶其他日志,而不会影响可用性。 还可以引入额外的观察者节点来满足只读请求,而不会影响系统的写吞吐量) 209 | IsObserver: false, 210 | 211 | // IsWitness指示这是否是没有实际日志复制且没有状态机的见证Raft节点,见证节点支持目前处于试验阶段。 212 | IsWitness: false, 213 | 214 | //停顿指定在没有群集活动时是否让Raft群集进入停顿模式。 静默模式下的群集不交换心跳消息以最小化带宽消耗。当前处于试验阶段 215 | Quiesce: false, 216 | } 217 | ``` -------------------------------------------------------------------------------- /productready/config/config.go: -------------------------------------------------------------------------------- 1 | package config 2 | 3 | type DynamicConfig struct { 4 | // raft数据存储目录 5 | RaftDir string `json:"raftDir"` 6 | 7 | // 日志存储目录 8 | LogDir string `json:"logDir"` 9 | 10 | // 每个raft节点的Id,一旦生成且加入集群,再次启动后,不能变动 11 | NodeId uint64 `json:"nodeId"` 12 | 13 | // key: clusterId, 
key:nodeId 14 | Join map[uint64]map[uint64]bool `json:"join"` 15 | 16 | // key: clusterId, key:nodeId 17 | // val: 根据dragonboat的启动方式决定 18 | // gossip方式: NodeHostId 19 | // 常规方式: raftAddr 20 | InitialMembers map[uint64]map[uint64]string `json:"initial_members"` 21 | 22 | // 本机的地址 23 | IP string `json:"ip"` 24 | 25 | // raft port 26 | RaftPort uint16 `json:"raft_port"` 27 | 28 | // http port 29 | HttpPort uint16 `json:"http_port"` 30 | } 31 | -------------------------------------------------------------------------------- /productready/engine.go: -------------------------------------------------------------------------------- 1 | package productready 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "log" 7 | "net/http" 8 | "time" 9 | 10 | "github.com/xkeyideal/mraft/productready/config" 11 | "github.com/xkeyideal/mraft/productready/httpd" 12 | "github.com/xkeyideal/mraft/productready/storage" 13 | "go.uber.org/zap/zapcore" 14 | 15 | "github.com/gin-gonic/gin" 16 | ) 17 | 18 | type Engine struct { 19 | prefix string 20 | 21 | server *http.Server 22 | router *gin.Engine 23 | 24 | raftStorage *storage.Storage 25 | 26 | kvHandle *httpd.KVHandle 27 | } 28 | 29 | var ( 30 | clusterIds = []uint64{0, 1, 2} 31 | ) 32 | 33 | func NewEngine(cfg *config.DynamicConfig) *Engine { 34 | raftCfg := &storage.RaftConfig{ 35 | LogDir: cfg.LogDir, 36 | LogLevel: zapcore.DebugLevel, 37 | HostIP: cfg.IP, 38 | NodeId: cfg.NodeId, 39 | ClusterIds: clusterIds, 40 | RaftAddr: fmt.Sprintf("%s:%d", cfg.IP, cfg.RaftPort), 41 | MultiGroupSize: uint32(len(clusterIds)), 42 | StorageDir: cfg.RaftDir, 43 | Join: cfg.Join, 44 | InitialMembers: cfg.InitialMembers, 45 | // Gossip: metadata.Gossip, 46 | // GossipPort: metadata.GossipPort, 47 | // GossipSeeds: metadata.GossipSeeds, 48 | Metrics: false, 49 | // BindAddress: fmt.Sprintf("%s:%d", engine.cfg.IP, metadata.GossipConfig.BindPort), 50 | // BindPort: uint16(metadata.GossipConfig.BindPort), 51 | // Seeds: metadata.GossipConfig.Seeds, 52 | } 53 | 54 | 
raftStorage, err := storage.NewStorage(raftCfg) 55 | if err != nil { 56 | log.Fatal(err) 57 | } 58 | 59 | log.Println("raft started, waiting raft cluster ready") 60 | 61 | // 等待raft集群ready 62 | err = raftStorage.RaftReady() 63 | if err != nil { 64 | log.Fatalf("[ERROR] raft ready %s\n", err.Error()) 65 | } 66 | 67 | router := gin.New() 68 | router.Use(gin.Recovery()) 69 | 70 | engine := &Engine{ 71 | prefix: "/raft", 72 | router: router, 73 | server: &http.Server{ 74 | Addr: fmt.Sprintf("0.0.0.0:%s", cfg.HttpPort), 75 | Handler: router, 76 | ReadTimeout: 20 * time.Second, 77 | WriteTimeout: 40 * time.Second, 78 | }, 79 | raftStorage: raftStorage, 80 | kvHandle: httpd.NewKVHandle("kvstorage", raftStorage), 81 | } 82 | 83 | engine.registerRouter(router) 84 | 85 | go func() { 86 | if err := engine.server.ListenAndServe(); err != nil { 87 | panic(err.Error()) 88 | } 89 | }() 90 | 91 | return engine 92 | } 93 | 94 | func (engine *Engine) Stop() { 95 | if engine.server != nil { 96 | if err := engine.server.Shutdown(context.Background()); err != nil { 97 | fmt.Println("Server Shutdown: ", err) 98 | } 99 | } 100 | 101 | engine.raftStorage.StopRaftNode() 102 | } 103 | -------------------------------------------------------------------------------- /productready/httpd/handle.go: -------------------------------------------------------------------------------- 1 | package httpd 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "net/http" 7 | "time" 8 | 9 | "github.com/xkeyideal/mraft/productready/storage" 10 | "github.com/xkeyideal/mraft/productready/utils" 11 | 12 | "github.com/gin-gonic/gin" 13 | ) 14 | 15 | type KVHandle struct { 16 | cf string 17 | raftStorage *storage.Storage 18 | } 19 | 20 | func NewKVHandle(cf string, raftStorage *storage.Storage) *KVHandle { 21 | return &KVHandle{ 22 | cf: cf, 23 | raftStorage: raftStorage, 24 | } 25 | } 26 | 27 | func (mh *KVHandle) Info(c *gin.Context) { 28 | ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) 29 | 
defer cancel()
30 | 
31 | 	info, _ := mh.raftStorage.GetMembership(ctx)
32 | 	utils.SetStrResp(http.StatusOK, 0, "", info, c)
33 | }
34 | 
35 | func (mh *KVHandle) Query(c *gin.Context) {
36 | 	key := c.Query("key")
37 | 	sync := c.Query("sync")
38 | 
39 | 	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
40 | 	defer cancel()
41 | 
42 | 	val, err := mh.raftStorage.Get(ctx, mh.cf, key, sync == "true", []byte(key))
43 | 	if err != nil {
44 | 		utils.SetStrResp(http.StatusBadRequest, -1, err.Error(), "", c)
45 | 		return
46 | 	}
47 | 
48 | 	utils.SetStrResp(http.StatusOK, 0, "", string(val), c)
49 | }
50 | 
51 | func (mh *KVHandle) Upsert(c *gin.Context) {
52 | 	key := c.Query("key")
53 | 	val := c.Query("val")
54 | 
55 | 	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
56 | 	defer cancel()
57 | 
58 | 	err := mh.raftStorage.Put(ctx, mh.cf, key, []byte(key), []byte(val))
59 | 	if err != nil {
60 | 		utils.SetStrResp(http.StatusBadRequest, -1, err.Error(), "", c)
61 | 		return
62 | 	}
63 | 
64 | 	utils.SetStrResp(http.StatusOK, 0, "", "OK", c)
65 | }
66 | 
67 | func (mh *KVHandle) Delete(c *gin.Context) {
68 | 	key := c.Query("key")
69 | 
70 | 	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
71 | 	defer cancel()
72 | 
73 | 	err := mh.raftStorage.Del(ctx, mh.cf, key, []byte(key))
74 | 	if err != nil {
75 | 		utils.SetStrResp(http.StatusBadRequest, -1, err.Error(), "", c)
76 | 		return
77 | 	}
78 | 
79 | 	utils.SetStrResp(http.StatusOK, 0, "", "OK", c)
80 | }
81 | 
82 | func (mh *KVHandle) JoinNode(c *gin.Context) {
83 | 	nodeAddr := c.Query("addr")
84 | 
85 | 	raftAddrs := mh.raftStorage.GetNodeHost()
86 | 	for _, raftAddr := range raftAddrs {
87 | 		if nodeAddr == raftAddr {
88 | 			utils.SetStrResp(http.StatusOK, 1, fmt.Sprintf("%s 待加入的节点已经在集群raft节点中", nodeAddr), "OK", c)
   | 			return // BUG FIX: without this return, a node already in the cluster fell through and was re-added via AddRaftNode below
89 | 		}
90 | 	}
91 | 
92 | 	err := mh.raftStorage.AddRaftNode(utils.Addr2RaftNodeID(nodeAddr), nodeAddr)
93 | 	if err != nil {
94 | 		utils.SetStrResp(http.StatusBadRequest, -1, err.Error(), "",
c) 95 | return 96 | } 97 | utils.SetStrResp(http.StatusOK, 0, "", "OK", c) 98 | } 99 | 100 | func (mh *KVHandle) DelNode(c *gin.Context) { 101 | nodeAddr := c.Query("addr") 102 | 103 | err := mh.raftStorage.RemoveRaftNode(utils.Addr2RaftNodeID(nodeAddr)) 104 | if err != nil { 105 | utils.SetStrResp(http.StatusBadRequest, -1, err.Error(), "", c) 106 | return 107 | } 108 | utils.SetStrResp(http.StatusOK, 0, "", "OK", c) 109 | } 110 | -------------------------------------------------------------------------------- /productready/ilogger/logger.go: -------------------------------------------------------------------------------- 1 | package ilogger 2 | 3 | import ( 4 | "fmt" 5 | "path/filepath" 6 | 7 | zlog "github.com/xkeyideal/mraft/logger" 8 | 9 | "github.com/lni/dragonboat/v3/logger" 10 | "go.uber.org/zap" 11 | "go.uber.org/zap/zapcore" 12 | ) 13 | 14 | const LogDir = "/tmp/logs/mraft/" 15 | 16 | type LoggerOptions struct { 17 | logDir string 18 | nodeId uint64 19 | target string 20 | } 21 | 22 | var Lo *LoggerOptions = &LoggerOptions{ 23 | logDir: LogDir, 24 | } 25 | 26 | func (lo *LoggerOptions) SetLogDir(dir string) { 27 | lo.logDir = dir 28 | } 29 | 30 | func (lo *LoggerOptions) SetNodeId(nodeId uint64) { 31 | lo.nodeId = nodeId 32 | } 33 | 34 | func (lo *LoggerOptions) SetTarget(target string) { 35 | lo.target = target 36 | } 37 | 38 | func init() { 39 | logger.SetLoggerFactory(RaftFactory) 40 | logger.GetLogger("raft").SetLevel(logger.WARNING) 41 | logger.GetLogger("rsm").SetLevel(logger.ERROR) 42 | logger.GetLogger("transport").SetLevel(logger.WARNING) 43 | logger.GetLogger("gossip").SetLevel(logger.ERROR) 44 | logger.GetLogger("grpc").SetLevel(logger.ERROR) 45 | logger.GetLogger("dragonboat").SetLevel(logger.WARNING) 46 | logger.GetLogger("logdb").SetLevel(logger.ERROR) 47 | logger.GetLogger("raftpb").SetLevel(logger.ERROR) 48 | logger.GetLogger("config").SetLevel(logger.ERROR) 49 | logger.GetLogger("settings").SetLevel(logger.INFO) 50 | } 51 | 52 | type 
raftLogger struct { 53 | pkgName string 54 | logDir string 55 | log *zap.Logger 56 | } 57 | 58 | func NewRaftLogger(logDir, pkgName string, level zapcore.Level) *raftLogger { 59 | name := fmt.Sprintf("%s.log", pkgName) 60 | return &raftLogger{ 61 | pkgName: pkgName, 62 | logDir: logDir, 63 | log: zlog.NewLogger(filepath.Join(logDir, name), level, false), 64 | } 65 | } 66 | 67 | func RaftFactory(pkgName string) logger.ILogger { 68 | return &raftLogger{ 69 | logDir: Lo.logDir, 70 | pkgName: pkgName, 71 | } 72 | } 73 | 74 | var _ logger.ILogger = (*raftLogger)(nil) 75 | 76 | func (c *raftLogger) SetLevel(level logger.LogLevel) { 77 | var cl zapcore.Level 78 | if level == logger.CRITICAL { 79 | cl = zapcore.PanicLevel 80 | } else if level == logger.ERROR { 81 | cl = zapcore.ErrorLevel 82 | } else if level == logger.WARNING { 83 | cl = zapcore.WarnLevel 84 | } else if level == logger.INFO { 85 | cl = zapcore.InfoLevel 86 | } else if level == logger.DEBUG { 87 | cl = zapcore.DebugLevel 88 | } else { 89 | panic("unexpected level") 90 | } 91 | 92 | name := fmt.Sprintf("dragonboat-%s.log", c.pkgName) 93 | c.log = zlog.NewLogger(filepath.Join(c.logDir, name), cl, false) 94 | } 95 | 96 | func (c *raftLogger) fmsg() string { 97 | return "[multiraft] [" + c.pkgName + "]" 98 | } 99 | 100 | func (c *raftLogger) Debugf(format string, args ...interface{}) { 101 | c.log.Debug(c.fmsg(), 102 | zap.String("target", Lo.target), 103 | zap.Uint64("nodeId", Lo.nodeId), 104 | zap.String("msg", fmt.Sprintf(format, args...)), 105 | ) 106 | } 107 | 108 | func (c *raftLogger) Infof(format string, args ...interface{}) { 109 | c.log.Info(c.fmsg(), zap.String("target", Lo.target), 110 | zap.Uint64("nodeId", Lo.nodeId), 111 | zap.String("msg", fmt.Sprintf(format, args...)), 112 | ) 113 | } 114 | 115 | func (c *raftLogger) Warningf(format string, args ...interface{}) { 116 | c.log.Warn(c.fmsg(), 117 | zap.String("target", Lo.target), 118 | zap.Uint64("nodeId", Lo.nodeId), 119 | zap.String("msg", 
fmt.Sprintf(format, args...)), 120 | ) 121 | } 122 | 123 | func (c *raftLogger) Errorf(format string, args ...interface{}) { 124 | c.log.Error(c.fmsg(), 125 | zap.String("target", Lo.target), 126 | zap.Uint64("nodeId", Lo.nodeId), 127 | zap.String("msg", fmt.Sprintf(format, args...)), 128 | ) 129 | } 130 | 131 | func (c *raftLogger) Panicf(format string, args ...interface{}) { 132 | c.log.Panic(c.fmsg(), 133 | zap.String("target", Lo.target), 134 | zap.Uint64("nodeId", Lo.nodeId), 135 | zap.String("msg", fmt.Sprintf(format, args...)), 136 | ) 137 | } 138 | -------------------------------------------------------------------------------- /productready/main/app.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "log" 5 | "os" 6 | "os/signal" 7 | "strconv" 8 | "syscall" 9 | 10 | "github.com/xkeyideal/mraft/productready" 11 | "github.com/xkeyideal/mraft/productready/config" 12 | ) 13 | 14 | var DevelopDefaultDynamicConfig = &config.DynamicConfig{ 15 | RaftDir: "/Users/xkey/test/raftdata", 16 | LogDir: "/tmp/", 17 | IP: "127.0.0.1", 18 | } 19 | 20 | func main() { 21 | if len(os.Args) <= 2 { 22 | log.Fatal("input arg $1 httpPort, $2 raftPort") 23 | } 24 | 25 | config := DevelopDefaultDynamicConfig 26 | 27 | httpPort, err := strconv.ParseUint(os.Args[1], 10, 64) 28 | if err != nil { 29 | log.Fatal("[ERROR]", err) 30 | } 31 | 32 | config.HttpPort = uint16(httpPort) 33 | 34 | raftPort, err := strconv.ParseUint(os.Args[1], 10, 64) 35 | if err != nil { 36 | log.Fatal("[ERROR]", err) 37 | } 38 | 39 | config.RaftPort = uint16(raftPort) 40 | 41 | eg := productready.NewEngine(config) 42 | 43 | signals := make(chan os.Signal, 1) 44 | signal.Notify(signals, syscall.SIGINT, syscall.SIGTERM, syscall.SIGKILL) 45 | 46 | log.Println(<-signals) 47 | 48 | eg.Stop() 49 | } 50 | -------------------------------------------------------------------------------- /productready/router.go: 
-------------------------------------------------------------------------------- 1 | package productready 2 | 3 | import "github.com/gin-gonic/gin" 4 | 5 | func (engine *Engine) registerRouter(router *gin.Engine) { 6 | group := router.Group(engine.prefix) 7 | { 8 | group.GET("/info", engine.kvHandle.Info) 9 | 10 | group.GET("/key", engine.kvHandle.Query) 11 | group.POST("/key", engine.kvHandle.Upsert) 12 | group.DELETE("/key", engine.kvHandle.Delete) 13 | 14 | group.GET("/join", engine.kvHandle.JoinNode) 15 | group.GET("/del", engine.kvHandle.DelNode) 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /productready/storage/command.go: -------------------------------------------------------------------------------- 1 | package storage 2 | 3 | import ( 4 | "bytes" 5 | "context" 6 | "errors" 7 | "strconv" 8 | 9 | "github.com/xkeyideal/mraft/productready/storage/store" 10 | 11 | "github.com/lni/dragonboat/v3" 12 | "github.com/lni/dragonboat/v3/client" 13 | "github.com/ugorji/go/codec" 14 | ) 15 | 16 | var ( 17 | revisionKey = []byte("__RAFT_KEY_REVISION__") 18 | ) 19 | 20 | type CommandType byte 21 | 22 | const ( 23 | DELETE CommandType = 0 24 | PUT CommandType = 1 25 | GET CommandType = 2 26 | ) 27 | 28 | type WriteOptions struct { 29 | // 存储key时,此key的revision 30 | Revision uint64 31 | } 32 | 33 | func mergeWriteOptions(opts ...*WriteOptions) *WriteOptions { 34 | wo := &WriteOptions{} 35 | for _, opt := range opts { 36 | if opt == nil { 37 | continue 38 | } 39 | 40 | wo.Revision = opt.Revision 41 | } 42 | 43 | return wo 44 | } 45 | 46 | type RaftCommand interface { 47 | GetType() CommandType 48 | RaftInvoke(ctx context.Context, nh *dragonboat.NodeHost, clusterId uint64, session *client.Session) error 49 | LocalInvoke(s *store.Store, opts ...*WriteOptions) error 50 | GetResp() []byte 51 | } 52 | 53 | func Decode(buf []byte, e interface{}) error { 54 | handle := codec.MsgpackHandle{} 55 | return 
codec.NewDecoder(bytes.NewReader(buf), &handle).Decode(e) 56 | } 57 | 58 | func DecodeCmd(data []byte) (RaftCommand, error) { 59 | var cmd RaftCommand 60 | switch CommandType(data[0]) { 61 | case DELETE: 62 | cmd = &DelCommand{} 63 | case PUT: 64 | cmd = &PutCommand{} 65 | case GET: 66 | cmd = &GetCommand{} 67 | default: 68 | return nil, errors.New("can not find command type:" + strconv.Itoa(int(data[0]))) 69 | } 70 | 71 | handle := codec.MsgpackHandle{} 72 | return cmd, codec.NewDecoder(bytes.NewReader(data[1:]), &handle).Decode(cmd) 73 | } 74 | 75 | func EncodeCmd(cmd RaftCommand) ([]byte, error) { 76 | buf := bytes.NewBuffer(nil) 77 | buf.WriteByte(byte(cmd.GetType())) 78 | 79 | handle := codec.MsgpackHandle{} 80 | encoder := codec.NewEncoder(buf, &handle) 81 | err := encoder.Encode(cmd) 82 | return buf.Bytes(), err 83 | } 84 | 85 | func syncWrite(ctx context.Context, nh *dragonboat.NodeHost, session *client.Session, cmd RaftCommand) ([]byte, error) { 86 | b, err := EncodeCmd(cmd) 87 | if err != nil { 88 | return nil, err 89 | } 90 | 91 | result, err := nh.SyncPropose(ctx, session, b) 92 | if err != nil { 93 | return nil, err 94 | } 95 | 96 | return result.Data, nil 97 | } 98 | 99 | func syncRead(ctx context.Context, nh *dragonboat.NodeHost, clusterId uint64, cmd RaftCommand) ([]byte, error) { 100 | b, err := EncodeCmd(cmd) 101 | if err != nil { 102 | return nil, err 103 | } 104 | 105 | result, err := nh.SyncRead(ctx, clusterId, b) 106 | if err != nil { 107 | return nil, err 108 | } 109 | 110 | return result.([]byte), nil 111 | } 112 | 113 | func buildRevisionKey(key []byte) []byte { 114 | return append(revisionKey, key...) 
115 | } 116 | -------------------------------------------------------------------------------- /productready/storage/config.go: -------------------------------------------------------------------------------- 1 | package storage 2 | 3 | import ( 4 | "github.com/lni/dragonboat/v3/config" 5 | "go.uber.org/zap/zapcore" 6 | ) 7 | 8 | const deploymentId = 2023082513 9 | 10 | type RaftConfig struct { 11 | LogDir string 12 | // 数据存储地址 13 | StorageDir string 14 | LogLevel zapcore.Level 15 | 16 | // 本机IP地址 17 | HostIP string 18 | 19 | // nodeId一旦生成不能变动 20 | NodeId uint64 21 | 22 | // 该节点被分配的clusterIds 23 | ClusterIds []uint64 24 | 25 | // raft通信地址 26 | RaftAddr string 27 | 28 | // 用于moveTo命令时对方raft节点的grpc端口 29 | // GrpcPort uint16 30 | 31 | // raft cluster的分组个数, 用于hashKey计算clusterId 32 | MultiGroupSize uint32 33 | 34 | // 是否以join的方式加入raft集群 35 | // key: clusterId, key:nodeId 36 | Join map[uint64]map[uint64]bool 37 | 38 | // 此参数需注意的是: 39 | // 采用gossip方式启动时val是nodehostId, 详细参考dragonboat的文档, 40 | // 初次可以使用dragonboat id.NewNodeHostID(id uint64)来生成; 41 | // 若采用raftAddr固定不变的方式启动,val就是raftAddr 42 | // key: clusterId, key:nodeId 43 | InitialMembers map[uint64]map[uint64]string 44 | 45 | // 如果raft集群采用gossip可变IP的方式启动需设置 46 | Gossip bool 47 | GossipPort uint16 48 | GossipSeeds []string 49 | 50 | // dragonboat 是否开启metrics 51 | Metrics bool 52 | } 53 | 54 | func buildNodeHostConfig(raftDir string, raftAddr string, metrics bool, re *raftEvent, se *systemEvent) config.NodeHostConfig { 55 | return config.NodeHostConfig{ 56 | // DeploymentID用于确定两个NodeHost实例是否属于同一部署,并因此允许彼此通信。 57 | // 通过将上下文消息发送到不相关的Raft节点,这有助于防止意外配置错误的NodeHost实例导致数据损坏错误。 58 | // 对于特定的基于Dragonboat的应用程序,可以在所有生产NodeHost实例上将DeploymentID设置为相同的uint64值, 59 | // 然后在登台和开发环境中使用不同的DeploymentID值。 对于不同的基于Dragonboat的应用程序,也建议使用不同的DeploymentID值。 60 | // 如果未设置,则默认值0将用作部署ID,从而允许所有具有部署ID 0的NodeHost实例相互通信。 61 | DeploymentID: deploymentId, 62 | 63 | // WALDir是用于存储所有Raft日志的WAL的目录,这仅用于存储Raft日志的WAL,它的大小通常很小, 64 | // 
每个NodeHost的64GB通常绰绰有余。如果不设置,则所有内容会存储在NodeHostDir中 65 | WALDir: raftDir, 66 | 67 | // NodeHostDir存储所有需要存储的信息 68 | NodeHostDir: raftDir, 69 | 70 | // RTTMillisecond定义了两个NodeHost实例之间的平均往返时间(RTT),以毫秒为单位 71 | // 这样的RTT间隔在内部用作逻辑时钟滴答,raft的心跳和选举间隔都根据有多少这样的RTT间隔来定义 72 | // 请注意,RTTMillisecond是两个NodeHost实例之间的组合延迟,包括由网络传输引起的所有延迟,NodeHost排队和处理引起的所有延迟。 73 | // 例如,在满载时,我们用于基准测试的两个NodeHost实例之间的平均往返时间最多为500微秒,而它们之间的ping时间为100微秒。 74 | // 当您的环境中的RTTMillisecond小于1百万时,请将其设置为1。 75 | RTTMillisecond: 200, 76 | 77 | //当前节点对外的IP和端口,其他raft节点需要通过这个信息获得 78 | RaftAddress: raftAddr, 79 | 80 | // ListenAddress是Raft RPC模块用于侦听Raft消息和快照的IP:端口地址。 81 | // 如果未设置ListenAddress字段,则Raft RPC模块将使用RaftAddress。 82 | // 如果将0.0.0.0指定为ListenAddress的IP,则Dragonboat将侦听所有接口上的指定端口。 83 | // 指定主机名或域名后,它将首先在本地解析为IP地址,而Dragonboat会侦听所有解析的IP地址。 84 | // 一般不指定这个,和RaftAddress保持一致就好了,收发就用一个端口,没有必要分开 85 | // ListenAddress: listenAddr, 86 | 87 | //是否使用TLS进行安全认证,整个程序都是部署在内网中,可以认为是安全的,就不打开这个了 88 | MutualTLS: false, 89 | 90 | //当配置了TLS时,需要指定CA文件的地址 91 | //当配置了TLS时,需要指定CertFile的地址 92 | //CertFile string 93 | //当配置了TLS时,需要指定KeyFile的地址 94 | //KeyFile string 95 | //MaxReceiveQueueSize是每个接收队列的最大大小(以字节为单位)。 一旦达到最大大小,将删除更多复制消息以限制内存使用。 设置为0时,表示队列大小不受限制。 96 | //暂时先设置为128M 97 | MaxSendQueueSize: 128 * 1024 * 1024, 98 | 99 | // EnableMetrics确定是否应启用Prometheus格式的健康度量。 100 | EnableMetrics: metrics, 101 | 102 | //MaxSnapshotSendBytesPerSecond定义了NodeHost实例管理的所有Raft群集每秒可发送多少快照数据。默认值0表示没有为快照流设置限制。 103 | //每秒最多传输256M数据 104 | MaxSnapshotSendBytesPerSecond: 256 * 1024 * 1024, 105 | 106 | // MaxSnapshotRecvBytesPerSecond定义可以存储多少快照数据由NodeHost实例管理的所有Raft群集每秒收到一次。默认值0表示接收快照数据没有限制。 107 | //目前不限制接受的大小,由发送端决定 108 | MaxSnapshotRecvBytesPerSecond: 0, 109 | 110 | // RaftEventListener是暴露给用户空间的Raft事件(例如Raft领导变更)的侦听器。 111 | // NodeHost使用一个专用的goroutine来逐个调用所有RaftEventListener方法, 112 | // 可能导致长时间延迟的CPU密集型或IO相关过程,应将其分流到用户管理的工作程序中。 113 | RaftEventListener: re, 114 | 115 | //SystemEventsListener允许向用户通知系统事件,例如快照创建,日志压缩和快照流。 它通常用于测试目的或用于其他高级用途,不需要Dragonboat应用程序来显式设置此字段。 
116 | SystemEventListener: se, 117 | } 118 | } 119 | 120 | func buildRaftConfig(nodeId, clusterId uint64) config.Config { 121 | return config.Config{ 122 | //当前节点的ID 123 | NodeID: nodeId, 124 | 125 | //当前节点的分片ID,如果当前raft是多组的,那么这个地方是指定当前组的ID 126 | ClusterID: clusterId, 127 | 128 | //领导节点是否应定期检查非领导者节点的状态,并在其不再具有法定人数时退出成为跟随者节点 129 | //当有5台机器,挂了3台,法定人数不够,则主节点退出,不再是主节点了,所有的写操作和同步读操作应该都不能执行了 130 | //各个节点只能读取本地的数据 131 | CheckQuorum: false, 132 | 133 | // ElectionRTT是两次选举之间的消息RTT的最小数量。 消息RTT由NodeHostConfig.RTTMillisecond定义。 134 | // Raft论文建议其幅度大于HeartbeatRTT(因为是先发现不健康,才会进行选举),即两个心跳之间的间隔。 135 | // 在Raft中,选举之间的实际间隔被随机分配在ElectionRTT和2 * ElectionRTT之间。例如,假设NodeHostConfig.RTTMillisecond为100毫秒, 136 | // 要将选举间隔设置为1秒,则应该将ElectionRTT设置为10。启用CheckQuorum后,ElectionRTT还将定义检查领导者定额的时间间隔。 137 | // 这个值是个比例,具体的RTT时间大小是RTTMillisecond*ElectionRTT,当需要选举主节点时,各个节点的随机间隔在ElectionRTT和2 * ElectionRTT, 138 | // 当CheckQuorum为true,主也会每隔这个时间检查下从机数据是否符合法定人数 139 | ElectionRTT: 60, 140 | 141 | // HeartbeatRTT是两次心跳之间的消息RTT数。 消息RTT由NodeHostConfig.RTTMillisecond定义。 Raft论文建议心跳间隔应接近节点之间的平均RTT。 142 | // 例如,假设NodeHostConfig.RTTMillisecond为100毫秒,要将心跳间隔设置为每200毫秒,则应将HeartbeatRTT设置为2。 143 | HeartbeatRTT: 6, 144 | 145 | // SnapshotEntries定义应自动对状态机进行快照的频率,可以将SnapshotEntries设置为0以禁用此类自动快照。 146 | // 当SnapshotEntries设置为N时,意味着大约每N条Raft日志创建一个快照。这也意味着向跟踪者发送N个日志条目比发送快照要昂贵。 147 | // 生成快照后,可以压缩新快照覆盖的Raft日志条目。这涉及两个步骤,冗余日志条目首先被标记为已删除,然后在稍后发布 LogDB 压缩时将其从基础存储中物理删除。 148 | // 有关在生成快照后实际删除和压缩哪些日志条目的详细信息,请参见CompactionOverhead,通过将SnapshotEntries字段设置为0禁用自动快照后, 149 | // 用户仍然可以使用NodeHost的RequestSnapshot或SyncRequestSnapshot方法手动请求快照。 150 | SnapshotEntries: 25 * 10000 * 10, 151 | 152 | // CompactionOverhead定义每次Raft日志压缩后要保留的最新条目数。 153 | // 假设当前的日志为10000,开始创建快照,那么快照创建完成后,<=10000的日志都会被清理, 154 | // 如果想获得9000这样的日志,那么就得先完全加载快照,再从快照中读取,如果设置了CompactionOverhead为3000, 155 | // 那么就算创建了快照,我们仍然能获得10000-7000之间的日志记录,只有小于7000的,才需要重新加载日志获取 156 | CompactionOverhead: 25 * 10000, 157 | 158 | //确定是否使用ChangeID的顺序强制执行Raft成员资格更改。 159 | OrderedConfigChange: true, 160 
| 161 | // MaxInMemLogSize是允许在每个Raft节点上的Raft日志存储在内存中的目标大小(以字节为单位)。 内存中的筏日志是尚未应用的日志。 162 | // MaxInMemLogSize是为防止内存无限增长而实现的目标值,并非用于精确限制确切的内存使用量。 163 | // 当MaxInMemLogSize为0时,目标设置为math.MaxUint64。 设置MaxInMemLogSize并达到目标后,客户端尝试提出新建议时将返回错误。 164 | // 建议将MaxInMemLogSize大于要使用的最大建议。 165 | //内存中未应用的日志大小,暂定为256M,超过256M的大小后会返回错误 166 | MaxInMemLogSize: 256 * 1024 * 1024, 167 | 168 | // SnapshotCompressionType是用于压缩生成的快照数据的压缩类型。 默认情况下不使用压缩。 169 | // 快照数据本身由rocksdb生成,采用了LZ4压缩,所以这边就不再继续压缩了 170 | SnapshotCompressionType: config.NoCompression, 171 | 172 | // EntryCompressionType是用于压缩用户日志。 使用Snappy时,允许的最大建议有效负载大致限制为3.42GB。 173 | EntryCompressionType: config.Snappy, 174 | 175 | // DisableAutoCompactions禁用用于回收Raft条目存储空间的自动压缩。 176 | // 默认情况下,每次捕获快照时都会执行压缩,这有助于以较高的IO开销为代价,尽快回收磁盘空间。 177 | // 用户可以禁用此类自动压缩,并在必要时使用NodeHost.RequestCompaction手动请求此类压缩。 178 | DisableAutoCompactions: false, 179 | 180 | // IsObserver指示当前节点是否是Observer节点,(观察者节点通常用于允许新节点加入群集并追赶其他日志,而不会影响可用性。 还可以引入额外的观察者节点来满足只读请求,而不会影响系统的写吞吐量) 181 | IsObserver: false, 182 | 183 | // IsWitness指示这是否是没有实际日志复制且没有状态机的见证Raft节点,见证节点支持目前处于试验阶段。 184 | IsWitness: false, 185 | 186 | //停顿指定在没有群集活动时是否让Raft群集进入停顿模式。 静默模式下的群集不交换心跳消息以最小化带宽消耗。当前处于试验阶段 187 | Quiesce: false, 188 | } 189 | } 190 | -------------------------------------------------------------------------------- /productready/storage/del.go: -------------------------------------------------------------------------------- 1 | package storage 2 | 3 | import ( 4 | "context" 5 | 6 | "github.com/cockroachdb/pebble" 7 | "github.com/xkeyideal/mraft/productready/storage/store" 8 | 9 | "github.com/lni/dragonboat/v3" 10 | "github.com/lni/dragonboat/v3/client" 11 | ) 12 | 13 | type DelCommand struct { 14 | CfName string 15 | Key []byte 16 | } 17 | 18 | func NewDelCommand(cfName string, key []byte) *DelCommand { 19 | return &DelCommand{CfName: cfName, Key: key} 20 | } 21 | 22 | func (c *DelCommand) GetType() CommandType { 23 | return DELETE 24 | } 25 | 26 | func (c *DelCommand) RaftInvoke(ctx 
context.Context, nh *dragonboat.NodeHost, _ uint64, session *client.Session) error { 27 | _, err := syncWrite(ctx, nh, session, c) 28 | return err 29 | } 30 | 31 | func (c *DelCommand) LocalInvoke(s *store.Store, opts ...*WriteOptions) error { 32 | batch := s.Batch() 33 | defer batch.Close() 34 | 35 | cf := s.GetColumnFamily(c.CfName) 36 | 37 | batch.Delete(s.BuildColumnFamilyKey(cf, c.Key), pebble.Sync) 38 | 39 | // 删除revision 40 | revisionKey := buildRevisionKey(c.Key) 41 | batch.Delete(s.BuildColumnFamilyKey(cf, revisionKey), pebble.Sync) 42 | 43 | return s.Write(batch) 44 | } 45 | 46 | func (c *DelCommand) GetResp() []byte { 47 | return nil 48 | } 49 | -------------------------------------------------------------------------------- /productready/storage/event.go: -------------------------------------------------------------------------------- 1 | package storage 2 | 3 | import ( 4 | "sync/atomic" 5 | "time" 6 | 7 | "github.com/lni/dragonboat/v3/raftio" 8 | "go.uber.org/zap" 9 | ) 10 | 11 | type raftEvent struct { 12 | s *Storage 13 | } 14 | 15 | func (e *raftEvent) LeaderUpdated(info raftio.LeaderInfo) { 16 | e.s.log.Warn("[raftstorage] [event] [LeaderUpdated]", 17 | zap.String("target", e.s.target), 18 | zap.Any("info", info), 19 | ) 20 | 21 | if atomic.LoadUint32(&e.s.status) == ready && info.LeaderID != 0 { 22 | e.s.leaderc <- info 23 | } 24 | } 25 | 26 | type systemEvent struct { 27 | s *Storage 28 | } 29 | 30 | func (e *systemEvent) NodeHostShuttingDown() { 31 | e.s.log.Warn("[raftstorage] [event] [NodeHostShuttingDown]", zap.String("target", e.s.target)) 32 | } 33 | 34 | func (e *systemEvent) NodeUnloaded(info raftio.NodeInfo) { 35 | e.s.log.Warn("[raftstorage] [event] [NodeUnloaded]", zap.String("target", e.s.target), zap.Any("info", info)) 36 | } 37 | 38 | func (e *systemEvent) NodeReady(info raftio.NodeInfo) { 39 | e.s.log.Info("[raftstorage] [event] [NodeReady]", zap.String("target", e.s.target), zap.Any("info", info)) 40 | } 41 | func (e 
*systemEvent) MembershipChanged(info raftio.NodeInfo) { 42 | e.s.log.Warn("[raftstorage] [event] [MembershipChanged]", zap.String("target", e.s.target), zap.Any("info", info)) 43 | if atomic.LoadUint32(&e.s.status) == ready { 44 | e.s.memberc <- info 45 | } 46 | } 47 | func (e *systemEvent) ConnectionEstablished(info raftio.ConnectionInfo) { 48 | e.s.log.Info("[raftstorage] [event] [ConnectionEstablished]", zap.String("target", e.s.target), zap.Any("info", info)) 49 | } 50 | func (e *systemEvent) ConnectionFailed(info raftio.ConnectionInfo) { 51 | e.s.log.Warn("[raftstorage] [event] [ConnectionFailed]", zap.String("target", e.s.target), zap.Any("info", info)) 52 | } 53 | func (e *systemEvent) SendSnapshotStarted(info raftio.SnapshotInfo) { 54 | e.s.log.Info("[raftstorage] [event] [SendSnapshotStarted]", zap.String("target", e.s.target), zap.Any("info", info)) 55 | } 56 | func (e *systemEvent) SendSnapshotCompleted(info raftio.SnapshotInfo) { 57 | e.s.log.Info("[raftstorage] [event] [SendSnapshotCompleted]", zap.String("target", e.s.target), zap.Any("info", info)) 58 | } 59 | func (e *systemEvent) SendSnapshotAborted(info raftio.SnapshotInfo) { 60 | e.s.log.Info("[raftstorage] [event] [SendSnapshotAborted]", zap.String("target", e.s.target), zap.Any("info", info)) 61 | } 62 | func (e *systemEvent) SnapshotReceived(info raftio.SnapshotInfo) { 63 | e.s.log.Info("[raftstorage] [event] [SnapshotReceived]", zap.String("target", e.s.target), zap.Any("info", info)) 64 | } 65 | func (e *systemEvent) SnapshotRecovered(info raftio.SnapshotInfo) { 66 | e.s.log.Info("[raftstorage] [event] [SnapshotRecovered]", zap.String("target", e.s.target), zap.Any("info", info)) 67 | } 68 | func (e *systemEvent) SnapshotCreated(info raftio.SnapshotInfo) { 69 | e.s.log.Warn("[raftstorage] [event] [SnapshotCreated]", zap.String("target", e.s.target), zap.Any("info", info)) 70 | } 71 | func (e *systemEvent) SnapshotCompacted(info raftio.SnapshotInfo) { 72 | e.s.log.Warn("[raftstorage] [event] 
[SnapshotCompacted]", zap.String("target", e.s.target), zap.Any("info", info)) 73 | } 74 | func (e *systemEvent) LogCompacted(info raftio.EntryInfo) { 75 | e.s.log.Info("[raftstorage] [event] [LogCompacted]", zap.String("target", e.s.target), zap.Any("info", info)) 76 | } 77 | func (e *systemEvent) LogDBCompacted(info raftio.EntryInfo) { 78 | e.s.log.Info("[raftstorage] [event] [LogDBCompacted]", zap.String("target", e.s.target), zap.Any("info", info)) 79 | } 80 | 81 | func (s *Storage) handleEvents() { 82 | ticker := time.NewTicker(2 * time.Second) 83 | for { 84 | select { 85 | case info := <-s.memberc: 86 | if info.NodeID == s.cfg.NodeId { 87 | m, err := s.getClusterMembership(info.ClusterID) 88 | if err != nil { 89 | continue 90 | } 91 | 92 | s.cmu.Lock() 93 | s.memberCache[info.ClusterID] = m 94 | s.cmu.Unlock() 95 | } 96 | case info := <-s.leaderc: 97 | if info.NodeID == s.cfg.NodeId { 98 | m, err := s.getClusterMembership(info.ClusterID) 99 | if err != nil { 100 | continue 101 | } 102 | 103 | s.cmu.Lock() 104 | s.memberCache[info.ClusterID] = m 105 | s.cmu.Unlock() 106 | } 107 | case <-ticker.C: 108 | s.cmu.Lock() 109 | if len(s.memberCache) > 0 { 110 | // mc := s.memberCache 111 | // s.gossip.UpdateMembershipMessage(&gossip.RaftMembershipMessage{ 112 | // MemberInfos: mc, 113 | // }) 114 | 115 | s.memberCache = make(map[uint64]*MemberInfo) 116 | } 117 | s.cmu.Unlock() 118 | case <-s.stopper.ShouldStop(): 119 | return 120 | } 121 | } 122 | } 123 | -------------------------------------------------------------------------------- /productready/storage/get.go: -------------------------------------------------------------------------------- 1 | package storage 2 | 3 | import ( 4 | "context" 5 | "encoding/binary" 6 | 7 | "github.com/xkeyideal/mraft/productready/storage/store" 8 | 9 | "github.com/lni/dragonboat/v3" 10 | "github.com/lni/dragonboat/v3/client" 11 | ) 12 | 13 | type GetCommand struct { 14 | Cf string 15 | Key []byte 16 | resp []byte 17 | } 18 | 19 | 
func (c *GetCommand) GetResp() []byte { 20 | return c.resp 21 | } 22 | 23 | func NewGetCommand(cf string, key []byte) *GetCommand { 24 | return &GetCommand{Cf: cf, Key: key} 25 | } 26 | 27 | func (c *GetCommand) GetType() CommandType { 28 | return GET 29 | } 30 | 31 | func (c *GetCommand) RaftInvoke(ctx context.Context, nh *dragonboat.NodeHost, clusterId uint64, _ *client.Session) (err error) { 32 | c.resp, err = syncRead(ctx, nh, clusterId, c) 33 | return err 34 | } 35 | 36 | func (c *GetCommand) LocalInvoke(s *store.Store, opts ...*WriteOptions) error { 37 | cf := s.GetColumnFamily(c.Cf) 38 | 39 | // get revision 40 | v, err := s.GetBytes(s.BuildColumnFamilyKey(cf, buildRevisionKey(c.Key))) 41 | if err != nil { 42 | return err 43 | } 44 | 45 | if len(v) == 0 { 46 | v = make([]byte, 8) 47 | binary.BigEndian.PutUint64(v, 0) 48 | } 49 | 50 | // get value 51 | d, err := s.GetBytes(s.BuildColumnFamilyKey(cf, c.Key)) 52 | if err != nil { 53 | return err 54 | } 55 | 56 | c.resp = append(v, d...) 
57 | return nil 58 | } 59 | 60 | func (c *GetCommand) GetResult() []byte { 61 | return c.resp 62 | } 63 | -------------------------------------------------------------------------------- /productready/storage/op.go: -------------------------------------------------------------------------------- 1 | package storage 2 | 3 | import ( 4 | "context" 5 | "time" 6 | ) 7 | 8 | func (s *Storage) Get(ctx context.Context, cf string, hashKey string, linearizable bool, key []byte) ([]byte, error) { 9 | var ( 10 | clusterId = s.getClusterId(hashKey) 11 | cmd = NewGetCommand(cf, key) 12 | err error 13 | ) 14 | 15 | if linearizable { 16 | err = cmd.RaftInvoke(ctx, s.nh, clusterId, s.csMap[clusterId]) 17 | } else { 18 | err = cmd.LocalInvoke(s.smMap[clusterId]) 19 | } 20 | return cmd.GetResult(), err 21 | } 22 | 23 | func (s *Storage) Put(ctx context.Context, cf string, hashKey string, key, val []byte) error { 24 | cmd := NewPutCommand(cf, key, val) 25 | clusterId := s.getClusterId(hashKey) 26 | return cmd.RaftInvoke(ctx, s.nh, clusterId, s.csMap[clusterId]) 27 | } 28 | 29 | func (s *Storage) Del(ctx context.Context, cf string, hashKey string, key []byte) error { 30 | cmd := NewDelCommand(cf, key) 31 | clusterId := s.getClusterId(hashKey) 32 | return cmd.RaftInvoke(ctx, s.nh, clusterId, s.csMap[clusterId]) 33 | } 34 | 35 | func (s *Storage) AddRaftNode(nodeId uint64, target string) error { 36 | for _, clusterId := range s.cfg.ClusterIds { 37 | ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second) 38 | ms, err := s.nh.SyncGetClusterMembership(ctx, uint64(clusterId)) 39 | cancel() 40 | if err != nil { 41 | return err 42 | } 43 | 44 | ctx, cancel = context.WithTimeout(context.Background(), 3*time.Second) 45 | err = s.nh.SyncRequestAddNode(ctx, uint64(clusterId), nodeId, target, ms.ConfigChangeID) 46 | cancel() 47 | if err != nil { 48 | return err 49 | } 50 | } 51 | 52 | return nil 53 | } 54 | 55 | func (s *Storage) AddRaftObserver(nodeId uint64, addr string) error { 
56 | for _, clusterId := range s.cfg.ClusterIds { 57 | ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) 58 | ms, err := s.nh.SyncGetClusterMembership(ctx, uint64(clusterId)) 59 | cancel() 60 | if err != nil { 61 | return err 62 | } 63 | 64 | ctx, cancel = context.WithTimeout(context.Background(), 4*time.Second) 65 | err = s.nh.SyncRequestAddObserver(ctx, uint64(clusterId), nodeId, addr, ms.ConfigChangeID) 66 | cancel() 67 | if err != nil { 68 | return err 69 | } 70 | } 71 | 72 | return nil 73 | } 74 | 75 | func (s *Storage) RemoveRaftNode(nodeId uint64) error { 76 | for _, clusterId := range s.cfg.ClusterIds { 77 | ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) 78 | ms, err := s.nh.SyncGetClusterMembership(ctx, uint64(clusterId)) 79 | cancel() 80 | if err != nil { 81 | return err 82 | } 83 | 84 | ctx, cancel = context.WithTimeout(context.Background(), 4*time.Second) 85 | err = s.nh.SyncRequestDeleteNode(ctx, uint64(clusterId), nodeId, ms.ConfigChangeID) 86 | cancel() 87 | if err != nil { 88 | return err 89 | } 90 | } 91 | 92 | return nil 93 | } 94 | -------------------------------------------------------------------------------- /productready/storage/put.go: -------------------------------------------------------------------------------- 1 | package storage 2 | 3 | import ( 4 | "context" 5 | 6 | "github.com/cockroachdb/pebble" 7 | "github.com/xkeyideal/mraft/productready/storage/store" 8 | 9 | "github.com/lni/dragonboat/v3" 10 | "github.com/lni/dragonboat/v3/client" 11 | ) 12 | 13 | type PutCommand struct { 14 | Cf string 15 | Key []byte 16 | Value []byte 17 | } 18 | 19 | func (c *PutCommand) GetResp() []byte { 20 | return nil 21 | } 22 | 23 | func NewPutCommand(cf string, key, value []byte) *PutCommand { 24 | return &PutCommand{Cf: cf, Key: key, Value: value} 25 | } 26 | 27 | func (c *PutCommand) GetType() CommandType { 28 | return PUT 29 | } 30 | 31 | func (c *PutCommand) RaftInvoke(ctx context.Context, nh 
*dragonboat.NodeHost, _ uint64, session *client.Session) error { 32 | _, err := syncWrite(ctx, nh, session, c) 33 | return err 34 | } 35 | 36 | func (c *PutCommand) LocalInvoke(s *store.Store, opts ...*WriteOptions) error { 37 | batch := s.Batch() 38 | defer batch.Close() 39 | 40 | cf := s.GetColumnFamily(c.Cf) 41 | 42 | batch.Delete(s.BuildColumnFamilyKey(cf, c.Key), pebble.Sync) 43 | 44 | // 删除revision 45 | revisionKey := buildRevisionKey(c.Key) 46 | batch.Delete(s.BuildColumnFamilyKey(cf, revisionKey), pebble.Sync) 47 | 48 | return s.Write(batch) 49 | 50 | return s.Write(batch) 51 | } 52 | -------------------------------------------------------------------------------- /productready/storage/sm.go: -------------------------------------------------------------------------------- 1 | package storage 2 | 3 | import ( 4 | "encoding/binary" 5 | "errors" 6 | "io" 7 | "log" 8 | 9 | "github.com/xkeyideal/mraft/productready/storage/store" 10 | 11 | "github.com/cockroachdb/pebble" 12 | sm "github.com/lni/dragonboat/v3/statemachine" 13 | ) 14 | 15 | var ( 16 | indexKeyPrefix = []byte("__RAFT_APPLIED_INDEX__") 17 | nodeReadyKey = []byte("__RAFT_NODE_READY__") 18 | nodeReadyVal = []byte("^_^") 19 | 20 | moveToErr = errors.New("MoveTo Jump Exceed") 21 | sessionNotFound = errors.New("Raft clusterId session not found") 22 | storeNotFound = errors.New("Raft clusterId store not found") 23 | ) 24 | 25 | type StateMachine struct { 26 | raftAddr string 27 | target string 28 | ClusterID uint64 29 | NodeID uint64 30 | store *store.Store 31 | 32 | // db里存储revision的key 33 | indexKey []byte 34 | } 35 | 36 | func newStateMachine(raftAddr, target string, clusterid uint64, nodeId uint64, s *store.Store) *StateMachine { 37 | // 生成存储revision的key 38 | smIndexKey := make([]byte, len(indexKeyPrefix)+8) 39 | copy(smIndexKey, indexKeyPrefix) 40 | binary.BigEndian.PutUint64(smIndexKey[len(indexKeyPrefix):], clusterid) 41 | 42 | return &StateMachine{ 43 | raftAddr: raftAddr, 44 | target: target, 45 | 
ClusterID: clusterid, 46 | NodeID: nodeId, 47 | indexKey: smIndexKey, 48 | store: s, 49 | } 50 | } 51 | 52 | func (r *StateMachine) Open(stopChan <-chan struct{}) (uint64, error) { 53 | select { 54 | case <-stopChan: 55 | return 0, sm.ErrOpenStopped 56 | default: 57 | val, err := r.store.GetBytes(r.indexKey) 58 | if err != nil { 59 | return 0, err 60 | } 61 | 62 | // 系统初次启动时,全局revision应该是不存在的,db里查不到,此时返回0 63 | if len(val) == 0 { 64 | return 0, nil 65 | } 66 | 67 | return binary.BigEndian.Uint64(val), nil 68 | } 69 | } 70 | 71 | func (r *StateMachine) Update(entries []sm.Entry) ([]sm.Entry, error) { 72 | if r.store.Closed() { 73 | return entries, nil 74 | } 75 | 76 | resultEntries := make([]sm.Entry, 0, len(entries)) 77 | 78 | // 将raft的日志转换为db要执行的命令 79 | for _, e := range entries { 80 | r, err := r.processEntry(e) 81 | if err != nil { 82 | return nil, err 83 | } 84 | 85 | resultEntries = append(resultEntries, r) 86 | } 87 | 88 | idx := entries[len(entries)-1].Index 89 | idxByte := make([]byte, 8) 90 | binary.BigEndian.PutUint64(idxByte, idx) 91 | 92 | batch := r.store.Batch() 93 | defer batch.Close() 94 | 95 | // 更新revision的值 96 | batch.Set(r.indexKey, idxByte, pebble.Sync) 97 | if err := r.store.Write(batch); err != nil { 98 | return nil, err 99 | } 100 | 101 | return resultEntries, nil 102 | } 103 | 104 | func (r *StateMachine) processEntry(e sm.Entry) (sm.Entry, error) { 105 | cmd, err := DecodeCmd(e.Cmd) 106 | if err != nil { 107 | return e, err 108 | } 109 | 110 | opts := &WriteOptions{ 111 | Revision: e.Index, 112 | } 113 | 114 | if err := cmd.LocalInvoke(r.store, opts); err != nil { 115 | return e, err 116 | } 117 | 118 | resp := cmd.GetResp() 119 | e.Result = sm.Result{Value: uint64(len(e.Cmd)), Data: resp} 120 | 121 | return e, nil 122 | } 123 | 124 | func (r *StateMachine) Lookup(query interface{}) (interface{}, error) { 125 | if r.store.Closed() { 126 | return nil, pebble.ErrClosed 127 | } 128 | 129 | cmd, err := DecodeCmd(query.([]byte)) 130 | if err != 
nil { 131 | return nil, err 132 | } 133 | 134 | if err := cmd.LocalInvoke(r.store); err != nil { 135 | return nil, err 136 | } 137 | 138 | return cmd.GetResp(), nil 139 | } 140 | 141 | func (r *StateMachine) Sync() error { 142 | return nil 143 | } 144 | 145 | type stateMachineStoreCtx struct { 146 | snapshot *pebble.Snapshot 147 | } 148 | 149 | func (r *StateMachine) PrepareSnapshot() (interface{}, error) { 150 | if r.store.Closed() { 151 | return nil, pebble.ErrClosed 152 | } 153 | 154 | return &stateMachineStoreCtx{ 155 | snapshot: r.store.GetSnapshot(), 156 | }, nil 157 | } 158 | 159 | func (r *StateMachine) SaveSnapshot(snapshot interface{}, writer io.Writer, stopChan <-chan struct{}) error { 160 | if r.store.Closed() { 161 | return pebble.ErrClosed 162 | } 163 | 164 | log.Println("SaveSnapshot", r.target, r.raftAddr, r.NodeID, r.ClusterID) 165 | ctxData := snapshot.(*stateMachineStoreCtx) 166 | 167 | ss := ctxData.snapshot 168 | defer ss.Close() 169 | 170 | return r.store.SaveSnapshotToWriter(r.target, r.raftAddr, ss, writer, stopChan) 171 | } 172 | 173 | func (r *StateMachine) RecoverFromSnapshot(reader io.Reader, stopChan <-chan struct{}) error { 174 | if r.store.Closed() { 175 | return pebble.ErrClosed 176 | } 177 | 178 | log.Println("RecoverFromSnapshot", r.target, r.raftAddr, r.NodeID, r.ClusterID) 179 | return r.store.LoadSnapShotFromReader(r.target, r.raftAddr, reader, stopChan) 180 | } 181 | 182 | func (r *StateMachine) Close() error { 183 | if r.store.Closed() { 184 | return nil 185 | } 186 | 187 | return r.store.Close() 188 | } 189 | -------------------------------------------------------------------------------- /productready/storage/store/utils.go: -------------------------------------------------------------------------------- 1 | package store 2 | 3 | import ( 4 | "bytes" 5 | "crypto/md5" 6 | "errors" 7 | "fmt" 8 | "io/ioutil" 9 | "math/rand" 10 | "os" 11 | "path/filepath" 12 | "runtime" 13 | "time" 14 | ) 15 | 16 | const ( 17 | currentDBFilename 
string = "pebble.running" 18 | updatingDBFilename string = "pebble.updating" 19 | ) 20 | 21 | func GetPebbleDBDir(dir string) (string, error) { 22 | var dbdir string 23 | 24 | // 判断是否存在 dir/pebble.running文件 25 | newRunning := isNewRun(dir) 26 | 27 | // 全新启动的程序 28 | if newRunning { // 不存在pebble.running文件 29 | // 此处为了兼容,现有数据已经使用了data_node11772876503705/1/current 进行存储了 30 | // fp := filepath.Join(dir, "current") 31 | // if existFilePath(fp) { 32 | // return fp, nil 33 | // } 34 | 35 | // 没有,随机生成一个目录作为pebbledb的存储目录 36 | dbdir = getNewRandomDBDirName(dir) 37 | if err := saveCurrentDBDirName(dir, dbdir); err != nil { 38 | return "", err 39 | } 40 | if err := replaceCurrentDBFile(dir); err != nil { 41 | return "", err 42 | } 43 | 44 | return dbdir, nil 45 | } 46 | 47 | if err := cleanupNodeDataDir(dir); err != nil { 48 | return "", err 49 | } 50 | 51 | var err error 52 | dbdir, err = getCurrentDBDirName(dir) 53 | if err != nil { 54 | return "", err 55 | } 56 | if _, err := os.Stat(dbdir); err != nil { 57 | if os.IsNotExist(err) { 58 | return "", errors.New("db dir unexpectedly deleted") 59 | } 60 | } 61 | 62 | return dbdir, nil 63 | } 64 | 65 | // functions below are used to manage the current data directory of Pebble DB. 
66 | func isNewRun(dir string) bool { 67 | fp := filepath.Join(dir, currentDBFilename) 68 | if _, err := os.Stat(fp); os.IsNotExist(err) { 69 | return true 70 | } 71 | return false 72 | } 73 | 74 | func getNewRandomDBDirName(dir string) string { 75 | part := "%d_%d" 76 | rn := rand.Uint64() 77 | ct := time.Now().UnixNano() 78 | return filepath.Join(dir, fmt.Sprintf(part, rn, ct)) 79 | } 80 | 81 | func replaceCurrentDBFile(dir string) error { 82 | fp := filepath.Join(dir, currentDBFilename) 83 | tmpFp := filepath.Join(dir, updatingDBFilename) 84 | if err := os.Rename(tmpFp, fp); err != nil { 85 | return err 86 | } 87 | return syncDir(dir) 88 | } 89 | 90 | func saveCurrentDBDirName(dir string, dbdir string) error { 91 | h := md5.New() 92 | if _, err := h.Write([]byte(dbdir)); err != nil { 93 | return err 94 | } 95 | fp := filepath.Join(dir, updatingDBFilename) 96 | f, err := os.Create(fp) 97 | if err != nil { 98 | return err 99 | } 100 | defer func() { 101 | if err := f.Close(); err != nil { 102 | panic(err) 103 | } 104 | if err := syncDir(dir); err != nil { 105 | panic(err) 106 | } 107 | }() 108 | if _, err := f.Write(h.Sum(nil)[:8]); err != nil { 109 | return err 110 | } 111 | if _, err := f.Write([]byte(dbdir)); err != nil { 112 | return err 113 | } 114 | if err := f.Sync(); err != nil { 115 | return err 116 | } 117 | return nil 118 | } 119 | 120 | func getCurrentDBDirName(dir string) (string, error) { 121 | fp := filepath.Join(dir, currentDBFilename) 122 | f, err := os.OpenFile(fp, os.O_RDONLY, 0755) 123 | if err != nil { 124 | return "", err 125 | } 126 | 127 | defer func() { 128 | f.Close() 129 | }() 130 | 131 | data, err := ioutil.ReadAll(f) 132 | if err != nil { 133 | return "", err 134 | } 135 | if len(data) <= 8 { 136 | return "", errors.New("corrupted content") 137 | } 138 | crc := data[:8] 139 | content := data[8:] 140 | h := md5.New() 141 | if _, err := h.Write(content); err != nil { 142 | return "", err 143 | } 144 | if !bytes.Equal(crc, h.Sum(nil)[:8]) 
{ 145 | return "", errors.New("corrupted content with not matched crc") 146 | } 147 | return string(content), nil 148 | } 149 | 150 | func createNodeDataDir(dir string) error { 151 | if err := os.MkdirAll(dir, 0755); err != nil { 152 | return err 153 | } 154 | return syncDir(filepath.Dir(dir)) 155 | } 156 | 157 | func cleanupNodeDataDir(dir string) error { 158 | os.RemoveAll(filepath.Join(dir, updatingDBFilename)) 159 | dbdir, err := getCurrentDBDirName(dir) 160 | if err != nil { 161 | return err 162 | } 163 | files, err := ioutil.ReadDir(dir) 164 | if err != nil { 165 | return err 166 | } 167 | for _, fi := range files { 168 | if !fi.IsDir() { 169 | continue 170 | } 171 | 172 | toDelete := filepath.Join(dir, fi.Name()) 173 | if toDelete != dbdir { 174 | if err := os.RemoveAll(toDelete); err != nil { 175 | return err 176 | } 177 | } 178 | } 179 | 180 | return nil 181 | } 182 | 183 | func syncDir(dir string) (err error) { 184 | if runtime.GOOS == "windows" { 185 | return nil 186 | } 187 | 188 | fileInfo, err := os.Stat(dir) 189 | if err != nil { 190 | return err 191 | } 192 | 193 | if !fileInfo.IsDir() { 194 | return nil 195 | } 196 | 197 | df, err := os.Open(filepath.Clean(dir)) 198 | if err != nil { 199 | return err 200 | } 201 | 202 | defer func() { 203 | if cerr := df.Close(); err == nil { 204 | err = cerr 205 | } 206 | }() 207 | 208 | return df.Sync() 209 | } 210 | 211 | func existFilePath(path string) bool { 212 | _, err := os.Stat(path) 213 | if err != nil { 214 | if os.IsExist(err) { 215 | return true 216 | } 217 | 218 | return false 219 | } 220 | 221 | return true 222 | } 223 | -------------------------------------------------------------------------------- /productready/utils/utils.go: -------------------------------------------------------------------------------- 1 | package utils 2 | 3 | import ( 4 | "strconv" 5 | "strings" 6 | 7 | "github.com/gin-gonic/gin" 8 | ) 9 | 10 | func Addr2RaftNodeID(addr string) uint64 { 11 | s := strings.Split(addr, ":") 12 
| bits := strings.Split(s[0], ".") 13 | 14 | b0, _ := strconv.Atoi(bits[0]) 15 | b1, _ := strconv.Atoi(bits[1]) 16 | b2, _ := strconv.Atoi(bits[2]) 17 | b3, _ := strconv.Atoi(bits[3]) 18 | 19 | var sum uint64 20 | 21 | sum += uint64(b0) << 24 22 | sum += uint64(b1) << 16 23 | sum += uint64(b2) << 8 24 | sum += uint64(b3) 25 | 26 | port, _ := strconv.Atoi(s[1]) 27 | 28 | sum = sum<<16 + uint64(port) 29 | 30 | return sum 31 | } 32 | 33 | func SetStrResp(httpCode, code int, msg string, result interface{}, c *gin.Context) { 34 | m := msg 35 | 36 | if code == 0 { 37 | c.JSON(httpCode, gin.H{ 38 | "code": code, 39 | "msg": m, 40 | "result": result, 41 | }) 42 | } else { 43 | c.JSON(httpCode, gin.H{ 44 | "code": code, 45 | "msg": m, 46 | }) 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | ## dragonboat multi-group raft simple example 2 | 3 | multi-group raft的简单使用示例,由于对[dragonboat](https://github.com/lni/dragonboat)的理解有限,可能存在部分错误,还望指出。 4 | 5 | ### 生产ready的样例 6 | 7 | 提供生产ready的样例,[productready](https://github.com/xkeyideal/mraft/blob/master/productready/README.md) 8 | 9 | 1. 提供了完整的采用`pebbledb`作为业务数据存储的状态机代码,此代码已用于生产环境。 10 | 2. 提供了支持动态配置的启动方式,提供了`dragonboat`配置需处理节点ID等问题的一个解决思路 11 | 3. 程序化的提供了新增raft节点的方案 12 | 13 | ### 示例说明 14 | 15 | 本示例是对[dragonboat-example](https://github.com/lni/dragonboat-example)中ondisk示例的重写,改变其代码结构,状态机的数据协议采用自定义的二进制协议,尽可能的提高读写性能。 16 | 17 | 本示例[dragonboat](https://github.com/lni/dragonboat) 使用的是v3.3.7版本, [pebbledb](https://github.com/cockroachdb/pebble) 使用的是跟随`dragonboat`所使用的版本 18 | 19 | ### 序列化工具 20 | 21 | 本示例为了兼容后续项目的需要,业务上只能使用 `thrift` 作为序列化方式,`thrift` 序列化库未采用官方库,使用的是[thrifter](https://github.com/thrift-iterator/go),压测结果详见[thrifter-benchmark](https://github.com/xkeyideal/mraft/blob/master/benchmark/thrift-serialize/thrift-serialize.md) 22 | 23 |
24 | 25 | 在Raft SaveSnapshot与RecoverFromSnapshot时,采用的是自定义二进制协议,详细见[fsm.go](https://github.com/xkeyideal/mraft/blob/master/ondisk/fsm.go#L233),压测结果详见[binary-benchmark](https://github.com/xkeyideal/mraft/blob/master/benchmark/binary-serialize/binary-serialize.md) 26 | 27 | ### TCPServer压测结果 28 | 29 | multi-raft的网络协议与数据格式均使用simple-server中相同的方式,压测结果详见[simple-server-benchmark](https://github.com/xkeyideal/mraft/blob/master/benchmark/multi-raft/simple-server-benchmark.md) 30 | 31 | ### RaftServer压测结果 32 | 33 | multi-raft的压测协议与数据格式均使用simple-server中相同的方式,压测结果详见[raft-server-benchmark](https://github.com/xkeyideal/mraft/blob/master/benchmark/multi-raft/raft-server-benchmark.md) 34 | 35 | 压测数据用例使用的是[代码自动化数据生成工具](https://github.com/xkeyideal/mraft/blob/master/benchmark/generate/generate-data.go),每条数据的数据量大约在2KB以上,具体未做统计。 36 | 37 | ### 压测机器说明 38 | 39 | 机器采用的是开发环境的机器,操作系统macOS High Sierra,`Darwin Kernel Version 18.6.0 root:xnu-4903.261.4~2/RELEASE_X86_64 x86_64 i386 iMac14,2 Darwin` 40 | 41 | CPU:3.29 GHz Intel Core i5 42 | 43 | 内存:20 GB 1600 MHz DDR3 44 | 45 | 磁盘:256GB Intel SATA SSD 46 | 47 | 参考了[dragonboat](https://github.com/lni/dragonboat)作者的文章[从共识算法开谈 - 硬盘性能的最大几个误解](https://zhuanlan.zhihu.com/p/55658164), 48 | 特对开发环境的磁盘的fsync()落盘写性能使用**pg_test_fsync**工具进行测试 49 | 50 | ``` 51 | 5 seconds per test 52 | Direct I/O is not supported on this platform. 
53 | 54 | Compare file sync methods using one 8kB write: 55 | (in wal_sync_method preference order, except fdatasync is Linux's default) 56 | open_datasync 15293.184 ops/sec 65 usecs/op 57 | fdatasync 15042.152 ops/sec 66 usecs/op 58 | fsync 15062.644 ops/sec 66 usecs/op 59 | fsync_writethrough 87.954 ops/sec 11370 usecs/op 60 | open_sync 15060.335 ops/sec 66 usecs/op 61 | 62 | Compare file sync methods using two 8kB writes: 63 | (in wal_sync_method preference order, except fdatasync is Linux's default) 64 | open_datasync 7342.068 ops/sec 136 usecs/op 65 | fdatasync 11375.823 ops/sec 88 usecs/op 66 | fsync 11035.212 ops/sec 91 usecs/op 67 | fsync_writethrough 87.290 ops/sec 11456 usecs/op 68 | open_sync 6943.205 ops/sec 144 usecs/op 69 | 70 | Compare open_sync with different write sizes: 71 | (This is designed to compare the cost of writing 16kB in different write 72 | open_sync sizes.) 73 | 1 * 16kB open_sync write 11774.650 ops/sec 85 usecs/op 74 | 2 * 8kB open_sync writes 7335.006 ops/sec 136 usecs/op 75 | 4 * 4kB open_sync writes 4147.836 ops/sec 241 usecs/op 76 | 8 * 2kB open_sync writes 2048.232 ops/sec 488 usecs/op 77 | 16 * 1kB open_sync writes 1015.277 ops/sec 985 usecs/op 78 | 79 | Test if fsync on non-write file descriptor is honored: 80 | (If the times are similar, fsync() can sync data written on a different 81 | descriptor.) 82 | write, fsync, close 9232.970 ops/sec 108 usecs/op 83 | write, close, fsync 11632.603 ops/sec 86 usecs/op 84 | 85 | Non-sync'ed 8kB writes: 86 | write 14077.617 ops/sec 71 usecs/op 87 | ``` 88 | 89 | ### 启动方式 90 | 91 | 示例代码已经放弃使用`rocksdb`作为存储,已经是纯`go`实现 92 | 93 | `go run app.go 10000 9800` 94 | 95 | **10000** 是NodeID,已经在代码里限定了(代码中的NodeID分别是10000,10001,10002),不能修改. 
96 | **9800**是HTTP的端口号,随意设定即可 97 | 98 | ```go 99 | peers := map[uint64]string{ 100 | 10000: "10.101.44.4:54000", 101 | 10001: "10.101.44.4:54100", 102 | 10002: "10.101.44.4:54200", 103 | } 104 | 105 | clusters := []uint64{254000, 254100, 254200} 106 | ``` 107 | 108 | ### HTTP服务 109 | 110 | 示例的核心入口代码在engine/engine.go中,由于是示例,很多参数直接在代码中写死了。 111 | 112 | HTTP服务采用[gin](https://github.com/gin-gonic/gin) 113 | 114 | ### RequestAddNode 向集群添加节点的注意事项 115 | 116 | 详细的`dragonboat raft` 添加集群节点的示例请参考[productready](https://github.com/xkeyideal/mraft/blob/master/productready/README.md) 117 | 118 | 1. 先在集群中调用添加节点的命令RequestAddNode 119 | 2. 启动新增的节点,注意join节点的启动参数, nh.StartOnDiskCluster(map[uint64]string{}, true, NewDiskKV, rc) 120 | 3. 新增节点成功后,机器会通过Snapshot将数据同步给join节点 121 | 4. 新增节点与集群原有节点的启动顺序不影响集群的工作 122 | 5. 若新的集群需要重启,那么不能改变原有的peers(将新节点加入到peers),否则集群启动不起来,报错如下: 123 | 124 | ```json 125 | join节点的报错 126 | 127 | 2019-08-30 15:29:09.597258 E | raftpb: restarting previously joined node, member list map[10000:10.101.44.4:54000 10001:10.101.44.4:54100 10002:10.101.44.4:54200 10003:10.101.44.4:54300] 128 | 2019-08-30 15:29:09.597454 E | dragonboat: bootstrap validation failed, [54000:10003], map[], true, map[10000:10.101.44.4:54000 10001:10.101.44.4:54100 10002:10.101.44.4:54200 10003:10.101.44.4:54300], false 129 | panic: cluster settings are invalid 130 | ``` 131 | 132 | ```json 133 | 集群原来节点的报错 134 | 135 | 2019-08-30 15:29:06.590245 E | raftpb: inconsistent node list, bootstrap map[10000:10.101.44.4:54000 10001:10.101.44.4:54100 10002:10.101.44.4:54200], incoming map[10000:10.101.44.4:54000 10001:10.101.44.4:54100 10002:10.101.44.4:54200 10003:10.101.44.4:54300] 136 | 2019-08-30 15:29:06.590289 E | dragonboat: bootstrap validation failed, [54000:10002], map[10000:10.101.44.4:54000 10001:10.101.44.4:54100 10002:10.101.44.4:54200], false, map[10000:10.101.44.4:54000 10001:10.101.44.4:54100 10002:10.101.44.4:54200 10003:10.101.44.4:54300], false 137 | panic: cluster settings are invalid 138 | 
``` 139 | 140 | ```json 141 | 原来的集群节点 142 | map[uint64]string{ 143 | 10000: "10.101.44.4:54000", 144 | 10001: "10.101.44.4:54100", 145 | 10002: "10.101.44.4:54200", 146 | } 147 | 148 | 新增的节点:10003: "10.101.44.4:54300" 149 | ``` 150 | 151 | ```json 152 | 正确join或重启的方式 153 | join := false 154 | nodeAddr := "" 155 | if engine.nodeID == 10003 { 156 | join = true 157 | nodeAddr = "10.101.44.4:54300" 158 | } 159 | 160 | engine.nh.Start(engine.raftDataDir, engine.nodeID, nodeAddr, join) 161 | ``` -------------------------------------------------------------------------------- /test/metrics/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "encoding/json" 5 | "fmt" 6 | ) 7 | 8 | type kv struct { 9 | Key string `json:"key"` 10 | Val int `json:"val"` 11 | } 12 | 13 | func main() { 14 | // s := metrics.NewExpDecaySample(2028, 0.015) // or metrics.NewUniformSample(1028) 15 | // h := metrics.NewHistogram(s) 16 | // metrics.Register("baz", h) 17 | 18 | // for i := 90; i < 105; i++ { 19 | // h.Update(int64(i)) 20 | // } 21 | 22 | // fmt.Println(h.Min(), h.Max(), h.Mean(), h.Percentiles([]float64{0.9, 0.95, 0.99})) 23 | 24 | // for i := 0; i < 10; i++ { 25 | // rand.Seed(time.Now().UnixNano()) 26 | // fmt.Println(rand.Int31n(1000000)) 27 | // } 28 | 29 | // d := kv{"2", 2} 30 | // b, _ := json.Marshal(d) 31 | // fmt.Println(string(b)) 32 | 33 | b := `{"key":"2","val":2}` 34 | d := kv{} 35 | json.Unmarshal([]byte(b), &d) 36 | fmt.Println(d) 37 | } 38 | -------------------------------------------------------------------------------- /test/serialize/serialize.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bytes" 5 | "encoding/binary" 6 | "errors" 7 | "fmt" 8 | "io" 9 | ) 10 | 11 | func encode(key, val []byte, w io.Writer) error { 12 | dataSize := make([]byte, 8) 13 | keySize := make([]byte, 8) 14 | valSize := make([]byte, 8) 15 | 16 
| kl := len(key) 17 | vl := len(val) 18 | 19 | binary.LittleEndian.PutUint64(dataSize, uint64(kl+vl+8+8)) 20 | if _, err := w.Write(dataSize); err != nil { 21 | return err 22 | } 23 | 24 | binary.LittleEndian.PutUint64(keySize, uint64(kl)) 25 | if _, err := w.Write(keySize); err != nil { 26 | return err 27 | } 28 | 29 | if _, err := w.Write(key); err != nil { 30 | return err 31 | } 32 | 33 | binary.LittleEndian.PutUint64(valSize, uint64(vl)) 34 | if _, err := w.Write(valSize); err != nil { 35 | return err 36 | } 37 | 38 | if _, err := w.Write(val); err != nil { 39 | return err 40 | } 41 | 42 | return nil 43 | } 44 | 45 | func decode(r io.Reader) ([]byte, []byte, error) { 46 | sz := make([]byte, 8) 47 | if _, err := io.ReadFull(r, sz); err != nil { 48 | return nil, nil, err 49 | } 50 | dataSize := binary.LittleEndian.Uint64(sz) 51 | data := make([]byte, dataSize) 52 | if _, err := io.ReadFull(r, data); err != nil { 53 | return nil, nil, err 54 | } 55 | 56 | kl := binary.LittleEndian.Uint64(data[:8]) 57 | key := data[8 : kl+8] 58 | vl := binary.LittleEndian.Uint64(data[kl+8 : kl+16]) 59 | val := data[kl+16:] 60 | if uint64(len(val)) != vl { 61 | return nil, nil, errors.New("size isn't equal") 62 | } 63 | 64 | return key, val, nil 65 | } 66 | 67 | func main() { 68 | key := []byte("multi-raft-key") 69 | val := []byte("multi-raft-value") 70 | 71 | buf := &bytes.Buffer{} 72 | err := encode(key, val, buf) 73 | if err != nil { 74 | panic(err) 75 | } 76 | 77 | key1, val1, err := decode(buf) 78 | fmt.Println(string(key1), string(val1), err) 79 | } 80 | -------------------------------------------------------------------------------- /test/test.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | ) 6 | 7 | func main() { 8 | b := []byte("1234") 9 | bb := make([]byte, len(b)) 10 | 11 | copy(bb, b) 12 | fmt.Println(string(bb)) 13 | 14 | b[2] = '5' 15 | 16 | fmt.Println(string(bb), string(b)) 17 | } 18 
| --------------------------------------------------------------------------------