├── storage ├── wal │ ├── TODO.md │ ├── fileutil_darwin.go │ ├── fileutil_unix.go │ ├── entry_cache.go │ ├── config.go │ ├── bench │ │ └── main.go │ ├── fileutil_test.go │ ├── file_cache.go │ ├── test_util.go │ ├── fileutil.go │ ├── record.go │ ├── record_writer.go │ ├── meta.go │ ├── log_index.go │ ├── record_reader.go │ ├── storage_raft_test.go │ ├── storage.go │ └── log_file.go ├── storage.go └── storage_memory.go ├── etcd ├── NOTICE └── LICENSE ├── NOTICE ├── test ├── kvs │ ├── conf │ │ └── kvs.toml │ ├── main.go │ ├── command.go │ ├── resolver.go │ └── config.go ├── memory_statemachine.go ├── raft_member_test.go └── testserver.go ├── util ├── log │ ├── log_crash_darwin.go │ ├── log_crash_linux.go │ └── log_crash_win.go ├── atomic_bool.go ├── atomic_uint64.go ├── crc32.go ├── util.go ├── bufalloc │ ├── buffer_pool_test.go │ ├── buffer_pool.go │ ├── buffer.go │ └── ibuffer.go ├── runtime.go ├── conn.go ├── uvarint64_test.go ├── io.go ├── uvarint64.go └── io_test.go ├── README.md ├── Documentation └── cn │ └── node_rejoin.md ├── transport.go ├── raft_fsm_test.go ├── errors.go ├── raft_fsm_state.go ├── proto ├── pool.go └── proto.go ├── pool.go ├── statemachine.go ├── future.go ├── transport_multi.go ├── status.go ├── transport_heartbeat.go ├── raft_log_unstable.go ├── raft_fsm_candidate.go ├── logger └── logger.go ├── raft_replica_test.go ├── transport_sender.go ├── raft_replica.go ├── raft_log_unstable_test.go ├── raft_snapshot.go ├── raft_fsm_follower.go ├── read_only.go └── transport_replicate.go /storage/wal/TODO.md: -------------------------------------------------------------------------------- 1 | TODO: 2 | - 限制单文件index的大小 3 | - 异步创建新文件 4 | - 获取相关状态信息 5 | - 按commit sync -------------------------------------------------------------------------------- /etcd/NOTICE: -------------------------------------------------------------------------------- 1 | CoreOS Project 2 | Copyright 2014 CoreOS, Inc 3 | 4 | This product includes software developed at CoreOS, Inc. 
5 | (http://www.coreos.com/). 6 | -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | CoreOS Project 2 | Copyright 2014 CoreOS, Inc 3 | 4 | This product includes software developed at CoreOS, Inc. 5 | (http://www.coreos.com/). 6 | 7 | Modified work Copyright 2018 The tiglabs Authors. 8 | 9 | -------------------------------------------------------------------------------- /etcd/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2015 The etcd Authors 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 
-------------------------------------------------------------------------------- /test/kvs/conf/kvs.toml: -------------------------------------------------------------------------------- 1 | [server] 2 | data-path = "/export/Data/raft-kvs" 3 | log-path = "/export/Logs/raft-kvs" 4 | log-level = "debug" 5 | 6 | [cluster] 7 | [[cluster.nodes]] 8 | node-id=1 9 | host="n1" 10 | http-port=9991 11 | heartbeat-port=9992 12 | replicate-port=9993 13 | 14 | [[cluster.nodes]] 15 | node-id=2 16 | host="n2" 17 | http-port=9991 18 | heartbeat-port=9992 19 | replicate-port=9993 20 | 21 | [[cluster.nodes]] 22 | node-id=3 23 | host="n3" 24 | http-port=9991 25 | heartbeat-port=9992 26 | replicate-port=9993 27 | 28 | [[cluster.nodes]] 29 | node-id=4 30 | host="n4" 31 | http-port=9991 32 | heartbeat-port=9992 33 | replicate-port=9993 34 | 35 | [[cluster.nodes]] 36 | node-id=5 37 | host="n5" 38 | http-port=9991 39 | heartbeat-port=9992 40 | replicate-port=9993 41 | -------------------------------------------------------------------------------- /util/log/log_crash_darwin.go: -------------------------------------------------------------------------------- 1 | // Copyright 2018 The tiglabs raft Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package log 16 | 17 | import ( 18 | "os" 19 | "syscall" 20 | ) 21 | 22 | func logCrash(f *os.File) error { 23 | return syscall.Dup2(int(f.Fd()), 2) 24 | } 25 | -------------------------------------------------------------------------------- /util/log/log_crash_linux.go: -------------------------------------------------------------------------------- 1 | // Copyright 2018 The tiglabs raft Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package log 16 | 17 | import ( 18 | "os" 19 | "syscall" 20 | ) 21 | 22 | func logCrash(f *os.File) error { 23 | return syscall.Dup3(int(f.Fd()), 2, 0) 24 | } 25 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Raft 2 | 3 | A multi-raft implementation built on top of the [CoreOS etcd raft library](https://github.com/etcd-io/etcd). 4 | 5 | ## Installation 6 | 7 | Download and install to `GOPATH`: 8 | ```bash 9 | go get -u github.com/tiglabs/raft 10 | ``` 11 | 12 | ## Features 13 | 14 | The CoreOS etcd/raft implementation has been modified to add the following features. 
15 | 16 | - multi-raft support 17 | - snapshot manager 18 | - merged and compressed heartbeat message 19 | - check down replica 20 | - single raft's panic is allowed, detectable 21 | - new wal implementation 22 | - export more run status 23 | - implementation batch commit 24 | 25 | ## License 26 | 27 | Licensed under the [Apache License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0). 28 | For detail see [LICENSE](LICENSE) and [NOTICE](NOTICE). 29 | -------------------------------------------------------------------------------- /Documentation/cn/node_rejoin.md: -------------------------------------------------------------------------------- 1 | ## 操作顺序 2 | 考虑一个raft组,初始时组内有三个节点,node id 分别为 1, 2 和 3. 3 | 我们有时候需要先把节点3从组内删除,随后又在节点3上重新启动一个新的raft加入原来的组内。 4 | 在加入时,需要严格遵守先调用ChangeMember接口添加raft成员并且等待命令commit成功后,再启动节点3上的raft。否则可能会有数据丢失的风险。 5 | 6 | 这是因为有以下一种特殊场景的存在: 7 | 假设初始时1、2、3三个节点1是Leader,此时我们向1提交删除成员3的命令(假设对应的日志index=10),但节点2可能暂时存在故障,它的日志只复制到index=5的位置。删除命令只复制给1,3节点,属于大多数,删除3成功。随后我们又在3节点上重新启动了一个新的raft,新raft的日志是空的。 8 | 然后节点2发起选举,注意此时节点2认为集群的成员还是初始时的1,2,3,它比3节点的新raft日志新,可以选为leader,导致一个最新日志index=5的节点成为集群leader,index=5之后的日志有可能被截断丢失。 9 | 10 | 出现这种状况的原因在于raft论文讨论成员变更删除又添加一个成员时,都是删除2,添加3,是不同节点id。同一个节点id的话,成员变更和新节点启动需要有上面的顺序约束。这是multiraft下的一个特殊问题,单raft的话新加入的节点可以新的id来安全地避免这个问题。 11 | 12 | ## PeerID 13 | `proto.Peer`结构中的PeerID成员表示全局唯一的副本ID,不同group的副本PeerID不一样,同group内的不同副本PeerID也不一样。 14 | 15 | PeerID的引入是为了解决node rejoin时,新raft同步旧日志时有可能把之前删除同node上旧raft的命令同步过来并应用,导致新的raft被错误删除的问题。有了PeerID,同node上的新旧raft PeerID不一样,可以区分出来,避免错误删除。 16 | 17 | 18 | 19 | 20 | -------------------------------------------------------------------------------- /transport.go: -------------------------------------------------------------------------------- 1 | // Copyright 2018 The tiglabs raft Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
// Transport raft server transport.
//
// It is the interface through which messages and snapshots are handed off
// for delivery; the concrete implementations are not visible from this file.
type Transport interface {
	// Send hands the message m to the transport.
	Send(m *proto.Message)
	// SendSnapshot hands the snapshot message m to the transport together
	// with rs (presumably used to report the outcome of the transfer —
	// confirm against the implementation).
	SendSnapshot(m *proto.Message, rs *snapshotStatus)
	// Stop shuts the transport down.
	Stop()
}
14 | 15 | package wal 16 | 17 | import ( 18 | "os" 19 | "syscall" 20 | ) 21 | 22 | func fdatasync(f *os.File) error { 23 | _, _, errno := syscall.Syscall(syscall.SYS_FCNTL, f.Fd(), uintptr(syscall.F_FULLFSYNC), uintptr(0)) 24 | if errno == 0 { 25 | return nil 26 | } 27 | return errno 28 | } 29 | 30 | func fallocate(f *os.File, sizeInBytes int64) error { 31 | return fallocDegraded(f, sizeInBytes) 32 | } 33 | -------------------------------------------------------------------------------- /test/kvs/main.go: -------------------------------------------------------------------------------- 1 | // Copyright 2018 The tiglabs raft Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.package wal 14 | 15 | package main 16 | 17 | import ( 18 | "flag" 19 | "fmt" 20 | 21 | "github.com/tiglabs/raft/util/log" 22 | ) 23 | 24 | var nodeID = flag.Uint64("node", 0, "current node id") 25 | var confFile = flag.String("conf", "", "config file path") 26 | 27 | func main() { 28 | flag.Parse() 29 | 30 | // load config 31 | cfg := LoadConfig(*confFile, *nodeID) 32 | 33 | // init log 34 | log.InitFileLog(cfg.ServerCfg.LogPath, fmt.Sprintf("node%d", *nodeID), cfg.ServerCfg.LogLevel) 35 | 36 | // start server 37 | server := NewServer(*nodeID, cfg) 38 | server.Run() 39 | } 40 | -------------------------------------------------------------------------------- /util/atomic_bool.go: -------------------------------------------------------------------------------- 1 | // Copyright 2018 The tiglabs raft Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
// AtomicBool is a boolean whose reads and writes all go through sync/atomic.
// The zero value is false. It is stored as an int32: 0 is false, 1 is true.
type AtomicBool struct {
	v int32
}

// Get atomically loads the current value.
func (b *AtomicBool) Get() bool {
	return atomic.LoadInt32(&b.v) != 0
}

// Set atomically stores newValue.
func (b *AtomicBool) Set(newValue bool) {
	atomic.StoreInt32(&b.v, boolToInt(newValue))
}

// CompareAndSet atomically replaces the value with update if it currently
// equals expect. It reports whether the replacement took place.
func (b *AtomicBool) CompareAndSet(expect, update bool) bool {
	return atomic.CompareAndSwapInt32(&b.v, boolToInt(expect), boolToInt(update))
}

// boolToInt maps true to 1 and false to 0, the encoding used by AtomicBool.
func boolToInt(v bool) int32 {
	if v {
		return 1
	}
	return 0
}
// AtomicUInt64 is a uint64 whose every access goes through sync/atomic.
// The zero value is 0.
type AtomicUInt64 struct {
	v uint64
}

// Get atomically loads the current value.
func (a *AtomicUInt64) Get() uint64 {
	cur := atomic.LoadUint64(&a.v)
	return cur
}

// Set atomically stores v.
func (a *AtomicUInt64) Set(v uint64) {
	atomic.StoreUint64(&a.v, v)
}

// Add atomically adds v and returns the new value.
func (a *AtomicUInt64) Add(v uint64) uint64 {
	updated := atomic.AddUint64(&a.v, v)
	return updated
}

// Incr atomically adds one and returns the new value.
func (a *AtomicUInt64) Incr() uint64 {
	return a.Add(1)
}

// CompareAndSwap atomically replaces the value with n if it currently equals
// o, and reports whether the replacement took place.
func (a *AtomicUInt64) CompareAndSwap(o, n uint64) bool {
	swapped := atomic.CompareAndSwapUint64(&a.v, o, n)
	return swapped
}
32 | func (c CRC) Update(b []byte) CRC { 33 | return CRC(crc32.Update(uint32(c), table, b)) 34 | } 35 | 36 | // Value returns a masked crc. 37 | func (c CRC) Value() uint32 { 38 | return uint32(c>>15|c<<17) + 0xa282ead8 39 | } 40 | -------------------------------------------------------------------------------- /util/log/log_crash_win.go: -------------------------------------------------------------------------------- 1 | // Copyright 2018 The tiglabs raft Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // +build windows 16 | 17 | package log 18 | 19 | import ( 20 | "os" 21 | "syscall" 22 | ) 23 | 24 | var ( 25 | kernel32 = syscall.MustLoadDLL("kernel32.dll") 26 | procSetStdHandle = kernel32.MustFindProc("SetStdHandle") 27 | ) 28 | 29 | func setStdHandle(stdhandle int32, handle syscall.Handle) error { 30 | r0, _, e1 := syscall.Syscall(procSetStdHandle.Addr(), 2, uintptr(stdhandle), uintptr(handle), 0) 31 | if r0 == 0 { 32 | if e1 != 0 { 33 | return error(e1) 34 | } 35 | return syscall.EINVAL 36 | } 37 | return nil 38 | } 39 | 40 | func logCrash(f *os.File) error { 41 | return setStdHandle(syscall.STD_ERROR_HANDLE, syscall.Handle(f.Fd())) 42 | } 43 | -------------------------------------------------------------------------------- /storage/wal/fileutil_unix.go: -------------------------------------------------------------------------------- 1 | // Copyright 2018 The tiglabs raft Authors. 
2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | // +build linux 15 | 16 | package wal 17 | 18 | import ( 19 | "os" 20 | "syscall" 21 | ) 22 | 23 | const ( 24 | fallocateModeDefault uint32 = 0 // 默认模式下预分配的空间全部补0 25 | fallocateModeKeepSize uint32 = 1 // 预分配后保持原来的文件大小,不补0 26 | ) 27 | 28 | func fdatasync(f *os.File) error { 29 | return syscall.Fdatasync(int(f.Fd())) 30 | } 31 | 32 | // 预分配然后补零 33 | func fallocate(f *os.File, sizeInBytes int64) error { 34 | err := syscall.Fallocate(int(f.Fd()), fallocateModeDefault, 0, sizeInBytes) 35 | if err != nil { 36 | errno, ok := err.(syscall.Errno) 37 | if ok && (errno == syscall.ENOTSUP || errno == syscall.EINTR) { 38 | return fallocDegraded(f, sizeInBytes) 39 | } 40 | } 41 | return err 42 | } 43 | -------------------------------------------------------------------------------- /test/kvs/command.go: -------------------------------------------------------------------------------- 1 | // Copyright 2018 The tiglabs raft Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
// CmdType command type
type CmdType int

const (
	// CmdQuorumGet quorum get a key
	CmdQuorumGet CmdType = iota + 1
	// CmdPut put key value
	CmdPut
	// CmdDelete delete a key
	CmdDelete
)

// Command a raft op command
type Command struct {
	OP    CmdType `json:"op"`
	Key   []byte  `json:"k"`
	Value []byte  `json:"v"`
}

// String renders the command as "<op> <key>" (plus the value for a put).
// An unrecognised operation yields the empty string.
func (c *Command) String() string {
	key := string(c.Key)
	switch c.OP {
	case CmdQuorumGet:
		return fmt.Sprintf("QuorumGet %v", key)
	case CmdPut:
		val := string(c.Value)
		return fmt.Sprintf("Put %s %s", key, val)
	case CmdDelete:
		return fmt.Sprintf("Delete %s", key)
	}
	return ""
}
// Byte-size units: KB is 1<<10, MB is 1<<20, GB is 1<<30.
const (
	KB = 1 << (10 * (iota + 1))
	MB
	GB
)

// time_format is the layout used by FormatDate and FormatTimestamp
// (millisecond precision).
const time_format = "2006-01-02 15:04:05.000"

// Uint64Slice is a []uint64 with Len, Less and Swap methods that order the
// elements ascending.
type Uint64Slice []uint64

// Len returns the number of elements.
func (p Uint64Slice) Len() int {
	return len(p)
}

// Less reports whether element i is smaller than element j.
func (p Uint64Slice) Less(i, j int) bool {
	return p[i] < p[j]
}

// Swap exchanges elements i and j.
func (p Uint64Slice) Swap(i, j int) {
	p[j], p[i] = p[i], p[j]
}

// Min returns the smaller of a and b.
func Min(a, b uint64) uint64 {
	if b < a {
		return b
	}
	return a
}

// Max returns the larger of a and b.
func Max(a, b uint64) uint64 {
	if b < a {
		return a
	}
	return b
}

// FormatDate formats t using the package layout.
func FormatDate(t time.Time) string {
	return t.Format(time_format)
}

// FormatTimestamp interprets t as nanoseconds since the Unix epoch and
// formats it using the package layout. Non-positive values yield "".
func FormatTimestamp(t int64) string {
	if t > 0 {
		return time.Unix(0, t).Format(time_format)
	}
	return ""
}
// TestRemovePeer checks that removePeer only drops a replica when the peer
// passed in carries the same PeerID as the registered replica, not merely the
// same node ID.
func TestRemovePeer(t *testing.T) {
	// The replica registered for node 1 has PeerID 10.
	peer := proto.Peer{
		ID:     1,
		PeerID: 10,
	}

	r := &raftFsm{
		config: &Config{
			NodeID:       1,
			ElectionTick: 10,
		},
		rand:     rand.New(rand.NewSource(1)),
		replicas: map[uint64]*replica{peer.ID: newReplica(peer, 100)},
		readOnly: newReadOnly(1, ReadOnlySafe),
	}

	// Same node ID but a different PeerID: the replica must stay.
	removedPeer := proto.Peer{
		ID:     1,
		PeerID: 2,
	}

	r.removePeer(removedPeer)
	if len(r.replicas) != 1 {
		t.Errorf("expected replicas size = 1")
	}

	// Matching PeerID: the replica must now be removed.
	removedPeer.PeerID = peer.PeerID
	r.removePeer(removedPeer)
	if len(r.replicas) != 0 {
		t.Error("expected replicas size = 0")
	}

	// NOTE(review): the reason for this one-second pause is not visible from
	// this file — confirm whether removePeer starts background work.
	time.Sleep(time.Second)
}
// Sentinel errors of the raft package.
var (
	ErrCompacted     = errors.New("requested index is unavailable due to compaction.")
	ErrRaftExists    = errors.New("raft already exists.")
	ErrRaftNotExists = errors.New("raft not exists.")
	ErrNotLeader     = errors.New("raft is not the leader.")
	ErrStopped       = errors.New("raft is already shutdown.")
	ErrSnapping      = errors.New("raft is doing snapshot.")
	ErrRetryLater    = errors.New("retry later")
)

// FatalError pairs an error with a numeric ID.
type FatalError struct {
	ID  uint64
	Err error
}

// AppPanicError is panic error when repl occurred fatal error.
// The server will recover this panic and stop the shard repl.
type AppPanicError string

// Error implements the error interface by prefixing the stored text with a
// fixed description.
func (pe *AppPanicError) Error() string {
	const prefix = "Occurred application logic panic error: "
	detail := string(*pe)
	return prefix + detail
}
type (
	// fsmState is the role of a raft state machine (follower, candidate,
	// leader or election-ack).
	fsmState byte
	// replicaState is the replication mode of a replica (probe, replicate
	// or snapshot).
	replicaState byte
)

// Values of fsmState. Every constant is explicitly typed so that it carries
// the String method; an untyped constant would print as a bare number and
// could be silently assigned to unrelated integer variables.
const (
	stateFollower    fsmState = 0
	stateCandidate   fsmState = 1
	stateLeader      fsmState = 2
	stateElectionACK fsmState = 3
)

// Values of replicaState, typed for the same reason as the fsmState values.
const (
	replicaStateProbe     replicaState = 0
	replicaStateReplicate replicaState = 1
	replicaStateSnapshot  replicaState = 2
)

// String returns the name of the state, or "" for a value that is not one of
// the declared fsmState constants.
func (st fsmState) String() string {
	switch st {
	case stateFollower:
		return "StateFollower"
	case stateCandidate:
		return "StateCandidate"
	case stateLeader:
		return "StateLeader"
	case stateElectionACK:
		return "StateElectionACK"
	}
	return ""
}

// String returns the name of the state. Any value other than
// replicaStateReplicate and replicaStateSnapshot is reported as
// "ReplicaStateProbe".
func (st replicaState) String() string {
	switch st {
	case replicaStateReplicate:
		return "ReplicaStateReplicate"
	case replicaStateSnapshot:
		return "ReplicaStateSnapshot"
	default:
		return "ReplicaStateProbe"
	}
}
12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.package wal 14 | 15 | package main 16 | 17 | import ( 18 | "fmt" 19 | 20 | "github.com/tiglabs/raft" 21 | ) 22 | 23 | // ClusterResolver implement raft Resolver 24 | type ClusterResolver struct { 25 | cfg *Config 26 | } 27 | 28 | func newClusterResolver(cfg *Config) *ClusterResolver { 29 | return &ClusterResolver{ 30 | cfg: cfg, 31 | } 32 | } 33 | 34 | // NodeAddress get node address 35 | func (r *ClusterResolver) NodeAddress(nodeID uint64, stype raft.SocketType) (addr string, err error) { 36 | node := r.cfg.FindClusterNode(nodeID) 37 | if node == nil { 38 | return "", fmt.Errorf("could not find node(%v) in cluster config:\n: %v", nodeID, r.cfg.String()) 39 | } 40 | switch stype { 41 | case raft.HeartBeat: 42 | return fmt.Sprintf("%s:%d", node.Host, node.HeartbeatPort), nil 43 | case raft.Replicate: 44 | return fmt.Sprintf("%s:%d", node.Host, node.ReplicatePort), nil 45 | } 46 | return "", fmt.Errorf("unknown socket type: %v", stype) 47 | } 48 | -------------------------------------------------------------------------------- /util/bufalloc/buffer_pool_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2018 The tiglabs raft Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
// TestGetPoolNum checks the size-to-pool mapping of buffPool: every baseline
// size n, and n-1, must map to the index of that baseline entry, while a
// 2 MB request must map to baseSize.
func TestGetPoolNum(t *testing.T) {
	for i, n := range buffPool.baseline {
		// The exact baseline size belongs to pool i.
		num := buffPool.getPoolNum(n)
		if num != i {
			t.Errorf("Got %v expected %v", num, i)
		}
		// One byte below the baseline size still belongs to pool i.
		num = buffPool.getPoolNum(n - 1)
		if num != i {
			t.Errorf("Got %v expected %v", num, i)
		}
	}
	// A 2 MB request is expected to map to baseSize.
	num := buffPool.getPoolNum(2 * util.MB)
	if num != baseSize {
		t.Errorf("Got %v expected %v", num, baseSize)
	}
}

// TestGetBuffer checks that getBuffer returns an empty buffer whose capacity
// equals the requested size, both for every baseline size and for 2 MB, and
// that such buffers can be handed back with putBuffer.
func TestGetBuffer(t *testing.T) {
	for _, n := range buffPool.baseline {
		buf := buffPool.getBuffer(n)
		if buf.Len() != 0 || buf.Cap() != n {
			t.Errorf("Got %v expected %v", buf.Cap(), n)
		}
		buffPool.putBuffer(buf)
	}
	// Same expectations for a 2 MB request.
	buf := buffPool.getBuffer(2 * util.MB)
	if buf.Len() != 0 || buf.Cap() != 2*util.MB {
		t.Errorf("Got %v expected %v", buf.Cap(), 2*util.MB)
	}
	buffPool.putBuffer(buf)
}
14 | package proto 15 | 16 | import ( 17 | "sync" 18 | ) 19 | 20 | var ( 21 | msgPool = &sync.Pool{ 22 | New: func() interface{} { 23 | return &Message{ 24 | Entries: make([]*Entry, 0, 128), 25 | } 26 | }, 27 | } 28 | 29 | bytePool = &sync.Pool{ 30 | New: func() interface{} { 31 | return make([]byte, 128) 32 | }, 33 | } 34 | ) 35 | 36 | func GetMessage() *Message { 37 | msg := msgPool.Get().(*Message) 38 | msg.Reject = false 39 | msg.RejectIndex = 0 40 | msg.ID = 0 41 | msg.From = 0 42 | msg.To = 0 43 | msg.Term = 0 44 | msg.LogTerm = 0 45 | msg.Index = 0 46 | msg.Commit = 0 47 | msg.SnapshotMeta.Index = 0 48 | msg.SnapshotMeta.Term = 0 49 | msg.SnapshotMeta.Peers = nil 50 | msg.Snapshot = nil 51 | msg.Context = nil 52 | msg.Entries = msg.Entries[0:0] 53 | return msg 54 | } 55 | 56 | func ReturnMessage(msg *Message) { 57 | if msg != nil { 58 | msgPool.Put(msg) 59 | } 60 | } 61 | 62 | func getByteSlice() []byte { 63 | return bytePool.Get().([]byte) 64 | } 65 | 66 | func returnByteSlice(b []byte) { 67 | bytePool.Put(b) 68 | } 69 | -------------------------------------------------------------------------------- /pool.go: -------------------------------------------------------------------------------- 1 | // Copyright 2018 The tiglabs raft Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package raft 16 | 17 | import ( 18 | "sync" 19 | ) 20 | 21 | var pool = newPoolFactory() 22 | 23 | type poolFactory struct { 24 | applyPool *sync.Pool 25 | proposalPool *sync.Pool 26 | } 27 | 28 | func newPoolFactory() *poolFactory { 29 | return &poolFactory{ 30 | applyPool: &sync.Pool{ 31 | New: func() interface{} { 32 | return new(apply) 33 | }, 34 | }, 35 | 36 | proposalPool: &sync.Pool{ 37 | New: func() interface{} { 38 | return new(proposal) 39 | }, 40 | }, 41 | } 42 | } 43 | 44 | func (f *poolFactory) getApply() *apply { 45 | a := f.applyPool.Get().(*apply) 46 | a.command = nil 47 | a.future = nil 48 | a.readIndexes = nil 49 | return a 50 | } 51 | 52 | func (f *poolFactory) returnApply(a *apply) { 53 | if a != nil { 54 | f.applyPool.Put(a) 55 | } 56 | } 57 | 58 | func (f *poolFactory) getProposal() *proposal { 59 | p := f.proposalPool.Get().(*proposal) 60 | p.data = nil 61 | p.future = nil 62 | return p 63 | } 64 | 65 | func (f *poolFactory) returnProposal(p *proposal) { 66 | if p != nil { 67 | f.proposalPool.Put(p) 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /statemachine.go: -------------------------------------------------------------------------------- 1 | // Copyright 2015 The etcd Authors 2 | // Modified work copyright 2018 The tiglabs Authors. 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 
15 | 16 | package raft 17 | 18 | import ( 19 | "github.com/tiglabs/raft/proto" 20 | ) 21 | 22 | // The StateMachine interface is supplied by the application to persist/snapshot data of application. 23 | type StateMachine interface { 24 | Apply(command []byte, index uint64) (interface{}, error) 25 | ApplyMemberChange(confChange *proto.ConfChange, index uint64) (interface{}, error) 26 | Snapshot() (proto.Snapshot, error) 27 | ApplySnapshot(peers []proto.Peer, iter proto.SnapIterator) error 28 | HandleFatalEvent(err *FatalError) 29 | HandleLeaderChange(leader uint64) 30 | } 31 | 32 | type SocketType byte 33 | 34 | const ( 35 | HeartBeat SocketType = 0 36 | Replicate SocketType = 1 37 | ) 38 | 39 | func (t SocketType) String() string { 40 | switch t { 41 | case 0: 42 | return "HeartBeat" 43 | case 1: 44 | return "Replicate" 45 | } 46 | return "unkown" 47 | } 48 | 49 | // The SocketResolver interface is supplied by the application to resolve NodeID to net.Addr addresses. 50 | type SocketResolver interface { 51 | NodeAddress(nodeID uint64, stype SocketType) (addr string, err error) 52 | } 53 | -------------------------------------------------------------------------------- /future.go: -------------------------------------------------------------------------------- 1 | // Copyright 2018 The tiglabs raft Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
// respErr is the error half of an asynchronous result.
type respErr struct {
	errCh chan error
}

// init allocates the error channel with room for one value, so a single
// respond call does not block.
func (e *respErr) init() {
	ch := make(chan error, 1)
	e.errCh = ch
}

// respond publishes err on the channel and then closes it.
func (e *respErr) respond(err error) {
	ch := e.errCh
	ch <- err
	close(ch)
}

// error exposes the error channel for receiving.
func (e *respErr) error() <-chan error {
	return e.errCh
}

// Future is the handle through which the result of an asynchronous
// operation is delivered: a value on respCh on success, or an error on the
// embedded respErr channel on failure.
type Future struct {
	respErr
	respCh chan interface{}
}

// newFuture returns a Future whose value and error channels each buffer one
// item.
func newFuture() *Future {
	f := new(Future)
	f.respCh = make(chan interface{}, 1)
	f.init()
	return f
}

// respond completes the future. A non-nil err is delivered on the error
// channel; otherwise resp is delivered on the value channel. The channel
// used is closed after the send.
func (f *Future) respond(resp interface{}, err error) {
	if err != nil {
		f.respErr.respond(err)
		return
	}
	f.respCh <- resp
	close(f.respCh)
}

// Response blocks until either a value or an error is available and
// returns whichever arrived.
func (f *Future) Response() (resp interface{}, err error) {
	select {
	case resp = <-f.respCh:
	case err = <-f.error():
	}
	return resp, err
}

// AsyncResponse exposes the value and error channels so the caller can
// select on them directly.
func (f *Future) AsyncResponse() (respCh <-chan interface{}, errCh <-chan error) {
	respCh, errCh = f.respCh, f.errCh
	return respCh, errCh
}
14 | 15 | package util 16 | 17 | import ( 18 | "fmt" 19 | "runtime" 20 | "runtime/debug" 21 | 22 | "github.com/tiglabs/raft/logger" 23 | ) 24 | 25 | func HandleCrash(handlers ...func(interface{})) { 26 | if r := recover(); r != nil { 27 | debug.PrintStack() 28 | logPanic(r) 29 | for _, fn := range handlers { 30 | fn(r) 31 | } 32 | } 33 | } 34 | 35 | func logPanic(r interface{}) { 36 | callers := "" 37 | for i := 0; true; i++ { 38 | _, file, line, ok := runtime.Caller(i) 39 | if !ok { 40 | break 41 | } 42 | callers = callers + fmt.Sprintf("%v:%v\n", file, line) 43 | } 44 | logger.Error("Recovered from panic: %#v (%v)\n%v", r, r, callers) 45 | } 46 | 47 | func RunWorker(f func(), handlers ...func(interface{})) { 48 | go func() { 49 | defer HandleCrash(handlers...) 50 | 51 | f() 52 | }() 53 | } 54 | 55 | func RunWorkerUtilStop(f func(), stopCh <-chan struct{}, handlers ...func(interface{})) { 56 | go func() { 57 | for { 58 | select { 59 | case <-stopCh: 60 | return 61 | 62 | default: 63 | func() { 64 | defer HandleCrash(handlers...) 65 | f() 66 | }() 67 | } 68 | } 69 | }() 70 | } 71 | -------------------------------------------------------------------------------- /storage/wal/entry_cache.go: -------------------------------------------------------------------------------- 1 | // Copyright 2018 The tiglabs raft Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package wal 16 | 17 | import ( 18 | "github.com/google/btree" 19 | "github.com/tiglabs/raft/proto" 20 | ) 21 | 22 | type cacheItem proto.Entry 23 | 24 | func (c *cacheItem) Less(than btree.Item) bool { 25 | return c.Index < than.(*cacheItem).Index 26 | } 27 | 28 | // cache中只保持最新的(index较大的)若干条日志 29 | type entryCache struct { 30 | capacity int 31 | ents *btree.BTree 32 | key *cacheItem 33 | } 34 | 35 | func newEntryCache(capacity int) *entryCache { 36 | return &entryCache{ 37 | capacity: capacity, 38 | ents: btree.New(4), 39 | key: new(cacheItem), 40 | } 41 | } 42 | 43 | func (c *entryCache) Get(index uint64) *proto.Entry { 44 | c.key.Index = index 45 | ent := c.ents.Get(c.key) 46 | if ent != nil { 47 | return (*proto.Entry)(ent.(*cacheItem)) 48 | } else { 49 | return nil 50 | } 51 | } 52 | 53 | func (c *entryCache) Append(ent *proto.Entry) { 54 | // 截断冲突的 55 | for c.ents.Len() > 0 && c.ents.Max().(*cacheItem).Index >= ent.Index { 56 | c.ents.DeleteMax() 57 | } 58 | 59 | c.ents.ReplaceOrInsert((*cacheItem)(ent)) 60 | 61 | // keep capacity 62 | for c.ents.Len() > c.capacity { 63 | c.ents.DeleteMin() 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /storage/wal/config.go: -------------------------------------------------------------------------------- 1 | // Copyright 2018 The tiglabs raft Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package wal 16 | 17 | import "github.com/tiglabs/raft/util" 18 | 19 | const ( 20 | DefaultFileCacheCapacity = 2 21 | DefaultFileSize = 32 * util.MB 22 | DefaultSync = false 23 | ) 24 | 25 | // Config wal config 26 | type Config struct { 27 | // FileCacheCapacity 缓存多少个打开的日志文件(包括index等) 28 | FileCacheCapacity int 29 | 30 | // FileSize 日志文件的大小 31 | FileSize int 32 | 33 | Sync bool 34 | 35 | // TruncateFirstDummy 初始化时添加一条日志然后截断 36 | TruncateFirstDummy bool 37 | } 38 | 39 | func (c *Config) GetFileCacheCapacity() int { 40 | if c == nil || c.FileCacheCapacity <= 0 { 41 | return DefaultFileCacheCapacity 42 | } 43 | return c.FileCacheCapacity 44 | } 45 | 46 | func (c *Config) GetFileSize() int { 47 | if c == nil || c.FileSize <= 0 { 48 | return DefaultFileSize 49 | } 50 | 51 | return c.FileSize 52 | } 53 | 54 | func (c *Config) GetSync() bool { 55 | if c == nil { 56 | return DefaultSync 57 | } 58 | return c.Sync 59 | } 60 | 61 | func (c *Config) GetTruncateFirstDummy() bool { 62 | if c == nil { 63 | return false 64 | } 65 | return c.TruncateFirstDummy 66 | } 67 | 68 | func (c *Config) dup() *Config { 69 | if c != nil { 70 | dc := *c 71 | return &dc 72 | } else { 73 | return nil 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /transport_multi.go: -------------------------------------------------------------------------------- 1 | // Copyright 2018 The tiglabs raft Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package raft 16 | 17 | import ( 18 | "github.com/tiglabs/raft/proto" 19 | "github.com/tiglabs/raft/util" 20 | ) 21 | 22 | type MultiTransport struct { 23 | heartbeat *heartbeatTransport 24 | replicate *replicateTransport 25 | } 26 | 27 | func NewMultiTransport(raft *RaftServer, config *TransportConfig) (Transport, error) { 28 | mt := new(MultiTransport) 29 | 30 | if ht, err := newHeartbeatTransport(raft, config); err != nil { 31 | return nil, err 32 | } else { 33 | mt.heartbeat = ht 34 | } 35 | if rt, err := newReplicateTransport(raft, config); err != nil { 36 | return nil, err 37 | } else { 38 | mt.replicate = rt 39 | } 40 | 41 | mt.heartbeat.start() 42 | mt.replicate.start() 43 | return mt, nil 44 | } 45 | 46 | func (t *MultiTransport) Stop() { 47 | t.heartbeat.stop() 48 | t.replicate.stop() 49 | } 50 | 51 | func (t *MultiTransport) Send(m *proto.Message) { 52 | // if m.IsElectionMsg() { 53 | if m.IsHeartbeatMsg() { 54 | t.heartbeat.send(m) 55 | } else { 56 | t.replicate.send(m) 57 | } 58 | } 59 | 60 | func (t *MultiTransport) SendSnapshot(m *proto.Message, rs *snapshotStatus) { 61 | t.replicate.sendSnapshot(m, rs) 62 | } 63 | 64 | func reciveMessage(r *util.BufferReader) (msg *proto.Message, err error) { 65 | msg = proto.GetMessage() 66 | if err = msg.Decode(r); err != nil { 67 | proto.ReturnMessage(msg) 68 | } 69 | return 70 | } 71 | -------------------------------------------------------------------------------- /storage/wal/bench/main.go: -------------------------------------------------------------------------------- 1 | // Copyright 2018 The tiglabs raft Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package main 16 | 17 | import ( 18 | "flag" 19 | "io/ioutil" 20 | "math/rand" 21 | "os" 22 | "time" 23 | 24 | "fmt" 25 | 26 | "github.com/tiglabs/raft/proto" 27 | "github.com/tiglabs/raft/storage/wal" 28 | ) 29 | 30 | var n = flag.Int("n", 100000, "requests") 31 | var l = flag.Int("l", 1024, "entry data length") 32 | 33 | const letters = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ" 34 | 35 | func randomEntry(rnd *rand.Rand, index uint64) *proto.Entry { 36 | data := make([]byte, *l) 37 | for i := 0; i < *l; i++ { 38 | data[i] = letters[rnd.Int()%len(letters)] 39 | } 40 | 41 | ent := &proto.Entry{ 42 | Index: index, 43 | Type: proto.EntryNormal, 44 | Term: uint64(rnd.Uint32()), 45 | Data: data, 46 | } 47 | 48 | return ent 49 | } 50 | 51 | func main() { 52 | flag.Parse() 53 | 54 | dir, err := ioutil.TempDir(os.TempDir(), "db_bench_") 55 | if err != nil { 56 | panic(err) 57 | } 58 | 59 | s, err := wal.NewStorage(dir, nil) 60 | if err != nil { 61 | panic(err) 62 | } 63 | 64 | start := time.Now() 65 | rnd := rand.New(rand.NewSource(time.Now().UnixNano())) 66 | reqs := *n 67 | for i := 0; i < reqs; i++ { 68 | ent := randomEntry(rnd, uint64(i)) 69 | if err := s.StoreEntries([]*proto.Entry{ent}); err != nil { 70 | panic(err) 71 | } 72 | } 73 | spend := time.Since(start) 74 | ops := (int64(reqs) * 1000) / (spend.Nanoseconds() / 1000000) 75 | fmt.Printf("write %d entries spend: %v, ops: %v\n", reqs, spend, ops) 76 | } 77 | -------------------------------------------------------------------------------- 
// datas is the payload written on every benchmark iteration.
var datas = []byte("abcjklds;lsdgkldsjgkdlsjglsdjqroeioewrotl;sgkdjkgsjkdgjsk129428309583908593952abcjklds;lsdgkldsjgkdlsjglsdjqroeioewrotl;sgkdjkgsjkdgjsk129428309583908593952\n")

// BenchmarkFSync measures a write of datas followed by File.Sync on a file
// in a temporary directory that is removed afterwards.
func BenchmarkFSync(b *testing.B) {
	dir, err := ioutil.TempDir(os.TempDir(), "fbase_test_rafts_sync_")
	if err != nil {
		b.Fatal(err)
	}
	defer os.RemoveAll(dir)

	f, err := os.Create(path.Join(dir, "test_fsync.data"))
	if err != nil {
		b.Fatal(err)
	}
	defer f.Close()

	b.ResetTimer()

	for i := 0; i < b.N; i++ {
		if _, err := f.Write(datas); err != nil {
			b.Error(err)
		}
		if err := f.Sync(); err != nil {
			b.Error(err)
		}
	}
}

// BenchmarkFDataSync measures a write of datas followed by
// syscall.Fdatasync on a file for which 10 MiB were requested up front with
// syscall.Fallocate.
func BenchmarkFDataSync(b *testing.B) {
	dir, err := ioutil.TempDir(os.TempDir(), "fbase_test_rafts_sync_")
	if err != nil {
		b.Fatal(err)
	}
	defer os.RemoveAll(dir)

	f, err := os.Create(path.Join(dir, "test_fdatasync.data"))
	// The create error used to be overwritten by the Fallocate result below
	// before anyone looked at it; check it first so the real cause is
	// reported.
	if err != nil {
		b.Fatal(err)
	}
	defer f.Close()

	if err := syscall.Fallocate(int(f.Fd()), 0, 0, 1024*1024*10); err != nil {
		b.Fatal(err)
	}

	b.ResetTimer()

	for i := 0; i < b.N; i++ {
		if _, err := f.Write(datas); err != nil {
			b.Error(err)
		}
		if err := syscall.Fdatasync(int(f.Fd())); err != nil {
			b.Error(err)
		}
	}
}
14 | 15 | package bufalloc 16 | 17 | import ( 18 | "sync" 19 | 20 | "github.com/tiglabs/raft/util" 21 | ) 22 | 23 | const ( 24 | baseSize = 15 25 | bigSize = 64 * util.KB 26 | ) 27 | 28 | var buffPool *bufferPool 29 | 30 | func init() { 31 | buffPool = &bufferPool{ 32 | baseline: [...]int{64, 128, 256, 512, util.KB, 2 * util.KB, 4 * util.KB, 8 * util.KB, 16 * util.KB, 32 * util.KB, 64 * util.KB, 128 * util.KB, 256 * util.KB, 512 * util.KB, util.MB}, 33 | } 34 | for i, n := range buffPool.baseline { 35 | buffPool.pool[i] = createPool(n) 36 | } 37 | buffPool.pool[baseSize] = createPool(0) 38 | } 39 | 40 | func createPool(n int) *sync.Pool { 41 | return &sync.Pool{ 42 | New: func() interface{} { 43 | if n == 0 || n > bigSize { 44 | return &ibuffer{} 45 | } 46 | return &ibuffer{buf: makeSlice(n)} 47 | }, 48 | } 49 | } 50 | 51 | type bufferPool struct { 52 | baseline [baseSize]int 53 | pool [baseSize + 1]*sync.Pool 54 | } 55 | 56 | func (p *bufferPool) getPoolNum(n int) int { 57 | for i, x := range p.baseline { 58 | if n <= x { 59 | return i 60 | } 61 | } 62 | return baseSize 63 | } 64 | 65 | func (p *bufferPool) getBuffer(n int) Buffer { 66 | num := p.getPoolNum(n) 67 | pool := p.pool[num] 68 | buf := pool.Get().(Buffer) 69 | if buf.Cap() < n { 70 | // return old buffer to pool 71 | buffPool.putBuffer(buf) 72 | buf = &ibuffer{buf: makeSlice(n)} 73 | } 74 | buf.Reset() 75 | return buf 76 | } 77 | 78 | func (p *bufferPool) putBuffer(buf Buffer) { 79 | num := p.getPoolNum(buf.Cap()) 80 | pool := p.pool[num] 81 | pool.Put(buf) 82 | } 83 | -------------------------------------------------------------------------------- /storage/wal/file_cache.go: -------------------------------------------------------------------------------- 1 | // Copyright 2018 The tiglabs raft Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package wal 16 | 17 | import "container/list" 18 | 19 | type openFunc func(logFileName) (*logEntryFile, error) 20 | 21 | type logFileCache struct { 22 | capacity int 23 | 24 | l *list.List 25 | m map[logFileName]*list.Element // key是seq 26 | 27 | f openFunc 28 | } 29 | 30 | func newLogFileCache(capacity int, f openFunc) *logFileCache { 31 | return &logFileCache{ 32 | capacity: capacity, 33 | l: list.New(), 34 | m: make(map[logFileName]*list.Element, capacity), 35 | f: f, 36 | } 37 | } 38 | 39 | func (lc *logFileCache) Get(name logFileName) (lf *logEntryFile, err error) { 40 | e, ok := lc.m[name] 41 | if ok { 42 | lf = (e.Value).(*logEntryFile) 43 | lc.l.MoveToFront(e) 44 | return 45 | } 46 | 47 | // 不存在打开新的 48 | lf, err = lc.f(name) 49 | if err != nil { 50 | return 51 | } 52 | // 缓存 53 | e = lc.l.PushFront(lf) 54 | lc.m[name] = e 55 | 56 | // keep capacity 57 | for lc.l.Len() > lc.capacity { 58 | e = lc.l.Back() 59 | df := (e.Value).(*logEntryFile) 60 | if err = lc.Delete(df.Name(), true); err != nil { 61 | return nil, err 62 | } 63 | } 64 | return 65 | } 66 | 67 | func (lc *logFileCache) Delete(name logFileName, close bool) error { 68 | e, ok := lc.m[name] 69 | if !ok { 70 | return nil 71 | } 72 | 73 | lf := e.Value.(*logEntryFile) 74 | if close { 75 | if err := lf.Close(); err != nil { 76 | return err 77 | } 78 | } 79 | delete(lc.m, lf.Name()) 80 | lc.l.Remove(e) 81 | return nil 82 | } 83 | 84 | func (lc *logFileCache) Close() (err error) { 85 | for _, e := range lc.m { 86 | f := (e.Value).(*logEntryFile) 
87 | err = f.Close() 88 | } 89 | return 90 | } 91 | -------------------------------------------------------------------------------- /util/conn.go: -------------------------------------------------------------------------------- 1 | // Copyright 2018 The tiglabs raft Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package util 16 | 17 | import ( 18 | "net" 19 | "time" 20 | ) 21 | 22 | type ConnTimeout struct { 23 | addr string 24 | conn net.Conn 25 | readTime time.Duration 26 | writeTime time.Duration 27 | } 28 | 29 | func DialTimeout(addr string, connTime time.Duration) (*ConnTimeout, error) { 30 | conn, err := net.DialTimeout("tcp", addr, connTime) 31 | if err != nil { 32 | return nil, err 33 | } 34 | 35 | conn.(*net.TCPConn).SetNoDelay(true) 36 | conn.(*net.TCPConn).SetLinger(0) 37 | conn.(*net.TCPConn).SetKeepAlive(true) 38 | return &ConnTimeout{conn: conn, addr: addr}, nil 39 | } 40 | 41 | func NewConnTimeout(conn net.Conn) *ConnTimeout { 42 | if conn == nil { 43 | return nil 44 | } 45 | 46 | conn.(*net.TCPConn).SetNoDelay(true) 47 | conn.(*net.TCPConn).SetLinger(0) 48 | conn.(*net.TCPConn).SetKeepAlive(true) 49 | return &ConnTimeout{conn: conn, addr: conn.RemoteAddr().String()} 50 | } 51 | 52 | func (c *ConnTimeout) SetReadTimeout(timeout time.Duration) { 53 | c.readTime = timeout 54 | } 55 | 56 | func (c *ConnTimeout) SetWriteTimeout(timeout time.Duration) { 57 | c.writeTime = timeout 58 | } 
59 | 60 | func (c *ConnTimeout) Read(p []byte) (n int, err error) { 61 | if c.readTime.Nanoseconds() > 0 { 62 | err = c.conn.SetReadDeadline(time.Now().Add(c.readTime)) 63 | if err != nil { 64 | return 65 | } 66 | } 67 | 68 | n, err = c.conn.Read(p) 69 | return 70 | } 71 | 72 | func (c *ConnTimeout) Write(p []byte) (n int, err error) { 73 | if c.writeTime.Nanoseconds() > 0 { 74 | err = c.conn.SetWriteDeadline(time.Now().Add(c.writeTime)) 75 | if err != nil { 76 | return 77 | } 78 | } 79 | 80 | n, err = c.conn.Write(p) 81 | return 82 | } 83 | 84 | func (c *ConnTimeout) RemoteAddr() string { 85 | return c.addr 86 | } 87 | 88 | func (c *ConnTimeout) Close() error { 89 | return c.conn.Close() 90 | } 91 | -------------------------------------------------------------------------------- /util/uvarint64_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2018 The tiglabs raft Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | package util 15 | 16 | import ( 17 | "bytes" 18 | "testing" 19 | ) 20 | 21 | var tests = []struct { 22 | decoded uint64 23 | n int 24 | encoded []byte 25 | }{ 26 | {0, 1, []byte{0x00}}, 27 | {1, 1, []byte{0x01}}, 28 | {240, 1, []byte{0xF0}}, 29 | {241, 2, []byte{0xF1, 0x01}}, 30 | {2287, 2, []byte{0xF8, 0xFF}}, 31 | 32 | {2288, 3, []byte{0xF9, 0x00, 0x00}}, 33 | {67823, 3, []byte{0xF9, 0xFF, 0xFF}}, 34 | {67824, 4, []byte{0xFA, 0x01, 0x08, 0xF0}}, 35 | {1<<24 - 1, 4, []byte{0xFA, 0xFF, 0xFF, 0xFF}}, 36 | {1 << 24, 5, []byte{0xFB, 0x01, 0x00, 0x00, 0x00}}, 37 | 38 | {1<<32 - 1, 5, []byte{0xFB, 0xFF, 0xFF, 0xFF, 0xFF}}, 39 | {1 << 32, 6, []byte{0xFC, 0x01, 0x00, 0x00, 0x00, 0x00}}, 40 | {1<<40 - 1, 6, []byte{0xFC, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF}}, 41 | {1 << 40, 7, []byte{0xFD, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00}}, 42 | {1<<48 - 1, 7, []byte{0xFD, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF}}, 43 | 44 | {1 << 48, 8, []byte{0xFE, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}}, 45 | {1<<56 - 1, 8, []byte{0xFE, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF}}, 46 | {1 << 56, 9, []byte{0xFF, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}}, 47 | {1<<64 - 1, 9, []byte{0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF}}, 48 | } 49 | 50 | func TestUvarint(t *testing.T) { 51 | for i, test := range tests { 52 | b := make([]byte, len(test.encoded)) 53 | n := PutUvarint64(b, test.decoded) 54 | if n != test.n { 55 | t.Errorf("encode %d: got %d want %d", i, n, test.n) 56 | } 57 | if !bytes.Equal(b, test.encoded) { 58 | t.Errorf("encode %d: got %v want %v", i, b[0:n], test.encoded) 59 | } 60 | v, n := Uvarint64(test.encoded) 61 | if n != test.n { 62 | t.Errorf("decode %d: got %d want %d", i, n, test.n) 63 | } 64 | if v != test.decoded { 65 | t.Errorf("decode %d: got %d want %d", i, v, test.decoded) 66 | } 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /status.go: 
// DownReplica describes a replica that is down.
type DownReplica struct {
	NodeID      uint64
	DownSeconds int
}

// ReplicaStatus is the replication state of one replica.
type ReplicaStatus struct {
	Match       uint64 // replication progress
	Commit      uint64 // commit position
	Next        uint64
	State       string
	Snapshoting bool
	Paused      bool
	Active      bool
	LastActive  time.Time
	Inflight    int
}

// Status is a snapshot of a raft instance's state.
type Status struct {
	ID                uint64
	NodeID            uint64
	Leader            uint64
	Term              uint64
	Index             uint64
	Commit            uint64
	Applied           uint64
	Vote              uint64
	PendQueue         int
	RecvQueue         int
	AppQueue          int
	Stopped           bool
	RestoringSnapshot bool
	State             string // leader, follower, candidate
	Replicas          map[uint64]*ReplicaStatus
}

// String renders the status as a single-line JSON object in which every
// value is a quoted string. The "status" field is "stopped", "snapshot" or
// "running" (checked in that order), and "replication" holds one object per
// replica keyed by replica ID.
//
// Replicas are emitted in ascending ID order so that the output is
// deterministic; ranging over the map directly produced a different order
// on every call. The format string is kept on one line so no newline or
// source indentation leaks into the output.
func (s *Status) String() string {
	st := "running"
	switch {
	case s.Stopped:
		st = "stopped"
	case s.RestoringSnapshot:
		st = "snapshot"
	}

	j := fmt.Sprintf(`{"id":"%v","nodeID":"%v","state":"%v","leader":"%v","term":"%v","index":"%v","commit":"%v","applied":"%v","vote":"%v","pendingQueue":"%v","recvQueue":"%v","applyQueue":"%v","status":"%v","replication":{`,
		s.ID, s.NodeID, s.State, s.Leader, s.Term, s.Index, s.Commit, s.Applied, s.Vote, s.PendQueue, s.RecvQueue, s.AppQueue, st)

	// Collect and order the replica IDs. An insertion sort keeps this block
	// free of additional imports; replica sets are small.
	ids := make([]uint64, 0, len(s.Replicas))
	for id := range s.Replicas {
		ids = append(ids, id)
	}
	for i := 1; i < len(ids); i++ {
		for k := i; k > 0 && ids[k-1] > ids[k]; k-- {
			ids[k-1], ids[k] = ids[k], ids[k-1]
		}
	}

	for i, id := range ids {
		v := s.Replicas[id]
		if i > 0 {
			j += ","
		}
		j += fmt.Sprintf(`"%v":{"match":"%v","commit":"%v","next":"%v","state":"%v","paused":"%v","inflight":"%v","active":"%v"}`,
			id, v.Match, v.Commit, v.Next, v.State, v.Paused, v.Inflight, v.Active)
	}
	return j + "}}"
}
14 | package wal 15 | 16 | import ( 17 | "bytes" 18 | "fmt" 19 | "math/rand" 20 | "time" 21 | 22 | "github.com/tiglabs/raft/proto" 23 | ) 24 | 25 | func compapreEntry(le, re *proto.Entry) error { 26 | if le.Index != re.Index { 27 | return fmt.Errorf("unmatch index: %d != %d", le.Index, re.Index) 28 | } 29 | if le.Type != re.Type { 30 | return fmt.Errorf("unmatch type: %d != %d", le.Type, re.Type) 31 | } 32 | if le.Term != re.Term { 33 | return fmt.Errorf("unmatch term: %d != %d", le.Term, re.Term) 34 | } 35 | if !bytes.Equal(le.Data, re.Data) { 36 | return fmt.Errorf("unmatch data: %s != %s", string(le.Data), string(re.Data)) 37 | } 38 | return nil 39 | } 40 | 41 | func compareEntries(lh, rh []*proto.Entry) error { 42 | if len(lh) != len(rh) { 43 | return fmt.Errorf("unmatch size: %d != %d", len(lh), len(rh)) 44 | } 45 | 46 | for i := 0; i < len(lh); i++ { 47 | le := lh[i] 48 | re := rh[i] 49 | if err := compapreEntry(le, re); err != nil { 50 | return fmt.Errorf("%v at %d", err, i) 51 | } 52 | } 53 | return nil 54 | } 55 | 56 | func genLogEntry(rnd *rand.Rand, i uint64) *proto.Entry { 57 | randType := func() proto.EntryType { 58 | switch rnd.Int() % 2 { 59 | case 0: 60 | return proto.EntryNormal 61 | default: 62 | return proto.EntryConfChange 63 | } 64 | } 65 | randTerm := func() uint64 { 66 | return uint64(rnd.Uint32()) 67 | } 68 | randData := func() []byte { 69 | const letters = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ" 70 | length := 10 + rnd.Int()%100 71 | buf := make([]byte, length) 72 | for i := 0; i < length; i++ { 73 | buf[i] = letters[rnd.Int()%len(letters)] 74 | } 75 | return buf 76 | } 77 | ent := &proto.Entry{ 78 | Index: i, 79 | Type: randType(), 80 | Term: randTerm(), 81 | Data: randData(), 82 | } 83 | return ent 84 | } 85 | 86 | func genLogEntries(lo, hi uint64) (ents []*proto.Entry) { 87 | rnd := rand.New(rand.NewSource(time.Now().UnixNano())) 88 | for i := lo; i < hi; i++ { 89 | ents = append(ents, genLogEntry(rnd, i)) 90 | } 91 | 
return 92 | } 93 | -------------------------------------------------------------------------------- /storage/wal/fileutil.go: -------------------------------------------------------------------------------- 1 | // Copyright 2018 The tiglabs raft Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package wal 16 | 17 | import ( 18 | "errors" 19 | "fmt" 20 | "os" 21 | "sort" 22 | ) 23 | 24 | // 目录初始化 不存在则创建;存在检查路径是否是目录 25 | func initDir(dir string) error { 26 | info, err := os.Stat(dir) 27 | if err != nil { 28 | if pathErr, ok := err.(*os.PathError); ok { 29 | if os.IsNotExist(pathErr) { 30 | return os.MkdirAll(dir, 0755) 31 | } 32 | } 33 | return err 34 | } 35 | 36 | if !info.IsDir() { 37 | return errors.New("fbase/raftstore: path is not directory") 38 | } 39 | 40 | return nil 41 | } 42 | 43 | // 日志文件名的组成 seq-index.log 44 | type logFileName struct { 45 | seq uint64 // 文件序号 46 | index uint64 // 起始index(log entry) 47 | } 48 | 49 | func (l *logFileName) String() string { 50 | return fmt.Sprintf("%016x-%016x.log", l.seq, l.index) 51 | } 52 | 53 | func (l *logFileName) ParseFrom(s string) bool { 54 | _, err := fmt.Sscanf(s, "%016x-%016x.log", &l.seq, &l.index) 55 | return err == nil 56 | } 57 | 58 | type nameSlice []logFileName 59 | 60 | func (s nameSlice) Len() int { return len(s) } 61 | func (s nameSlice) Swap(i, j int) { s[i], s[j] = s[j], s[i] } 62 | func (s nameSlice) Less(i, j int) bool { return 
s[i].seq < s[j].seq } 63 | 64 | // 枚举目录下的所有日志文件并按序号排序 65 | func listLogEntryFiles(path string) (fnames []logFileName, err error) { 66 | dir, err := os.Open(path) 67 | if err != nil { 68 | return nil, err 69 | } 70 | defer dir.Close() 71 | 72 | names, err := dir.Readdirnames(0) 73 | if err != nil { 74 | return nil, err 75 | } 76 | 77 | for _, name := range names { 78 | var n logFileName 79 | if n.ParseFrom(name) { 80 | fnames = append(fnames, n) 81 | } 82 | } 83 | sort.Sort(nameSlice(fnames)) 84 | return 85 | } 86 | 87 | // 退化版本的预分配空间 88 | func fallocDegraded(f *os.File, sizeInBytes int64) error { 89 | curOff, err := f.Seek(0, os.SEEK_CUR) 90 | if err != nil { 91 | return err 92 | } 93 | size, err := f.Seek(sizeInBytes, os.SEEK_END) 94 | if err != nil { 95 | return err 96 | } 97 | if _, err = f.Seek(curOff, os.SEEK_SET); err != nil { 98 | return err 99 | } 100 | if sizeInBytes > size { 101 | return nil 102 | } 103 | return f.Truncate(sizeInBytes) 104 | } 105 | -------------------------------------------------------------------------------- /storage/storage.go: -------------------------------------------------------------------------------- 1 | // Copyright 2015 The etcd Authors 2 | // Modified work copyright 2018 The tiglabs Authors. 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 
package storage

import (
	"github.com/tiglabs/raft/proto"
)

// Storage is an interface that may be implemented by the application to retrieve log entries from storage.
// If any Storage method returns an error, the raft instance will become inoperable and refuse to participate in elections;
// the application is responsible for cleanup and recovery in this case.
type Storage interface {
	// InitialState returns the saved HardState information used to initialize the replica state.
	InitialState() (proto.HardState, error)
	// Entries returns a slice of log entries in the range [lo,hi); hi is not inclusive.
	// maxSize limits the total size of the log entries returned, but Entries returns at least one entry if any.
	// If lo <= CompactIndex, isCompact is returned as true.
	// If there are no entries, the returned entries slice is nil.
	// Note: a maxSize of math.MaxUint32 means no limit.
	Entries(lo, hi uint64, maxSize uint64) (entries []*proto.Entry, isCompact bool, err error)
	// Term returns the term of entry i, which must be in the range [FirstIndex()-1, LastIndex()].
	// The term of the entry before FirstIndex is retained for matching purposes even though the
	// rest of that entry may not be available.
	// If i <= CompactIndex, isCompact is returned as true.
	Term(i uint64) (term uint64, isCompact bool, err error)
	// FirstIndex returns the index of the first log entry that is possibly available via Entries (older entries have been incorporated
	// into the latest Snapshot; if storage only contains the dummy entry the first log entry is not available).
	FirstIndex() (uint64, error)
	// LastIndex returns the index of the last entry in the log.
	LastIndex() (uint64, error)
	// StoreEntries stores the log entries to the repository.
	// If the first index of entries > LastIndex, all entries are appended;
	// otherwise the entries are written at their first index and the redundant log entries are truncated.
	StoreEntries(entries []*proto.Entry) error
	// StoreHardState stores the raft state to the repository.
	StoreHardState(st proto.HardState) error
	// Truncate truncates the log to index; the index is inclusive.
	Truncate(index uint64) error
	// ApplySnapshot syncs the snapshot status.
	ApplySnapshot(meta proto.SnapshotMeta) error
	// Close closes the storage.
	Close()
}

--------------------------------------------------------------------------------
/storage/wal/record.go:
--------------------------------------------------------------------------------
// Copyright 2018 The tiglabs raft Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package wal

import (
	"io"

	"encoding/binary"
	"fmt"
)

// Log file ({seq}.log) layout:
// [log record]
// ...
26 | // [log record] 27 | // [index record] 28 | // [footer record] 29 | 30 | // ErrCorrupt error 31 | type ErrCorrupt struct { 32 | filename string 33 | offset int64 34 | reason string 35 | } 36 | 37 | func (e *ErrCorrupt) Error() string { 38 | return fmt.Sprintf("corrput data at %s:%d (%v)", e.filename, e.offset, e.reason) 39 | } 40 | 41 | // NewCorruptError new 42 | func NewCorruptError(filename string, offset int64, reason string) *ErrCorrupt { 43 | return &ErrCorrupt{ 44 | filename: filename, 45 | offset: offset, 46 | reason: reason, 47 | } 48 | } 49 | 50 | type recordType uint8 51 | 52 | const ( 53 | recTypeLogEntry recordType = 1 54 | recTypeIndex recordType = 2 55 | recTypeFooter recordType = 3 56 | ) 57 | 58 | func (rt recordType) String() string { 59 | switch rt { 60 | case recTypeLogEntry: 61 | return "type-log" 62 | case recTypeIndex: 63 | return "type-index" 64 | case recTypeFooter: 65 | return "type-footer" 66 | default: 67 | return fmt.Sprintf("type-unknown(%d)", uint8(rt)) 68 | } 69 | } 70 | 71 | var footerMagic = []byte{'\xf9', '\xbf', '\x3e', '\x0a', '\xd3', '\xc5', '\xcc', '\x3f'} 72 | 73 | // record格式 74 | type record struct { 75 | recType recordType // 字节类型 76 | dataLen uint64 // 八字节大端数据长度 77 | data []byte // []byte recordData.Encode() 78 | crc uint32 // 固定四字节 79 | } 80 | 81 | // 一个record写入时最多需要多少字节的空间 82 | func recordSize(data recordData) int { 83 | return 1 + 8 + int(data.Size()) + 4 84 | } 85 | 86 | type recordData interface { 87 | Encode(w io.Writer) error 88 | Size() uint64 89 | } 90 | 91 | type footerRecord struct { 92 | indexOffset uint64 93 | magic []byte 94 | } 95 | 96 | func (fr footerRecord) Encode(w io.Writer) (err error) { 97 | buf := make([]byte, 8) 98 | binary.BigEndian.PutUint64(buf, fr.indexOffset) 99 | if _, err = w.Write(buf); err != nil { 100 | return 101 | } 102 | if _, err = w.Write(footerMagic); err != nil { 103 | return 104 | } 105 | return nil 106 | } 107 | 108 | func (fr footerRecord) Size() uint64 { 109 | return 16 
110 | } 111 | 112 | func (fr *footerRecord) Decode(data []byte) { 113 | fr.indexOffset = binary.BigEndian.Uint64(data) 114 | fr.magic = data[8 : 8+len(footerMagic)] 115 | } 116 | -------------------------------------------------------------------------------- /storage/wal/record_writer.go: -------------------------------------------------------------------------------- 1 | // Copyright 2018 The tiglabs raft Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package wal 16 | 17 | import ( 18 | "encoding/binary" 19 | "os" 20 | 21 | "github.com/tiglabs/raft/util" 22 | "github.com/tiglabs/raft/util/bufalloc" 23 | ) 24 | 25 | const initialBufferSize = 1024 * 32 26 | const flushTriggerSize = 1024 * 1024 27 | 28 | type recordWriter struct { 29 | f *os.File 30 | buf bufalloc.Buffer 31 | u64Buf []byte 32 | u32Buf []byte 33 | offset int64 34 | } 35 | 36 | func newRecordWriter(f *os.File) *recordWriter { 37 | return &recordWriter{ 38 | f: f, 39 | u64Buf: make([]byte, 8), 40 | u32Buf: make([]byte, 4), 41 | } 42 | } 43 | 44 | func (w *recordWriter) Write(recType recordType, data recordData) error { 45 | if w.buf == nil { 46 | w.buf = bufalloc.AllocBuffer(initialBufferSize) 47 | } 48 | 49 | w.buf.Grow(recordSize(data)) 50 | // write record type 51 | w.buf.WriteByte(byte(recType)) 52 | // write data size 53 | binary.BigEndian.PutUint64(w.u64Buf, data.Size()) 54 | w.buf.Write(w.u64Buf) 55 | // write data 56 | prevLen := w.buf.Len() 57 | data.Encode(w.buf) 58 | if uint64(w.buf.Len()-prevLen) != data.Size() { 59 | panic("fbase/raft/logstorage: unexpected data size when decode " + recType.String()) 60 | } 61 | // write crc 62 | crc := util.NewCRC(w.buf.Bytes()[w.buf.Len()-int(data.Size()):]) 63 | binary.BigEndian.PutUint32(w.u32Buf, crc.Value()) 64 | w.buf.Write(w.u32Buf) 65 | 66 | w.offset += int64(recordSize(data)) 67 | 68 | if err := w.tryToFlush(); err != nil { 69 | return err 70 | } 71 | 72 | return nil 73 | } 74 | 75 | func (w *recordWriter) tryToFlush() error { 76 | if w.buf != nil && w.buf.Len() >= flushTriggerSize { 77 | return w.Flush() 78 | } 79 | return nil 80 | } 81 | 82 | func (w *recordWriter) Offset() int64 { 83 | return w.offset 84 | } 85 | 86 | func (w *recordWriter) Truncate(offset int64) error { 87 | if err := w.f.Truncate(offset); err != nil { 88 | return err 89 | } 90 | w.offset = offset 91 | _, err := w.f.Seek(offset, os.SEEK_SET) 92 | return err 93 | } 94 | 95 | func (w *recordWriter) Flush() error { 96 
| if w.buf != nil && w.buf.Len() > 0 { 97 | _, err := w.buf.WriteTo(w.f) 98 | if err != nil { 99 | return err 100 | } 101 | } 102 | return nil 103 | } 104 | 105 | func (w *recordWriter) Sync() error { 106 | if err := w.Flush(); err != nil { 107 | return err 108 | } 109 | 110 | return w.f.Sync() 111 | } 112 | 113 | // 关闭写 114 | func (w *recordWriter) Close() error { 115 | if err := w.Sync(); err != nil { 116 | return err 117 | } 118 | if w.buf != nil { 119 | bufalloc.FreeBuffer(w.buf) 120 | } 121 | return nil 122 | } 123 | -------------------------------------------------------------------------------- /util/io.go: -------------------------------------------------------------------------------- 1 | // Copyright 2018 The tiglabs raft Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package util 16 | 17 | import ( 18 | "bufio" 19 | "errors" 20 | "io" 21 | ) 22 | 23 | var ( 24 | maxEmptyReads = 100 25 | err_reader_isnil = errors.New("BufferReader: reader is nil!") 26 | err_negative_count = errors.New("BufferReader: read return negative count!") 27 | err_no_progress = errors.New("BufferReader: multiple Read calls return no data or error!") 28 | err_too_large = errors.New("BufferReader: make byte slice too large!") 29 | ) 30 | 31 | type BufferReader struct { 32 | buf []byte 33 | reader io.Reader 34 | size int 35 | r, w int 36 | err error 37 | } 38 | 39 | func NewBufferReader(reader io.Reader, size int) *BufferReader { 40 | return &BufferReader{ 41 | reader: reader, 42 | size: size, 43 | buf: make([]byte, size), 44 | } 45 | } 46 | 47 | func (br *BufferReader) Reset() { 48 | if br.w > br.r { 49 | copy(br.buf, br.buf[br.r:br.w]) 50 | } 51 | br.w = br.w - br.r 52 | br.r = 0 53 | } 54 | 55 | func (br *BufferReader) ReadFull(min int) (data []byte, err error) { 56 | if br.reader == nil { 57 | return nil, err_reader_isnil 58 | } 59 | if min == 0 { 60 | err = br.err 61 | br.err = nil 62 | return make([]byte, 0, 0), err 63 | } 64 | 65 | if min > (cap(br.buf) - br.r) { 66 | br.Grow(min) 67 | } 68 | for (br.w-br.r) < min && err == nil { 69 | br.fill() 70 | err = br.err 71 | } 72 | if (br.w - br.r) >= min { 73 | data = br.buf[br.r : br.r+min] 74 | br.r = br.r + min 75 | err = nil 76 | } else { 77 | data = br.buf[br.r:br.w] 78 | br.r = br.w 79 | err = br.err 80 | br.err = nil 81 | } 82 | return 83 | } 84 | 85 | func (br *BufferReader) fill() { 86 | if br.w >= cap(br.buf) { 87 | br.Grow(br.w - br.r) 88 | } 89 | 90 | for i := maxEmptyReads; i > 0; i-- { 91 | n, err := br.reader.Read(br.buf[br.w:]) 92 | if n < 0 { 93 | panic(err_negative_count) 94 | } 95 | br.w = br.w + n 96 | if err != nil { 97 | br.err = err 98 | return 99 | } 100 | if n > 0 { 101 | return 102 | } 103 | } 104 | br.err = err_no_progress 105 | } 106 | 107 | func (br *BufferReader) 
Grow(n int) { 108 | defer func() { 109 | if recover() != nil { 110 | panic(err_too_large) 111 | } 112 | }() 113 | 114 | var buf []byte = nil 115 | if n > br.size { 116 | buf = make([]byte, n) 117 | } else { 118 | buf = make([]byte, br.size) 119 | } 120 | 121 | if br.w > br.r { 122 | copy(buf, br.buf[br.r:br.w]) 123 | } 124 | br.w = br.w - br.r 125 | br.r = 0 126 | br.buf = buf 127 | } 128 | 129 | type BufferWriter struct { 130 | *bufio.Writer 131 | } 132 | 133 | func NewBufferWriter(wr io.Writer, size int) *BufferWriter { 134 | return &BufferWriter{ 135 | Writer: bufio.NewWriterSize(wr, size), 136 | } 137 | } 138 | -------------------------------------------------------------------------------- /transport_heartbeat.go: -------------------------------------------------------------------------------- 1 | // Copyright 2018 The tiglabs raft Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package raft 16 | 17 | import ( 18 | "net" 19 | "sync" 20 | 21 | //"fmt" 22 | //"github.com/tiglabs/raft/logger" 23 | "github.com/tiglabs/raft/proto" 24 | "github.com/tiglabs/raft/util" 25 | ) 26 | 27 | type heartbeatTransport struct { 28 | config *TransportConfig 29 | raftServer *RaftServer 30 | listener net.Listener 31 | mu sync.RWMutex 32 | senders map[uint64]*transportSender 33 | stopc chan struct{} 34 | } 35 | 36 | func newHeartbeatTransport(raftServer *RaftServer, config *TransportConfig) (*heartbeatTransport, error) { 37 | var ( 38 | listener net.Listener 39 | err error 40 | ) 41 | 42 | if listener, err = net.Listen("tcp", config.HeartbeatAddr); err != nil { 43 | return nil, err 44 | } 45 | t := &heartbeatTransport{ 46 | config: config, 47 | raftServer: raftServer, 48 | listener: listener, 49 | senders: make(map[uint64]*transportSender), 50 | stopc: make(chan struct{}), 51 | } 52 | return t, nil 53 | } 54 | 55 | func (t *heartbeatTransport) stop() { 56 | t.mu.Lock() 57 | defer t.mu.Unlock() 58 | 59 | select { 60 | case <-t.stopc: 61 | return 62 | default: 63 | close(t.stopc) 64 | t.listener.Close() 65 | for _, s := range t.senders { 66 | s.stop() 67 | } 68 | } 69 | } 70 | 71 | func (t *heartbeatTransport) start() { 72 | util.RunWorkerUtilStop(func() { 73 | for { 74 | select { 75 | case <-t.stopc: 76 | return 77 | default: 78 | conn, err := t.listener.Accept() 79 | if err != nil { 80 | continue 81 | } 82 | t.handleConn(util.NewConnTimeout(conn)) 83 | } 84 | } 85 | }, t.stopc) 86 | } 87 | 88 | func (t *heartbeatTransport) handleConn(conn *util.ConnTimeout) { 89 | util.RunWorker(func() { 90 | defer conn.Close() 91 | 92 | bufRd := util.NewBufferReader(conn, 16*KB) 93 | for { 94 | select { 95 | case <-t.stopc: 96 | return 97 | default: 98 | if msg, err := reciveMessage(bufRd); err != nil { 99 | return 100 | } else { 101 | //logger.Debug(fmt.Sprintf("Recive %v from (%v)", msg.ToString(), conn.RemoteAddr())) 102 | t.raftServer.reciveMessage(msg) 103 | } 
104 | } 105 | } 106 | }) 107 | } 108 | 109 | func (t *heartbeatTransport) send(msg *proto.Message) { 110 | s := t.getSender(msg.To) 111 | s.send(msg) 112 | } 113 | 114 | func (t *heartbeatTransport) getSender(nodeId uint64) *transportSender { 115 | t.mu.RLock() 116 | sender, ok := t.senders[nodeId] 117 | t.mu.RUnlock() 118 | if ok { 119 | return sender 120 | } 121 | 122 | t.mu.Lock() 123 | defer t.mu.Unlock() 124 | if sender, ok = t.senders[nodeId]; !ok { 125 | sender = newTransportSender(nodeId, 1, 64, HeartBeat, t.config.Resolver) 126 | t.senders[nodeId] = sender 127 | } 128 | return sender 129 | } 130 | -------------------------------------------------------------------------------- /util/uvarint64.go: -------------------------------------------------------------------------------- 1 | // Copyright 2018 The tiglabs raft Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package util 16 | 17 | func Uvarint64(buf []byte) (uint64, int) { 18 | if buf[0] <= 0xF0 { 19 | return uint64(buf[0]), 1 20 | } 21 | if buf[0] <= 0xF8 { 22 | return 240 + 256*(uint64(buf[0])-241) + uint64(buf[1]), 2 23 | } 24 | if buf[0] == 0xF9 { 25 | return 2288 + 256*uint64(buf[1]) + uint64(buf[2]), 3 26 | } 27 | if buf[0] == 0xFA { 28 | return uint64(buf[1])<<16 | uint64(buf[2])<<8 | uint64(buf[3]), 4 29 | } 30 | if buf[0] == 0xFB { 31 | return uint64(buf[1])<<24 | uint64(buf[2])<<16 | uint64(buf[3])<<8 | uint64(buf[4]), 5 32 | } 33 | if buf[0] == 0xFC { 34 | return uint64(buf[1])<<32 | uint64(buf[2])<<24 | uint64(buf[3])<<16 | uint64(buf[4])<<8 | uint64(buf[5]), 6 35 | } 36 | if buf[0] == 0xFD { 37 | return uint64(buf[1])<<40 | uint64(buf[2])<<32 | uint64(buf[3])<<24 | uint64(buf[4])<<16 | uint64(buf[5])<<8 | uint64(buf[6]), 7 38 | } 39 | if buf[0] == 0xFE { 40 | return uint64(buf[1])<<48 | uint64(buf[2])<<40 | uint64(buf[3])<<32 | uint64(buf[4])<<24 | uint64(buf[5])<<16 | uint64(buf[6])<<8 | uint64(buf[7]), 8 41 | } 42 | return uint64(buf[1])<<56 | uint64(buf[2])<<48 | uint64(buf[3])<<40 | uint64(buf[4])<<32 | uint64(buf[5])<<24 | uint64(buf[6])<<16 | uint64(buf[7])<<8 | uint64(buf[8]), 9 43 | } 44 | 45 | func PutUvarint64(buf []byte, x uint64) int { 46 | if x < 241 { 47 | buf[0] = byte(x) 48 | return 1 49 | } 50 | if x < 2288 { 51 | buf[0] = byte((x-240)/256 + 241) 52 | buf[1] = byte((x - 240) % 256) 53 | return 2 54 | } 55 | if x < 67824 { 56 | buf[0] = 0xF9 57 | buf[1] = byte((x - 2288) / 256) 58 | buf[2] = byte((x - 2288) % 256) 59 | return 3 60 | } 61 | if x < 1<<24 { 62 | buf[0] = 0xFA 63 | buf[1] = byte(x >> 16) 64 | buf[2] = byte(x >> 8) 65 | buf[3] = byte(x) 66 | return 4 67 | } 68 | if x < 1<<32 { 69 | buf[0] = 0xFB 70 | buf[1] = byte(x >> 24) 71 | buf[2] = byte(x >> 16) 72 | buf[3] = byte(x >> 8) 73 | buf[4] = byte(x) 74 | return 5 75 | } 76 | if x < 1<<40 { 77 | buf[0] = 0xFC 78 | buf[1] = byte(x >> 32) 79 | buf[2] = byte(x >> 24) 80 | 
buf[3] = byte(x >> 16) 81 | buf[4] = byte(x >> 8) 82 | buf[5] = byte(x) 83 | return 6 84 | } 85 | if x < 1<<48 { 86 | buf[0] = 0xFD 87 | buf[1] = byte(x >> 40) 88 | buf[2] = byte(x >> 32) 89 | buf[3] = byte(x >> 24) 90 | buf[4] = byte(x >> 16) 91 | buf[5] = byte(x >> 8) 92 | buf[6] = byte(x) 93 | return 7 94 | } 95 | if x < 1<<56 { 96 | buf[0] = 0xFE 97 | buf[1] = byte(x >> 48) 98 | buf[2] = byte(x >> 40) 99 | buf[3] = byte(x >> 32) 100 | buf[4] = byte(x >> 24) 101 | buf[5] = byte(x >> 16) 102 | buf[6] = byte(x >> 8) 103 | buf[7] = byte(x) 104 | return 8 105 | } 106 | buf[0] = 0xFF 107 | buf[1] = byte(x >> 56) 108 | buf[2] = byte(x >> 48) 109 | buf[3] = byte(x >> 40) 110 | buf[4] = byte(x >> 32) 111 | buf[5] = byte(x >> 24) 112 | buf[6] = byte(x >> 16) 113 | buf[7] = byte(x >> 8) 114 | buf[8] = byte(x) 115 | return 9 116 | } 117 | -------------------------------------------------------------------------------- /storage/wal/meta.go: -------------------------------------------------------------------------------- 1 | // Copyright 2018 The tiglabs raft Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package wal 16 | 17 | import ( 18 | "encoding/binary" 19 | "io" 20 | "os" 21 | "path" 22 | 23 | "github.com/tiglabs/raft/proto" 24 | "github.com/tiglabs/raft/util/bufalloc" 25 | ) 26 | 27 | type truncateMeta struct { 28 | truncIndex uint64 29 | truncTerm uint64 30 | } 31 | 32 | func (m truncateMeta) Size() uint64 { 33 | return 16 34 | } 35 | 36 | func (m truncateMeta) Encode(b []byte) { 37 | binary.BigEndian.PutUint64(b, m.truncIndex) 38 | binary.BigEndian.PutUint64(b[8:], m.truncTerm) 39 | } 40 | 41 | func (m *truncateMeta) Decode(b []byte) { 42 | m.truncIndex = binary.BigEndian.Uint64(b) 43 | m.truncTerm = binary.BigEndian.Uint64(b[8:]) 44 | } 45 | 46 | // 存储HardState和truncateMeta信息 47 | type metaFile struct { 48 | f *os.File 49 | truncOffset int64 50 | } 51 | 52 | func openMetaFile(dir string) (mf *metaFile, hs proto.HardState, meta truncateMeta, err error) { 53 | f, err := os.OpenFile(path.Join(dir, "META"), os.O_RDWR|os.O_CREATE, 0600) 54 | if err != nil { 55 | return 56 | } 57 | 58 | mf = &metaFile{ 59 | f: f, 60 | truncOffset: int64(hs.Size()), 61 | } 62 | 63 | hs, meta, err = mf.load() 64 | return mf, hs, meta, err 65 | } 66 | 67 | func (mf *metaFile) load() (hs proto.HardState, meta truncateMeta, err error) { 68 | // load hardstate 69 | hs_size := int(hs.Size()) 70 | buffer := bufalloc.AllocBuffer(hs_size) 71 | defer bufalloc.FreeBuffer(buffer) 72 | 73 | buf := buffer.Alloc(hs_size) 74 | n, err := mf.f.Read(buf) 75 | if err != nil { 76 | if err == io.EOF { 77 | err = nil 78 | return 79 | } 80 | return 81 | } 82 | if n != hs_size { 83 | err = NewCorruptError("META", 0, "wrong hardstate data size") 84 | return 85 | } 86 | hs.Decode(buf) 87 | 88 | // load trunc meta 89 | buffer.Reset() 90 | mt_size := int(meta.Size()) 91 | buf = buffer.Alloc(mt_size) 92 | n, err = mf.f.Read(buf) 93 | if err != nil { 94 | if err == io.EOF { 95 | err = nil 96 | return 97 | } 98 | return 99 | } 100 | if n != mt_size { 101 | err = NewCorruptError("META", 0, "wrong 
truncmeta data size") 102 | return 103 | } 104 | meta.Decode(buf) 105 | return 106 | } 107 | 108 | func (mf *metaFile) Close() error { 109 | return mf.f.Close() 110 | } 111 | 112 | func (mf *metaFile) SaveTruncateMeta(meta truncateMeta) error { 113 | mt_size := int(meta.Size()) 114 | buffer := bufalloc.AllocBuffer(mt_size) 115 | defer bufalloc.FreeBuffer(buffer) 116 | 117 | b := buffer.Alloc(mt_size) 118 | meta.Encode(b) 119 | _, err := mf.f.WriteAt(b, mf.truncOffset) 120 | return err 121 | } 122 | 123 | func (mf *metaFile) SaveHardState(hs proto.HardState) error { 124 | hs_size := int(hs.Size()) 125 | buffer := bufalloc.AllocBuffer(hs_size) 126 | defer bufalloc.FreeBuffer(buffer) 127 | 128 | b := buffer.Alloc(hs_size) 129 | hs.Encode(b) 130 | _, err := mf.f.WriteAt(b, 0) 131 | return err 132 | } 133 | 134 | func (mf *metaFile) Sync() error { 135 | return mf.f.Sync() 136 | } 137 | -------------------------------------------------------------------------------- /storage/wal/log_index.go: -------------------------------------------------------------------------------- 1 | // Copyright 2018 The tiglabs raft Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package wal 16 | 17 | import ( 18 | "encoding/binary" 19 | "fmt" 20 | "io" 21 | 22 | "github.com/tiglabs/raft/proto" 23 | ) 24 | 25 | const indexItemSize = 8 + 8 + 4 26 | 27 | type indexItem struct { 28 | logindex uint64 // 日志的index 29 | logterm uint64 // 日志的term 30 | offset uint32 // 日志在文件中的偏移 31 | } 32 | 33 | type logEntryIndex []indexItem 34 | 35 | func (li logEntryIndex) First() uint64 { 36 | if len(li) == 0 { 37 | return 0 38 | } 39 | return li[0].logindex 40 | } 41 | 42 | func (li logEntryIndex) Last() uint64 { 43 | size := len(li) 44 | if size == 0 { 45 | return 0 46 | } 47 | 48 | return li[size-1].logindex 49 | } 50 | 51 | func (li logEntryIndex) Get(i uint64) (item indexItem, err error) { 52 | size := len(li) 53 | if size == 0 { 54 | err = fmt.Errorf("maybe index(%d) is out of bound lastindex(%d)", i, li.Last()) 55 | return 56 | } 57 | 58 | ibegin := li[0].logindex 59 | iend := li[size-1].logindex 60 | if i < ibegin || i > iend { 61 | err = fmt.Errorf("maybe index(%d) is out of bound lastindex(%d)", i, li.Last()) 62 | return 63 | } 64 | return li[i-ibegin], nil 65 | } 66 | 67 | func (li logEntryIndex) Append(offset uint32, entry *proto.Entry) logEntryIndex { 68 | return append(li, indexItem{ 69 | logindex: entry.Index, 70 | logterm: entry.Term, 71 | offset: offset, 72 | }) 73 | } 74 | 75 | func (li logEntryIndex) Truncate(i uint64) (logEntryIndex, error) { 76 | if _, err := li.Get(i); err != nil { 77 | return nil, err 78 | } 79 | 80 | return li[:i-li[0].logindex], nil 81 | } 82 | 83 | func (li logEntryIndex) Len() int { 84 | return len(li) 85 | } 86 | 87 | // 实现recordData接口Encode方法 88 | func (li logEntryIndex) Encode(w io.Writer) (err error) { 89 | u32Buf := make([]byte, 4) 90 | u64Buf := make([]byte, 8) 91 | 92 | // write index items count 93 | binary.BigEndian.PutUint32(u32Buf, uint32(li.Len())) 94 | if _, err = w.Write(u32Buf); err != nil { 95 | return 96 | } 97 | 98 | // write indexs data 99 | for _, item := range li { 100 | // logindex 101 | 
binary.BigEndian.PutUint64(u64Buf, item.logindex) 102 | if _, err = w.Write(u64Buf); err != nil { 103 | return 104 | } 105 | // logitem 106 | binary.BigEndian.PutUint64(u64Buf, item.logterm) 107 | if _, err = w.Write(u64Buf); err != nil { 108 | return 109 | } 110 | // logoffset 111 | binary.BigEndian.PutUint32(u32Buf, item.offset) 112 | if _, err = w.Write(u32Buf); err != nil { 113 | return 114 | } 115 | } 116 | return 117 | } 118 | 119 | // 实现recordData接口Size方法 120 | func (li logEntryIndex) Size() uint64 { 121 | return uint64(4 + li.Len()*indexItemSize) 122 | } 123 | 124 | func decodeLogIndex(data []byte) logEntryIndex { 125 | offset := 0 126 | 127 | nItems := binary.BigEndian.Uint32(data[offset:]) 128 | offset += 4 129 | li := make([]indexItem, nItems) 130 | 131 | for i := 0; i < int(nItems); i++ { 132 | li[i].logindex = binary.BigEndian.Uint64(data[offset:]) 133 | offset += 8 134 | li[i].logterm = binary.BigEndian.Uint64(data[offset:]) 135 | offset += 8 136 | li[i].offset = binary.BigEndian.Uint32(data[offset:]) 137 | offset += 4 138 | } 139 | return li 140 | } 141 | -------------------------------------------------------------------------------- /util/bufalloc/buffer.go: -------------------------------------------------------------------------------- 1 | // Copyright 2009 The Go Authors. All rights reserved. 2 | // Modified work copyright 2018 The tiglabs Authors. 3 | // Use of this source code is governed by a BSD-style 4 | // license that can be found in the LICENSE file. 5 | 6 | package bufalloc 7 | 8 | import ( 9 | "io" 10 | ) 11 | 12 | // A Buffer is a variable-sized buffer of bytes with Read and Write methods. 13 | type Buffer interface { 14 | // Alloc allocs n bytes of slice from the buffer, growing the buffer as needed. 15 | // If n is negative, Alloc will panic. If the buffer can't grow it will panic with bytes.ErrTooLarge. 16 | Alloc(n int) []byte 17 | // Truncate discards all but the first n unread bytes from the buffer. 
18 | // It panics if n is negative or greater than the length of the buffer. 19 | Truncate(n int) 20 | // Grow grows the buffer's capacity, if necessary, to guarantee space for n bytes. 21 | // If n is negative, Grow will panic. If the buffer can't grow it will panic with bytes.ErrTooLarge. 22 | Grow(n int) 23 | // Write appends the contents of p to the buffer, growing the buffer as needed. 24 | // The return value n is the length of p; err is always nil. 25 | // If the buffer becomes too large, Write will panic with bytes.ErrTooLarge. 26 | Write(p []byte) (n int, err error) 27 | // WriteByte appends the byte c to the buffer, growing the buffer as needed. 28 | // If the buffer becomes too large, WriteByte will panic with bytes.ErrTooLarge. 29 | WriteByte(c byte) error 30 | // WriteTo writes data to w until the buffer is drained or an error occurs. 31 | // The return value n is the number of bytes written; 32 | // Any error encountered during the write is also returned. 33 | WriteTo(w io.Writer) (n int64, err error) 34 | // Read reads the next len(p) bytes from the buffer or until the buffer is drained. 35 | // The return value n is the number of bytes read. 36 | // If the buffer has no data to return, err is io.EOF (unless len(p) is zero); otherwise it is nil. 37 | Read(p []byte) (n int, err error) 38 | // ReadByte reads and returns the next byte from the buffer. 39 | // If no byte is available, it returns error io.EOF. 40 | ReadByte() (c byte, err error) 41 | // ReadBytes reads until the first occurrence of delim in the input, 42 | // returning a slice containing the data up to and including the delimiter. 43 | // If ReadBytes encounters an error before finding a delimiter, it returns the data read before the error and the error itself (often io.EOF). 44 | // ReadBytes returns err != nil if and only if the returned data does not end in delim. 
45 | ReadBytes(delim byte) (line []byte, err error) 46 | // ReadFrom reads data from r until EOF and appends it to the buffer, growing the buffer as needed. 47 | // The return value n is the number of bytes read. Any error except io.EOF encountered during the read is also returned. 48 | // If the buffer becomes too large, ReadFrom will panic with bytes.ErrTooLarge. 49 | ReadFrom(r io.Reader) (n int64, err error) 50 | // Bytes returns a slice of the contents of the unread portion of the buffer; 51 | // If the caller changes the contents of the returned slice, the contents of the buffer will change, 52 | // provided there are no intervening method calls on the Buffer. 53 | Bytes() []byte 54 | // Next returns a slice containing the next n bytes from the buffer, advancing the buffer as if the bytes had been returned by Read. 55 | // If there are fewer than n bytes in the buffer, Next returns the entire buffer. 56 | // The slice is only valid until the next call to a read or write method. 57 | Next(n int) []byte 58 | // Reset resets the buffer so it has no content. 59 | // b.Reset() is the same as b.Truncate(0). 60 | Reset() 61 | // String returns the contents of the unread portion of the buffer as a string. 62 | // If the Buffer is a nil pointer, it returns "". 63 | String() string 64 | // Len returns the number of bytes of the unread portion of the buffer; 65 | Len() int 66 | // Cap returns the capacity of the buffer. 67 | Cap() int 68 | } 69 | 70 | func AllocBuffer(n int) Buffer { 71 | return buffPool.getBuffer(n) 72 | } 73 | 74 | func FreeBuffer(buf Buffer) { 75 | buffPool.putBuffer(buf) 76 | } 77 | -------------------------------------------------------------------------------- /raft_log_unstable.go: -------------------------------------------------------------------------------- 1 | // Copyright 2015 The etcd Authors 2 | // Modified work copyright 2018 The tiglabs Authors. 
3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 15 | 16 | package raft 17 | 18 | import ( 19 | "fmt" 20 | 21 | "github.com/tiglabs/raft/logger" 22 | "github.com/tiglabs/raft/proto" 23 | ) 24 | 25 | // unstable temporary deposit the unpersistent log entries.It has log position i+unstable.offset. 26 | // unstable can support group commit. 27 | // Note that unstable.offset may be less than the highest log position in storage; 28 | // this means that the next write to storage might need to truncate the log before persisting unstable.entries. 29 | type unstable struct { 30 | offset uint64 31 | // all entries that have not yet been written to storage. 32 | entries []*proto.Entry 33 | } 34 | 35 | // maybeLastIndex returns the last index if it has at least one unstable entry. 36 | func (u *unstable) maybeLastIndex() (uint64, bool) { 37 | if l := len(u.entries); l != 0 { 38 | return u.offset + uint64(l) - 1, true 39 | } 40 | return 0, false 41 | } 42 | 43 | // myabeTerm returns the term of the entry at index i, if there is any. 
44 | func (u *unstable) maybeTerm(i uint64) (uint64, bool) { 45 | if i < u.offset { 46 | return 0, false 47 | } 48 | 49 | last, ok := u.maybeLastIndex() 50 | if !ok || i > last { 51 | return 0, false 52 | } 53 | return u.entries[i-u.offset].Term, true 54 | } 55 | 56 | func (u *unstable) stableTo(i, t uint64) { 57 | gt, ok := u.maybeTerm(i) 58 | if !ok { 59 | return 60 | } 61 | if gt == t && i >= u.offset { 62 | l := uint64(len(u.entries)) 63 | diff := l - (i + 1 - u.offset) 64 | if diff > 0 { 65 | copy(u.entries, u.entries[i+1-u.offset:l]) 66 | } 67 | for k := diff; k < l; k++ { 68 | u.entries[k] = nil 69 | } 70 | u.entries = u.entries[0:diff] 71 | u.offset = i + 1 72 | } 73 | } 74 | 75 | func (u *unstable) restore(index uint64) { 76 | for i, l := 0, len(u.entries); i < l; i++ { 77 | u.entries[i] = nil 78 | } 79 | u.entries = u.entries[0:0] 80 | u.offset = index + 1 81 | } 82 | 83 | func (u *unstable) truncateAndAppend(ents []*proto.Entry) { 84 | after := ents[0].Index 85 | switch { 86 | case after == u.offset+uint64(len(u.entries)): 87 | // after is the next index in the u.entries directly append 88 | u.entries = append(u.entries, ents...) 89 | 90 | case after <= u.offset: 91 | // The log is being truncated to before our current offset portion, so set the offset and replace the entries 92 | for i, l := 0, len(u.entries); i < l; i++ { 93 | u.entries[i] = nil 94 | } 95 | u.entries = append(u.entries[0:0], ents...) 96 | u.offset = after 97 | 98 | default: 99 | // truncate to after and copy to u.entries then append 100 | u.entries = append(u.entries[0:0], u.slice(u.offset, after)...) 101 | u.entries = append(u.entries, ents...) 
102 | } 103 | } 104 | 105 | func (u *unstable) slice(lo uint64, hi uint64) []*proto.Entry { 106 | u.mustCheckOutOfBounds(lo, hi) 107 | return u.entries[lo-u.offset : hi-u.offset] 108 | } 109 | 110 | // u.offset <= lo <= hi <= u.offset+len(u.offset) 111 | func (u *unstable) mustCheckOutOfBounds(lo, hi uint64) { 112 | if lo > hi { 113 | errMsg := fmt.Sprintf("unstable.slice[%d,%d) is invalid.", lo, hi) 114 | logger.Error(errMsg) 115 | panic(AppPanicError(errMsg)) 116 | } 117 | upper := u.offset + uint64(len(u.entries)) 118 | if lo < u.offset || hi > upper { 119 | errMsg := fmt.Sprintf("unstable.slice[%d,%d) out of bound [%d,%d].", lo, hi, u.offset, upper) 120 | logger.Error(errMsg) 121 | panic(AppPanicError(errMsg)) 122 | } 123 | } 124 | -------------------------------------------------------------------------------- /raft_fsm_candidate.go: -------------------------------------------------------------------------------- 1 | // Copyright 2015 The etcd Authors 2 | // Modified work copyright 2018 The tiglabs Authors. 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 
15 | 16 | package raft 17 | 18 | import ( 19 | "fmt" 20 | 21 | "github.com/tiglabs/raft/logger" 22 | "github.com/tiglabs/raft/proto" 23 | ) 24 | 25 | func (r *raftFsm) becomeCandidate() { 26 | if r.state == stateLeader { 27 | panic(AppPanicError(fmt.Sprintf("[raft->becomeCandidate][%v] invalid transition [leader -> candidate].", r.id))) 28 | } 29 | 30 | r.step = stepCandidate 31 | r.reset(r.term+1, 0, false) 32 | r.tick = r.tickElection 33 | r.vote = r.config.NodeID 34 | r.state = stateCandidate 35 | 36 | if logger.IsEnableDebug() { 37 | logger.Debug("raft[%v] became candidate at term %d.", r.id, r.term) 38 | } 39 | } 40 | 41 | func stepCandidate(r *raftFsm, m *proto.Message) { 42 | switch m.Type { 43 | case proto.LocalMsgProp: 44 | if logger.IsEnableDebug() { 45 | logger.Debug("raft[%v] no leader at term %d; dropping proposal.", r.id, r.term) 46 | } 47 | proto.ReturnMessage(m) 48 | return 49 | 50 | case proto.ReqMsgAppend: 51 | r.becomeFollower(r.term, m.From) 52 | r.handleAppendEntries(m) 53 | proto.ReturnMessage(m) 54 | return 55 | 56 | case proto.ReqMsgHeartBeat: 57 | r.becomeFollower(r.term, m.From) 58 | return 59 | 60 | case proto.ReqMsgElectAck: 61 | r.becomeFollower(r.term, m.From) 62 | nmsg := proto.GetMessage() 63 | nmsg.Type = proto.RespMsgElectAck 64 | nmsg.To = m.From 65 | r.send(nmsg) 66 | proto.ReturnMessage(m) 67 | return 68 | 69 | case proto.ReqMsgVote: 70 | if logger.IsEnableDebug() { 71 | logger.Debug("raft[%v] [logterm: %d, index: %d, vote: %v] rejected vote from %v [logterm: %d, index: %d] at term %d.", r.id, r.raftLog.lastTerm(), r.raftLog.lastIndex(), r.vote, m.From, m.LogTerm, m.Index, r.term) 72 | } 73 | nmsg := proto.GetMessage() 74 | nmsg.Type = proto.RespMsgVote 75 | nmsg.To = m.From 76 | nmsg.Reject = true 77 | r.send(nmsg) 78 | proto.ReturnMessage(m) 79 | return 80 | 81 | case proto.RespMsgVote: 82 | gr := r.poll(m.From, !m.Reject) 83 | if logger.IsEnableDebug() { 84 | logger.Debug("raft[%v] [q:%d] has received %d votes and %d vote 
rejections.", r.id, r.quorum(), gr, len(r.votes)-gr) 85 | } 86 | switch r.quorum() { 87 | case gr: 88 | if r.config.LeaseCheck { 89 | r.becomeElectionAck() 90 | } else { 91 | r.becomeLeader() 92 | r.bcastAppend() 93 | } 94 | case len(r.votes) - gr: 95 | r.becomeFollower(r.term, NoLeader) 96 | } 97 | } 98 | } 99 | 100 | func (r *raftFsm) campaign(force bool) { 101 | r.becomeCandidate() 102 | if r.quorum() == r.poll(r.config.NodeID, true) { 103 | if r.config.LeaseCheck { 104 | r.becomeElectionAck() 105 | } else { 106 | r.becomeLeader() 107 | } 108 | return 109 | } 110 | 111 | for id := range r.replicas { 112 | if id == r.config.NodeID { 113 | continue 114 | } 115 | li, lt := r.raftLog.lastIndexAndTerm() 116 | if logger.IsEnableDebug() { 117 | logger.Debug("[raft->campaign][%v logterm: %d, index: %d] sent vote request to %v at term %d.", r.id, lt, li, id, r.term) 118 | } 119 | 120 | m := proto.GetMessage() 121 | m.To = id 122 | m.Type = proto.ReqMsgVote 123 | m.ForceVote = force 124 | m.Index = li 125 | m.LogTerm = lt 126 | r.send(m) 127 | } 128 | } 129 | 130 | func (r *raftFsm) poll(id uint64, v bool) (granted int) { 131 | if logger.IsEnableDebug() { 132 | if v { 133 | logger.Debug("raft[%v] received vote from %v at term %d.", r.id, id, r.term) 134 | } else { 135 | logger.Debug("raft[%v] received vote rejection from %v at term %d.", r.id, id, r.term) 136 | } 137 | } 138 | if _, ok := r.votes[id]; !ok { 139 | r.votes[id] = v 140 | } 141 | for _, vv := range r.votes { 142 | if vv { 143 | granted++ 144 | } 145 | } 146 | return granted 147 | } 148 | -------------------------------------------------------------------------------- /test/kvs/config.go: -------------------------------------------------------------------------------- 1 | // Copyright 2018 The tiglabs raft Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.package wal 14 | 15 | package main 16 | 17 | import ( 18 | "encoding/json" 19 | "errors" 20 | "fmt" 21 | "os" 22 | "path" 23 | 24 | "github.com/BurntSushi/toml" 25 | ) 26 | 27 | const defaultConfigStr = ` 28 | # KVS Configuration. 29 | 30 | [server] 31 | data-path = "/export/Data/raft-kvs" 32 | log-path = "/export/Logs/raft-kvs" 33 | log-level = "debug" 34 | 35 | [cluster] 36 | [[cluster.nodes]] 37 | node-id=1 38 | host="n1" 39 | http-port=9991 40 | heartbeat-port=9992 41 | replicate-port=9993 42 | 43 | [[cluster.nodes]] 44 | node-id=2 45 | host="n2" 46 | http-port=9991 47 | heartbeat-port=9992 48 | replicate-port=9993 49 | 50 | [[cluster.nodes]] 51 | node-id=3 52 | host="n3" 53 | http-port=9991 54 | heartbeat-port=9992 55 | replicate-port=9993 56 | ` 57 | 58 | // ServerConfig server config 59 | type ServerConfig struct { 60 | LogPath string `toml:"log-path,omitempty" json:"log-path"` 61 | LogLevel string `toml:"log-level,omitempty" json:"log-level"` 62 | DataPath string `toml:"data-path,omitempty" json:"data-path"` 63 | } 64 | 65 | // ClusterNode cluster node 66 | type ClusterNode struct { 67 | NodeID uint64 `toml:"node-id,omitempty" json:"node-id"` 68 | Host string `toml:"host,omitempty" json:"host"` 69 | HTTPPort uint32 `toml:"http-port,omitempty" json:"http-port"` 70 | HeartbeatPort uint32 `toml:"heartbeat-port,omitempty" json:"heartbeat-port"` 71 | ReplicatePort uint32 `toml:"replicate-port,omitempty" json:"replicate-port"` 72 | } 73 | 74 | // ClusterConfig cluster configs 75 | type ClusterConfig struct { 76 | 
Nodes []*ClusterNode `toml:"nodes,omitempty" json:"nodes"` 77 | } 78 | 79 | // Config kvs config 80 | type Config struct { 81 | ServerCfg ServerConfig `toml:"server,omitempty" json:"server"` 82 | ClusterCfg ClusterConfig `toml:"cluster,omitempty" json:"cluster"` 83 | } 84 | 85 | func initDir(dir string) error { 86 | info, err := os.Stat(dir) 87 | if err != nil { 88 | if pathErr, ok := err.(*os.PathError); ok { 89 | if os.IsNotExist(pathErr) { 90 | return os.MkdirAll(dir, 0755) 91 | } 92 | } 93 | return err 94 | } 95 | if !info.IsDir() { 96 | return errors.New("path is not directory") 97 | } 98 | return nil 99 | } 100 | 101 | // Validate check config validate, add node id as dir prefix 102 | func (c *Config) Validate(nodeID uint64) { 103 | if len(c.ServerCfg.LogPath) == 0 { 104 | panic("invalid log path") 105 | } 106 | c.ServerCfg.LogPath = path.Join(c.ServerCfg.LogPath, fmt.Sprintf("node%d", nodeID)) 107 | if err := initDir(c.ServerCfg.LogPath); err != nil { 108 | panic(fmt.Sprintf("init log dir(%s) failed: %v", c.ServerCfg.LogPath, err)) 109 | } 110 | 111 | if len(c.ServerCfg.DataPath) == 0 { 112 | panic("invalid data path") 113 | } 114 | c.ServerCfg.DataPath = path.Join(c.ServerCfg.DataPath, fmt.Sprintf("node%d", nodeID)) 115 | if err := initDir(c.ServerCfg.DataPath); err != nil { 116 | panic(fmt.Sprintf("init data dir(%s) failed: %v", c.ServerCfg.DataPath, err)) 117 | } 118 | 119 | if len(c.ClusterCfg.Nodes) == 0 { 120 | panic("cluster nodes is empty") 121 | } 122 | } 123 | 124 | // FindClusterNode find cluster node by NodeID 125 | func (c *Config) FindClusterNode(NodeID uint64) *ClusterNode { 126 | if c == nil { 127 | return nil 128 | } 129 | for _, n := range c.ClusterCfg.Nodes { 130 | if n.NodeID == NodeID { 131 | return n 132 | } 133 | } 134 | return nil 135 | } 136 | 137 | func (c *Config) String() string { 138 | data, _ := json.Marshal(c) 139 | return string(data) 140 | } 141 | 142 | // LoadConfig load config from file 143 | func LoadConfig(filePath 
string, nodeID uint64) *Config { 144 | c := new(Config) 145 | if _, err := toml.Decode(defaultConfigStr, c); err != nil { 146 | panic(fmt.Sprintf("fail to decode default config, err: %v", err)) 147 | } 148 | if len(filePath) != 0 { 149 | _, err := toml.DecodeFile(filePath, c) 150 | if err != nil { 151 | panic(fmt.Sprintf("fail to decode config file(%v): %v", filePath, err)) 152 | } 153 | } 154 | c.Validate(nodeID) 155 | return c 156 | } 157 | -------------------------------------------------------------------------------- /logger/logger.go: -------------------------------------------------------------------------------- 1 | // Copyright 2015 The etcd Authors 2 | // Modified work copyright 2018 The tiglabs Authors. 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 15 | 16 | package logger 17 | 18 | import ( 19 | "fmt" 20 | 21 | "github.com/tiglabs/raft/util/log" 22 | ) 23 | 24 | // Logger encapsulation the log interface. 
25 | type Logger interface { 26 | IsEnableDebug() bool 27 | IsEnableInfo() bool 28 | IsEnableWarn() bool 29 | 30 | Debug(format string, v ...interface{}) 31 | Info(format string, v ...interface{}) 32 | Warn(format string, v ...interface{}) 33 | Error(format string, v ...interface{}) 34 | } 35 | 36 | var ( 37 | stdLogger = NewDefaultLogger(0) 38 | raftLogger = Logger(stdLogger) 39 | ) 40 | 41 | func SetLogger(l Logger) { 42 | raftLogger = l 43 | } 44 | 45 | func IsEnableDebug() bool { 46 | return raftLogger.IsEnableDebug() 47 | } 48 | 49 | func IsEnableInfo() bool { 50 | return raftLogger.IsEnableInfo() 51 | } 52 | 53 | func IsEnableWarn() bool { 54 | return raftLogger.IsEnableWarn() 55 | } 56 | 57 | func Debug(format string, v ...interface{}) { 58 | raftLogger.Debug(format, v...) 59 | } 60 | 61 | func Info(format string, v ...interface{}) { 62 | raftLogger.Info(format, v...) 63 | } 64 | 65 | func Warn(format string, v ...interface{}) { 66 | raftLogger.Warn(format, v...) 67 | } 68 | 69 | func Error(format string, v ...interface{}) { 70 | raftLogger.Error(format, v...) 71 | } 72 | 73 | // DefaultLogger is a default implementation of the Logger interface. 
74 | type DefaultLogger struct { 75 | *log.Log 76 | debugEnable bool 77 | infoEnable bool 78 | warnEnable bool 79 | } 80 | 81 | func NewDefaultLogger(level int) *DefaultLogger { 82 | logger, err := log.NewLog("", "raft", "DEBUG") 83 | if err != nil { 84 | panic(err) 85 | } 86 | return &DefaultLogger{ 87 | Log: logger, 88 | debugEnable: level <= log.DebugLevel, 89 | infoEnable: level <= log.InfoLevel, 90 | warnEnable: level <= log.WarnLevel, 91 | } 92 | } 93 | 94 | func (l *DefaultLogger) header(lvl, msg string) string { 95 | return fmt.Sprintf("%s: %s", lvl, msg) 96 | } 97 | 98 | func (l *DefaultLogger) IsEnableDebug() bool { 99 | return l.debugEnable 100 | } 101 | 102 | func (l *DefaultLogger) Debug(format string, v ...interface{}) { 103 | l.Output(4, l.header("DEBUG", fmt.Sprintf(format, v...)), false) 104 | } 105 | 106 | func (l *DefaultLogger) IsEnableInfo() bool { 107 | return l.infoEnable 108 | } 109 | 110 | func (l *DefaultLogger) Info(format string, v ...interface{}) { 111 | l.Output(4, l.header("INFO", fmt.Sprintf(format, v...)), false) 112 | } 113 | 114 | func (l *DefaultLogger) IsEnableWarn() bool { 115 | return l.warnEnable 116 | } 117 | 118 | func (l *DefaultLogger) Warn(format string, v ...interface{}) { 119 | l.Output(4, l.header("WARN", fmt.Sprintf(format, v...)), false) 120 | } 121 | 122 | func (l *DefaultLogger) Error(format string, v ...interface{}) { 123 | l.Output(4, l.header("ERROR", fmt.Sprintf(format, v...)), false) 124 | } 125 | 126 | type FileLogger struct { 127 | *log.Log 128 | debugEnable bool 129 | infoEnable bool 130 | warnEnable bool 131 | } 132 | 133 | func NewFileLogger(logger *log.Log, level int) *FileLogger { 134 | return &FileLogger{ 135 | Log: logger, 136 | debugEnable: level <= log.DebugLevel, 137 | infoEnable: level <= log.InfoLevel, 138 | warnEnable: level <= log.WarnLevel, 139 | } 140 | } 141 | 142 | func (fl *FileLogger) IsEnableDebug() bool { 143 | return fl.debugEnable 144 | } 145 | 146 | func (fl *FileLogger) 
Debug(format string, v ...interface{}) { 147 | fl.Debug(fmt.Sprintf(format, v...)) 148 | } 149 | 150 | func (fl *FileLogger) IsEnableInfo() bool { 151 | return fl.infoEnable 152 | } 153 | 154 | func (fl *FileLogger) Info(format string, v ...interface{}) { 155 | fl.Info(fmt.Sprintf(format, v...)) 156 | } 157 | 158 | func (fl *FileLogger) IsEnableWarn() bool { 159 | return fl.warnEnable 160 | } 161 | 162 | func (fl *FileLogger) Warn(format string, v ...interface{}) { 163 | fl.Warn(fmt.Sprintf(format, v...)) 164 | } 165 | 166 | func (fl *FileLogger) Error(format string, v ...interface{}) { 167 | fl.Error(fmt.Sprintf(format, v...)) 168 | } 169 | -------------------------------------------------------------------------------- /raft_replica_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2015 The etcd Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package raft 16 | 17 | import ( 18 | "reflect" 19 | "testing" 20 | ) 21 | 22 | func TestInflightsAdd(t *testing.T) { 23 | // no rotating case 24 | in := &inflight{ 25 | size: 10, 26 | buffer: make([]uint64, 10), 27 | } 28 | 29 | for i := 0; i < 5; i++ { 30 | in.add(uint64(i)) 31 | } 32 | 33 | wantIn := &inflight{ 34 | start: 0, 35 | count: 5, 36 | size: 10, 37 | // ↓------------ 38 | buffer: []uint64{0, 1, 2, 3, 4, 0, 0, 0, 0, 0}, 39 | } 40 | 41 | if !reflect.DeepEqual(in, wantIn) { 42 | t.Fatalf("in = %+v, want %+v", in, wantIn) 43 | } 44 | 45 | for i := 5; i < 10; i++ { 46 | in.add(uint64(i)) 47 | } 48 | 49 | wantIn2 := &inflight{ 50 | start: 0, 51 | count: 10, 52 | size: 10, 53 | // ↓--------------------------- 54 | buffer: []uint64{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, 55 | } 56 | 57 | if !reflect.DeepEqual(in, wantIn2) { 58 | t.Fatalf("in = %+v, want %+v", in, wantIn2) 59 | } 60 | 61 | // rotating case 62 | in2 := &inflight{ 63 | start: 5, 64 | size: 10, 65 | buffer: make([]uint64, 10), 66 | } 67 | 68 | for i := 0; i < 5; i++ { 69 | in2.add(uint64(i)) 70 | } 71 | 72 | wantIn21 := &inflight{ 73 | start: 5, 74 | count: 5, 75 | size: 10, 76 | // ↓------------ 77 | buffer: []uint64{0, 0, 0, 0, 0, 0, 1, 2, 3, 4}, 78 | } 79 | 80 | if !reflect.DeepEqual(in2, wantIn21) { 81 | t.Fatalf("in = %+v, want %+v", in2, wantIn21) 82 | } 83 | 84 | for i := 5; i < 10; i++ { 85 | in2.add(uint64(i)) 86 | } 87 | 88 | wantIn22 := &inflight{ 89 | start: 5, 90 | count: 10, 91 | size: 10, 92 | // -------------- ↓------------ 93 | buffer: []uint64{5, 6, 7, 8, 9, 0, 1, 2, 3, 4}, 94 | } 95 | 96 | if !reflect.DeepEqual(in2, wantIn22) { 97 | t.Fatalf("in = %+v, want %+v", in2, wantIn22) 98 | } 99 | } 100 | 101 | func TestInflightFreeTo(t *testing.T) { 102 | // no rotating case 103 | in := &inflight{ 104 | size: 10, 105 | buffer: make([]uint64, 10), 106 | } 107 | for i := 0; i < 10; i++ { 108 | in.add(uint64(i)) 109 | } 110 | 111 | in.freeTo(4) 112 | 113 | wantIn := &inflight{ 114 | 
start: 5, 115 | count: 5, 116 | size: 10, 117 | // ↓------------ 118 | buffer: []uint64{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, 119 | } 120 | 121 | if !reflect.DeepEqual(in, wantIn) { 122 | t.Fatalf("in = %+v, want %+v", in, wantIn) 123 | } 124 | 125 | in.freeTo(8) 126 | 127 | wantIn2 := &inflight{ 128 | start: 9, 129 | count: 1, 130 | size: 10, 131 | // ↓ 132 | buffer: []uint64{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, 133 | } 134 | 135 | if !reflect.DeepEqual(in, wantIn2) { 136 | t.Fatalf("in = %+v, want %+v", in, wantIn2) 137 | } 138 | 139 | // rotating case 140 | for i := 10; i < 15; i++ { 141 | in.add(uint64(i)) 142 | } 143 | 144 | in.freeTo(12) 145 | 146 | wantIn3 := &inflight{ 147 | start: 3, 148 | count: 2, 149 | size: 10, 150 | // ↓----- 151 | buffer: []uint64{10, 11, 12, 13, 14, 5, 6, 7, 8, 9}, 152 | } 153 | 154 | if !reflect.DeepEqual(in, wantIn3) { 155 | t.Fatalf("in = %+v, want %+v", in, wantIn3) 156 | } 157 | 158 | in.freeTo(14) 159 | 160 | wantIn4 := &inflight{ 161 | start: 5, 162 | count: 0, 163 | size: 10, 164 | // ↓ 165 | buffer: []uint64{10, 11, 12, 13, 14, 5, 6, 7, 8, 9}, 166 | } 167 | 168 | if !reflect.DeepEqual(in, wantIn4) { 169 | t.Fatalf("in = %+v, want %+v", in, wantIn4) 170 | } 171 | } 172 | 173 | func TestInflightFreeFirstOne(t *testing.T) { 174 | in := &inflight{ 175 | size: 10, 176 | buffer: make([]uint64, 10), 177 | } 178 | for i := 0; i < 10; i++ { 179 | in.add(uint64(i)) 180 | } 181 | 182 | in.freeFirstOne() 183 | 184 | wantIn := &inflight{ 185 | start: 1, 186 | count: 9, 187 | size: 10, 188 | // ↓------------------------ 189 | buffer: []uint64{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, 190 | } 191 | 192 | if !reflect.DeepEqual(in, wantIn) { 193 | t.Fatalf("in = %+v, want %+v", in, wantIn) 194 | } 195 | } 196 | -------------------------------------------------------------------------------- /util/bufalloc/ibuffer.go: -------------------------------------------------------------------------------- 1 | // Copyright 2009 The Go Authors. All rights reserved. 
// minRead is the minimum free space ReadFrom makes available before each call
// to the reader's Read.
const minRead = 512

var (
	// ErrTooLarge is the panic value used when memory for the buffer cannot
	// be allocated.
	ErrTooLarge = errors.New("bufalloc.Buffer: too large.")
)

// ibuffer is a byte buffer with a read cursor: the unread data is buf[off:].
type ibuffer struct {
	off int    // read position: reads start at buf[off]
	buf []byte // contents; writes append at buf[len(buf)]
}

// makeSlice allocates a slice of length n. Any panic raised by the allocation
// is converted into a panic with ErrTooLarge.
func makeSlice(n int) []byte {
	defer func() {
		if r := recover(); r != nil {
			panic(ErrTooLarge)
		}
	}()
	return make([]byte, n)
}

// Bytes returns the unread portion of the buffer. The slice aliases the
// buffer's storage.
func (b *ibuffer) Bytes() []byte {
	return b.buf[b.off:]
}

// String returns the unread portion as a string, or "" for a nil receiver.
func (b *ibuffer) String() string {
	if b != nil {
		return string(b.buf[b.off:])
	}
	return ""
}

// Len returns the number of unread bytes.
func (b *ibuffer) Len() int {
	return len(b.buf) - b.off
}

// Cap returns the capacity of the underlying slice.
func (b *ibuffer) Cap() int {
	return cap(b.buf)
}

// Reset empties the buffer; it is the same as Truncate(0).
func (b *ibuffer) Reset() {
	b.Truncate(0)
}

// Truncate keeps only the first n unread bytes. It panics if n is negative or
// larger than Len. Truncate(0) also rewinds the read position so the storage
// is reused from its start.
func (b *ibuffer) Truncate(n int) {
	if n < 0 || n > b.Len() {
		panic("bufalloc.Buffer: truncation out of range")
	}
	if n == 0 {
		b.off = 0
	}
	b.buf = b.buf[:b.off+n]
}

// grow extends the buffer by n bytes and returns the index at which those
// bytes start. Depending on the space available it reuses the current storage
// as is, slides the unread data to the front, or moves to a larger slice.
func (b *ibuffer) grow(n int) int {
	if b.buf == nil {
		b.buf = makeSlice(n)
		return 0
	}

	unread := b.Len()
	// Everything has been read: rewind so the space in front is reused.
	if unread == 0 && b.off != 0 {
		b.Truncate(0)
	}
	if capacity := cap(b.buf); len(b.buf)+n > capacity {
		if unread+n <= capacity/2 {
			// Plenty of room once the consumed prefix is dropped: slide the
			// unread data down instead of allocating.
			copy(b.buf, b.buf[b.off:])
			b.buf = b.buf[:unread]
		} else {
			// Not enough room: move to a larger slice.
			bigger := makeSlice(2*capacity + n)
			copy(bigger, b.buf[b.off:])
			b.buf = bigger
		}
		b.off = 0
	}
	writeAt := b.off + unread
	b.buf = b.buf[:writeAt+n]
	return writeAt
}

// Alloc appends n bytes to the buffer and returns the slice covering them so
// the caller can fill them in. It panics if n is negative.
func (b *ibuffer) Alloc(n int) []byte {
	if n < 0 {
		panic("bufalloc.Buffer: negative count")
	}
	start := b.grow(n)
	return b.buf[start:]
}

// Grow makes sure n more bytes can be written without another allocation,
// leaving the contents unchanged. It panics if n is negative.
func (b *ibuffer) Grow(n int) {
	if n < 0 {
		panic("bufalloc.Buffer: negative count")
	}
	end := b.grow(n)
	// grow extended the length; cut it back so only capacity was gained.
	b.buf = b.buf[:end]
}

// Write appends p to the buffer. The returned error is always nil.
func (b *ibuffer) Write(p []byte) (n int, err error) {
	start := b.grow(len(p))
	n = copy(b.buf[start:], p)
	return n, nil
}

// ReadFrom appends everything read from r until r reports io.EOF and returns
// the number of bytes read. Errors other than io.EOF are returned.
func (b *ibuffer) ReadFrom(r io.Reader) (n int64, err error) {
	if b.off >= len(b.buf) {
		b.Truncate(0)
	}
	for {
		if free := cap(b.buf) - len(b.buf); free < minRead {
			// not enough space at end
			target := b.buf
			if b.off+free < minRead {
				// not enough space even after dropping the consumed prefix
				target = makeSlice(2*cap(b.buf) + minRead)
			}
			copy(target, b.buf[b.off:])
			b.buf = target[:len(b.buf)-b.off]
			b.off = 0
		}

		got, readErr := r.Read(b.buf[len(b.buf):cap(b.buf)])
		b.buf = b.buf[:len(b.buf)+got]
		n += int64(got)
		if readErr == io.EOF {
			// reaching EOF is the normal way out, not an error
			return n, nil
		}
		if readErr != nil {
			return n, readErr
		}
	}
}

// WriteTo writes the unread data to w. After a complete write the buffer is
// reset; on an error or a short write the unwritten remainder stays buffered.
// It panics if w reports more bytes than it was given.
func (b *ibuffer) WriteTo(w io.Writer) (n int64, err error) {
	if b.off < len(b.buf) {
		pending := b.Len()
		written, writeErr := w.Write(b.buf[b.off:])
		if written > pending {
			panic("bufalloc.Buffer: invalid Write count")
		}
		b.off += written
		n = int64(written)
		if writeErr != nil {
			return n, writeErr
		}
		if written != pending {
			return n, io.ErrShortWrite
		}
	}
	// buffer is now empty; reset
	b.Truncate(0)
	return n, err
}

// WriteByte appends c to the buffer. The returned error is always nil.
func (b *ibuffer) WriteByte(c byte) error {
	pos := b.grow(1)
	b.buf[pos] = c
	return nil
}

// Read copies up to len(p) unread bytes into p. On an empty buffer it returns
// io.EOF, except that a zero-length p yields (0, nil).
func (b *ibuffer) Read(p []byte) (n int, err error) {
	if b.off >= len(b.buf) {
		// buffer is empty; reset to recover space
		b.Truncate(0)
		if len(p) == 0 {
			return 0, nil
		}
		return 0, io.EOF
	}
	n = copy(p, b.buf[b.off:])
	b.off += n
	return n, nil
}

// Next returns the next n unread bytes (fewer if less are available) and
// advances past them. The slice aliases the buffer's storage.
func (b *ibuffer) Next(n int) []byte {
	if avail := b.Len(); n > avail {
		n = avail
	}
	start := b.off
	b.off += n
	return b.buf[start:b.off]
}

// ReadByte returns the next unread byte, or io.EOF if there is none.
func (b *ibuffer) ReadByte() (c byte, err error) {
	if b.off >= len(b.buf) {
		// buffer is empty; reset to recover space
		b.Truncate(0)
		return 0, io.EOF
	}
	c = b.buf[b.off]
	b.off++
	return c, nil
}

// ReadBytes reads up to and including the first delim and returns a copy of
// that data. If delim is not found it returns the rest of the unread data
// together with io.EOF.
func (b *ibuffer) ReadBytes(delim byte) (line []byte, err error) {
	var view []byte
	view, err = b.readSlice(delim)
	// copy out: view aliases the buffer's storage
	return append(line, view...), err
}

// readSlice is like ReadBytes but returns a slice that aliases the buffer's
// storage.
func (b *ibuffer) readSlice(delim byte) (line []byte, err error) {
	start := b.off
	end := len(b.buf)
	if i := bytes.IndexByte(b.buf[start:], delim); i >= 0 {
		end = start + i + 1
	} else {
		err = io.EOF
	}
	b.off = end
	return b.buf[start:end], err
}
// defaultReadBufferedSize is the size, in bytes, of the bufio buffer placed in
// front of a log file for sequential reads.
const defaultReadBufferedSize = 512

// bufferedReader wraps a bufio.Reader so that one Read call keeps reading
// until the destination is full or the underlying reader reports an error.
type bufferedReader struct {
	r *bufio.Reader
}

// newBufferedReader returns a bufferedReader that reads f through a buffer of
// defaultReadBufferedSize bytes.
func newBufferedReader(f *os.File) *bufferedReader {
	return &bufferedReader{
		r: bufio.NewReaderSize(f, defaultReadBufferedSize),
	}
}

// Read fills p completely unless the underlying reader fails first.
//
// It returns the number of bytes stored in p and the first error met. Bytes
// delivered together with an error are included in the count, as the io.Reader
// contract requires; previously they were silently dropped from total.
func (br *bufferedReader) Read(p []byte) (total int, err error) {
	for {
		var n int
		n, err = br.r.Read(p)
		if n > len(p) {
			// A reader must never report more bytes than it was offered.
			panic("invalid read buffer")
		}
		total += n
		if err != nil || n == len(p) {
			return total, err
		}
		p = p[n:]
	}
}

// recordReader reads records from a single log file, either sequentially
// through br or at an explicit offset through sr.
type recordReader struct {
	br     *bufferedReader
	offset int64 // start offset of the current record (sequential reads)

	sr io.ReaderAt // random-access reads

	filename string

	typeLenBuf []byte // scratch buffer for the fixed-size record header
}

// newRecordReader builds a recordReader on top of f; both the sequential and
// the random-access paths read from the same file.
func newRecordReader(f *os.File) *recordReader {
	return &recordReader{
		br:         newBufferedReader(f),
		sr:         f,
		filename:   f.Name(),
		typeLenBuf: make([]byte, 9), // 1-byte record type + 8-byte dataLen
	}
}
entry等decode时没有进行拷贝 106 | rec.data = make([]byte, rec.dataLen+4) 107 | n, err = r.br.Read(rec.data) 108 | if err != nil { 109 | return 110 | } 111 | if uint64(n) != rec.dataLen+4 { 112 | err = NewCorruptError(r.filename, recStartOffset, "data size unmatch or too small crc") 113 | return 114 | } 115 | 116 | // decode crc 117 | rec.crc = binary.BigEndian.Uint32(rec.data[len(rec.data)-4:]) 118 | // truncate crc 119 | rec.data = rec.data[:len(rec.data)-4] 120 | // checksum 121 | crc := util.NewCRC(rec.data) 122 | if rec.crc != crc.Value() { 123 | err = NewCorruptError(r.filename, recStartOffset, "crc mismatch") 124 | return 125 | } 126 | 127 | r.offset += (1 + 8 + int64(rec.dataLen) + 4) 128 | 129 | return 130 | } 131 | 132 | // 随机读指定offset 133 | func (r *recordReader) ReadAt(offset int64) (rec record, err error) { 134 | defer func() { 135 | if err == io.EOF { 136 | err = NewCorruptError(r.filename, offset, "unexpected eof") 137 | } 138 | }() 139 | 140 | // read record type and data len 141 | n, err := r.sr.ReadAt(r.typeLenBuf, offset) 142 | if err != nil { 143 | return 144 | } 145 | if n != len(r.typeLenBuf) { 146 | if n < 1 { 147 | err = NewCorruptError(r.filename, offset, "too small record type") 148 | } else { 149 | err = NewCorruptError(r.filename, offset, "too small record datalen") 150 | } 151 | return 152 | } 153 | rec.recType = recordType(r.typeLenBuf[0]) 154 | rec.dataLen = binary.BigEndian.Uint64(r.typeLenBuf[1:]) 155 | 156 | // read data and crc 157 | rec.data = make([]byte, rec.dataLen+4) 158 | n, err = r.sr.ReadAt(rec.data, offset+int64(n)) 159 | if err != nil { 160 | return 161 | } 162 | if uint64(n) != rec.dataLen+4 { 163 | err = NewCorruptError(r.filename, offset, "data size unmatch or too small crc") 164 | return 165 | } 166 | 167 | // decode crc 168 | rec.crc = binary.BigEndian.Uint32(rec.data[len(rec.data)-4:]) 169 | // truncate crc 170 | rec.data = rec.data[:len(rec.data)-4] 171 | // checksum 172 | crc := util.NewCRC(rec.data) 173 | if rec.crc != 
crc.Value() { 174 | err = NewCorruptError(r.filename, offset, "crc mismatch") 175 | return 176 | } 177 | 178 | return 179 | } 180 | -------------------------------------------------------------------------------- /test/memory_statemachine.go: -------------------------------------------------------------------------------- 1 | // Copyright 2018 The tiglabs raft Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package test 16 | 17 | import ( 18 | "encoding/binary" 19 | "encoding/json" 20 | "errors" 21 | "fmt" 22 | "io" 23 | "sync" 24 | 25 | "github.com/tiglabs/raft" 26 | "github.com/tiglabs/raft/proto" 27 | ) 28 | 29 | var errNotExists = errors.New("Key not exists.") 30 | 31 | type memoryStatemachine struct { 32 | sync.RWMutex 33 | id uint64 34 | applied uint64 35 | raft *raft.RaftServer 36 | data map[string]string 37 | } 38 | 39 | func newMemoryStatemachine(id uint64, raft *raft.RaftServer) *memoryStatemachine { 40 | return &memoryStatemachine{ 41 | id: id, 42 | raft: raft, 43 | data: make(map[string]string), 44 | } 45 | } 46 | 47 | func (ms *memoryStatemachine) Apply(data []byte, index uint64) (interface{}, error) { 48 | ms.Lock() 49 | defer func() { 50 | ms.Unlock() 51 | }() 52 | 53 | var kv map[string]string 54 | if err := json.Unmarshal(data, &kv); err != nil { 55 | return nil, err 56 | } 57 | for k, v := range kv { 58 | ms.data[k] = v 59 | } 60 | return nil, nil 61 | } 62 | 63 | func (ms 
*memoryStatemachine) ApplyMemberChange(confChange *proto.ConfChange, index uint64) (interface{}, error) { 64 | ms.Lock() 65 | defer func() { 66 | ms.Unlock() 67 | }() 68 | 69 | return nil, nil 70 | } 71 | 72 | func (ms *memoryStatemachine) Snapshot() (proto.Snapshot, error) { 73 | ms.RLock() 74 | defer ms.RUnlock() 75 | 76 | if data, err := json.Marshal(ms.data); err != nil { 77 | return nil, err 78 | } else { 79 | data = append(make([]byte, 8), data...) 80 | binary.BigEndian.PutUint64(data, ms.applied) 81 | return &memorySnapshot{ 82 | applied: ms.applied, 83 | data: data, 84 | }, nil 85 | } 86 | } 87 | 88 | func (ms *memoryStatemachine) ApplySnapshot(peers []proto.Peer, iter proto.SnapIterator) error { 89 | ms.Lock() 90 | defer ms.Unlock() 91 | 92 | var ( 93 | data []byte 94 | block []byte 95 | err error 96 | ) 97 | for err == nil { 98 | if block, err = iter.Next(); len(block) > 0 { 99 | data = append(data, block...) 100 | } 101 | } 102 | if err != nil && err != io.EOF { 103 | return err 104 | } 105 | 106 | ms.applied = binary.BigEndian.Uint64(data) 107 | if err = json.Unmarshal(data[8:], &ms.data); err != nil { 108 | return err 109 | } 110 | return nil 111 | } 112 | 113 | func (ms *memoryStatemachine) HandleFatalEvent(err *raft.FatalError) { 114 | panic(err.Err) 115 | } 116 | 117 | func (ms *memoryStatemachine) Get(key string) (string, error) { 118 | ms.RLock() 119 | defer ms.RUnlock() 120 | 121 | if v, ok := ms.data[key]; ok { 122 | return v, nil 123 | } else { 124 | return "", errNotExists 125 | } 126 | } 127 | 128 | func (ms *memoryStatemachine) Put(key, value string) error { 129 | kv := map[string]string{key: value} 130 | if data, err := json.Marshal(kv); err != nil { 131 | return err 132 | } else { 133 | resp := ms.raft.Submit(ms.id, data) 134 | _, err = resp.Response() 135 | if err != nil { 136 | return errors.New(fmt.Sprintf("Put error[%v].\r\n", err)) 137 | } 138 | return nil 139 | } 140 | } 141 | 142 | func (ms *memoryStatemachine) AddNode(peer 
proto.Peer) error { 143 | resp := ms.raft.ChangeMember(ms.id, proto.ConfAddNode, peer, nil) 144 | _, err := resp.Response() 145 | if err != nil { 146 | return errors.New("AddNode error.") 147 | } 148 | return nil 149 | } 150 | 151 | func (ms *memoryStatemachine) RemoveNode(peer proto.Peer) error { 152 | resp := ms.raft.ChangeMember(ms.id, proto.ConfRemoveNode, peer, nil) 153 | _, err := resp.Response() 154 | if err != nil { 155 | return errors.New("RemoveNode error.") 156 | } 157 | return nil 158 | } 159 | 160 | func (ms *memoryStatemachine) setApplied(index uint64) { 161 | ms.Lock() 162 | defer ms.Unlock() 163 | ms.applied = index 164 | } 165 | 166 | func (ms *memoryStatemachine) HandleLeaderChange(leader uint64) {} 167 | 168 | type memorySnapshot struct { 169 | offset int 170 | applied uint64 171 | data []byte 172 | } 173 | 174 | func (s *memorySnapshot) Next() ([]byte, error) { 175 | if s.offset >= len(s.data) { 176 | return nil, io.EOF 177 | } 178 | s.offset = len(s.data) 179 | return s.data, nil 180 | } 181 | 182 | func (s *memorySnapshot) ApplyIndex() uint64 { 183 | return s.applied 184 | } 185 | 186 | func (s *memorySnapshot) Close() { 187 | return 188 | } 189 | -------------------------------------------------------------------------------- /transport_sender.go: -------------------------------------------------------------------------------- 1 | // Copyright 2018 The tiglabs raft Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package raft 16 | 17 | import ( 18 | "runtime" 19 | "sync" 20 | "time" 21 | 22 | //"fmt" 23 | "github.com/tiglabs/raft/logger" 24 | "github.com/tiglabs/raft/proto" 25 | "github.com/tiglabs/raft/util" 26 | ) 27 | 28 | type unreachableReporter func(uint64) 29 | 30 | type transportSender struct { 31 | nodeID uint64 32 | concurrency uint64 33 | senderType SocketType 34 | resolver SocketResolver 35 | inputc []chan *proto.Message 36 | send func(msg *proto.Message) 37 | mu sync.Mutex 38 | stopc chan struct{} 39 | } 40 | 41 | func newTransportSender(nodeID, concurrency uint64, buffSize int, senderType SocketType, resolver SocketResolver) *transportSender { 42 | sender := &transportSender{ 43 | nodeID: nodeID, 44 | concurrency: concurrency, 45 | senderType: senderType, 46 | resolver: resolver, 47 | inputc: make([]chan *proto.Message, concurrency), 48 | stopc: make(chan struct{}), 49 | } 50 | for i := uint64(0); i < concurrency; i++ { 51 | sender.inputc[i] = make(chan *proto.Message, buffSize) 52 | sender.loopSend(sender.inputc[i]) 53 | } 54 | 55 | if (concurrency & (concurrency - 1)) == 0 { 56 | sender.send = func(msg *proto.Message) { 57 | idx := 0 58 | if concurrency > 1 { 59 | idx = int(msg.ID&concurrency - 1) 60 | } 61 | sender.inputc[idx] <- msg 62 | } 63 | } else { 64 | sender.send = func(msg *proto.Message) { 65 | idx := 0 66 | if concurrency > 1 { 67 | idx = int(msg.ID % concurrency) 68 | } 69 | sender.inputc[idx] <- msg 70 | } 71 | } 72 | return sender 73 | } 74 | 75 | func (s *transportSender) stop() { 76 | s.mu.Lock() 77 | defer s.mu.Unlock() 78 | 79 | select { 80 | case <-s.stopc: 81 | return 82 | default: 83 | close(s.stopc) 84 | } 85 | } 86 | 87 | func (s *transportSender) loopSend(recvc chan *proto.Message) { 88 | util.RunWorkerUtilStop(func() { 89 | conn := getConn(s.nodeID, s.senderType, s.resolver, 0, 2*time.Second) 90 | bufWr := 
// loopSend hands util.RunWorkerUtilStop a closure that drains recvc and writes
// the encoded messages to the remote node until stopc is closed.
// NOTE(review): RunWorkerUtilStop presumably runs the closure on its own
// goroutine, since the caller starts several of these in a row — confirm.
//
// Every message taken from recvc is given back with proto.ReturnMessage,
// whether or not it could be sent. After any encode or flush error the
// connection is closed and re-dialled when the next message arrives.
func (s *transportSender) loopSend(recvc chan *proto.Message) {
	util.RunWorkerUtilStop(func() {
		// The first dial may fail; conn is then nil and is retried below.
		conn := getConn(s.nodeID, s.senderType, s.resolver, 0, 2*time.Second)
		bufWr := util.NewBufferWriter(conn, 16*KB)

		defer func() {
			if conn != nil {
				conn.Close()
			}
		}()

		loopCount := 0
		var err error
		for {
			// Yield the processor once every few iterations.
			loopCount = loopCount + 1
			if loopCount > 8 {
				loopCount = 0
				runtime.Gosched()
			}

			select {
			case <-s.stopc:
				return

			case msg := <-recvc:
				if conn == nil {
					conn = getConn(s.nodeID, s.senderType, s.resolver, 0, 2*time.Second)
					if conn == nil {
						proto.ReturnMessage(msg)
						// Still unreachable: discard everything currently
						// queued, then back off briefly before trying again.
						for {
							select {
							case msg := <-recvc:
								proto.ReturnMessage(msg)
								continue
							default:
							}
							break
						}
						time.Sleep(50 * time.Millisecond)
						continue
					}
					bufWr.Reset(conn)
				}
				err = msg.Encode(bufWr)
				proto.ReturnMessage(msg)
				if err != nil {
					goto flush
				}
				// Batch: encode up to 16 more messages that are already
				// waiting, without blocking, before flushing once.
				flag := false
				for i := 0; i < 16; i++ {
					select {
					case msg := <-recvc:
						err = msg.Encode(bufWr)
						//logger.Debug(fmt.Sprintf("SendMesg %v to (%v) ", msg.ToString(), conn.RemoteAddr()))
						proto.ReturnMessage(msg)
						if err != nil {
							goto flush
						}
					default:
						flag = true
					}
					if flag {
						break
					}
				}
			}

		flush:
			// Flush the batch unless encoding already failed; on any error
			// drop the connection so the next message triggers a re-dial.
			if err == nil {
				err = bufWr.Flush()
			}
			if err != nil {
				logger.Error("[Transport]send message[%s] to %v[%s] error:[%v].", s.senderType, s.nodeID, conn.RemoteAddr(), err)
				conn.Close()
				conn = nil
			}
		}
	}, s.stopc)
}

// getConn resolves the address of nodeID for socketType, dials it with a
// 2-second dial timeout and applies rdTime and wrTime as the connection's read
// and write timeouts. It returns nil when resolving or dialling fails; the
// failure is only logged, and only at debug level.
func getConn(nodeID uint64, socketType SocketType, resolver SocketResolver, rdTime, wrTime time.Duration) (conn *util.ConnTimeout) {
	var (
		addr string
		err  error
	)
	if addr, err = resolver.NodeAddress(nodeID, socketType); err == nil {
		if conn, err = util.DialTimeout(addr, 2*time.Second); err == nil {
			conn.SetReadTimeout(rdTime)
			conn.SetWriteTimeout(wrTime)
		}
	}

	if err != nil {
		// Never hand back a connection alongside a failure.
		conn = nil
		if logger.IsEnableDebug() {
			logger.Debug("[Transport] get connection[%s] to %v[%s] failed,error is: %s", socketType, nodeID, addr, err)
		}
	}
	return
}
14 | 15 | package wal 16 | 17 | import ( 18 | "encoding/binary" 19 | "errors" 20 | "fmt" 21 | "io/ioutil" 22 | "os" 23 | "path" 24 | "sync" 25 | "testing" 26 | "time" 27 | 28 | "github.com/tiglabs/raft" 29 | "github.com/tiglabs/raft/proto" 30 | ) 31 | 32 | type raftAddr struct { 33 | heartbeat string 34 | replicate string 35 | } 36 | 37 | var raftAddresses = make(map[uint64]*raftAddr) 38 | 39 | // 三个本地节点 40 | func init() { 41 | for i := 1; i <= 3; i++ { 42 | raftAddresses[uint64(i)] = &raftAddr{ 43 | heartbeat: fmt.Sprintf(":99%d1", i), 44 | replicate: fmt.Sprintf(":99%d2", i), 45 | } 46 | } 47 | } 48 | 49 | type testResolver struct { 50 | } 51 | 52 | func (r *testResolver) AllNodes() (all []uint64) { 53 | for k := range raftAddresses { 54 | all = append(all, k) 55 | } 56 | return 57 | } 58 | 59 | func (r *testResolver) NodeAddress(nodeID uint64, stype raft.SocketType) (addr string, err error) { 60 | raddr, ok := raftAddresses[nodeID] 61 | if !ok { 62 | return "", errors.New("no such node") 63 | } 64 | switch stype { 65 | case raft.HeartBeat: 66 | return "127.0.0.1" + raddr.heartbeat, nil 67 | case raft.Replicate: 68 | return "127.0.0.1" + raddr.replicate, nil 69 | default: 70 | return "", errors.New("unknown socket type") 71 | } 72 | } 73 | 74 | // 状态机为一个数字,命令为一个数字i,收到命令后sum+=i 75 | type testStateMachine struct { 76 | t *testing.T 77 | nodeID uint64 78 | sum uint64 79 | } 80 | 81 | func (m *testStateMachine) Apply(command []byte, index uint64) (interface{}, error) { 82 | u := binary.BigEndian.Uint64(command) 83 | m.sum += u 84 | m.t.Logf("[NODE: %d] sum increased %d, sum=%d", m.nodeID, u, m.sum) 85 | return m.sum, nil 86 | } 87 | 88 | func (m *testStateMachine) ApplyMemberChange(confChange *proto.ConfChange, index uint64) (interface{}, error) { 89 | return nil, nil 90 | } 91 | 92 | func (m *testStateMachine) Snapshot() (proto.Snapshot, error) { 93 | return nil, nil 94 | } 95 | 96 | func (m *testStateMachine) ApplySnapshot(peers []proto.Peer, iter 
proto.SnapIterator) error { 97 | return nil 98 | } 99 | 100 | func (m *testStateMachine) HandleFatalEvent(err *raft.FatalError) { 101 | } 102 | 103 | func (m *testStateMachine) HandleLeaderChange(leader uint64) { 104 | } 105 | 106 | func TestRaft(t *testing.T) { 107 | var err error 108 | dir, err := ioutil.TempDir(os.TempDir(), "fbase_test_raft_wal_raft_") 109 | if err != nil { 110 | t.Fatal(err) 111 | } 112 | // dir := "/tmp/fbase_test_raft_wal_raft_549378644" 113 | t.Logf("TestPath: %v", dir) 114 | defer os.RemoveAll(dir) 115 | 116 | var wg sync.WaitGroup 117 | wg.Add(3) 118 | exitC := make(chan struct{}) 119 | for i := 1; i <= 3; i++ { 120 | go runNode(t, uint64(i), dir, exitC, &wg) 121 | } 122 | wg.Wait() 123 | } 124 | 125 | // 启动一个节点,提交命令,当状态机里的sum到达一定值退出 126 | func runNode(t *testing.T, nodeID uint64, dir string, exitC chan struct{}, wg *sync.WaitGroup) { 127 | defer wg.Done() 128 | 129 | listenAddr, ok := raftAddresses[nodeID] 130 | if !ok { 131 | t.Fatal("no such node") 132 | } 133 | 134 | rc := raft.DefaultConfig() 135 | rc.TickInterval = time.Millisecond * 100 136 | rc.NodeID = nodeID 137 | rc.Resolver = &testResolver{} 138 | rc.HeartbeatAddr = listenAddr.heartbeat 139 | rc.ReplicateAddr = listenAddr.replicate 140 | rs, err := raft.NewRaftServer(rc) 141 | if err != nil { 142 | t.Fatal(err) 143 | } 144 | 145 | // path: {dir}/{nodeID} 146 | wal, err := NewStorage(path.Join(dir, fmt.Sprintf("%d", nodeID)), nil) 147 | if err != nil { 148 | t.Fatal(err) 149 | } 150 | 151 | var peers []proto.Peer 152 | for k := range raftAddresses { 153 | peers = append(peers, proto.Peer{ID: k}) 154 | } 155 | statemachine := &testStateMachine{ 156 | t: t, 157 | nodeID: nodeID, 158 | } 159 | raftConfig := &raft.RaftConfig{ 160 | ID: 1, 161 | Applied: 0, 162 | Peers: peers, 163 | Storage: wal, 164 | StateMachine: statemachine, 165 | } 166 | err = rs.CreateRaft(raftConfig) 167 | if err != nil { 168 | t.Fatal(err) 169 | } 170 | 171 | ticker := 
time.NewTicker(time.Duration(rc.HeartbeatTick) * rc.TickInterval) 172 | var i uint64 173 | for { 174 | select { 175 | case <-ticker.C: 176 | if rs.IsLeader(1) { 177 | b := make([]byte, 8) 178 | i++ 179 | binary.BigEndian.PutUint64(b, i) 180 | f := rs.Submit(1, b) 181 | resp, err := f.Response() 182 | if err != nil { 183 | t.Error(err) 184 | } 185 | sum := resp.(uint64) 186 | if sum >= 50 { 187 | close(exitC) 188 | return 189 | } 190 | } 191 | 192 | case <-exitC: 193 | return 194 | } 195 | } 196 | } 197 | -------------------------------------------------------------------------------- /test/raft_member_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2018 The tiglabs raft Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package test 16 | 17 | import ( 18 | "fmt" 19 | "testing" 20 | "time" 21 | 22 | "github.com/tiglabs/raft/proto" 23 | ) 24 | 25 | func TestMemberWithNoLease(t *testing.T) { 26 | f, w := getLogFile("changemember_nolease.log") 27 | defer func() { 28 | w.Flush() 29 | f.Close() 30 | }() 31 | 32 | servers := initTestServer(peers, false, true) 33 | fmt.Println("waiting electing leader....") 34 | leadServer := waitElect(servers, w) 35 | printStatus(servers, w) 36 | 37 | // test add node 38 | w.WriteString(fmt.Sprintf("[%s] Add new node \r\n", time.Now().Format(format_time))) 39 | leader, term := leadServer.raft.LeaderTerm(1) 40 | newServer := createRaftServer(4, leader, term, peers, false, true) 41 | // add node 42 | resolver.addNode(4, 0) 43 | fmt.Println("starting add node") 44 | leadServer.sm.AddNode(proto.Peer{ID: 4}) 45 | fmt.Println("added node") 46 | time.Sleep(time.Second) 47 | servers = append(servers, newServer) 48 | printStatus(servers, w) 49 | 50 | fmt.Println("starting put data") 51 | if err := leadServer.sm.Put("test2", "test2_val"); err != nil { 52 | t.Fatal(err) 53 | } 54 | time.Sleep(time.Second) 55 | if vget, err := newServer.sm.Get("test2"); err != nil || vget != "test2_val" { 56 | t.Fatal("new add node not get the data") 57 | } 58 | fmt.Println("success put data") 59 | 60 | // test remove node 61 | w.WriteString(fmt.Sprintf("[%s] Remove node \r\n", time.Now().Format(format_time))) 62 | fmt.Println("starting remove node") 63 | leadServer.sm.RemoveNode(proto.Peer{ID: 4}) 64 | fmt.Println("removed node") 65 | fmt.Println("starting put data") 66 | if err := leadServer.sm.Put("test3", "test3_val"); err != nil { 67 | t.Fatal(err) 68 | } 69 | fmt.Println("success put data") 70 | newServers := make([]*testServer, 0) 71 | for _, s := range servers { 72 | if s.nodeID == newServer.nodeID { 73 | s.raft.Stop() 74 | } else { 75 | newServers = append(newServers, s) 76 | } 77 | } 78 | servers = newServers 79 | time.Sleep(100 * time.Millisecond) 80 | newServer 
= createRaftServer(4, 0, 10, append(peers, proto.Peer{ID: 4}), false, false) 81 | servers = append(servers, newServer) 82 | time.Sleep(10 * time.Second) 83 | printStatus(servers, w) 84 | resolver.delNode(4) 85 | 86 | for _, s := range servers { 87 | s.raft.Stop() 88 | } 89 | time.Sleep(100 * time.Millisecond) 90 | } 91 | 92 | func TestMemberWithLease(t *testing.T) { 93 | f, w := getLogFile("changemember_lease.log") 94 | defer func() { 95 | w.Flush() 96 | f.Close() 97 | }() 98 | 99 | servers := initTestServer(peers, true, true) 100 | fmt.Println("waiting electing leader....") 101 | leadServer := waitElect(servers, w) 102 | printStatus(servers, w) 103 | 104 | // test add node 105 | w.WriteString(fmt.Sprintf("[%s] Add new node \r\n", time.Now().Format(format_time))) 106 | leader, term := leadServer.raft.LeaderTerm(1) 107 | newServer := createRaftServer(4, leader, term, peers, true, true) 108 | // add node 109 | resolver.addNode(4, 0) 110 | fmt.Println("starting add node") 111 | leadServer.sm.AddNode(proto.Peer{ID: 4}) 112 | fmt.Println("added node") 113 | time.Sleep(time.Second) 114 | servers = append(servers, newServer) 115 | printStatus(servers, w) 116 | 117 | fmt.Println("starting put data") 118 | if err := leadServer.sm.Put("test2", "test2_val"); err != nil { 119 | t.Fatal(err) 120 | } 121 | time.Sleep(time.Second) 122 | if vget, err := newServer.sm.Get("test2"); err != nil || vget != "test2_val" { 123 | t.Fatal("new add node not get the data") 124 | } 125 | fmt.Println("success put data") 126 | 127 | // test remove node 128 | w.WriteString(fmt.Sprintf("[%s] Remove node \r\n", time.Now().Format(format_time))) 129 | fmt.Println("starting remove node") 130 | leadServer.sm.RemoveNode(proto.Peer{ID: 4}) 131 | fmt.Println("removed node") 132 | fmt.Println("starting put data") 133 | if err := leadServer.sm.Put("test3", "test3_val"); err != nil { 134 | t.Fatal(err) 135 | } 136 | fmt.Println("success put data") 137 | newServers := make([]*testServer, 0) 138 | for _, s := 
range servers { 139 | if s.nodeID == newServer.nodeID { 140 | s.raft.Stop() 141 | } else { 142 | newServers = append(newServers, s) 143 | } 144 | } 145 | servers = newServers 146 | time.Sleep(100 * time.Millisecond) 147 | newServer = createRaftServer(4, 0, 10, append(peers, proto.Peer{ID: 4}), false, false) 148 | servers = append(servers, newServer) 149 | time.Sleep(10 * time.Second) 150 | printStatus(servers, w) 151 | resolver.delNode(4) 152 | 153 | for _, s := range servers { 154 | s.raft.Stop() 155 | } 156 | 157 | time.Sleep(100 * time.Millisecond) 158 | } 159 | -------------------------------------------------------------------------------- /raft_replica.go: -------------------------------------------------------------------------------- 1 | // Copyright 2015 The etcd Authors 2 | // Modified work copyright 2018 The tiglabs Authors. 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 15 | 16 | package raft 17 | 18 | import ( 19 | "fmt" 20 | "time" 21 | 22 | "github.com/tiglabs/raft/proto" 23 | "github.com/tiglabs/raft/util" 24 | ) 25 | 26 | // replication represents a follower’s progress of replicate in the view of the leader. 27 | // Leader maintains progresses of all followers, and sends entries to the follower based on its progress. 
28 | type replica struct { 29 | inflight 30 | peer proto.Peer 31 | state replicaState 32 | paused, active, pending bool 33 | match, next, committed, pendingSnap uint64 34 | 35 | lastActive time.Time 36 | } 37 | 38 | func newReplica(peer proto.Peer, maxInflight int) *replica { 39 | repl := &replica{ 40 | peer: peer, 41 | state: replicaStateProbe, 42 | lastActive: time.Now(), 43 | } 44 | if maxInflight > 0 { 45 | repl.inflight.size = maxInflight 46 | repl.inflight.buffer = make([]uint64, maxInflight) 47 | } 48 | 49 | return repl 50 | } 51 | 52 | func (r *replica) resetState(state replicaState) { 53 | r.paused = false 54 | r.pendingSnap = 0 55 | r.state = state 56 | r.reset() 57 | } 58 | 59 | func (r *replica) becomeProbe() { 60 | if r.state == replicaStateSnapshot { 61 | pendingSnap := r.pendingSnap 62 | r.resetState(replicaStateProbe) 63 | r.next = util.Max(r.match+1, pendingSnap+1) 64 | } else { 65 | r.resetState(replicaStateProbe) 66 | r.next = r.match + 1 67 | } 68 | } 69 | 70 | func (r *replica) becomeReplicate() { 71 | r.resetState(replicaStateReplicate) 72 | r.next = r.match + 1 73 | } 74 | 75 | func (r *replica) becomeSnapshot(index uint64) { 76 | r.resetState(replicaStateSnapshot) 77 | r.pendingSnap = index 78 | } 79 | 80 | func (r *replica) update(index uint64) { 81 | r.next = index + 1 82 | } 83 | 84 | func (r *replica) maybeUpdate(index, commit uint64) bool { 85 | updated := false 86 | if r.committed < commit { 87 | r.committed = commit 88 | } 89 | if r.match < index { 90 | r.match = index 91 | updated = true 92 | r.resume() 93 | } 94 | next := index + 1 95 | if r.next < next { 96 | r.next = next 97 | } 98 | return updated 99 | } 100 | 101 | func (r *replica) maybeDecrTo(rejected, last, commit uint64) bool { 102 | if r.state == replicaStateReplicate { 103 | if r.committed < commit { 104 | r.committed = commit 105 | } 106 | if rejected <= r.match { 107 | return false 108 | } 109 | r.next = r.match + 1 110 | return true 111 | } 112 | //Probe State 113 | if 
r.next-1 != rejected { 114 | return false 115 | } 116 | if r.next = util.Min(rejected, last+1); r.next < 1 { 117 | r.next = 1 118 | } 119 | r.committed = commit 120 | r.resume() 121 | return true 122 | } 123 | 124 | func (r *replica) snapshotFailure() { r.pendingSnap = 0 } 125 | 126 | func (r *replica) needSnapshotAbort() bool { 127 | return r.state == replicaStateSnapshot && r.match >= r.pendingSnap 128 | } 129 | 130 | func (r *replica) pause() { r.paused = true } 131 | 132 | func (r *replica) resume() { r.paused = false } 133 | 134 | func (r *replica) isPaused() bool { 135 | switch r.state { 136 | case replicaStateProbe: 137 | return r.paused 138 | case replicaStateSnapshot: 139 | return true 140 | default: 141 | return r.full() 142 | } 143 | } 144 | 145 | func (r *replica) String() string { 146 | return fmt.Sprintf("next = %d, match = %d, commit = %d, state = %s, waiting = %v, pendingSnapshot = %d", r.next, r.match, r.committed, r.state, r.isPaused(), r.pendingSnap) 147 | } 148 | 149 | // inflight is the replication sliding window,avoid overflowing that sending buffer. 
150 | type inflight struct { 151 | start int 152 | count int 153 | size int 154 | buffer []uint64 155 | } 156 | 157 | func (in *inflight) add(index uint64) { 158 | if in.full() { 159 | panic(AppPanicError(fmt.Sprint("inflight.add cannot add into a full inflights."))) 160 | } 161 | next := in.start + in.count 162 | if next >= in.size { 163 | next = next - in.size 164 | } 165 | in.buffer[next] = index 166 | in.count = in.count + 1 167 | } 168 | 169 | func (in *inflight) freeTo(index uint64) { 170 | if in.count == 0 || index < in.buffer[in.start] { 171 | return 172 | } 173 | i, idx := 0, in.start 174 | for ; i < in.count; i++ { 175 | if index < in.buffer[idx] { 176 | break 177 | } 178 | if idx = idx + 1; idx >= in.size { 179 | idx = idx - in.size 180 | } 181 | } 182 | in.count = in.count - i 183 | in.start = idx 184 | } 185 | 186 | func (in *inflight) freeFirstOne() { 187 | in.freeTo(in.buffer[in.start]) 188 | } 189 | 190 | func (in *inflight) full() bool { 191 | return in.count == in.size 192 | } 193 | 194 | func (in *inflight) reset() { 195 | in.count = 0 196 | in.start = 0 197 | } 198 | -------------------------------------------------------------------------------- /test/testserver.go: -------------------------------------------------------------------------------- 1 | // Copyright 2018 The tiglabs raft Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package test 16 | 17 | import ( 18 | "bufio" 19 | "fmt" 20 | "math/rand" 21 | "os" 22 | "path/filepath" 23 | "runtime" 24 | "sync" 25 | "time" 26 | 27 | "github.com/tiglabs/raft" 28 | "github.com/tiglabs/raft/proto" 29 | "github.com/tiglabs/raft/storage" 30 | ) 31 | 32 | var ( 33 | testSnap = true 34 | storageType = 0 35 | elcTick = 5 36 | htbTick = 1 37 | tickInterval = 100 * time.Millisecond 38 | resolver = newNodeManager() 39 | 40 | temp = "0123456789abcdefghijklmnopqrstuvwxyz" 41 | format_time = "2006-01-02 15:04:05.000" 42 | 43 | peers = []proto.Peer{{ID: 1}, {ID: 2}, {ID: 3}} 44 | ) 45 | 46 | func init() { 47 | numCpu := runtime.NumCPU() 48 | runtime.GOMAXPROCS(numCpu) 49 | fmt.Printf("[System], Cpu Num = [%d]\r\n", numCpu) 50 | } 51 | 52 | type replAddr struct { 53 | heart string 54 | repl string 55 | } 56 | 57 | type nodeManager struct { 58 | sync.Mutex 59 | nodes map[uint64]int 60 | allAddrs map[uint64]replAddr 61 | } 62 | 63 | func newNodeManager() *nodeManager { 64 | nm := new(nodeManager) 65 | nm.nodes = map[uint64]int{1: 1, 2: 1, 3: 1} 66 | nm.allAddrs = map[uint64]replAddr{1: {heart: "127.0.0.1:8000", repl: "127.0.0.1:9000"}, 2: {heart: "127.0.0.1:8001", repl: "127.0.0.1:9001"}, 3: {heart: "127.0.0.1:8002", repl: "127.0.0.1:9002"}, 4: {heart: "127.0.0.1:8003", repl: "127.0.0.1:9003"}} 67 | return nm 68 | } 69 | 70 | func (nm *nodeManager) addNode(nodeId uint64, pri int) { 71 | nm.Lock() 72 | defer nm.Unlock() 73 | 74 | nm.nodes[nodeId] = pri 75 | } 76 | 77 | func (nm *nodeManager) delNode(nodeId uint64) { 78 | nm.Lock() 79 | defer nm.Unlock() 80 | 81 | delete(nm.nodes, nodeId) 82 | } 83 | 84 | func (nm *nodeManager) AllNodes() []uint64 { 85 | nm.Lock() 86 | defer nm.Unlock() 87 | 88 | nodes := make([]uint64, 0) 89 | for k := range nm.nodes { 90 | nodes = append(nodes, k) 91 | } 92 | return nodes 93 | } 94 | 95 | func (nm *nodeManager) NodeAddress(nodeID uint64, stype raft.SocketType) (string, error) { 96 | addr := nm.allAddrs[nodeID] 97 | if 
stype == raft.HeartBeat { 98 | return addr.heart, nil 99 | } 100 | return addr.repl, nil 101 | } 102 | 103 | func randomStr(size int) string { 104 | rand.Seed(time.Now().UnixNano()) 105 | curr := make([]byte, size) 106 | for i := 0; i < size; i++ { 107 | curr[i] = temp[rand.Int()%36] 108 | } 109 | return string(curr) 110 | } 111 | 112 | type testServer struct { 113 | isLease bool 114 | nodeID uint64 115 | sm *memoryStatemachine 116 | store storage.Storage 117 | raft *raft.RaftServer 118 | peers []proto.Peer 119 | hardState proto.HardState 120 | } 121 | 122 | func initTestServer(peers []proto.Peer, isLease, clear bool) []*testServer { 123 | rs := make([]*testServer, 0) 124 | for _, p := range peers { 125 | rs = append(rs, createRaftServer(p.ID, 0, 0, peers, isLease, clear)) 126 | } 127 | return rs 128 | } 129 | 130 | func createRaftServer(nodeId, leader, term uint64, peers []proto.Peer, isLease, clear bool) *testServer { 131 | config := raft.DefaultConfig() 132 | config.NodeID = nodeId 133 | config.TickInterval = tickInterval 134 | config.HeartbeatTick = htbTick 135 | config.ElectionTick = elcTick 136 | config.LeaseCheck = isLease 137 | config.HeartbeatAddr = resolver.allAddrs[nodeId].heart 138 | config.ReplicateAddr = resolver.allAddrs[nodeId].repl 139 | config.Resolver = resolver 140 | config.RetainLogs = 0 141 | 142 | rs, err := raft.NewRaftServer(config) 143 | if err != nil { 144 | panic(err) 145 | } 146 | 147 | sm := newMemoryStatemachine(1, rs) 148 | st := getStorage(rs) 149 | if clear { 150 | st.ApplySnapshot(proto.SnapshotMeta{}) 151 | } 152 | raftConfig := &raft.RaftConfig{ 153 | ID: 1, 154 | Peers: peers, 155 | Term: term, 156 | Leader: leader, 157 | Storage: st, 158 | StateMachine: sm, 159 | } 160 | err = rs.CreateRaft(raftConfig) 161 | if err != nil { 162 | panic(err) 163 | } 164 | return &testServer{ 165 | nodeID: nodeId, 166 | peers: peers, 167 | isLease: isLease, 168 | raft: rs, 169 | sm: sm, 170 | store: st, 171 | } 172 | } 173 | 174 | func 
getStorage(raft *raft.RaftServer) storage.Storage { 175 | switch storageType { 176 | case 0: 177 | return storage.NewMemoryStorage(raft, 1, 8192) 178 | } 179 | return nil 180 | } 181 | 182 | func getTestPath() string { 183 | path := os.TempDir() + string(filepath.Separator) + "rafttest" 184 | os.MkdirAll(path, os.ModePerm) 185 | return path 186 | } 187 | 188 | func getLogFile(name string) (*os.File, *bufio.Writer) { 189 | filename := getTestPath() + string(filepath.Separator) + name 190 | f, err := os.OpenFile(filename, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, os.ModePerm) 191 | if err != nil { 192 | panic(err) 193 | } 194 | w := bufio.NewWriter(f) 195 | return f, w 196 | } 197 | 198 | func getCurrentNanoTime() int64 { 199 | return time.Now().UnixNano() 200 | } 201 | -------------------------------------------------------------------------------- /raft_log_unstable_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2015 The etcd Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package raft 16 | 17 | import ( 18 | "reflect" 19 | "testing" 20 | 21 | "github.com/tiglabs/raft/proto" 22 | ) 23 | 24 | func TestMaybeLastIndex(t *testing.T) { 25 | tests := []struct { 26 | entries []*proto.Entry 27 | offset uint64 28 | wok bool 29 | windex uint64 30 | }{ 31 | // last in entries 32 | { 33 | []*proto.Entry{{Index: 5, Term: 1}}, 5, true, 5, 34 | }, 35 | // empty unstable 36 | { 37 | []*proto.Entry{}, 0, false, 0, 38 | }, 39 | } 40 | 41 | for i, tt := range tests { 42 | u := unstable{ 43 | entries: tt.entries, 44 | offset: tt.offset, 45 | } 46 | index, ok := u.maybeLastIndex() 47 | if ok != tt.wok { 48 | t.Errorf("#%d: ok = %t, want %t", i, ok, tt.wok) 49 | } 50 | if index != tt.windex { 51 | t.Errorf("#%d: index = %d, want %d", i, index, tt.windex) 52 | } 53 | } 54 | } 55 | 56 | func TestUnstableMaybeTerm(t *testing.T) { 57 | tests := []struct { 58 | entries []*proto.Entry 59 | offset uint64 60 | index uint64 61 | wok bool 62 | wterm uint64 63 | }{ 64 | // term from entries 65 | { 66 | []*proto.Entry{{Index: 5, Term: 1}}, 5, 5, true, 1, 67 | }, 68 | { 69 | []*proto.Entry{{Index: 5, Term: 1}}, 5, 6, false, 0, 70 | }, 71 | { 72 | []*proto.Entry{{Index: 5, Term: 1}}, 5, 4, false, 0, 73 | }, 74 | { 75 | []*proto.Entry{}, 0, 5, false, 0, 76 | }, 77 | } 78 | 79 | for i, tt := range tests { 80 | u := unstable{ 81 | entries: tt.entries, 82 | offset: tt.offset, 83 | } 84 | term, ok := u.maybeTerm(tt.index) 85 | if ok != tt.wok { 86 | t.Errorf("#%d: ok = %t, want %t", i, ok, tt.wok) 87 | } 88 | if term != tt.wterm { 89 | t.Errorf("#%d: term = %d, want %d", i, term, tt.wterm) 90 | } 91 | } 92 | } 93 | 94 | func TestUnstableRestore(t *testing.T) { 95 | u := unstable{ 96 | entries: []*proto.Entry{{Index: 5, Term: 1}}, 97 | offset: 5, 98 | } 99 | u.restore(6) 100 | 101 | if u.offset != 7 { 102 | t.Errorf("offset = %d, want %d", u.offset, 7) 103 | } 104 | if len(u.entries) != 0 { 105 | t.Errorf("len = %d, want 0", len(u.entries)) 106 | } 107 | } 108 | 
109 | func TestUnstableStableTo(t *testing.T) { 110 | tests := []struct { 111 | entries []*proto.Entry 112 | offset uint64 113 | index, term uint64 114 | woffset uint64 115 | wlen int 116 | }{ 117 | { 118 | []*proto.Entry{}, 0, 5, 1, 0, 0, 119 | }, 120 | { 121 | []*proto.Entry{{Index: 5, Term: 1}}, 5, 5, 1, 6, 0, 122 | }, 123 | { 124 | []*proto.Entry{{Index: 5, Term: 1}, {Index: 6, Term: 1}}, 5, 125 | 5, 1, 6, 1, 126 | }, 127 | { 128 | []*proto.Entry{{Index: 6, Term: 2}}, 6, 6, 1, 6, 1, 129 | }, 130 | { 131 | []*proto.Entry{{Index: 5, Term: 1}}, 5, 4, 1, 5, 1, 132 | }, 133 | { 134 | []*proto.Entry{{Index: 5, Term: 1}}, 5, 4, 2, 5, 1, 135 | }, 136 | } 137 | 138 | for i, tt := range tests { 139 | u := unstable{ 140 | entries: tt.entries, 141 | offset: tt.offset, 142 | } 143 | u.stableTo(tt.index, tt.term) 144 | if u.offset != tt.woffset { 145 | t.Errorf("#%d: offset = %d, want %d", i, u.offset, tt.woffset) 146 | } 147 | if len(u.entries) != tt.wlen { 148 | t.Errorf("#%d: len = %d, want %d", i, len(u.entries), tt.wlen) 149 | } 150 | } 151 | } 152 | 153 | func TestUnstableTruncateAndAppend(t *testing.T) { 154 | tests := []struct { 155 | entries []*proto.Entry 156 | offset uint64 157 | toappend []*proto.Entry 158 | 159 | woffset uint64 160 | wentries []*proto.Entry 161 | }{ 162 | // append to the end 163 | { 164 | []*proto.Entry{{Index: 5, Term: 1}}, 5, 165 | []*proto.Entry{{Index: 6, Term: 1}, {Index: 7, Term: 1}}, 166 | 5, []*proto.Entry{{Index: 5, Term: 1}, {Index: 6, Term: 1}, {Index: 7, Term: 1}}, 167 | }, 168 | // replace the unstable entries 169 | { 170 | []*proto.Entry{{Index: 5, Term: 1}}, 5, 171 | []*proto.Entry{{Index: 5, Term: 2}, {Index: 6, Term: 2}}, 172 | 5, []*proto.Entry{{Index: 5, Term: 2}, {Index: 6, Term: 2}}, 173 | }, 174 | { 175 | []*proto.Entry{{Index: 5, Term: 1}}, 5, 176 | []*proto.Entry{{Index: 4, Term: 2}, {Index: 5, Term: 2}, {Index: 6, Term: 2}}, 177 | 4, []*proto.Entry{{Index: 4, Term: 2}, {Index: 5, Term: 2}, {Index: 6, Term: 2}}, 178 | }, 
179 | // truncate the existing entries and append 180 | { 181 | []*proto.Entry{{Index: 5, Term: 1}, {Index: 6, Term: 1}, {Index: 7, Term: 1}}, 5, 182 | []*proto.Entry{{Index: 6, Term: 2}}, 183 | 5, []*proto.Entry{{Index: 5, Term: 1}, {Index: 6, Term: 2}}, 184 | }, 185 | { 186 | []*proto.Entry{{Index: 5, Term: 1}, {Index: 6, Term: 1}, {Index: 7, Term: 1}}, 5, 187 | []*proto.Entry{{Index: 7, Term: 2}, {Index: 8, Term: 2}}, 188 | 5, []*proto.Entry{{Index: 5, Term: 1}, {Index: 6, Term: 1}, {Index: 7, Term: 2}, {Index: 8, Term: 2}}, 189 | }, 190 | } 191 | 192 | for i, tt := range tests { 193 | u := unstable{ 194 | entries: tt.entries, 195 | offset: tt.offset, 196 | } 197 | u.truncateAndAppend(tt.toappend) 198 | if u.offset != tt.woffset { 199 | t.Errorf("#%d: offset = %d, want %d", i, u.offset, tt.woffset) 200 | } 201 | if !reflect.DeepEqual(u.entries, tt.wentries) { 202 | t.Errorf("#%d: entries = %v, want %v", i, u.entries, tt.wentries) 203 | } 204 | } 205 | } 206 | -------------------------------------------------------------------------------- /raft_snapshot.go: -------------------------------------------------------------------------------- 1 | // Copyright 2015 The etcd Authors 2 | // Modified work copyright 2018 The tiglabs Authors. 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 
15 | 16 | package raft 17 | 18 | import ( 19 | "encoding/binary" 20 | "fmt" 21 | "io" 22 | 23 | "github.com/tiglabs/raft/logger" 24 | "github.com/tiglabs/raft/proto" 25 | "github.com/tiglabs/raft/util" 26 | ) 27 | 28 | type snapshotStatus struct { 29 | respErr 30 | stopCh chan struct{} 31 | } 32 | 33 | func newSnapshotStatus() *snapshotStatus { 34 | f := &snapshotStatus{ 35 | stopCh: make(chan struct{}), 36 | } 37 | f.init() 38 | return f 39 | } 40 | 41 | type snapshotRequest struct { 42 | respErr 43 | snapshotReader 44 | header *proto.Message 45 | } 46 | 47 | func newSnapshotRequest(m *proto.Message, r *util.BufferReader) *snapshotRequest { 48 | f := &snapshotRequest{ 49 | header: m, 50 | snapshotReader: snapshotReader{reader: r}, 51 | } 52 | f.init() 53 | return f 54 | } 55 | 56 | func (r *snapshotRequest) response() error { 57 | return <-r.error() 58 | } 59 | 60 | type snapshotReader struct { 61 | reader *util.BufferReader 62 | err error 63 | } 64 | 65 | func (r *snapshotReader) Next() ([]byte, error) { 66 | if r.err != nil { 67 | return nil, r.err 68 | } 69 | 70 | // read size header 71 | r.reader.Reset() 72 | var buf []byte 73 | if buf, r.err = r.reader.ReadFull(4); r.err != nil { 74 | return nil, r.err 75 | } 76 | size := uint64(binary.BigEndian.Uint32(buf)) 77 | if size == 0 { 78 | r.err = io.EOF 79 | return nil, r.err 80 | } 81 | 82 | // read data 83 | r.reader.Reset() 84 | if buf, r.err = r.reader.ReadFull(int(size)); r.err != nil { 85 | return nil, r.err 86 | } 87 | 88 | return buf, nil 89 | } 90 | 91 | func (s *raft) addSnapping(nodeID uint64, rs *snapshotStatus) { 92 | s.mu.Lock() 93 | defer s.mu.Unlock() 94 | 95 | if snap, ok := s.snapping[nodeID]; ok { 96 | close(snap.stopCh) 97 | } 98 | s.snapping[nodeID] = rs 99 | } 100 | 101 | func (s *raft) removeSnapping(nodeID uint64) { 102 | s.mu.Lock() 103 | defer s.mu.Unlock() 104 | 105 | if snap, ok := s.snapping[nodeID]; ok { 106 | close(snap.stopCh) 107 | delete(s.snapping, nodeID) 108 | } 109 | } 110 | 
111 | func (s *raft) stopSnapping() { 112 | s.mu.Lock() 113 | defer s.mu.Unlock() 114 | 115 | for id, snap := range s.snapping { 116 | close(snap.stopCh) 117 | delete(s.snapping, id) 118 | } 119 | } 120 | 121 | func (s *raft) sendSnapshot(m *proto.Message) { 122 | util.RunWorker(func() { 123 | defer func() { 124 | s.removeSnapping(m.To) 125 | m.Snapshot.Close() 126 | proto.ReturnMessage(m) 127 | }() 128 | 129 | // send snapshot 130 | rs := newSnapshotStatus() 131 | s.addSnapping(m.To, rs) 132 | s.config.transport.SendSnapshot(m, rs) 133 | select { 134 | case <-s.stopc: 135 | return 136 | case <-rs.stopCh: 137 | return 138 | case err := <-rs.error(): 139 | nmsg := proto.GetMessage() 140 | nmsg.Type = proto.RespMsgSnapShot 141 | nmsg.ID = m.ID 142 | nmsg.From = m.To 143 | nmsg.Reject = (err != nil) 144 | s.recvc <- nmsg 145 | } 146 | }, func(err interface{}) { 147 | s.doStop() 148 | s.handlePanic(err) 149 | }) 150 | } 151 | 152 | func (s *raft) handleSnapshot(req *snapshotRequest) { 153 | s.restoringSnapshot.Set(true) 154 | var err error 155 | defer func() { 156 | req.respond(err) 157 | s.resetTick() 158 | s.restoringSnapshot.Set(false) 159 | proto.ReturnMessage(req.header) 160 | }() 161 | 162 | // validate snapshot 163 | if req.header.Term < s.raftFsm.term { 164 | err = fmt.Errorf("raft %v [term: %d] ignored a snapshot message with lower term from %v [term: %d]", s.raftFsm.id, s.raftFsm.term, req.header.From, req.header.Term) 165 | return 166 | } 167 | if req.header.Term > s.raftFsm.term || s.raftFsm.state != stateFollower { 168 | s.raftFsm.becomeFollower(req.header.Term, req.header.From) 169 | s.maybeChange(true) 170 | } 171 | if !s.raftFsm.checkSnapshot(req.header.SnapshotMeta) { 172 | if logger.IsEnableWarn() { 173 | logger.Warn("raft %v [commit: %d] ignored snapshot [index: %d, term: %d].", s.raftFsm.id, s.raftFsm.raftLog.committed, req.header.SnapshotMeta.Index, req.header.SnapshotMeta.Term) 174 | } 175 | nmsg := proto.GetMessage() 176 | nmsg.Type = 
proto.RespMsgAppend 177 | nmsg.To = req.header.From 178 | nmsg.Index = s.raftFsm.raftLog.committed 179 | nmsg.Commit = s.raftFsm.raftLog.committed 180 | s.raftFsm.send(nmsg) 181 | return 182 | } 183 | 184 | // restore snapshot 185 | s.raftConfig.Storage.ApplySnapshot(proto.SnapshotMeta{}) 186 | if err = s.raftConfig.StateMachine.ApplySnapshot(req.header.SnapshotMeta.Peers, req); err != nil { 187 | return 188 | } 189 | if err = s.raftConfig.Storage.ApplySnapshot(req.header.SnapshotMeta); err != nil { 190 | return 191 | } 192 | s.raftFsm.restore(req.header.SnapshotMeta) 193 | s.peerState.replace(req.header.SnapshotMeta.Peers) 194 | s.curApplied.Set(req.header.SnapshotMeta.Index) 195 | 196 | // send snapshot response message 197 | if logger.IsEnableDebug() { 198 | logger.Warn("raft %v [commit: %d] restored snapshot [index: %d, term: %d]", 199 | s.raftFsm.id, s.raftFsm.raftLog.committed, req.header.SnapshotMeta.Index, req.header.SnapshotMeta.Term) 200 | } 201 | nmsg := proto.GetMessage() 202 | nmsg.Type = proto.RespMsgAppend 203 | nmsg.To = req.header.From 204 | nmsg.Index = s.raftFsm.raftLog.lastIndex() 205 | nmsg.Commit = s.raftFsm.raftLog.committed 206 | s.raftFsm.send(nmsg) 207 | } 208 | -------------------------------------------------------------------------------- /proto/proto.go: -------------------------------------------------------------------------------- 1 | // Copyright 2018 The tiglabs raft Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package proto 16 | 17 | import ( 18 | "fmt" 19 | ) 20 | 21 | type ( 22 | MsgType byte 23 | EntryType byte 24 | ConfChangeType byte 25 | PeerType byte 26 | ) 27 | 28 | const ( 29 | ReqMsgAppend MsgType = iota 30 | ReqMsgVote 31 | ReqMsgHeartBeat 32 | ReqMsgSnapShot 33 | ReqMsgElectAck 34 | RespMsgAppend 35 | RespMsgVote 36 | RespMsgHeartBeat 37 | RespMsgSnapShot 38 | RespMsgElectAck 39 | LocalMsgHup 40 | LocalMsgProp 41 | LeaseMsgOffline 42 | LeaseMsgTimeout 43 | ReqCheckQuorum 44 | RespCheckQuorum 45 | ) 46 | 47 | const ( 48 | ConfAddNode ConfChangeType = 0 49 | ConfRemoveNode ConfChangeType = 1 50 | ConfUpdateNode ConfChangeType = 2 51 | 52 | EntryNormal EntryType = 0 53 | EntryConfChange EntryType = 1 54 | 55 | PeerNormal PeerType = 0 56 | PeerArbiter PeerType = 1 57 | ) 58 | 59 | // The Snapshot interface is supplied by the application to access the snapshot data of application. 60 | type Snapshot interface { 61 | SnapIterator 62 | ApplyIndex() uint64 63 | Close() 64 | } 65 | 66 | type SnapIterator interface { 67 | // if error=io.EOF represent snapshot terminated. 68 | Next() ([]byte, error) 69 | } 70 | 71 | type SnapshotMeta struct { 72 | Index uint64 73 | Term uint64 74 | Peers []Peer 75 | } 76 | 77 | type Peer struct { 78 | Type PeerType 79 | Priority uint16 80 | ID uint64 // NodeID 81 | PeerID uint64 // Replica ID, unique over all raft groups and all replicas in the same group 82 | } 83 | 84 | // HardState is the repl state,must persist to the storage. 85 | type HardState struct { 86 | Term uint64 87 | Commit uint64 88 | Vote uint64 89 | } 90 | 91 | // Entry is the repl log entry. 92 | type Entry struct { 93 | Type EntryType 94 | Term uint64 95 | Index uint64 96 | Data []byte 97 | } 98 | 99 | // Message is the transport message. 
100 | type Message struct { 101 | Type MsgType 102 | ForceVote bool 103 | Reject bool 104 | RejectIndex uint64 105 | ID uint64 106 | From uint64 107 | To uint64 108 | Term uint64 109 | LogTerm uint64 110 | Index uint64 111 | Commit uint64 112 | SnapshotMeta SnapshotMeta 113 | Entries []*Entry 114 | Context []byte 115 | Snapshot Snapshot // No need for codec 116 | } 117 | 118 | func (m *Message) ToString() (mesg string) { 119 | return fmt.Sprintf("Mesg:[%v] type(%v) ForceVote(%v) Reject(%v) RejectIndex(%v) "+ 120 | "From(%v) To(%v) Term(%v) LogTrem(%v) Index(%v) Commit(%v)", m.ID, m.Type.String(), m.ForceVote, 121 | m.Reject, m.RejectIndex, m.From, m.To, m.Term, m.LogTerm, m.Index, m.Commit) 122 | } 123 | 124 | type ConfChange struct { 125 | Type ConfChangeType 126 | Peer Peer 127 | Context []byte 128 | } 129 | 130 | type HeartbeatContext []uint64 131 | 132 | func (t MsgType) String() string { 133 | switch t { 134 | case 0: 135 | return "ReqMsgAppend" 136 | case 1: 137 | return "ReqMsgVote" 138 | case 2: 139 | return "ReqMsgHeartBeat" 140 | case 3: 141 | return "ReqMsgSnapShot" 142 | case 4: 143 | return "ReqMsgElectAck" 144 | case 5: 145 | return "RespMsgAppend" 146 | case 6: 147 | return "RespMsgVote" 148 | case 7: 149 | return "RespMsgHeartBeat" 150 | case 8: 151 | return "RespMsgSnapShot" 152 | case 9: 153 | return "RespMsgElectAck" 154 | case 10: 155 | return "LocalMsgHup" 156 | case 11: 157 | return "LocalMsgProp" 158 | case 12: 159 | return "LeaseMsgOffline" 160 | case 13: 161 | return "LeaseMsgTimeout" 162 | case 14: 163 | return "ReqCheckQuorum" 164 | case 15: 165 | return "RespCheckQuorum" 166 | } 167 | return "unkown" 168 | } 169 | 170 | func (t EntryType) String() string { 171 | switch t { 172 | case 0: 173 | return "EntryNormal" 174 | case 1: 175 | return "EntryConfChange" 176 | } 177 | return "unkown" 178 | } 179 | 180 | func (t ConfChangeType) String() string { 181 | switch t { 182 | case 0: 183 | return "ConfAddNode" 184 | case 1: 185 | return 
"ConfRemoveNode" 186 | case 2: 187 | return "ConfUpdateNode" 188 | } 189 | return "unkown" 190 | } 191 | 192 | func (t PeerType) String() string { 193 | switch t { 194 | case 0: 195 | return "PeerNormal" 196 | case 1: 197 | return "PeerArbiter" 198 | } 199 | return "unkown" 200 | } 201 | 202 | func (p Peer) String() string { 203 | return fmt.Sprintf(`"nodeID":"%v","peerID":"%v","priority":"%v","type":"%v"`, 204 | p.ID, p.PeerID, p.Priority, p.Type.String()) 205 | } 206 | 207 | func (cc *ConfChange) String() string { 208 | return fmt.Sprintf(`{"type":"%v",%v}`, cc.Type, cc.Peer.String()) 209 | } 210 | 211 | func (m *Message) IsResponseMsg() bool { 212 | return m.Type == RespMsgAppend || m.Type == RespMsgHeartBeat || m.Type == RespMsgVote || 213 | m.Type == RespMsgElectAck || m.Type == RespMsgSnapShot || m.Type == RespCheckQuorum 214 | } 215 | 216 | func (m *Message) IsElectionMsg() bool { 217 | return m.Type == ReqMsgHeartBeat || m.Type == RespMsgHeartBeat || m.Type == ReqMsgVote || m.Type == RespMsgVote || 218 | m.Type == ReqMsgElectAck || m.Type == RespMsgElectAck || m.Type == LeaseMsgOffline || m.Type == LeaseMsgTimeout 219 | } 220 | 221 | func (m *Message) IsHeartbeatMsg() bool { 222 | return m.Type == ReqMsgHeartBeat || m.Type == RespMsgHeartBeat 223 | } 224 | 225 | func (s *HardState) IsEmpty() bool { 226 | return s.Term == 0 && s.Vote == 0 && s.Commit == 0 227 | } 228 | -------------------------------------------------------------------------------- /raft_fsm_follower.go: -------------------------------------------------------------------------------- 1 | // Copyright 2015 The etcd Authors 2 | // Modified work copyright 2018 The tiglabs Authors. 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 
6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 15 | 16 | package raft 17 | 18 | import ( 19 | "math" 20 | 21 | "github.com/tiglabs/raft/logger" 22 | "github.com/tiglabs/raft/proto" 23 | ) 24 | 25 | func (r *raftFsm) becomeFollower(term, lead uint64) { 26 | r.step = stepFollower 27 | r.reset(term, 0, false) 28 | r.tick = r.tickElection 29 | r.leader = lead 30 | r.state = stateFollower 31 | if logger.IsEnableDebug() { 32 | logger.Debug("[raft][%v] became follower at term[%d] leader[%d].", r.id, r.term, r.leader) 33 | } 34 | } 35 | 36 | func stepFollower(r *raftFsm, m *proto.Message) { 37 | switch m.Type { 38 | case proto.LocalMsgProp: 39 | if r.leader == NoLeader { 40 | if logger.IsEnableWarn() { 41 | logger.Warn("raft[%v] no leader at term %d; dropping proposal.", r.id, r.term) 42 | } 43 | return 44 | } 45 | m.To = r.leader 46 | r.send(m) 47 | return 48 | 49 | case proto.ReqMsgAppend: 50 | r.electionElapsed = 0 51 | r.leader = m.From 52 | r.handleAppendEntries(m) 53 | proto.ReturnMessage(m) 54 | return 55 | 56 | case proto.ReqMsgHeartBeat: 57 | r.electionElapsed = 0 58 | r.leader = m.From 59 | return 60 | 61 | case proto.ReqMsgElectAck: 62 | r.electionElapsed = 0 63 | r.leader = m.From 64 | nmsg := proto.GetMessage() 65 | nmsg.Type = proto.RespMsgElectAck 66 | nmsg.To = m.From 67 | r.send(nmsg) 68 | proto.ReturnMessage(m) 69 | return 70 | 71 | case proto.ReqCheckQuorum: 72 | // TODO: remove this 73 | if logger.IsEnableDebug() { 74 | logger.Debug("raft[%d] recv check quorum from %d, index=%d", r.id, m.From, m.Index) 75 | } 76 | r.electionElapsed = 0 77 | r.leader = 
m.From 78 | nmsg := proto.GetMessage() 79 | nmsg.Type = proto.RespCheckQuorum 80 | nmsg.Index = m.Index 81 | nmsg.To = m.From 82 | r.send(nmsg) 83 | proto.ReturnMessage(m) 84 | return 85 | 86 | case proto.ReqMsgVote: 87 | fpri, lpri := uint16(math.MaxUint16), uint16(0) 88 | if pr, ok := r.replicas[m.From]; ok { 89 | fpri = pr.peer.Priority 90 | } 91 | if pr, ok := r.replicas[r.config.NodeID]; ok { 92 | lpri = pr.peer.Priority 93 | } 94 | 95 | if (!r.config.LeaseCheck || r.leader == NoLeader) && (r.vote == NoLeader || r.vote == m.From) && r.raftLog.isUpToDate(m.Index, m.LogTerm, fpri, lpri) { 96 | r.electionElapsed = 0 97 | if logger.IsEnableDebug() { 98 | logger.Debug("raft[%v] [logterm: %d, index: %d, vote: %v] voted for %v [logterm: %d, index: %d] at term %d.", r.id, r.raftLog.lastTerm(), r.raftLog.lastIndex(), r.vote, m.From, m.LogTerm, m.Index, r.term) 99 | } 100 | r.vote = m.From 101 | nmsg := proto.GetMessage() 102 | nmsg.Type = proto.RespMsgVote 103 | nmsg.To = m.From 104 | r.send(nmsg) 105 | } else { 106 | if logger.IsEnableDebug() { 107 | logger.Debug("raf[%v] [logterm: %d, index: %d, vote: %v] rejected vote from %v [logterm: %d, index: %d] at term %d.", r.id, r.raftLog.lastTerm(), r.raftLog.lastIndex(), r.vote, m.From, m.LogTerm, m.Index, r.term) 108 | } 109 | nmsg := proto.GetMessage() 110 | nmsg.Type = proto.RespMsgVote 111 | nmsg.To = m.From 112 | nmsg.Reject = true 113 | r.send(nmsg) 114 | } 115 | proto.ReturnMessage(m) 116 | return 117 | 118 | case proto.LeaseMsgTimeout: 119 | if r.leader == m.From { 120 | r.electionElapsed = 0 121 | nmsg := proto.GetMessage() 122 | nmsg.Type = proto.LocalMsgHup 123 | nmsg.From = r.config.NodeID 124 | r.Step(nmsg) 125 | } 126 | proto.ReturnMessage(m) 127 | return 128 | } 129 | } 130 | 131 | func (r *raftFsm) tickElection() { 132 | if !r.promotable() { 133 | r.electionElapsed = 0 134 | return 135 | } 136 | 137 | r.electionElapsed++ 138 | timeout := false 139 | // check follower lease (2 * electiontimeout) 140 | if 
r.config.LeaseCheck && r.leader != NoLeader && r.state == stateFollower { 141 | timeout = (r.electionElapsed >= (r.config.ElectionTick << 1)) 142 | } else { 143 | timeout = r.pastElectionTimeout() 144 | } 145 | if timeout { 146 | r.electionElapsed = 0 147 | m := proto.GetMessage() 148 | m.Type = proto.LocalMsgHup 149 | m.From = r.config.NodeID 150 | r.Step(m) 151 | } 152 | } 153 | 154 | func (r *raftFsm) handleAppendEntries(m *proto.Message) { 155 | if m.Index < r.raftLog.committed { 156 | nmsg := proto.GetMessage() 157 | nmsg.Type = proto.RespMsgAppend 158 | nmsg.To = m.From 159 | nmsg.Index = r.raftLog.committed 160 | nmsg.Commit = r.raftLog.committed 161 | r.send(nmsg) 162 | return 163 | } 164 | 165 | if mlastIndex, ok := r.raftLog.maybeAppend(m.Index, m.LogTerm, m.Commit, m.Entries...); ok { 166 | nmsg := proto.GetMessage() 167 | nmsg.Type = proto.RespMsgAppend 168 | nmsg.To = m.From 169 | nmsg.Index = mlastIndex 170 | nmsg.Commit = r.raftLog.committed 171 | r.send(nmsg) 172 | } else { 173 | if logger.IsEnableDebug() { 174 | logger.Debug("raft[%v logterm: %d, index: %d] rejected msgApp [logterm: %d, index: %d] from %v", 175 | r.id, r.raftLog.zeroTermOnErrCompacted(r.raftLog.term(m.Index)), m.Index, m.LogTerm, m.Index, m.From) 176 | } 177 | nmsg := proto.GetMessage() 178 | nmsg.Type = proto.RespMsgAppend 179 | nmsg.To = m.From 180 | nmsg.Index = m.Index 181 | nmsg.Commit = r.raftLog.committed 182 | nmsg.Reject = true 183 | nmsg.RejectIndex = r.raftLog.lastIndex() 184 | r.send(nmsg) 185 | } 186 | } 187 | 188 | func (r *raftFsm) promotable() bool { 189 | _, ok := r.replicas[r.config.NodeID] 190 | return ok 191 | } 192 | -------------------------------------------------------------------------------- /read_only.go: -------------------------------------------------------------------------------- 1 | // Copyright 2015 The etcd Authors 2 | // Modified work copyright 2018 The tiglabs Authors. 
3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 15 | 16 | package raft 17 | 18 | import ( 19 | "fmt" 20 | 21 | "github.com/tiglabs/raft/logger" 22 | ) 23 | 24 | // ReadOnlyOption read only option 25 | type ReadOnlyOption int 26 | 27 | const ( 28 | // ReadOnlySafe guarantees the linearizability of the read only request by 29 | // communicating with the quorum. It is the default and suggested option. 30 | ReadOnlySafe ReadOnlyOption = iota 31 | // ReadOnlyLeaseBased ensures linearizability of the read only request by 32 | // relying on the leader lease. It can be affected by clock drift. 33 | // If the clock drift is unbounded, leader might keep the lease longer than it 34 | // should (clock can move backward/pause without any bound). ReadIndex is not safe 35 | // in that case. 
36 | ReadOnlyLeaseBased 37 | ) 38 | 39 | type readIndexStatus struct { 40 | index uint64 41 | futures []*Future 42 | acks map[uint64]struct{} 43 | } 44 | 45 | type readIndexReady struct { 46 | index uint64 47 | futures []*Future 48 | } 49 | 50 | type readOnly struct { 51 | id uint64 // raft id 52 | option ReadOnlyOption 53 | 54 | // wait leader to commit an entry in current term 55 | committed bool 56 | // ReadIndex requests before leader commit entry in current term 57 | scratch []*Future 58 | 59 | // wait quorum ack 60 | pendings map[uint64]*readIndexStatus 61 | pendingQueue []uint64 62 | 63 | // quorum acked, wait apply 64 | readys map[uint64]*readIndexReady 65 | readyQueue []uint64 66 | } 67 | 68 | func newReadOnly(id uint64, option ReadOnlyOption) *readOnly { 69 | return &readOnly{ 70 | id: id, 71 | option: option, 72 | pendings: make(map[uint64]*readIndexStatus), 73 | readys: make(map[uint64]*readIndexReady), 74 | } 75 | } 76 | 77 | func (r *readOnly) addPending(index uint64, futures []*Future) { 78 | if status, ok := r.pendings[index]; ok { 79 | status.futures = append(status.futures, futures...) 80 | return 81 | } 82 | 83 | // check index valid 84 | if index <= r.lastPending() { 85 | panic(AppPanicError(fmt.Sprintf("[raft->addReadOnly][%v] invalid index[%d]: less than last[%d]", r.id, index, r.lastPending()))) 86 | } 87 | r.pendingQueue = append(r.pendingQueue, index) 88 | r.pendings[index] = &readIndexStatus{ 89 | index: index, 90 | futures: futures, 91 | acks: make(map[uint64]struct{}), 92 | } 93 | } 94 | 95 | func (r *readOnly) addReady(index uint64, futures []*Future) { 96 | if status, ok := r.readys[index]; ok { 97 | status.futures = append(status.futures, futures...) 
98 | return 99 | } 100 | r.readyQueue = append(r.readyQueue, index) 101 | r.readys[index] = &readIndexReady{ 102 | index: index, 103 | futures: futures, 104 | } 105 | } 106 | 107 | func (r *readOnly) add(index uint64, futures []*Future) { 108 | if !r.committed { 109 | r.scratch = append(r.scratch, futures...) 110 | return 111 | } 112 | 113 | if r.option == ReadOnlyLeaseBased { 114 | r.addReady(index, futures) 115 | } else { 116 | r.addPending(index, futures) 117 | } 118 | } 119 | 120 | func (r *readOnly) commit(index uint64) { 121 | if !r.committed { 122 | r.committed = true 123 | if len(r.scratch) > 0 { 124 | r.add(index, r.scratch) 125 | r.scratch = nil 126 | } 127 | } 128 | } 129 | 130 | func (r *readOnly) lastPending() uint64 { 131 | if len(r.pendingQueue) > 0 { 132 | return r.pendingQueue[len(r.pendingQueue)-1] 133 | } 134 | return 0 135 | } 136 | 137 | func (r *readOnly) recvAck(index uint64, from uint64, quorum int) { 138 | status, ok := r.pendings[index] 139 | if !ok { 140 | return 141 | } 142 | status.acks[from] = struct{}{} 143 | // add one to include an ack from local node 144 | if len(status.acks)+1 >= quorum { 145 | r.advance(index) 146 | } 147 | } 148 | 149 | func (r *readOnly) advance(index uint64) { 150 | var i int 151 | for _, idx := range r.pendingQueue { 152 | if idx > index { 153 | break 154 | } 155 | if rs, ok := r.pendings[idx]; ok { 156 | r.addReady(idx, rs.futures) 157 | delete(r.pendings, idx) 158 | } 159 | i++ 160 | } 161 | r.pendingQueue = r.pendingQueue[i:] 162 | } 163 | 164 | func (r *readOnly) getReady(applied uint64) (futures []*Future) { 165 | if len(r.readyQueue) == 0 { 166 | return nil 167 | } 168 | 169 | var i int 170 | for _, idx := range r.readyQueue { 171 | if idx > applied { 172 | break 173 | } 174 | if rs, ok := r.readys[idx]; ok { 175 | futures = append(futures, rs.futures...) 
176 | delete(r.readys, idx) 177 | } 178 | i++ 179 | } 180 | r.readyQueue = r.readyQueue[i:] 181 | // TODO: remove this when stable 182 | if logger.IsEnableDebug() { 183 | logger.Debug("raft[%d] get ready index %d, futures len: %d", r.id, applied, len(futures)) 184 | } 185 | return 186 | } 187 | 188 | func (r *readOnly) containsUpdate(applied uint64) bool { 189 | return len(r.readyQueue) > 0 && applied >= r.readyQueue[0] 190 | } 191 | 192 | func (r *readOnly) reset(err error) { 193 | respondReadIndex(r.scratch, err) 194 | for _, status := range r.pendings { 195 | respondReadIndex(status.futures, err) 196 | } 197 | for _, ready := range r.readys { 198 | respondReadIndex(ready.futures, err) 199 | } 200 | 201 | r.committed = false 202 | r.scratch = nil 203 | r.pendings = make(map[uint64]*readIndexStatus) 204 | r.pendingQueue = nil 205 | r.readys = make(map[uint64]*readIndexReady) 206 | } 207 | 208 | func respondReadIndex(future []*Future, err error) { 209 | for _, f := range future { 210 | f.respond(nil, err) 211 | } 212 | } 213 | -------------------------------------------------------------------------------- /util/io_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2018 The tiglabs raft Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package util 16 | 17 | import ( 18 | "testing" 19 | ) 20 | 21 | type byteReader struct { 22 | buf []byte 23 | } 24 | 25 | func newByteReader(b byte, n int) *byteReader { 26 | br := new(byteReader) 27 | br.buf = make([]byte, n) 28 | for i := 0; i < n; i++ { 29 | br.buf[i] = b 30 | } 31 | return br 32 | } 33 | 34 | func (r *byteReader) Read(p []byte) (n int, err error) { 35 | if len(r.buf) == 0 { 36 | return 0, nil 37 | } 38 | 39 | n = copy(p, r.buf) 40 | return n, nil 41 | } 42 | 43 | func (r *byteReader) Close() error { 44 | return nil 45 | } 46 | 47 | func TestFill(t *testing.T) { 48 | br := NewBufferReader(newByteReader('a', 100), 8*1024) 49 | if br.r != 0 || br.w != 0 || cap(br.buf) != 8*1024 { 50 | t.Fatalf("BufferReader status is wrong: [%d], [%d], [%d]", cap(br.buf), br.r, br.w) 51 | } 52 | 53 | br.fill() 54 | if br.r != 0 || br.w != 100 || cap(br.buf) != 8*1024 { 55 | t.Fatalf("BufferReader status is wrong: [%d], [%d], [%d]", cap(br.buf), br.r, br.w) 56 | } 57 | 58 | br.fill() 59 | if br.r != 0 || br.w != 200 || cap(br.buf) != 8*1024 { 60 | t.Fatalf("BufferReader status is wrong: [%d], [%d], [%d]", cap(br.buf), br.r, br.w) 61 | } 62 | 63 | br.reader = newByteReader('a', 8*1024) 64 | br.fill() 65 | if br.r != 0 || br.w != 8*1024 || cap(br.buf) != 8*1024 { 66 | t.Fatalf("BufferReader status is wrong: [%d], [%d], [%d]", cap(br.buf), br.r, br.w) 67 | } 68 | for i := 0; i < 8*1024; i++ { 69 | if br.buf[i] != 'a' { 70 | t.Fatal("BufferReader value is wrong!") 71 | } 72 | } 73 | 74 | br.r = 8*1024 - 10 75 | br.reader = newByteReader('b', 5) 76 | br.fill() 77 | if br.r != 0 || br.w != 15 || cap(br.buf) != 8*1024 { 78 | t.Fatalf("BufferReader status is wrong: [%d], [%d], [%d]", cap(br.buf), br.r, br.w) 79 | } 80 | for i := 0; i < 10; i++ { 81 | if br.buf[i] != 'a' { 82 | t.Fatal("BufferReader value is wrong!") 83 | } 84 | } 85 | for i := 10; i < 15; i++ { 86 | if br.buf[i] != 'b' { 87 | t.Fatal("BufferReader value is wrong!") 88 | } 89 | } 90 | br.reader = 
newByteReader('a', 8*1024) 91 | br.fill() 92 | 93 | br.r = 50 94 | br.reader = newByteReader('c', 10) 95 | br.fill() 96 | if br.r != 0 || br.w != 8*1024-40 || cap(br.buf) != 8*1024 { 97 | t.Fatalf("BufferReader status is wrong: [%d], [%d], [%d]", cap(br.buf), br.r, br.w) 98 | } 99 | for i := 0; i < 8*1024-50; i++ { 100 | if br.buf[i] != 'a' { 101 | t.Fatal("BufferReader value is wrong!") 102 | } 103 | } 104 | for i := 8*1024 - 50; i < 8*1024-40; i++ { 105 | if br.buf[i] != 'c' { 106 | t.Fatal("BufferReader value is wrong!") 107 | } 108 | } 109 | } 110 | 111 | func TestReadFull(t *testing.T) { 112 | br := NewBufferReader(newByteReader('e', 100), 8*1024) 113 | ret, _ := br.ReadFull(8 * 1024) 114 | if br.r != 8*1024 || br.w != 8*1024 || cap(br.buf) != 8*1024 { 115 | t.Fatalf("BufferReader status is wrong: [%d], [%d], [%d]", cap(br.buf), br.r, br.w) 116 | } 117 | for i := 0; i < 8*1024; i++ { 118 | if ret[i] != 'e' { 119 | t.Fatal("BufferReader readfull value is wrong!") 120 | } 121 | } 122 | 123 | br.reader = newByteReader('z', 100) 124 | ret, _ = br.ReadFull(8*1024 - 111) 125 | if br.r != 8*1024-111 || br.w != ((8*1024-111)/100+1)*100 || cap(br.buf) != 8*1024 { 126 | t.Fatalf("BufferReader status is wrong: [%d], [%d], [%d]", cap(br.buf), br.r, br.w) 127 | } 128 | for i := 0; i < 8*1024-111; i++ { 129 | if ret[i] != 'z' { 130 | t.Fatal("BufferReader readfull value is wrong!") 131 | } 132 | } 133 | } 134 | 135 | func TestReadFullWithReset(t *testing.T) { 136 | br := NewBufferReader(newByteReader('e', 100), 8*1024) 137 | ret, _ := br.ReadFull(8) 138 | if br.r != 8 || br.w != 100 || cap(br.buf) != 8*1024 { 139 | t.Fatalf("BufferReader status is wrong: [%d], [%d], [%d]", cap(br.buf), br.r, br.w) 140 | } 141 | for i := 0; i < 8; i++ { 142 | if ret[i] != 'e' { 143 | t.Fatal("BufferReader readfull value is wrong!") 144 | } 145 | } 146 | 147 | br.Reset() 148 | if br.r != 0 || br.w != 100-8 || cap(br.buf) != 8*1024 { 149 | t.Fatalf("BufferReader status is wrong: [%d], [%d], 
[%d]", cap(br.buf), br.r, br.w) 150 | } 151 | 152 | br.reader = newByteReader('z', 100) 153 | ret, _ = br.ReadFull(8 * 1024) 154 | if br.r != 8*1024 || br.w != 8*1024 || cap(br.buf) != 8*1024 { 155 | t.Fatalf("BufferReader status is wrong: [%d], [%d], [%d]", cap(br.buf), br.r, br.w) 156 | } 157 | for i := 0; i < 92; i++ { 158 | if ret[i] != 'e' { 159 | t.Fatal("BufferReader readfull value is wrong!") 160 | } 161 | } 162 | for i := 92; i < 8*1024; i++ { 163 | if ret[i] != 'z' { 164 | t.Fatal("BufferReader readfull value is wrong!") 165 | } 166 | } 167 | 168 | br.Reset() 169 | ret, _ = br.ReadFull(8) 170 | if br.r != 8 || br.w != 100 || cap(br.buf) != 8*1024 { 171 | t.Fatalf("BufferReader status is wrong: [%d], [%d], [%d]", cap(br.buf), br.r, br.w) 172 | } 173 | for i := 0; i < 8; i++ { 174 | if ret[i] != 'z' { 175 | t.Fatal("BufferReader readfull value is wrong!") 176 | } 177 | } 178 | br.Reset() 179 | if br.r != 0 || br.w != 100-8 || cap(br.buf) != 8*1024 { 180 | t.Fatalf("BufferReader status is wrong: [%d], [%d], [%d]", cap(br.buf), br.r, br.w) 181 | } 182 | br.reader = newByteReader('g', 100) 183 | ret, _ = br.ReadFull(2 * 8 * 1024) 184 | if br.r != 2*8*1024 || br.w != 2*8*1024 || cap(br.buf) != 2*8*1024 { 185 | t.Fatalf("BufferReader status is wrong: [%d], [%d], [%d]", cap(br.buf), br.r, br.w) 186 | } 187 | for i := 0; i < 92; i++ { 188 | if ret[i] != 'z' { 189 | t.Fatal("BufferReader readfull value is wrong!") 190 | } 191 | } 192 | for i := 92; i < 2*8*1024; i++ { 193 | if ret[i] != 'g' { 194 | t.Fatal("BufferReader readfull value is wrong!") 195 | } 196 | } 197 | } 198 | -------------------------------------------------------------------------------- /transport_replicate.go: -------------------------------------------------------------------------------- 1 | // Copyright 2018 The tiglabs raft Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package raft 16 | 17 | import ( 18 | "encoding/binary" 19 | "fmt" 20 | "io" 21 | "net" 22 | "runtime" 23 | "sync" 24 | "sync/atomic" 25 | "time" 26 | 27 | "github.com/tiglabs/raft/logger" 28 | "github.com/tiglabs/raft/proto" 29 | "github.com/tiglabs/raft/util" 30 | ) 31 | 32 | type replicateTransport struct { 33 | config *TransportConfig 34 | raftServer *RaftServer 35 | listener net.Listener 36 | curSnapshot int32 37 | mu sync.RWMutex 38 | senders map[uint64]*transportSender 39 | stopc chan struct{} 40 | } 41 | 42 | func newReplicateTransport(raftServer *RaftServer, config *TransportConfig) (*replicateTransport, error) { 43 | var ( 44 | listener net.Listener 45 | err error 46 | ) 47 | 48 | if listener, err = net.Listen("tcp", config.ReplicateAddr); err != nil { 49 | return nil, err 50 | } 51 | t := &replicateTransport{ 52 | config: config, 53 | raftServer: raftServer, 54 | listener: listener, 55 | senders: make(map[uint64]*transportSender), 56 | stopc: make(chan struct{}), 57 | } 58 | return t, nil 59 | } 60 | 61 | func (t *replicateTransport) stop() { 62 | t.mu.Lock() 63 | defer t.mu.Unlock() 64 | 65 | select { 66 | case <-t.stopc: 67 | return 68 | default: 69 | close(t.stopc) 70 | t.listener.Close() 71 | for _, s := range t.senders { 72 | s.stop() 73 | } 74 | } 75 | } 76 | 77 | func (t *replicateTransport) send(m *proto.Message) { 78 | s := t.getSender(m.To) 79 | s.send(m) 80 | } 81 | 82 | func (t *replicateTransport) getSender(nodeId uint64) *transportSender { 83 | t.mu.RLock() 84 | sender, ok := 
t.senders[nodeId] 85 | t.mu.RUnlock() 86 | if ok { 87 | return sender 88 | } 89 | 90 | t.mu.Lock() 91 | defer t.mu.Unlock() 92 | if sender, ok = t.senders[nodeId]; !ok { 93 | sender = newTransportSender(nodeId, uint64(t.config.MaxReplConcurrency), t.config.SendBufferSize, Replicate, t.config.Resolver) 94 | t.senders[nodeId] = sender 95 | } 96 | return sender 97 | } 98 | 99 | func (t *replicateTransport) sendSnapshot(m *proto.Message, rs *snapshotStatus) { 100 | var ( 101 | conn *util.ConnTimeout 102 | err error 103 | ) 104 | defer func() { 105 | atomic.AddInt32(&t.curSnapshot, -1) 106 | rs.respond(err) 107 | if conn != nil { 108 | conn.Close() 109 | } 110 | if err != nil { 111 | logger.Error("[Transport] %v send snapshot to %v failed error is: %v.", m.ID, m.To, err) 112 | } else if logger.IsEnableWarn() { 113 | logger.Warn("[Transport] %v send snapshot to %v successful.", m.ID, m.To) 114 | } 115 | }() 116 | 117 | if atomic.AddInt32(&t.curSnapshot, 1) > int32(t.config.MaxSnapConcurrency) { 118 | err = fmt.Errorf("snapshot concurrency exceed the limit %v.", t.config.MaxSnapConcurrency) 119 | return 120 | } 121 | if conn = getConn(m.To, Replicate, t.config.Resolver, 10*time.Minute, 1*time.Minute); conn == nil { 122 | err = fmt.Errorf("can't get connection to %v.", m.To) 123 | return 124 | } 125 | 126 | // send snapshot header message 127 | bufWr := util.NewBufferWriter(conn, 1*MB) 128 | if err = m.Encode(bufWr); err != nil { 129 | return 130 | } 131 | if err = bufWr.Flush(); err != nil { 132 | return 133 | } 134 | 135 | // send snapshot data 136 | var ( 137 | data []byte 138 | loopCount = 0 139 | sizeBuf = make([]byte, 4) 140 | ) 141 | for err == nil { 142 | loopCount = loopCount + 1 143 | if loopCount > 16 { 144 | loopCount = 0 145 | runtime.Gosched() 146 | } 147 | 148 | select { 149 | case <-rs.stopCh: 150 | err = fmt.Errorf("raft has shutdown.") 151 | 152 | default: 153 | data, err = m.Snapshot.Next() 154 | if len(data) > 0 { 155 | // write block size 156 | 
binary.BigEndian.PutUint32(sizeBuf, uint32(len(data))) 157 | if _, err = bufWr.Write(sizeBuf); err == nil { 158 | _, err = bufWr.Write(data) 159 | } 160 | } 161 | } 162 | } 163 | 164 | // write end flag and flush 165 | if err != nil && err != io.EOF { 166 | return 167 | } 168 | binary.BigEndian.PutUint32(sizeBuf, 0) 169 | if _, err = bufWr.Write(sizeBuf); err != nil { 170 | return 171 | } 172 | if err = bufWr.Flush(); err != nil { 173 | return 174 | } 175 | 176 | // wait response 177 | err = nil 178 | resp := make([]byte, 1) 179 | io.ReadFull(conn, resp) 180 | if resp[0] != 1 { 181 | err = fmt.Errorf("follower response failed.") 182 | } 183 | } 184 | 185 | func (t *replicateTransport) start() { 186 | util.RunWorkerUtilStop(func() { 187 | for { 188 | select { 189 | case <-t.stopc: 190 | return 191 | default: 192 | conn, err := t.listener.Accept() 193 | if err != nil { 194 | continue 195 | } 196 | t.handleConn(util.NewConnTimeout(conn)) 197 | } 198 | } 199 | }, t.stopc) 200 | } 201 | 202 | func (t *replicateTransport) handleConn(conn *util.ConnTimeout) { 203 | util.RunWorker(func() { 204 | defer conn.Close() 205 | 206 | loopCount := 0 207 | bufRd := util.NewBufferReader(conn, 16*KB) 208 | for { 209 | loopCount = loopCount + 1 210 | if loopCount > 16 { 211 | loopCount = 0 212 | runtime.Gosched() 213 | } 214 | 215 | select { 216 | case <-t.stopc: 217 | return 218 | default: 219 | if msg, err := reciveMessage(bufRd); err != nil { 220 | return 221 | } else { 222 | //logger.Debug(fmt.Sprintf("Recive %v from (%v)", msg.ToString(), conn.RemoteAddr())) 223 | if msg.Type == proto.ReqMsgSnapShot { 224 | if err := t.handleSnapshot(msg, conn, bufRd); err != nil { 225 | return 226 | } 227 | } else { 228 | t.raftServer.reciveMessage(msg) 229 | } 230 | } 231 | } 232 | } 233 | }) 234 | } 235 | 236 | var snap_ack = []byte{1} 237 | 238 | func (t *replicateTransport) handleSnapshot(m *proto.Message, conn *util.ConnTimeout, bufRd *util.BufferReader) error { 239 | 
conn.SetReadTimeout(time.Minute) 240 | conn.SetWriteTimeout(15 * time.Second) 241 | bufRd.Grow(1 * MB) 242 | req := newSnapshotRequest(m, bufRd) 243 | t.raftServer.reciveSnapshot(req) 244 | 245 | // wait snapshot result 246 | if err := req.response(); err != nil { 247 | logger.Error("[Transport] handle snapshot request from %v error: %v.", m.From, err) 248 | return err 249 | } 250 | 251 | _, err := conn.Write(snap_ack) 252 | return err 253 | } 254 | -------------------------------------------------------------------------------- /storage/wal/storage.go: -------------------------------------------------------------------------------- 1 | // Copyright 2018 The tiglabs raft Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package wal 16 | 17 | import ( 18 | "errors" 19 | "fmt" 20 | "time" 21 | 22 | "github.com/tiglabs/raft/logger" 23 | "github.com/tiglabs/raft/proto" 24 | "github.com/tiglabs/raft/util/log" 25 | ) 26 | 27 | // Storage the storage 28 | type Storage struct { 29 | c *Config 30 | 31 | // Log Entry 32 | ls *logEntryStorage 33 | truncIndex uint64 34 | truncTerm uint64 35 | 36 | hardState proto.HardState 37 | metafile *metaFile 38 | prevCommit uint64 // 有commit变化时sync一下 39 | 40 | closed bool 41 | } 42 | 43 | // NewStorage new 44 | func NewStorage(dir string, c *Config) (*Storage, error) { 45 | if err := initDir(dir); err != nil { 46 | return nil, err 47 | } 48 | 49 | // 加载HardState 50 | mf, hardState, meta, err := openMetaFile(dir) 51 | if err != nil { 52 | return nil, err 53 | } 54 | 55 | s := &Storage{ 56 | c: c.dup(), 57 | truncIndex: meta.truncIndex, 58 | truncTerm: meta.truncTerm, 59 | hardState: hardState, 60 | metafile: mf, 61 | prevCommit: hardState.Commit, 62 | } 63 | 64 | // 加载日志文件 65 | ls, err := openLogStorage(dir, s) 66 | if err != nil { 67 | return nil, err 68 | } 69 | s.ls = ls 70 | 71 | if c.GetTruncateFirstDummy() { 72 | if err := s.truncateFirstDummy(); err != nil { 73 | return nil, err 74 | } 75 | } 76 | 77 | return s, nil 78 | } 79 | 80 | func (s *Storage) truncateFirstDummy() error { 81 | // 保证是初始化时(不能已有日志存在) 82 | li, err := s.LastIndex() 83 | if err != nil { 84 | return err 85 | } 86 | if li != 0 { 87 | return errors.New("truncate first dummy forbidden") 88 | } 89 | 90 | meta := truncateMeta{ 91 | truncIndex: 1, 92 | truncTerm: 1, 93 | } 94 | 95 | if err = s.metafile.SaveTruncateMeta(meta); err != nil { 96 | return err 97 | } 98 | if err = s.metafile.Sync(); err != nil { 99 | return err 100 | } 101 | 102 | s.truncIndex = meta.truncIndex 103 | s.truncTerm = meta.truncTerm 104 | 105 | return nil 106 | } 107 | 108 | // InitialState returns the saved HardState information to init the repl state. 
109 | func (s *Storage) InitialState() (proto.HardState, error) { 110 | return s.hardState, nil 111 | } 112 | 113 | // Entries returns a slice of log entries in the range [lo,hi), the hi is not inclusive. 114 | // MaxSize limits the total size of the log entries returned, but Entries returns at least one entry if any. 115 | // If lo <= CompactIndex,then return isCompact true. 116 | // If no entries,then return entries nil. 117 | // Note: math.MaxUint32 is no limit. 118 | func (s *Storage) Entries(lo, hi uint64, maxSize uint64) (entries []*proto.Entry, isCompact bool, err error) { 119 | if lo <= s.truncIndex { 120 | return nil, true, nil 121 | } 122 | entries, isCompact, err = s.ls.Entries(lo, hi, maxSize) 123 | return 124 | } 125 | 126 | // Term returns the term of entry i, which must be in the range [FirstIndex()-1, LastIndex()]. 127 | // The term of the entry before FirstIndex is retained for matching purposes even though the 128 | // rest of that entry may not be available. 129 | // If lo <= CompactIndex,then return isCompact true. 130 | func (s *Storage) Term(index uint64) (term uint64, isCompact bool, err error) { 131 | switch { 132 | case index < s.truncIndex: 133 | return 0, true, nil 134 | case index == s.truncIndex: 135 | term = s.truncTerm 136 | return 137 | default: 138 | term, isCompact, err = s.ls.Term(index) 139 | return 140 | } 141 | } 142 | 143 | // FirstIndex returns the index of the first log entry that is possibly available via Entries (older entries have been incorporated 144 | // into the latest Snapshot; if storage only contains the dummy entry the first log entry is not available). 145 | func (s *Storage) FirstIndex() (index uint64, err error) { 146 | index = s.truncIndex + 1 147 | return 148 | } 149 | 150 | // LastIndex returns the index of the last entry in the log. 
151 | func (s *Storage) LastIndex() (index uint64, err error) { 152 | index = s.ls.LastIndex() 153 | if index < s.truncIndex { 154 | index = s.truncIndex 155 | } 156 | return 157 | } 158 | 159 | // StoreEntries store the log entries to the repository. 160 | // If first index of entries > LastIndex,then append all entries, 161 | // Else write entries at first index and truncate the redundant log entries. 162 | func (s *Storage) StoreEntries(entries []*proto.Entry) error { 163 | if err := s.ls.SaveEntries(entries); err != nil { 164 | return err 165 | } 166 | return nil 167 | } 168 | 169 | // StoreHardState store the raft state to the repository. 170 | func (s *Storage) StoreHardState(st proto.HardState) error { 171 | if err := s.metafile.SaveHardState(st); err != nil { 172 | return err 173 | } 174 | s.hardState = st 175 | 176 | if s.c.GetSync() { 177 | sync := false 178 | if st.Commit != s.prevCommit { 179 | sync = true 180 | s.prevCommit = st.Commit 181 | } 182 | if sync { 183 | if err := s.metafile.Sync(); err != nil { 184 | return err 185 | } 186 | if err := s.ls.Sync(); err != nil { 187 | return err 188 | } 189 | } 190 | } 191 | 192 | return nil 193 | } 194 | 195 | // Truncate the log to index, The index is inclusive. 196 | func (s *Storage) Truncate(index uint64) error { 197 | if index <= s.truncIndex { 198 | log.Warn("already truncated. index=%d", index) 199 | return nil 200 | } 201 | 202 | term, isCompact, err := s.ls.Term(index) 203 | if err != nil { 204 | return err 205 | } 206 | if isCompact { 207 | return fmt.Errorf("expected compacted term. 
index:%d", index) 208 | } 209 | 210 | // 更新meta 211 | meta := truncateMeta{ 212 | truncIndex: index, 213 | truncTerm: term, 214 | } 215 | if err = s.metafile.SaveTruncateMeta(meta); err != nil { 216 | return err 217 | } 218 | if err = s.metafile.Sync(); err != nil { 219 | return err 220 | } 221 | 222 | // 截断日志文件 223 | if err = s.ls.TruncateFront(index); err != nil { 224 | return err 225 | } 226 | 227 | s.truncIndex = index 228 | s.truncTerm = term 229 | 230 | return nil 231 | } 232 | 233 | // ApplySnapshot Sync snapshot status. 234 | func (s *Storage) ApplySnapshot(meta proto.SnapshotMeta) error { 235 | tMeta := truncateMeta{ 236 | truncIndex: meta.Index, 237 | truncTerm: meta.Term, 238 | } 239 | 240 | var err error 241 | 242 | // 更新commit位置 243 | s.hardState.Commit = meta.Index 244 | if err := s.metafile.SaveHardState(s.hardState); err != nil { 245 | return err 246 | } 247 | 248 | if err = s.metafile.SaveTruncateMeta(tMeta); err != nil { 249 | return err 250 | } 251 | if err = s.metafile.Sync(); err != nil { 252 | return err 253 | } 254 | 255 | if err = s.ls.TruncateAll(); err != nil { 256 | return err 257 | } 258 | 259 | s.truncIndex = meta.Index 260 | s.truncTerm = meta.Term 261 | 262 | return nil 263 | } 264 | 265 | // Close the storage. 
266 | func (s *Storage) Close() { 267 | if !s.closed { 268 | s.ls.Close() 269 | s.metafile.Close() 270 | s.closed = true 271 | } 272 | } 273 | 274 | type metricReporter struct { 275 | ID string 276 | } 277 | 278 | func newReporterWithID(id string) *metricReporter { 279 | return &metricReporter{ 280 | ID: id, 281 | } 282 | } 283 | 284 | func (r *metricReporter) ReportInterval() time.Duration { 285 | return time.Minute 286 | } 287 | 288 | func (r *metricReporter) Report(data []byte) error { 289 | logger.Info("wal [%s] metrics: %s", r.ID, string(data)) 290 | return nil 291 | } 292 | -------------------------------------------------------------------------------- /storage/storage_memory.go: -------------------------------------------------------------------------------- 1 | // Copyright 2015 The etcd Authors 2 | // Modified work copyright 2018 The tiglabs Authors. 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 15 | package storage 16 | 17 | import ( 18 | "errors" 19 | "fmt" 20 | 21 | "github.com/tiglabs/raft/logger" 22 | "github.com/tiglabs/raft/proto" 23 | "github.com/tiglabs/raft/util" 24 | ) 25 | 26 | type fsm interface { 27 | AppliedIndex(id uint64) uint64 28 | } 29 | 30 | // This storage is circular storage in memory and truncate when over capacity, 31 | // but keep it a high capacity. 
32 | type MemoryStorage struct { 33 | fsm fsm 34 | id uint64 35 | // the threshold of truncate 36 | capacity uint64 37 | // the index of last truncate 38 | truncIndex uint64 39 | truncTerm uint64 40 | // the starting offset in the ents 41 | start uint64 42 | // the actual log in the ents 43 | count uint64 44 | // the total size of the ents 45 | size uint64 46 | // ents[i] has raft log position i+snapshot.Metadata.Index 47 | ents []*proto.Entry 48 | hardState proto.HardState 49 | } 50 | 51 | func NewMemoryStorage(fsm fsm, id, capacity uint64) *MemoryStorage { 52 | if logger.IsEnableWarn() { 53 | logger.Warn("Memory Storage capacity is: %v.", capacity) 54 | } 55 | return &MemoryStorage{ 56 | fsm: fsm, 57 | id: id, 58 | capacity: capacity, 59 | size: capacity, 60 | ents: make([]*proto.Entry, capacity), 61 | } 62 | } 63 | 64 | func DefaultMemoryStorage() *MemoryStorage { 65 | return NewMemoryStorage(nil, 0, 4096) 66 | } 67 | 68 | func (ms *MemoryStorage) InitialState() (proto.HardState, error) { 69 | return ms.hardState, nil 70 | } 71 | 72 | func (ms *MemoryStorage) FirstIndex() (uint64, error) { 73 | return ms.truncIndex + 1, nil 74 | } 75 | 76 | func (ms *MemoryStorage) LastIndex() (uint64, error) { 77 | return ms.lastIndex(), nil 78 | } 79 | 80 | func (ms *MemoryStorage) lastIndex() uint64 { 81 | return ms.truncIndex + ms.count 82 | } 83 | 84 | func (ms *MemoryStorage) Term(index uint64) (term uint64, isCompact bool, err error) { 85 | switch { 86 | case index < ms.truncIndex: 87 | return 0, true, nil 88 | case index == ms.truncIndex: 89 | return ms.truncTerm, false, nil 90 | default: 91 | return ms.ents[ms.locatePosition(index)].Term, false, nil 92 | } 93 | } 94 | 95 | func (ms *MemoryStorage) Entries(lo, hi uint64, maxSize uint64) (entries []*proto.Entry, isCompact bool, err error) { 96 | if lo <= ms.truncIndex { 97 | return nil, true, nil 98 | } 99 | if hi > ms.lastIndex()+1 { 100 | return nil, false, fmt.Errorf("[MemoryStorage->Entries]entries's hi(%d) is out of 
bound lastindex(%d)", hi, ms.lastIndex()) 101 | } 102 | // only contains dummy entries. 103 | if ms.count == 0 { 104 | return nil, false, errors.New("requested entry at index is unavailable") 105 | } 106 | 107 | count := hi - lo 108 | if count <= 0 { 109 | return []*proto.Entry{}, false, nil 110 | } 111 | retEnts := make([]*proto.Entry, count) 112 | pos := ms.locatePosition(lo) 113 | retEnts[0] = ms.ents[pos] 114 | size := ms.ents[pos].Size() 115 | limit := uint64(1) 116 | for ; limit < count; limit++ { 117 | pos = pos + 1 118 | if pos >= ms.size { 119 | pos = pos - ms.size 120 | } 121 | size = size + ms.ents[pos].Size() 122 | if uint64(size) > maxSize { 123 | break 124 | } 125 | retEnts[limit] = ms.ents[pos] 126 | } 127 | return retEnts[:limit], false, nil 128 | } 129 | 130 | func (ms *MemoryStorage) StoreEntries(entries []*proto.Entry) error { 131 | if len(entries) == 0 { 132 | return nil 133 | } 134 | 135 | appIndex := uint64(0) 136 | if ms.fsm != nil { 137 | appIndex = ms.fsm.AppliedIndex(ms.id) 138 | } 139 | first := appIndex + 1 140 | last := entries[0].Index + uint64(len(entries)) - 1 141 | if last < first { 142 | // shortcut if there is no new entry. 
143 | return nil 144 | } 145 | if first > entries[0].Index { 146 | // truncate compacted entries 147 | entries = entries[first-entries[0].Index:] 148 | } 149 | offset := entries[0].Index - ms.truncIndex - 1 150 | if ms.count < offset { 151 | logger.Error("missing log entry [last: %d, append at: %d]", ms.lastIndex(), entries[0].Index) 152 | return nil 153 | } 154 | 155 | // resize and truncate compacted ents 156 | entriesSize := uint64(len(entries)) 157 | maxSize := offset + entriesSize 158 | minSize := maxSize - (appIndex - ms.truncIndex) 159 | switch { 160 | case minSize > ms.capacity: 161 | // truncate compacted ents 162 | if ms.truncIndex < appIndex { 163 | ms.truncateTo(appIndex) 164 | } 165 | // grow ents 166 | if minSize > ms.size { 167 | ms.resize(ms.capacity+minSize, minSize) 168 | } 169 | 170 | default: 171 | // truncate compacted ents 172 | if maxSize > ms.capacity { 173 | cmpIdx := util.Min(appIndex, maxSize-ms.capacity+ms.truncIndex) 174 | if ms.truncIndex < cmpIdx { 175 | ms.truncateTo(cmpIdx) 176 | } 177 | } 178 | // short ents 179 | if ms.size > ms.capacity { 180 | ms.resize(ms.capacity, maxSize) 181 | } 182 | } 183 | 184 | // append new entries 185 | start := ms.locatePosition(entries[0].Index) 186 | next := start + entriesSize 187 | if next <= ms.size { 188 | copy(ms.ents[start:], entries) 189 | if ms.start <= start { 190 | ms.count = next - ms.start 191 | } else { 192 | ms.count = (ms.size - ms.start) + (next - 0) 193 | } 194 | } else { 195 | count := ms.size - start 196 | copy(ms.ents[start:], entries[0:count]) 197 | copy(ms.ents[0:], entries[count:]) 198 | ms.count = (ms.size - ms.start) + (entriesSize - count) 199 | } 200 | 201 | return nil 202 | } 203 | 204 | func (ms *MemoryStorage) StoreHardState(st proto.HardState) error { 205 | ms.hardState = st 206 | return nil 207 | } 208 | 209 | func (ms *MemoryStorage) ApplySnapshot(meta proto.SnapshotMeta) error { 210 | ms.truncIndex = meta.Index 211 | ms.truncTerm = meta.Term 212 | ms.start = 0 213 | 
ms.count = 0 214 | ms.size = ms.capacity 215 | ms.ents = make([]*proto.Entry, ms.capacity) 216 | return nil 217 | } 218 | 219 | func (ms *MemoryStorage) Truncate(index uint64) error { 220 | if index == 0 || index <= ms.truncIndex { 221 | return errors.New("requested index is unavailable due to compaction") 222 | } 223 | if index > ms.lastIndex() { 224 | return fmt.Errorf("compact %d is out of bound lastindex(%d)", index, ms.lastIndex()) 225 | } 226 | ms.truncateTo(index) 227 | return nil 228 | } 229 | 230 | func (ms *MemoryStorage) Close() { 231 | 232 | } 233 | 234 | func (ms *MemoryStorage) truncateTo(index uint64) { 235 | ms.truncTerm = ms.ents[ms.locatePosition(index)].Term 236 | ms.start = ms.locatePosition(index + 1) 237 | ms.count = ms.count - (index - ms.truncIndex) 238 | ms.truncIndex = index 239 | } 240 | 241 | func (ms *MemoryStorage) resize(capacity, needSize uint64) { 242 | ents := make([]*proto.Entry, capacity) 243 | count := util.Min(util.Min(capacity, ms.count), needSize) 244 | next := ms.start + count 245 | if next <= ms.size { 246 | copy(ents, ms.ents[ms.start:next]) 247 | } else { 248 | next = next - ms.size 249 | copy(ents, ms.ents[ms.start:]) 250 | copy(ents[ms.size-ms.start:], ms.ents[0:next]) 251 | } 252 | 253 | ms.start = 0 254 | ms.count = count 255 | ms.size = capacity 256 | ms.ents = ents 257 | } 258 | 259 | func (ms *MemoryStorage) locatePosition(index uint64) uint64 { 260 | position := ms.start + (index - ms.truncIndex - 1) 261 | if position >= ms.size { 262 | position = position - ms.size 263 | } 264 | return position 265 | } 266 | -------------------------------------------------------------------------------- /storage/wal/log_file.go: -------------------------------------------------------------------------------- 1 | // Copyright 2018 The tiglabs raft Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package wal 16 | 17 | import ( 18 | "bytes" 19 | "fmt" 20 | "io" 21 | "os" 22 | "path" 23 | 24 | "github.com/tiglabs/raft/proto" 25 | "github.com/tiglabs/raft/util/log" 26 | ) 27 | 28 | type logEntryFile struct { 29 | dir string 30 | name logFileName 31 | 32 | f *os.File 33 | r recordReadAt 34 | w *recordWriter 35 | index logEntryIndex 36 | } 37 | 38 | func openLogEntryFile(dir string, name logFileName, isLastOne bool) (*logEntryFile, error) { 39 | p := path.Join(dir, name.String()) 40 | f, err := os.OpenFile(p, os.O_RDWR|os.O_APPEND, 0600) 41 | if err != nil { 42 | return nil, err 43 | } 44 | 45 | lf := &logEntryFile{ 46 | dir: dir, 47 | name: name, 48 | f: f, 49 | r: newRecordReader(f), 50 | } 51 | 52 | if !isLastOne { 53 | // 读取索引数据 54 | if err = lf.ReadIndex(); err != nil { 55 | return nil, err 56 | } 57 | } else { 58 | // 重建索引 59 | toffset, err := lf.ReBuildIndex() 60 | if err != nil { 61 | return nil, err 62 | } 63 | // 打开写 64 | if err = lf.OpenWrite(); err != nil { 65 | return nil, err 66 | } 67 | // 截断索引及后面的数据 68 | if toffset > 0 { 69 | log.Warn("truncate last logfile's N@%d index at: %d", lf.name.seq, toffset) 70 | if err := lf.w.Truncate(toffset); err != nil { 71 | return nil, err 72 | } 73 | } 74 | } 75 | 76 | return lf, nil 77 | } 78 | 79 | func createLogEntryFile(dir string, name logFileName) (*logEntryFile, error) { 80 | p := path.Join(dir, name.String()) 81 | f, err := os.OpenFile(p, os.O_RDWR|os.O_CREATE|os.O_TRUNC|os.O_APPEND, 0600) 82 | if err != nil { 83 | return nil, err 84 | } 85 | 86 | 
lf := &logEntryFile{ 87 | dir: dir, 88 | name: name, 89 | f: f, 90 | r: newRecordReader(f), 91 | } 92 | 93 | if err := lf.OpenWrite(); err != nil { 94 | return nil, err 95 | } 96 | 97 | return lf, nil 98 | } 99 | 100 | func (lf *logEntryFile) ReadIndex() error { 101 | info, err := lf.f.Stat() 102 | if err != nil { 103 | return err 104 | } 105 | 106 | // read footer 107 | var footer footerRecord 108 | if info.Size() < int64(footer.Size()) { 109 | return NewCorruptError(lf.f.Name(), 0, "too small footer") 110 | } 111 | offset := info.Size() - int64(recordSize(footer)) 112 | rec, err := lf.r.ReadAt(offset) 113 | if err != nil { 114 | return err 115 | } 116 | if rec.recType != recTypeFooter { 117 | return NewCorruptError(lf.f.Name(), offset, "wrong footer record type") 118 | } 119 | if rec.dataLen != footer.Size() { 120 | return NewCorruptError(lf.f.Name(), offset, "wrong footer size") 121 | } 122 | footer.Decode(rec.data) 123 | if !bytes.Equal(footer.magic, footerMagic) { 124 | return NewCorruptError(lf.f.Name(), offset, "wrong footer magic") 125 | } 126 | 127 | // read index data 128 | offset = int64(footer.indexOffset) 129 | rec, err = lf.r.ReadAt(offset) 130 | if err != nil { 131 | return err 132 | } 133 | if rec.recType != recTypeIndex { 134 | return NewCorruptError(lf.f.Name(), offset, "wrong index record type") 135 | } 136 | lf.index = decodeLogIndex(rec.data) 137 | 138 | return nil 139 | } 140 | 141 | func (lf *logEntryFile) ReBuildIndex() (truncateOffset int64, err error) { 142 | lf.index = nil 143 | 144 | // 获取文件大小 145 | info, err := lf.f.Stat() 146 | if err != nil { 147 | return 0, err 148 | } 149 | filesize := info.Size() 150 | 151 | var ( 152 | rec record 153 | offset int64 154 | nextRecordOffset int64 155 | ) 156 | r := newRecordReader(lf.f) 157 | for { 158 | offset, rec, err = r.Read() 159 | if err != nil { 160 | break 161 | } 162 | nextRecordOffset = r.offset 163 | // log entry 更新索引 164 | if rec.recType == recTypeLogEntry { 165 | ent := &proto.Entry{} 
166 | ent.Decode(rec.data) 167 | lf.index = lf.index.Append(uint32(offset), ent) 168 | } else if rec.recType == recTypeIndex { // 处理写了index,但是没写footer或者下一个新日志文件没创建 169 | var footer footerRecord 170 | curIndexSize := int64(recordSize(lf.index)) 171 | footerSize := int64(recordSize(footer)) 172 | // index的大小+footer不大于文件大小,则截断 173 | if filesize <= offset+curIndexSize+footerSize { 174 | return offset, nil 175 | } else { 176 | return 0, NewCorruptError(lf.f.Name(), offset, "could not truncate last logfile's index") 177 | } 178 | } else { 179 | return 0, NewCorruptError(lf.f.Name(), offset, fmt.Sprintf("wrong log entry record type: %s", rec.recType.String())) 180 | } 181 | } 182 | if err == io.EOF { 183 | err = nil 184 | } 185 | if filesize != nextRecordOffset { 186 | log.Warn("logName[%v],fileSize[%v],corrupt data after offset[%v]", lf.name, filesize, nextRecordOffset) 187 | } 188 | return offset, err 189 | } 190 | 191 | func (lf *logEntryFile) Name() logFileName { 192 | return lf.name 193 | } 194 | 195 | func (lf *logEntryFile) Seq() uint64 { 196 | return lf.name.seq 197 | } 198 | 199 | func (lf *logEntryFile) Len() int { 200 | return lf.index.Len() 201 | } 202 | 203 | func (lf *logEntryFile) FirstIndex() uint64 { 204 | return lf.index.First() 205 | } 206 | 207 | func (lf *logEntryFile) LastIndex() uint64 { 208 | return lf.index.Last() 209 | } 210 | 211 | // Get get log entry 212 | func (lf *logEntryFile) Get(i uint64) (*proto.Entry, error) { 213 | item, err := lf.index.Get(i) 214 | if err != nil { 215 | return nil, err 216 | } 217 | 218 | rec, err := lf.r.ReadAt(int64(item.offset)) 219 | if err != nil { 220 | return nil, err 221 | } 222 | 223 | ent := &proto.Entry{} 224 | ent.Decode(rec.data) 225 | 226 | return ent, nil 227 | } 228 | 229 | // Term get log's term 230 | func (lf *logEntryFile) Term(i uint64) (uint64, error) { 231 | item, err := lf.index.Get(i) 232 | if err != nil { 233 | return 0, err 234 | } 235 | return item.logterm, nil 236 | } 237 | 238 | // 
Truncate 截断最近的日志 239 | func (lf *logEntryFile) Truncate(index uint64) error { 240 | if lf.Len() == 0 { 241 | return nil 242 | } 243 | 244 | item, err := lf.index.Get(index) 245 | if err != nil { 246 | return err 247 | } 248 | 249 | // 截断文件 250 | offset := int64(item.offset) 251 | if err = lf.w.Truncate(offset); err != nil { 252 | return err 253 | } 254 | 255 | // 截断索引 256 | lf.index, err = lf.index.Truncate(index) 257 | return err 258 | } 259 | 260 | func (lf *logEntryFile) Save(ent *proto.Entry) error { 261 | // 写入文件 262 | offset := lf.w.Offset() 263 | if err := lf.w.Write(recTypeLogEntry, ent); err != nil { 264 | return err 265 | } 266 | 267 | // 更新索引 268 | lf.index = lf.index.Append(uint32(offset), ent) 269 | 270 | return nil 271 | } 272 | 273 | func (lf *logEntryFile) OpenWrite() error { 274 | if lf.w != nil { 275 | return nil 276 | } 277 | 278 | lf.w = newRecordWriter(lf.f) 279 | return nil 280 | } 281 | 282 | func (lf *logEntryFile) WriteOffset() int64 { 283 | return lf.w.Offset() 284 | } 285 | 286 | func (lf *logEntryFile) Flush() error { 287 | return lf.w.Flush() 288 | } 289 | 290 | // Sync flush write buffer and sync to disk 291 | func (lf *logEntryFile) Sync() error { 292 | return lf.w.Sync() 293 | } 294 | 295 | func (lf *logEntryFile) FinishWrite() error { 296 | var err error 297 | 298 | // write log index data 299 | recOffset := lf.w.Offset() 300 | if err = lf.w.Write(recTypeIndex, lf.index); err != nil { 301 | return err 302 | } 303 | 304 | // write log file footer 305 | footer := &footerRecord{ 306 | indexOffset: uint64(recOffset), 307 | } 308 | if err = lf.w.Write(recTypeFooter, footer); err != nil { 309 | return err 310 | } 311 | 312 | if err := lf.w.Close(); err != nil { 313 | return err 314 | } 315 | lf.w = nil 316 | return nil 317 | } 318 | 319 | // Close 关闭读写,关闭文件 320 | func (lf *logEntryFile) Close() error { 321 | if lf.w != nil { 322 | if err := lf.w.Close(); err != nil { 323 | return err 324 | } 325 | lf.w = nil 326 | } 327 | 328 | return 
lf.f.Close() 329 | } 330 | --------------------------------------------------------------------------------