├── .github ├── CODEOWNERS ├── dependabot.yml ├── stale.yml └── workflows │ ├── ci.yml │ └── two-step-pr-approval.yml ├── .gitignore ├── .gitmodules ├── .golangci-lint.yml ├── .travis.yml ├── CHANGELOG.md ├── LICENSE ├── Makefile ├── README.md ├── api.go ├── bench └── bench.go ├── bench_test.go ├── commands.go ├── commitment.go ├── commitment_test.go ├── config.go ├── configuration.go ├── configuration_test.go ├── discard_snapshot.go ├── discard_snapshot_test.go ├── docs ├── README.md └── apply.md ├── file_snapshot.go ├── file_snapshot_test.go ├── fsm.go ├── future.go ├── future_test.go ├── fuzzy ├── apply_src.go ├── cluster.go ├── fsm.go ├── fsm_batch.go ├── go.mod ├── go.sum ├── leadershiptransfer_test.go ├── membership_test.go ├── node.go ├── partition_test.go ├── readme.md ├── resolve.go ├── simple_test.go ├── slowvoter_test.go ├── transport.go └── verifier.go ├── go.mod ├── go.sum ├── inmem_snapshot.go ├── inmem_snapshot_test.go ├── inmem_store.go ├── inmem_transport.go ├── inmem_transport_test.go ├── integ_test.go ├── log.go ├── log_cache.go ├── log_cache_test.go ├── log_test.go ├── membership.md ├── net_transport.go ├── net_transport_test.go ├── observer.go ├── peersjson.go ├── peersjson_test.go ├── progress.go ├── raft-compat ├── go.mod ├── go.sum ├── prevote_test.go ├── rolling_upgrade_test.go ├── testcluster │ └── cluster.go └── utils │ └── test_utils.go ├── raft.go ├── raft_test.go ├── replication.go ├── saturation.go ├── saturation_test.go ├── snapshot.go ├── stable.go ├── state.go ├── tag.sh ├── tcp_transport.go ├── tcp_transport_test.go ├── testing.go ├── testing_batch.go ├── transport.go ├── transport_test.go ├── util.go └── util_test.go /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | # Each line is a file pattern followed by one or more owners. 2 | # More on CODEOWNERS files: https://help.github.com/en/github/creating-cloning-and-archiving-repositories/about-code-owners 3 | 4 | # Default owner 5 | * @hashicorp/team-ip-compliance @hashicorp/consul-core-reviewers @hashicorp/nomad-eng @hashicorp/raft-force 6 | 7 | # Add override rules below. Each line is a file/folder pattern followed by one or more owners. 8 | # Being an owner means those groups or individuals will be added as reviewers to PRs affecting 9 | # those areas of the code. 10 | # Examples: 11 | # /docs/ @docs-team 12 | # *.js @js-team 13 | # *.go @go-team 14 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | updates: 4 | - package-ecosystem: "gomod" 5 | directory: "/" 6 | schedule: 7 | interval: "weekly" 8 | -------------------------------------------------------------------------------- /.github/stale.yml: -------------------------------------------------------------------------------- 1 | # Copyright (c) HashiCorp, Inc. 2 | # SPDX-License-Identifier: MPL-2.0 3 | 4 | # Number of days of inactivity before an Issue becomes stale 5 | daysUntilStale: 60 6 | 7 | # Number of days of inactivity before an Issue with the stale label is closed. 8 | # Set to false to disable. If disabled, issues still need to be closed manually, but will remain marked as stale. 9 | daysUntilClose: 30 10 | 11 | # Issues with these labels will never be considered stale. 
Set to `[]` to disable 12 | # We don't close any issue that is an enhancement or confirmed bug, but issues 13 | # waiting for reproduction cases and questions tend to get outdated. 14 | exemptLabels: 15 | - "enhancement" 16 | - "bug" 17 | - "thinking" 18 | - "docs" 19 | 20 | # Label to use when marking as stale 21 | staleLabel: "waiting-reply" 22 | 23 | # Comment to post when marking as stale. Set to `false` to disable 24 | markComment: | 25 | Hey there, 26 | We wanted to check in on this request since it has been inactive for at least 90 days. 27 | Have you reviewed the latest [godocs](https://godoc.org/github.com/hashicorp/raft)? 28 | If you think this is still an important issue in the latest version of [the Raft library](https://github.com/hashicorp/raft/compare/) or 29 | [its documentation](https://github.com/hashicorp/raft/compare/) please feel let us know and we'll keep it open for investigation. 30 | If there is still no activity on this request in 30 days, we will go ahead and close it. 31 | Thank you! 32 | 33 | # Comment to post when removing the stale label. Set to `false` to disable 34 | unmarkComment: false 35 | 36 | # Comment to post when closing a stale Issue. Set to `false` to disable 37 | closeComment: > 38 | Hey there, 39 | This issue has been automatically closed because there hasn't been any activity for a while. 40 | If you are still experiencing problems, or still have questions, feel free to [open a new one](https://github.com/hashicorp/raft/issues/new) :+1 41 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: ci 2 | 3 | on: 4 | pull_request: 5 | branches: ["main"] 6 | push: 7 | branches: ["main"] 8 | tags: ["*"] 9 | 10 | permissions: 11 | contents: read 12 | 13 | jobs: 14 | go-fmt-and-vet: 15 | runs-on: ubuntu-22.04 16 | steps: 17 | - uses: actions/checkout@ac593985615ec2ede58e132d2e21d2b1cbd6127c # v3.3.0 18 | - uses: actions/setup-go@6edd4406fa81c3da01a34fa6f6343087c207a568 # v3.5.0 19 | with: 20 | go-version: '1.20' 21 | cache: true 22 | - run: | 23 | files=$(go fmt ./...) 24 | if [ -n "$files" ]; then 25 | echo "The following file(s) do not conform to go fmt:" 26 | echo "$files" 27 | exit 1 28 | fi 29 | - run: | 30 | PACKAGE_NAMES=$(go list ./... | grep -v github.com/hashicorp/raft/fuzzy) 31 | go vet $PACKAGE_NAMES 32 | 33 | go-test: 34 | needs: go-fmt-and-vet 35 | strategy: 36 | matrix: 37 | go: ['1.19', '1.20'] 38 | arch: ['x32', 'x64'] 39 | runs-on: ubuntu-22.04 40 | env: 41 | INTEG_TESTS: yes 42 | steps: 43 | - uses: actions/checkout@ac593985615ec2ede58e132d2e21d2b1cbd6127c # v3.3.0 44 | - uses: actions/setup-go@6edd4406fa81c3da01a34fa6f6343087c207a568 # v3.5.0 45 | with: 46 | go-version: ${{ matrix.go }} 47 | architecture: ${{ matrix.arch }} 48 | cache: true 49 | # x86 specific build. 50 | - if: matrix.arch == 'x32' 51 | run: | 52 | sudo apt-get update 53 | sudo apt-get install gcc-multilib 54 | go test --tags batchtest ./... 55 | # x86-64 specific build. 56 | - if: matrix.arch == 'x64' 57 | run: go test -race --tags batchtest ./... 
58 | go-test-compat: 59 | needs: go-test 60 | strategy: 61 | matrix: 62 | go: [ '1.20', '1.21', '1.22' ] 63 | arch: [ 'x32', 'x64' ] 64 | runs-on: ubuntu-22.04 65 | steps: 66 | - uses: actions/checkout@ac593985615ec2ede58e132d2e21d2b1cbd6127c # v3.3.0 67 | - uses: actions/setup-go@6edd4406fa81c3da01a34fa6f6343087c207a568 # v3.5.0 68 | with: 69 | go-version: ${{ matrix.go }} 70 | architecture: ${{ matrix.arch }} 71 | cache: true 72 | submodules: true 73 | # x86 specific build. 74 | - if: matrix.arch == 'x32' 75 | run: | 76 | sudo apt-get update 77 | sudo apt-get install gcc-multilib 78 | git submodule update --init --recursive 79 | cd raft-compat 80 | go mod tidy 81 | go test -v -coverpkg=./... ./... -coverprofile="${{ github.workspace }}/coverage.out" 82 | # x86-64 specific build. 83 | - if: matrix.arch == 'x64' 84 | run: | 85 | git submodule update --init --recursive 86 | cd raft-compat 87 | go mod tidy 88 | go test -race -v -coverpkg=./... ./... -coverprofile="${{ github.workspace }}/coverage.out" 89 | - uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808 # v4.3.3 90 | with: 91 | path: "${{ github.workspace }}/coverage.out" 92 | name: coverage-report-${{matrix.go}}-${{matrix.arch}} 93 | -------------------------------------------------------------------------------- /.github/workflows/two-step-pr-approval.yml: -------------------------------------------------------------------------------- 1 | name: Two-Stage PR Review Process 2 | 3 | on: 4 | pull_request: 5 | types: [opened, synchronize, reopened, labeled, unlabeled, ready_for_review, converted_to_draft] 6 | pull_request_review: 7 | types: [submitted] 8 | 9 | jobs: 10 | manage-pr-status: 11 | runs-on: ubuntu-latest 12 | permissions: 13 | pull-requests: write 14 | contents: write 15 | steps: 16 | - name: Checkout code 17 | uses: actions/checkout@v4 18 | 19 | - name: Two stage PR review 20 | uses: hashicorp/two-stage-pr-approval@v0.1.0 21 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled Object files, Static and Dynamic libs (Shared Objects) 2 | *.o 3 | *.a 4 | *.so 5 | 6 | # Folders 7 | _obj 8 | _test 9 | 10 | # Architecture specific extensions/prefixes 11 | *.[568vq] 12 | [568vq].out 13 | 14 | *.cgo1.go 15 | *.cgo2.c 16 | _cgo_defun.c 17 | _cgo_gotypes.go 18 | _cgo_export.* 19 | 20 | _testmain.go 21 | 22 | *.exe 23 | *.test 24 | 25 | # Goland IDE 26 | .idea 27 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "raft-compat/raft-latest"] 2 | path = raft-compat/raft-previous-version 3 | url = https://github.com/hashicorp/raft.git 4 | -------------------------------------------------------------------------------- /.golangci-lint.yml: -------------------------------------------------------------------------------- 1 | # Copyright (c) HashiCorp, Inc. 
2 | # SPDX-License-Identifier: MPL-2.0 3 | 4 | run: 5 | deadline: 5m 6 | 7 | linters-settings: 8 | govet: 9 | check-shadowing: true 10 | golint: 11 | min-confidence: 0 12 | depguard: 13 | rules: 14 | main: 15 | list-mode: lax 16 | allow: 17 | - "github.com/hashicorp/go-metrics/compat" 18 | deny: 19 | - pkg: "github.com/hashicorp/go-metrics" 20 | desc: not allowed, use github.com/hashicorp/go-metrics/compat instead 21 | - pkg: "github.com/armon/go-metrics" 22 | desc: not allowed, use github.com/hashicorp/go-metrics/compat instead 23 | 24 | linters: 25 | disable-all: true 26 | enable: 27 | - gofmt 28 | #- golint 29 | - govet 30 | - depguard 31 | #- varcheck 32 | #- typecheck 33 | #- gosimple 34 | 35 | issues: 36 | exclude-use-default: false 37 | exclude: 38 | # ignore the false positive erros resulting from not including a comment above every `package` keyword 39 | - should have a package comment, unless it's in another file for this package (golint) 40 | # golint: Annoying issue about not having a comment. The rare codebase has such comments 41 | # - (comment on exported (method|function|type|const)|should have( a package)? comment|comment should be of the form) 42 | # errcheck: Almost all programs ignore errors on these functions and in most cases it's ok 43 | - Error return value of .((os\.)?std(out|err)\..*|.*Close|.*Flush|os\.Remove(All)?|.*printf?|os\.(Un)?Setenv). is not checked 44 | 45 | # golint: False positive when tests are defined in package 'test' 46 | - func name will be used as test\.Test.* by other packages, and that stutters; consider calling this 47 | 48 | # staticcheck: Developers tend to write in C-style with an 49 | # explicit 'break' in a 'switch', so it's ok to ignore 50 | - ineffective break statement. Did you mean to break out of the outer loop 51 | # gosec: Too many false-positives on 'unsafe' usage 52 | - Use of unsafe calls should be audited 53 | 54 | # gosec: Too many false-positives for parametrized shell calls 55 | - Subprocess launch(ed with variable|ing should be audited) 56 | 57 | # gosec: Duplicated errcheck checks 58 | - G104 59 | 60 | # gosec: Too many issues in popular repos 61 | - (Expect directory permissions to be 0750 or less|Expect file permissions to be 0600 or less) 62 | 63 | # gosec: False positive is triggered by 'src, err := ioutil.ReadFile(filename)' 64 | - Potential file inclusion via variable 65 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # Copyright (c) HashiCorp, Inc. 2 | # SPDX-License-Identifier: MPL-2.0 3 | 4 | language: go 5 | 6 | go: 7 | # Disabled until https://github.com/armon/go-metrics/issues/59 is fixed 8 | # - 1.6 9 | - 1.8 10 | - 1.9 11 | - 1.12 12 | - tip 13 | 14 | install: 15 | - make deps 16 | - curl -sfL https://install.goreleaser.com/github.com/golangci/golangci-lint.sh | sh -s -- -b $(go env GOPATH)/bin latest 17 | 18 | script: 19 | - make integ 20 | 21 | notifications: 22 | flowdock: 23 | secure: fZrcf9rlh2IrQrlch1sHkn3YI7SKvjGnAl/zyV5D6NROe1Bbr6d3QRMuCXWWdhJHzjKmXk5rIzbqJhUc0PNF7YjxGNKSzqWMQ56KcvN1k8DzlqxpqkcA3Jbs6fXCWo2fssRtZ7hj/wOP1f5n6cc7kzHDt9dgaYJ6nO2fqNPJiTc= 24 | 25 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | DEPS = $(go list -f '{{range .TestImports}}{{.}} {{end}}' ./...) 
2 | ENV = $(shell go env GOPATH) 3 | GO_VERSION = $(shell go version) 4 | GOLANG_CI_VERSION = v1.19.0 5 | 6 | # Look for versions prior to 1.10 which have a different fmt output 7 | # and don't lint with gofmt against them. 8 | ifneq (,$(findstring go version go1.8, $(GO_VERSION))) 9 | FMT= 10 | else ifneq (,$(findstring go version go1.9, $(GO_VERSION))) 11 | FMT= 12 | else 13 | FMT=--enable gofmt 14 | endif 15 | 16 | TEST_RESULTS_DIR?=/tmp/test-results 17 | 18 | test: 19 | GOTRACEBACK=all go test $(TESTARGS) -timeout=180s -race . 20 | GOTRACEBACK=all go test $(TESTARGS) -timeout=180s -tags batchtest -race . 21 | 22 | integ: test 23 | INTEG_TESTS=yes go test $(TESTARGS) -timeout=60s -run=Integ . 24 | INTEG_TESTS=yes go test $(TESTARGS) -timeout=60s -tags batchtest -run=Integ . 25 | 26 | fuzz: 27 | cd ./fuzzy && go test $(TESTARGS) -timeout=20m . 28 | cd ./fuzzy && go test $(TESTARGS) -timeout=20m -tags batchtest . 29 | 30 | deps: 31 | go get -t -d -v ./... 32 | echo $(DEPS) | xargs -n1 go get -d 33 | 34 | lint: 35 | gofmt -s -w . 36 | golangci-lint run -c .golangci-lint.yml $(FMT) . 37 | 38 | dep-linter: 39 | curl -sfL https://install.goreleaser.com/github.com/golangci/golangci-lint.sh | sh -s -- -b $(ENV)/bin $(GOLANG_CI_VERSION) 40 | 41 | cov: 42 | INTEG_TESTS=yes gocov test github.com/hashicorp/raft | gocov-html > /tmp/coverage.html 43 | open /tmp/coverage.html 44 | 45 | .PHONY: test cov integ deps dep-linter lint 46 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | raft [![Build Status](https://github.com/hashicorp/raft/workflows/ci/badge.svg)](https://github.com/hashicorp/raft/actions) 2 | [![Go Reference](https://pkg.go.dev/badge/github.com/hashicorp/raft.svg)](https://pkg.go.dev/github.com/hashicorp/raft) 3 | [![Go Report Card](https://goreportcard.com/badge/github.com/hashicorp/raft)](https://goreportcard.com/report/github.com/hashicorp/raft) 4 | [![License: MPL 2.0](https://img.shields.io/badge/License-MPL%202.0-brightgreen.svg)](https://opensource.org/licenses/MPL-2.0) 5 | [![Build Status](https://github.com/hashicorp/raft/actions/workflows/ci.yml/badge.svg)](https://github.com/hashicorp/raft/actions) 6 | [![Release](https://img.shields.io/github/v/release/hashicorp/raft)](https://github.com/hashicorp/raft/releases) 7 | [![Issues](https://img.shields.io/github/issues/hashicorp/raft)](https://github.com/hashicorp/raft/issues) 8 | [![Pull Requests](https://img.shields.io/github/issues-pr/hashicorp/raft)](https://github.com/hashicorp/raft/pulls) 9 | ==== 10 | 11 | raft is a [Go](http://www.golang.org) library that manages a replicated 12 | log and can be used with an FSM to manage replicated state machines. It 13 | is a library for providing [consensus](http://en.wikipedia.org/wiki/Consensus_(computer_science)). 14 | 15 | The use cases for such a library are far-reaching, such as replicated state 16 | machines which are a key component of many distributed systems. They enable 17 | building Consistent, Partition Tolerant (CP) systems, with limited 18 | fault tolerance as well. 19 | 20 | ## Building 21 | 22 | If you wish to build raft you'll need Go version 1.16+ installed. 23 | 24 | Please check your installation with: 25 | 26 | ``` 27 | go version 28 | ``` 29 | 30 | ## Documentation 31 | 32 | For complete documentation, see the associated [Godoc](http://godoc.org/github.com/hashicorp/raft). 
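To give a feel for how the pieces fit together before you dive into the godocs, here is a minimal, single-node sketch: an application-defined FSM plus `raft.NewRaft` wired up with the in-memory stores and transport that ship with this library. The `wordTracker`/`wordSnapshot` types, the `node-1` ID, and the `"hello"` command are made up for illustration, and error handling is deliberately crude; treat it as a sketch rather than a production setup.

```
package main

import (
	"fmt"
	"io"
	"time"

	"github.com/hashicorp/raft"
)

// wordTracker is a toy FSM that only remembers the last command applied.
type wordTracker struct{ last []byte }

func (f *wordTracker) Apply(l *raft.Log) interface{} {
	f.last = append([]byte(nil), l.Data...)
	return nil
}

func (f *wordTracker) Snapshot() (raft.FSMSnapshot, error) {
	return &wordSnapshot{data: append([]byte(nil), f.last...)}, nil
}

func (f *wordTracker) Restore(rc io.ReadCloser) error {
	defer rc.Close()
	b, err := io.ReadAll(rc)
	if err != nil {
		return err
	}
	f.last = b
	return nil
}

// wordSnapshot writes the captured FSM state to a snapshot sink.
type wordSnapshot struct{ data []byte }

func (s *wordSnapshot) Persist(sink raft.SnapshotSink) error {
	if _, err := sink.Write(s.data); err != nil {
		sink.Cancel()
		return err
	}
	return sink.Close()
}

func (s *wordSnapshot) Release() {}

func main() {
	config := raft.DefaultConfig()
	config.LocalID = raft.ServerID("node-1")

	// In-memory stores and transport keep the sketch self-contained.
	logStore := raft.NewInmemStore()
	stableStore := raft.NewInmemStore()
	snapshots := raft.NewInmemSnapshotStore()
	addr, transport := raft.NewInmemTransport("")

	r, err := raft.NewRaft(config, &wordTracker{}, logStore, stableStore, snapshots, transport)
	if err != nil {
		panic(err)
	}

	// Bootstrap a single-node cluster so this node can elect itself leader.
	cfg := raft.Configuration{Servers: []raft.Server{{ID: config.LocalID, Address: addr}}}
	if err := r.BootstrapCluster(cfg).Error(); err != nil {
		panic(err)
	}

	// Wait for leadership, then commit and apply a command through the log.
	<-r.LeaderCh()
	if err := r.Apply([]byte("hello"), time.Second).Error(); err != nil {
		panic(err)
	}
	fmt.Println("last applied index:", r.AppliedIndex())
}
```

A real deployment replaces the in-memory `LogStore`, `StableStore`, and snapshot store with the durable implementations described below, and uses a network transport such as the one returned by `NewTCPTransport`.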
33 | 34 | To prevent complications with cgo, the primary backend `MDBStore` is in a separate repository, 35 | called [raft-mdb](http://github.com/hashicorp/raft-mdb). That is the recommended implementation 36 | for the `LogStore` and `StableStore`. 37 | 38 | A pure Go backend using [Bbolt](https://github.com/etcd-io/bbolt) is also available called 39 | [raft-boltdb](https://github.com/hashicorp/raft-boltdb). It can also be used as a `LogStore` 40 | and `StableStore`. 41 | 42 | 43 | ## Community Contributed Examples 44 | - [Raft gRPC Example](https://github.com/Jille/raft-grpc-example) - Utilizing the Raft repository with gRPC 45 | - [Raft-based KV-store Example](https://github.com/otoolep/hraftd) - Uses Hashicorp Raft to build a distributed key-value store 46 | 47 | 48 | ## Tagged Releases 49 | 50 | As of September 2017, HashiCorp will start using tags for this library to clearly indicate 51 | major version updates. We recommend you vendor your application's dependency on this library. 52 | 53 | * v0.1.0 is the original stable version of the library that was in main and has been maintained 54 | with no breaking API changes. This was in use by Consul prior to version 0.7.0. 55 | 56 | * v1.0.0 takes the changes that were staged in the library-v2-stage-one branch. This version 57 | manages server identities using a UUID, so introduces some breaking API changes. It also versions 58 | the Raft protocol, and requires some special steps when interoperating with Raft servers running 59 | older versions of the library (see the detailed comment in config.go about version compatibility). 60 | You can reference https://github.com/hashicorp/consul/pull/2222 for an idea of what was required 61 | to port Consul to these new interfaces. 62 | 63 | This version includes some new features as well, including non voting servers, a new address 64 | provider abstraction in the transport layer, and more resilient snapshots. 65 | 66 | ## Protocol 67 | 68 | raft is based on ["Raft: In Search of an Understandable Consensus Algorithm"](https://raft.github.io/raft.pdf) 69 | 70 | A high level overview of the Raft protocol is described below, but for details please read the full 71 | [Raft paper](https://raft.github.io/raft.pdf) 72 | followed by the raft source. Any questions about the raft protocol should be sent to the 73 | [raft-dev mailing list](https://groups.google.com/forum/#!forum/raft-dev). 74 | 75 | ### Protocol Description 76 | 77 | Raft nodes are always in one of three states: follower, candidate or leader. All 78 | nodes initially start out as a follower. In this state, nodes can accept log entries 79 | from a leader and cast votes. If no entries are received for some time, nodes 80 | self-promote to the candidate state. In the candidate state nodes request votes from 81 | their peers. If a candidate receives a quorum of votes, then it is promoted to a leader. 82 | The leader must accept new log entries and replicate to all the other followers. 83 | In addition, if stale reads are not acceptable, all queries must also be performed on 84 | the leader. 85 | 86 | Once a cluster has a leader, it is able to accept new log entries. A client can 87 | request that a leader append a new log entry, which is an opaque binary blob to 88 | Raft. The leader then writes the entry to durable storage and attempts to replicate 89 | to a quorum of followers. Once the log entry is considered *committed*, it can be 90 | *applied* to a finite state machine. 
The finite state machine is application specific, 91 | and is implemented using an interface. 92 | 93 | An obvious question relates to the unbounded nature of a replicated log. Raft provides 94 | a mechanism by which the current state is snapshotted, and the log is compacted. Because 95 | of the FSM abstraction, restoring the state of the FSM must result in the same state 96 | as a replay of old logs. This allows Raft to capture the FSM state at a point in time, 97 | and then remove all the logs that were used to reach that state. This is performed automatically 98 | without user intervention, and prevents unbounded disk usage as well as minimizing 99 | time spent replaying logs. 100 | 101 | Lastly, there is the issue of updating the peer set when new servers are joining 102 | or existing servers are leaving. As long as a quorum of nodes is available, this 103 | is not an issue as Raft provides mechanisms to dynamically update the peer set. 104 | If a quorum of nodes is unavailable, then this becomes a very challenging issue. 105 | For example, suppose there are only 2 peers, A and B. The quorum size is also 106 | 2, meaning both nodes must agree to commit a log entry. If either A or B fails, 107 | it is now impossible to reach quorum. This means the cluster is unable to add, 108 | or remove a node, or commit any additional log entries. This results in *unavailability*. 109 | At this point, manual intervention would be required to remove either A or B, 110 | and to restart the remaining node in bootstrap mode. 111 | 112 | A Raft cluster of 3 nodes can tolerate a single node failure, while a cluster 113 | of 5 can tolerate 2 node failures. The recommended configuration is to either 114 | run 3 or 5 raft servers. This maximizes availability without 115 | greatly sacrificing performance. 116 | 117 | In terms of performance, Raft is comparable to Paxos. Assuming stable leadership, 118 | committing a log entry requires a single round trip to half of the cluster. 119 | Thus performance is bound by disk I/O and network latency. 120 | 121 | 122 | ## Metrics Emission and Compatibility 123 | 124 | This library can emit metrics using either `github.com/armon/go-metrics` or `github.com/hashicorp/go-metrics`. Choosing between the libraries is controlled via build tags. 125 | 126 | **Build Tags** 127 | * `armonmetrics` - Using this tag will cause metrics to be routed to `armon/go-metrics` 128 | * `hashicorpmetrics` - Using this tag will cause all metrics to be routed to `hashicorp/go-metrics` 129 | 130 | If no build tag is specified, the default behavior is to use `armon/go-metrics`. 131 | 132 | **Deprecating `armon/go-metrics`** 133 | 134 | Emitting metrics to `armon/go-metrics` is officially deprecated. Usage of `armon/go-metrics` will remain the default until mid-2025 with opt-in support continuing to the end of 2025. 135 | 136 | **Migration** 137 | To migrate an application currently using the older `armon/go-metrics` to instead use `hashicorp/go-metrics` the following should be done. 138 | 139 | 1. Upgrade libraries using `armon/go-metrics` to consume `hashicorp/go-metrics/compat` instead. This should involve only changing import statements. All repositories in the `hashicorp` namespace 140 | 2. Update an applications library dependencies to those that have the compatibility layer configured. 141 | 3. 
Update the application to use `hashicorp/go-metrics` for configuring metrics export instead of `armon/go-metrics` 142 | * Replace all application imports of `github.com/armon/go-metrics` with `github.com/hashicorp/go-metrics` 143 | * Instrument your build system to build with the `hashicorpmetrics` tag. 144 | 145 | Eventually once the default behavior changes to use `hashicorp/go-metrics` by default (mid-2025), you can drop the `hashicorpmetrics` build tag. 146 | -------------------------------------------------------------------------------- /bench/bench.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) HashiCorp, Inc. 2 | // SPDX-License-Identifier: MPL-2.0 3 | 4 | package raftbench 5 | 6 | // raftbench provides common benchmarking functions which can be used by 7 | // anything which implements the raft.LogStore and raft.StableStore interfaces. 8 | // All functions accept these interfaces and perform benchmarking. This 9 | // makes comparing backend performance easier by sharing the tests. 10 | 11 | import ( 12 | "testing" 13 | 14 | "github.com/hashicorp/raft" 15 | ) 16 | 17 | func FirstIndex(b *testing.B, store raft.LogStore) { 18 | // Create some fake data 19 | var logs []*raft.Log 20 | for i := 1; i < 10; i++ { 21 | logs = append(logs, &raft.Log{Index: uint64(i), Data: []byte("data")}) 22 | } 23 | if err := store.StoreLogs(logs); err != nil { 24 | b.Fatalf("err: %s", err) 25 | } 26 | b.ResetTimer() 27 | 28 | // Run FirstIndex a number of times 29 | for n := 0; n < b.N; n++ { 30 | store.FirstIndex() 31 | } 32 | } 33 | 34 | func LastIndex(b *testing.B, store raft.LogStore) { 35 | // Create some fake data 36 | var logs []*raft.Log 37 | for i := 1; i < 10; i++ { 38 | logs = append(logs, &raft.Log{Index: uint64(i), Data: []byte("data")}) 39 | } 40 | if err := store.StoreLogs(logs); err != nil { 41 | b.Fatalf("err: %s", err) 42 | } 43 | b.ResetTimer() 44 | 45 | // Run LastIndex a number of times 46 | for n := 0; n < b.N; n++ { 47 | store.LastIndex() 48 | } 49 | } 50 | 51 | func GetLog(b *testing.B, store raft.LogStore) { 52 | // Create some fake data 53 | var logs []*raft.Log 54 | for i := 1; i < 10; i++ { 55 | logs = append(logs, &raft.Log{Index: uint64(i), Data: []byte("data")}) 56 | } 57 | if err := store.StoreLogs(logs); err != nil { 58 | b.Fatalf("err: %s", err) 59 | } 60 | b.ResetTimer() 61 | 62 | // Run GetLog a number of times 63 | for n := 0; n < b.N; n++ { 64 | if err := store.GetLog(5, new(raft.Log)); err != nil { 65 | b.Fatalf("err: %s", err) 66 | } 67 | } 68 | } 69 | 70 | func StoreLog(b *testing.B, store raft.LogStore) { 71 | // Run StoreLog a number of times 72 | for n := 0; n < b.N; n++ { 73 | log := &raft.Log{Index: uint64(n), Data: []byte("data")} 74 | if err := store.StoreLog(log); err != nil { 75 | b.Fatalf("err: %s", err) 76 | } 77 | } 78 | } 79 | 80 | func StoreLogs(b *testing.B, store raft.LogStore) { 81 | // Run StoreLogs a number of times. We want to set multiple logs each 82 | // run, so we create 3 logs with incrementing indexes for each iteration. 
83 | for n := 0; n < b.N; n++ { 84 | b.StopTimer() 85 | offset := 3 * (n + 1) 86 | logs := []*raft.Log{ 87 | {Index: uint64(offset - 2), Data: []byte("data")}, 88 | {Index: uint64(offset - 1), Data: []byte("data")}, 89 | {Index: uint64(offset), Data: []byte("data")}, 90 | } 91 | b.StartTimer() 92 | 93 | if err := store.StoreLogs(logs); err != nil { 94 | b.Fatalf("err: %s", err) 95 | } 96 | } 97 | } 98 | 99 | func DeleteRange(b *testing.B, store raft.LogStore) { 100 | // Create some fake data. In this case, we create 3 new log entries for each 101 | // test case, and separate them by index in multiples of 10. This allows 102 | // some room so that we can test deleting ranges with "extra" logs 103 | // to ensure we stop going to the database once our max index is hit. 104 | var logs []*raft.Log 105 | for n := 0; n < b.N; n++ { 106 | offset := 10 * n 107 | for i := offset; i < offset+3; i++ { 108 | logs = append(logs, &raft.Log{Index: uint64(i), Data: []byte("data")}) 109 | } 110 | } 111 | if err := store.StoreLogs(logs); err != nil { 112 | b.Fatalf("err: %s", err) 113 | } 114 | b.ResetTimer() 115 | 116 | // Delete a range of the data 117 | for n := 0; n < b.N; n++ { 118 | offset := 10 * n 119 | if err := store.DeleteRange(uint64(offset), uint64(offset+9)); err != nil { 120 | b.Fatalf("err: %s", err) 121 | } 122 | } 123 | } 124 | 125 | func Set(b *testing.B, store raft.StableStore) { 126 | // Run Set a number of times 127 | for n := 0; n < b.N; n++ { 128 | if err := store.Set([]byte{byte(n)}, []byte("val")); err != nil { 129 | b.Fatalf("err: %s", err) 130 | } 131 | } 132 | } 133 | 134 | func Get(b *testing.B, store raft.StableStore) { 135 | // Create some fake data 136 | for i := 1; i < 10; i++ { 137 | if err := store.Set([]byte{byte(i)}, []byte("val")); err != nil { 138 | b.Fatalf("err: %s", err) 139 | } 140 | } 141 | b.ResetTimer() 142 | 143 | // Run Get a number of times 144 | for n := 0; n < b.N; n++ { 145 | if _, err := store.Get([]byte{0x05}); err != nil { 146 | b.Fatalf("err: %s", err) 147 | } 148 | } 149 | } 150 | 151 | func SetUint64(b *testing.B, store raft.StableStore) { 152 | // Run SetUint64 a number of times 153 | for n := 0; n < b.N; n++ { 154 | if err := store.SetUint64([]byte{byte(n)}, uint64(n)); err != nil { 155 | b.Fatalf("err: %s", err) 156 | } 157 | } 158 | } 159 | 160 | func GetUint64(b *testing.B, store raft.StableStore) { 161 | // Create some fake data 162 | for i := 0; i < 10; i++ { 163 | if err := store.SetUint64([]byte{byte(i)}, uint64(i)); err != nil { 164 | b.Fatalf("err: %s", err) 165 | } 166 | } 167 | b.ResetTimer() 168 | 169 | // Run GetUint64 a number of times 170 | for n := 0; n < b.N; n++ { 171 | if _, err := store.GetUint64([]byte{0x05}); err != nil { 172 | b.Fatalf("err: %s", err) 173 | } 174 | } 175 | } 176 | -------------------------------------------------------------------------------- /bench_test.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) HashiCorp, Inc. 
2 | // SPDX-License-Identifier: MPL-2.0 3 | 4 | package raft 5 | 6 | import ( 7 | "testing" 8 | "time" 9 | 10 | "github.com/hashicorp/go-hclog" 11 | ) 12 | 13 | func BenchmarkStoreLogInMem(b *testing.B) { 14 | conf := DefaultConfig() 15 | conf.LocalID = "first" 16 | conf.HeartbeatTimeout = 50 * time.Millisecond 17 | conf.ElectionTimeout = 50 * time.Millisecond 18 | conf.LeaderLeaseTimeout = 50 * time.Millisecond 19 | conf.CommitTimeout = 5 * time.Millisecond 20 | conf.SnapshotThreshold = 100 21 | conf.TrailingLogs = 10 22 | conf.LogLevel = "OFF" 23 | raft := MakeRaft(b, conf, true) 24 | raft.logger.SetLevel(hclog.Off) 25 | 26 | NoErr(WaitFor(raft, Leader), b) 27 | 28 | applyAndWait := func(leader *RaftEnv, n, sz int) { 29 | // Do some commits 30 | var futures []ApplyFuture 31 | for i := 0; i < n; i++ { 32 | futures = append(futures, leader.raft.Apply(logBytes(i, sz), 0)) 33 | } 34 | for _, f := range futures { 35 | NoErr(WaitFuture(f), b) 36 | leader.logger.Debug("applied", "index", f.Index(), "size", sz) 37 | } 38 | } 39 | 40 | for i := 0; i < b.N; i++ { 41 | // Do some commits 42 | applyAndWait(raft, 100, 10) 43 | // Do a snapshot 44 | NoErr(WaitFuture(raft.raft.Snapshot()), b) 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /commands.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) HashiCorp, Inc. 2 | // SPDX-License-Identifier: MPL-2.0 3 | 4 | package raft 5 | 6 | // RPCHeader is a common sub-structure used to pass along protocol version and 7 | // other information about the cluster. For older Raft implementations before 8 | // versioning was added this will default to a zero-valued structure when read 9 | // by newer Raft versions. 10 | type RPCHeader struct { 11 | // ProtocolVersion is the version of the protocol the sender is 12 | // speaking. 13 | ProtocolVersion ProtocolVersion 14 | // ID is the ServerID of the node sending the RPC Request or Response 15 | ID []byte 16 | // Addr is the ServerAddr of the node sending the RPC Request or Response 17 | Addr []byte 18 | } 19 | 20 | // WithRPCHeader is an interface that exposes the RPC header. 21 | type WithRPCHeader interface { 22 | GetRPCHeader() RPCHeader 23 | } 24 | 25 | // AppendEntriesRequest is the command used to append entries to the 26 | // replicated log. 27 | type AppendEntriesRequest struct { 28 | RPCHeader 29 | 30 | // Provide the current term and leader 31 | Term uint64 32 | 33 | // Deprecated: use RPCHeader.Addr instead 34 | Leader []byte 35 | 36 | // Provide the previous entries for integrity checking 37 | PrevLogEntry uint64 38 | PrevLogTerm uint64 39 | 40 | // New entries to commit 41 | Entries []*Log 42 | 43 | // Commit index on the leader 44 | LeaderCommitIndex uint64 45 | } 46 | 47 | // GetRPCHeader - See WithRPCHeader. 48 | func (r *AppendEntriesRequest) GetRPCHeader() RPCHeader { 49 | return r.RPCHeader 50 | } 51 | 52 | // AppendEntriesResponse is the response returned from an 53 | // AppendEntriesRequest. 54 | type AppendEntriesResponse struct { 55 | RPCHeader 56 | 57 | // Newer term if leader is out of date 58 | Term uint64 59 | 60 | // Last Log is a hint to help accelerate rebuilding slow nodes 61 | LastLog uint64 62 | 63 | // We may not succeed if we have a conflicting entry 64 | Success bool 65 | 66 | // There are scenarios where this request didn't succeed 67 | // but there's no need to wait/back-off the next attempt. 68 | NoRetryBackoff bool 69 | } 70 | 71 | // GetRPCHeader - See WithRPCHeader. 
72 | func (r *AppendEntriesResponse) GetRPCHeader() RPCHeader { 73 | return r.RPCHeader 74 | } 75 | 76 | // RequestVoteRequest is the command used by a candidate to ask a Raft peer 77 | // for a vote in an election. 78 | type RequestVoteRequest struct { 79 | RPCHeader 80 | 81 | // Provide the term and our id 82 | Term uint64 83 | 84 | // Deprecated: use RPCHeader.Addr instead 85 | Candidate []byte 86 | 87 | // Used to ensure safety 88 | LastLogIndex uint64 89 | LastLogTerm uint64 90 | 91 | // Used to indicate to peers if this vote was triggered by a leadership 92 | // transfer. It is required for leadership transfer to work, because servers 93 | // wouldn't vote otherwise if they are aware of an existing leader. 94 | LeadershipTransfer bool 95 | } 96 | 97 | // GetRPCHeader - See WithRPCHeader. 98 | func (r *RequestVoteRequest) GetRPCHeader() RPCHeader { 99 | return r.RPCHeader 100 | } 101 | 102 | // RequestVoteResponse is the response returned from a RequestVoteRequest. 103 | type RequestVoteResponse struct { 104 | RPCHeader 105 | 106 | // Newer term if leader is out of date. 107 | Term uint64 108 | 109 | // Peers is deprecated, but required by servers that only understand 110 | // protocol version 0. This is not populated in protocol version 2 111 | // and later. 112 | Peers []byte 113 | 114 | // Is the vote granted. 115 | Granted bool 116 | } 117 | 118 | // GetRPCHeader - See WithRPCHeader. 119 | func (r *RequestVoteResponse) GetRPCHeader() RPCHeader { 120 | return r.RPCHeader 121 | } 122 | 123 | // RequestPreVoteRequest is the command used by a candidate to ask a Raft peer 124 | // for a vote in an election. 125 | type RequestPreVoteRequest struct { 126 | RPCHeader 127 | 128 | // Provide the term and our id 129 | Term uint64 130 | 131 | // Used to ensure safety 132 | LastLogIndex uint64 133 | LastLogTerm uint64 134 | } 135 | 136 | // GetRPCHeader - See WithRPCHeader. 137 | func (r *RequestPreVoteRequest) GetRPCHeader() RPCHeader { 138 | return r.RPCHeader 139 | } 140 | 141 | // RequestPreVoteResponse is the response returned from a RequestPreVoteRequest. 142 | type RequestPreVoteResponse struct { 143 | RPCHeader 144 | 145 | // Newer term if leader is out of date. 146 | Term uint64 147 | 148 | // Is the vote granted. 149 | Granted bool 150 | } 151 | 152 | // GetRPCHeader - See WithRPCHeader. 153 | func (r *RequestPreVoteResponse) GetRPCHeader() RPCHeader { 154 | return r.RPCHeader 155 | } 156 | 157 | // InstallSnapshotRequest is the command sent to a Raft peer to bootstrap its 158 | // log (and state machine) from a snapshot on another peer. 159 | type InstallSnapshotRequest struct { 160 | RPCHeader 161 | SnapshotVersion SnapshotVersion 162 | 163 | Term uint64 164 | Leader []byte 165 | 166 | // These are the last index/term included in the snapshot 167 | LastLogIndex uint64 168 | LastLogTerm uint64 169 | 170 | // Peer Set in the snapshot. 171 | // but remains here in case we receive an InstallSnapshot from a leader 172 | // that's running old code. 173 | // Deprecated: This is deprecated in favor of Configuration 174 | Peers []byte 175 | 176 | // Cluster membership. 177 | Configuration []byte 178 | // Log index where 'Configuration' entry was originally written. 179 | ConfigurationIndex uint64 180 | 181 | // Size of the snapshot 182 | Size int64 183 | } 184 | 185 | // GetRPCHeader - See WithRPCHeader. 
186 | func (r *InstallSnapshotRequest) GetRPCHeader() RPCHeader { 187 | return r.RPCHeader 188 | } 189 | 190 | // InstallSnapshotResponse is the response returned from an 191 | // InstallSnapshotRequest. 192 | type InstallSnapshotResponse struct { 193 | RPCHeader 194 | 195 | Term uint64 196 | Success bool 197 | } 198 | 199 | // GetRPCHeader - See WithRPCHeader. 200 | func (r *InstallSnapshotResponse) GetRPCHeader() RPCHeader { 201 | return r.RPCHeader 202 | } 203 | 204 | // TimeoutNowRequest is the command used by a leader to signal another server to 205 | // start an election. 206 | type TimeoutNowRequest struct { 207 | RPCHeader 208 | } 209 | 210 | // GetRPCHeader - See WithRPCHeader. 211 | func (r *TimeoutNowRequest) GetRPCHeader() RPCHeader { 212 | return r.RPCHeader 213 | } 214 | 215 | // TimeoutNowResponse is the response to TimeoutNowRequest. 216 | type TimeoutNowResponse struct { 217 | RPCHeader 218 | } 219 | 220 | // GetRPCHeader - See WithRPCHeader. 221 | func (r *TimeoutNowResponse) GetRPCHeader() RPCHeader { 222 | return r.RPCHeader 223 | } 224 | -------------------------------------------------------------------------------- /commitment.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) HashiCorp, Inc. 2 | // SPDX-License-Identifier: MPL-2.0 3 | 4 | package raft 5 | 6 | import ( 7 | "sort" 8 | "sync" 9 | ) 10 | 11 | // Commitment is used to advance the leader's commit index. The leader and 12 | // replication goroutines report in newly written entries with match(), and 13 | // this notifies on commitCh when the commit index has advanced. 14 | type commitment struct { 15 | // protects matchIndexes and commitIndex 16 | sync.Mutex 17 | // notified when commitIndex increases 18 | commitCh chan struct{} 19 | // voter ID to log index: the server stores up through this log entry 20 | matchIndexes map[ServerID]uint64 21 | // a quorum stores up through this log entry. monotonically increases. 22 | commitIndex uint64 23 | // the first index of this leader's term: this needs to be replicated to a 24 | // majority of the cluster before this leader may mark anything committed 25 | // (per Raft's commitment rule) 26 | startIndex uint64 27 | } 28 | 29 | // newCommitment returns a commitment struct that notifies the provided 30 | // channel when log entries have been committed. A new commitment struct is 31 | // created each time this server becomes leader for a particular term. 32 | // 'configuration' is the servers in the cluster. 33 | // 'startIndex' is the first index created in this term (see 34 | // its description above). 35 | func newCommitment(commitCh chan struct{}, configuration Configuration, startIndex uint64) *commitment { 36 | matchIndexes := make(map[ServerID]uint64) 37 | for _, server := range configuration.Servers { 38 | if server.Suffrage == Voter { 39 | matchIndexes[server.ID] = 0 40 | } 41 | } 42 | return &commitment{ 43 | commitCh: commitCh, 44 | matchIndexes: matchIndexes, 45 | commitIndex: 0, 46 | startIndex: startIndex, 47 | } 48 | } 49 | 50 | // Called when a new cluster membership configuration is created: it will be 51 | // used to determine commitment from now on. 'configuration' is the servers in 52 | // the cluster. 
53 | func (c *commitment) setConfiguration(configuration Configuration) { 54 | c.Lock() 55 | defer c.Unlock() 56 | oldMatchIndexes := c.matchIndexes 57 | c.matchIndexes = make(map[ServerID]uint64) 58 | for _, server := range configuration.Servers { 59 | if server.Suffrage == Voter { 60 | c.matchIndexes[server.ID] = oldMatchIndexes[server.ID] // defaults to 0 61 | } 62 | } 63 | c.recalculate() 64 | } 65 | 66 | // Called by leader after commitCh is notified 67 | func (c *commitment) getCommitIndex() uint64 { 68 | c.Lock() 69 | defer c.Unlock() 70 | return c.commitIndex 71 | } 72 | 73 | // Match is called once a server completes writing entries to disk: either the 74 | // leader has written the new entry or a follower has replied to an 75 | // AppendEntries RPC. The given server's disk agrees with this server's log up 76 | // through the given index. 77 | func (c *commitment) match(server ServerID, matchIndex uint64) { 78 | c.Lock() 79 | defer c.Unlock() 80 | if prev, hasVote := c.matchIndexes[server]; hasVote && matchIndex > prev { 81 | c.matchIndexes[server] = matchIndex 82 | c.recalculate() 83 | } 84 | } 85 | 86 | // Internal helper to calculate new commitIndex from matchIndexes. 87 | // Must be called with lock held. 88 | func (c *commitment) recalculate() { 89 | if len(c.matchIndexes) == 0 { 90 | return 91 | } 92 | 93 | matched := make([]uint64, 0, len(c.matchIndexes)) 94 | for _, idx := range c.matchIndexes { 95 | matched = append(matched, idx) 96 | } 97 | sort.Sort(uint64Slice(matched)) 98 | quorumMatchIndex := matched[(len(matched)-1)/2] 99 | 100 | if quorumMatchIndex > c.commitIndex && quorumMatchIndex >= c.startIndex { 101 | c.commitIndex = quorumMatchIndex 102 | asyncNotifyCh(c.commitCh) 103 | } 104 | } 105 | -------------------------------------------------------------------------------- /commitment_test.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) HashiCorp, Inc. 2 | // SPDX-License-Identifier: MPL-2.0 3 | 4 | package raft 5 | 6 | import ( 7 | "testing" 8 | ) 9 | 10 | func makeConfiguration(voters []string) Configuration { 11 | var configuration Configuration 12 | for _, voter := range voters { 13 | configuration.Servers = append(configuration.Servers, Server{ 14 | Suffrage: Voter, 15 | Address: ServerAddress(voter + "addr"), 16 | ID: ServerID(voter), 17 | }) 18 | } 19 | return configuration 20 | } 21 | 22 | // Returns a slice of server names of size n. 23 | func voters(n int) Configuration { 24 | if n > 7 { 25 | panic("only up to 7 servers implemented") 26 | } 27 | return makeConfiguration([]string{"s1", "s2", "s3", "s4", "s5", "s6", "s7"}[:n]) 28 | } 29 | 30 | // Tests setVoters() keeps matchIndexes where possible. 31 | func TestCommitment_setVoters(t *testing.T) { 32 | commitCh := make(chan struct{}, 1) 33 | c := newCommitment(commitCh, makeConfiguration([]string{"a", "b", "c"}), 0) 34 | c.match("a", 10) 35 | c.match("b", 20) 36 | c.match("c", 30) 37 | // commitIndex: 20 38 | if !drainNotifyCh(commitCh) { 39 | t.Fatalf("expected commit notify") 40 | } 41 | c.setConfiguration(makeConfiguration([]string{"c", "d", "e"})) 42 | // c: 30, d: 0, e: 0 43 | c.match("e", 40) 44 | if c.getCommitIndex() != 30 { 45 | t.Fatalf("expected 30 entries committed, found %d", 46 | c.getCommitIndex()) 47 | } 48 | if !drainNotifyCh(commitCh) { 49 | t.Fatalf("expected commit notify") 50 | } 51 | } 52 | 53 | // Tests match() being called with smaller index than before. 
54 | func TestCommitment_match_max(t *testing.T) { 55 | commitCh := make(chan struct{}, 1) 56 | c := newCommitment(commitCh, voters(5), 4) 57 | 58 | c.match("s1", 8) 59 | c.match("s2", 8) 60 | c.match("s2", 1) 61 | c.match("s3", 8) 62 | 63 | if c.getCommitIndex() != 8 { 64 | t.Fatalf("calling match with an earlier index should be ignored") 65 | } 66 | } 67 | 68 | // Tests match() being called with non-voters. 69 | func TestCommitment_match_nonVoting(t *testing.T) { 70 | commitCh := make(chan struct{}, 1) 71 | c := newCommitment(commitCh, voters(5), 4) 72 | 73 | c.match("s1", 8) 74 | c.match("s2", 8) 75 | c.match("s3", 8) 76 | 77 | if !drainNotifyCh(commitCh) { 78 | t.Fatalf("expected commit notify") 79 | } 80 | 81 | c.match("s90", 10) 82 | c.match("s91", 10) 83 | c.match("s92", 10) 84 | 85 | if c.getCommitIndex() != 8 { 86 | t.Fatalf("non-voting servers shouldn't be able to commit") 87 | } 88 | if drainNotifyCh(commitCh) { 89 | t.Fatalf("unexpected commit notify") 90 | } 91 | } 92 | 93 | // Tests recalculate() algorithm. 94 | func TestCommitment_recalculate(t *testing.T) { 95 | commitCh := make(chan struct{}, 1) 96 | c := newCommitment(commitCh, voters(5), 0) 97 | 98 | c.match("s1", 30) 99 | c.match("s2", 20) 100 | 101 | if c.getCommitIndex() != 0 { 102 | t.Fatalf("shouldn't commit after two of five servers") 103 | } 104 | if drainNotifyCh(commitCh) { 105 | t.Fatalf("unexpected commit notify") 106 | } 107 | 108 | c.match("s3", 10) 109 | if c.getCommitIndex() != 10 { 110 | t.Fatalf("expected 10 entries committed, found %d", 111 | c.getCommitIndex()) 112 | } 113 | if !drainNotifyCh(commitCh) { 114 | t.Fatalf("expected commit notify") 115 | } 116 | c.match("s4", 15) 117 | if c.getCommitIndex() != 15 { 118 | t.Fatalf("expected 15 entries committed, found %d", 119 | c.getCommitIndex()) 120 | } 121 | if !drainNotifyCh(commitCh) { 122 | t.Fatalf("expected commit notify") 123 | } 124 | 125 | c.setConfiguration(voters(3)) 126 | // s1: 30, s2: 20, s3: 10 127 | if c.getCommitIndex() != 20 { 128 | t.Fatalf("expected 20 entries committed, found %d", 129 | c.getCommitIndex()) 130 | } 131 | if !drainNotifyCh(commitCh) { 132 | t.Fatalf("expected commit notify") 133 | } 134 | 135 | c.setConfiguration(voters(4)) 136 | // s1: 30, s2: 20, s3: 10, s4: 0 137 | c.match("s2", 25) 138 | if c.getCommitIndex() != 20 { 139 | t.Fatalf("expected 20 entries committed, found %d", 140 | c.getCommitIndex()) 141 | } 142 | if drainNotifyCh(commitCh) { 143 | t.Fatalf("unexpected commit notify") 144 | } 145 | c.match("s4", 23) 146 | if c.getCommitIndex() != 23 { 147 | t.Fatalf("expected 23 entries committed, found %d", 148 | c.getCommitIndex()) 149 | } 150 | if !drainNotifyCh(commitCh) { 151 | t.Fatalf("expected commit notify") 152 | } 153 | } 154 | 155 | // Tests recalculate() respecting startIndex. 
156 | func TestCommitment_recalculate_startIndex(t *testing.T) { 157 | commitCh := make(chan struct{}, 1) 158 | c := newCommitment(commitCh, voters(5), 4) 159 | 160 | c.match("s1", 3) 161 | c.match("s2", 3) 162 | c.match("s3", 3) 163 | 164 | if c.getCommitIndex() != 0 { 165 | t.Fatalf("can't commit until startIndex is replicated to a quorum") 166 | } 167 | if drainNotifyCh(commitCh) { 168 | t.Fatalf("unexpected commit notify") 169 | } 170 | 171 | c.match("s1", 4) 172 | c.match("s2", 4) 173 | c.match("s3", 4) 174 | 175 | if c.getCommitIndex() != 4 { 176 | t.Fatalf("should be able to commit startIndex once replicated to a quorum") 177 | } 178 | if !drainNotifyCh(commitCh) { 179 | t.Fatalf("expected commit notify") 180 | } 181 | } 182 | 183 | // With no voting members in the cluster, the most sane behavior is probably 184 | // to not mark anything committed. 185 | func TestCommitment_noVoterSanity(t *testing.T) { 186 | commitCh := make(chan struct{}, 1) 187 | c := newCommitment(commitCh, makeConfiguration([]string{}), 4) 188 | c.match("s1", 10) 189 | c.setConfiguration(makeConfiguration([]string{})) 190 | c.match("s1", 10) 191 | if c.getCommitIndex() != 0 { 192 | t.Fatalf("no voting servers: shouldn't be able to commit") 193 | } 194 | if drainNotifyCh(commitCh) { 195 | t.Fatalf("unexpected commit notify") 196 | } 197 | 198 | // add a voter so we can commit something and then remove it 199 | c.setConfiguration(voters(1)) 200 | c.match("s1", 10) 201 | if c.getCommitIndex() != 10 { 202 | t.Fatalf("expected 10 entries committed, found %d", 203 | c.getCommitIndex()) 204 | } 205 | if !drainNotifyCh(commitCh) { 206 | t.Fatalf("expected commit notify") 207 | } 208 | 209 | c.setConfiguration(makeConfiguration([]string{})) 210 | c.match("s1", 20) 211 | if c.getCommitIndex() != 10 { 212 | t.Fatalf("expected 10 entries committed, found %d", 213 | c.getCommitIndex()) 214 | } 215 | if drainNotifyCh(commitCh) { 216 | t.Fatalf("unexpected commit notify") 217 | } 218 | } 219 | 220 | // Single voter commits immediately. 221 | func TestCommitment_singleVoter(t *testing.T) { 222 | commitCh := make(chan struct{}, 1) 223 | c := newCommitment(commitCh, voters(1), 4) 224 | c.match("s1", 10) 225 | if c.getCommitIndex() != 10 { 226 | t.Fatalf("expected 10 entries committed, found %d", 227 | c.getCommitIndex()) 228 | } 229 | if !drainNotifyCh(commitCh) { 230 | t.Fatalf("expected commit notify") 231 | } 232 | c.setConfiguration(voters(1)) 233 | if drainNotifyCh(commitCh) { 234 | t.Fatalf("unexpected commit notify") 235 | } 236 | c.match("s1", 12) 237 | if c.getCommitIndex() != 12 { 238 | t.Fatalf("expected 12 entries committed, found %d", 239 | c.getCommitIndex()) 240 | } 241 | if !drainNotifyCh(commitCh) { 242 | t.Fatalf("expected commit notify") 243 | } 244 | } 245 | -------------------------------------------------------------------------------- /discard_snapshot.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) HashiCorp, Inc. 2 | // SPDX-License-Identifier: MPL-2.0 3 | 4 | package raft 5 | 6 | import ( 7 | "fmt" 8 | "io" 9 | ) 10 | 11 | // DiscardSnapshotStore is used to successfully snapshot while 12 | // always discarding the snapshot. This is useful for when the 13 | // log should be truncated but no snapshot should be retained. 14 | // This should never be used for production use, and is only 15 | // suitable for testing. 
16 | type DiscardSnapshotStore struct{}
17 |
18 | // DiscardSnapshotSink is used to fulfill the SnapshotSink interface
19 | // while always discarding the snapshot. This is useful for when the log
20 | // should be truncated but no snapshot should be retained. This
21 | // should never be used for production use, and is only suitable
22 | // for testing.
23 | type DiscardSnapshotSink struct{}
24 |
25 | // NewDiscardSnapshotStore is used to create a new DiscardSnapshotStore.
26 | func NewDiscardSnapshotStore() *DiscardSnapshotStore {
27 | return &DiscardSnapshotStore{}
28 | }
29 |
30 | // Create returns a valid type implementing the SnapshotSink which
31 | // always discards the snapshot.
32 | func (d *DiscardSnapshotStore) Create(version SnapshotVersion, index, term uint64,
33 | configuration Configuration, configurationIndex uint64, trans Transport) (SnapshotSink, error) {
34 | return &DiscardSnapshotSink{}, nil
35 | }
36 |
37 | // List returns successfully with a nil for []*SnapshotMeta.
38 | func (d *DiscardSnapshotStore) List() ([]*SnapshotMeta, error) {
39 | return nil, nil
40 | }
41 |
42 | // Open returns an error since the DiscardSnapshotStore does not
43 | // support opening snapshots.
44 | func (d *DiscardSnapshotStore) Open(id string) (*SnapshotMeta, io.ReadCloser, error) {
45 | return nil, nil, fmt.Errorf("open is not supported")
46 | }
47 |
48 | // Write returns successfully with the length of the input byte slice
49 | // to satisfy the WriteCloser interface
50 | func (d *DiscardSnapshotSink) Write(b []byte) (int, error) {
51 | return len(b), nil
52 | }
53 |
54 | // Close returns a nil error
55 | func (d *DiscardSnapshotSink) Close() error {
56 | return nil
57 | }
58 |
59 | // ID returns "discard" for DiscardSnapshotSink
60 | func (d *DiscardSnapshotSink) ID() string {
61 | return "discard"
62 | }
63 |
64 | // Cancel returns successfully with a nil error
65 | func (d *DiscardSnapshotSink) Cancel() error {
66 | return nil
67 | }
68 | --------------------------------------------------------------------------------
/discard_snapshot_test.go:
--------------------------------------------------------------------------------
1 | // Copyright (c) HashiCorp, Inc.
2 | // SPDX-License-Identifier: MPL-2.0
3 |
4 | package raft
5 |
6 | import "testing"
7 |
8 | func TestDiscardSnapshotStoreImpl(t *testing.T) {
9 | var impl interface{} = &DiscardSnapshotStore{}
10 | if _, ok := impl.(SnapshotStore); !ok {
11 | t.Fatalf("DiscardSnapshotStore not a SnapshotStore")
12 | }
13 | }
14 |
15 | func TestDiscardSnapshotSinkImpl(t *testing.T) {
16 | var impl interface{} = &DiscardSnapshotSink{}
17 | if _, ok := impl.(SnapshotSink); !ok {
18 | t.Fatalf("DiscardSnapshotSink not a SnapshotSink")
19 | }
20 | }
21 | --------------------------------------------------------------------------------
/docs/README.md:
--------------------------------------------------------------------------------
1 | # Raft Developer Documentation
2 |
3 | This documentation provides a high level introduction to the `hashicorp/raft`
4 | implementation. The intended audience is anyone interested in understanding
5 | or contributing to the code.
6 |
7 | ## Contents
8 |
9 | 1. [Terminology](#terminology)
10 | 2. [Operations](#operations)
11 | 1. [Apply](./apply.md)
12 | 3. [Threads](#threads)
13 |
14 |
15 | ## Terminology
16 |
17 | This documentation uses the following terms as defined.
18 | 19 | * **Cluster** - the set of peers in the raft configuration 20 | * **Peer** - a node that participates in the consensus protocol using `hashicorp/raft`. A 21 | peer may be in one of the following states: **follower**, **candidate**, or **leader**. 22 | * **Log** - the full set of log entries. 23 | * **Log Entry** - an entry in the log. Each entry has an index that is used to order it 24 | relative to other log entries. 25 | * **Committed** - A log entry is considered committed if it is safe for that entry to be 26 | applied to state machines. A log entry is committed once the leader that created the 27 | entry has replicated it on a majority of the peers. A peer has successfully 28 | replicated the entry once it is persisted. 29 | * **Applied** - log entry applied to the state machine (FSM) 30 | * **Term** - raft divides time into terms of arbitrary length. Terms are numbered with 31 | consecutive integers. Each term begins with an election, in which one or more candidates 32 | attempt to become leader. If a candidate wins the election, then it serves as leader for 33 | the rest of the term. If the election ends with a split vote, the term will end with no 34 | leader. 35 | * **FSM** - finite state machine, stores the cluster state 36 | * **Client** - the application that uses the `hashicorp/raft` library 37 | 38 | ## Operations 39 | 40 | ### Leader Write 41 | 42 | Most write operations must be performed on the leader. 43 | 44 | * RequestConfigChange - update the raft peer list configuration 45 | * Apply - apply a log entry to the log on a majority of peers, and the FSM. See [raft apply](apply.md) for more details. 46 | * Barrier - a special Apply that does not modify the FSM, used to wait for previous logs to be applied 47 | * LeadershipTransfer - stop accepting client requests, and tell a different peer to start a leadership election 48 | * Restore (Snapshot) - overwrite the cluster state with the contents of the snapshot (excluding cluster configuration) 49 | * VerifyLeader - send a heartbeat to all voters to confirm the peer is still the leader 50 | 51 | ### Follower Write 52 | 53 | * BootstrapCluster - store the cluster configuration in the local log store 54 | 55 | 56 | ### Read 57 | 58 | Read operations can be performed on a peer in any state. 59 | 60 | * AppliedIndex - get the index of the last log entry applied to the FSM 61 | * GetConfiguration - return the latest cluster configuration 62 | * LastContact - get the last time this peer made contact with the leader 63 | * LastIndex - get the index of the latest stored log entry 64 | * Leader - get the address of the peer that is currently the leader 65 | * Snapshot - snapshot the current state of the FSM into a file 66 | * State - return the state of the peer 67 | * Stats - return some stats about the peer and the cluster 68 | 69 | ## Threads 70 | 71 | Raft uses the following threads to handle operations. The name of the thread is in bold, 72 | and a short description of the operation handled by the thread follows. The main thread is 73 | responsible for handling many operations. 
74 | 75 | * **run** (main thread) - different behaviour based on peer state 76 | * follower 77 | * processRPC (from rpcCh) 78 | * AppendEntries 79 | * RequestVote 80 | * InstallSnapshot 81 | * TimeoutNow 82 | * liveBootstrap (from bootstrapCh) 83 | * periodic heartbeatTimer (HeartbeatTimeout) 84 | * candidate - starts an election for itself when called 85 | * processRPC (from rpcCh) - same as follower 86 | * acceptVote (from askPeerForVote) 87 | * leader - first starts replication to all peers, and applies a Noop log to ensure the new leader has committed up to the commit index 88 | * processRPC (from rpcCh) - same as follower, however we don’t actually expect to receive any RPCs other than a RequestVote 89 | * leadershipTransfer (from leadershipTransferCh) - 90 | * commit (from commitCh) - 91 | * verifyLeader (from verifyCh) - 92 | * user restore snapshot (from userRestoreCh) - 93 | * changeConfig (from configurationChangeCh) - 94 | * dispatchLogs (from applyCh) - handle client Raft.Apply requests by persisting logs to disk, and notifying replication goroutines to replicate the new logs 95 | * checkLease (periodically LeaseTimeout) - 96 | * **runFSM** - has exclusive access to the FSM, all reads and writes must send a message to this thread. Commands: 97 | * apply logs to the FSM, from the fsmMutateCh, from processLogs, from leaderLoop (leader) or appendEntries RPC (follower/candidate) 98 | * restore a snapshot to the FSM, from the fsmMutateCh, from restoreUserSnapshot (leader) or installSnapshot RPC (follower/candidate) 99 | * capture snapshot, from fsmSnapshotCh, from takeSnapshot (runSnapshot thread) 100 | * **runSnapshot** - handles the slower part of taking a snapshot. From a pointer captured by the FSM.Snapshot operation, this thread persists the snapshot by calling FSMSnapshot.Persist. Also calls compactLogs to delete old logs. 101 | * periodically (SnapshotInterval) takeSnapshot for log compaction 102 | * user snapshot, from userSnapshotCh, takeSnapshot to return to the user 103 | * **askPeerForVote (candidate only)** - short lived goroutine that synchronously sends a RequestVote RPC to all voting peers, and waits for the response. One goroutine per voting peer. 104 | * **replicate (leader only)** - long running goroutine that synchronously sends log entry AppendEntry RPCs to all peers. Also starts the heartbeat thread, and possibly the pipelineDecode thread. Runs sendLatestSnapshot when AppendEntry fails. 105 | * **heartbeat (leader only)** - long running goroutine that synchronously sends heartbeat AppendEntry RPCs to all peers. 106 | * **pipelineDecode (leader only)** 107 | -------------------------------------------------------------------------------- /docs/apply.md: -------------------------------------------------------------------------------- 1 | # Raft Apply 2 | 3 | Apply is the primary operation provided by raft. A client calls `raft.Apply` to apply 4 | a command to the FSM. A command will first be committed, i.e., durably stored on a 5 | quorum of raft nodes. Then, the committed command is applied to fsm. 6 | 7 | This sequence diagram shows the steps involved in a `raft.Apply` operation. Each box 8 | across the top is a separate thread. The name in the box identifies the state of the peer 9 | (leader or follower) and the thread (`:`). When there are 10 | multiple copies of the thread, it is indicated with `(each peer)`. 
11 | 12 | ```mermaid 13 | sequenceDiagram 14 | autonumber 15 | 16 | participant client 17 | participant leadermain as leader:main 18 | participant leaderfsm as leader:fsm 19 | participant leaderreplicate as leader:replicate (each peer) 20 | participant followermain as follower:main (each peer) 21 | participant followerfsm as follower:fsm (each peer) 22 | 23 | client-)leadermain: applyCh to dispatchLogs 24 | leadermain->>leadermain: store logs to disk 25 | 26 | leadermain-)leaderreplicate: triggerCh 27 | leaderreplicate-->>followermain: Transport.AppendEntries RPC 28 | 29 | followermain->>followermain: store logs to disk 30 | 31 | opt leader commit index is ahead of peer commit index 32 | followermain-)followerfsm: fsmMutateCh
apply committed logs
33 |     followerfsm->>followerfsm: fsm.Apply
34 |   end
35 | 
36 |   followermain-->>leaderreplicate: respond success=true
37 |   leaderreplicate->>leaderreplicate: update commitment
38 | 
39 |   opt quorum commit index has increased
40 |     leaderreplicate-)leadermain: commitCh
41 |     leadermain-)leaderfsm: fsmMutateCh
42 |     leaderfsm->>leaderfsm: fsm.Apply
43 |     leaderfsm-)client: future.respond
44 |   end
45 | 
46 | ```
47 | 
48 | Following is a description of each step shown in the diagram above.
49 | 
50 | 1. The raft node handles the `raft.Apply` call by creating a new log entry and sending the entry
51 | to the `applyCh` channel.
52 | 
53 | 2. If the node is not a leader, the method will return the error `ErrNotLeader`. Otherwise,
54 | the main loop of the leader node calls `raft.dispatchLogs` to write the log entry locally.
55 | 
56 | 3. `raft.dispatchLogs` also sends a notification to the `f.triggerCh` of each follower (`map[ServerID]*followerReplication`) to start replicating log entries to the followers.
57 | 
58 | 4. For each follower, the leader has started a long-running routine (`replicate`) to
59 | replicate log entries. On receiving a notification on the `triggerCh`, the `replicate`
60 | routine makes the `Transport.AppendEntries` RPC call to do the replication. The log entries
61 | to be replicated are from the follower's nextIndex to min(nextIndex + maxAppendEntries,
62 | leader's lastIndex). Another parameter to AppendEntries is the LeaderCommitIndex. Following
63 | are some examples:
64 | 
65 | ```
66 | AppendEntries(Log: 1..5, LeaderCommitIndex: 0) // Replicating log entries 1..5,
67 |                                                // the leader hasn't committed any log entry;
68 | AppendEntries(Log: 6..8, LeaderCommitIndex: 4) // Replicating log entries 6..8,
69 |                                                // log 0..4 are committed after the leader receives
70 |                                                // a quorum of responses
71 | AppendEntries(Log: 9, LeaderCommitIndex: 8)    // Replicating log entry 9,
72 |                                                // log 5..8 are committed.
73 | AppendEntries(Log: , LeaderCommitIndex: 9)     // no new log, bumping the commit index
74 |                                                // to let the follower stay up to date with the
75 |                                                // latest committed entries
76 | ```
77 | 
78 | 5. The follower that receives the `appendEntries` RPC invokes `raft.appendEntries` to handle
79 | the request. It appends any new entries to the local log store.
80 | 
81 | 6. In the same method on the follower as step 5, if the LeaderCommitIndex > this follower's
82 | commitIndex, the follower updates its commitIndex to min(LeaderCommitIndex, index of its last
83 | log entry). In the first `AppendEntries` call of the above example, the follower won't
84 | update its commitIndex, because LeaderCommitIndex is 0. The last RPC call doesn't contain
85 | any new log, but the follower will still update its commitIndex to 9.
86 | 
87 | Further, the follower starts `processLogs` to send all the committed entries that haven't been
88 | applied to the FSM (`fsmMutateCh <- batch`). Otherwise (i.e., `commitIndex <= lastApplied`),
89 | the appendEntries RPC call returns success.
90 | 
91 | Therefore, a very small window of time can exist in which all followers have
92 | committed the log to disk and the write has been applied to the FSM of the leader, but the
93 | followers have not yet applied the log to their own FSMs.
94 | 
95 | 7. The peer applies the committed entries to the FSM.
96 | 
97 | 8. If all went well, the follower responds success (`resp.Success = true`) to the
98 | `appendEntries` RPC call.
99 | 
100 | 9. 
On receiving the successful response from `Transport.AppendEntries`, the leader needs to 101 | update the fsm based on the replicated log entries. Specifically, the leader finds the 102 | highest log entry index that has been replicated to a quorum of the servers ( 103 | `if quorumMatchIndex > c.commitIndex`), update `commitIndex` to that index, and 104 | notify through the `commitCh` channel. 105 | 106 | 10. The leader receives the notification on the `r.leaderState.commitCh` channel and starts 107 | grouping the entries that can be applied to the fsm. 108 | 109 | 11. `processLogs` applies all the committed entries that haven't been applied by batching the log entries and forwarding them through the `fsmMutateCh` channel to fsm. 110 | 111 | 12. The actual place applying the committed log entries is in the main loop of `runFSM()`. 112 | 113 | 13. After the log entries that contains the client req are applied to the fsm, the fsm 114 | module will set the responses to the client request (`req.future.respond(nil)`). From the 115 | client's point of view, the future returned by `raft.Apply` should now be unblocked and 116 | calls to `Error()` or `Response()` should return the data at this point. 117 | -------------------------------------------------------------------------------- /file_snapshot_test.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) HashiCorp, Inc. 2 | // SPDX-License-Identifier: MPL-2.0 3 | 4 | package raft 5 | 6 | import ( 7 | "bytes" 8 | "io" 9 | "os" 10 | "reflect" 11 | "runtime" 12 | "testing" 13 | ) 14 | 15 | func TestFileSnapshotStoreImpl(t *testing.T) { 16 | var impl interface{} = &FileSnapshotStore{} 17 | if _, ok := impl.(SnapshotStore); !ok { 18 | t.Fatalf("FileSnapshotStore not a SnapshotStore") 19 | } 20 | } 21 | 22 | func TestFileSnapshotSinkImpl(t *testing.T) { 23 | var impl interface{} = &FileSnapshotSink{} 24 | if _, ok := impl.(SnapshotSink); !ok { 25 | t.Fatalf("FileSnapshotSink not a SnapshotSink") 26 | } 27 | } 28 | 29 | func TestFileSS_CreateSnapshotMissingParentDir(t *testing.T) { 30 | parent, err := os.MkdirTemp("", "raft") 31 | if err != nil { 32 | t.Fatalf("err: %v ", err) 33 | } 34 | defer os.RemoveAll(parent) 35 | 36 | dir, err := os.MkdirTemp(parent, "raft") 37 | if err != nil { 38 | t.Fatalf("err: %v ", err) 39 | } 40 | 41 | snap, err := NewFileSnapshotStoreWithLogger(dir, 3, newTestLogger(t)) 42 | if err != nil { 43 | t.Fatalf("err: %v", err) 44 | } 45 | 46 | os.RemoveAll(parent) 47 | _, trans := NewInmemTransport(NewInmemAddr()) 48 | _, err = snap.Create(SnapshotVersionMax, 10, 3, Configuration{}, 0, trans) 49 | if err != nil { 50 | t.Fatalf("should not fail when using non existing parent") 51 | } 52 | } 53 | 54 | func TestFileSS_CreateSnapshot(t *testing.T) { 55 | // Create a test dir 56 | dir, err := os.MkdirTemp("", "raft") 57 | if err != nil { 58 | t.Fatalf("err: %v ", err) 59 | } 60 | defer os.RemoveAll(dir) 61 | 62 | snap, err := NewFileSnapshotStoreWithLogger(dir, 3, newTestLogger(t)) 63 | if err != nil { 64 | t.Fatalf("err: %v", err) 65 | } 66 | 67 | // Check no snapshots 68 | snaps, err := snap.List() 69 | if err != nil { 70 | t.Fatalf("err: %v", err) 71 | } 72 | if len(snaps) != 0 { 73 | t.Fatalf("did not expect any snapshots: %v", snaps) 74 | } 75 | 76 | // Create a new sink 77 | var configuration Configuration 78 | configuration.Servers = append(configuration.Servers, Server{ 79 | Suffrage: Voter, 80 | ID: ServerID("my id"), 81 | Address: ServerAddress("over here"), 82 | }) 83 | 
_, trans := NewInmemTransport(NewInmemAddr()) 84 | sink, err := snap.Create(SnapshotVersionMax, 10, 3, configuration, 2, trans) 85 | if err != nil { 86 | t.Fatalf("err: %v", err) 87 | } 88 | 89 | // The sink is not done, should not be in a list! 90 | snaps, err = snap.List() 91 | if err != nil { 92 | t.Fatalf("err: %v", err) 93 | } 94 | if len(snaps) != 0 { 95 | t.Fatalf("did not expect any snapshots: %v", snaps) 96 | } 97 | 98 | // Write to the sink 99 | _, err = sink.Write([]byte("first\n")) 100 | if err != nil { 101 | t.Fatalf("err: %v", err) 102 | } 103 | _, err = sink.Write([]byte("second\n")) 104 | if err != nil { 105 | t.Fatalf("err: %v", err) 106 | } 107 | 108 | // Done! 109 | err = sink.Close() 110 | if err != nil { 111 | t.Fatalf("err: %v", err) 112 | } 113 | 114 | // Should have a snapshot! 115 | snaps, err = snap.List() 116 | if err != nil { 117 | t.Fatalf("err: %v", err) 118 | } 119 | if len(snaps) != 1 { 120 | t.Fatalf("expect a snapshots: %v", snaps) 121 | } 122 | 123 | // Check the latest 124 | latest := snaps[0] 125 | if latest.Index != 10 { 126 | t.Fatalf("bad snapshot: %v", *latest) 127 | } 128 | if latest.Term != 3 { 129 | t.Fatalf("bad snapshot: %v", *latest) 130 | } 131 | if !reflect.DeepEqual(latest.Configuration, configuration) { 132 | t.Fatalf("bad snapshot: %v", *latest) 133 | } 134 | if latest.ConfigurationIndex != 2 { 135 | t.Fatalf("bad snapshot: %v", *latest) 136 | } 137 | if latest.Size != 13 { 138 | t.Fatalf("bad snapshot: %v", *latest) 139 | } 140 | 141 | // Read the snapshot 142 | _, r, err := snap.Open(latest.ID) 143 | if err != nil { 144 | t.Fatalf("err: %v", err) 145 | } 146 | 147 | // Read out everything 148 | var buf bytes.Buffer 149 | if _, err := io.Copy(&buf, r); err != nil { 150 | t.Fatalf("err: %v", err) 151 | } 152 | if err := r.Close(); err != nil { 153 | t.Fatalf("err: %v", err) 154 | } 155 | 156 | // Ensure a match 157 | if bytes.Compare(buf.Bytes(), []byte("first\nsecond\n")) != 0 { 158 | t.Fatalf("content mismatch") 159 | } 160 | } 161 | 162 | func TestFileSS_CancelSnapshot(t *testing.T) { 163 | // Create a test dir 164 | dir, err := os.MkdirTemp("", "raft") 165 | if err != nil { 166 | t.Fatalf("err: %v ", err) 167 | } 168 | defer os.RemoveAll(dir) 169 | 170 | snap, err := NewFileSnapshotStoreWithLogger(dir, 3, newTestLogger(t)) 171 | if err != nil { 172 | t.Fatalf("err: %v", err) 173 | } 174 | 175 | // Create a new sink 176 | _, trans := NewInmemTransport(NewInmemAddr()) 177 | sink, err := snap.Create(SnapshotVersionMax, 10, 3, Configuration{}, 0, trans) 178 | if err != nil { 179 | t.Fatalf("err: %v", err) 180 | } 181 | 182 | // Cancel the snapshot! Should delete 183 | err = sink.Cancel() 184 | if err != nil { 185 | t.Fatalf("err: %v", err) 186 | } 187 | 188 | // The sink is canceled, should not be in a list! 
189 | snaps, err := snap.List() 190 | if err != nil { 191 | t.Fatalf("err: %v", err) 192 | } 193 | if len(snaps) != 0 { 194 | t.Fatalf("did not expect any snapshots: %v", snaps) 195 | } 196 | } 197 | 198 | func TestFileSS_Retention(t *testing.T) { 199 | var err error 200 | // Create a test dir 201 | var dir string 202 | dir, err = os.MkdirTemp("", "raft") 203 | if err != nil { 204 | t.Fatalf("err: %v ", err) 205 | } 206 | defer os.RemoveAll(dir) 207 | 208 | var snap *FileSnapshotStore 209 | snap, err = NewFileSnapshotStoreWithLogger(dir, 2, newTestLogger(t)) 210 | if err != nil { 211 | t.Fatalf("err: %v", err) 212 | } 213 | 214 | // Create a few snapshots 215 | _, trans := NewInmemTransport(NewInmemAddr()) 216 | for i := 10; i < 15; i++ { 217 | var sink SnapshotSink 218 | sink, err = snap.Create(SnapshotVersionMax, uint64(i), 3, Configuration{}, 0, trans) 219 | if err != nil { 220 | t.Fatalf("err: %v", err) 221 | } 222 | err = sink.Close() 223 | if err != nil { 224 | t.Fatalf("err: %v", err) 225 | } 226 | } 227 | 228 | // Should only have 2 listed! 229 | var snaps []*SnapshotMeta 230 | snaps, err = snap.List() 231 | if err != nil { 232 | t.Fatalf("err: %v", err) 233 | } 234 | if len(snaps) != 2 { 235 | t.Fatalf("expect 2 snapshots: %v", snaps) 236 | } 237 | 238 | // Check they are the latest 239 | if snaps[0].Index != 14 { 240 | t.Fatalf("bad snap: %#v", *snaps[0]) 241 | } 242 | if snaps[1].Index != 13 { 243 | t.Fatalf("bad snap: %#v", *snaps[1]) 244 | } 245 | } 246 | 247 | func TestFileSS_BadPerm(t *testing.T) { 248 | var err error 249 | if runtime.GOOS == "windows" { 250 | t.Skip("skipping file permission test on windows") 251 | } 252 | 253 | // Create a temp dir 254 | var dir1 string 255 | dir1, err = os.MkdirTemp("", "raft") 256 | if err != nil { 257 | t.Fatalf("err: %s", err) 258 | } 259 | defer os.RemoveAll(dir1) 260 | 261 | // Create a sub dir and remove all permissions 262 | var dir2 string 263 | dir2, err = os.MkdirTemp(dir1, "badperm") 264 | if err != nil { 265 | t.Fatalf("err: %s", err) 266 | } 267 | if err = os.Chmod(dir2, 0o00); err != nil { 268 | t.Fatalf("err: %s", err) 269 | } 270 | defer os.Chmod(dir2, 777) // Set perms back for delete 271 | 272 | // Should fail 273 | if _, err = NewFileSnapshotStore(dir2, 3, nil); err == nil { 274 | t.Fatalf("should fail to use dir with bad perms") 275 | } 276 | } 277 | 278 | func TestFileSS_MissingParentDir(t *testing.T) { 279 | parent, err := os.MkdirTemp("", "raft") 280 | if err != nil { 281 | t.Fatalf("err: %v ", err) 282 | } 283 | defer os.RemoveAll(parent) 284 | 285 | dir, err := os.MkdirTemp(parent, "raft") 286 | if err != nil { 287 | t.Fatalf("err: %v ", err) 288 | } 289 | 290 | os.RemoveAll(parent) 291 | _, err = NewFileSnapshotStore(dir, 3, nil) 292 | if err != nil { 293 | t.Fatalf("should not fail when using non existing parent") 294 | } 295 | } 296 | 297 | func TestFileSS_Ordering(t *testing.T) { 298 | // Create a test dir 299 | dir, err := os.MkdirTemp("", "raft") 300 | if err != nil { 301 | t.Fatalf("err: %v ", err) 302 | } 303 | defer os.RemoveAll(dir) 304 | 305 | snap, err := NewFileSnapshotStoreWithLogger(dir, 3, newTestLogger(t)) 306 | if err != nil { 307 | t.Fatalf("err: %v", err) 308 | } 309 | 310 | // Create a new sink 311 | _, trans := NewInmemTransport(NewInmemAddr()) 312 | sink, err := snap.Create(SnapshotVersionMax, 130350, 5, Configuration{}, 0, trans) 313 | if err != nil { 314 | t.Fatalf("err: %v", err) 315 | } 316 | err = sink.Close() 317 | if err != nil { 318 | t.Fatalf("err: %v", err) 319 | } 320 | 321 | 
sink, err = snap.Create(SnapshotVersionMax, 204917, 36, Configuration{}, 0, trans) 322 | if err != nil { 323 | t.Fatalf("err: %v", err) 324 | } 325 | err = sink.Close() 326 | if err != nil { 327 | t.Fatalf("err: %v", err) 328 | } 329 | 330 | // Should only have 2 listed! 331 | snaps, err := snap.List() 332 | if err != nil { 333 | t.Fatalf("err: %v", err) 334 | } 335 | if len(snaps) != 2 { 336 | t.Fatalf("expect 2 snapshots: %v", snaps) 337 | } 338 | 339 | // Check they are ordered 340 | if snaps[0].Term != 36 { 341 | t.Fatalf("bad snap: %#v", *snaps[0]) 342 | } 343 | if snaps[1].Term != 5 { 344 | t.Fatalf("bad snap: %#v", *snaps[1]) 345 | } 346 | } 347 | -------------------------------------------------------------------------------- /future.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) HashiCorp, Inc. 2 | // SPDX-License-Identifier: MPL-2.0 3 | 4 | package raft 5 | 6 | import ( 7 | "fmt" 8 | "io" 9 | "sync" 10 | "time" 11 | ) 12 | 13 | // Future is used to represent an action that may occur in the future. 14 | type Future interface { 15 | // Error blocks until the future arrives and then returns the error status 16 | // of the future. This may be called any number of times - all calls will 17 | // return the same value, however is not OK to call this method twice 18 | // concurrently on the same Future instance. 19 | // Error will only return generic errors related to raft, such 20 | // as ErrLeadershipLost, or ErrRaftShutdown. Some operations, such as 21 | // ApplyLog, may also return errors from other methods. 22 | Error() error 23 | } 24 | 25 | // IndexFuture is used for future actions that can result in a raft log entry 26 | // being created. 27 | type IndexFuture interface { 28 | Future 29 | 30 | // Index holds the index of the newly applied log entry. 31 | // This must not be called until after the Error method has returned. 32 | Index() uint64 33 | } 34 | 35 | // ApplyFuture is used for Apply and can return the FSM response. 36 | type ApplyFuture interface { 37 | IndexFuture 38 | 39 | // Response returns the FSM response as returned by the FSM.Apply method. This 40 | // must not be called until after the Error method has returned. 41 | // Note that if FSM.Apply returns an error, it will be returned by Response, 42 | // and not by the Error method, so it is always important to check Response 43 | // for errors from the FSM. 44 | Response() interface{} 45 | } 46 | 47 | // ConfigurationFuture is used for GetConfiguration and can return the 48 | // latest configuration in use by Raft. 49 | type ConfigurationFuture interface { 50 | IndexFuture 51 | 52 | // Configuration contains the latest configuration. This must 53 | // not be called until after the Error method has returned. 54 | Configuration() Configuration 55 | } 56 | 57 | // SnapshotFuture is used for waiting on a user-triggered snapshot to complete. 58 | type SnapshotFuture interface { 59 | Future 60 | 61 | // Open is a function you can call to access the underlying snapshot and 62 | // its metadata. This must not be called until after the Error method 63 | // has returned. 64 | Open() (*SnapshotMeta, io.ReadCloser, error) 65 | } 66 | 67 | // LeadershipTransferFuture is used for waiting on a user-triggered leadership 68 | // transfer to complete. 69 | type LeadershipTransferFuture interface { 70 | Future 71 | } 72 | 73 | // errorFuture is used to return a static error. 
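// It implements Error, Response, and Index, so it can stand in for Future,
// IndexFuture, or ApplyFuture when an operation fails immediately.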
74 | type errorFuture struct { 75 | err error 76 | } 77 | 78 | func (e errorFuture) Error() error { 79 | return e.err 80 | } 81 | 82 | func (e errorFuture) Response() interface{} { 83 | return nil 84 | } 85 | 86 | func (e errorFuture) Index() uint64 { 87 | return 0 88 | } 89 | 90 | // deferError can be embedded to allow a future 91 | // to provide an error in the future. 92 | type deferError struct { 93 | err error 94 | errCh chan error 95 | responded bool 96 | ShutdownCh chan struct{} 97 | } 98 | 99 | func (d *deferError) init() { 100 | d.errCh = make(chan error, 1) 101 | } 102 | 103 | func (d *deferError) Error() error { 104 | if d.err != nil { 105 | // Note that when we've received a nil error, this 106 | // won't trigger, but the channel is closed after 107 | // send so we'll still return nil below. 108 | return d.err 109 | } 110 | if d.errCh == nil { 111 | panic("waiting for response on nil channel") 112 | } 113 | select { 114 | case d.err = <-d.errCh: 115 | case <-d.ShutdownCh: 116 | d.err = ErrRaftShutdown 117 | } 118 | return d.err 119 | } 120 | 121 | func (d *deferError) respond(err error) { 122 | if d.errCh == nil { 123 | return 124 | } 125 | if d.responded { 126 | return 127 | } 128 | d.errCh <- err 129 | close(d.errCh) 130 | d.responded = true 131 | } 132 | 133 | // There are several types of requests that cause a configuration entry to 134 | // be appended to the log. These are encoded here for leaderLoop() to process. 135 | // This is internal to a single server. 136 | type configurationChangeFuture struct { 137 | logFuture 138 | req configurationChangeRequest 139 | } 140 | 141 | // bootstrapFuture is used to attempt a live bootstrap of the cluster. See the 142 | // Raft object's BootstrapCluster member function for more details. 143 | type bootstrapFuture struct { 144 | deferError 145 | 146 | // configuration is the proposed bootstrap configuration to apply. 147 | configuration Configuration 148 | } 149 | 150 | // logFuture is used to apply a log entry and waits until 151 | // the log is considered committed. 152 | type logFuture struct { 153 | deferError 154 | log Log 155 | response interface{} 156 | dispatch time.Time 157 | } 158 | 159 | func (l *logFuture) Response() interface{} { 160 | return l.response 161 | } 162 | 163 | func (l *logFuture) Index() uint64 { 164 | return l.log.Index 165 | } 166 | 167 | type shutdownFuture struct { 168 | raft *Raft 169 | } 170 | 171 | func (s *shutdownFuture) Error() error { 172 | if s.raft == nil { 173 | return nil 174 | } 175 | s.raft.waitShutdown() 176 | if closeable, ok := s.raft.trans.(WithClose); ok { 177 | closeable.Close() 178 | } 179 | return nil 180 | } 181 | 182 | // userSnapshotFuture is used for waiting on a user-triggered snapshot to 183 | // complete. 184 | type userSnapshotFuture struct { 185 | deferError 186 | 187 | // opener is a function used to open the snapshot. This is filled in 188 | // once the future returns with no error. 189 | opener func() (*SnapshotMeta, io.ReadCloser, error) 190 | } 191 | 192 | // Open is a function you can call to access the underlying snapshot and its 193 | // metadata. 194 | func (u *userSnapshotFuture) Open() (*SnapshotMeta, io.ReadCloser, error) { 195 | if u.opener == nil { 196 | return nil, nil, fmt.Errorf("no snapshot available") 197 | } 198 | // Invalidate the opener so it can't get called multiple times, 199 | // which isn't generally safe. 
200 | defer func() { 201 | u.opener = nil 202 | }() 203 | return u.opener() 204 | } 205 | 206 | // userRestoreFuture is used for waiting on a user-triggered restore of an 207 | // external snapshot to complete. 208 | type userRestoreFuture struct { 209 | deferError 210 | 211 | // meta is the metadata that belongs with the snapshot. 212 | meta *SnapshotMeta 213 | 214 | // reader is the interface to read the snapshot contents from. 215 | reader io.Reader 216 | } 217 | 218 | // reqSnapshotFuture is used for requesting a snapshot start. 219 | // It is only used internally. 220 | type reqSnapshotFuture struct { 221 | deferError 222 | 223 | // snapshot details provided by the FSM runner before responding 224 | index uint64 225 | term uint64 226 | snapshot FSMSnapshot 227 | } 228 | 229 | // restoreFuture is used for requesting an FSM to perform a 230 | // snapshot restore. Used internally only. 231 | type restoreFuture struct { 232 | deferError 233 | ID string 234 | } 235 | 236 | // verifyFuture is used to verify the current node is still 237 | // the leader. This is to prevent a stale read. 238 | type verifyFuture struct { 239 | deferError 240 | notifyCh chan *verifyFuture 241 | quorumSize int 242 | votes int 243 | voteLock sync.Mutex 244 | } 245 | 246 | // leadershipTransferFuture is used to track the progress of a leadership 247 | // transfer internally. 248 | type leadershipTransferFuture struct { 249 | deferError 250 | 251 | ID *ServerID 252 | Address *ServerAddress 253 | } 254 | 255 | // configurationsFuture is used to retrieve the current configurations. This is 256 | // used to allow safe access to this information outside of the main thread. 257 | type configurationsFuture struct { 258 | deferError 259 | configurations configurations 260 | } 261 | 262 | // Configuration returns the latest configuration in use by Raft. 263 | func (c *configurationsFuture) Configuration() Configuration { 264 | return c.configurations.latest 265 | } 266 | 267 | // Index returns the index of the latest configuration in use by Raft. 268 | func (c *configurationsFuture) Index() uint64 { 269 | return c.configurations.latestIndex 270 | } 271 | 272 | // vote is used to respond to a verifyFuture. 273 | // This may block when responding on the notifyCh. 274 | func (v *verifyFuture) vote(leader bool) { 275 | v.voteLock.Lock() 276 | defer v.voteLock.Unlock() 277 | 278 | // Guard against having notified already 279 | if v.notifyCh == nil { 280 | return 281 | } 282 | 283 | if leader { 284 | v.votes++ 285 | if v.votes >= v.quorumSize { 286 | v.notifyCh <- v 287 | v.notifyCh = nil 288 | } 289 | } else { 290 | v.notifyCh <- v 291 | v.notifyCh = nil 292 | } 293 | } 294 | 295 | // appendFuture is used for waiting on a pipelined append 296 | // entries RPC. 297 | type appendFuture struct { 298 | deferError 299 | start time.Time 300 | args *AppendEntriesRequest 301 | resp *AppendEntriesResponse 302 | } 303 | 304 | func (a *appendFuture) Start() time.Time { 305 | return a.start 306 | } 307 | 308 | func (a *appendFuture) Request() *AppendEntriesRequest { 309 | return a.args 310 | } 311 | 312 | func (a *appendFuture) Response() *AppendEntriesResponse { 313 | return a.resp 314 | } 315 | -------------------------------------------------------------------------------- /future_test.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) HashiCorp, Inc. 
2 | // SPDX-License-Identifier: MPL-2.0 3 | 4 | package raft 5 | 6 | import ( 7 | "errors" 8 | "testing" 9 | ) 10 | 11 | func TestDeferFutureSuccess(t *testing.T) { 12 | var f deferError 13 | f.init() 14 | f.respond(nil) 15 | if err := f.Error(); err != nil { 16 | t.Fatalf("unexpected error result; got %#v want nil", err) 17 | } 18 | if err := f.Error(); err != nil { 19 | t.Fatalf("unexpected error result; got %#v want nil", err) 20 | } 21 | } 22 | 23 | func TestDeferFutureError(t *testing.T) { 24 | want := errors.New("x") 25 | var f deferError 26 | f.init() 27 | f.respond(want) 28 | if got := f.Error(); got != want { 29 | t.Fatalf("unexpected error result; got %#v want %#v", got, want) 30 | } 31 | if got := f.Error(); got != want { 32 | t.Fatalf("unexpected error result; got %#v want %#v", got, want) 33 | } 34 | } 35 | 36 | func TestDeferFutureConcurrent(t *testing.T) { 37 | // Food for the race detector. 38 | want := errors.New("x") 39 | var f deferError 40 | f.init() 41 | go f.respond(want) 42 | if got := f.Error(); got != want { 43 | t.Errorf("unexpected error result; got %#v want %#v", got, want) 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /fuzzy/apply_src.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) HashiCorp, Inc. 2 | // SPDX-License-Identifier: MPL-2.0 3 | 4 | package fuzzy 5 | 6 | import ( 7 | "hash/fnv" 8 | "math/rand" 9 | "testing" 10 | "time" 11 | ) 12 | 13 | type applySource struct { 14 | rnd *rand.Rand 15 | seed int64 16 | } 17 | 18 | // newApplySource will create a new source, any source created with the same seed will generate the same sequence of data. 19 | func newApplySource(seed string) *applySource { 20 | h := fnv.New32() 21 | h.Write([]byte(seed)) 22 | s := &applySource{seed: int64(h.Sum32())} 23 | s.reset() 24 | return s 25 | } 26 | 27 | // reset this source back to its initial state, it'll generate the same sequence of data it initially did 28 | func (a *applySource) reset() { 29 | a.rnd = rand.New(rand.NewSource(a.seed)) 30 | } 31 | 32 | func (a *applySource) nextEntry() []byte { 33 | const sz = 33 34 | r := make([]byte, sz) 35 | for i := 0; i < len(r); i++ { 36 | r[i] = byte(a.rnd.Int31n(256)) 37 | } 38 | return r 39 | } 40 | 41 | type clusterApplier struct { 42 | stopCh chan bool 43 | applied uint64 44 | src *applySource 45 | } 46 | 47 | // runs apply in chunks of n to the cluster, use the returned Applier to Stop() it 48 | func (a *applySource) apply(t *testing.T, c *cluster, n uint) *clusterApplier { 49 | ap := &clusterApplier{stopCh: make(chan bool), src: a} 50 | go ap.apply(t, c, n) 51 | return ap 52 | } 53 | 54 | func (ca *clusterApplier) apply(t *testing.T, c *cluster, n uint) { 55 | for true { 56 | select { 57 | case <-ca.stopCh: 58 | return 59 | default: 60 | ca.applied += c.ApplyN(t, 5*time.Second, ca.src, n) 61 | } 62 | } 63 | } 64 | 65 | func (ca *clusterApplier) stop() { 66 | ca.stopCh <- true 67 | close(ca.stopCh) 68 | } 69 | -------------------------------------------------------------------------------- /fuzzy/fsm.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) HashiCorp, Inc. 
2 | // SPDX-License-Identifier: MPL-2.0 3 | 4 | package fuzzy 5 | 6 | import ( 7 | "bufio" 8 | "encoding/binary" 9 | "fmt" 10 | "hash/adler32" 11 | "io" 12 | "os" 13 | 14 | "github.com/hashicorp/raft" 15 | ) 16 | 17 | type logHash struct { 18 | lastHash []byte 19 | } 20 | 21 | func (l *logHash) Add(d []byte) { 22 | hasher := adler32.New() 23 | hasher.Write(l.lastHash) 24 | hasher.Write(d) 25 | l.lastHash = hasher.Sum(nil) 26 | } 27 | 28 | type applyItem struct { 29 | index uint64 30 | term uint64 31 | data []byte 32 | } 33 | 34 | func (a *applyItem) set(l *raft.Log) { 35 | a.index = l.Index 36 | a.term = l.Term 37 | a.data = make([]byte, len(l.Data)) 38 | copy(a.data, l.Data) 39 | } 40 | 41 | type fuzzyFSM struct { 42 | logHash 43 | lastTerm uint64 44 | lastIndex uint64 45 | applied []applyItem 46 | } 47 | 48 | func (f *fuzzyFSM) Apply(l *raft.Log) interface{} { 49 | if l.Index <= f.lastIndex { 50 | panic(fmt.Errorf("fsm.Apply received log entry with invalid Index %v (lastIndex we saw was %d)", l, f.lastIndex)) 51 | } 52 | if l.Term < f.lastTerm { 53 | panic(fmt.Errorf("fsm.Apply received log entry with invalid Term %v (lastTerm we saw was %d)", l, f.lastTerm)) 54 | } 55 | f.lastIndex = l.Index 56 | f.lastTerm = l.Term 57 | f.Add(l.Data) 58 | f.applied = append(f.applied, applyItem{}) 59 | f.applied[len(f.applied)-1].set(l) 60 | return nil 61 | } 62 | 63 | func (f *fuzzyFSM) WriteTo(fn string) error { 64 | fw, err := os.Create(fn) 65 | if err != nil { 66 | return err 67 | } 68 | defer fw.Close() 69 | w := bufio.NewWriter(fw) 70 | defer w.Flush() 71 | for _, i := range f.applied { 72 | fmt.Fprintf(w, "%d.%8d: %X\n", i.term, i.index, i.data) 73 | } 74 | return nil 75 | } 76 | 77 | func (f *fuzzyFSM) Snapshot() (raft.FSMSnapshot, error) { 78 | s := *f 79 | return &s, nil 80 | } 81 | 82 | func (f *fuzzyFSM) Restore(r io.ReadCloser) error { 83 | err := binary.Read(r, binary.LittleEndian, &f.lastTerm) 84 | if err == nil { 85 | err = binary.Read(r, binary.LittleEndian, &f.lastIndex) 86 | } 87 | if err == nil { 88 | f.lastHash = make([]byte, adler32.Size) 89 | _, err = r.Read(f.lastHash) 90 | } 91 | return err 92 | } 93 | 94 | func (f *fuzzyFSM) Persist(sink raft.SnapshotSink) error { 95 | err := binary.Write(sink, binary.LittleEndian, f.lastTerm) 96 | if err == nil { 97 | err = binary.Write(sink, binary.LittleEndian, f.lastIndex) 98 | } 99 | if err == nil { 100 | _, err = sink.Write(f.lastHash) 101 | } 102 | return err 103 | } 104 | 105 | func (f *fuzzyFSM) Release() { 106 | } 107 | -------------------------------------------------------------------------------- /fuzzy/fsm_batch.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) HashiCorp, Inc. 2 | // SPDX-License-Identifier: MPL-2.0 3 | 4 | //go:build batchtest 5 | // +build batchtest 6 | 7 | package fuzzy 8 | 9 | import "github.com/hashicorp/raft" 10 | 11 | // ApplyBatch enables fuzzyFSM to satisfy the BatchingFSM interface. This 12 | // function is gated by the batchtest build flag. 
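// Note that the individual Apply return values are discarded: the returned
// slice has the right length but every element is left nil.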
13 | func (f *fuzzyFSM) ApplyBatch(logs []*raft.Log) []interface{} { 14 | ret := make([]interface{}, len(logs)) 15 | 16 | for _, l := range logs { 17 | f.Apply(l) 18 | } 19 | 20 | return ret 21 | } 22 | -------------------------------------------------------------------------------- /fuzzy/go.mod: -------------------------------------------------------------------------------- 1 | module github.com/hashicorp/raft/fuzzy 2 | 3 | go 1.20 4 | 5 | require ( 6 | github.com/hashicorp/go-hclog v1.6.2 7 | github.com/hashicorp/go-msgpack/v2 v2.1.1 8 | github.com/hashicorp/raft v1.2.0 9 | github.com/hashicorp/raft-boltdb v0.0.0-20171010151810-6e5ba93211ea 10 | ) 11 | 12 | require ( 13 | github.com/armon/go-metrics v0.4.1 // indirect 14 | github.com/boltdb/bolt v1.3.1 // indirect 15 | github.com/fatih/color v1.13.0 // indirect 16 | github.com/hashicorp/go-immutable-radix v1.0.0 // indirect 17 | github.com/hashicorp/go-msgpack v0.5.5 // indirect 18 | github.com/hashicorp/golang-lru v0.5.0 // indirect 19 | github.com/mattn/go-colorable v0.1.12 // indirect 20 | github.com/mattn/go-isatty v0.0.14 // indirect 21 | golang.org/x/sys v0.13.0 // indirect 22 | ) 23 | 24 | replace github.com/hashicorp/raft => ../ 25 | -------------------------------------------------------------------------------- /fuzzy/leadershiptransfer_test.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) HashiCorp, Inc. 2 | // SPDX-License-Identifier: MPL-2.0 3 | 4 | package fuzzy 5 | 6 | import ( 7 | "math/rand" 8 | "testing" 9 | "time" 10 | 11 | "github.com/hashicorp/raft" 12 | ) 13 | 14 | // 5 node cluster 15 | func TestRaft_FuzzyLeadershipTransfer(t *testing.T) { 16 | cluster := newRaftCluster(t, testLogWriter, "lt", 5, nil) 17 | r := rand.New(rand.NewSource(time.Now().UnixNano())) 18 | 19 | s := newApplySource("LeadershipTransfer") 20 | data := cluster.generateNApplies(s, uint(r.Intn(10000))) 21 | futures := cluster.sendNApplies(time.Minute, data) 22 | cluster.leadershipTransfer(time.Minute) 23 | 24 | data = cluster.generateNApplies(s, uint(r.Intn(10000))) 25 | futures = append(futures, cluster.sendNApplies(time.Minute, data)...) 26 | cluster.leadershipTransfer(time.Minute) 27 | 28 | data = cluster.generateNApplies(s, uint(r.Intn(10000))) 29 | futures = append(futures, cluster.sendNApplies(time.Minute, data)...) 30 | cluster.leadershipTransfer(time.Minute) 31 | 32 | data = cluster.generateNApplies(s, uint(r.Intn(10000))) 33 | futures = append(futures, cluster.sendNApplies(time.Minute, data)...) 
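	// Note: unlike the earlier batches, no leadership transfer follows this final round of applies.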
34 | 35 | ac := cluster.checkApplyFutures(futures) 36 | 37 | cluster.Stop(t, time.Minute) 38 | cluster.VerifyLog(t, ac) 39 | cluster.VerifyFSM(t) 40 | } 41 | 42 | type LeadershipTransferMode int 43 | 44 | type LeadershipTransfer struct { 45 | verifier appendEntriesVerifier 46 | slowNodes map[string]bool 47 | delayMin time.Duration 48 | delayMax time.Duration 49 | mode LeadershipTransferMode 50 | } 51 | 52 | func (lt *LeadershipTransfer) Report(t *testing.T) { 53 | lt.verifier.Report(t) 54 | } 55 | 56 | func (lt *LeadershipTransfer) PreRPC(s, t string, r *raft.RPC) error { 57 | return nil 58 | } 59 | 60 | func (lt *LeadershipTransfer) nap() { 61 | d := lt.delayMin + time.Duration(rand.Int63n((lt.delayMax - lt.delayMin).Nanoseconds())) 62 | time.Sleep(d) 63 | } 64 | 65 | func (lt *LeadershipTransfer) PostRPC(src, target string, r *raft.RPC, res *raft.RPCResponse) error { 66 | return nil 67 | } 68 | 69 | func (lt *LeadershipTransfer) PreRequestVote(src, target string, v *raft.RequestVoteRequest) (*raft.RequestVoteResponse, error) { 70 | return nil, nil 71 | } 72 | 73 | func (lt *LeadershipTransfer) PreAppendEntries(src, target string, v *raft.AppendEntriesRequest) (*raft.AppendEntriesResponse, error) { 74 | lt.verifier.PreAppendEntries(src, target, v) 75 | return nil, nil 76 | } 77 | -------------------------------------------------------------------------------- /fuzzy/membership_test.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) HashiCorp, Inc. 2 | // SPDX-License-Identifier: MPL-2.0 3 | 4 | package fuzzy 5 | 6 | import ( 7 | "io" 8 | "log" 9 | "os" 10 | "path/filepath" 11 | "testing" 12 | "time" 13 | ) 14 | 15 | var testLogWriter io.Writer 16 | 17 | func init() { 18 | testLogWriter = os.Stdout 19 | logDir := os.Getenv("TEST_LOG_DIR") 20 | if logDir != "" { 21 | f, err := os.Create(filepath.Join(logDir, "debug.log")) 22 | if err != nil { 23 | log.Fatalf("TEST_LOG_DIR Env set, but unable to create log file: %v\n", err) 24 | } 25 | testLogWriter = f 26 | } 27 | } 28 | 29 | // this runs a 3 node cluster then expands it to a 5 node cluster and checks all 5 nodes agree at the end 30 | func TestRaft_AddMembership(t *testing.T) { 31 | v := appendEntriesVerifier{} 32 | v.Init() 33 | cluster := newRaftCluster(t, testLogWriter, "m", 3, &v) 34 | s := newApplySource("AddMembership") 35 | initApplied := cluster.ApplyN(t, time.Minute, s, 100) 36 | a := s.apply(t, cluster, 1000) 37 | if err := cluster.CreateAndAddNode(t, testLogWriter, "m", 3); err != nil { 38 | t.Fatalf("Failed to add node m3: %v", err) 39 | } 40 | if err := cluster.CreateAndAddNode(t, testLogWriter, "m", 4); err != nil { 41 | t.Fatalf("Failed to add node m4: %v", err) 42 | } 43 | time.Sleep(time.Second * 5) 44 | a.stop() 45 | cluster.Stop(t, time.Minute) 46 | v.Report(t) 47 | cluster.VerifyLog(t, uint64(a.applied+initApplied)) 48 | cluster.VerifyFSM(t) 49 | } 50 | 51 | // starts with 3 nodes, goes to 5, then goes back to 3, but never removes the leader. 
52 | func TestRaft_AddRemoveNodesNotLeader(t *testing.T) { 53 | v := appendEntriesVerifier{} 54 | v.Init() 55 | cluster := newRaftCluster(t, testLogWriter, "ar", 3, &v) 56 | s := newApplySource("AddRemoveNodesNotLeader") 57 | initApplied := cluster.ApplyN(t, time.Minute, s, 100) 58 | a := s.apply(t, cluster, 1000) 59 | cluster.CreateAndAddNode(t, testLogWriter, "ar", 3) 60 | cluster.CreateAndAddNode(t, testLogWriter, "ar", 4) 61 | ldr := cluster.Leader(time.Minute) 62 | removed := 0 63 | for _, rn := range cluster.nodes { 64 | if rn.name != ldr.name { 65 | cluster.RemoveNode(t, rn.name) 66 | removed++ 67 | if removed >= 2 { 68 | break 69 | } 70 | } 71 | } 72 | a.stop() 73 | cluster.Stop(t, time.Minute) 74 | v.Report(t) 75 | cluster.VerifyLog(t, uint64(a.applied+initApplied)) 76 | cluster.VerifyFSM(t) 77 | } 78 | 79 | // starts with a 5 node cluster then removes the leader. 80 | func TestRaft_RemoveLeader(t *testing.T) { 81 | v := appendEntriesVerifier{} 82 | v.Init() 83 | cluster := newRaftCluster(t, testLogWriter, "rl", 5, &v) 84 | s := newApplySource("RemoveLeader") 85 | initApplied := cluster.ApplyN(t, time.Minute, s, 100) 86 | a := s.apply(t, cluster, 100) 87 | time.Sleep(time.Second) 88 | ldr := cluster.Leader(time.Minute) 89 | cluster.RemoveNode(t, ldr.name) 90 | time.Sleep(5 * time.Second) 91 | a.stop() 92 | cluster.Stop(t, time.Minute) 93 | v.Report(t) 94 | cluster.VerifyLog(t, uint64(a.applied+initApplied)) 95 | cluster.VerifyFSM(t) 96 | ldr.raft.Shutdown() 97 | } 98 | 99 | // starts with a 5 node cluster, partitions off one node, and then removes it from the cluster on the other partition 100 | func TestRaft_RemovePartitionedNode(t *testing.T) { 101 | hooks := NewPartitioner() 102 | cluster := newRaftCluster(t, testLogWriter, "rmp", 5, hooks) 103 | s := newApplySource("RemovePartitionedNode") 104 | initApplied := cluster.ApplyN(t, time.Minute, s, 101) 105 | a := s.apply(t, cluster, 100) 106 | nodes := cluster.LeaderPlus(3) 107 | victim := nodes[len(nodes)-1] 108 | hooks.PartitionOff(cluster.log, []*raftNode{victim}) 109 | time.Sleep(3 * time.Second) 110 | removed := cluster.RemoveNode(t, victim.name) 111 | time.Sleep(3 * time.Second) 112 | hooks.HealAll(cluster.log) 113 | time.Sleep(10 * time.Second) 114 | a.stop() 115 | cluster.Stop(t, time.Minute) 116 | hooks.Report(t) 117 | cluster.VerifyLog(t, uint64(a.applied+initApplied)) 118 | cluster.VerifyFSM(t) 119 | 120 | // we should verify that the partitioned node see that it was removed & shutdown 121 | // but it never gets notified of that, so we can't verify that currently. 122 | removed.raft.Shutdown() 123 | } 124 | -------------------------------------------------------------------------------- /fuzzy/node.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) HashiCorp, Inc. 
2 | // SPDX-License-Identifier: MPL-2.0 3 | 4 | package fuzzy 5 | 6 | import ( 7 | "fmt" 8 | "path/filepath" 9 | "time" 10 | 11 | "github.com/hashicorp/go-hclog" 12 | "github.com/hashicorp/raft" 13 | rdb "github.com/hashicorp/raft-boltdb" 14 | ) 15 | 16 | type raftNode struct { 17 | transport *transport 18 | store *rdb.BoltStore 19 | raft *raft.Raft 20 | log hclog.Logger 21 | fsm *fuzzyFSM 22 | name string 23 | dir string 24 | } 25 | 26 | func newRaftNode(logger hclog.Logger, tc *transports, h TransportHooks, nodes []string, name string) (*raftNode, error) { 27 | var err error 28 | var datadir string 29 | datadir, err = resolveDirectory(fmt.Sprintf("data/%v", name), true) 30 | if err != nil { 31 | return nil, err 32 | } 33 | logger.Info("[INFO] Creating new raft Node with data in dir %v", datadir) 34 | var ss *raft.FileSnapshotStore 35 | ss, err = raft.NewFileSnapshotStoreWithLogger(datadir, 5, logger) 36 | 37 | if err != nil { 38 | return nil, fmt.Errorf("unable to initialize snapshots %v", err.Error()) 39 | } 40 | transport := tc.AddNode(name, h) 41 | 42 | config := raft.DefaultConfig() 43 | config.SnapshotThreshold = 1409600 44 | config.SnapshotInterval = time.Hour 45 | config.Logger = logger 46 | config.ShutdownOnRemove = false 47 | config.LocalID = raft.ServerID(name) 48 | 49 | var store *rdb.BoltStore 50 | store, err = rdb.NewBoltStore(filepath.Join(datadir, "store.bolt")) 51 | if err != nil { 52 | return nil, fmt.Errorf("unable to initialize log %v", err.Error()) 53 | } 54 | 55 | if len(nodes) > 0 { 56 | c := make([]raft.Server, 0, len(nodes)) 57 | for _, n := range nodes { 58 | c = append(c, raft.Server{Suffrage: raft.Voter, ID: raft.ServerID(n), Address: raft.ServerAddress(n)}) 59 | } 60 | configuration := raft.Configuration{Servers: c} 61 | 62 | if err = raft.BootstrapCluster(config, store, store, ss, transport, configuration); err != nil { 63 | return nil, err 64 | } 65 | } 66 | fsm := &fuzzyFSM{} 67 | var r *raft.Raft 68 | r, err = raft.NewRaft(config, fsm, store, store, ss, transport) 69 | if err != nil { 70 | return nil, err 71 | } 72 | n := raftNode{ 73 | transport: transport, 74 | store: store, 75 | raft: r, 76 | fsm: fsm, 77 | log: logger, 78 | name: name, 79 | dir: datadir, 80 | } 81 | return &n, nil 82 | } 83 | -------------------------------------------------------------------------------- /fuzzy/partition_test.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) HashiCorp, Inc. 2 | // SPDX-License-Identifier: MPL-2.0 3 | 4 | package fuzzy 5 | 6 | import ( 7 | "bytes" 8 | "fmt" 9 | "math/rand" 10 | "sync" 11 | "testing" 12 | "time" 13 | 14 | "github.com/hashicorp/raft" 15 | ) 16 | 17 | // 5 node cluster where the leader and another node get regularly partitioned off 18 | // eventually all partitions heal. 
19 | func TestRaft_LeaderPartitions(t *testing.T) { 20 | hooks := NewPartitioner() 21 | cluster := newRaftCluster(t, testLogWriter, "lp", 5, hooks) 22 | cluster.Leader(time.Second * 10) 23 | s := newApplySource("LeaderPartitions") 24 | applier := s.apply(t, cluster, 5) 25 | for i := 0; i < 10; i++ { 26 | pg := hooks.PartitionOff(cluster.log, cluster.LeaderPlus(rand.Intn(4))) 27 | time.Sleep(time.Second * 4) 28 | r := rand.Intn(10) 29 | if r < 1 { 30 | cluster.log.Logf("Healing no partitions!") 31 | } else if r < 4 { 32 | hooks.HealAll(cluster.log) 33 | } else { 34 | hooks.Heal(cluster.log, pg) 35 | } 36 | time.Sleep(time.Second * 5) 37 | } 38 | hooks.HealAll(cluster.log) 39 | cluster.Leader(time.Hour) 40 | applier.stop() 41 | cluster.Stop(t, time.Minute*10) 42 | hooks.Report(t) 43 | cluster.VerifyLog(t, applier.applied) 44 | cluster.VerifyFSM(t) 45 | } 46 | 47 | type Partitioner struct { 48 | verifier appendEntriesVerifier 49 | lock sync.RWMutex // protects partitioned / nextGroup 50 | // this is a map of node -> partition group, only nodes in the same partition group can communicate with each other 51 | partitioned map[string]int 52 | nextGroup int 53 | } 54 | 55 | func NewPartitioner() *Partitioner { 56 | p := &Partitioner{ 57 | partitioned: make(map[string]int), 58 | nextGroup: 1, 59 | } 60 | p.verifier.Init() 61 | return p 62 | } 63 | 64 | // PartitionOff creates a partition where the supplied nodes can only communicate with each other 65 | // returns the partition group, which can be used later with Heal to heal this specific partition 66 | func (p *Partitioner) PartitionOff(l Logger, nodes []*raftNode) int { 67 | nn := make([]string, 0, len(nodes)) 68 | p.lock.Lock() 69 | defer p.lock.Unlock() 70 | pGroup := p.nextGroup 71 | p.nextGroup++ 72 | for _, n := range nodes { 73 | p.partitioned[n.name] = pGroup 74 | nn = append(nn, n.name) 75 | } 76 | l.Logf("Created partition %d with nodes %v, partitions now are %v", pGroup, nn, p) 77 | return pGroup 78 | } 79 | 80 | func (p *Partitioner) Heal(l Logger, pGroup int) { 81 | p.lock.Lock() 82 | defer p.lock.Unlock() 83 | for k, v := range p.partitioned { 84 | if v == pGroup { 85 | p.partitioned[k] = 0 86 | } 87 | } 88 | l.Logf("Healing partition group %d, now partitions are %v", pGroup, p) 89 | } 90 | 91 | func (p *Partitioner) String() string { 92 | pl := make([][]string, 0, 10) 93 | for n, pv := range p.partitioned { 94 | if pv > 0 { 95 | for pv >= len(pl) { 96 | pl = append(pl, nil) 97 | } 98 | pl[pv] = append(pl[pv], n) 99 | } 100 | } 101 | b := bytes.Buffer{} 102 | for i, n := range pl { 103 | if len(n) > 0 { 104 | if b.Len() > 0 { 105 | b.WriteString(", ") 106 | } 107 | fmt.Fprintf(&b, "%d = %v", i, n) 108 | } 109 | } 110 | if b.Len() == 0 { 111 | return "[None]" 112 | } 113 | return b.String() 114 | } 115 | 116 | func (p *Partitioner) HealAll(l Logger) { 117 | p.lock.Lock() 118 | defer p.lock.Unlock() 119 | p.partitioned = make(map[string]int) 120 | l.Logf("Healing all partitions, partitions now %v", p) 121 | } 122 | 123 | func (p *Partitioner) Report(t *testing.T) { 124 | p.verifier.Report(t) 125 | } 126 | 127 | func (p *Partitioner) PreRPC(s, t string, r *raft.RPC) error { 128 | p.lock.RLock() 129 | sp := p.partitioned[s] 130 | st := p.partitioned[t] 131 | p.lock.RUnlock() 132 | if sp == st { 133 | return nil 134 | } 135 | return fmt.Errorf("unable to connect to %v, from %v", t, s) 136 | } 137 | 138 | func (p *Partitioner) PostRPC(s, t string, req *raft.RPC, res *raft.RPCResponse) error { 139 | return nil 140 | } 141 | 142 | func (p 
*Partitioner) PreRequestVote(src, target string, v *raft.RequestVoteRequest) (*raft.RequestVoteResponse, error) {
143 | 	return nil, nil
144 | }
145 | 
146 | func (p *Partitioner) PreAppendEntries(src, target string, v *raft.AppendEntriesRequest) (*raft.AppendEntriesResponse, error) {
147 | 	return nil, nil
148 | }
149 | 
--------------------------------------------------------------------------------
/fuzzy/readme.md:
--------------------------------------------------------------------------------
1 | # Fuzzy Raft
2 | 
3 | Inspired by http://colin-scott.github.io/blog/2015/10/07/fuzzing-raft-for-fun-and-profit/, this package
4 | is a framework and set of test scenarios for testing the behavior and correctness of the raft library
5 | under various conditions.
6 | 
7 | ## Framework
8 | 
9 | The framework allows you to construct multiple node raft clusters, connected by an instrumented transport
10 | that allows a test to inject various transport level behaviors to simulate various scenarios (e.g. you
11 | can have your hook fail all transport calls to a particular node to simulate it being partitioned off
12 | the network). There are helper classes to create and Apply well-known sequences of test data, and to
13 | examine the final state of the cluster, the nodes' FSMs, and the raft log.
14 | 
15 | ## Running
16 | 
17 | The tests use the standard go test framework: run go test . [from this dir] or use make fuzz from
18 | the parent directory. As these tests are looking for timing and other edge cases, a pass from a single run
19 | isn't enough; the tests need to be run repeatedly to build up confidence.
20 | 
21 | ## Test Scenarios
22 | 
23 | The following test scenarios are currently implemented. Each test concludes with a standard set of validations:
24 | 
25 | * Each node's raft log contains the same set of entries (term/index/data).
26 | * The raft log contains data matching the client request for each call to raft.Apply() that reported success.
27 | * Each node's FSM saw the same sequence of Apply(*raft.Log) calls.
28 | * A verifier at the transport level verifies a number of transport level invariants.
29 | 
30 | Most tests run with a background workload that is constantly apply()ing new entries to the log. [when there's a leader]
31 | 
32 | ### TestRaft_LeaderPartitions
33 | 
34 | This creates a 5 node cluster and then repeatedly partitions multiple nodes off (including the current leader),
35 | then heals the partition and repeats. At the end all partitions are removed. [clearly inspired by Jepsen]
36 | 
37 | ### TestRaft_NoIssueSanity
38 | 
39 | A basic 5 node cluster test: it starts a 5 node cluster, applies some data, then does the verifications.
40 | 
41 | ### TestRaft_SlowSendVote
42 | 
43 | Tests what happens when RequestVote requests are delayed in being sent to other nodes.
44 | 
45 | ### TestRaft_SlowRecvVote
46 | 
47 | Tests what happens when RequestVote responses are delayed in being received by the sender.
48 | 
49 | ### TestRaft_AddMembership
50 | 
51 | Starts a 3 node cluster, and then adds 2 new members to the cluster.
52 | 
53 | ### TestRaft_AddRemoveNodesNotLeader
54 | 
55 | Starts a 5 node cluster, and then removes 2 follower nodes from the cluster.
56 | 
57 | ### TestRaft_RemoveLeader
58 | 
59 | Starts a 5 node cluster, and then removes the node that is the leader.
60 | 
61 | ### TestRaft_RemovePartitionedNode
62 | 
63 | Starts a 5 node cluster, partitions one of the follower nodes off the network, and then tells the leader to remove that node, then heals the partition.
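## Adding a Scenario

New scenarios usually combine a cluster, an apply workload, and a custom transport hook. As a rough sketch (not one of the scenarios above), a hook that drops every AppendEntries RPC sent to a chosen node could look like the code below; the type name and field are hypothetical, and the method set mirrors the other hooks in this package (the exact TransportHooks interface is defined in transport.go).

```go
package fuzzy

import (
	"fmt"

	"github.com/hashicorp/raft"
)

// dropAppendEntries is a hypothetical hook that fails every AppendEntries
// RPC targeted at one node, leaving all other traffic untouched.
type dropAppendEntries struct {
	victim string // name of the node whose AppendEntries should be dropped
}

func (d *dropAppendEntries) PreRPC(src, target string, r *raft.RPC) error {
	// Returning an error from PreRPC causes the transport call to fail; this
	// is the same mechanism the Partitioner uses to simulate partitions.
	if _, ok := r.Command.(*raft.AppendEntriesRequest); ok && target == d.victim {
		return fmt.Errorf("dropping AppendEntries from %v to %v", src, target)
	}
	return nil
}

func (d *dropAppendEntries) PostRPC(src, target string, r *raft.RPC, res *raft.RPCResponse) error {
	return nil
}

func (d *dropAppendEntries) PreRequestVote(src, target string, v *raft.RequestVoteRequest) (*raft.RequestVoteResponse, error) {
	return nil, nil
}

func (d *dropAppendEntries) PreAppendEntries(src, target string, v *raft.AppendEntriesRequest) (*raft.AppendEntriesResponse, error) {
	return nil, nil
}
```

Such a hook would be passed to newRaftCluster in the same way the partitioner and slow-voter hooks are in the existing tests.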
64 | -------------------------------------------------------------------------------- /fuzzy/resolve.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) HashiCorp, Inc. 2 | // SPDX-License-Identifier: MPL-2.0 3 | 4 | package fuzzy 5 | 6 | import ( 7 | "os" 8 | "path/filepath" 9 | ) 10 | 11 | // resolveDirectory returns a full directory path based on the supplied dir path 12 | // if the supplied dir path is absolute (i.e. it starts with / ) then it is 13 | // returned as is, if it's a relative path, then it is assumed to be relative 14 | // to the executable, and that is computed and returned. 15 | // 16 | // if create is true, then the directory path will be created if it doesn't 17 | // already exist 18 | // 19 | // if create is false, then it's upto the caller to ensure it exists and/or 20 | // create it as needed [this won't verify that it exists] 21 | func resolveDirectory(dir string, create bool) (string, error) { 22 | var resolved string 23 | if filepath.IsAbs(dir) { 24 | resolved = dir 25 | } else { 26 | execdir, err := filepath.Abs(filepath.Dir(os.Args[0])) 27 | if err != nil { 28 | return "", err 29 | } 30 | resolved = filepath.Join(execdir, dir) 31 | } 32 | if create { 33 | if _, err := os.Stat(resolved); os.IsNotExist(err) { 34 | if err := os.MkdirAll(resolved, 0o744); err != nil { 35 | return "", err 36 | } 37 | } 38 | } 39 | return resolved, nil 40 | } 41 | -------------------------------------------------------------------------------- /fuzzy/simple_test.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) HashiCorp, Inc. 2 | // SPDX-License-Identifier: MPL-2.0 3 | 4 | package fuzzy 5 | 6 | import ( 7 | "testing" 8 | "time" 9 | ) 10 | 11 | // this runs a 5 node cluster with verifications turned on, but no failures or issues injected. 12 | func TestRaft_NoIssueSanity(t *testing.T) { 13 | v := appendEntriesVerifier{} 14 | v.Init() 15 | cluster := newRaftCluster(t, testLogWriter, "node", 5, &v) 16 | s := newApplySource("NoIssueSanity") 17 | applyCount := cluster.ApplyN(t, time.Minute, s, 10000) 18 | cluster.Stop(t, time.Minute) 19 | v.Report(t) 20 | cluster.VerifyLog(t, applyCount) 21 | cluster.VerifyFSM(t) 22 | } 23 | -------------------------------------------------------------------------------- /fuzzy/slowvoter_test.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) HashiCorp, Inc. 2 | // SPDX-License-Identifier: MPL-2.0 3 | 4 | package fuzzy 5 | 6 | import ( 7 | "math/rand" 8 | "testing" 9 | "time" 10 | 11 | "github.com/hashicorp/raft" 12 | ) 13 | 14 | // 5 node cluster where 2 nodes always see a delay in getting a request vote msg. 15 | func TestRaft_SlowSendVote(t *testing.T) { 16 | hooks := NewSlowVoter("sv_0", "sv_1") 17 | cluster := newRaftCluster(t, testLogWriter, "sv", 5, hooks) 18 | s := newApplySource("SlowSendVote") 19 | ac := cluster.ApplyN(t, time.Minute, s, 10000) 20 | cluster.Stop(t, time.Minute) 21 | hooks.Report(t) 22 | cluster.VerifyLog(t, ac) 23 | cluster.VerifyFSM(t) 24 | } 25 | 26 | // 5 node cluster where vote results from 3 nodes are slow to turn up. 
27 | // [they see the vote request normally, but their response is slow] 28 | func TestRaft_SlowRecvVote(t *testing.T) { 29 | hooks := NewSlowVoter("svr_1", "svr_4", "svr_3") 30 | hooks.mode = SlowRecv 31 | cluster := newRaftCluster(t, testLogWriter, "svr", 5, hooks) 32 | s := newApplySource("SlowRecvVote") 33 | ac := cluster.ApplyN(t, time.Minute, s, 10000) 34 | cluster.Stop(t, time.Minute) 35 | hooks.Report(t) 36 | cluster.VerifyLog(t, ac) 37 | cluster.VerifyFSM(t) 38 | } 39 | 40 | type SlowVoterMode int 41 | 42 | const ( 43 | SlowSend SlowVoterMode = iota 44 | SlowRecv 45 | ) 46 | 47 | type SlowVoter struct { 48 | verifier appendEntriesVerifier 49 | slowNodes map[string]bool 50 | delayMin time.Duration 51 | delayMax time.Duration 52 | mode SlowVoterMode 53 | } 54 | 55 | func NewSlowVoter(slowNodes ...string) *SlowVoter { 56 | sv := SlowVoter{ 57 | slowNodes: make(map[string]bool, len(slowNodes)), 58 | delayMin: time.Second, 59 | delayMax: time.Second * 2, 60 | mode: SlowSend, 61 | } 62 | for _, n := range slowNodes { 63 | sv.slowNodes[n] = true 64 | } 65 | sv.verifier.Init() 66 | return &sv 67 | } 68 | 69 | func (sv *SlowVoter) Report(t *testing.T) { 70 | sv.verifier.Report(t) 71 | } 72 | 73 | func (sv *SlowVoter) PreRPC(s, t string, r *raft.RPC) error { 74 | return nil 75 | } 76 | 77 | func (sv *SlowVoter) nap() { 78 | d := sv.delayMin + time.Duration(rand.Int63n((sv.delayMax - sv.delayMin).Nanoseconds())) 79 | time.Sleep(d) 80 | } 81 | 82 | func (sv *SlowVoter) PostRPC(src, target string, r *raft.RPC, res *raft.RPCResponse) error { 83 | if sv.mode == SlowRecv && sv.slowNodes[target] { 84 | _, ok := r.Command.(*raft.RequestVoteRequest) 85 | if ok { 86 | sv.nap() 87 | } 88 | } 89 | return nil 90 | } 91 | 92 | func (sv *SlowVoter) PreRequestVote(src, target string, v *raft.RequestVoteRequest) (*raft.RequestVoteResponse, error) { 93 | if sv.mode == SlowSend && sv.slowNodes[target] { 94 | sv.nap() 95 | } 96 | return nil, nil 97 | } 98 | 99 | func (sv *SlowVoter) PreAppendEntries(src, target string, v *raft.AppendEntriesRequest) (*raft.AppendEntriesResponse, error) { 100 | sv.verifier.PreAppendEntries(src, target, v) 101 | return nil, nil 102 | } 103 | -------------------------------------------------------------------------------- /fuzzy/verifier.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) HashiCorp, Inc. 2 | // SPDX-License-Identifier: MPL-2.0 3 | 4 | package fuzzy 5 | 6 | import ( 7 | "fmt" 8 | "sync" 9 | "testing" 10 | 11 | "github.com/hashicorp/raft" 12 | ) 13 | 14 | // AppendEntriesVerifier looks at all the AppendEntry RPC request and verifies that only one node sends AE requests for any given term 15 | // it also verifies that the request only comes from the node indicated as the leader in the AE message. 
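// leaderForTerm records the first node observed sending AppendEntries for each term;
// errors accumulates any violations until Report is called from the test.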
16 | type appendEntriesVerifier struct { 17 | sync.RWMutex 18 | leaderForTerm map[uint64]string 19 | errors []string 20 | } 21 | 22 | func (v *appendEntriesVerifier) Report(t *testing.T) { 23 | v.Lock() 24 | defer v.Unlock() 25 | for _, e := range v.errors { 26 | t.Error(e) 27 | } 28 | } 29 | 30 | func (v *appendEntriesVerifier) Init() { 31 | v.Lock() 32 | defer v.Unlock() 33 | v.leaderForTerm = make(map[uint64]string) 34 | v.errors = make([]string, 0, 10) 35 | } 36 | 37 | func (v *appendEntriesVerifier) PreRPC(src, target string, r *raft.RPC) error { 38 | return nil 39 | } 40 | 41 | func (v *appendEntriesVerifier) PostRPC(src, target string, req *raft.RPC, res *raft.RPCResponse) error { 42 | return nil 43 | } 44 | 45 | func (v *appendEntriesVerifier) PreRequestVote(src, target string, rv *raft.RequestVoteRequest) (*raft.RequestVoteResponse, error) { 46 | return nil, nil 47 | } 48 | 49 | func (v *appendEntriesVerifier) PreAppendEntries(src, target string, req *raft.AppendEntriesRequest) (*raft.AppendEntriesResponse, error) { 50 | term := req.Term 51 | var ldr string 52 | if len(req.RPCHeader.Addr) > 0 { 53 | ldr = string(req.RPCHeader.Addr) 54 | } else { 55 | ldr = string(req.Leader) 56 | } 57 | 58 | if ldr != src { 59 | v.Lock() 60 | defer v.Unlock() 61 | v.errors = append(v.errors, fmt.Sprintf("Node %v sent an appendEntries request for term %d that said the leader was some other node %v", src, term, ldr)) 62 | } 63 | v.RLock() 64 | tl, exists := v.leaderForTerm[term] 65 | v.RUnlock() 66 | if exists && tl != ldr { 67 | v.Lock() 68 | defer v.Unlock() 69 | v.errors = append(v.errors, fmt.Sprintf("Node %v sent an AppendEntries request for term %d, but node %v had already done some, multiple leaders for same term!", src, term, tl)) 70 | } 71 | if !exists { 72 | v.Lock() 73 | tl, exists := v.leaderForTerm[term] 74 | if exists && tl != ldr { 75 | v.errors = append(v.errors, fmt.Sprintf("Node %v sent an AppendEntries request for term %d, but node %v had already done some, multiple leaders for same term!", src, term, tl)) 76 | } 77 | if !exists { 78 | v.leaderForTerm[term] = ldr 79 | } 80 | v.Unlock() 81 | } 82 | return nil, nil 83 | } 84 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/hashicorp/raft 2 | 3 | go 1.20 4 | 5 | retract v1.1.3 // Deleted original tag; module checksum may not be accurate. 6 | 7 | require ( 8 | github.com/hashicorp/go-hclog v1.6.2 9 | github.com/hashicorp/go-metrics v0.5.4 10 | github.com/hashicorp/go-msgpack/v2 v2.1.2 11 | github.com/stretchr/testify v1.8.4 12 | ) 13 | 14 | require ( 15 | github.com/armon/go-metrics v0.4.1 // indirect 16 | github.com/davecgh/go-spew v1.1.1 // indirect 17 | github.com/fatih/color v1.13.0 // indirect 18 | github.com/hashicorp/go-immutable-radix v1.0.0 // indirect 19 | github.com/hashicorp/golang-lru v0.5.0 // indirect 20 | github.com/kr/pretty v0.2.1 // indirect 21 | github.com/mattn/go-colorable v0.1.12 // indirect 22 | github.com/mattn/go-isatty v0.0.14 // indirect 23 | github.com/pmezard/go-difflib v1.0.0 // indirect 24 | golang.org/x/sys v0.13.0 // indirect 25 | gopkg.in/yaml.v3 v3.0.1 // indirect 26 | ) 27 | -------------------------------------------------------------------------------- /inmem_snapshot.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) HashiCorp, Inc. 
2 | // SPDX-License-Identifier: MPL-2.0 3 | 4 | package raft 5 | 6 | import ( 7 | "bytes" 8 | "fmt" 9 | "io" 10 | "sync" 11 | ) 12 | 13 | // InmemSnapshotStore implements the SnapshotStore interface and 14 | // retains only the most recent snapshot 15 | type InmemSnapshotStore struct { 16 | latest *InmemSnapshotSink 17 | hasSnapshot bool 18 | sync.RWMutex 19 | } 20 | 21 | // InmemSnapshotSink implements SnapshotSink in memory 22 | type InmemSnapshotSink struct { 23 | meta SnapshotMeta 24 | contents *bytes.Buffer 25 | } 26 | 27 | // NewInmemSnapshotStore creates a blank new InmemSnapshotStore 28 | func NewInmemSnapshotStore() *InmemSnapshotStore { 29 | return &InmemSnapshotStore{ 30 | latest: &InmemSnapshotSink{ 31 | contents: &bytes.Buffer{}, 32 | }, 33 | } 34 | } 35 | 36 | // Create replaces the stored snapshot with a new one using the given args 37 | func (m *InmemSnapshotStore) Create(version SnapshotVersion, index, term uint64, 38 | configuration Configuration, configurationIndex uint64, trans Transport) (SnapshotSink, error) { 39 | // We only support version 1 snapshots at this time. 40 | if version != 1 { 41 | return nil, fmt.Errorf("unsupported snapshot version %d", version) 42 | } 43 | 44 | name := snapshotName(term, index) 45 | 46 | m.Lock() 47 | defer m.Unlock() 48 | 49 | sink := &InmemSnapshotSink{ 50 | meta: SnapshotMeta{ 51 | Version: version, 52 | ID: name, 53 | Index: index, 54 | Term: term, 55 | Peers: encodePeers(configuration, trans), 56 | Configuration: configuration, 57 | ConfigurationIndex: configurationIndex, 58 | }, 59 | contents: &bytes.Buffer{}, 60 | } 61 | m.hasSnapshot = true 62 | m.latest = sink 63 | 64 | return sink, nil 65 | } 66 | 67 | // List returns the latest snapshot taken 68 | func (m *InmemSnapshotStore) List() ([]*SnapshotMeta, error) { 69 | m.RLock() 70 | defer m.RUnlock() 71 | 72 | if !m.hasSnapshot { 73 | return []*SnapshotMeta{}, nil 74 | } 75 | return []*SnapshotMeta{&m.latest.meta}, nil 76 | } 77 | 78 | // Open wraps an io.ReadCloser around the snapshot contents 79 | func (m *InmemSnapshotStore) Open(id string) (*SnapshotMeta, io.ReadCloser, error) { 80 | m.RLock() 81 | defer m.RUnlock() 82 | 83 | if m.latest.meta.ID != id { 84 | return nil, nil, fmt.Errorf("[ERR] snapshot: failed to open snapshot id: %s", id) 85 | } 86 | 87 | // Make a copy of the contents, since a bytes.Buffer can only be read 88 | // once. 89 | contents := bytes.NewBuffer(m.latest.contents.Bytes()) 90 | return &m.latest.meta, io.NopCloser(contents), nil 91 | } 92 | 93 | // Write appends the given bytes to the snapshot contents 94 | func (s *InmemSnapshotSink) Write(p []byte) (n int, err error) { 95 | written, err := s.contents.Write(p) 96 | s.meta.Size += int64(written) 97 | return written, err 98 | } 99 | 100 | // Close updates the Size and is otherwise a no-op 101 | func (s *InmemSnapshotSink) Close() error { 102 | return nil 103 | } 104 | 105 | // ID returns the ID of the SnapshotMeta 106 | func (s *InmemSnapshotSink) ID() string { 107 | return s.meta.ID 108 | } 109 | 110 | // Cancel returns successfully with a nil error 111 | func (s *InmemSnapshotSink) Cancel() error { 112 | return nil 113 | } 114 | -------------------------------------------------------------------------------- /inmem_snapshot_test.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) HashiCorp, Inc. 
2 | // SPDX-License-Identifier: MPL-2.0 3 | 4 | package raft 5 | 6 | import ( 7 | "bytes" 8 | "io" 9 | "reflect" 10 | "testing" 11 | ) 12 | 13 | func TestInmemSnapshotStoreImpl(t *testing.T) { 14 | var impl interface{} = &InmemSnapshotStore{} 15 | if _, ok := impl.(SnapshotStore); !ok { 16 | t.Fatalf("InmemSnapshotStore not a SnapshotStore") 17 | } 18 | } 19 | 20 | func TestInmemSnapshotSinkImpl(t *testing.T) { 21 | var impl interface{} = &InmemSnapshotSink{} 22 | if _, ok := impl.(SnapshotSink); !ok { 23 | t.Fatalf("InmemSnapshotSink not a SnapshotSink") 24 | } 25 | } 26 | 27 | func TestInmemSS_CreateSnapshot(t *testing.T) { 28 | snap := NewInmemSnapshotStore() 29 | 30 | // Check no snapshots 31 | snaps, err := snap.List() 32 | if err != nil { 33 | t.Fatalf("err: %v", err) 34 | } 35 | if len(snaps) != 0 { 36 | t.Fatalf("did not expect any snapshots: %v", snaps) 37 | } 38 | 39 | // Create a new sink 40 | var configuration Configuration 41 | configuration.Servers = append(configuration.Servers, Server{ 42 | Suffrage: Voter, 43 | ID: ServerID("my id"), 44 | Address: ServerAddress("over here"), 45 | }) 46 | _, trans := NewInmemTransport(NewInmemAddr()) 47 | sink, err := snap.Create(SnapshotVersionMax, 10, 3, configuration, 2, trans) 48 | if err != nil { 49 | t.Fatalf("err: %v", err) 50 | } 51 | 52 | // The sink is not done, should not be in a list! 53 | snaps, err = snap.List() 54 | if err != nil { 55 | t.Fatalf("err: %v", err) 56 | } 57 | if len(snaps) != 1 { 58 | t.Fatalf("should always be 1 snapshot: %v", snaps) 59 | } 60 | 61 | // Write to the sink 62 | _, err = sink.Write([]byte("first\n")) 63 | if err != nil { 64 | t.Fatalf("err: %v", err) 65 | } 66 | _, err = sink.Write([]byte("second\n")) 67 | if err != nil { 68 | t.Fatalf("err: %v", err) 69 | } 70 | 71 | // Done! 72 | err = sink.Close() 73 | if err != nil { 74 | t.Fatalf("err: %v", err) 75 | } 76 | 77 | // Should have a snapshot! 
78 | snaps, err = snap.List() 79 | if err != nil { 80 | t.Fatalf("err: %v", err) 81 | } 82 | if len(snaps) != 1 { 83 | t.Fatalf("expect a snapshots: %v", snaps) 84 | } 85 | 86 | // Check the latest 87 | latest := snaps[0] 88 | if latest.Index != 10 { 89 | t.Fatalf("bad snapshot: %v", *latest) 90 | } 91 | if latest.Term != 3 { 92 | t.Fatalf("bad snapshot: %v", *latest) 93 | } 94 | if !reflect.DeepEqual(latest.Configuration, configuration) { 95 | t.Fatalf("bad snapshot: %v", *latest) 96 | } 97 | if latest.ConfigurationIndex != 2 { 98 | t.Fatalf("bad snapshot: %v", *latest) 99 | } 100 | if latest.Size != 13 { 101 | t.Fatalf("bad snapshot: %v", *latest) 102 | } 103 | 104 | // Read the snapshot 105 | _, r, err := snap.Open(latest.ID) 106 | if err != nil { 107 | t.Fatalf("err: %v", err) 108 | } 109 | 110 | // Read out everything 111 | var buf bytes.Buffer 112 | if _, err := io.Copy(&buf, r); err != nil { 113 | t.Fatalf("err: %v", err) 114 | } 115 | if err := r.Close(); err != nil { 116 | t.Fatalf("err: %v", err) 117 | } 118 | 119 | // Ensure a match 120 | if bytes.Compare(buf.Bytes(), []byte("first\nsecond\n")) != 0 { 121 | t.Fatalf("content mismatch") 122 | } 123 | } 124 | 125 | func TestInmemSS_OpenSnapshotTwice(t *testing.T) { 126 | snap := NewInmemSnapshotStore() 127 | 128 | // Create a new sink 129 | var configuration Configuration 130 | configuration.Servers = append(configuration.Servers, Server{ 131 | Suffrage: Voter, 132 | ID: ServerID("my id"), 133 | Address: ServerAddress("over here"), 134 | }) 135 | _, trans := NewInmemTransport(NewInmemAddr()) 136 | sink, err := snap.Create(SnapshotVersionMax, 10, 3, configuration, 2, trans) 137 | if err != nil { 138 | t.Fatalf("err: %v", err) 139 | } 140 | 141 | // Write to the sink 142 | _, err = sink.Write([]byte("data\n")) 143 | if err != nil { 144 | t.Fatalf("err: %v", err) 145 | } 146 | err = sink.Close() 147 | if err != nil { 148 | t.Fatalf("err: %v", err) 149 | } 150 | 151 | // Read the snapshot a first time 152 | _, r, err := snap.Open(sink.ID()) 153 | if err != nil { 154 | t.Fatalf("err: %v", err) 155 | } 156 | 157 | // Read out everything 158 | var buf1 bytes.Buffer 159 | if _, err = io.Copy(&buf1, r); err != nil { 160 | t.Fatalf("err: %v", err) 161 | } 162 | if err = r.Close(); err != nil { 163 | t.Fatalf("err: %v", err) 164 | } 165 | 166 | // Ensure a match 167 | if bytes.Compare(buf1.Bytes(), []byte("data\n")) != 0 { 168 | t.Fatalf("content mismatch") 169 | } 170 | 171 | // Read the snapshot a second time. 172 | _, r, err = snap.Open(sink.ID()) 173 | if err != nil { 174 | t.Fatalf("err: %v", err) 175 | } 176 | 177 | // Read out everything again 178 | var buf2 bytes.Buffer 179 | if _, err := io.Copy(&buf2, r); err != nil { 180 | t.Fatalf("err: %v", err) 181 | } 182 | if err := r.Close(); err != nil { 183 | t.Fatalf("err: %v", err) 184 | } 185 | 186 | // Ensure it's still the same content 187 | if bytes.Compare(buf2.Bytes(), []byte("data\n")) != 0 { 188 | t.Fatalf("content mismatch") 189 | } 190 | } 191 | -------------------------------------------------------------------------------- /inmem_store.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) HashiCorp, Inc. 2 | // SPDX-License-Identifier: MPL-2.0 3 | 4 | package raft 5 | 6 | import ( 7 | "errors" 8 | "sync" 9 | ) 10 | 11 | // InmemStore implements the LogStore and StableStore interface. 12 | // It should NOT EVER be used for production. It is used only for 13 | // unit tests. Use the MDBStore implementation instead. 
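// A minimal illustrative use in a test looks like:
//
//	store := NewInmemStore()
//	_ = store.StoreLog(&Log{Index: 1, Term: 1, Type: LogCommand, Data: []byte("op")})
//	first, _ := store.FirstIndex() // 1
//	last, _ := store.LastIndex()   // 1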
14 | type InmemStore struct { 15 | l sync.RWMutex 16 | lowIndex uint64 17 | highIndex uint64 18 | logs map[uint64]*Log 19 | kv map[string][]byte 20 | kvInt map[string]uint64 21 | } 22 | 23 | // NewInmemStore returns a new in-memory backend. Do not ever 24 | // use for production. Only for testing. 25 | func NewInmemStore() *InmemStore { 26 | i := &InmemStore{ 27 | logs: make(map[uint64]*Log), 28 | kv: make(map[string][]byte), 29 | kvInt: make(map[string]uint64), 30 | } 31 | return i 32 | } 33 | 34 | // FirstIndex implements the LogStore interface. 35 | func (i *InmemStore) FirstIndex() (uint64, error) { 36 | i.l.RLock() 37 | defer i.l.RUnlock() 38 | return i.lowIndex, nil 39 | } 40 | 41 | // LastIndex implements the LogStore interface. 42 | func (i *InmemStore) LastIndex() (uint64, error) { 43 | i.l.RLock() 44 | defer i.l.RUnlock() 45 | return i.highIndex, nil 46 | } 47 | 48 | // GetLog implements the LogStore interface. 49 | func (i *InmemStore) GetLog(index uint64, log *Log) error { 50 | i.l.RLock() 51 | defer i.l.RUnlock() 52 | l, ok := i.logs[index] 53 | if !ok { 54 | return ErrLogNotFound 55 | } 56 | *log = *l 57 | return nil 58 | } 59 | 60 | // StoreLog implements the LogStore interface. 61 | func (i *InmemStore) StoreLog(log *Log) error { 62 | return i.StoreLogs([]*Log{log}) 63 | } 64 | 65 | // StoreLogs implements the LogStore interface. 66 | func (i *InmemStore) StoreLogs(logs []*Log) error { 67 | i.l.Lock() 68 | defer i.l.Unlock() 69 | for _, l := range logs { 70 | i.logs[l.Index] = l 71 | if i.lowIndex == 0 { 72 | i.lowIndex = l.Index 73 | } 74 | if l.Index > i.highIndex { 75 | i.highIndex = l.Index 76 | } 77 | } 78 | return nil 79 | } 80 | 81 | // DeleteRange implements the LogStore interface. 82 | func (i *InmemStore) DeleteRange(min, max uint64) error { 83 | i.l.Lock() 84 | defer i.l.Unlock() 85 | for j := min; j <= max; j++ { 86 | delete(i.logs, j) 87 | } 88 | if min <= i.lowIndex { 89 | i.lowIndex = max + 1 90 | } 91 | if max >= i.highIndex { 92 | i.highIndex = min - 1 93 | } 94 | if i.lowIndex > i.highIndex { 95 | i.lowIndex = 0 96 | i.highIndex = 0 97 | } 98 | return nil 99 | } 100 | 101 | // Set implements the StableStore interface. 102 | func (i *InmemStore) Set(key []byte, val []byte) error { 103 | i.l.Lock() 104 | defer i.l.Unlock() 105 | i.kv[string(key)] = val 106 | return nil 107 | } 108 | 109 | // Get implements the StableStore interface. 110 | func (i *InmemStore) Get(key []byte) ([]byte, error) { 111 | i.l.RLock() 112 | defer i.l.RUnlock() 113 | val := i.kv[string(key)] 114 | if val == nil { 115 | return nil, errors.New("not found") 116 | } 117 | return val, nil 118 | } 119 | 120 | // SetUint64 implements the StableStore interface. 121 | func (i *InmemStore) SetUint64(key []byte, val uint64) error { 122 | i.l.Lock() 123 | defer i.l.Unlock() 124 | i.kvInt[string(key)] = val 125 | return nil 126 | } 127 | 128 | // GetUint64 implements the StableStore interface. 129 | func (i *InmemStore) GetUint64(key []byte) (uint64, error) { 130 | i.l.RLock() 131 | defer i.l.RUnlock() 132 | return i.kvInt[string(key)], nil 133 | } 134 | -------------------------------------------------------------------------------- /inmem_transport_test.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) HashiCorp, Inc. 
2 | // SPDX-License-Identifier: MPL-2.0 3 | 4 | package raft 5 | 6 | import ( 7 | "testing" 8 | "time" 9 | 10 | "github.com/stretchr/testify/require" 11 | ) 12 | 13 | func TestInmemTransportImpl(t *testing.T) { 14 | var inm interface{} = &InmemTransport{} 15 | if _, ok := inm.(Transport); !ok { 16 | t.Fatalf("InmemTransport is not a Transport") 17 | } 18 | if _, ok := inm.(LoopbackTransport); !ok { 19 | t.Fatalf("InmemTransport is not a Loopback Transport") 20 | } 21 | if _, ok := inm.(WithPeers); !ok { 22 | t.Fatalf("InmemTransport is not a WithPeers Transport") 23 | } 24 | } 25 | 26 | func TestInmemTransportWriteTimeout(t *testing.T) { 27 | // InmemTransport should timeout if the other end has gone away 28 | // when it tries to send a request. 29 | // Use unbuffered channels so that we can see the write failing 30 | // without having to contrive to fill up the buffer first. 31 | timeout := 10 * time.Millisecond 32 | t1 := &InmemTransport{ 33 | consumerCh: make(chan RPC), 34 | localAddr: NewInmemAddr(), 35 | peers: make(map[ServerAddress]*InmemTransport), 36 | timeout: timeout, 37 | } 38 | t2 := &InmemTransport{ 39 | consumerCh: make(chan RPC), 40 | localAddr: NewInmemAddr(), 41 | peers: make(map[ServerAddress]*InmemTransport), 42 | timeout: timeout, 43 | } 44 | a2 := t2.LocalAddr() 45 | t1.Connect(a2, t2) 46 | 47 | stop := make(chan struct{}) 48 | stopped := make(chan struct{}) 49 | go func() { 50 | defer close(stopped) 51 | var i uint64 52 | for { 53 | select { 54 | case <-stop: 55 | return 56 | case rpc := <-t2.Consumer(): 57 | i++ 58 | rpc.Respond(&AppendEntriesResponse{ 59 | Success: true, 60 | LastLog: i, 61 | }, nil) 62 | } 63 | } 64 | }() 65 | 66 | var resp AppendEntriesResponse 67 | // Sanity check that sending is working before stopping the 68 | // responder. 69 | err := t1.AppendEntries("server1", a2, &AppendEntriesRequest{}, &resp) 70 | NoErr(err, t) 71 | require.True(t, resp.LastLog == 1) 72 | 73 | close(stop) 74 | select { 75 | case <-stopped: 76 | case <-time.After(time.Second): 77 | t.Fatalf("timed out waiting for responder to stop") 78 | } 79 | 80 | err = t1.AppendEntries("server1", a2, &AppendEntriesRequest{}, &resp) 81 | if err == nil { 82 | t.Fatalf("expected AppendEntries to time out") 83 | } 84 | if err.Error() != "send timed out" { 85 | t.Fatalf("unexpected error: %v", err) 86 | } 87 | } 88 | -------------------------------------------------------------------------------- /log.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) HashiCorp, Inc. 2 | // SPDX-License-Identifier: MPL-2.0 3 | 4 | package raft 5 | 6 | import ( 7 | "fmt" 8 | "time" 9 | 10 | metrics "github.com/hashicorp/go-metrics/compat" 11 | ) 12 | 13 | // LogType describes various types of log entries. 14 | type LogType uint8 15 | 16 | const ( 17 | // LogCommand is applied to a user FSM. 18 | LogCommand LogType = iota 19 | 20 | // LogNoop is used to assert leadership. 21 | LogNoop 22 | 23 | // LogAddPeerDeprecated is used to add a new peer. This should only be used with 24 | // older protocol versions designed to be compatible with unversioned 25 | // Raft servers. See comments in config.go for details. 26 | LogAddPeerDeprecated 27 | 28 | // LogRemovePeerDeprecated is used to remove an existing peer. This should only be 29 | // used with older protocol versions designed to be compatible with 30 | // unversioned Raft servers. See comments in config.go for details. 
31 | LogRemovePeerDeprecated 32 | 33 | // LogBarrier is used to ensure all preceding operations have been 34 | // applied to the FSM. It is similar to LogNoop, but instead of returning 35 | // once committed, it only returns once the FSM manager acks it. Otherwise, 36 | // it is possible there are operations committed but not yet applied to 37 | // the FSM. 38 | LogBarrier 39 | 40 | // LogConfiguration establishes a membership change configuration. It is 41 | // created when a server is added, removed, promoted, etc. Only used 42 | // when protocol version 1 or greater is in use. 43 | LogConfiguration 44 | ) 45 | 46 | // String returns LogType as a human readable string. 47 | func (lt LogType) String() string { 48 | switch lt { 49 | case LogCommand: 50 | return "LogCommand" 51 | case LogNoop: 52 | return "LogNoop" 53 | case LogAddPeerDeprecated: 54 | return "LogAddPeerDeprecated" 55 | case LogRemovePeerDeprecated: 56 | return "LogRemovePeerDeprecated" 57 | case LogBarrier: 58 | return "LogBarrier" 59 | case LogConfiguration: 60 | return "LogConfiguration" 61 | default: 62 | return fmt.Sprintf("%d", lt) 63 | } 64 | } 65 | 66 | // Log entries are replicated to all members of the Raft cluster 67 | // and form the heart of the replicated state machine. 68 | type Log struct { 69 | // Index holds the index of the log entry. 70 | Index uint64 71 | 72 | // Term holds the election term of the log entry. 73 | Term uint64 74 | 75 | // Type holds the type of the log entry. 76 | Type LogType 77 | 78 | // Data holds the log entry's type-specific data. 79 | Data []byte 80 | 81 | // Extensions holds an opaque byte slice of information for middleware. It 82 | // is up to the client of the library to properly modify this as it adds 83 | // layers and remove those layers when appropriate. This value is a part of 84 | // the log, so very large values could cause timing issues. 85 | // 86 | // N.B. It is _up to the client_ to handle upgrade paths. For instance if 87 | // using this with go-raftchunking, the client should ensure that all Raft 88 | // peers are using a version that can handle that extension before ever 89 | // actually triggering chunking behavior. It is sometimes sufficient to 90 | // ensure that non-leaders are upgraded first, then the current leader is 91 | // upgraded, but a leader changeover during this process could lead to 92 | // trouble, so gating extension behavior via some flag in the client 93 | // program is also a good idea. 94 | Extensions []byte 95 | 96 | // AppendedAt stores the time the leader first appended this log to it's 97 | // LogStore. Followers will observe the leader's time. It is not used for 98 | // coordination or as part of the replication protocol at all. It exists only 99 | // to provide operational information for example how many seconds worth of 100 | // logs are present on the leader which might impact follower's ability to 101 | // catch up after restoring a large snapshot. We should never rely on this 102 | // being in the past when appending on a follower or reading a log back since 103 | // the clock skew can mean a follower could see a log with a future timestamp. 104 | // In general too the leader is not required to persist the log before 105 | // delivering to followers although the current implementation happens to do 106 | // this. 107 | AppendedAt time.Time 108 | } 109 | 110 | // LogStore is used to provide an interface for storing 111 | // and retrieving logs in a durable fashion. 
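// Both in-package implementations can be checked against it at compile time, for example:
//
//	var _ LogStore = (*InmemStore)(nil)
//	var _ LogStore = (*LogCache)(nil)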
112 | type LogStore interface { 113 | // FirstIndex returns the first index written. 0 for no entries. 114 | FirstIndex() (uint64, error) 115 | 116 | // LastIndex returns the last index written. 0 for no entries. 117 | LastIndex() (uint64, error) 118 | 119 | // GetLog gets a log entry at a given index. 120 | GetLog(index uint64, log *Log) error 121 | 122 | // StoreLog stores a log entry. 123 | StoreLog(log *Log) error 124 | 125 | // StoreLogs stores multiple log entries. By default the logs stored may not be contiguous with previous logs (i.e. may have a gap in Index since the last log written). If an implementation can't tolerate this it may optionally implement `MonotonicLogStore` to indicate that this is not allowed. This changes Raft's behaviour after restoring a user snapshot to remove all previous logs instead of relying on a "gap" to signal the discontinuity between logs before the snapshot and logs after. 126 | StoreLogs(logs []*Log) error 127 | 128 | // DeleteRange deletes a range of log entries. The range is inclusive. 129 | DeleteRange(min, max uint64) error 130 | } 131 | 132 | // MonotonicLogStore is an optional interface for LogStore implementations that 133 | // cannot tolerate gaps in between the Index values of consecutive log entries. For example, 134 | // this may allow more efficient indexing because the Index values are densely populated. If true is 135 | // returned, Raft will avoid relying on gaps to trigger re-synching logs on followers after a 136 | // snapshot is restored. The LogStore must have an efficient implementation of 137 | // DeleteLogs for the case where all logs are removed, as this must be called after snapshot restore when gaps are not allowed. 138 | // We avoid deleting all records for LogStores that do not implement MonotonicLogStore 139 | // because although it's always correct to do so, it has a major negative performance impact on the BoltDB store that is currently 140 | // the most widely used. 141 | type MonotonicLogStore interface { 142 | IsMonotonic() bool 143 | } 144 | 145 | func oldestLog(s LogStore) (Log, error) { 146 | var l Log 147 | 148 | // We might get unlucky and have a truncate right between getting first log 149 | // index and fetching it so keep trying until we succeed or hard fail. 150 | var lastFailIdx uint64 151 | var lastErr error 152 | for { 153 | firstIdx, err := s.FirstIndex() 154 | if err != nil { 155 | return l, err 156 | } 157 | if firstIdx == 0 { 158 | return l, ErrLogNotFound 159 | } 160 | if firstIdx == lastFailIdx { 161 | // Got same index as last time around which errored, don't bother trying 162 | // to fetch it again just return the error. 
163 | return l, lastErr 164 | } 165 | err = s.GetLog(firstIdx, &l) 166 | if err == nil { 167 | // We found the oldest log, break the loop 168 | break 169 | } 170 | // We failed, keep trying to see if there is a new firstIndex 171 | lastFailIdx = firstIdx 172 | lastErr = err 173 | } 174 | return l, nil 175 | } 176 | 177 | func emitLogStoreMetrics(s LogStore, prefix []string, interval time.Duration, stopCh <-chan struct{}) { 178 | for { 179 | select { 180 | case <-time.After(interval): 181 | // In error case emit 0 as the age 182 | ageMs := float32(0.0) 183 | l, err := oldestLog(s) 184 | if err == nil && !l.AppendedAt.IsZero() { 185 | ageMs = float32(time.Since(l.AppendedAt).Milliseconds()) 186 | } 187 | metrics.SetGauge(append(prefix, "oldestLogAge"), ageMs) 188 | case <-stopCh: 189 | return 190 | } 191 | } 192 | } 193 | -------------------------------------------------------------------------------- /log_cache.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) HashiCorp, Inc. 2 | // SPDX-License-Identifier: MPL-2.0 3 | 4 | package raft 5 | 6 | import ( 7 | "fmt" 8 | "sync" 9 | ) 10 | 11 | // LogCache wraps any LogStore implementation to provide an 12 | // in-memory ring buffer. This is used to cache access to 13 | // the recently written entries. For implementations that do not 14 | // cache themselves, this can provide a substantial boost by 15 | // avoiding disk I/O on recent entries. 16 | type LogCache struct { 17 | store LogStore 18 | 19 | cache []*Log 20 | l sync.RWMutex 21 | } 22 | 23 | // NewLogCache is used to create a new LogCache with the 24 | // given capacity and backend store. 25 | func NewLogCache(capacity int, store LogStore) (*LogCache, error) { 26 | if capacity <= 0 { 27 | return nil, fmt.Errorf("capacity must be positive") 28 | } 29 | c := &LogCache{ 30 | store: store, 31 | cache: make([]*Log, capacity), 32 | } 33 | return c, nil 34 | } 35 | 36 | // IsMonotonic implements the MonotonicLogStore interface. This is a shim to 37 | // expose the underlying store as monotonically indexed or not. 
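// For example, wrapping the in-memory test store (which does not implement
// MonotonicLogStore) reports false:
//
//	c, _ := NewLogCache(512, NewInmemStore())
//	_ = c.IsMonotonic() // false: the backing store is not monotonic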
38 | func (c *LogCache) IsMonotonic() bool { 39 | if store, ok := c.store.(MonotonicLogStore); ok { 40 | return store.IsMonotonic() 41 | } 42 | 43 | return false 44 | } 45 | 46 | func (c *LogCache) GetLog(idx uint64, log *Log) error { 47 | // Check the buffer for an entry 48 | c.l.RLock() 49 | cached := c.cache[idx%uint64(len(c.cache))] 50 | c.l.RUnlock() 51 | 52 | // Check if entry is valid 53 | if cached != nil && cached.Index == idx { 54 | *log = *cached 55 | return nil 56 | } 57 | 58 | // Forward request on cache miss 59 | return c.store.GetLog(idx, log) 60 | } 61 | 62 | func (c *LogCache) StoreLog(log *Log) error { 63 | return c.StoreLogs([]*Log{log}) 64 | } 65 | 66 | func (c *LogCache) StoreLogs(logs []*Log) error { 67 | err := c.store.StoreLogs(logs) 68 | // Insert the logs into the ring buffer, but only on success 69 | if err != nil { 70 | return fmt.Errorf("unable to store logs within log store, err: %q", err) 71 | } 72 | c.l.Lock() 73 | for _, l := range logs { 74 | c.cache[l.Index%uint64(len(c.cache))] = l 75 | } 76 | c.l.Unlock() 77 | return nil 78 | } 79 | 80 | func (c *LogCache) FirstIndex() (uint64, error) { 81 | return c.store.FirstIndex() 82 | } 83 | 84 | func (c *LogCache) LastIndex() (uint64, error) { 85 | return c.store.LastIndex() 86 | } 87 | 88 | func (c *LogCache) DeleteRange(min, max uint64) error { 89 | // Invalidate the cache on deletes 90 | c.l.Lock() 91 | c.cache = make([]*Log, len(c.cache)) 92 | c.l.Unlock() 93 | 94 | return c.store.DeleteRange(min, max) 95 | } 96 | -------------------------------------------------------------------------------- /log_cache_test.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) HashiCorp, Inc. 2 | // SPDX-License-Identifier: MPL-2.0 3 | 4 | package raft 5 | 6 | import ( 7 | "errors" 8 | "strings" 9 | "sync" 10 | "testing" 11 | ) 12 | 13 | func TestLogCache(t *testing.T) { 14 | store := NewInmemStore() 15 | c, _ := NewLogCache(16, store) 16 | 17 | // Insert into the in-mem store 18 | for i := 0; i < 32; i++ { 19 | log := &Log{Index: uint64(i) + 1} 20 | store.StoreLog(log) 21 | } 22 | 23 | // Check the indexes 24 | if idx, _ := c.FirstIndex(); idx != 1 { 25 | t.Fatalf("bad: %d", idx) 26 | } 27 | if idx, _ := c.LastIndex(); idx != 32 { 28 | t.Fatalf("bad: %d", idx) 29 | } 30 | 31 | // Try get log with a miss 32 | var out Log 33 | err := c.GetLog(1, &out) 34 | if err != nil { 35 | t.Fatalf("err: %v", err) 36 | } 37 | if out.Index != 1 { 38 | t.Fatalf("bad: %#v", out) 39 | } 40 | 41 | // Store logs 42 | l1 := &Log{Index: 33} 43 | l2 := &Log{Index: 34} 44 | err = c.StoreLogs([]*Log{l1, l2}) 45 | if err != nil { 46 | t.Fatalf("err: %v", err) 47 | } 48 | 49 | if idx, _ := c.LastIndex(); idx != 34 { 50 | t.Fatalf("bad: %d", idx) 51 | } 52 | 53 | // Check that it wrote-through 54 | err = store.GetLog(33, &out) 55 | if err != nil { 56 | t.Fatalf("err: %v", err) 57 | } 58 | err = store.GetLog(34, &out) 59 | if err != nil { 60 | t.Fatalf("err: %v", err) 61 | } 62 | 63 | // Delete in the backend 64 | err = store.DeleteRange(33, 34) 65 | if err != nil { 66 | t.Fatalf("err: %v", err) 67 | } 68 | 69 | // Should be in the ring buffer 70 | err = c.GetLog(33, &out) 71 | if err != nil { 72 | t.Fatalf("err: %v", err) 73 | } 74 | err = c.GetLog(34, &out) 75 | if err != nil { 76 | t.Fatalf("err: %v", err) 77 | } 78 | 79 | // Purge the ring buffer 80 | err = c.DeleteRange(33, 34) 81 | if err != nil { 82 | t.Fatalf("err: %v", err) 83 | } 84 | 85 | // Should not be in the ring buffer 86 | err = 
c.GetLog(33, &out) 87 | if err != ErrLogNotFound { 88 | t.Fatalf("err: %v", err) 89 | } 90 | err = c.GetLog(34, &out) 91 | if err != ErrLogNotFound { 92 | t.Fatalf("err: %v", err) 93 | } 94 | } 95 | 96 | type errorStore struct { 97 | LogStore 98 | mu sync.Mutex 99 | fail bool 100 | failed int 101 | failMax int 102 | } 103 | 104 | func (e *errorStore) StoreLogs(logs []*Log) error { 105 | e.mu.Lock() 106 | defer e.mu.Unlock() 107 | if e.fail { 108 | e.failed++ 109 | if e.failed <= e.failMax { 110 | return errors.New("some error") 111 | } 112 | e.fail = false 113 | } 114 | return e.LogStore.StoreLogs(logs) 115 | } 116 | 117 | func (e *errorStore) failNext(count int) { 118 | e.mu.Lock() 119 | e.fail = true 120 | e.failMax = count 121 | e.mu.Unlock() 122 | } 123 | 124 | func TestLogCacheWithBackendStoreError(t *testing.T) { 125 | var err error 126 | store := NewInmemStore() 127 | errStore := &errorStore{LogStore: store} 128 | c, _ := NewLogCache(16, errStore) 129 | 130 | for i := 0; i < 4; i++ { 131 | log := &Log{Index: uint64(i) + 1} 132 | store.StoreLog(log) 133 | } 134 | errStore.failNext(1) 135 | log := &Log{Index: 5} 136 | err = c.StoreLog(log) 137 | if !strings.Contains(err.Error(), "some error") { 138 | t.Fatalf("wanted: some error, got err=%v", err) 139 | } 140 | 141 | var out Log 142 | for i := 1; i < 5; i++ { 143 | if err := c.GetLog(uint64(i), &out); err != nil { 144 | t.Fatalf("err: %v", err) 145 | } 146 | } 147 | out = Log{} 148 | if err = c.GetLog(5, &out); err != ErrLogNotFound { 149 | t.Fatalf("Should have returned not found, got err=%v out=%+v", err, out) 150 | } 151 | } 152 | -------------------------------------------------------------------------------- /log_test.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) HashiCorp, Inc. 
2 | // SPDX-License-Identifier: MPL-2.0 3 | 4 | package raft 5 | 6 | import ( 7 | "bytes" 8 | "fmt" 9 | "testing" 10 | "time" 11 | 12 | metrics "github.com/hashicorp/go-metrics/compat" 13 | ) 14 | 15 | func TestOldestLog(t *testing.T) { 16 | cases := []struct { 17 | Name string 18 | Logs []*Log 19 | WantIdx uint64 20 | WantErr bool 21 | }{ 22 | { 23 | Name: "empty logs", 24 | Logs: nil, 25 | WantIdx: 0, 26 | WantErr: true, 27 | }, 28 | { 29 | Name: "simple case", 30 | Logs: []*Log{ 31 | { 32 | Index: 1234, 33 | Term: 1, 34 | }, 35 | { 36 | Index: 1235, 37 | Term: 1, 38 | }, 39 | { 40 | Index: 1236, 41 | Term: 2, 42 | }, 43 | }, 44 | WantIdx: 1234, 45 | WantErr: false, 46 | }, 47 | } 48 | 49 | for _, tc := range cases { 50 | tc := tc 51 | t.Run(tc.Name, func(t *testing.T) { 52 | s := NewInmemStore() 53 | if err := s.StoreLogs(tc.Logs); err != nil { 54 | t.Fatalf("expected store logs not to fail: %s", err) 55 | } 56 | 57 | got, err := oldestLog(s) 58 | switch { 59 | case tc.WantErr && err == nil: 60 | t.Fatalf("wanted error got nil") 61 | case !tc.WantErr && err != nil: 62 | t.Fatalf("wanted no error got: %s", err) 63 | } 64 | 65 | if got.Index != tc.WantIdx { 66 | t.Fatalf("got index %v, want %v", got.Index, tc.WantIdx) 67 | } 68 | }) 69 | } 70 | } 71 | 72 | func TestEmitsLogStoreMetrics(t *testing.T) { 73 | sink := testSetupMetrics(t) 74 | 75 | start := time.Now() 76 | 77 | s := NewInmemStore() 78 | logs := []*Log{ 79 | { 80 | Index: 1234, 81 | Term: 1, 82 | AppendedAt: time.Now(), 83 | }, 84 | { 85 | Index: 1235, 86 | Term: 1, 87 | }, 88 | { 89 | Index: 1236, 90 | Term: 2, 91 | }, 92 | } 93 | if err := s.StoreLogs(logs); err != nil { 94 | t.Fatalf("expected store logs not to fail: %s", err) 95 | } 96 | 97 | stopCh := make(chan struct{}) 98 | defer close(stopCh) 99 | 100 | go emitLogStoreMetrics(s, []string{"foo"}, time.Millisecond, stopCh) 101 | 102 | // Wait for at least one interval 103 | time.Sleep(5 * time.Millisecond) 104 | 105 | got := getCurrentGaugeValue(t, sink, "raft.test.foo.oldestLogAge") 106 | 107 | // Assert the age is in a reasonable range. 108 | if got > float32(time.Since(start).Milliseconds()) { 109 | t.Fatalf("max age before test start: %v", got) 110 | } 111 | 112 | if got < 1 { 113 | t.Fatalf("max age less than interval: %v", got) 114 | } 115 | } 116 | 117 | func testSetupMetrics(t *testing.T) *metrics.InmemSink { 118 | // Record for ages (5 mins) so we can be confident that our assertions won't 119 | // fail on silly long test runs due to dropped data. 
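// NewInmemSink(interval, retain): aggregate into 10-second buckets, kept for 5 minutes.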
120 | s := metrics.NewInmemSink(10*time.Second, 300*time.Second) 121 | cfg := metrics.DefaultConfig("raft.test") 122 | cfg.EnableHostname = false 123 | metrics.NewGlobal(cfg, s) 124 | return s 125 | } 126 | 127 | func getCurrentGaugeValue(t *testing.T, sink *metrics.InmemSink, name string) float32 { 128 | t.Helper() 129 | 130 | data := sink.Data() 131 | 132 | // Loop backward through intervals until there is a non-empty one 133 | // Addresses flakiness around recording to one interval but accessing during the next 134 | for i := len(data) - 1; i >= 0; i-- { 135 | currentInterval := data[i] 136 | 137 | currentInterval.RLock() 138 | if gv, ok := currentInterval.Gauges[name]; ok { 139 | currentInterval.RUnlock() 140 | return gv.Value 141 | } 142 | currentInterval.RUnlock() 143 | } 144 | 145 | // Debug print all the gauges 146 | buf := bytes.NewBuffer(nil) 147 | for _, intv := range data { 148 | intv.RLock() 149 | for name, val := range intv.Gauges { 150 | fmt.Fprintf(buf, "[%v][G] '%s': %0.3f\n", intv.Interval, name, val.Value) 151 | } 152 | intv.RUnlock() 153 | } 154 | t.Log(buf.String()) 155 | 156 | t.Fatalf("didn't find gauge %q", name) 157 | return 0 158 | } 159 | -------------------------------------------------------------------------------- /membership.md: -------------------------------------------------------------------------------- 1 | Simon (@superfell) and I (@ongardie) talked through reworking this library's cluster membership changes last Friday. We don't see a way to split this into independent patches, so we're taking the next best approach: submitting the plan here for review, then working on an enormous PR. Your feedback would be appreciated. (@superfell is out this week, however, so don't expect him to respond quickly.) 2 | 3 | These are the main goals: 4 | - Bringing things in line with the description in my PhD dissertation; 5 | - Catching up new servers prior to granting them a vote, as well as allowing permanent non-voting members; and 6 | - Eliminating the `peers.json` file, to avoid issues of consistency between that and the log/snapshot. 7 | 8 | ## Data-centric view 9 | 10 | We propose to re-define a *configuration* as a set of servers, where each server includes an address (as it does today) and a mode that is either: 11 | - *Voter*: a server whose vote is counted in elections and whose match index is used in advancing the leader's commit index. 12 | - *Nonvoter*: a server that receives log entries but is not considered for elections or commitment purposes. 13 | - *Staging*: a server that acts like a nonvoter with one exception: once a staging server receives enough log entries to catch up sufficiently to the leader's log, the leader will invoke a membership change to change the staging server to a voter. 14 | 15 | All changes to the configuration will be done by writing a new configuration to the log. The new configuration will be in affect as soon as it is appended to the log (not when it is committed like a normal state machine command). Note that, per my dissertation, there can be at most one uncommitted configuration at a time (the next configuration may not be created until the prior one has been committed). It's not strictly necessary to follow these same rules for the nonvoter/staging servers, but we think its best to treat all changes uniformly. 16 | 17 | Each server will track two configurations: 18 | 1. its *committed configuration*: the latest configuration in the log/snapshot that has been committed, along with its index. 19 | 2. 
its *latest configuration*: the latest configuration in the log/snapshot (may be committed or uncommitted), along with its index. 20 | 21 | When there's no membership change happening, these two will be the same. The latest configuration is almost always the one used, except: 22 | - When followers truncate the suffix of their logs, they may need to fall back to the committed configuration. 23 | - When snapshotting, the committed configuration is written, to correspond with the committed log prefix that is being snapshotted. 24 | 25 | 26 | ## Application API 27 | 28 | We propose the following operations for clients to manipulate the cluster configuration: 29 | - AddVoter: server becomes staging unless voter, 30 | - AddNonvoter: server becomes nonvoter unless staging or voter, 31 | - DemoteVoter: server becomes nonvoter unless absent, 32 | - RemovePeer: server removed from configuration, 33 | - GetConfiguration: waits for latest config to commit, returns committed config. 34 | 35 | This diagram, of which I'm quite proud, shows the possible transitions: 36 | ``` 37 | +-----------------------------------------------------------------------------+ 38 | | | 39 | | Start -> +--------+ | 40 | | ,------<------------| | | 41 | | / | absent | | 42 | | / RemovePeer--> | | <---RemovePeer | 43 | | / | +--------+ \ | 44 | | / | | \ | 45 | | AddNonvoter | AddVoter \ | 46 | | | ,->---' `--<-. | \ | 47 | | v / \ v \ | 48 | | +----------+ +----------+ +----------+ | 49 | | | | ---AddVoter--> | | -log caught up --> | | | 50 | | | nonvoter | | staging | | voter | | 51 | | | | <-DemoteVoter- | | ,- | | | 52 | | +----------+ \ +----------+ / +----------+ | 53 | | \ / | 54 | | `--------------<---------------' | 55 | | | 56 | +-----------------------------------------------------------------------------+ 57 | ``` 58 | 59 | While these operations aren't quite symmetric, we think they're a good set to capture 60 | the possible intent of the user. For example, if I want to make sure a server doesn't have a vote, but the server isn't part of the configuration at all, it probably shouldn't be added as a nonvoting server. 61 | 62 | Each of these application-level operations will be interpreted by the leader and, if it has an effect, will cause the leader to write a new configuration entry to its log. Which particular application-level operation caused the log entry to be written need not be part of the log entry. 63 | 64 | ## Code implications 65 | 66 | This is a non-exhaustive list, but we came up with a few things: 67 | - Remove the PeerStore: the `peers.json` file introduces the possibility of getting out of sync with the log and snapshot, and it's hard to maintain this atomically as the log changes. It's not clear whether it's meant to track the committed or latest configuration, either. 68 | - Servers will have to search their snapshot and log to find the committed configuration and the latest configuration on startup. 69 | - Bootstrap will no longer use `peers.json` but should initialize the log or snapshot with an application-provided configuration entry. 70 | - Snapshots should store the index of their configuration along with the configuration itself. In my experience with LogCabin, the original log index of the configuration is very useful to include in debug log messages. 71 | - As noted in hashicorp/raft#84, configuration change requests should come in via a separate channel, and one may not proceed until the last has been committed. 
72 | - As to deciding when a log is sufficiently caught up, implementing a sophisticated algorithm *is* something that can be done in a separate PR. An easy and decent placeholder is: once the staging server has reached 95% of the leader's commit index, promote it. 73 | 74 | ## Feedback 75 | 76 | Again, we're looking for feedback here before we start working on this. Here are some questions to think about: 77 | - Does this seem like where we want things to go? 78 | - Is there anything here that should be left out? 79 | - Is there anything else we're forgetting about? 80 | - Is there a good way to break this up? 81 | - What do we need to worry about in terms of backwards compatibility? 82 | - What implication will this have on current tests? 83 | - What's the best way to test this code, in particular the small changes that will be sprinkled all over the library? 84 | -------------------------------------------------------------------------------- /observer.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) HashiCorp, Inc. 2 | // SPDX-License-Identifier: MPL-2.0 3 | 4 | package raft 5 | 6 | import ( 7 | "sync/atomic" 8 | "time" 9 | ) 10 | 11 | // Observation is sent along the given channel to observers when an event occurs. 12 | type Observation struct { 13 | // Raft holds the Raft instance generating the observation. 14 | Raft *Raft 15 | // Data holds observation-specific data. Possible types are 16 | // RequestVoteRequest 17 | // RaftState 18 | // PeerObservation 19 | // LeaderObservation 20 | Data interface{} 21 | } 22 | 23 | // LeaderObservation is used for the data when leadership changes. 24 | type LeaderObservation struct { 25 | // DEPRECATED The LeaderAddr field should now be used 26 | Leader ServerAddress 27 | LeaderAddr ServerAddress 28 | LeaderID ServerID 29 | } 30 | 31 | // PeerObservation is sent to observers when peers change. 32 | type PeerObservation struct { 33 | Removed bool 34 | Peer Server 35 | } 36 | 37 | // FailedHeartbeatObservation is sent when a node fails to heartbeat with the leader 38 | type FailedHeartbeatObservation struct { 39 | PeerID ServerID 40 | LastContact time.Time 41 | } 42 | 43 | // ResumedHeartbeatObservation is sent when a node resumes to heartbeat with the leader following failures 44 | type ResumedHeartbeatObservation struct { 45 | PeerID ServerID 46 | } 47 | 48 | // nextObserverId is used to provide a unique ID for each observer to aid in 49 | // deregistration. 50 | var nextObserverID uint64 51 | 52 | // FilterFn is a function that can be registered in order to filter observations. 53 | // The function reports whether the observation should be included - if 54 | // it returns false, the observation will be filtered out. 55 | type FilterFn func(o *Observation) bool 56 | 57 | // Observer describes what to do with a given observation. 58 | type Observer struct { 59 | // numObserved and numDropped are performance counters for this observer. 60 | // 64 bit types must be 64 bit aligned to use with atomic operations on 61 | // 32 bit platforms, so keep them at the top of the struct. 62 | numObserved uint64 63 | numDropped uint64 64 | 65 | // channel receives observations. 66 | channel chan Observation 67 | 68 | // blocking, if true, will cause Raft to block when sending an observation 69 | // to this observer. This should generally be set to false. 70 | blocking bool 71 | 72 | // filter will be called to determine if an observation should be sent to 73 | // the channel. 
74 | filter FilterFn 75 | 76 | // id is the ID of this observer in the Raft map. 77 | id uint64 78 | } 79 | 80 | // NewObserver creates a new observer that can be registered 81 | // to make observations on a Raft instance. Observations 82 | // will be sent on the given channel if they satisfy the 83 | // given filter. 84 | // 85 | // If blocking is true, the observer will block when it can't 86 | // send on the channel, otherwise it may discard events. 87 | func NewObserver(channel chan Observation, blocking bool, filter FilterFn) *Observer { 88 | return &Observer{ 89 | channel: channel, 90 | blocking: blocking, 91 | filter: filter, 92 | id: atomic.AddUint64(&nextObserverID, 1), 93 | } 94 | } 95 | 96 | // GetNumObserved returns the number of observations. 97 | func (or *Observer) GetNumObserved() uint64 { 98 | return atomic.LoadUint64(&or.numObserved) 99 | } 100 | 101 | // GetNumDropped returns the number of dropped observations due to blocking. 102 | func (or *Observer) GetNumDropped() uint64 { 103 | return atomic.LoadUint64(&or.numDropped) 104 | } 105 | 106 | // RegisterObserver registers a new observer. 107 | func (r *Raft) RegisterObserver(or *Observer) { 108 | r.observersLock.Lock() 109 | defer r.observersLock.Unlock() 110 | r.observers[or.id] = or 111 | } 112 | 113 | // DeregisterObserver deregisters an observer. 114 | func (r *Raft) DeregisterObserver(or *Observer) { 115 | r.observersLock.Lock() 116 | defer r.observersLock.Unlock() 117 | delete(r.observers, or.id) 118 | } 119 | 120 | // observe sends an observation to every observer. 121 | func (r *Raft) observe(o interface{}) { 122 | // In general observers should not block. But in any case this isn't 123 | // disastrous as we only hold a read lock, which merely prevents 124 | // registration / deregistration of observers. 125 | r.observersLock.RLock() 126 | defer r.observersLock.RUnlock() 127 | for _, or := range r.observers { 128 | // It's wasteful to do this in the loop, but for the common case 129 | // where there are no observers we won't create any objects. 130 | ob := Observation{Raft: r, Data: o} 131 | if or.filter != nil && !or.filter(&ob) { 132 | continue 133 | } 134 | if or.channel == nil { 135 | continue 136 | } 137 | if or.blocking { 138 | or.channel <- ob 139 | atomic.AddUint64(&or.numObserved, 1) 140 | } else { 141 | select { 142 | case or.channel <- ob: 143 | atomic.AddUint64(&or.numObserved, 1) 144 | default: 145 | atomic.AddUint64(&or.numDropped, 1) 146 | } 147 | } 148 | } 149 | } 150 | -------------------------------------------------------------------------------- /peersjson.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) HashiCorp, Inc. 2 | // SPDX-License-Identifier: MPL-2.0 3 | 4 | package raft 5 | 6 | import ( 7 | "bytes" 8 | "encoding/json" 9 | "os" 10 | ) 11 | 12 | // ReadPeersJSON consumes a legacy peers.json file in the format of the old JSON 13 | // peer store and creates a new-style configuration structure. This can be used 14 | // to migrate this data or perform manual recovery when running protocol versions 15 | // that can interoperate with older, unversioned Raft servers. This should not be 16 | // used once server IDs are in use, because the old peers.json file didn't have 17 | // support for these, nor non-voter suffrage types. 18 | func ReadPeersJSON(path string) (Configuration, error) { 19 | // Read in the file. 20 | buf, err := os.ReadFile(path) 21 | if err != nil { 22 | return Configuration{}, err 23 | } 24 | 25 | // Parse it as JSON. 
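// The legacy file is a bare JSON array of "host:port" strings, for example:
//
//	["10.0.0.1:8300", "10.0.0.2:8300", "10.0.0.3:8300"]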
26 | var peers []string 27 | dec := json.NewDecoder(bytes.NewReader(buf)) 28 | if err := dec.Decode(&peers); err != nil { 29 | return Configuration{}, err 30 | } 31 | 32 | // Map it into the new-style configuration structure. We can only specify 33 | // voter roles here, and the ID has to be the same as the address. 34 | var configuration Configuration 35 | for _, peer := range peers { 36 | server := Server{ 37 | Suffrage: Voter, 38 | ID: ServerID(peer), 39 | Address: ServerAddress(peer), 40 | } 41 | configuration.Servers = append(configuration.Servers, server) 42 | } 43 | 44 | // We should only ingest valid configurations. 45 | if err := checkConfiguration(configuration); err != nil { 46 | return Configuration{}, err 47 | } 48 | return configuration, nil 49 | } 50 | 51 | // configEntry is used when decoding a new-style peers.json. 52 | type configEntry struct { 53 | // ID is the ID of the server (a UUID, usually). 54 | ID ServerID `json:"id"` 55 | 56 | // Address is the host:port of the server. 57 | Address ServerAddress `json:"address"` 58 | 59 | // NonVoter controls the suffrage. We choose this sense so people 60 | // can leave this out and get a Voter by default. 61 | NonVoter bool `json:"non_voter"` 62 | } 63 | 64 | // ReadConfigJSON reads a new-style peers.json and returns a configuration 65 | // structure. This can be used to perform manual recovery when running protocol 66 | // versions that use server IDs. 67 | func ReadConfigJSON(path string) (Configuration, error) { 68 | // Read in the file. 69 | buf, err := os.ReadFile(path) 70 | if err != nil { 71 | return Configuration{}, err 72 | } 73 | 74 | // Parse it as JSON. 75 | var peers []configEntry 76 | dec := json.NewDecoder(bytes.NewReader(buf)) 77 | if err := dec.Decode(&peers); err != nil { 78 | return Configuration{}, err 79 | } 80 | 81 | // Map it into the new-style configuration structure. 82 | var configuration Configuration 83 | for _, peer := range peers { 84 | suffrage := Voter 85 | if peer.NonVoter { 86 | suffrage = Nonvoter 87 | } 88 | server := Server{ 89 | Suffrage: suffrage, 90 | ID: peer.ID, 91 | Address: peer.Address, 92 | } 93 | configuration.Servers = append(configuration.Servers, server) 94 | } 95 | 96 | // We should only ingest valid configurations. 97 | if err := checkConfiguration(configuration); err != nil { 98 | return Configuration{}, err 99 | } 100 | return configuration, nil 101 | } 102 | -------------------------------------------------------------------------------- /peersjson_test.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) HashiCorp, Inc. 
2 | // SPDX-License-Identifier: MPL-2.0 3 | 4 | package raft 5 | 6 | import ( 7 | "os" 8 | "path/filepath" 9 | "reflect" 10 | "strings" 11 | "testing" 12 | ) 13 | 14 | func TestPeersJSON_BadConfiguration(t *testing.T) { 15 | var err error 16 | var base string 17 | base, err = os.MkdirTemp("", "") 18 | if err != nil { 19 | t.Fatalf("err: %v", err) 20 | } 21 | defer os.RemoveAll(base) 22 | 23 | peers := filepath.Join(base, "peers.json") 24 | if err = os.WriteFile(peers, []byte("null"), 0o666); err != nil { 25 | t.Fatalf("err: %v", err) 26 | } 27 | 28 | _, err = ReadPeersJSON(peers) 29 | if err == nil || !strings.Contains(err.Error(), "at least one voter") { 30 | t.Fatalf("err: %v", err) 31 | } 32 | } 33 | 34 | func TestPeersJSON_ReadPeersJSON(t *testing.T) { 35 | var err error 36 | var base string 37 | base, err = os.MkdirTemp("", "") 38 | if err != nil { 39 | t.Fatalf("err: %v", err) 40 | } 41 | defer os.RemoveAll(base) 42 | 43 | content := []byte(` 44 | ["127.0.0.1:123", 45 | "127.0.0.2:123", 46 | "127.0.0.3:123"] 47 | `) 48 | peers := filepath.Join(base, "peers.json") 49 | if err = os.WriteFile(peers, content, 0o666); err != nil { 50 | t.Fatalf("err: %v", err) 51 | } 52 | var configuration Configuration 53 | configuration, err = ReadPeersJSON(peers) 54 | if err != nil { 55 | t.Fatalf("err: %v", err) 56 | } 57 | 58 | expected := Configuration{ 59 | Servers: []Server{ 60 | { 61 | Suffrage: Voter, 62 | ID: ServerID("127.0.0.1:123"), 63 | Address: ServerAddress("127.0.0.1:123"), 64 | }, 65 | { 66 | Suffrage: Voter, 67 | ID: ServerID("127.0.0.2:123"), 68 | Address: ServerAddress("127.0.0.2:123"), 69 | }, 70 | { 71 | Suffrage: Voter, 72 | ID: ServerID("127.0.0.3:123"), 73 | Address: ServerAddress("127.0.0.3:123"), 74 | }, 75 | }, 76 | } 77 | if !reflect.DeepEqual(configuration, expected) { 78 | t.Fatalf("bad configuration: %+v != %+v", configuration, expected) 79 | } 80 | } 81 | 82 | func TestPeersJSON_ReadConfigJSON(t *testing.T) { 83 | var err error 84 | var base string 85 | base, err = os.MkdirTemp("", "") 86 | if err != nil { 87 | t.Fatalf("err: %v", err) 88 | } 89 | defer os.RemoveAll(base) 90 | 91 | content := []byte(` 92 | [ 93 | { 94 | "id": "adf4238a-882b-9ddc-4a9d-5b6758e4159e", 95 | "address": "127.0.0.1:123", 96 | "non_voter": false 97 | }, 98 | { 99 | "id": "8b6dda82-3103-11e7-93ae-92361f002671", 100 | "address": "127.0.0.2:123" 101 | }, 102 | { 103 | "id": "97e17742-3103-11e7-93ae-92361f002671", 104 | "address": "127.0.0.3:123", 105 | "non_voter": true 106 | } 107 | ] 108 | `) 109 | peers := filepath.Join(base, "peers.json") 110 | if err = os.WriteFile(peers, content, 0o666); err != nil { 111 | t.Fatalf("err: %v", err) 112 | } 113 | 114 | var configuration Configuration 115 | configuration, err = ReadConfigJSON(peers) 116 | if err != nil { 117 | t.Fatalf("err: %v", err) 118 | } 119 | 120 | expected := Configuration{ 121 | Servers: []Server{ 122 | { 123 | Suffrage: Voter, 124 | ID: ServerID("adf4238a-882b-9ddc-4a9d-5b6758e4159e"), 125 | Address: ServerAddress("127.0.0.1:123"), 126 | }, 127 | { 128 | Suffrage: Voter, 129 | ID: ServerID("8b6dda82-3103-11e7-93ae-92361f002671"), 130 | Address: ServerAddress("127.0.0.2:123"), 131 | }, 132 | { 133 | Suffrage: Nonvoter, 134 | ID: ServerID("97e17742-3103-11e7-93ae-92361f002671"), 135 | Address: ServerAddress("127.0.0.3:123"), 136 | }, 137 | }, 138 | } 139 | if !reflect.DeepEqual(configuration, expected) { 140 | t.Fatalf("bad configuration: %+v != %+v", configuration, expected) 141 | } 142 | } 143 | 
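For orientation, a minimal sketch of an operator-side tool that consumes a new-style peers.json via ReadConfigJSON before a manual recovery. The package layout, file path, and printing here are illustrative assumptions; only ReadConfigJSON and the Configuration/Server fields come from this library:

```go
package main

import (
	"fmt"
	"log"

	"github.com/hashicorp/raft"
)

func main() {
	// Hypothetical location; in practice this file sits in the server's raft data directory.
	configuration, err := raft.ReadConfigJSON("/var/lib/myapp/raft/peers.json")
	if err != nil {
		log.Fatalf("reading peers.json: %v", err)
	}
	for _, s := range configuration.Servers {
		fmt.Printf("id=%s address=%s suffrage=%v\n", s.ID, s.Address, s.Suffrage)
	}
}
```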
-------------------------------------------------------------------------------- /progress.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) HashiCorp, Inc. 2 | // SPDX-License-Identifier: MPL-2.0 3 | 4 | package raft 5 | 6 | import ( 7 | "context" 8 | "io" 9 | "sync" 10 | "time" 11 | 12 | hclog "github.com/hashicorp/go-hclog" 13 | ) 14 | 15 | const ( 16 | snapshotRestoreMonitorInterval = 10 * time.Second 17 | ) 18 | 19 | type snapshotRestoreMonitor struct { 20 | logger hclog.Logger 21 | cr CountingReader 22 | size int64 23 | networkTransfer bool 24 | 25 | once sync.Once 26 | cancel func() 27 | doneCh chan struct{} 28 | } 29 | 30 | func startSnapshotRestoreMonitor( 31 | logger hclog.Logger, 32 | cr CountingReader, 33 | size int64, 34 | networkTransfer bool, 35 | ) *snapshotRestoreMonitor { 36 | ctx, cancel := context.WithCancel(context.Background()) 37 | 38 | m := &snapshotRestoreMonitor{ 39 | logger: logger, 40 | cr: cr, 41 | size: size, 42 | networkTransfer: networkTransfer, 43 | cancel: cancel, 44 | doneCh: make(chan struct{}), 45 | } 46 | go m.run(ctx) 47 | return m 48 | } 49 | 50 | func (m *snapshotRestoreMonitor) run(ctx context.Context) { 51 | defer close(m.doneCh) 52 | 53 | ticker := time.NewTicker(snapshotRestoreMonitorInterval) 54 | defer ticker.Stop() 55 | 56 | ranOnce := false 57 | for { 58 | select { 59 | case <-ctx.Done(): 60 | if !ranOnce { 61 | m.runOnce() 62 | } 63 | return 64 | case <-ticker.C: 65 | m.runOnce() 66 | ranOnce = true 67 | } 68 | } 69 | } 70 | 71 | func (m *snapshotRestoreMonitor) runOnce() { 72 | readBytes := m.cr.Count() 73 | pct := float64(100*readBytes) / float64(m.size) 74 | 75 | message := "snapshot restore progress" 76 | if m.networkTransfer { 77 | message = "snapshot network transfer progress" 78 | } 79 | 80 | m.logger.Info(message, 81 | "read-bytes", readBytes, 82 | "percent-complete", hclog.Fmt("%0.2f%%", pct), 83 | ) 84 | } 85 | 86 | func (m *snapshotRestoreMonitor) StopAndWait() { 87 | m.once.Do(func() { 88 | m.cancel() 89 | <-m.doneCh 90 | }) 91 | } 92 | 93 | type CountingReader interface { 94 | io.Reader 95 | Count() int64 96 | } 97 | 98 | type countingReader struct { 99 | reader io.Reader 100 | 101 | mu sync.Mutex 102 | bytes int64 103 | } 104 | 105 | func (r *countingReader) Read(p []byte) (n int, err error) { 106 | n, err = r.reader.Read(p) 107 | r.mu.Lock() 108 | r.bytes += int64(n) 109 | r.mu.Unlock() 110 | return n, err 111 | } 112 | 113 | func (r *countingReader) Count() int64 { 114 | r.mu.Lock() 115 | defer r.mu.Unlock() 116 | return r.bytes 117 | } 118 | 119 | func newCountingReader(r io.Reader) *countingReader { 120 | return &countingReader{reader: r} 121 | } 122 | 123 | type countingReadCloser struct { 124 | *countingReader 125 | readCloser io.ReadCloser 126 | } 127 | 128 | func newCountingReadCloser(rc io.ReadCloser) *countingReadCloser { 129 | return &countingReadCloser{ 130 | countingReader: newCountingReader(rc), 131 | readCloser: rc, 132 | } 133 | } 134 | 135 | func (c countingReadCloser) Close() error { 136 | return c.readCloser.Close() 137 | } 138 | 139 | func (c countingReadCloser) WrappedReadCloser() io.ReadCloser { 140 | return c.readCloser 141 | } 142 | 143 | // ReadCloserWrapper allows access to an underlying ReadCloser from a wrapper. 
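// For example, the counting wrapper defined above can be unwrapped again once a
// monitored restore has finished (rc stands in for any io.ReadCloser):
//
//	crc := newCountingReadCloser(rc)
//	// ... stream the snapshot through crc, reporting crc.Count() periodically ...
//	inner := crc.WrappedReadCloser() // the original rc that was passed in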
144 | type ReadCloserWrapper interface { 145 | io.ReadCloser 146 | WrappedReadCloser() io.ReadCloser 147 | } 148 | 149 | var _ ReadCloserWrapper = &countingReadCloser{} 150 | -------------------------------------------------------------------------------- /raft-compat/go.mod: -------------------------------------------------------------------------------- 1 | module github.com/hashicorp/raft/compat 2 | 3 | go 1.20 4 | 5 | require github.com/stretchr/testify v1.8.4 6 | 7 | require ( 8 | github.com/armon/go-metrics v0.4.1 // indirect 9 | github.com/fatih/color v1.13.0 // indirect 10 | github.com/hashicorp/go-hclog v1.6.2 // indirect 11 | github.com/hashicorp/go-immutable-radix v1.0.0 // indirect 12 | github.com/hashicorp/go-msgpack v0.5.5 // indirect 13 | github.com/hashicorp/go-msgpack/v2 v2.1.1 // indirect 14 | github.com/hashicorp/golang-lru v0.5.0 // indirect 15 | github.com/mattn/go-colorable v0.1.12 // indirect 16 | github.com/mattn/go-isatty v0.0.14 // indirect 17 | golang.org/x/sys v0.13.0 // indirect 18 | ) 19 | 20 | replace github.com/hashicorp/raft-previous-version => ./raft-previous-version 21 | 22 | replace github.com/hashicorp/raft => ../ 23 | 24 | require ( 25 | github.com/davecgh/go-spew v1.1.1 // indirect 26 | github.com/hashicorp/raft v1.6.1 27 | github.com/hashicorp/raft-previous-version v1.2.0 28 | github.com/pmezard/go-difflib v1.0.0 // indirect 29 | gopkg.in/yaml.v3 v3.0.1 // indirect 30 | ) 31 | -------------------------------------------------------------------------------- /raft-compat/testcluster/cluster.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) HashiCorp, Inc. 2 | // SPDX-License-Identifier: MPL-2.0 3 | 4 | package testcluster 5 | 6 | import ( 7 | "fmt" 8 | "github.com/hashicorp/raft" 9 | raftprevious "github.com/hashicorp/raft-previous-version" 10 | "github.com/stretchr/testify/require" 11 | "testing" 12 | "time" 13 | ) 14 | 15 | type RaftUIT struct { 16 | raft *raft.Raft 17 | trans *raft.NetworkTransport 18 | Config *raft.Config 19 | Store *raft.InmemStore 20 | Snap *raft.InmemSnapshotStore 21 | id raft.ServerID 22 | fsm *raft.MockFSM 23 | } 24 | 25 | func (r RaftUIT) NumLogs() int { 26 | return len(r.fsm.Logs()) 27 | } 28 | 29 | func (r RaftUIT) GetLocalAddr() string { 30 | return string(r.trans.LocalAddr()) 31 | } 32 | 33 | func (r RaftUIT) GetRaft() interface{} { 34 | return r.raft 35 | } 36 | 37 | func (r RaftUIT) GetStore() interface{} { 38 | return r.Store 39 | } 40 | 41 | func (r RaftUIT) GetLocalID() string { 42 | return string(r.id) 43 | } 44 | 45 | func (r RaftUIT) GetLeaderID() string { 46 | _, id := r.raft.LeaderWithID() 47 | return string(id) 48 | } 49 | 50 | func (r *RaftCluster) ID(i int) string { 51 | return r.rafts[i].GetLocalID() 52 | } 53 | func (r *RaftCluster) Addr(i int) string { 54 | return r.rafts[i].GetLocalAddr() 55 | } 56 | 57 | func (r *RaftCluster) Raft(id string) interface{} { 58 | i := r.GetIndex(id) 59 | return r.rafts[i].GetRaft() 60 | } 61 | 62 | func (r *RaftCluster) Store(id string) interface{} { 63 | i := r.GetIndex(id) 64 | return r.rafts[i].GetStore() 65 | } 66 | 67 | type RaftLatest struct { 68 | raft *raftprevious.Raft 69 | trans *raftprevious.NetworkTransport 70 | Config *raftprevious.Config 71 | Store *raftprevious.InmemStore 72 | Snap *raftprevious.InmemSnapshotStore 73 | id raftprevious.ServerID 74 | fsm *raftprevious.MockFSM 75 | } 76 | 77 | func (r RaftLatest) NumLogs() int { 78 | return len(r.fsm.Logs()) 79 | } 80 | 81 | func (r RaftLatest) 
GetLocalAddr() string { 82 | return string(r.trans.LocalAddr()) 83 | } 84 | 85 | func (r RaftLatest) GetRaft() interface{} { 86 | return r.raft 87 | } 88 | func (r RaftLatest) GetStore() interface{} { 89 | return r.Store 90 | } 91 | 92 | func (r RaftLatest) GetLocalID() string { 93 | return string(r.id) 94 | } 95 | 96 | func (r RaftLatest) GetLeaderID() string { 97 | _, id := r.raft.LeaderWithID() 98 | return string(id) 99 | } 100 | 101 | type RaftNode interface { 102 | GetLocalID() string 103 | GetLocalAddr() string 104 | GetLeaderID() string 105 | GetRaft() interface{} 106 | GetStore() interface{} 107 | NumLogs() int 108 | } 109 | 110 | type RaftCluster struct { 111 | rafts []RaftNode 112 | } 113 | 114 | func NewRaftCluster(t *testing.T, f func(t *testing.T, id string) RaftNode, count int, name string) RaftCluster { 115 | rc := RaftCluster{} 116 | rc.rafts = make([]RaftNode, count) 117 | for i := 0; i < count; i++ { 118 | rc.rafts[i] = f(t, fmt.Sprintf("%s-%d", name, i)) 119 | } 120 | return rc 121 | } 122 | 123 | func NewPreviousRaftCluster(t *testing.T, count int, name string) RaftCluster { 124 | return NewRaftCluster(t, InitPrevious, count, name) 125 | } 126 | 127 | func NewUITRaftCluster(t *testing.T, count int, name string) RaftCluster { 128 | return NewRaftCluster(t, InitUIT, count, name) 129 | } 130 | 131 | func (r *RaftCluster) GetLeader() RaftNode { 132 | for _, n := range r.rafts { 133 | if n.GetLocalID() == n.GetLeaderID() { 134 | return n 135 | } 136 | } 137 | return nil 138 | } 139 | 140 | func (r *RaftCluster) Len() int { 141 | return len(r.rafts) 142 | } 143 | 144 | func (r *RaftCluster) AddNode(node RaftNode) { 145 | r.rafts = append([]RaftNode{node}, r.rafts...) 146 | } 147 | 148 | func (r *RaftCluster) DeleteNode(id string) { 149 | i := r.GetIndex(id) 150 | r.rafts = append(r.rafts[:i], r.rafts[i+1:]...) 
151 | } 152 | 153 | func (r *RaftCluster) GetIndex(id string) int { 154 | i := 0 155 | for _, r := range r.rafts { 156 | if r.GetLocalID() == id { 157 | return i 158 | } 159 | i++ 160 | } 161 | return -1 162 | } 163 | 164 | func InitUIT(t *testing.T, id string) RaftNode { 165 | return InitUITWithStore(t, id, nil, func(config *raft.Config) {}) 166 | } 167 | 168 | func InitUITWithStore(t *testing.T, id string, store *raftprevious.InmemStore, cfgMod func(config *raft.Config)) RaftNode { 169 | node := RaftUIT{} 170 | node.Config = raft.DefaultConfig() 171 | cfgMod(node.Config) 172 | node.Config.HeartbeatTimeout = 50 * time.Millisecond 173 | node.Config.ElectionTimeout = 50 * time.Millisecond 174 | node.Config.LeaderLeaseTimeout = 50 * time.Millisecond 175 | node.Config.CommitTimeout = 5 * time.Millisecond 176 | node.id = raft.ServerID(id) 177 | node.Config.LocalID = node.id 178 | if store != nil { 179 | node.Store = convertInMemStoreToUIT(store) 180 | } else { 181 | node.Store = raft.NewInmemStore() 182 | } 183 | 184 | node.Snap = raft.NewInmemSnapshotStore() 185 | node.fsm = &raft.MockFSM{} 186 | var err error 187 | node.trans, err = raft.NewTCPTransport("localhost:0", nil, 2, time.Second, nil) 188 | require.NoError(t, err) 189 | node.raft, err = raft.NewRaft(node.Config, node.fsm, node.Store, 190 | node.Store, node.Snap, node.trans) 191 | require.NoError(t, err) 192 | return node 193 | } 194 | 195 | func InitPrevious(t *testing.T, id string) RaftNode { 196 | return InitPreviousWithStore(t, id, nil, func(config *raftprevious.Config) { 197 | }) 198 | } 199 | 200 | func InitPreviousWithStore(t *testing.T, id string, store *raft.InmemStore, f func(config *raftprevious.Config)) RaftNode { 201 | node := RaftLatest{} 202 | node.Config = raftprevious.DefaultConfig() 203 | node.Config.HeartbeatTimeout = 50 * time.Millisecond 204 | node.Config.ElectionTimeout = 50 * time.Millisecond 205 | node.Config.LeaderLeaseTimeout = 50 * time.Millisecond 206 | node.Config.CommitTimeout = 5 * time.Millisecond 207 | node.id = raftprevious.ServerID(id) 208 | node.Config.LocalID = node.id 209 | f(node.Config) 210 | 211 | if store != nil { 212 | node.Store = convertInMemStoreToPrevious(store) 213 | } else { 214 | node.Store = raftprevious.NewInmemStore() 215 | } 216 | node.Snap = raftprevious.NewInmemSnapshotStore() 217 | node.fsm = &raftprevious.MockFSM{} 218 | var err error 219 | node.trans, err = raftprevious.NewTCPTransport("localhost:0", nil, 2, time.Second, nil) 220 | require.NoError(t, err) 221 | node.raft, err = raftprevious.NewRaft(node.Config, node.fsm, node.Store, 222 | node.Store, node.Snap, node.trans) 223 | require.NoError(t, err) 224 | return node 225 | } 226 | 227 | func convertLogToUIT(ll *raftprevious.Log) *raft.Log { 228 | l := new(raft.Log) 229 | l.Index = ll.Index 230 | l.AppendedAt = ll.AppendedAt 231 | l.Type = raft.LogType(ll.Type) 232 | l.Term = ll.Term 233 | l.Data = ll.Data 234 | l.Extensions = ll.Extensions 235 | return l 236 | } 237 | func convertLogToPrevious(ll *raft.Log) *raftprevious.Log { 238 | l := new(raftprevious.Log) 239 | l.Index = ll.Index 240 | l.AppendedAt = ll.AppendedAt 241 | l.Type = raftprevious.LogType(ll.Type) 242 | l.Term = ll.Term 243 | l.Data = ll.Data 244 | l.Extensions = ll.Extensions 245 | return l 246 | } 247 | 248 | var ( 249 | keyCurrentTerm = []byte("CurrentTerm") 250 | keyLastVoteTerm = []byte("LastVoteTerm") 251 | keyLastVoteCand = []byte("LastVoteCand") 252 | ) 253 | 254 | func convertInMemStoreToPrevious(s *raft.InmemStore) *raftprevious.InmemStore { 255 | 
ss := raftprevious.NewInmemStore() 256 | fi, _ := s.FirstIndex() 257 | li, _ := s.LastIndex() 258 | for i := fi; i <= li; i++ { 259 | log := new(raft.Log) 260 | s.GetLog(i, log) 261 | ss.StoreLog(convertLogToPrevious(log)) 262 | } 263 | 264 | get, _ := s.Get(keyCurrentTerm) 265 | ss.Set(keyCurrentTerm, get) 266 | 267 | get, _ = s.Get(keyLastVoteTerm) 268 | ss.Set(keyLastVoteTerm, get) 269 | 270 | get, _ = s.Get(keyLastVoteCand) 271 | ss.Set(keyLastVoteCand, get) 272 | 273 | get64, _ := s.GetUint64(keyCurrentTerm) 274 | ss.SetUint64(keyCurrentTerm, get64) 275 | 276 | get64, _ = s.GetUint64(keyLastVoteTerm) 277 | ss.SetUint64(keyLastVoteTerm, get64) 278 | 279 | get64, _ = s.GetUint64(keyLastVoteCand) 280 | ss.SetUint64(keyLastVoteCand, get64) 281 | 282 | return ss 283 | } 284 | 285 | func convertInMemStoreToUIT(s *raftprevious.InmemStore) *raft.InmemStore { 286 | ss := raft.NewInmemStore() 287 | fi, _ := s.FirstIndex() 288 | li, _ := s.LastIndex() 289 | for i := fi; i <= li; i++ { 290 | log := new(raftprevious.Log) 291 | s.GetLog(i, log) 292 | ss.StoreLog(convertLogToUIT(log)) 293 | } 294 | 295 | get, _ := s.Get(keyCurrentTerm) 296 | ss.Set(keyCurrentTerm, get) 297 | 298 | get, _ = s.Get(keyLastVoteTerm) 299 | ss.Set(keyLastVoteTerm, get) 300 | 301 | get, _ = s.Get(keyLastVoteCand) 302 | ss.Set(keyLastVoteCand, get) 303 | 304 | get64, _ := s.GetUint64(keyCurrentTerm) 305 | ss.SetUint64(keyCurrentTerm, get64) 306 | 307 | get64, _ = s.GetUint64(keyLastVoteTerm) 308 | ss.SetUint64(keyLastVoteTerm, get64) 309 | 310 | get64, _ = s.GetUint64(keyLastVoteCand) 311 | ss.SetUint64(keyLastVoteCand, get64) 312 | 313 | return ss 314 | } 315 | -------------------------------------------------------------------------------- /raft-compat/utils/test_utils.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) HashiCorp, Inc. 2 | // SPDX-License-Identifier: MPL-2.0 3 | 4 | package utils 5 | 6 | import ( 7 | "fmt" 8 | "github.com/hashicorp/raft" 9 | raftprevious "github.com/hashicorp/raft-previous-version" 10 | "github.com/hashicorp/raft/compat/testcluster" 11 | "github.com/stretchr/testify/require" 12 | "testing" 13 | "time" 14 | ) 15 | 16 | func WaitForNewLeader(t *testing.T, oldLeader string, c testcluster.RaftCluster) { 17 | 18 | leader := func() string { 19 | for i := 0; i < c.Len(); i++ { 20 | switch r := c.Raft(c.ID(i)).(type) { 21 | case *raft.Raft: 22 | if r.State() == raft.Leader { 23 | return c.ID(i) 24 | } 25 | case *raftprevious.Raft: 26 | if r.State() == raftprevious.Leader { 27 | return c.ID(i) 28 | } 29 | } 30 | } 31 | return "" 32 | } 33 | after := time.After(5 * time.Second) 34 | ticker := time.NewTicker(100 * time.Millisecond) 35 | for { 36 | select { 37 | case <-after: 38 | t.Fatalf("timed out waiting for a new leader") 39 | case <-ticker.C: 40 | id := leader() 41 | if id != "" { 42 | if id != oldLeader || oldLeader == "" { 43 | return 44 | } 45 | } 46 | } 47 | } 48 | } 49 | 50 | type future interface { 51 | Error() error 52 | } 53 | 54 | func WaitFuture(t *testing.T, f future) { 55 | timer := time.AfterFunc(1000*time.Millisecond, func() { 56 | panic(fmt.Errorf("timeout waiting for future %v", f)) 57 | }) 58 | defer timer.Stop() 59 | require.NoError(t, f.Error()) 60 | } 61 | -------------------------------------------------------------------------------- /saturation.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) HashiCorp, Inc.
2 | // SPDX-License-Identifier: MPL-2.0 3 | 4 | package raft 5 | 6 | import ( 7 | "math" 8 | "time" 9 | 10 | "github.com/hashicorp/go-metrics/compat" 11 | ) 12 | 13 | // saturationMetric measures the saturation (percentage of time spent working vs 14 | // waiting for work) of an event processing loop, such as runFSM. It reports the 15 | // saturation as a gauge metric (at most) once every reportInterval. 16 | // 17 | // Callers must instrument their loop with calls to sleeping and working, starting 18 | // with a call to sleeping. 19 | // 20 | // Note: the caller must be single-threaded and saturationMetric is not safe for 21 | // concurrent use by multiple goroutines. 22 | type saturationMetric struct { 23 | reportInterval time.Duration 24 | 25 | // slept contains time for which the event processing loop was sleeping rather 26 | // than working in the period since lastReport. 27 | slept time.Duration 28 | 29 | // lost contains time that is considered lost due to incorrect use of 30 | // saturationMetric (e.g. calling sleeping() or working() multiple 31 | // times in succession) in the period since lastReport. 32 | lost time.Duration 33 | 34 | lastReport, sleepBegan, workBegan time.Time 35 | 36 | // These are overwritten in tests. 37 | nowFn func() time.Time 38 | reportFn func(float32) 39 | } 40 | 41 | // newSaturationMetric creates a saturationMetric that will update the gauge 42 | // with the given name at the given reportInterval, based on the time spent 43 | // sleeping versus working since the last report. 44 | func newSaturationMetric(name []string, reportInterval time.Duration) *saturationMetric { 45 | m := &saturationMetric{ 46 | reportInterval: reportInterval, 47 | nowFn: time.Now, 48 | lastReport: time.Now(), 49 | reportFn: func(sat float32) { metrics.AddSample(name, sat) }, 50 | } 51 | return m 52 | } 53 | 54 | // sleeping records the time at which the loop began waiting for work. After the 55 | // initial call it must always be preceded by a call to working. 56 | func (s *saturationMetric) sleeping() { 57 | now := s.nowFn() 58 | 59 | if !s.sleepBegan.IsZero() { 60 | // sleeping called twice in succession. Count that time as lost rather than 61 | // measuring nonsense. 62 | s.lost += now.Sub(s.sleepBegan) 63 | } 64 | 65 | s.sleepBegan = now 66 | s.workBegan = time.Time{} 67 | s.report() 68 | } 69 | 70 | // working records the time at which the loop began working. It must always be 71 | // preceded by a call to sleeping. 72 | func (s *saturationMetric) working() { 73 | now := s.nowFn() 74 | 75 | if s.workBegan.IsZero() { 76 | if s.sleepBegan.IsZero() { 77 | // working called before the initial call to sleeping. Count that time as 78 | // lost rather than measuring nonsense. 79 | s.lost += now.Sub(s.lastReport) 80 | } else { 81 | s.slept += now.Sub(s.sleepBegan) 82 | } 83 | } else { 84 | // working called twice in succession. Count that time as lost rather than 85 | // measuring nonsense. 86 | s.lost += now.Sub(s.workBegan) 87 | } 88 | 89 | s.workBegan = now 90 | s.sleepBegan = time.Time{} 91 | s.report() 92 | } 93 | 94 | // report updates the gauge if reportInterval has passed since our last report.
95 | func (s *saturationMetric) report() { 96 | now := s.nowFn() 97 | timeSinceLastReport := now.Sub(s.lastReport) 98 | 99 | if timeSinceLastReport < s.reportInterval { 100 | return 101 | } 102 | 103 | var saturation float64 104 | total := timeSinceLastReport - s.lost 105 | if total != 0 { 106 | saturation = float64(total-s.slept) / float64(total) 107 | saturation = math.Round(saturation*100) / 100 108 | } 109 | s.reportFn(float32(saturation)) 110 | 111 | s.slept = 0 112 | s.lost = 0 113 | s.lastReport = now 114 | } 115 | -------------------------------------------------------------------------------- /saturation_test.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) HashiCorp, Inc. 2 | // SPDX-License-Identifier: MPL-2.0 3 | 4 | package raft 5 | 6 | import ( 7 | "testing" 8 | "time" 9 | 10 | "github.com/stretchr/testify/require" 11 | ) 12 | 13 | func TestSaturationMetric(t *testing.T) { 14 | t.Run("without smoothing", func(t *testing.T) { 15 | sat := newSaturationMetric([]string{"metric"}, 100*time.Millisecond) 16 | 17 | now := sat.lastReport 18 | sat.nowFn = func() time.Time { return now } 19 | 20 | var reported float32 21 | sat.reportFn = func(val float32) { reported = val } 22 | 23 | sat.sleeping() 24 | 25 | // First window: 50ms sleeping + 75ms working. 26 | now = now.Add(50 * time.Millisecond) 27 | sat.working() 28 | 29 | now = now.Add(75 * time.Millisecond) 30 | sat.sleeping() 31 | 32 | // Should be 60% saturation. 33 | require.Equal(t, float32(0.6), reported) 34 | 35 | // Second window: 90ms sleeping + 10ms working. 36 | now = now.Add(90 * time.Millisecond) 37 | sat.working() 38 | 39 | now = now.Add(10 * time.Millisecond) 40 | sat.sleeping() 41 | 42 | // Should be 10% saturation. 43 | require.Equal(t, float32(0.1), reported) 44 | 45 | // Third window: 100ms sleeping + 0ms working. 46 | now = now.Add(100 * time.Millisecond) 47 | sat.working() 48 | 49 | // Should be 0% saturation. 50 | require.Equal(t, float32(0), reported) 51 | }) 52 | } 53 | 54 | func TestSaturationMetric_IncorrectUsage(t *testing.T) { 55 | t.Run("calling sleeping() consecutively", func(t *testing.T) { 56 | sat := newSaturationMetric([]string{"metric"}, 50*time.Millisecond) 57 | 58 | now := sat.lastReport 59 | sat.nowFn = func() time.Time { return now } 60 | 61 | var reported float32 62 | sat.reportFn = func(v float32) { reported = v } 63 | 64 | // Calling sleeping() consecutively should reset sleepBegan without recording 65 | // a sample, such that we "lose" time rather than recording nonsense data. 66 | // 67 | // 0 | sleeping() | 68 | // => Sleeping (10ms) 69 | // +10ms | working() | 70 | // => Working (10ms) 71 | // +20ms | sleeping() | 72 | // => [!] LOST [!] (10ms) 73 | // +30ms | sleeping() | 74 | // => Sleeping (10ms) 75 | // +40ms | working() | 76 | // => Working (10ms) 77 | // +50ms | sleeping() | 78 | // 79 | // Total reportable time: 40ms. Saturation: 50%. 
80 | sat.sleeping() 81 | now = now.Add(10 * time.Millisecond) 82 | sat.working() 83 | now = now.Add(10 * time.Millisecond) 84 | sat.sleeping() 85 | now = now.Add(10 * time.Millisecond) 86 | sat.sleeping() 87 | now = now.Add(10 * time.Millisecond) 88 | sat.working() 89 | now = now.Add(10 * time.Millisecond) 90 | sat.sleeping() 91 | 92 | require.Equal(t, float32(0.5), reported) 93 | }) 94 | 95 | t.Run("calling working() consecutively", func(t *testing.T) { 96 | sat := newSaturationMetric([]string{"metric"}, 30*time.Millisecond) 97 | 98 | now := sat.lastReport 99 | sat.nowFn = func() time.Time { return now } 100 | 101 | var reported float32 102 | sat.reportFn = func(v float32) { reported = v } 103 | 104 | // Calling working() consecutively should reset workBegan without recording 105 | // a sample, such that we "lose" time rather than recording nonsense data. 106 | // 107 | // 0 | sleeping() | 108 | // => Sleeping (10ms) 109 | // +10ms | working() | 110 | // => [!] LOST [!] (10ms) 111 | // +20ms | working() | 112 | // => Working (10ms) 113 | // +30ms | sleeping() | 114 | // 115 | // Total reportable time: 20ms. Saturation: 50%. 116 | sat.sleeping() 117 | now = now.Add(10 * time.Millisecond) 118 | sat.working() 119 | now = now.Add(10 * time.Millisecond) 120 | sat.working() 121 | now = now.Add(10 * time.Millisecond) 122 | sat.sleeping() 123 | 124 | require.Equal(t, float32(0.5), reported) 125 | }) 126 | 127 | t.Run("calling working() first", func(t *testing.T) { 128 | sat := newSaturationMetric([]string{"metric"}, 10*time.Millisecond) 129 | 130 | now := sat.lastReport 131 | sat.nowFn = func() time.Time { return now } 132 | 133 | var reported float32 134 | sat.reportFn = func(v float32) { reported = v } 135 | 136 | // Time from start until working() is treated as lost. 137 | sat.working() 138 | require.Equal(t, float32(0), reported) 139 | 140 | sat.sleeping() 141 | now = now.Add(5 * time.Millisecond) 142 | sat.working() 143 | now = now.Add(5 * time.Millisecond) 144 | sat.sleeping() 145 | require.Equal(t, float32(0.5), reported) 146 | }) 147 | } 148 | -------------------------------------------------------------------------------- /stable.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) HashiCorp, Inc. 2 | // SPDX-License-Identifier: MPL-2.0 3 | 4 | package raft 5 | 6 | // StableStore is used to provide stable storage 7 | // of key configurations to ensure safety. 8 | type StableStore interface { 9 | Set(key []byte, val []byte) error 10 | 11 | // Get returns the value for key, or an empty byte slice if key was not found. 12 | Get(key []byte) ([]byte, error) 13 | 14 | SetUint64(key []byte, val uint64) error 15 | 16 | // GetUint64 returns the uint64 value for key, or 0 if key was not found. 17 | GetUint64(key []byte) (uint64, error) 18 | } 19 | -------------------------------------------------------------------------------- /state.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) HashiCorp, Inc. 2 | // SPDX-License-Identifier: MPL-2.0 3 | 4 | package raft 5 | 6 | import ( 7 | "sync" 8 | "sync/atomic" 9 | ) 10 | 11 | // RaftState captures the state of a Raft node: Follower, Candidate, Leader, 12 | // or Shutdown. 13 | type RaftState uint32 14 | 15 | const ( 16 | // Follower is the initial state of a Raft node. 17 | Follower RaftState = iota 18 | 19 | // Candidate is one of the valid states of a Raft node. 20 | Candidate 21 | 22 | // Leader is one of the valid states of a Raft node. 
23 | Leader 24 | 25 | // Shutdown is the terminal state of a Raft node. 26 | Shutdown 27 | ) 28 | 29 | func (s RaftState) String() string { 30 | switch s { 31 | case Follower: 32 | return "Follower" 33 | case Candidate: 34 | return "Candidate" 35 | case Leader: 36 | return "Leader" 37 | case Shutdown: 38 | return "Shutdown" 39 | default: 40 | return "Unknown" 41 | } 42 | } 43 | 44 | // raftState is used to maintain various state variables 45 | // and provides an interface to set/get the variables in a 46 | // thread safe manner. 47 | type raftState struct { 48 | // currentTerm commitIndex, lastApplied, must be kept at the top of 49 | // the struct so they're 64 bit aligned which is a requirement for 50 | // atomic ops on 32 bit platforms. 51 | 52 | // The current term, cache of StableStore 53 | currentTerm uint64 54 | 55 | // Highest committed log entry 56 | commitIndex uint64 57 | 58 | // Last applied log to the FSM 59 | lastApplied uint64 60 | 61 | // protects 4 next fields 62 | lastLock sync.Mutex 63 | 64 | // Cache the latest snapshot index/term 65 | lastSnapshotIndex uint64 66 | lastSnapshotTerm uint64 67 | 68 | // Cache the latest log from LogStore 69 | lastLogIndex uint64 70 | lastLogTerm uint64 71 | 72 | // Tracks running goroutines 73 | routinesGroup sync.WaitGroup 74 | 75 | // The current state 76 | state RaftState 77 | } 78 | 79 | func (r *raftState) getState() RaftState { 80 | stateAddr := (*uint32)(&r.state) 81 | return RaftState(atomic.LoadUint32(stateAddr)) 82 | } 83 | 84 | func (r *raftState) setState(s RaftState) { 85 | stateAddr := (*uint32)(&r.state) 86 | atomic.StoreUint32(stateAddr, uint32(s)) 87 | } 88 | 89 | func (r *raftState) getCurrentTerm() uint64 { 90 | return atomic.LoadUint64(&r.currentTerm) 91 | } 92 | 93 | func (r *raftState) setCurrentTerm(term uint64) { 94 | atomic.StoreUint64(&r.currentTerm, term) 95 | } 96 | 97 | func (r *raftState) getLastLog() (index, term uint64) { 98 | r.lastLock.Lock() 99 | index = r.lastLogIndex 100 | term = r.lastLogTerm 101 | r.lastLock.Unlock() 102 | return 103 | } 104 | 105 | func (r *raftState) setLastLog(index, term uint64) { 106 | r.lastLock.Lock() 107 | r.lastLogIndex = index 108 | r.lastLogTerm = term 109 | r.lastLock.Unlock() 110 | } 111 | 112 | func (r *raftState) getLastSnapshot() (index, term uint64) { 113 | r.lastLock.Lock() 114 | index = r.lastSnapshotIndex 115 | term = r.lastSnapshotTerm 116 | r.lastLock.Unlock() 117 | return 118 | } 119 | 120 | func (r *raftState) setLastSnapshot(index, term uint64) { 121 | r.lastLock.Lock() 122 | r.lastSnapshotIndex = index 123 | r.lastSnapshotTerm = term 124 | r.lastLock.Unlock() 125 | } 126 | 127 | func (r *raftState) getCommitIndex() uint64 { 128 | return atomic.LoadUint64(&r.commitIndex) 129 | } 130 | 131 | func (r *raftState) setCommitIndex(index uint64) { 132 | atomic.StoreUint64(&r.commitIndex, index) 133 | } 134 | 135 | func (r *raftState) getLastApplied() uint64 { 136 | return atomic.LoadUint64(&r.lastApplied) 137 | } 138 | 139 | func (r *raftState) setLastApplied(index uint64) { 140 | atomic.StoreUint64(&r.lastApplied, index) 141 | } 142 | 143 | // Start a goroutine and properly handle the race between a routine 144 | // starting and incrementing, and exiting and decrementing. 
145 | func (r *raftState) goFunc(f func()) { 146 | r.routinesGroup.Add(1) 147 | go func() { 148 | defer r.routinesGroup.Done() 149 | f() 150 | }() 151 | } 152 | 153 | func (r *raftState) waitShutdown() { 154 | r.routinesGroup.Wait() 155 | } 156 | 157 | // getLastIndex returns the last index in stable storage. 158 | // Either from the last log or from the last snapshot. 159 | func (r *raftState) getLastIndex() uint64 { 160 | r.lastLock.Lock() 161 | defer r.lastLock.Unlock() 162 | return max(r.lastLogIndex, r.lastSnapshotIndex) 163 | } 164 | 165 | // getLastEntry returns the last index and term in stable storage. 166 | // Either from the last log or from the last snapshot. 167 | func (r *raftState) getLastEntry() (uint64, uint64) { 168 | r.lastLock.Lock() 169 | defer r.lastLock.Unlock() 170 | if r.lastLogIndex >= r.lastSnapshotIndex { 171 | return r.lastLogIndex, r.lastLogTerm 172 | } 173 | return r.lastSnapshotIndex, r.lastSnapshotTerm 174 | } 175 | -------------------------------------------------------------------------------- /tag.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Copyright (c) HashiCorp, Inc. 3 | # SPDX-License-Identifier: MPL-2.0 4 | 5 | set -e 6 | 7 | # The version must be supplied from the environment. Do not include the 8 | # leading "v". 9 | if [ -z $VERSION ]; then 10 | echo "Please specify a version." 11 | exit 1 12 | fi 13 | 14 | # Generate the tag. 15 | echo "==> Tagging version $VERSION..." 16 | git commit --allow-empty -a --gpg-sign=348FFC4C -m "Release v$VERSION" 17 | git tag -a -m "Version $VERSION" -s -u 348FFC4C "v${VERSION}" main 18 | 19 | exit 0 20 | -------------------------------------------------------------------------------- /tcp_transport.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) HashiCorp, Inc. 2 | // SPDX-License-Identifier: MPL-2.0 3 | 4 | package raft 5 | 6 | import ( 7 | "errors" 8 | "io" 9 | "net" 10 | "time" 11 | 12 | "github.com/hashicorp/go-hclog" 13 | ) 14 | 15 | var ( 16 | errNotAdvertisable = errors.New("local bind address is not advertisable") 17 | errNotTCP = errors.New("local address is not a TCP address") 18 | ) 19 | 20 | // TCPStreamLayer implements StreamLayer interface for plain TCP. 21 | type TCPStreamLayer struct { 22 | advertise net.Addr 23 | listener *net.TCPListener 24 | } 25 | 26 | // NewTCPTransport returns a NetworkTransport that is built on top of 27 | // a TCP streaming transport layer. 
28 | func NewTCPTransport( 29 | bindAddr string, 30 | advertise net.Addr, 31 | maxPool int, 32 | timeout time.Duration, 33 | logOutput io.Writer, 34 | ) (*NetworkTransport, error) { 35 | return newTCPTransport(bindAddr, advertise, func(stream StreamLayer) *NetworkTransport { 36 | return NewNetworkTransport(stream, maxPool, timeout, logOutput) 37 | }) 38 | } 39 | 40 | // NewTCPTransportWithLogger returns a NetworkTransport that is built on top of 41 | // a TCP streaming transport layer, with log output going to the supplied Logger 42 | func NewTCPTransportWithLogger( 43 | bindAddr string, 44 | advertise net.Addr, 45 | maxPool int, 46 | timeout time.Duration, 47 | logger hclog.Logger, 48 | ) (*NetworkTransport, error) { 49 | return newTCPTransport(bindAddr, advertise, func(stream StreamLayer) *NetworkTransport { 50 | return NewNetworkTransportWithLogger(stream, maxPool, timeout, logger) 51 | }) 52 | } 53 | 54 | // NewTCPTransportWithConfig returns a NetworkTransport that is built on top of 55 | // a TCP streaming transport layer, using the given config struct. 56 | func NewTCPTransportWithConfig( 57 | bindAddr string, 58 | advertise net.Addr, 59 | config *NetworkTransportConfig, 60 | ) (*NetworkTransport, error) { 61 | return newTCPTransport(bindAddr, advertise, func(stream StreamLayer) *NetworkTransport { 62 | config.Stream = stream 63 | return NewNetworkTransportWithConfig(config) 64 | }) 65 | } 66 | 67 | func newTCPTransport(bindAddr string, 68 | advertise net.Addr, 69 | transportCreator func(stream StreamLayer) *NetworkTransport) (*NetworkTransport, error) { 70 | // Try to bind 71 | list, err := net.Listen("tcp", bindAddr) 72 | if err != nil { 73 | return nil, err 74 | } 75 | 76 | // Create stream 77 | stream := &TCPStreamLayer{ 78 | advertise: advertise, 79 | listener: list.(*net.TCPListener), 80 | } 81 | 82 | // Verify that we have a usable advertise address 83 | addr, ok := stream.Addr().(*net.TCPAddr) 84 | if !ok { 85 | list.Close() 86 | return nil, errNotTCP 87 | } 88 | if addr.IP == nil || addr.IP.IsUnspecified() { 89 | list.Close() 90 | return nil, errNotAdvertisable 91 | } 92 | 93 | // Create the network transport 94 | trans := transportCreator(stream) 95 | return trans, nil 96 | } 97 | 98 | // Dial implements the StreamLayer interface. 99 | func (t *TCPStreamLayer) Dial(address ServerAddress, timeout time.Duration) (net.Conn, error) { 100 | return net.DialTimeout("tcp", string(address), timeout) 101 | } 102 | 103 | // Accept implements the net.Listener interface. 104 | func (t *TCPStreamLayer) Accept() (c net.Conn, err error) { 105 | return t.listener.Accept() 106 | } 107 | 108 | // Close implements the net.Listener interface. 109 | func (t *TCPStreamLayer) Close() (err error) { 110 | return t.listener.Close() 111 | } 112 | 113 | // Addr implements the net.Listener interface. 114 | func (t *TCPStreamLayer) Addr() net.Addr { 115 | // Use an advertise addr if provided 116 | if t.advertise != nil { 117 | return t.advertise 118 | } 119 | return t.listener.Addr() 120 | } 121 | -------------------------------------------------------------------------------- /tcp_transport_test.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) HashiCorp, Inc. 
2 | // SPDX-License-Identifier: MPL-2.0 3 | 4 | package raft 5 | 6 | import ( 7 | "net" 8 | "testing" 9 | ) 10 | 11 | func TestTCPTransport_BadAddr(t *testing.T) { 12 | _, err := NewTCPTransportWithLogger("0.0.0.0:0", nil, 1, 0, newTestLogger(t)) 13 | if err != errNotAdvertisable { 14 | t.Fatalf("err: %v", err) 15 | } 16 | } 17 | 18 | func TestTCPTransport_EmptyAddr(t *testing.T) { 19 | _, err := NewTCPTransportWithLogger(":0", nil, 1, 0, newTestLogger(t)) 20 | if err != errNotAdvertisable { 21 | t.Fatalf("err: %v", err) 22 | } 23 | } 24 | 25 | func TestTCPTransport_WithAdvertise(t *testing.T) { 26 | ips, err := net.LookupIP("localhost") 27 | if err != nil { 28 | t.Fatal(err) 29 | } 30 | if len(ips) == 0 { 31 | t.Fatalf("localhost did not resolve to any IPs") 32 | } 33 | addr := &net.TCPAddr{IP: ips[0], Port: 12345} 34 | trans, err := NewTCPTransportWithLogger("0.0.0.0:0", addr, 1, 0, newTestLogger(t)) 35 | if err != nil { 36 | t.Fatalf("err: %v", err) 37 | } 38 | if trans.LocalAddr() != ServerAddress(net.JoinHostPort(ips[0].String(), "12345")) { 39 | t.Fatalf("bad: %v", trans.LocalAddr()) 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /testing_batch.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) HashiCorp, Inc. 2 | // SPDX-License-Identifier: MPL-2.0 3 | 4 | //go:build batchtest 5 | // +build batchtest 6 | 7 | package raft 8 | 9 | func init() { 10 | userSnapshotErrorsOnNoData = false 11 | } 12 | 13 | // ApplyBatch enables MockFSM to satisfy the BatchingFSM interface. This 14 | // function is gated by the batchtest build flag. 15 | // 16 | // NOTE: This is exposed for middleware testing purposes and is not a stable API 17 | func (m *MockFSM) ApplyBatch(logs []*Log) []interface{} { 18 | m.Lock() 19 | defer m.Unlock() 20 | 21 | ret := make([]interface{}, len(logs)) 22 | for i, log := range logs { 23 | switch log.Type { 24 | case LogCommand: 25 | m.logs = append(m.logs, log.Data) 26 | ret[i] = len(m.logs) 27 | default: 28 | ret[i] = nil 29 | } 30 | } 31 | 32 | return ret 33 | } 34 | -------------------------------------------------------------------------------- /transport.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) HashiCorp, Inc. 2 | // SPDX-License-Identifier: MPL-2.0 3 | 4 | package raft 5 | 6 | import ( 7 | "io" 8 | "time" 9 | ) 10 | 11 | // RPCResponse captures both a response and a potential error. 12 | type RPCResponse struct { 13 | Response interface{} 14 | Error error 15 | } 16 | 17 | // RPC has a command, and provides a response mechanism. 18 | type RPC struct { 19 | Command interface{} 20 | Reader io.Reader // Set only for InstallSnapshot 21 | RespChan chan<- RPCResponse 22 | } 23 | 24 | // Respond is used to respond with a response, error or both 25 | func (r *RPC) Respond(resp interface{}, err error) { 26 | r.RespChan <- RPCResponse{resp, err} 27 | } 28 | 29 | // Transport provides an interface for network transports 30 | // to allow Raft to communicate with other nodes. 31 | type Transport interface { 32 | // Consumer returns a channel that can be used to 33 | // consume and respond to RPC requests. 34 | Consumer() <-chan RPC 35 | 36 | // LocalAddr is used to return our local address to distinguish from our peers. 37 | LocalAddr() ServerAddress 38 | 39 | // AppendEntriesPipeline returns an interface that can be used to pipeline 40 | // AppendEntries requests. 
41 | AppendEntriesPipeline(id ServerID, target ServerAddress) (AppendPipeline, error) 42 | 43 | // AppendEntries sends the appropriate RPC to the target node. 44 | AppendEntries(id ServerID, target ServerAddress, args *AppendEntriesRequest, resp *AppendEntriesResponse) error 45 | 46 | // RequestVote sends the appropriate RPC to the target node. 47 | RequestVote(id ServerID, target ServerAddress, args *RequestVoteRequest, resp *RequestVoteResponse) error 48 | 49 | // InstallSnapshot is used to push a snapshot down to a follower. The data is read from 50 | // the ReadCloser and streamed to the client. 51 | InstallSnapshot(id ServerID, target ServerAddress, args *InstallSnapshotRequest, resp *InstallSnapshotResponse, data io.Reader) error 52 | 53 | // EncodePeer is used to serialize a peer's address. 54 | EncodePeer(id ServerID, addr ServerAddress) []byte 55 | 56 | // DecodePeer is used to deserialize a peer's address. 57 | DecodePeer([]byte) ServerAddress 58 | 59 | // SetHeartbeatHandler is used to setup a heartbeat handler 60 | // as a fast-pass. This is to avoid head-of-line blocking from 61 | // disk IO. If a Transport does not support this, it can simply 62 | // ignore the call, and push the heartbeat onto the Consumer channel. 63 | SetHeartbeatHandler(cb func(rpc RPC)) 64 | 65 | // TimeoutNow is used to start a leadership transfer to the target node. 66 | TimeoutNow(id ServerID, target ServerAddress, args *TimeoutNowRequest, resp *TimeoutNowResponse) error 67 | } 68 | 69 | // WithPreVote is an interface that a transport may provide which 70 | // allows a transport to support a PreVote request. 71 | // 72 | // It is defined separately from Transport as unfortunately it wasn't in the 73 | // original interface specification. 74 | type WithPreVote interface { 75 | // RequestPreVote sends the appropriate RPC to the target node. 76 | RequestPreVote(id ServerID, target ServerAddress, args *RequestPreVoteRequest, resp *RequestPreVoteResponse) error 77 | } 78 | 79 | // WithClose is an interface that a transport may provide which 80 | // allows a transport to be shut down cleanly when a Raft instance 81 | // shuts down. 82 | // 83 | // It is defined separately from Transport as unfortunately it wasn't in the 84 | // original interface specification. 85 | type WithClose interface { 86 | // Close permanently closes a transport, stopping 87 | // any associated goroutines and freeing other resources. 88 | Close() error 89 | } 90 | 91 | // LoopbackTransport is an interface that provides a loopback transport suitable for testing 92 | // e.g. InmemTransport. It's there so we don't have to rewrite tests. 93 | type LoopbackTransport interface { 94 | Transport // Embedded transport reference 95 | WithPeers // Embedded peer management 96 | WithClose // with a close routine 97 | WithPreVote // with a prevote 98 | } 99 | 100 | // WithPeers is an interface that a transport may provide which allows for connection and 101 | // disconnection. Unless the transport is a loopback transport, the transport specified to 102 | // "Connect" is likely to be nil. 103 | type WithPeers interface { 104 | Connect(peer ServerAddress, t Transport) // Connect a peer 105 | Disconnect(peer ServerAddress) // Disconnect a given peer 106 | DisconnectAll() // Disconnect all peers, possibly to reconnect them later 107 | } 108 | 109 | // AppendPipeline is used for pipelining AppendEntries requests. It is used 110 | // to increase the replication throughput by masking latency and better 111 | // utilizing bandwidth. 
112 | type AppendPipeline interface { 113 | // AppendEntries is used to add another request to the pipeline. 114 | // The send may block which is an effective form of back-pressure. 115 | AppendEntries(args *AppendEntriesRequest, resp *AppendEntriesResponse) (AppendFuture, error) 116 | 117 | // Consumer returns a channel that can be used to consume 118 | // response futures when they are ready. 119 | Consumer() <-chan AppendFuture 120 | 121 | // Close closes the pipeline and cancels all inflight RPCs 122 | Close() error 123 | } 124 | 125 | // AppendFuture is used to return information about a pipelined AppendEntries request. 126 | type AppendFuture interface { 127 | Future 128 | 129 | // Start returns the time that the append request was started. 130 | // It is always OK to call this method. 131 | Start() time.Time 132 | 133 | // Request holds the parameters of the AppendEntries call. 134 | // It is always OK to call this method. 135 | Request() *AppendEntriesRequest 136 | 137 | // Response holds the results of the AppendEntries call. 138 | // This method must only be called after the Error 139 | // method returns, and will only be valid on success. 140 | Response() *AppendEntriesResponse 141 | } 142 | -------------------------------------------------------------------------------- /transport_test.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) HashiCorp, Inc. 2 | // SPDX-License-Identifier: MPL-2.0 3 | 4 | package raft 5 | 6 | import ( 7 | "bytes" 8 | "reflect" 9 | "testing" 10 | "time" 11 | ) 12 | 13 | const ( 14 | TTInmem = iota 15 | 16 | // NOTE: must be last 17 | numTestTransports 18 | ) 19 | 20 | func NewTestTransport(ttype int, addr ServerAddress) (ServerAddress, LoopbackTransport) { 21 | switch ttype { 22 | case TTInmem: 23 | return NewInmemTransport(addr) 24 | default: 25 | panic("Unknown transport type") 26 | } 27 | } 28 | 29 | func TestTransport_StartStop(t *testing.T) { 30 | for ttype := 0; ttype < numTestTransports; ttype++ { 31 | _, trans := NewTestTransport(ttype, "") 32 | if err := trans.Close(); err != nil { 33 | t.Fatalf("err: %v", err) 34 | } 35 | } 36 | } 37 | 38 | func TestTransport_AppendEntries(t *testing.T) { 39 | for ttype := 0; ttype < numTestTransports; ttype++ { 40 | addr1, trans1 := NewTestTransport(ttype, "") 41 | defer trans1.Close() 42 | rpcCh := trans1.Consumer() 43 | 44 | // Make the RPC request 45 | args := AppendEntriesRequest{ 46 | Term: 10, 47 | PrevLogEntry: 100, 48 | PrevLogTerm: 4, 49 | Entries: []*Log{ 50 | { 51 | Index: 101, 52 | Term: 4, 53 | Type: LogNoop, 54 | }, 55 | }, 56 | LeaderCommitIndex: 90, 57 | RPCHeader: RPCHeader{Addr: []byte("cartman")}, 58 | } 59 | 60 | resp := AppendEntriesResponse{ 61 | Term: 4, 62 | LastLog: 90, 63 | Success: true, 64 | } 65 | 66 | // Listen for a request 67 | go func() { 68 | select { 69 | case rpc := <-rpcCh: 70 | // Verify the command 71 | req := rpc.Command.(*AppendEntriesRequest) 72 | if !reflect.DeepEqual(req, &args) { 73 | t.Errorf("command mismatch: %#v %#v", *req, args) 74 | return 75 | } 76 | rpc.Respond(&resp, nil) 77 | 78 | case <-time.After(200 * time.Millisecond): 79 | t.Errorf("timeout") 80 | } 81 | }() 82 | 83 | // Transport 2 makes outbound request 84 | addr2, trans2 := NewTestTransport(ttype, "") 85 | defer trans2.Close() 86 | 87 | trans1.Connect(addr2, trans2) 88 | trans2.Connect(addr1, trans1) 89 | 90 | var out AppendEntriesResponse 91 | if err := trans2.AppendEntries("id1", trans1.LocalAddr(), &args, &out); err != nil { 92 | 
t.Fatalf("err: %v", err) 93 | } 94 | 95 | // Verify the response 96 | if !reflect.DeepEqual(resp, out) { 97 | t.Fatalf("command mismatch: %#v %#v", resp, out) 98 | } 99 | } 100 | } 101 | 102 | func TestTransport_AppendEntriesPipeline(t *testing.T) { 103 | for ttype := 0; ttype < numTestTransports; ttype++ { 104 | addr1, trans1 := NewTestTransport(ttype, "") 105 | defer trans1.Close() 106 | rpcCh := trans1.Consumer() 107 | 108 | // Make the RPC request 109 | args := AppendEntriesRequest{ 110 | Term: 10, 111 | PrevLogEntry: 100, 112 | PrevLogTerm: 4, 113 | Entries: []*Log{ 114 | { 115 | Index: 101, 116 | Term: 4, 117 | Type: LogNoop, 118 | }, 119 | }, 120 | LeaderCommitIndex: 90, 121 | RPCHeader: RPCHeader{Addr: []byte("cartman")}, 122 | } 123 | 124 | resp := AppendEntriesResponse{ 125 | Term: 4, 126 | LastLog: 90, 127 | Success: true, 128 | } 129 | 130 | // Listen for a request 131 | go func() { 132 | for i := 0; i < 10; i++ { 133 | select { 134 | case rpc := <-rpcCh: 135 | // Verify the command 136 | req := rpc.Command.(*AppendEntriesRequest) 137 | if !reflect.DeepEqual(req, &args) { 138 | t.Errorf("command mismatch: %#v %#v", *req, args) 139 | return 140 | } 141 | rpc.Respond(&resp, nil) 142 | 143 | case <-time.After(200 * time.Millisecond): 144 | t.Errorf("timeout") 145 | return 146 | } 147 | } 148 | }() 149 | 150 | // Transport 2 makes outbound request 151 | addr2, trans2 := NewTestTransport(ttype, "") 152 | defer trans2.Close() 153 | 154 | trans1.Connect(addr2, trans2) 155 | trans2.Connect(addr1, trans1) 156 | 157 | pipeline, err := trans2.AppendEntriesPipeline("id1", trans1.LocalAddr()) 158 | if err != nil { 159 | t.Fatalf("err: %v", err) 160 | } 161 | defer pipeline.Close() 162 | for i := 0; i < 10; i++ { 163 | out := new(AppendEntriesResponse) 164 | if _, err := pipeline.AppendEntries(&args, out); err != nil { 165 | t.Fatalf("err: %v", err) 166 | } 167 | } 168 | 169 | respCh := pipeline.Consumer() 170 | for i := 0; i < 10; i++ { 171 | select { 172 | case ready := <-respCh: 173 | // Verify the response 174 | if !reflect.DeepEqual(&resp, ready.Response()) { 175 | t.Fatalf("command mismatch: %#v %#v", &resp, ready.Response()) 176 | } 177 | case <-time.After(200 * time.Millisecond): 178 | t.Fatalf("timeout") 179 | } 180 | } 181 | } 182 | } 183 | 184 | func TestTransport_RequestVote(t *testing.T) { 185 | for ttype := 0; ttype < numTestTransports; ttype++ { 186 | addr1, trans1 := NewTestTransport(ttype, "") 187 | defer trans1.Close() 188 | rpcCh := trans1.Consumer() 189 | 190 | // Make the RPC request 191 | args := RequestVoteRequest{ 192 | Term: 20, 193 | LastLogIndex: 100, 194 | LastLogTerm: 19, 195 | RPCHeader: RPCHeader{Addr: []byte("butters")}, 196 | } 197 | resp := RequestVoteResponse{ 198 | Term: 100, 199 | Granted: false, 200 | } 201 | 202 | // Listen for a request 203 | go func() { 204 | select { 205 | case rpc := <-rpcCh: 206 | // Verify the command 207 | req := rpc.Command.(*RequestVoteRequest) 208 | if !reflect.DeepEqual(req, &args) { 209 | t.Errorf("command mismatch: %#v %#v", *req, args) 210 | return 211 | } 212 | 213 | rpc.Respond(&resp, nil) 214 | 215 | case <-time.After(200 * time.Millisecond): 216 | t.Errorf("timeout") 217 | } 218 | }() 219 | 220 | // Transport 2 makes outbound request 221 | addr2, trans2 := NewTestTransport(ttype, "") 222 | defer trans2.Close() 223 | 224 | trans1.Connect(addr2, trans2) 225 | trans2.Connect(addr1, trans1) 226 | 227 | var out RequestVoteResponse 228 | if err := trans2.RequestVote("id1", trans1.LocalAddr(), &args, &out); err != nil { 229 | 
t.Fatalf("err: %v", err) 230 | } 231 | 232 | // Verify the response 233 | if !reflect.DeepEqual(resp, out) { 234 | t.Fatalf("command mismatch: %#v %#v", resp, out) 235 | } 236 | } 237 | } 238 | 239 | func TestTransport_InstallSnapshot(t *testing.T) { 240 | for ttype := 0; ttype < numTestTransports; ttype++ { 241 | addr1, trans1 := NewTestTransport(ttype, "") 242 | defer trans1.Close() 243 | rpcCh := trans1.Consumer() 244 | 245 | // Make the RPC request 246 | args := InstallSnapshotRequest{ 247 | Term: 10, 248 | LastLogIndex: 100, 249 | LastLogTerm: 9, 250 | Peers: []byte("blah blah"), 251 | Size: 10, 252 | RPCHeader: RPCHeader{Addr: []byte("kyle")}, 253 | } 254 | 255 | resp := InstallSnapshotResponse{ 256 | Term: 10, 257 | Success: true, 258 | } 259 | 260 | // Listen for a request 261 | go func() { 262 | select { 263 | case rpc := <-rpcCh: 264 | // Verify the command 265 | req := rpc.Command.(*InstallSnapshotRequest) 266 | if !reflect.DeepEqual(req, &args) { 267 | t.Errorf("command mismatch: %#v %#v", *req, args) 268 | return 269 | } 270 | 271 | // Try to read the bytes 272 | buf := make([]byte, 10) 273 | rpc.Reader.Read(buf) 274 | 275 | // Compare 276 | if bytes.Compare(buf, []byte("0123456789")) != 0 { 277 | t.Errorf("bad buf %v", buf) 278 | return 279 | } 280 | 281 | rpc.Respond(&resp, nil) 282 | 283 | case <-time.After(200 * time.Millisecond): 284 | t.Errorf("timeout") 285 | } 286 | }() 287 | 288 | // Transport 2 makes outbound request 289 | addr2, trans2 := NewTestTransport(ttype, "") 290 | defer trans2.Close() 291 | 292 | trans1.Connect(addr2, trans2) 293 | trans2.Connect(addr1, trans1) 294 | 295 | // Create a buffer 296 | buf := bytes.NewBuffer([]byte("0123456789")) 297 | 298 | var out InstallSnapshotResponse 299 | if err := trans2.InstallSnapshot("id1", trans1.LocalAddr(), &args, &out, buf); err != nil { 300 | t.Fatalf("err: %v", err) 301 | } 302 | 303 | // Verify the response 304 | if !reflect.DeepEqual(resp, out) { 305 | t.Fatalf("command mismatch: %#v %#v", resp, out) 306 | } 307 | } 308 | } 309 | 310 | func TestTransport_EncodeDecode(t *testing.T) { 311 | for ttype := 0; ttype < numTestTransports; ttype++ { 312 | _, trans1 := NewTestTransport(ttype, "") 313 | defer trans1.Close() 314 | 315 | local := trans1.LocalAddr() 316 | enc := trans1.EncodePeer("aaaa", local) 317 | dec := trans1.DecodePeer(enc) 318 | 319 | if dec != local { 320 | t.Fatalf("enc/dec fail: %v %v", dec, local) 321 | } 322 | } 323 | } 324 | -------------------------------------------------------------------------------- /util.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) HashiCorp, Inc. 2 | // SPDX-License-Identifier: MPL-2.0 3 | 4 | package raft 5 | 6 | import ( 7 | "bytes" 8 | crand "crypto/rand" 9 | "fmt" 10 | "math" 11 | "math/big" 12 | "math/rand" 13 | "time" 14 | 15 | "github.com/hashicorp/go-msgpack/v2/codec" 16 | ) 17 | 18 | func init() { 19 | // Ensure we use a high-entropy seed for the pseudo-random generator 20 | rand.Seed(newSeed()) 21 | } 22 | 23 | // returns an int64 from a crypto random source 24 | // can be used to seed a source for a math/rand. 25 | func newSeed() int64 { 26 | r, err := crand.Int(crand.Reader, big.NewInt(math.MaxInt64)) 27 | if err != nil { 28 | panic(fmt.Errorf("failed to read random bytes: %v", err)) 29 | } 30 | return r.Int64() 31 | } 32 | 33 | // randomTimeout returns a value that is between the minVal and 2x minVal. 
34 | func randomTimeout(minVal time.Duration) <-chan time.Time { 35 | if minVal == 0 { 36 | return nil 37 | } 38 | extra := time.Duration(rand.Int63()) % minVal 39 | return time.After(minVal + extra) 40 | } 41 | 42 | // min returns the minimum. 43 | func min(a, b uint64) uint64 { 44 | if a <= b { 45 | return a 46 | } 47 | return b 48 | } 49 | 50 | // max returns the maximum. 51 | func max(a, b uint64) uint64 { 52 | if a >= b { 53 | return a 54 | } 55 | return b 56 | } 57 | 58 | // generateUUID is used to generate a random UUID. 59 | func generateUUID() string { 60 | buf := make([]byte, 16) 61 | if _, err := crand.Read(buf); err != nil { 62 | panic(fmt.Errorf("failed to read random bytes: %v", err)) 63 | } 64 | 65 | return fmt.Sprintf("%08x-%04x-%04x-%04x-%12x", 66 | buf[0:4], 67 | buf[4:6], 68 | buf[6:8], 69 | buf[8:10], 70 | buf[10:16]) 71 | } 72 | 73 | // asyncNotifyCh is used to do an async channel send 74 | // to a single channel without blocking. 75 | func asyncNotifyCh(ch chan struct{}) { 76 | select { 77 | case ch <- struct{}{}: 78 | default: 79 | } 80 | } 81 | 82 | // drainNotifyCh empties out a single-item notification channel without 83 | // blocking, and returns whether it received anything. 84 | func drainNotifyCh(ch chan struct{}) bool { 85 | select { 86 | case <-ch: 87 | return true 88 | default: 89 | return false 90 | } 91 | } 92 | 93 | // asyncNotifyBool is used to do an async notification 94 | // on a bool channel. 95 | func asyncNotifyBool(ch chan bool, v bool) { 96 | select { 97 | case ch <- v: 98 | default: 99 | } 100 | } 101 | 102 | // overrideNotifyBool is used to notify on a bool channel 103 | // but override existing value if value is present. 104 | // ch must be 1-item buffered channel. 105 | // 106 | // This method does not support multiple concurrent calls. 107 | func overrideNotifyBool(ch chan bool, v bool) { 108 | select { 109 | case ch <- v: 110 | // value sent, all done 111 | case <-ch: 112 | // channel had an old value 113 | select { 114 | case ch <- v: 115 | default: 116 | panic("race: channel was sent concurrently") 117 | } 118 | } 119 | } 120 | 121 | // Decode reverses the encode operation on a byte slice input. 122 | func decodeMsgPack(buf []byte, out interface{}) error { 123 | r := bytes.NewBuffer(buf) 124 | hd := codec.MsgpackHandle{} 125 | dec := codec.NewDecoder(r, &hd) 126 | return dec.Decode(out) 127 | } 128 | 129 | // Encode writes an encoded object to a new bytes buffer. 130 | func encodeMsgPack(in interface{}) (*bytes.Buffer, error) { 131 | buf := bytes.NewBuffer(nil) 132 | hd := codec.MsgpackHandle{ 133 | BasicHandle: codec.BasicHandle{ 134 | TimeNotBuiltin: true, 135 | }, 136 | } 137 | enc := codec.NewEncoder(buf, &hd) 138 | err := enc.Encode(in) 139 | return buf, err 140 | } 141 | 142 | // backoff is used to compute an exponential backoff 143 | // duration. Base time is scaled by the current round, 144 | // up to some maximum scale factor. 145 | func backoff(base time.Duration, round, limit uint64) time.Duration { 146 | power := min(round, limit) 147 | for power > 2 { 148 | base *= 2 149 | power-- 150 | } 151 | return base 152 | } 153 | 154 | // cappedExponentialBackoff computes the exponential backoff with an adjustable 155 | // cap on the max timeout. 
156 | func cappedExponentialBackoff(base time.Duration, round, limit uint64, cap time.Duration) time.Duration { 157 | power := min(round, limit) 158 | for power > 2 { 159 | if base > cap { 160 | return cap 161 | } 162 | base *= 2 163 | power-- 164 | } 165 | if base > cap { 166 | return cap 167 | } 168 | return base 169 | } 170 | 171 | // Needed for sorting []uint64, used to determine commitment 172 | type uint64Slice []uint64 173 | 174 | func (p uint64Slice) Len() int { return len(p) } 175 | func (p uint64Slice) Less(i, j int) bool { return p[i] < p[j] } 176 | func (p uint64Slice) Swap(i, j int) { p[i], p[j] = p[j], p[i] } 177 | -------------------------------------------------------------------------------- /util_test.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) HashiCorp, Inc. 2 | // SPDX-License-Identifier: MPL-2.0 3 | 4 | package raft 5 | 6 | import ( 7 | "bytes" 8 | "regexp" 9 | "testing" 10 | "time" 11 | ) 12 | 13 | // TestMsgpackEncodeTime ensures that we don't break backwards compatibility when updating go-msgpack with 14 | // Raft binary formats. 15 | func TestMsgpackEncodeTimeDefaultFormat(t *testing.T) { 16 | stamp := "2006-01-02T15:04:05Z" 17 | tm, err := time.Parse(time.RFC3339, stamp) 18 | if err != nil { 19 | t.Fatal(err) 20 | } 21 | buf, err := encodeMsgPack(tm) 22 | 23 | expected := []byte{175, 1, 0, 0, 0, 14, 187, 75, 55, 229, 0, 0, 0, 0, 255, 255} 24 | 25 | if !bytes.Equal(buf.Bytes(), expected) { 26 | t.Errorf("Expected time %s to encode as %+v but got %+v", stamp, expected, buf.Bytes()) 27 | } 28 | } 29 | 30 | func TestRandomTimeout(t *testing.T) { 31 | start := time.Now() 32 | timeout := randomTimeout(time.Millisecond) 33 | 34 | select { 35 | case <-timeout: 36 | diff := time.Now().Sub(start) 37 | if diff < time.Millisecond { 38 | t.Fatalf("fired early") 39 | } 40 | case <-time.After(3 * time.Millisecond): 41 | t.Fatalf("timeout") 42 | } 43 | } 44 | 45 | func TestNewSeed(t *testing.T) { 46 | vals := make(map[int64]bool) 47 | for i := 0; i < 1000; i++ { 48 | seed := newSeed() 49 | if _, exists := vals[seed]; exists { 50 | t.Fatal("newSeed() return a value it'd previously returned") 51 | } 52 | vals[seed] = true 53 | } 54 | } 55 | 56 | func TestRandomTimeout_NoTime(t *testing.T) { 57 | timeout := randomTimeout(0) 58 | if timeout != nil { 59 | t.Fatalf("expected nil channel") 60 | } 61 | } 62 | 63 | func TestMin(t *testing.T) { 64 | if min(1, 1) != 1 { 65 | t.Fatalf("bad min") 66 | } 67 | if min(2, 1) != 1 { 68 | t.Fatalf("bad min") 69 | } 70 | if min(1, 2) != 1 { 71 | t.Fatalf("bad min") 72 | } 73 | } 74 | 75 | func TestMax(t *testing.T) { 76 | if max(1, 1) != 1 { 77 | t.Fatalf("bad max") 78 | } 79 | if max(2, 1) != 2 { 80 | t.Fatalf("bad max") 81 | } 82 | if max(1, 2) != 2 { 83 | t.Fatalf("bad max") 84 | } 85 | } 86 | 87 | func TestGenerateUUID(t *testing.T) { 88 | prev := generateUUID() 89 | for i := 0; i < 100; i++ { 90 | id := generateUUID() 91 | if prev == id { 92 | t.Fatalf("Should get a new ID!") 93 | } 94 | 95 | matched, err := regexp.MatchString( 96 | `[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}`, id) 97 | if !matched || err != nil { 98 | t.Fatalf("expected match %s %v %s", id, matched, err) 99 | } 100 | } 101 | } 102 | 103 | func TestBackoff(t *testing.T) { 104 | b := backoff(10*time.Millisecond, 1, 8) 105 | if b != 10*time.Millisecond { 106 | t.Fatalf("bad: %v", b) 107 | } 108 | 109 | b = backoff(20*time.Millisecond, 2, 8) 110 | if b != 20*time.Millisecond { 111 | t.Fatalf("bad: %v", b) 
112 | } 113 | 114 | b = backoff(10*time.Millisecond, 8, 8) 115 | if b != 640*time.Millisecond { 116 | t.Fatalf("bad: %v", b) 117 | } 118 | 119 | b = backoff(10*time.Millisecond, 9, 8) 120 | if b != 640*time.Millisecond { 121 | t.Fatalf("bad: %v", b) 122 | } 123 | } 124 | 125 | func TestOverrideNotifyBool(t *testing.T) { 126 | ch := make(chan bool, 1) 127 | 128 | // sanity check - buffered channel don't have any values 129 | select { 130 | case v := <-ch: 131 | t.Fatalf("unexpected receive: %v", v) 132 | default: 133 | } 134 | 135 | // simple case of a single push 136 | overrideNotifyBool(ch, false) 137 | select { 138 | case v := <-ch: 139 | if v != false { 140 | t.Fatalf("expected false but got %v", v) 141 | } 142 | default: 143 | t.Fatalf("expected a value but is not ready") 144 | } 145 | 146 | // assert that function never blocks and only last item is received 147 | overrideNotifyBool(ch, false) 148 | overrideNotifyBool(ch, false) 149 | overrideNotifyBool(ch, false) 150 | overrideNotifyBool(ch, false) 151 | overrideNotifyBool(ch, true) 152 | 153 | select { 154 | case v := <-ch: 155 | if v != true { 156 | t.Fatalf("expected true but got %v", v) 157 | } 158 | default: 159 | t.Fatalf("expected a value but is not ready") 160 | } 161 | 162 | // no further value is available 163 | select { 164 | case v := <-ch: 165 | t.Fatalf("unexpected receive: %v", v) 166 | default: 167 | } 168 | } 169 | --------------------------------------------------------------------------------
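
The raft-compat test cluster above (raft-compat/testcluster/cluster.go, InitUITWithStore) shows the minimal wiring a node needs: a Config, a log/stable store, a snapshot store, an FSM, and a transport. As a rough sketch of that same wiring outside the test harness — the server ID, bind address, and main function below are illustrative and not part of the library — it might look like this:

package main

import (
	"fmt"
	"log"
	"time"

	"github.com/hashicorp/raft"
)

func main() {
	// Mirrors the setup used by the compat test cluster, with the in-memory
	// store and snapshot implementations that ship with the library.
	config := raft.DefaultConfig()
	config.LocalID = raft.ServerID("node-0") // illustrative ID

	store := raft.NewInmemStore()         // serves as both LogStore and StableStore
	snaps := raft.NewInmemSnapshotStore() // SnapshotStore
	fsm := &raft.MockFSM{}                // simple FSM provided by the library for testing

	// Plain TCP transport; the loopback bind address is illustrative.
	trans, err := raft.NewTCPTransport("127.0.0.1:0", nil, 2, time.Second, nil)
	if err != nil {
		log.Fatal(err)
	}

	r, err := raft.NewRaft(config, fsm, store, store, snaps, trans)
	if err != nil {
		log.Fatal(err)
	}

	// The node remains a Follower until a cluster is bootstrapped or it joins one.
	fmt.Println("state:", r.State())
}

A real deployment would substitute durable LogStore, StableStore, and SnapshotStore implementations and its own FSM; the in-memory types are the ones the test files in this repository use.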