├── cmd └── waldump │ ├── .gitignore │ ├── waldump.go │ └── README.md ├── .gitignore ├── alice ├── .gitignore ├── run-workload.sh ├── Makefile ├── workload │ └── main.go └── checker │ └── main.go ├── bench ├── .gitignore ├── main.go ├── append_requestor.go └── bench_test.go ├── CHANGELOG.md ├── NOTICE.txt ├── segment ├── crc.go ├── reader_test.go ├── reader.go ├── format_test.go ├── writer_test.go ├── format.go ├── vfs_test.go └── filer.go ├── .copywrite.hcl ├── .github ├── pull_request_template.md ├── workflows │ ├── two-step-pr-approval.yml │ └── go-tests.yml └── dependabot.yml ├── CODEOWNERS ├── types ├── buffer.go ├── types.go ├── meta.go ├── vfs.go └── segment.go ├── fs ├── file.go ├── fs_test.go └── fs.go ├── metrics ├── atomic_collector_test.go ├── metrics.go ├── gometrics_collector_test.go ├── atomic_collector.go └── gometrics_collector.go ├── integration ├── meta.go └── integration_test.go ├── go.mod ├── verifier ├── metrics.go ├── store.go └── verifier.go ├── options.go ├── codec_test.go ├── metrics.go ├── metadb ├── metadb_test.go └── metadb.go ├── codec.go ├── migrate ├── migrate.go └── migrate_test.go └── state.go /cmd/waldump/.gitignore: -------------------------------------------------------------------------------- 1 | waldump -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /.idea/ 2 | bench.test 3 | profile.out -------------------------------------------------------------------------------- /alice/.gitignore: -------------------------------------------------------------------------------- 1 | bin 2 | *.gdb 3 | traces_dir 4 | workload_dir -------------------------------------------------------------------------------- /bench/.gitignore: -------------------------------------------------------------------------------- 1 | bench 2 | bench-result* 3 | uncorrected_bench-result* 4 | bench.test 5 | profile.out -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | ## Unreleased 2 | 3 | ### Improvements 4 | 5 | ### Changes 6 | 7 | ### Fixed 8 | 9 | ### Security 10 | -------------------------------------------------------------------------------- /NOTICE.txt: -------------------------------------------------------------------------------- 1 | HashiCorp Raft WAL Library 2 | https://www.hashicorp.com/ 3 | License: Mozilla Public License Version 2.0 4 | Copyright 2022 HashiCorp, Inc. -------------------------------------------------------------------------------- /segment/crc.go: -------------------------------------------------------------------------------- 1 | // Copyright IBM Corp. 2020, 2025 2 | // SPDX-License-Identifier: MPL-2.0 3 | 4 | package segment 5 | 6 | import ( 7 | "hash/crc32" 8 | ) 9 | 10 | var castagnoliTable *crc32.Table 11 | 12 | func init() { 13 | castagnoliTable = crc32.MakeTable(crc32.Castagnoli) 14 | } 15 | -------------------------------------------------------------------------------- /.copywrite.hcl: -------------------------------------------------------------------------------- 1 | schema_version = 1 2 | 3 | project { 4 | license = "MPL-2.0" 5 | copyright_year = 2022 6 | 7 | # (OPTIONAL) A list of globs that should not have copyright/license headers. 
8 | # Supports doublestar glob patterns for more flexibility in defining which 9 | # files or folders should be ignored (e.g., "vendors/**") 10 | header_ignore = [] 11 | } 12 | -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | 2 | ## Description 3 | 4 | 5 | 6 | ## Related Issue 7 | 8 | 9 | 10 | ## How Has This Been Tested? 11 | 12 | 13 | -------------------------------------------------------------------------------- /CODEOWNERS: -------------------------------------------------------------------------------- 1 | # Each line is a file pattern followed by one or more owners. 2 | # More on CODEOWNERS files: https://help.github.com/en/github/creating-cloning-and-archiving-repositories/about-code-owners 3 | 4 | # Default owner 5 | * @hashicorp/team-ip-compliance @hashicorp/consul-core-reviewers @hashicorp/raft-force 6 | 7 | # Add override rules below. Each line is a file/folder pattern followed by one or more owners. 8 | # Being an owner means those groups or individuals will be added as reviewers to PRs affecting 9 | # those areas of the code. 10 | # Examples: 11 | # /docs/ @docs-team 12 | # *.js @js-team 13 | # *.go @go-team 14 | -------------------------------------------------------------------------------- /.github/workflows/two-step-pr-approval.yml: -------------------------------------------------------------------------------- 1 | name: Two-Stage PR Review Process 2 | 3 | on: 4 | pull_request: 5 | types: [opened, synchronize, reopened, labeled, unlabeled, ready_for_review, converted_to_draft] 6 | pull_request_review: 7 | types: [submitted] 8 | 9 | jobs: 10 | manage-pr-status: 11 | runs-on: ubuntu-latest 12 | permissions: 13 | pull-requests: write 14 | contents: write 15 | steps: 16 | - name: Checkout code 17 | uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.0.0 18 | 19 | - name: Two stage PR review 20 | uses: hashicorp/two-stage-pr-approval@v0.1.0 21 | -------------------------------------------------------------------------------- /types/buffer.go: -------------------------------------------------------------------------------- 1 | // Copyright IBM Corp. 2020, 2025 2 | // SPDX-License-Identifier: MPL-2.0 3 | 4 | package types 5 | 6 | // PooledBuffer is a wrapper that allows WAL to return read buffers to segment 7 | // implementations when we're done decoding. 8 | type PooledBuffer struct { 9 | Bs []byte 10 | CloseFn func() 11 | } 12 | 13 | // Close implements io.Closer and returns the buffer to the pool. It should be 14 | // called exactly once for each buffer when it's no longer needed. It's no 15 | // longer safe to access Bs or any slice taken from it after the call. 16 | func (b *PooledBuffer) Close() error { 17 | if b.CloseFn != nil { 18 | b.CloseFn() 19 | } 20 | return nil 21 | } 22 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # Copyright (c) HashiCorp, Inc. 
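The PooledBuffer type in types/buffer.go above exists so segment readers can recycle read buffers once decoding is done. A minimal sketch of the intended lifecycle, assuming a sync.Pool-backed implementation (the pool and the readEntry helper here are hypothetical, not from this repo):

```go
package main

import (
	"fmt"
	"sync"
)

// PooledBuffer is re-declared here only to keep the sketch self-contained;
// the real type lives in github.com/hashicorp/raft-wal/types.
type PooledBuffer struct {
	Bs      []byte
	CloseFn func()
}

func (b *PooledBuffer) Close() error {
	if b.CloseFn != nil {
		b.CloseFn()
	}
	return nil
}

var bufPool = sync.Pool{New: func() any { return new([]byte) }}

// readEntry copies data into a pooled buffer; Close returns it to the pool.
func readEntry(data []byte) *PooledBuffer {
	bp := bufPool.Get().(*[]byte)
	*bp = append((*bp)[:0], data...)
	return &PooledBuffer{
		Bs:      *bp,
		CloseFn: func() { bufPool.Put(bp) },
	}
}

func main() {
	b := readEntry([]byte("log entry"))
	fmt.Println(string(b.Bs)) // safe before Close
	b.Close()                 // exactly once; b.Bs must not be used afterwards
}
```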
2 | # SPDX-License-Identifier: MPL-2.0 3 | 4 | version: 2 5 | 6 | updates: 7 | - package-ecosystem: "github-actions" 8 | directory: "/" 9 | schedule: 10 | interval: "weekly" 11 | day: "sunday" 12 | commit-message: 13 | prefix: "[chore] : " 14 | groups: 15 | actions: 16 | patterns: 17 | - "*" 18 | 19 | - package-ecosystem: "gomod" 20 | directory: "/" 21 | schedule: 22 | interval: "weekly" 23 | day: "sunday" 24 | commit-message: 25 | prefix: "[chore] : " 26 | groups: 27 | go: 28 | patterns: 29 | - "*" 30 | applies-to: "version-updates" 31 | go-security: 32 | patterns: 33 | - "*" 34 | applies-to: "security-updates" 35 | -------------------------------------------------------------------------------- /types/types.go: -------------------------------------------------------------------------------- 1 | // Copyright IBM Corp. 2020, 2025 2 | // SPDX-License-Identifier: MPL-2.0 3 | 4 | package types 5 | 6 | import ( 7 | "errors" 8 | 9 | "github.com/hashicorp/raft" 10 | ) 11 | 12 | var ( 13 | // ErrNotFound is our own version of raft's not found error. It's important 14 | // it's exactly the same because the raft lib checks for equality with it's 15 | // own type as a crucial part of replication processing (detecting end of logs 16 | // and that a snapshot is needed for a follower). 17 | ErrNotFound = raft.ErrLogNotFound 18 | ErrCorrupt = errors.New("WAL is corrupt") 19 | ErrSealed = errors.New("segment is sealed") 20 | ErrClosed = errors.New("closed") 21 | ) 22 | 23 | // LogEntry represents an entry that has already been encoded. 24 | type LogEntry struct { 25 | Index uint64 26 | Data []byte 27 | } 28 | -------------------------------------------------------------------------------- /alice/run-workload.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright IBM Corp. 2020, 2025 4 | # SPDX-License-Identifier: MPL-2.0 5 | 6 | set -e 7 | # trap 'error ${LINENO}' ERR 8 | 9 | WORKLOAD=$1 10 | 11 | bold=$(tput bold) 12 | green=$(tput setf 2) 13 | normal=$(tput sgr0) 14 | 15 | echo "${green}==> Running Workload ${bold}${WORKLOAD}${normal}" 16 | 17 | echo " -> Cleaning up dirs" 18 | rm -rf /workload_dir traces_dir 19 | mkdir /workload_dir traces_dir 20 | 21 | echo " -> Running init" 22 | bin/workload -dir /workload_dir -workload "$WORKLOAD" -init 23 | 24 | echo " -> Running alice-record" 25 | env GOMAXPROCS=1 alice-record --workload_dir /workload_dir \ 26 | --traces_dir traces_dir \ 27 | bin/workload -dir /workload_dir -workload "$WORKLOAD" 28 | 29 | echo " -> Running alice-check" 30 | alice-check --traces_dir=traces_dir --checker=bin/checker 31 | -------------------------------------------------------------------------------- /alice/Makefile: -------------------------------------------------------------------------------- 1 | # Copyright IBM Corp. 
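types.ErrNotFound in types/types.go above is deliberately an alias of raft.ErrLogNotFound rather than a freshly constructed error. A tiny check illustrating why identity (not just the message) matters to raft's replication path (a sketch; it only exercises the two exported variables):

```go
package main

import (
	"errors"
	"fmt"

	"github.com/hashicorp/raft"
	"github.com/hashicorp/raft-wal/types"
)

func main() {
	// Aliasing means both == and errors.Is hold, which is what the raft
	// library relies on when it probes for the end of the log.
	fmt.Println(types.ErrNotFound == raft.ErrLogNotFound)          // true
	fmt.Println(errors.Is(types.ErrNotFound, raft.ErrLogNotFound)) // true
}
```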
2020, 2025 2 | # SPDX-License-Identifier: MPL-2.0 3 | 4 | WORK_DIR=$(shell pwd) 5 | 6 | WORKLOAD ?= append 7 | 8 | .PHONY: test test-local 9 | 10 | test: bin/workload bin/checker 11 | docker run --privileged \ 12 | --cap-add=SYS_PTRACE \ 13 | --memory=8gb \ 14 | --shm-size=2gb \ 15 | -v $(WORK_DIR):/app \ 16 | -it ghcr.io/banks/alice:master \ 17 | ./run-workload.sh ${WORKLOAD} 18 | 19 | test-local: 20 | mkdir -p ${WORK_DIR}/workload_dir 21 | echo " -> Running init (${WORKLOAD})" 22 | go run ${WORK_DIR}/workload/main.go \ 23 | -dir ${WORK_DIR}/workload_dir \ 24 | -workload ${WORKLOAD} \ 25 | -init 26 | echo " -> Running workload (${WORKLOAD})" 27 | go run ${WORK_DIR}/workload/main.go \ 28 | -dir ${WORK_DIR}/workload_dir \ 29 | -workload ${WORKLOAD} | tee ${WORK_DIR}/workload_dir/stdout.txt 30 | echo " -> Running checker" 31 | go run ${WORK_DIR}/checker/main.go \ 32 | ${WORK_DIR}/workload_dir \ 33 | ${WORK_DIR}/workload_dir/stdout.txt 34 | 35 | bin/workload: workload/main.go 36 | GOOS=linux go build -o bin/workload workload/main.go 37 | 38 | bin/checker: checker/main.go 39 | GOOS=linux go build -o bin/checker checker/main.go 40 | -------------------------------------------------------------------------------- /fs/file.go: -------------------------------------------------------------------------------- 1 | // Copyright IBM Corp. 2020, 2025 2 | // SPDX-License-Identifier: MPL-2.0 3 | 4 | package fs 5 | 6 | import ( 7 | "os" 8 | "sync/atomic" 9 | 10 | "github.com/hashicorp/raft-wal/types" 11 | ) 12 | 13 | var _ types.WritableFile = &File{} 14 | 15 | // File wraps an os.File and implements types.WritableFile. It ensures that the 16 | // first time Sync is called on the file, that the parent directory is also 17 | // Fsynced to ensure a crash won't cause the FS to forget the file is there. 18 | // 19 | // Postponing this allows us to ensure that we do the minimum necessary fsyncs 20 | // but still ensure all required fsyncs are done by the time we acknowledge 21 | // committed data in the new file. 22 | type File struct { 23 | new uint32 // atomically accessed, keep it aligned! 24 | dir string 25 | os.File 26 | } 27 | 28 | // Sync calls fsync on the underlying file. If this is the first call to Sync 29 | // since creation it also fsyncs the parent dir. 30 | func (f *File) Sync() error { 31 | // Sync the underlying file 32 | if err := f.File.Sync(); err != nil { 33 | return err 34 | } 35 | new := atomic.SwapUint32(&f.new, 1) 36 | if new == 0 { 37 | return syncDir(f.dir) 38 | } 39 | return nil 40 | } 41 | -------------------------------------------------------------------------------- /metrics/atomic_collector_test.go: -------------------------------------------------------------------------------- 1 | // Copyright IBM Corp. 
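fs/file.go above defers the parent-directory fsync until the first Sync call on a newly created file. The syncDir helper it calls isn't shown in this dump; a plausible implementation follows the standard open-dir-and-fsync pattern (an assumption, not the library's verbatim code):

```go
package main

import "os"

// syncDir fsyncs a directory so that a newly created file's directory entry
// survives a crash. Without this, the file's data can be durable while the
// file itself is missing after power loss.
func syncDir(dir string) error {
	d, err := os.Open(dir)
	if err != nil {
		return err
	}
	defer d.Close()
	return d.Sync()
}

func main() {
	f, err := os.Create("/tmp/00001-example.wal")
	if err != nil {
		panic(err)
	}
	defer f.Close()
	if err := f.Sync(); err != nil { // file contents durable
		panic(err)
	}
	if err := syncDir("/tmp"); err != nil { // directory entry durable
		panic(err)
	}
}
```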
2020, 2025 2 | // SPDX-License-Identifier: MPL-2.0 3 | 4 | package metrics 5 | 6 | import ( 7 | "sync" 8 | "testing" 9 | 10 | "github.com/stretchr/testify/require" 11 | ) 12 | 13 | func TestAtomicCollector(t *testing.T) { 14 | defs := Definitions{ 15 | Counters: []Descriptor{ 16 | { 17 | Name: "c1", 18 | Desc: "counter one.", 19 | }, 20 | { 21 | Name: "c2", 22 | Desc: "counter two.", 23 | }, 24 | }, 25 | Gauges: []Descriptor{ 26 | { 27 | Name: "g1", 28 | Desc: "gauge one.", 29 | }, 30 | { 31 | Name: "g2", 32 | Desc: "gauge two.", 33 | }, 34 | }, 35 | } 36 | 37 | c := NewAtomicCollector(defs) 38 | 39 | var wg sync.WaitGroup 40 | 41 | for i := 0; i < 10; i++ { 42 | wg.Add(1) 43 | go func() { 44 | defer wg.Done() 45 | for j := 0; j < 10; j++ { 46 | c.IncrementCounter("c1", 1) 47 | c.IncrementCounter("c2", 2) 48 | c.SetGauge("g1", uint64(j)) 49 | c.SetGauge("g2", uint64(j*2)) 50 | } 51 | }() 52 | } 53 | 54 | wg.Wait() 55 | 56 | s := c.Summary() 57 | require.Equal(t, 100, int(s.Counters["c1"])) 58 | require.Equal(t, 200, int(s.Counters["c2"])) 59 | require.Equal(t, 9, int(s.Gauges["g1"])) 60 | require.Equal(t, 18, int(s.Gauges["g2"])) 61 | } 62 | -------------------------------------------------------------------------------- /metrics/metrics.go: -------------------------------------------------------------------------------- 1 | // Copyright IBM Corp. 2020, 2025 2 | // SPDX-License-Identifier: MPL-2.0 3 | 4 | package metrics 5 | 6 | // Collector provides a simple abstraction for counter type metrics that 7 | // the WAL and log verifier can use without depending on a specific metrics 8 | // collector implementation. 9 | type Collector interface { 10 | // IncrementCounter record val occurrences of the named event. Names will 11 | // follow prometheus conventions with lower_case_and_underscores. We don't 12 | // need any additional labels currently. 13 | IncrementCounter(name string, delta uint64) 14 | 15 | // SetGauge sets the value of the named gauge overriding any previous value. 16 | SetGauge(name string, val uint64) 17 | } 18 | 19 | // Definitions provides a simple description of a set of scalar metrics. 20 | type Definitions struct { 21 | Counters []Descriptor 22 | Gauges []Descriptor 23 | } 24 | 25 | // Descriptor describes a specific metric. 26 | type Descriptor struct { 27 | Name string 28 | Desc string 29 | } 30 | 31 | var _ Collector = &NoOpCollector{} 32 | 33 | // NoOpCollector is a Collector that does nothing. 34 | type NoOpCollector struct{} 35 | 36 | // IncrementCounter record val occurrences of the named event. Names will 37 | // follow prometheus conventions with lower_case_and_underscores. We don't 38 | // need any additional labels currently. 39 | func (c *NoOpCollector) IncrementCounter(name string, delta uint64) {} 40 | 41 | // SetGauge sets the value of the named gauge overriding any previous value. 42 | func (c *NoOpCollector) SetGauge(name string, val uint64) {} 43 | -------------------------------------------------------------------------------- /integration/meta.go: -------------------------------------------------------------------------------- 1 | // Copyright IBM Corp. 
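The Collector interface in metrics/metrics.go above is intentionally tiny, so wiring in a custom sink takes only a few lines. A hedged sketch of a collector that just logs observations (illustrative only; a real one would be handed to the WAL, e.g. via the WithMetricsCollector option shown later in options.go):

```go
package main

import "log"

// logCollector satisfies the same two-method shape as metrics.Collector.
type logCollector struct{}

func (logCollector) IncrementCounter(name string, delta uint64) {
	log.Printf("counter %s += %d", name, delta)
}

func (logCollector) SetGauge(name string, val uint64) {
	log.Printf("gauge %s = %d", name, val)
}

func main() {
	var c logCollector
	c.IncrementCounter("log_appends", 1)
	c.SetGauge("last_segment_age_seconds", 42)
}
```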
2020, 2025
2 | // SPDX-License-Identifier: MPL-2.0
3 | 
4 | package integration
5 | 
6 | import (
7 | 	"sync"
8 | 
9 | 	"github.com/hashicorp/raft-wal/types"
10 | )
11 | 
12 | type PeekingMetaStore struct {
13 | 	mu     sync.Mutex
14 | 	meta   types.MetaStore
15 | 	state  types.PersistentState
16 | 	stable map[string]string
17 | }
18 | 
19 | func (s *PeekingMetaStore) PeekState() types.PersistentState {
20 | 	s.mu.Lock()
21 | 	defer s.mu.Unlock()
22 | 	return s.state
23 | }
24 | 
25 | func (s *PeekingMetaStore) PeekStable(key string) (string, bool) {
26 | 	s.mu.Lock()
27 | 	defer s.mu.Unlock()
28 | 	v, ok := s.stable[key]
29 | 	return v, ok
30 | }
31 | 
32 | func (s *PeekingMetaStore) Load(dir string) (types.PersistentState, error) {
33 | 	state, err := s.meta.Load(dir)
34 | 	if err == nil {
35 | 		s.mu.Lock()
36 | 		s.state = state
37 | 		s.mu.Unlock()
38 | 	}
39 | 	return state, err
40 | }
41 | 
42 | func (s *PeekingMetaStore) CommitState(state types.PersistentState) error {
43 | 	err := s.meta.CommitState(state)
44 | 	if err == nil {
45 | 		s.mu.Lock()
46 | 		s.state = state
47 | 		s.mu.Unlock()
48 | 	}
49 | 	// Propagate the underlying error rather than swallowing it (the original
50 | 	// returned nil unconditionally, which hid CommitState failures).
	return err
}
51 | 
52 | func (s *PeekingMetaStore) GetStable(key []byte) ([]byte, error) {
53 | 	return s.meta.GetStable(key)
54 | }
55 | 
56 | func (s *PeekingMetaStore) SetStable(key, value []byte) error {
57 | 	err := s.meta.SetStable(key, value)
58 | 	if err == nil {
59 | 		s.mu.Lock()
60 | 		s.stable[string(key)] = string(value)
61 | 		s.mu.Unlock()
62 | 	}
63 | 	return err
64 | }
65 | 
66 | func (s *PeekingMetaStore) Close() error {
67 | 	return s.meta.Close()
68 | }
69 | 
-------------------------------------------------------------------------------- /types/meta.go: --------------------------------------------------------------------------------
1 | // Copyright IBM Corp. 2020, 2025
2 | // SPDX-License-Identifier: MPL-2.0
3 | 
4 | package types
5 | 
6 | import "io"
7 | 
8 | // MetaStore is the interface we need for a persistent, crash-safe backend. We
9 | // implement it with BoltDB for real usage but the interface allows alternatives
10 | // to be used, or tests to mock out FS access.
11 | type MetaStore interface {
12 | 	// Load loads the existing persisted state. If there is no existing state,
13 | 	// implementations are expected to initialize new storage and return an
14 | 	// empty state.
15 | 	Load(dir string) (PersistentState, error)
16 | 
17 | 	// CommitState must atomically replace all persisted metadata in the current
18 | 	// store with the set provided. It must not return until the data is persisted
19 | 	// durably and in a crash-safe way otherwise the guarantees of the WAL will be
20 | 	// compromised. The WAL will only ever call this in a single thread at one
21 | 	// time and it will never be called concurrently with Load, however it may be
22 | 	// called concurrently with Get/SetStable operations.
23 | 	CommitState(PersistentState) error
24 | 
25 | 	// GetStable returns a value from the stable store or nil if it doesn't exist.
26 | 	// May be called concurrently by multiple threads.
27 | 	GetStable(key []byte) ([]byte, error)
28 | 
29 | 	// SetStable stores a value in the stable store. May be called concurrently
30 | 	// with GetStable.
31 | 	SetStable(key, value []byte) error
32 | 
33 | 	io.Closer
34 | }
35 | 
36 | // PersistentState represents the WAL file metadata we need to store reliably to
37 | // recover on restart.
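// (Illustration, not from the source: a caller committing a new segment would
// round-trip this struct through MetaStore, e.g.
//
//	st, _ := db.Load(dir)
//	st.Segments = append(st.Segments, newInfo)
//	st.NextSegmentID++
//	if err := db.CommitState(st); err != nil { /* WAL must not ack writes */ }
//
// where db and newInfo are hypothetical names used only for this sketch.)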
38 | type PersistentState struct { 39 | NextSegmentID uint64 40 | Segments []SegmentInfo 41 | } 42 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/hashicorp/raft-wal 2 | 3 | go 1.23.0 4 | 5 | toolchain go1.23.5 6 | 7 | require ( 8 | github.com/HdrHistogram/hdrhistogram-go v1.1.2 9 | github.com/benbjohnson/immutable v0.4.3 10 | github.com/benmathews/bench v0.0.0-20210120214102-f7c75b9ef6e7 11 | github.com/benmathews/hdrhistogram-writer v0.0.0-20210120211942-3cb1c7c33f95 12 | github.com/google/gofuzz v1.2.0 13 | github.com/hashicorp/go-hclog v1.6.3 14 | github.com/hashicorp/go-metrics v0.5.4 15 | github.com/hashicorp/raft v1.7.3 16 | github.com/hashicorp/raft-boltdb v0.0.0-20230125174641-2a8082862702 17 | github.com/hashicorp/raft-boltdb/v2 v2.3.1 18 | github.com/segmentio/fasthash v1.0.3 19 | github.com/stretchr/testify v1.10.0 20 | go.etcd.io/bbolt v1.4.3 21 | go.etcd.io/etcd/client/pkg/v3 v3.6.4 22 | ) 23 | 24 | require ( 25 | github.com/armon/go-metrics v0.4.1 // indirect 26 | github.com/boltdb/bolt v1.3.1 // indirect 27 | github.com/davecgh/go-spew v1.1.1 // indirect 28 | github.com/fatih/color v1.13.0 // indirect 29 | github.com/hashicorp/go-immutable-radix v1.3.0 // indirect 30 | github.com/hashicorp/go-msgpack v1.1.5 // indirect 31 | github.com/hashicorp/go-msgpack/v2 v2.1.2 // indirect 32 | github.com/hashicorp/golang-lru v0.5.4 // indirect 33 | github.com/mattn/go-colorable v0.1.12 // indirect 34 | github.com/mattn/go-isatty v0.0.14 // indirect 35 | github.com/pmezard/go-difflib v1.0.0 // indirect 36 | go.uber.org/multierr v1.11.0 // indirect 37 | go.uber.org/zap v1.27.0 // indirect 38 | golang.org/x/exp v0.0.0-20220827204233-334a2380cb91 // indirect 39 | golang.org/x/sys v0.31.0 // indirect 40 | golang.org/x/time v0.1.0 // indirect 41 | gonum.org/v1/gonum v0.12.0 // indirect 42 | gopkg.in/yaml.v3 v3.0.1 // indirect 43 | ) 44 | -------------------------------------------------------------------------------- /cmd/waldump/waldump.go: -------------------------------------------------------------------------------- 1 | // Copyright IBM Corp. 
2020, 2025
2 | 
3 | package main
4 | 
5 | import (
6 | 	"encoding/json"
7 | 	"flag"
8 | 	"fmt"
9 | 	"os"
10 | 
11 | 	"github.com/hashicorp/raft"
12 | 	wal "github.com/hashicorp/raft-wal"
13 | 	"github.com/hashicorp/raft-wal/fs"
14 | 	"github.com/hashicorp/raft-wal/segment"
15 | 	"github.com/hashicorp/raft-wal/types"
16 | )
17 | 
18 | type opts struct {
19 | 	Dir    string
20 | 	After  uint64
21 | 	Before uint64
22 | }
23 | 
24 | func main() {
25 | 	var o opts
26 | 	flag.Uint64Var(&o.After, "after", 0, "specifies a raft index to use as an exclusive lower bound when dumping log entries.")
27 | 	flag.Uint64Var(&o.Before, "before", 0, "specifies a raft index to use as an exclusive upper bound when dumping log entries.")
28 | 
29 | 	flag.Parse()
30 | 
31 | 	// Accept dir as positional arg
32 | 	o.Dir = flag.Arg(0)
33 | 	if o.Dir == "" {
34 | 		fmt.Println("Usage: waldump [-after INDEX] [-before INDEX] <dir>")
35 | 		os.Exit(1)
36 | 	}
37 | 
38 | 	vfs := fs.New()
39 | 	f := segment.NewFiler(o.Dir, vfs)
40 | 
41 | 	codec := &wal.BinaryCodec{}
42 | 	var log raft.Log
43 | 	enc := json.NewEncoder(os.Stdout)
44 | 
45 | 	err := f.DumpLogs(o.After, o.Before, func(info types.SegmentInfo, e types.LogEntry) (bool, error) {
46 | 		if info.Codec != wal.CodecBinaryV1 {
47 | 			return false, fmt.Errorf("unsupported codec %d in file %s", info.Codec, segment.FileName(info))
48 | 		}
49 | 		if err := codec.Decode(e.Data, &log); err != nil {
50 | 			return false, err
51 | 		}
52 | 		// Output the raft Log struct as JSON
53 | 		if err := enc.Encode(log); err != nil {
54 | 			return false, err
55 | 		}
56 | 		return true, nil
57 | 	})
58 | 	if err != nil {
59 | 		fmt.Printf("ERROR: %s\n", err)
60 | 		os.Exit(1)
61 | 	}
62 | }
63 | 
-------------------------------------------------------------------------------- /metrics/gometrics_collector_test.go: --------------------------------------------------------------------------------
1 | // Copyright IBM Corp. 2020, 2025
2 | // SPDX-License-Identifier: MPL-2.0
3 | 
4 | package metrics
5 | 
6 | import (
7 | 	"testing"
8 | 	"time"
9 | 
10 | 	gometrics "github.com/hashicorp/go-metrics/compat"
11 | 	"github.com/stretchr/testify/require"
12 | )
13 | 
14 | func TestGoMetricsCollector(t *testing.T) {
15 | 	cfg := &gometrics.Config{
16 | 		EnableHostname:       false,
17 | 		EnableRuntimeMetrics: false,
18 | 		// FilterDefault is super weird and backwards but "true" means "don't
19 | 		// filter"!
20 | FilterDefault: true, 21 | } 22 | sink := gometrics.NewInmemSink(1*time.Second, 10*time.Second) 23 | gm, err := gometrics.New(cfg, sink) 24 | require.NoError(t, err) 25 | 26 | c := NewGoMetricsCollector( 27 | []string{"myapp", "wal"}, 28 | []gometrics.Label{{Name: "label", Value: "foo"}}, 29 | gm, 30 | ) 31 | 32 | c.IncrementCounter("counter_one", 1) 33 | c.IncrementCounter("counter_one", 1) 34 | c.IncrementCounter("counter_two", 10) 35 | 36 | c.SetGauge("g1", 12345) 37 | 38 | summary := flattenData(sink.Data()) 39 | 40 | require.Equal(t, 2, int(summary.Counters["myapp.wal.counter_one;label=foo"])) 41 | require.Equal(t, 10, int(summary.Counters["myapp.wal.counter_two;label=foo"])) 42 | 43 | require.Equal(t, 12345, int(summary.Gauges["myapp.wal.g1;label=foo"])) 44 | 45 | } 46 | 47 | func flattenData(ivs []*gometrics.IntervalMetrics) Summary { 48 | s := Summary{ 49 | Counters: make(map[string]uint64), 50 | Gauges: make(map[string]uint64), 51 | } 52 | for _, iv := range ivs { 53 | for name, v := range iv.Counters { 54 | s.Counters[name] += uint64(v.Sum) 55 | } 56 | for name, v := range iv.Gauges { 57 | s.Gauges[name] = uint64(v.Value) 58 | } 59 | } 60 | return s 61 | } 62 | -------------------------------------------------------------------------------- /verifier/metrics.go: -------------------------------------------------------------------------------- 1 | // Copyright IBM Corp. 2020, 2025 2 | // SPDX-License-Identifier: MPL-2.0 3 | 4 | package verifier 5 | 6 | import ( 7 | "github.com/hashicorp/raft-wal/metrics" 8 | ) 9 | 10 | var ( 11 | // MetricDefinitions describe the metrics emitted by this library via the 12 | // provided metrics.Collector implementation. It's public so that these can be 13 | // registered during init with metrics clients that support pre-defining 14 | // metrics. 15 | MetricDefinitions = metrics.Definitions{ 16 | Counters: []metrics.Descriptor{ 17 | { 18 | Name: "checkpoints_written", 19 | Desc: "checkpoints_written counts the number of checkpoint entries" + 20 | " written to the LogStore.", 21 | }, 22 | { 23 | Name: "ranges_verified", 24 | Desc: "ranges_verified counts the number of log ranges for which a" + 25 | " verification report has been completed.", 26 | }, 27 | { 28 | Name: "read_checksum_failures", 29 | Desc: "read_checksum_failures counts the number of times a range of" + 30 | " logs between two check points contained at least one corruption.", 31 | }, 32 | { 33 | Name: "write_checksum_failures", 34 | Desc: "write_checksum_failures counts the number of times a follower" + 35 | " has a different checksum to the leader at the point where it" + 36 | " writes to the log. This could be caused by either a disk-corruption" + 37 | " on the leader (unlikely) or some other corruption of the log" + 38 | " entries in-flight.", 39 | }, 40 | { 41 | Name: "dropped_reports", 42 | Desc: "dropped_reports counts how many times the verifier routine was" + 43 | " still busy when the next checksum came in and so verification for" + 44 | " a range was skipped. If you see this happen consider increasing" + 45 | " the interval between checkpoints.", 46 | }, 47 | }, 48 | } 49 | ) 50 | -------------------------------------------------------------------------------- /cmd/waldump/README.md: -------------------------------------------------------------------------------- 1 | # waldump 2 | 3 | A simple command for dumping the contents of WAL segment files to JSON for 4 | debugging. 5 | 6 | ## Usage 7 | 8 | ``` 9 | $ waldump [-after INDEX] [-before INDEX] /path/to/wal/dir 10 | ... 
11 | {"Index":227281,"Term":4,"Type":0,"Data":"hpGEpUNvb3JkhKpBZGp1c3RtZW50yz7pEPrkTc4tpUVycm9yyz/B4NJg87MZpkhlaWdodMs/ABkEWHeDZqNWZWOYyz8FyF63P/XOyz8Fe2fyqYpayz7eXgvdsOWVyz7xX/ARy9MByz7XZq0fmx5eyz7x8ic7zxhJy78EgvusSgKUy77xVfw2sEr5pE5vZGWiczGpUGFydGl0aW9uoKdTZWdtZW50oA==","Extensions":null,"AppendedAt":"2023-03-23T12:24:05.440317Z"} 12 | ... 13 | ``` 14 | 15 | Each `raft.Log` is written out as JSON followed by a newline. The `Data` and 16 | `Extensions` fields are opaque byte strings that will be base64 encoded. 17 | Decoding those requires knowledge of the encoding used by the writing 18 | application. 19 | 20 | ## Limitations 21 | 22 | This tool is designed for debugging only. It does _not_ inspect the wal-meta 23 | database. This has the nice property that you can safely dump the contexts of 24 | WAL files even while the application is still writing to the WAL since we don't 25 | have to take a lock on the meta database. 26 | 27 | The downside is that this tool might in some edge cases output logs that have 28 | already been deleted from the WAL. It's possible although extremely unlikely 29 | that the WAL could be in the process of truncating the tail which could result 30 | in there being both pre-truncate and post-truncate segment files present. This 31 | tool might possibly output duplicate and out-of-order log indexes from before 32 | and after the truncation. Or if `before` and `after` are used, it's possible we 33 | might skip records entirely because an older file that has already been removed 34 | was read instead of the newer one. These are all very unlikely in practice and 35 | if the application that writes the WAL is still up and running are likely to be 36 | resolved by the time you run the tool again. -------------------------------------------------------------------------------- /types/vfs.go: -------------------------------------------------------------------------------- 1 | // Copyright IBM Corp. 2020, 2025 2 | // SPDX-License-Identifier: MPL-2.0 3 | 4 | package types 5 | 6 | import "io" 7 | 8 | // VFS is the interface WAL needs to interact with the file system. In 9 | // production it would normally be implemented by RealFS which interacts with 10 | // the operating system FS using standard go os package. It's useful to allow 11 | // testing both to run quicker (by being in memory only) and to make it easy to 12 | // simulate all kinds of disk errors and failure modes without needing a more 13 | // elaborate external test harness like ALICE. 14 | type VFS interface { 15 | // ListDir returns a list of all files in the specified dir in lexicographical 16 | // order. If the dir doesn't exist, it must return an error. Empty array with 17 | // nil error is assumed to mean that the directory exists and was readable, 18 | // but contains no files. 19 | ListDir(dir string) ([]string, error) 20 | 21 | // Create creates a new file with the given name. If a file with the same name 22 | // already exists an error is returned. If a non-zero size is given, 23 | // implementations should make a best effort to pre-allocate the file to be 24 | // that size. The dir must already exist and be writable to the current 25 | // process. 26 | Create(dir, name string, size uint64) (WritableFile, error) 27 | 28 | // Delete indicates the file is no longer required. Typically it should be 29 | // deleted from the underlying system to free disk space. 30 | Delete(dir, name string) error 31 | 32 | // OpenReader opens an existing file in read-only mode. 
If the file doesn't 33 | // exist or permission is denied, an error is returned, otherwise no checks 34 | // are made about the well-formedness of the file, it may be empty, the wrong 35 | // size or corrupt in arbitrary ways. 36 | OpenReader(dir, name string) (ReadableFile, error) 37 | 38 | // OpenWriter opens a file in read-write mode. If the file doesn't exist or 39 | // permission is denied, an error is returned, otherwise no checks are made 40 | // about the well-formedness of the file, it may be empty, the wrong size or 41 | // corrupt in arbitrary ways. 42 | OpenWriter(dir, name string) (WritableFile, error) 43 | } 44 | 45 | // WritableFile provides random read-write access to a file as well as the 46 | // ability to fsync it to disk. 47 | type WritableFile interface { 48 | io.WriterAt 49 | io.ReaderAt 50 | io.Closer 51 | 52 | Sync() error 53 | } 54 | 55 | // ReadableFile provides random read access to a file. 56 | type ReadableFile interface { 57 | io.ReaderAt 58 | io.Closer 59 | } 60 | -------------------------------------------------------------------------------- /options.go: -------------------------------------------------------------------------------- 1 | // Copyright IBM Corp. 2020, 2025 2 | // SPDX-License-Identifier: MPL-2.0 3 | 4 | package wal 5 | 6 | import ( 7 | "fmt" 8 | 9 | "github.com/hashicorp/go-hclog" 10 | "github.com/hashicorp/raft-wal/fs" 11 | "github.com/hashicorp/raft-wal/metadb" 12 | "github.com/hashicorp/raft-wal/metrics" 13 | "github.com/hashicorp/raft-wal/segment" 14 | "github.com/hashicorp/raft-wal/types" 15 | ) 16 | 17 | // WithCodec is an option that allows a custom Codec to be provided to the WAL. 18 | // If not used the default Codec is used. 19 | func WithCodec(c Codec) walOpt { 20 | return func(w *WAL) { 21 | w.codec = c 22 | } 23 | } 24 | 25 | // WithMetaStore is an option that allows a custom MetaStore to be provided to 26 | // the WAL. If not used the default MetaStore is used. 27 | func WithMetaStore(db types.MetaStore) walOpt { 28 | return func(w *WAL) { 29 | w.metaDB = db 30 | } 31 | } 32 | 33 | // WithSegmentFiler is an option that allows a custom SegmentFiler (and hence 34 | // Segment Reader/Writer implementation) to be provided to the WAL. If not used 35 | // the default SegmentFiler is used. 36 | func WithSegmentFiler(sf types.SegmentFiler) walOpt { 37 | return func(w *WAL) { 38 | w.sf = sf 39 | } 40 | } 41 | 42 | // WithLogger is an option that allows a custom logger to be used. 43 | func WithLogger(logger hclog.Logger) walOpt { 44 | return func(w *WAL) { 45 | w.log = logger 46 | } 47 | } 48 | 49 | // WithSegmentSize is an option that allows a custom segmentSize to be set. 50 | func WithSegmentSize(size int) walOpt { 51 | return func(w *WAL) { 52 | w.segmentSize = size 53 | } 54 | } 55 | 56 | // WithMetricsCollector is an option that allows a custom segmentSize to be set. 57 | func WithMetricsCollector(c metrics.Collector) walOpt { 58 | return func(w *WAL) { 59 | w.metrics = c 60 | } 61 | } 62 | 63 | func (w *WAL) applyDefaultsAndValidate() error { 64 | // Check if an external codec has been used that it's not using a reserved ID. 
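	// (Illustrative aside, not part of the original file: options are applied
	// at construction time, so a typical caller looks roughly like
	//
	//	w, err := wal.Open(dir, wal.WithSegmentSize(64*1024*1024), wal.WithLogger(l))
	//
	// wal.Open's exact signature isn't shown in this dump, so treat the call
	// above as a hedged sketch.)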
65 | 	if w.codec != nil && w.codec.ID() < FirstExternalCodecID {
66 | 		return fmt.Errorf("codec is using a reserved ID (below %d)", FirstExternalCodecID)
67 | 	}
68 | 
69 | 	// Defaults
70 | 	if w.log == nil {
71 | 		w.log = hclog.Default().Named("wal")
72 | 	}
73 | 	if w.codec == nil {
74 | 		w.codec = &BinaryCodec{}
75 | 	}
76 | 	if w.sf == nil {
77 | 		// These are not actually swappable via options right now but we override
78 | 		// them in tests. Only load the default implementations if they are not set.
79 | 		vfs := fs.New()
80 | 		w.sf = segment.NewFiler(w.dir, vfs)
81 | 	}
82 | 	if w.metrics == nil {
83 | 		w.metrics = &metrics.NoOpCollector{}
84 | 	}
85 | 	if w.metaDB == nil {
86 | 		w.metaDB = &metadb.BoltMetaDB{}
87 | 	}
88 | 	if w.segmentSize == 0 {
89 | 		w.segmentSize = DefaultSegmentSize
90 | 	}
91 | 	return nil
92 | }
93 | 
-------------------------------------------------------------------------------- /codec_test.go: --------------------------------------------------------------------------------
1 | // Copyright IBM Corp. 2020, 2025
2 | // SPDX-License-Identifier: MPL-2.0
3 | 
4 | package wal
5 | 
6 | import (
7 | 	"bytes"
8 | 	"testing"
9 | 	"time"
10 | 
11 | 	fuzz "github.com/google/gofuzz"
12 | 	"github.com/hashicorp/raft"
13 | 	"github.com/stretchr/testify/require"
14 | )
15 | 
16 | // TestBinaryCodecFuzz tests that our codec can decode whatever it encoded.
17 | // Because we are using a reflection-based fuzzer to assign random values to all
18 | // fields this test will also catch any changes in a later version of raft that
19 | // add new fields since our codec will "lose" them.
20 | func TestBinaryCodecFuzz(t *testing.T) {
21 | 	rounds := 1000
22 | 
23 | 	f := fuzz.New().Funcs(
24 | 		// Stub time since gofuzz generates unencodable times depending on your
25 | 		// local timezone! On my computer in GMT timezone, it will generate Times
26 | 		// that are unencodable for some reason I don't understand. All it's doing
27 | 		// is picking a random UnixTimestamp but for some reason that is sometimes
28 | 		// unencodable?
29 | 		func(t *time.Time, c fuzz.Continue) {
30 | 			// This is copied from fuzzTime in gofuzz but with a fix until it's
31 | 			// accepted upstream.
32 | 			var sec, nsec int64
33 | 			// Allow for about 1000 years of random time values, which keeps things
34 | 			// like JSON parsing reasonably happy.
35 | 			sec = c.Rand.Int63n(1000 * 365 * 24 * 60 * 60)
36 | 			nsec = c.Rand.Int63n(999_999_999)
37 | 			*t = time.Unix(sec, nsec)
38 | 		},
39 | 	)
40 | 	c := BinaryCodec{}
41 | 
42 | 	require.Equal(t, CodecBinaryV1, c.ID())
43 | 
44 | 	var buf bytes.Buffer
45 | 
46 | 	for i := 0; i < rounds; i++ {
47 | 		var log, log2 raft.Log
48 | 		f.Fuzz(&log)
49 | 		buf.Reset()
50 | 
51 | 		err := c.Encode(&log, &buf)
52 | 		require.NoError(t, err)
53 | 
54 | 		err = c.Decode(buf.Bytes(), &log2)
55 | 		require.NoError(t, err)
56 | 
57 | 		t.Logf("log %#v.
Binary: % x", log, buf.Bytes()) 58 | 59 | require.Equal(t, log, log2) 60 | } 61 | } 62 | 63 | func TestBinaryCodecCopysOnDecode(t *testing.T) { 64 | var in, out raft.Log 65 | 66 | in.Index = 1234 67 | in.Term = 2 68 | in.Type = raft.LogCommand 69 | in.Data = []byte("foo") 70 | in.Extensions = []byte("ext") 71 | 72 | c := BinaryCodec{} 73 | var buf bytes.Buffer 74 | require.NoError(t, c.Encode(&in, &buf)) 75 | 76 | rawBytes := buf.Bytes() 77 | 78 | require.NoError(t, c.Decode(rawBytes, &out)) 79 | 80 | // Make sure the decoded data is the same 81 | require.Equal(t, string(out.Data), "foo") 82 | require.Equal(t, string(out.Extensions), "ext") 83 | 84 | // Intentionally mangle the buffer contents 85 | for i := 0; i < len(rawBytes); i++ { 86 | rawBytes[i] = 'x' 87 | } 88 | 89 | // Make sure the decoded data is still the same (i.e. didn't refer to the 90 | // underlying bytes) 91 | require.Equal(t, string(out.Data), "foo") 92 | require.Equal(t, string(out.Extensions), "ext") 93 | 94 | } 95 | -------------------------------------------------------------------------------- /metrics/atomic_collector.go: -------------------------------------------------------------------------------- 1 | // Copyright IBM Corp. 2020, 2025 2 | // SPDX-License-Identifier: MPL-2.0 3 | 4 | package metrics 5 | 6 | import "sync/atomic" 7 | 8 | var ( 9 | _ Collector = &AtomicCollector{} 10 | ) 11 | 12 | // AtomicCollector is a simple Collector that atomically stores 13 | // counters and gauges in memory. 14 | type AtomicCollector struct { 15 | counters []uint64 16 | gauges []uint64 17 | 18 | counterIndex, gaugeIndex map[string]int 19 | } 20 | 21 | // NewAtomicCollector creates a collector for the given set of Definitions. 22 | func NewAtomicCollector(defs Definitions) *AtomicCollector { 23 | c := &AtomicCollector{ 24 | counters: make([]uint64, len(defs.Counters)), 25 | gauges: make([]uint64, len(defs.Gauges)), 26 | counterIndex: make(map[string]int), 27 | gaugeIndex: make(map[string]int), 28 | } 29 | for i, d := range defs.Counters { 30 | if _, ok := c.counterIndex[d.Name]; ok { 31 | panic("duplicate metrics named " + d.Name) 32 | } 33 | c.counterIndex[d.Name] = i 34 | } 35 | for i, d := range defs.Gauges { 36 | if _, ok := c.counterIndex[d.Name]; ok { 37 | panic("duplicate metrics named " + d.Name) 38 | } 39 | if _, ok := c.gaugeIndex[d.Name]; ok { 40 | panic("duplicate metrics named " + d.Name) 41 | } 42 | c.gaugeIndex[d.Name] = i 43 | } 44 | return c 45 | } 46 | 47 | // IncrementCounter record val occurrences of the named event. Names will 48 | // follow prometheus conventions with lower_case_and_underscores. We don't 49 | // need any additional labels currently. 50 | func (c *AtomicCollector) IncrementCounter(name string, delta uint64) { 51 | id, ok := c.counterIndex[name] 52 | if !ok { 53 | panic("invalid metric name: " + name) 54 | } 55 | atomic.AddUint64(&c.counters[id], delta) 56 | } 57 | 58 | // SetGauge sets the value of the named gauge overriding any previous value. 59 | func (c *AtomicCollector) SetGauge(name string, val uint64) { 60 | id, ok := c.gaugeIndex[name] 61 | if !ok { 62 | panic("invalid metric name: " + name) 63 | } 64 | atomic.StoreUint64(&c.gauges[id], val) 65 | } 66 | 67 | // Summary returns a summary of the metrics since startup. Each value is 68 | // atomically loaded but the set is not atomic overall and may represent an 69 | // inconsistent snapshot e.g. with some metrics reflecting the most recent 70 | // operation while others don't. 
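//
// Hedged usage sketch (AtomicCollector and Summary as defined in this file;
// defs is a hypothetical metrics.Definitions value):
//
//	c := NewAtomicCollector(defs)
//	c.IncrementCounter("log_appends", 1)
//	fmt.Println(c.Summary().Counters["log_appends"]) // 1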
71 | func (c *AtomicCollector) Summary() Summary { 72 | s := Summary{ 73 | Counters: make(map[string]uint64, len(c.counters)), 74 | Gauges: make(map[string]uint64, len(c.gauges)), 75 | } 76 | for name, id := range c.counterIndex { 77 | s.Counters[name] = atomic.LoadUint64(&c.counters[id]) 78 | } 79 | for name, id := range c.gaugeIndex { 80 | s.Gauges[name] = atomic.LoadUint64(&c.gauges[id]) 81 | } 82 | return s 83 | } 84 | 85 | // Summary is a copy of the values recorded so far for each metric. 86 | type Summary struct { 87 | Counters map[string]uint64 88 | Gauges map[string]uint64 89 | } 90 | -------------------------------------------------------------------------------- /.github/workflows/go-tests.yml: -------------------------------------------------------------------------------- 1 | # Copyright (c) HashiCorp, Inc. 2 | 3 | name: go-tests 4 | 5 | on: [push] 6 | 7 | env: 8 | TEST_RESULTS: /tmp/test-results 9 | 10 | jobs: 11 | 12 | go-tests: 13 | runs-on: ubuntu-latest 14 | strategy: 15 | matrix: 16 | go-version: [ 1.22 ] 17 | 18 | steps: 19 | - name: Setup go 20 | uses: actions/setup-go@d35c59abb061a4a6fb18e82ac0862c26744d6ab5 # v5.5.0 21 | with: 22 | go-version: ${{ matrix.go-version }} 23 | 24 | - name: Checkout code 25 | uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 26 | 27 | - name: Create test directory 28 | run: | 29 | mkdir -p ${{ env.TEST_RESULTS }} 30 | 31 | - name: Download go modules 32 | run: go mod download 33 | 34 | - name: Cache / restore go modules 35 | uses: actions/cache@0400d5f644dc74513175e3cd8d07132dd4860809 # v4.2.4 36 | with: 37 | path: | 38 | ~/go/pkg/mod 39 | key: ${{ runner.os }}-go-${{ hashFiles('**/go.sum') }} 40 | restore-keys: | 41 | ${{ runner.os }}-go- 42 | 43 | # Check go fmt output because it does not report non-zero when there are fmt changes 44 | - name: Run gofmt 45 | run: | 46 | go fmt ./... 47 | files=$(go fmt ./...) 48 | if [ -n "$files" ]; then 49 | echo "The following file(s) do not conform to go fmt:" 50 | echo "$files" 51 | exit 1 52 | fi 53 | 54 | # Install gotestsum 55 | - name: Install gotestsum 56 | run: | 57 | GTS="gotest.tools/gotestsum@v1.8.2" 58 | # We use the same error message prefix in either failure case, so just define it once here. 59 | ERROR="Failed to install $GTS" 60 | # First try to 'go install', if that fails try 'go get'... 61 | go install "$GTS" || go get "$GTS" || { echo "$ERROR: both 'go install' and 'go get' failed"; exit 1; } 62 | # Check that the gotestsum command was actually installed in the path... 63 | command -v gotestsum > /dev/null 2>&1 || { echo "$ERROR: gotestsum command not installed"; exit 1; } 64 | echo "OK: Command 'gotestsum' installed ($GTS)" 65 | 66 | - name: Run go tests 67 | run: | 68 | PACKAGE_NAMES=$(go list ./...) 69 | gotestsum --format=short-verbose \ 70 | --junitfile $TEST_RESULTS/gotestsum-report.xml \ 71 | -- \ 72 | -coverprofile $TEST_RESULTS/coverage.out \ 73 | -race $PACKAGE_NAMES 74 | go tool cover -html=$TEST_RESULTS/coverage.out -o $TEST_RESULTS/coverage.html 75 | 76 | # Save coverage report parts 77 | - name: Upload and save artifacts 78 | uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 79 | with: 80 | name: Test Results 81 | path: ${{ env.TEST_RESULTS }} 82 | -------------------------------------------------------------------------------- /metrics.go: -------------------------------------------------------------------------------- 1 | // Copyright IBM Corp. 
2020, 2025
2 | // SPDX-License-Identifier: MPL-2.0
3 | 
4 | package wal
5 | 
6 | import (
7 | 	"github.com/hashicorp/raft-wal/metrics"
8 | )
9 | 
10 | var (
11 | 	// MetricDefinitions describe the metrics emitted by this library via the
12 | 	// provided metrics.Collector implementation. It's public so that these can be
13 | 	// registered during init with metrics clients that support pre-defining
14 | 	// metrics.
15 | 	MetricDefinitions = metrics.Definitions{
16 | 		Counters: []metrics.Descriptor{
17 | 			{
18 | 				Name: "log_entry_bytes_written",
19 | 				Desc: "log_entry_bytes_written counts the bytes of log entry after encoding" +
20 | 					" with Codec. Actual bytes written to disk might be slightly higher as it" +
21 | 					" includes headers and index entries.",
22 | 			},
23 | 			{
24 | 				Name: "log_entries_written",
25 | 				Desc: "log_entries_written counts the number of entries written.",
26 | 			},
27 | 			{
28 | 				Name: "log_appends",
29 | 				Desc: "log_appends counts the number of calls to StoreLog(s) i.e." +
30 | 					" number of batches of entries appended.",
31 | 			},
32 | 			{
33 | 				Name: "log_entry_bytes_read",
34 | 				Desc: "log_entry_bytes_read counts the bytes of log entry read from" +
35 | 					" segments before decoding. actual bytes read from disk might be higher" +
36 | 					" as it includes headers and index entries and possible secondary reads" +
37 | 					" for large entries that don't fit in buffers.",
38 | 			},
39 | 			{
40 | 				Name: "log_entries_read",
41 | 				Desc: "log_entries_read counts the number of calls to get_log.",
42 | 			},
43 | 			{
44 | 				Name: "segment_rotations",
45 | 				Desc: "segment_rotations counts how many times we move to a new segment file.",
46 | 			},
47 | 			{
48 | 				Name: "head_truncations",
49 | 				Desc: "head_truncations counts how many log entries have been truncated" +
50 | 					" from the head - i.e. the oldest entries. by graphing the rate of" +
51 | 					" change over time you can see individual truncate calls as spikes.",
52 | 			},
53 | 			{
54 | 				Name: "tail_truncations",
55 | 				Desc: "tail_truncations counts how many log entries have been truncated" +
56 | 					" from the tail - i.e. the newest entries. by graphing the rate of" +
57 | 					" change over time you can see individual truncate calls as spikes.",
58 | 			},
59 | 			{
60 | 				Name: "stable_gets",
61 | 				Desc: "stable_gets counts how many calls to StableStore.Get or GetUint64.",
62 | 			},
63 | 			{
64 | 				Name: "stable_sets",
65 | 				Desc: "stable_sets counts how many calls to StableStore.Set or SetUint64.",
66 | 			},
67 | 		},
68 | 		Gauges: []metrics.Descriptor{
69 | 			{
70 | 				Name: "last_segment_age_seconds",
71 | 				Desc: "last_segment_age_seconds is a gauge that is set each time we" +
72 | 					" rotate a segment and describes the number of seconds between when" +
73 | 					" that segment file was first created and when it was sealed. this" +
74 | 					" gives a rough estimate how quickly writes are filling the disk.",
75 | 			},
76 | 		},
77 | 	}
78 | )
79 | 
-------------------------------------------------------------------------------- /segment/reader_test.go: --------------------------------------------------------------------------------
1 | // Copyright IBM Corp.
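MetricDefinitions above is exported precisely so metric names can be pre-registered with clients that support declaring metrics up front. A minimal sketch that enumerates the definitions (the printing stands in for whatever registration API your metrics client offers):

```go
package main

import (
	"fmt"

	wal "github.com/hashicorp/raft-wal"
)

func main() {
	// Walk every counter and gauge this library can emit.
	for _, d := range wal.MetricDefinitions.Counters {
		fmt.Printf("counter %-28s %s\n", d.Name, d.Desc)
	}
	for _, d := range wal.MetricDefinitions.Gauges {
		fmt.Printf("gauge   %-28s %s\n", d.Name, d.Desc)
	}
}
```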
2020, 2025 2 | // SPDX-License-Identifier: MPL-2.0 3 | 4 | package segment 5 | 6 | import ( 7 | "fmt" 8 | "strings" 9 | "testing" 10 | "time" 11 | 12 | "github.com/hashicorp/raft-wal/types" 13 | "github.com/stretchr/testify/require" 14 | ) 15 | 16 | type entryDesc struct { 17 | len, num int 18 | } 19 | 20 | func TestReader(t *testing.T) { 21 | cases := []struct { 22 | name string 23 | firstIndex uint64 24 | entries []entryDesc 25 | corrupt func(twf *testWritableFile) error 26 | wantLastIndex uint64 27 | wantOpenErr string 28 | }{ 29 | { 30 | name: "basic sealed", 31 | firstIndex: 1, 32 | entries: []entryDesc{ 33 | // 28 * 128 bytes entries are all that will fit in a 4KiB segment after 34 | // headers and index size are accounted for. 35 | {len: 128, num: 28}, 36 | }, 37 | wantLastIndex: 28, 38 | }, 39 | { 40 | name: "value larger than minBufSize", 41 | firstIndex: 1, 42 | entries: []entryDesc{ 43 | {len: 128, num: 5}, 44 | {len: minBufSize + 10, num: 1}, 45 | }, 46 | wantLastIndex: 6, 47 | }, 48 | { 49 | name: "sealed file truncated", 50 | firstIndex: 1, 51 | entries: []entryDesc{ 52 | {len: 128, num: 28}, 53 | }, 54 | corrupt: func(twf *testWritableFile) error { 55 | twf.Truncate(0) 56 | return nil 57 | }, 58 | wantOpenErr: "corrupt", 59 | }, 60 | } 61 | 62 | for _, tc := range cases { 63 | tc := tc 64 | t.Run(tc.name, func(t *testing.T) { 65 | vfs := newTestVFS() 66 | 67 | f := NewFiler("test", vfs) 68 | 69 | seg0 := testSegment(1) 70 | 71 | w, err := f.Create(seg0) 72 | require.NoError(t, err) 73 | defer w.Close() 74 | 75 | // Append previous entries. We just pick a fixed size and format that's 76 | // easy to verify but generally fits in our test block size. 77 | idx := tc.firstIndex 78 | wantLength := make(map[uint64]int) 79 | for _, desc := range tc.entries { 80 | // Append individually, could do commit batches but this is all in 81 | // memory so no real benefit. 82 | padLen := 0 83 | if desc.len > 6 { 84 | padLen = desc.len - 6 85 | } 86 | padding := strings.Repeat("P", padLen) 87 | for i := 0; i < desc.num; i++ { 88 | v := fmt.Sprintf("%05d:%s", idx, padding) 89 | err := w.Append([]types.LogEntry{{Index: idx, Data: []byte(v)}}) 90 | require.NoError(t, err, "error appending entry idx=%d", idx) 91 | wantLength[idx] = desc.len 92 | idx++ 93 | } 94 | } 95 | 96 | // Should have sealed 97 | sealed, indexStart, err := w.Sealed() 98 | require.NoError(t, err) 99 | require.True(t, sealed) 100 | 101 | if tc.corrupt != nil { 102 | file := testFileFor(t, w) 103 | require.NoError(t, tc.corrupt(file)) 104 | } 105 | 106 | seg0.IndexStart = indexStart 107 | seg0.MaxIndex = w.LastIndex() 108 | seg0.SealTime = time.Now() 109 | 110 | // Now open the "file" with a reader. 
111 | r, err := f.Open(seg0) 112 | 113 | if tc.wantOpenErr != "" { 114 | require.ErrorContains(t, err, tc.wantOpenErr) 115 | return 116 | } 117 | require.NoError(t, err) 118 | 119 | // Make sure we can read every value 120 | for idx := tc.firstIndex; idx <= tc.wantLastIndex; idx++ { 121 | got, err := r.GetLog(idx) 122 | require.NoError(t, err, "error reading idx=%d", idx) 123 | require.True(t, strings.HasPrefix(string(got.Bs), fmt.Sprintf("%05d:", idx)), "bad value for idx=%d", idx) 124 | require.Len(t, string(got.Bs), wantLength[idx]) 125 | } 126 | 127 | // And we should _not_ read one either side 128 | if tc.firstIndex > 1 { 129 | _, err := r.GetLog(tc.firstIndex - 1) 130 | require.ErrorIs(t, err, types.ErrNotFound) 131 | } 132 | _, err = r.GetLog(tc.wantLastIndex + 1) 133 | require.ErrorIs(t, err, types.ErrNotFound) 134 | }) 135 | } 136 | } 137 | -------------------------------------------------------------------------------- /metrics/gometrics_collector.go: -------------------------------------------------------------------------------- 1 | // Copyright IBM Corp. 2020, 2025 2 | // SPDX-License-Identifier: MPL-2.0 3 | 4 | // # Metrics Configuration 5 | // 6 | // The raft-wal library is instrumented to be able to use different metrics collectors. There are currently two implemented within this package: 7 | // - atomic 8 | // - go-metrics 9 | // 10 | // # go-metrics Compatibility 11 | // 12 | // This library can emit metrics using either github.com/armon/go-metrics or github.com/hashicorp/go-metrics. Choosing between the libraries is controlled via build tags. 13 | // 14 | // Build Tags: 15 | // - armonmetrics - Using this tag will cause metrics to be routed to armon/go-metrics 16 | // - hashicorpmetrics - Using this tag will cause all metrics to be routed to hashicorp/go-metrics 17 | // 18 | // If no build tag is specified, the default behavior is to use armon/go-metrics. 19 | // 20 | // # Deprecating armon/go-metrics 21 | // 22 | // Emitting metrics to armon/go-metrics is officially deprecated. Usage of armon/go-metrics will remain the default until mid-2025 with opt-in support continuing to the end of 2025. 23 | // 24 | // Migration: 25 | // To migrate an application currently using the older armon/go-metrics to instead use hashicorp/go-metrics the following should be done. 26 | // 27 | // 1. Upgrade libraries using armon/go-metrics to consume hashicorp/go-metrics/compat instead. This should involve only changing import statements. All repositories within the hashicorp GitHub organization will be getting these updates in early 2025. 28 | // 29 | // 2. Update an applications library dependencies to those that have the compatibility layer configured. 30 | // 31 | // 3. Update the application to use hashicorp/go-metrics for configuring metrics export instead of armon/go-metrics 32 | // 33 | // - Replace all application imports of github.com/armon/go-metrics with github.com/hashicorp/go-metrics 34 | // 35 | // - Instrument your build system to build with the hashicorpmetrics tag. 36 | // 37 | // Eventually once the default behavior changes to use hashicorp/go-metrics by default (mid-2025), you can drop the hashicorpmetrics build tag. 38 | package metrics 39 | 40 | import gometrics "github.com/hashicorp/go-metrics/compat" 41 | 42 | // GoMetricsCollector implements a Collector that passes through observations to 43 | // a go-metrics instance. 
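//
// (Hedged construction sketch using only names from this file:
//
//	c := NewGoMetricsCollector([]string{"myapp", "wal"}, nil, nil)
//	c.IncrementCounter("log_appends", 1)
//
// nil labels and a nil gm fall back to no labels and the global instance.)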
The zero value works, writing metrics to the default
44 | // global instance; to set a prefix or a static set of labels to add to
45 | // each metric observed, or to use a non-global metrics instance, use
46 | // NewGoMetricsCollector.
47 | type GoMetricsCollector struct {
48 | 	gm     *gometrics.Metrics
49 | 	prefix []string
50 | 	labels []gometrics.Label
51 | }
52 | 
53 | // NewGoMetricsCollector returns a GoMetricsCollector that will attach the
54 | // specified name prefix and/or labels to each observation. If gm is nil the
55 | // global metrics instance is used.
56 | func NewGoMetricsCollector(prefix []string, labels []gometrics.Label, gm *gometrics.Metrics) *GoMetricsCollector {
57 | 	if gm == nil {
58 | 		gm = gometrics.Default()
59 | 	}
60 | 	return &GoMetricsCollector{
61 | 		gm:     gm,
62 | 		prefix: prefix,
63 | 		labels: labels,
64 | 	}
65 | }
66 | 
67 | // IncrementCounter records val occurrences of the named event. Names will
68 | // follow prometheus conventions with lower_case_and_underscores. We don't
69 | // need any additional labels currently.
70 | func (c *GoMetricsCollector) IncrementCounter(name string, delta uint64) {
71 | 	c.gm.IncrCounterWithLabels(c.name(name), float32(delta), c.labels)
72 | }
73 | 
74 | // SetGauge sets the value of the named gauge overriding any previous value.
75 | func (c *GoMetricsCollector) SetGauge(name string, val uint64) {
76 | 	c.gm.SetGaugeWithLabels(c.name(name), float32(val), c.labels)
77 | }
78 | 
79 | // name returns the metric name as a slice. We don't want to risk modifying the
80 | // prefix slice's backing array since this might be called concurrently, so we
81 | // always allocate a new slice.
82 | func (c *GoMetricsCollector) name(name string) []string {
83 | 	var ss []string
84 | 	return append(append(ss, c.prefix...), name)
85 | }
86 | 
-------------------------------------------------------------------------------- /bench/main.go: --------------------------------------------------------------------------------
1 | // Copyright IBM Corp. 2020, 2025
2 | // SPDX-License-Identifier: MPL-2.0
3 | 
4 | package main
5 | 
6 | import (
7 | 	"bytes"
8 | 	"flag"
9 | 	"fmt"
10 | 	"io"
11 | 	"io/ioutil"
12 | 	"os"
13 | 	"path/filepath"
14 | 	"strings"
15 | 	"time"
16 | 
17 | 	"github.com/HdrHistogram/hdrhistogram-go"
18 | 	"github.com/benmathews/bench"
19 | 	"github.com/hashicorp/raft-wal/metadb"
20 | )
21 | 
22 | type opts struct {
23 | 	// LogStore params
24 | 	version        string
25 | 	dir            string
26 | 	segSize        int
27 | 	noFreelistSync bool
28 | 
29 | 	// Common params
30 | 	preLoadN int
31 | 
32 | 	// Append params
33 | 	rate      int
34 | 	duration  time.Duration
35 | 	logSize   int
36 | 	batchSize int
37 | 
38 | 	// Truncate params
39 | 	truncateTrailingLogs int
40 | 	truncatePeriod       time.Duration
41 | }
42 | 
43 | func main() {
44 | 	var o opts
45 | 
46 | 	flag.StringVar(&o.version, "v", "wal", "version to test 'wal' or 'bolt'")
47 | 	flag.StringVar(&o.dir, "dir", "", "dir to write to. If empty will create a tmp dir.
If not empty the dir will delete any existing WAL files present!") 48 | flag.IntVar(&o.rate, "rate", 10, "append rate target per second") 49 | flag.DurationVar(&o.duration, "t", 10*time.Second, "duration of the test") 50 | flag.IntVar(&o.logSize, "s", 128, "size of each log entry appended") 51 | flag.IntVar(&o.batchSize, "n", 1, "number of logs per append batch") 52 | flag.IntVar(&o.segSize, "seg", 64, "segment size in MB") 53 | flag.IntVar(&o.truncateTrailingLogs, "trail", 10000, "number of trailing logs to leave on truncate") 54 | flag.DurationVar(&o.truncatePeriod, "tp", 0, "how often to head truncate back to 'trail' logs during append") 55 | flag.IntVar(&o.preLoadN, "preload", 0, "number of logs to append and then truncate before we start") 56 | flag.BoolVar(&o.noFreelistSync, "no-fl-sync", false, "used to disable freelist sync in boltdb for v=bolt") 57 | flag.Parse() 58 | 59 | var outBuf bytes.Buffer 60 | teeOut := io.MultiWriter(os.Stdout, &outBuf) 61 | 62 | if o.dir == "" { 63 | tmpDir, err := os.MkdirTemp("", "raft-wal-bench-*") 64 | if err != nil { 65 | panic(err) 66 | } 67 | 68 | defer os.RemoveAll(tmpDir) 69 | o.dir = tmpDir 70 | } else { 71 | // Delete metadb and any segment files present 72 | files, err := os.ReadDir(o.dir) 73 | if err != nil { 74 | panic(err) 75 | } 76 | for _, f := range files { 77 | if f.IsDir() { 78 | continue 79 | } 80 | if strings.HasSuffix(f.Name(), ".wal") || f.Name() == metadb.FileName || f.Name() == "raft.db" { 81 | os.RemoveAll(filepath.Join(o.dir, f.Name())) 82 | } 83 | } 84 | } 85 | 86 | // Make the results dir if it doesn't exist 87 | if err := os.MkdirAll(filepath.Join(o.dir, filepath.Dir(outFileName(o, "blah"))), 0755); err != nil { 88 | panic(err) 89 | } 90 | 91 | r := &appendRequesterFactory{ 92 | opts: o, 93 | output: teeOut, 94 | } 95 | benchmark := bench.NewBenchmark(r, uint64(o.rate), 1, o.duration, 0) 96 | summary, err := benchmark.Run() 97 | if err != nil { 98 | panic(err) 99 | } 100 | 101 | printHistogram(teeOut, "Good Append Latencies (ms)", summary.SuccessHistogram, 1_000_000) 102 | 103 | fmt.Fprintln(teeOut, summary) 104 | summary.GenerateLatencyDistribution(nil, outFileName(o, "append-lat")) 105 | ioutil.WriteFile(outFileName(o, "stdout"), outBuf.Bytes(), 0644) 106 | } 107 | 108 | func outFileName(o opts, suffix string) string { 109 | version := o.version 110 | if o.version == "bolt" && o.noFreelistSync { 111 | version += "-nfls" 112 | } 113 | return fmt.Sprintf("bench-result-%s-s%d-n%d-r%d-seg%dm-pre%d-trail%d-tp%s/%s-%s.txt", 114 | o.duration, o.logSize, o.batchSize, o.rate, o.segSize, o.preLoadN, 115 | o.truncateTrailingLogs, o.truncatePeriod, version, suffix) 116 | } 117 | 118 | func printHistogram(f io.Writer, name string, h *hdrhistogram.Histogram, scale int64) { 119 | fmt.Fprintf(f, "\n==> %s\n", name) 120 | fmt.Fprintf(f, " count mean p50 p99 p99.9 max\n") 121 | fmt.Fprintf(f, " %6d %6.0f %6d %6d %6d %6d\n", 122 | h.TotalCount(), 123 | h.Mean()/float64(scale), 124 | h.ValueAtPercentile(50)/scale, 125 | h.ValueAtPercentile(99)/scale, 126 | h.ValueAtPercentile(99.9)/scale, 127 | h.Max()/scale, 128 | ) 129 | } 130 | -------------------------------------------------------------------------------- /fs/fs_test.go: -------------------------------------------------------------------------------- 1 | // Copyright IBM Corp. 
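The fs tests below exercise the production VFS implementation. As a compact orientation, creating, writing, and syncing a segment file through it looks like the following (a sketch; the file name and sizes are arbitrary):

```go
package main

import (
	"fmt"

	"github.com/hashicorp/raft-wal/fs"
)

func main() {
	v := fs.New()
	// Pre-allocate a 64KiB file; Create fails if the file already exists.
	f, err := v.Create("/tmp", "00001-example.wal", 64*1024)
	if err != nil {
		panic(err)
	}
	defer f.Close()
	if _, err := f.WriteAt([]byte("frame"), 0); err != nil {
		panic(err)
	}
	// The first Sync also fsyncs the parent dir so the new file survives a crash.
	if err := f.Sync(); err != nil {
		panic(err)
	}
	fmt.Println("durable")
}
```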
2020, 2025 2 | // SPDX-License-Identifier: MPL-2.0 3 | 4 | package fs 5 | 6 | import ( 7 | "bytes" 8 | "io" 9 | "os" 10 | "path/filepath" 11 | "testing" 12 | 13 | "github.com/stretchr/testify/require" 14 | ) 15 | 16 | func TestFS(t *testing.T) { 17 | tmpDir, err := os.MkdirTemp("", "raft-wal-fs-test-*") 18 | require.NoError(t, err) 19 | defer os.RemoveAll(tmpDir) 20 | 21 | fs := New() 22 | 23 | // List should return nothing 24 | files, err := fs.ListDir(tmpDir) 25 | require.NoError(t, err) 26 | require.Len(t, files, 0) 27 | 28 | // Create a new file 29 | wf, err := fs.Create(tmpDir, "00001-abcd1234.wal", 512*1024) 30 | require.NoError(t, err) 31 | defer wf.Close() 32 | 33 | // Should be pre-allocated (on supported file systems). 34 | // TODO work out if this is reliable in CI or if we can detect supported FSs?) 35 | info, err := os.Stat(filepath.Join(tmpDir, "00001-abcd1234.wal")) 36 | require.NoError(t, err) 37 | require.Equal(t, int64(512*1024), info.Size()) 38 | 39 | // Should be able to write data in any order 40 | n, err := wf.WriteAt(bytes.Repeat([]byte{'2'}, 1024), 1024) 41 | require.NoError(t, err) 42 | require.Equal(t, 1024, n) 43 | 44 | n, err = wf.WriteAt(bytes.Repeat([]byte{'1'}, 1024), 0) 45 | require.NoError(t, err) 46 | require.Equal(t, 1024, n) 47 | 48 | // And past the preallocated end. 49 | n, err = wf.WriteAt(bytes.Repeat([]byte{'3'}, 1024), 512*1024) 50 | require.NoError(t, err) 51 | require.Equal(t, 1024, n) 52 | 53 | // And sync them 54 | require.NoError(t, wf.Sync()) 55 | 56 | // And read them back 57 | rf, err := fs.OpenReader(tmpDir, "00001-abcd1234.wal") 58 | require.NoError(t, err) 59 | defer rf.Close() 60 | 61 | var buf [1024]byte 62 | n, err = rf.ReadAt(buf[:], 1024) 63 | require.NoError(t, err) 64 | require.Equal(t, len(buf), n) 65 | require.Equal(t, byte('2'), buf[0]) 66 | 67 | n, err = rf.ReadAt(buf[:], 0) 68 | require.NoError(t, err) 69 | require.Equal(t, len(buf), n) 70 | require.Equal(t, byte('1'), buf[0]) 71 | 72 | n, err = rf.ReadAt(buf[:], 512*1024) 73 | require.NoError(t, err) 74 | require.Equal(t, len(buf), n) 75 | require.Equal(t, byte('3'), buf[0]) 76 | 77 | // Read off end is an error 78 | _, err = rf.ReadAt(buf[:], 513*1024) 79 | require.ErrorIs(t, err, io.EOF) 80 | 81 | // Should also be able to re-open writable file. 82 | wf.Close() 83 | wf, err = fs.OpenWriter(tmpDir, "00001-abcd1234.wal") 84 | require.NoError(t, err) 85 | 86 | // And write more 87 | n, err = wf.WriteAt(bytes.Repeat([]byte{'4'}, 1024), 2048) 88 | require.NoError(t, err) 89 | require.Equal(t, 1024, n) 90 | require.NoError(t, wf.Sync()) 91 | 92 | // And read back prior and new data through the writer. Read across the old 93 | // and new data written - first byte is old data rest is new. 
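// (From the writes above: bytes 1024..2047 hold the '2' data and bytes
// 2048..3071 hold the just-written '4' data, so a read starting at offset
// 2047 should see buf[0]=='2' and buf[1]=='4'.)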
94 | n, err = wf.ReadAt(buf[:], 2047) 95 | require.NoError(t, err) 96 | require.Equal(t, len(buf), n) 97 | require.Equal(t, byte('2'), buf[0]) 98 | require.Equal(t, byte('4'), buf[1]) 99 | 100 | // The already open reader should also be able to read that newly written data 101 | n, err = rf.ReadAt(buf[:], 2048) 102 | require.NoError(t, err) 103 | require.Equal(t, len(buf), n) 104 | require.Equal(t, byte('4'), buf[0]) 105 | 106 | // List should return file now 107 | files, err = fs.ListDir(tmpDir) 108 | require.NoError(t, err) 109 | require.Equal(t, []string{"00001-abcd1234.wal"}, files) 110 | 111 | // Delete should work 112 | require.NoError(t, fs.Delete(tmpDir, "00001-abcd1234.wal")) 113 | 114 | files, err = fs.ListDir(tmpDir) 115 | require.NoError(t, err) 116 | require.Equal(t, []string{}, files) 117 | } 118 | 119 | func TestRealFSNoDir(t *testing.T) { 120 | fs := New() 121 | 122 | _, err := fs.ListDir("/not-a-real-dir") 123 | require.Error(t, err) 124 | require.Contains(t, err.Error(), "no such file or directory") 125 | 126 | _, err = fs.Create("/not-a-real-dir", "foo", 1024) 127 | require.Error(t, err) 128 | require.Contains(t, err.Error(), "no such file or directory") 129 | 130 | _, err = fs.OpenReader("/not-a-real-dir", "foo") 131 | require.Error(t, err) 132 | require.Contains(t, err.Error(), "no such file or directory") 133 | 134 | _, err = fs.OpenWriter("/not-a-real-dir", "foo") 135 | require.Error(t, err) 136 | require.Contains(t, err.Error(), "no such file or directory") 137 | } 138 | -------------------------------------------------------------------------------- /metadb/metadb_test.go: -------------------------------------------------------------------------------- 1 | // Copyright IBM Corp. 2020, 2025 2 | // SPDX-License-Identifier: MPL-2.0 3 | 4 | package metadb 5 | 6 | import ( 7 | "io/ioutil" 8 | "os" 9 | "testing" 10 | "time" 11 | 12 | "github.com/hashicorp/raft-wal/types" 13 | "github.com/stretchr/testify/require" 14 | ) 15 | 16 | func TestMetaDB(t *testing.T) { 17 | cases := []struct { 18 | name string 19 | writeState *types.PersistentState 20 | writeStable map[string][]byte 21 | failSim func() 22 | }{ 23 | { 24 | name: "basic storage", 25 | writeState: makeState(4), 26 | writeStable: map[string][]byte{ 27 | "CurrentTerm": []byte{0, 0, 0, 0, 0, 0, 0, 5}, 28 | "LastVoteTerm": []byte{0, 0, 0, 0, 0, 0, 0, 5}, 29 | "LastVoteCand": []byte("server1"), 30 | }, 31 | }, 32 | } 33 | 34 | for _, tc := range cases { 35 | tc := tc 36 | t.Run(tc.name, func(t *testing.T) { 37 | tmpDir, err := os.MkdirTemp("", "raft-wal-meta-test-*") 38 | require.NoError(t, err) 39 | defer os.RemoveAll(tmpDir) 40 | 41 | { 42 | // Should be able to load the DB 43 | var db BoltMetaDB 44 | gotState, err := db.Load(tmpDir) 45 | require.NoError(t, err) 46 | defer db.Close() 47 | 48 | require.Equal(t, 0, int(gotState.NextSegmentID)) 49 | require.Empty(t, gotState.Segments) 50 | 51 | if tc.writeState != nil { 52 | require.NoError(t, db.CommitState(*tc.writeState)) 53 | } 54 | for k, v := range tc.writeStable { 55 | require.NoError(t, db.SetStable([]byte(k), v)) 56 | } 57 | 58 | // Close DB and re-open a new one to ensure persistence. 
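// (The explicit Close below matters: BoltMetaDB is backed by bbolt, which
// holds an exclusive lock on its file, so re-opening without closing first
// would be expected to block or fail rather than exercise persistence.)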
59 | db.Close() 60 | } 61 | 62 | var db BoltMetaDB 63 | gotState, err := db.Load(tmpDir) 64 | require.NoError(t, err) 65 | 66 | require.Equal(t, *tc.writeState, gotState) 67 | 68 | for k, v := range tc.writeStable { 69 | got, err := db.GetStable([]byte(k)) 70 | require.NoError(t, err) 71 | require.Equal(t, v, got) 72 | } 73 | }) 74 | } 75 | } 76 | 77 | func TestMetaDBErrors(t *testing.T) { 78 | tmpDir, err := os.MkdirTemp("", "raft-wal-meta-test-*") 79 | require.NoError(t, err) 80 | defer os.RemoveAll(tmpDir) 81 | 82 | var db BoltMetaDB 83 | 84 | // Calling anything before load is an error 85 | require.ErrorIs(t, db.CommitState(types.PersistentState{NextSegmentID: 1234}), ErrUnintialized) 86 | 87 | _, err = db.GetStable([]byte("foo")) 88 | require.ErrorIs(t, err, ErrUnintialized) 89 | 90 | err = db.SetStable([]byte("foo"), []byte("bar")) 91 | require.ErrorIs(t, err, ErrUnintialized) 92 | 93 | // Loading twice is OK from same dir 94 | _, err = db.Load(tmpDir) 95 | require.NoError(t, err) 96 | _, err = db.Load(tmpDir) 97 | require.NoError(t, err) 98 | 99 | // But not from a different (valid) one 100 | tmpDir2, err := ioutil.TempDir("", "wal-fs-test-*") 101 | require.NoError(t, err) 102 | defer os.RemoveAll(tmpDir2) 103 | 104 | _, err = db.Load(tmpDir2) 105 | require.ErrorContains(t, err, "already open in dir") 106 | 107 | // Loading from a non-existent dir is an error 108 | var db2 BoltMetaDB 109 | _, err = db2.Load("fake-dir-that-does-not-exist") 110 | require.ErrorContains(t, err, "no such file or directory") 111 | } 112 | 113 | func makeState(nSegs int) *types.PersistentState { 114 | startIdx := 1000 115 | perSegment := 100 116 | startID := 1234 117 | // Times are pesky. Remove as much stuff that doesn't survive serialisation as 118 | // possible, as we don't really care about it! 119 | startTime := time.Now().UTC().Round(time.Second).Add(time.Duration(-1*nSegs) * time.Minute) 120 | 121 | state := &types.PersistentState{ 122 | NextSegmentID: uint64(startID + nSegs), 123 | } 124 | 125 | for i := 0; i < (nSegs - 1); i++ { 126 | si := types.SegmentInfo{ 127 | ID: uint64(startID + i), 128 | BaseIndex: uint64(startIdx + (i * perSegment)), 129 | MinIndex: uint64(startIdx + (i * perSegment)), 130 | MaxIndex: uint64(startIdx + ((i + 1) * perSegment) - 1), 131 | Codec: 1, 132 | IndexStart: 123456, 133 | CreateTime: startTime.Add(time.Duration(i) * time.Minute), 134 | SealTime: startTime.Add(time.Duration(i+1) * time.Minute), 135 | SizeLimit: 64 * 1024 * 1024, 136 | } 137 | state.Segments = append(state.Segments, si) 138 | } 139 | if nSegs > 0 { 140 | // Append an unsealed tail 141 | i := nSegs - 1 142 | si := types.SegmentInfo{ 143 | ID: uint64(startID + i), 144 | BaseIndex: uint64(startIdx + (i * perSegment)), 145 | MinIndex: uint64(startIdx + (i * perSegment)), 146 | Codec: 1, 147 | CreateTime: startTime.Add(time.Duration(i) * time.Minute), 148 | SizeLimit: 64 * 1024 * 1024, 149 | } 150 | state.Segments = append(state.Segments, si) 151 | } 152 | return state 153 | } 154 | -------------------------------------------------------------------------------- /fs/fs.go: -------------------------------------------------------------------------------- 1 | // Copyright IBM Corp. 
2020, 2025 2 | // SPDX-License-Identifier: MPL-2.0 3 | 4 | package fs 5 | 6 | import ( 7 | "fmt" 8 | "io/ioutil" 9 | "math" 10 | "os" 11 | "path/filepath" 12 | 13 | "github.com/hashicorp/raft-wal/types" 14 | "go.etcd.io/etcd/client/pkg/v3/fileutil" 15 | ) 16 | 17 | // FS implements the wal.VFS interface using Go's built-in OS filesystem (and a 18 | // few helpers). 19 | // 20 | // TODO if we changed the interface to be Dir centric we could cache the open 21 | // dir handle and save some time opening it on each Create in order to fsync. 22 | type FS struct { 23 | } 24 | 25 | func New() *FS { 26 | return &FS{} 27 | } 28 | 29 | // ListDir returns a list of all files in the specified dir in lexicographical 30 | // order. If the dir doesn't exist, it must return an error. Empty array with 31 | // nil error is assumed to mean that the directory exists and was readable, 32 | // but contains no files. 33 | func (fs *FS) ListDir(dir string) ([]string, error) { 34 | files, err := ioutil.ReadDir(dir) 35 | if err != nil { 36 | return nil, err 37 | } 38 | names := make([]string, 0, len(files)) 39 | for _, f := range files { 40 | if f.IsDir() { 41 | continue 42 | } 43 | names = append(names, f.Name()) 44 | } 45 | return names, nil 46 | } 47 | 48 | // Create creates a new file with the given name. If a file with the same name 49 | // already exists an error is returned. If a non-zero size is given, 50 | // implementations should make a best effort to pre-allocate the file to be 51 | // that size. The dir must already exist and be writable to the current 52 | // process. 53 | func (fs *FS) Create(dir string, name string, size uint64) (types.WritableFile, error) { 54 | f, err := os.OpenFile(filepath.Join(dir, name), os.O_CREATE|os.O_EXCL|os.O_RDWR, os.FileMode(0644)) 55 | if err != nil { 56 | return nil, err 57 | } 58 | // We just created the file. Preallocate its size. 59 | if size > 0 { 60 | if size > math.MaxInt32 { 61 | return nil, fmt.Errorf("maximum file size is %d bytes", math.MaxInt32) 62 | } 63 | if err := fileutil.Preallocate(f, int64(size), true); err != nil { 64 | f.Close() 65 | return nil, err 66 | } 67 | } 68 | // We don't fsync here for performance reasons. Technically we need to fsync 69 | // the file itself to make sure it is really persisted to disk, and you always 70 | // need to fsync its parent dir after a creation because fsync doesn't ensure 71 | // the directory entry is persisted - a crash could make the file appear to be 72 | // missing as there is no directory entry. 73 | // 74 | // BUT, it doesn't actually matter if this file is crash safe, right up to the 75 | // point where we actually commit log data. Since we always fsync the file 76 | // when we commit logs, we don't need to again here. That does however leave 77 | // the parent dir fsync which must be done after the first fsync to a newly 78 | // created file to ensure it survives a crash. 79 | // 80 | // To handle that, we return a wrapped os.File that will fsync the parent dir 81 | // as well as the first time Sync is called (and only the first time). 82 | fi := &File{ 83 | new: 0, 84 | dir: dir, 85 | File: *f, 86 | } 87 | return fi, nil 88 | } 89 | 90 | // Delete indicates the file is no longer required. Typically it should be 91 | // deleted from the underlying system to free disk space. 92 | func (fs *FS) Delete(dir string, name string) error { 93 | if err := os.Remove(filepath.Join(dir, name)); err != nil { 94 | return err 95 | } 96 | // Make sure parent directory metadata is fsynced too before we call this 97 | // "done". 
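// Without the directory fsync, a crash shortly after os.Remove could leave
// the old directory entry intact, making the "deleted" file reappear after
// restart.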
98 | return syncDir(dir) 99 | } 100 | 101 | // OpenReader opens an existing file in read-only mode. If the file doesn't 102 | // exist or permission is denied, an error is returned; otherwise no checks 103 | // are made about the well-formedness of the file: it may be empty, the wrong 104 | // size or corrupt in arbitrary ways. 105 | func (fs *FS) OpenReader(dir string, name string) (types.ReadableFile, error) { 106 | return os.OpenFile(filepath.Join(dir, name), os.O_RDONLY, os.FileMode(0644)) 107 | } 108 | 109 | // OpenWriter opens a file in read-write mode. If the file doesn't exist or 110 | // permission is denied, an error is returned; otherwise no checks are made 111 | // about the well-formedness of the file: it may be empty, the wrong size or 112 | // corrupt in arbitrary ways. 113 | func (fs *FS) OpenWriter(dir string, name string) (types.WritableFile, error) { 114 | return os.OpenFile(filepath.Join(dir, name), os.O_RDWR, os.FileMode(0644)) 115 | } 116 | 117 | func syncDir(dir string) error { 118 | f, err := os.Open(dir) 119 | if err != nil { 120 | return err 121 | } 122 | err = f.Sync() 123 | closeErr := f.Close() 124 | if err != nil { 125 | return err 126 | } 127 | return closeErr 128 | } 129 | -------------------------------------------------------------------------------- /codec.go: -------------------------------------------------------------------------------- 1 | // Copyright IBM Corp. 2020, 2025 2 | // SPDX-License-Identifier: MPL-2.0 3 | 4 | package wal 5 | 6 | import ( 7 | "encoding/binary" 8 | "io" 9 | "time" 10 | 11 | "github.com/hashicorp/raft" 12 | ) 13 | 14 | const ( 15 | // FirstExternalCodecID is the lowest value an external codec may use to 16 | // identify itself. Values lower than this are reserved for future 17 | // internal use. 18 | FirstExternalCodecID = 1 << 16 19 | 20 | // Codec* constants identify internally-defined codec identifiers. 21 | CodecBinaryV1 uint64 = iota 22 | ) 23 | 24 | // Codec is the interface required for encoding/decoding log entries. Callers 25 | // can pass a custom one to manage their own serialization, or to add additional 26 | // layers like encryption or compression of records. Each codec is identified by a unique ID. 27 | type Codec interface { 28 | // ID returns the globally unique identifier for this codec version. This is 29 | // encoded into segment file headers and must remain consistent over the life 30 | // of the log. Values up to FirstExternalCodecID are reserved and will error 31 | // if specified externally. 32 | ID() uint64 33 | 34 | // Encode the log into the io.Writer. We pass a writer to allow the caller to 35 | // manage buffer allocation and re-use. 36 | Encode(l *raft.Log, w io.Writer) error 37 | 38 | // Decode a log from the passed byte slice into the log entry pointed to. This 39 | // allows the caller to manage allocation and re-use of the bytes and log 40 | // entry. The resulting raft.Log MUST NOT reference data in the input byte 41 | // slice since the input byte slice may be returned to a pool and re-used. 42 | Decode([]byte, *raft.Log) error 43 | } 44 | 45 | // BinaryCodec is a Codec that encodes raft.Log with a simple binary format. We 46 | // test that all fields are captured using reflection. 47 | // 48 | // For now we assume raft.Log is not likely to change too much. If it does we'll 49 | // use a new Codec ID for the later version and have to support decoding either. 50 | type BinaryCodec struct{} 51 | 52 | // ID returns the globally unique identifier for this codec version. 
This is 53 | // encoded into segment file headers and must remain consistent over the life 54 | // of the log. Values up to FirstExternalCodecID are reserved and will error 55 | // if specified externally. 56 | func (c *BinaryCodec) ID() uint64 { 57 | return CodecBinaryV1 58 | } 59 | 60 | // Encode the log into the io.Writer. We pass a writer to allow the caller to 61 | // manage buffer allocation and re-use. 62 | func (c *BinaryCodec) Encode(l *raft.Log, w io.Writer) error { 63 | enc := encoder{w: w} 64 | enc.varint(l.Index) 65 | enc.varint(l.Term) 66 | enc.varint(uint64(l.Type)) 67 | enc.bytes(l.Data) 68 | enc.bytes(l.Extensions) 69 | enc.time(l.AppendedAt) 70 | return enc.err 71 | } 72 | 73 | // Decode a log from the passed byte slice into the log entry pointed to. This 74 | // allows the caller to manage allocation and re-use of the bytes and log 75 | // entry. 76 | func (c *BinaryCodec) Decode(bs []byte, l *raft.Log) error { 77 | dec := decoder{buf: bs} 78 | l.Index = dec.varint() 79 | l.Term = dec.varint() 80 | l.Type = raft.LogType(dec.varint()) 81 | l.Data = dec.bytes() 82 | l.Extensions = dec.bytes() 83 | l.AppendedAt = dec.time() 84 | return dec.err 85 | } 86 | 87 | type encoder struct { 88 | w io.Writer 89 | err error 90 | scratch [10]byte 91 | } 92 | 93 | func (e *encoder) varint(v uint64) { 94 | if e.err != nil { 95 | return 96 | } 97 | 98 | // Varint encoding might use up to 9 bytes for a uint64 99 | n := binary.PutUvarint(e.scratch[:], v) 100 | _, e.err = e.w.Write(e.scratch[:n]) 101 | } 102 | 103 | func (e *encoder) bytes(bs []byte) { 104 | // Put a length prefix 105 | e.varint(uint64(len(bs))) 106 | if e.err != nil { 107 | return 108 | } 109 | // Copy the bytes to the writer 110 | _, e.err = e.w.Write(bs) 111 | } 112 | 113 | func (e *encoder) time(t time.Time) { 114 | if e.err != nil { 115 | return 116 | } 117 | bs, err := t.MarshalBinary() 118 | if err != nil { 119 | e.err = err 120 | return 121 | } 122 | _, e.err = e.w.Write(bs) 123 | } 124 | 125 | type decoder struct { 126 | buf []byte 127 | err error 128 | } 129 | 130 | func (d *decoder) varint() uint64 { 131 | if d.err != nil { 132 | return 0 133 | } 134 | v, n := binary.Uvarint(d.buf) 135 | d.buf = d.buf[n:] 136 | return v 137 | } 138 | 139 | func (d *decoder) bytes() []byte { 140 | // Get length prefix 141 | n := d.varint() 142 | if d.err != nil { 143 | return nil 144 | } 145 | if n == 0 { 146 | return nil 147 | } 148 | if n > uint64(len(d.buf)) { 149 | d.err = io.ErrShortBuffer 150 | return nil 151 | } 152 | bs := make([]byte, n) 153 | copy(bs, d.buf[:n]) 154 | d.buf = d.buf[n:] 155 | return bs 156 | } 157 | 158 | func (d *decoder) time() time.Time { 159 | var t time.Time 160 | if d.err != nil { 161 | return t 162 | } 163 | // Note that Unmarshal Binary updates d.buf to remove the bytes it read 164 | // already. 165 | d.err = t.UnmarshalBinary(d.buf) 166 | return t 167 | } 168 | -------------------------------------------------------------------------------- /segment/reader.go: -------------------------------------------------------------------------------- 1 | // Copyright IBM Corp. 2020, 2025 2 | // SPDX-License-Identifier: MPL-2.0 3 | 4 | package segment 5 | 6 | import ( 7 | "encoding/binary" 8 | "errors" 9 | "fmt" 10 | "io" 11 | "sync" 12 | 13 | "github.com/hashicorp/raft-wal/types" 14 | ) 15 | 16 | // Reader allows reading logs from a segment file. 
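//
// A caller-side sketch of the buffer contract (illustrative only; decodeLog
// is a hypothetical helper): the PooledBuffer returned by GetLog must be
// closed exactly once, and neither Bs nor any sub-slice of it may be used
// after Close:
//
//	buf, err := r.GetLog(idx)
//	if err != nil {
//		return err
//	}
//	defer buf.Close()
//	return decodeLog(buf.Bs, &log) // must copy, never alias, buf.Bs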
17 | type Reader struct { 18 | info types.SegmentInfo 19 | rf types.ReadableFile 20 | 21 | bufPool *sync.Pool 22 | 23 | // tail optionally provides an interface to the writer state when this is an 24 | // unsealed segment so we can fetch from its in-memory index. 25 | tail tailWriter 26 | } 27 | 28 | type tailWriter interface { 29 | OffsetForFrame(idx uint64) (uint32, error) 30 | } 31 | 32 | func openReader(info types.SegmentInfo, rf types.ReadableFile, bufPool *sync.Pool) (*Reader, error) { 33 | r := &Reader{ 34 | info: info, 35 | rf: rf, 36 | bufPool: bufPool, 37 | } 38 | 39 | return r, nil 40 | } 41 | 42 | // Close implements io.Closer 43 | func (r *Reader) Close() error { 44 | return r.rf.Close() 45 | } 46 | 47 | // GetLog returns the raw log entry bytes associated with idx. If the log 48 | // doesn't exist in this segment types.ErrNotFound must be returned. 49 | func (r *Reader) GetLog(idx uint64) (*types.PooledBuffer, error) { 50 | offset, err := r.findFrameOffset(idx) 51 | if err != nil { 52 | return nil, err 53 | } 54 | 55 | _, payload, err := r.readFrame(offset) 56 | if err != nil { 57 | return nil, err 58 | } 59 | return payload, err 60 | } 61 | 62 | func (r *Reader) readFrame(offset uint32) (frameHeader, *types.PooledBuffer, error) { 63 | buf := r.makeBuffer() 64 | 65 | n, err := r.rf.ReadAt(buf.Bs, int64(offset)) 66 | if errors.Is(err, io.EOF) && n >= frameHeaderLen { 67 | // We might have hit EOF just because our read buffer (at least 64KiB) might 68 | // be larger than the space left in the file (say if files are tiny or if we 69 | // are reading a frame near the end). So don't treat EOF as an error as 70 | // long as we have actually managed to read a frameHeader - we'll work out 71 | // if we got the whole thing or not below. 72 | err = nil 73 | 74 | // Re-slice buf.Bs so its len() reflects only what we actually managed to 75 | // read. Note this doesn't impact the buffer length when it's returned to 76 | // the pool which will still return the whole cap. 77 | buf.Bs = buf.Bs[:n] 78 | } 79 | if err != nil { 80 | return frameHeader{}, nil, err 81 | } 82 | fh, err := readFrameHeader(buf.Bs) 83 | if err != nil { 84 | return fh, nil, err 85 | } 86 | 87 | if (frameHeaderLen + int(fh.len)) <= len(buf.Bs) { 88 | // We already have all we need; just return it sliced to include only 89 | // the payload. 90 | buf.Bs = buf.Bs[frameHeaderLen : frameHeaderLen+fh.len] 91 | return fh, buf, nil 92 | } 93 | // Need to read again, with a bigger buffer, return this one 94 | buf.Close() 95 | 96 | // Need to read more bytes, validate that len is a sensible number 97 | if fh.len > MaxEntrySize { 98 | return fh, nil, fmt.Errorf("%w: frame header indicates a record larger than MaxEntrySize (%d bytes)", types.ErrCorrupt, MaxEntrySize) 99 | } 100 | 101 | buf = &types.PooledBuffer{ 102 | Bs: make([]byte, fh.len), 103 | // No closer, let outsized buffers be GCed in case they are massive and way 104 | // bigger than we need again. Could reconsider this if we find we need to 105 | // optimize for frequent > minBufSize reads. 
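// The tradeoff here: pooling outsized buffers too would save a reallocation
// on the next oversized read, but a single huge record could then pin a
// large allocation in the pool indefinitely.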
106 | } 107 | if _, err := r.rf.ReadAt(buf.Bs, int64(offset+frameHeaderLen)); err != nil { 108 | return fh, nil, err 109 | } 110 | return fh, buf, nil 111 | } 112 | 113 | func (r *Reader) makeBuffer() *types.PooledBuffer { 114 | if r.bufPool == nil { 115 | return &types.PooledBuffer{Bs: make([]byte, minBufSize)} 116 | } 117 | buf := r.bufPool.Get().([]byte) 118 | return &types.PooledBuffer{ 119 | Bs: buf, 120 | CloseFn: func() { 121 | // Note we always return the whole allocated buf regardless of what Bs 122 | // ended up being sliced to. 123 | r.bufPool.Put(buf) 124 | }, 125 | } 126 | 127 | } 128 | 129 | func (r *Reader) findFrameOffset(idx uint64) (uint32, error) { 130 | if r.tail != nil { 131 | // This is not a sealed segment. 132 | return r.tail.OffsetForFrame(idx) 133 | } 134 | 135 | // Sealed segment, read from the on-disk index block. 136 | if r.info.IndexStart == 0 { 137 | return 0, fmt.Errorf("sealed segment has no index block") 138 | } 139 | 140 | if idx < r.info.MinIndex || (r.info.MaxIndex > 0 && idx > r.info.MaxIndex) { 141 | return 0, types.ErrNotFound 142 | } 143 | 144 | // IndexStart is the offset to the first entry in the index array. We need to 145 | // find the byte offset to the Nth entry 146 | entryOffset := (idx - r.info.BaseIndex) 147 | byteOffset := r.info.IndexStart + (entryOffset * 4) 148 | 149 | var bs [4]byte 150 | n, err := r.rf.ReadAt(bs[:], int64(byteOffset)) 151 | if err == io.EOF && n == 4 { 152 | // Read all of it just happened to be at end of file, ignore 153 | err = nil 154 | } 155 | if err != nil { 156 | return 0, fmt.Errorf("failed to read segment index: %w", err) 157 | } 158 | offset := binary.LittleEndian.Uint32(bs[:]) 159 | return offset, nil 160 | } 161 | -------------------------------------------------------------------------------- /migrate/migrate.go: -------------------------------------------------------------------------------- 1 | // Copyright IBM Corp. 2020, 2025 2 | // SPDX-License-Identifier: MPL-2.0 3 | 4 | package migrate 5 | 6 | import ( 7 | "context" 8 | "fmt" 9 | "time" 10 | 11 | "github.com/hashicorp/raft" 12 | ) 13 | 14 | // CopyLogs takes an src and a dst raft.LogStore implementation and copies all 15 | // entries from src to dst. It assumes dst is empty. Neither LogStore may be in 16 | // use at the time. batchBytes is the target number of bytes of log data to 17 | // group into each append for efficiency. If progress is non-nil it will be 18 | // delivered updates during the copy since it could take a while. Updates will 19 | // be delivered best-effort with a short wait of 1 millisecond. If the channel 20 | // blocks for longer updates may be lost. The caller should sufficiently buffer 21 | // it and ensure it's being drained as fast as needed. If non-nil progress will 22 | // be closed when the function returns. 
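//
// A minimal usage sketch (illustrative only; src and dst stand for any two
// already-opened raft.LogStore implementations):
//
//	progress := make(chan string, 64)
//	go func() {
//		for msg := range progress {
//			log.Println(msg) // drain promptly so updates aren't dropped
//		}
//	}()
//	if err := CopyLogs(context.Background(), dst, src, 1024*1024, progress); err != nil {
//		log.Fatal(err)
//	}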
23 | func CopyLogs(ctx context.Context, dst, src raft.LogStore, batchBytes int, progress chan<- string) error { 24 | defer func() { 25 | if progress != nil { 26 | close(progress) 27 | } 28 | }() 29 | 30 | st := time.Now() 31 | update := func(message string, args ...interface{}) { 32 | if progress == nil { 33 | return 34 | } 35 | select { 36 | case progress <- fmt.Sprintf(message, args...): 37 | case <-time.After(time.Millisecond): 38 | } 39 | } 40 | 41 | first, err := src.FirstIndex() 42 | if err != nil { 43 | return fmt.Errorf("failed getting first index: %w", err) 44 | } 45 | last, err := src.LastIndex() 46 | if err != nil { 47 | return fmt.Errorf("failed getting last index: %w", err) 48 | } 49 | 50 | batch := make([]*raft.Log, 0, 4096) 51 | batchSize := 0 52 | n := 0 53 | batchN := 1 54 | total := int(last - first + 1) 55 | totalBytes := 0 56 | update("starting to copy %d log entries with indexes [%d, %d]", total, first, last) 57 | for idx := first; idx <= last; idx++ { 58 | if ctx.Err() != nil { 59 | return ctx.Err() 60 | } 61 | var log raft.Log 62 | n++ 63 | err := src.GetLog(idx, &log) 64 | if err != nil { 65 | return fmt.Errorf("failed copying log %d (%d/%d): %w", idx, n, total, err) 66 | } 67 | batch = append(batch, &log) 68 | // Fudge some overhead for headers and other fields; this is about right for 69 | // our WAL anyway. 70 | batchSize += len(log.Data) + 32 71 | if batchSize >= batchBytes { 72 | // Flush the batch 73 | batchSummary := fmt.Sprintf("batch %6d: %d entries ending at %d", batchN, len(batch), idx) 74 | err := dst.StoreLogs(batch) 75 | if err != nil { 76 | return fmt.Errorf("failed writing %s: %w", batchSummary, err) 77 | } 78 | update(" -> wrote %s (%3.0f%% complete)", batchSummary, (float32(n-1)/float32(total))*100.0) 79 | batchN++ 80 | batch = batch[:0] 81 | totalBytes += batchSize 82 | batchSize = 0 83 | } 84 | } 85 | if len(batch) > 0 { 86 | // Flush the batch 87 | batchSummary := fmt.Sprintf("batch %6d: %d entries ending at %d", batchN, len(batch), last) 88 | err := dst.StoreLogs(batch) 89 | if err != nil { 90 | return fmt.Errorf("failed writing %s: %w", batchSummary, err) 91 | } 92 | update(" -> wrote %s (%3.0f%% complete)", batchSummary, (float32(n-1)/float32(total))*100.0) 93 | batchN++ 94 | batch = batch[:0] 95 | totalBytes += batchSize 96 | batchSize = 0 97 | } 98 | update("DONE: took %s to copy %d entries (%d bytes)", time.Since(st), total, totalBytes) 99 | return nil 100 | } 101 | 102 | // CopyStable copies the keys the hashicorp/raft library is known to use from one stable 103 | // store to another. Since StableStore has no list method there is no general 104 | // way to copy all possibly stored keys, however this is sufficient for standard 105 | // uses of `hashicorp/raft` as of the current release since it only ever writes 106 | // these keys to StableStore. If other keys are written by another code path, 107 | // the caller can provide them in extraKeys and/or extraIntKeys depending on which 108 | // interface method they were written with - we don't assume all implementations 109 | // share a key space for Set and SetUint64. Both can be nil for just the 110 | // standard raft keys to be copied. 
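//
// A usage sketch (illustrative; "MyCustomKey" is a hypothetical extra key
// written outside hashicorp/raft, not something the library itself stores):
//
//	extra := [][]byte{[]byte("MyCustomKey")}
//	if err := CopyStable(context.Background(), dst, src, extra, nil, nil); err != nil {
//		log.Fatal(err)
//	}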
111 | func CopyStable(ctx context.Context, dst, src raft.StableStore, extraKeys, extraIntKeys [][]byte, progress chan<- string) error { 112 | // https://github.com/hashicorp/raft/blob/44124c28758b8cfb675e90c75a204a08a84f8d4f/raft.go#L22-L26 113 | knownIntKeys := [][]byte{ 114 | []byte("CurrentTerm"), 115 | []byte("LastVoteTerm"), 116 | } 117 | knownKeys := [][]byte{ 118 | []byte("LastVoteCand"), 119 | } 120 | 121 | defer func() { 122 | if progress != nil { 123 | close(progress) 124 | } 125 | }() 126 | 127 | update := func(message string, args ...interface{}) { 128 | if progress == nil { 129 | return 130 | } 131 | select { 132 | case progress <- fmt.Sprintf(message, args...): 133 | case <-time.After(time.Millisecond): 134 | } 135 | } 136 | 137 | st := time.Now() 138 | update("copying %d int, %d regular KVs", len(knownIntKeys)+len(extraIntKeys), 139 | len(knownKeys)+len(extraKeys)) 140 | for _, k := range append(knownIntKeys, extraIntKeys...) { 141 | if ctx.Err() != nil { 142 | return ctx.Err() 143 | } 144 | v, err := src.GetUint64(k) 145 | if err != nil { 146 | return fmt.Errorf("failed to read int key %s: %w", k, err) 147 | } 148 | err = dst.SetUint64(k, v) 149 | if err != nil { 150 | return fmt.Errorf("failed to set int key %s => %d: %w", k, v, err) 151 | } 152 | update(" copied int %s => %d", k, v) 153 | } 154 | for _, k := range append(knownKeys, extraKeys...) { 155 | if ctx.Err() != nil { 156 | return ctx.Err() 157 | } 158 | v, err := src.Get(k) 159 | if err != nil { 160 | return fmt.Errorf("failed to read key %s: %w", k, err) 161 | } 162 | err = dst.Set(k, v) 163 | if err != nil { 164 | return fmt.Errorf("failed to set key %s => %s: %w", k, v, err) 165 | } 166 | update(" copied %s => %q", k, v) 167 | } 168 | update("DONE: took %s to copy %d KVs", time.Since(st), 169 | len(knownIntKeys)+len(extraIntKeys)+len(knownKeys)+len(extraKeys)) 170 | return nil 171 | } 172 | -------------------------------------------------------------------------------- /alice/workload/main.go: -------------------------------------------------------------------------------- 1 | // Copyright IBM Corp. 
2020, 2025 2 | // SPDX-License-Identifier: MPL-2.0 3 | 4 | package main 5 | 6 | import ( 7 | "bytes" 8 | "flag" 9 | "fmt" 10 | "log" 11 | "os" 12 | "path/filepath" 13 | "strings" 14 | "time" 15 | 16 | "github.com/hashicorp/raft" 17 | wal "github.com/hashicorp/raft-wal" 18 | ) 19 | 20 | type opts struct { 21 | dir string 22 | workload string 23 | init bool 24 | truncType string 25 | } 26 | 27 | func main() { 28 | var o opts 29 | 30 | flag.StringVar(&o.dir, "dir", "./workload_dir", "path to directory for WAL files") 31 | flag.StringVar(&o.workload, "workload", "append", "workload to run, one of 'append', 'truncate-head', 'truncate-tail', 'truncate-all'") 32 | flag.BoolVar(&o.init, "init", false, "whether this is the init or actual recording") 33 | flag.Parse() 34 | 35 | var fn func(o opts) error 36 | var initFn func(o opts) error 37 | switch o.workload { 38 | case "append": 39 | fn = runAppend 40 | case "truncate-head": 41 | o.truncType = "head" 42 | fn = runTruncate 43 | initFn = runInitTruncate 44 | case "truncate-tail": 45 | o.truncType = "tail" 46 | fn = runTruncate 47 | initFn = runInitTruncate 48 | case "truncate-all": 49 | o.truncType = "all" 50 | fn = runTruncate 51 | initFn = runInitTruncate 52 | default: 53 | log.Fatalf("unsupported workload %q", o.workload) 54 | } 55 | 56 | if o.init { 57 | fn = initFn 58 | } 59 | if fn == nil { 60 | return 61 | } 62 | 63 | if err := fn(o); err != nil { 64 | log.Fatal(err) 65 | } 66 | } 67 | 68 | // runInitTruncate sets up a WAL with a bunch of segments ready to test 69 | // truncations. We set up ahead of the actual ALICE test to limit the IOs that 70 | // need to be explored when simulating different scenarios. 71 | // 72 | // The setup leaves us with a set of segments that contain the following ranges: 73 | // 74 | // [1..20] 75 | // [21..40] 76 | // [41..60] 77 | // [61..65] 78 | func runInitTruncate(o opts) error { 79 | return populate(o.dir, 80 | 16*1024, // 16 KiB segments 81 | 1024, // 1 KiB logs 82 | 20, // batchSize (20 * 1024 is bigger than segment size so each segment will have this many logs except the tail) 83 | 65, // 65 logs total 84 | ) 85 | } 86 | 87 | func runTruncate(o opts) error { 88 | w, err := wal.Open(o.dir, wal.WithSegmentSize(8*1024)) 89 | if err != nil { 90 | return err 91 | } 92 | 93 | // Output the initial commitIdx to get the checker in sync! 94 | last, err := w.LastIndex() 95 | if err != nil { 96 | return err 97 | } 98 | fmt.Printf("commitIdx=%d\n", last) 99 | 100 | switch o.truncType { 101 | case "head": 102 | // Remove the first two segments 103 | fmt.Printf("willTruncateBefore=46\n") 104 | err = w.DeleteRange(0, 45) 105 | fmt.Printf("truncatedBefore=46\n") 106 | case "tail": 107 | // Remove the last two segments 108 | fmt.Printf("willTruncateAfter=34\n") 109 | err = w.DeleteRange(35, 100) 110 | fmt.Printf("truncatedAfter=34\n") 111 | case "all": 112 | fmt.Printf("willTruncateAfter=0\n") 113 | err = w.DeleteRange(0, 100) 114 | fmt.Printf("truncatedAfter=0\n") 115 | } 116 | if err != nil { 117 | return err 118 | } 119 | 120 | last, err = w.LastIndex() 121 | if err != nil { 122 | return err 123 | } 124 | 125 | // Now append another entry to prove we are in a good state and can't lose 126 | // following writes in a crash. 
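// The checker treats this as the proof step: the entry lands at last+1 with
// a distinctive payload it can match on, demonstrating that the log accepts
// and retains appends immediately after truncation.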
127 | err = w.StoreLog(&raft.Log{ 128 | Index: last + 1, 129 | Term: 1, 130 | Type: raft.LogCommand, 131 | Data: []byte("Post Truncate Entry"), 132 | AppendedAt: time.Now(), 133 | }) 134 | if err != nil { 135 | return err 136 | } 137 | fmt.Printf("commitIdx=%d\n", last+1) 138 | 139 | return nil 140 | } 141 | 142 | func runAppend(o opts) error { 143 | // We want to limit the total disk IOs we do because ALICE takes forever to 144 | // explore the reordering state space if there are more than a few. To 145 | // exercise realistic enough code paths though we want at least a couple of 146 | // append batches in each segment and at least one segment rotation. We'll 147 | // just write large values which will be treated as a single disk op while 148 | // taking up more space. We'll use 16 KiB segments and write values that are 149 | // 4KiB each (which, with headers, will take us over the segment limit after 4 logs 150 | // appended in 2 batches). To make it easier to manually inspect hex dumps of 151 | // WAL files for debugging, we'll use printable chars rather than random 152 | // bytes, and make them deterministic so we can also confirm that we didn't 153 | // accidentally return the wrong payload or corrupt them too. 154 | return populate(o.dir, 155 | 16*1024, // 16 KiB segments 156 | 4096, // 4KiB logs 157 | 2, // batchSize 158 | 8, // Add 8 logs in total 159 | ) 160 | } 161 | 162 | func resetDir(dir string) error { 163 | entries, err := os.ReadDir(dir) 164 | if err != nil { 165 | return err 166 | } 167 | for _, e := range entries { 168 | if strings.HasSuffix(e.Name(), ".wal") || strings.HasSuffix(e.Name(), ".db") { 169 | if err := os.Remove(filepath.Join(dir, e.Name())); err != nil { 170 | return err 171 | } 172 | } 173 | } 174 | return nil 175 | } 176 | 177 | func populate(dir string, segmentSize, logSize, batchSize, num int) error { 178 | if err := resetDir(dir); err != nil { 179 | return err 180 | } 181 | 182 | w, err := wal.Open(dir, wal.WithSegmentSize(segmentSize)) 183 | if err != nil { 184 | return err 185 | } 186 | 187 | var logs []*raft.Log 188 | 189 | if logSize%4 != 0 { 190 | return fmt.Errorf("logSize must be a multiple of 4") 191 | } 192 | if num > 999 { 193 | return fmt.Errorf("num must be 999 or less") 194 | } 195 | 196 | numRepeats := logSize / 4 197 | 198 | commitBatch := func() error { 199 | if len(logs) == 0 { 200 | return nil 201 | } 202 | if err := w.StoreLogs(logs); err != nil { 203 | return err 204 | } 205 | // Log that we expect everything up to this index to be durable now so the 206 | // checker can assert that. 207 | fmt.Printf("commitIdx=%d\n", logs[len(logs)-1].Index) 208 | logs = logs[:0] 209 | return nil 210 | } 211 | 212 | for i := 1; i <= num; i++ { 213 | logs = append(logs, &raft.Log{ 214 | Index: uint64(i), 215 | Term: 1, 216 | Type: raft.LogCommand, 217 | Data: bytes.Repeat([]byte(fmt.Sprintf("%03d|", i)), numRepeats), 218 | AppendedAt: time.Now(), 219 | }) 220 | 221 | if len(logs) >= batchSize { 222 | if err := commitBatch(); err != nil { 223 | return err 224 | } 225 | } 226 | } 227 | 228 | // Commit the remainder 229 | return commitBatch() 230 | } 231 | -------------------------------------------------------------------------------- /bench/append_requestor.go: -------------------------------------------------------------------------------- 1 | // Copyright IBM Corp. 
2020, 2025 2 | // SPDX-License-Identifier: MPL-2.0 3 | 4 | package main 5 | 6 | import ( 7 | "context" 8 | "crypto/rand" 9 | "fmt" 10 | "io" 11 | "path/filepath" 12 | "sync/atomic" 13 | "time" 14 | 15 | "github.com/HdrHistogram/hdrhistogram-go" 16 | "github.com/benmathews/bench" 17 | histwriter "github.com/benmathews/hdrhistogram-writer" 18 | "github.com/hashicorp/raft" 19 | raftboltdb "github.com/hashicorp/raft-boltdb/v2" 20 | wal "github.com/hashicorp/raft-wal" 21 | "go.etcd.io/bbolt" 22 | ) 23 | 24 | var ( 25 | _ bench.RequesterFactory = &appendRequesterFactory{} 26 | 27 | randomData []byte 28 | ) 29 | 30 | func init() { 31 | randomData = make([]byte, 1024*1024) 32 | rand.Read(randomData) 33 | } 34 | 35 | // appendRequesterFactory implements bench.RequesterFactory 36 | type appendRequesterFactory struct { 37 | opts opts 38 | output io.Writer 39 | } 40 | 41 | // GetRequester returns a new Requester, called for each Benchmark 42 | // connection. 43 | func (f *appendRequesterFactory) GetRequester(number uint64) bench.Requester { 44 | if number > 0 { 45 | panic("wal only supports a single writer") 46 | } 47 | 48 | var fn func() (raft.LogStore, error) 49 | switch f.opts.version { 50 | case "wal": 51 | fn = func() (raft.LogStore, error) { 52 | return wal.Open(f.opts.dir, wal.WithSegmentSize(f.opts.segSize*1024*1024)) 53 | } 54 | case "bolt": 55 | fn = func() (raft.LogStore, error) { 56 | boltOpts := raftboltdb.Options{ 57 | Path: filepath.Join(f.opts.dir, "raft.db"), 58 | BoltOptions: &bbolt.Options{ 59 | NoFreelistSync: f.opts.noFreelistSync, 60 | }, 61 | } 62 | return raftboltdb.New(boltOpts) 63 | } 64 | default: 65 | panic("unknown LogStore version: " + f.opts.version) 66 | } 67 | 68 | return &appendRequester{ 69 | opts: f.opts, 70 | output: f.output, 71 | newStore: fn, 72 | } 73 | } 74 | 75 | // appendRequester implements bench.Requester for appending entries to the WAL. 76 | type appendRequester struct { 77 | closed uint32 78 | 79 | opts opts 80 | 81 | batch []*raft.Log 82 | index uint64 83 | newStore func() (raft.LogStore, error) 84 | store raft.LogStore 85 | truncateStop func() 86 | output io.Writer 87 | 88 | truncateTiming *hdrhistogram.Histogram 89 | } 90 | 91 | // Setup prepares the Requester for benchmarking. 92 | func (r *appendRequester) Setup() error { 93 | ls, err := r.newStore() 94 | if err != nil { 95 | return err 96 | } 97 | r.store = ls 98 | 99 | // Prebuild the batch of logs. There is no compression so we don't care that 100 | // they are all the same data. 101 | r.batch = make([]*raft.Log, r.opts.batchSize) 102 | for i := range r.batch { 103 | r.batch[i] = &raft.Log{ 104 | // We'll vary the indexes each time but save on setting this up the same 105 | // way every time to! 106 | Data: randomData[:r.opts.logSize], 107 | AppendedAt: time.Now(), 108 | } 109 | } 110 | r.index = 1 111 | 112 | if r.opts.preLoadN > 0 { 113 | // Write lots of big records and then delete them again. We'll use batches 114 | // of 1000 1024 byte records for now to speed things up a bit. 
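// (The preload-then-truncate step below reproduces the on-disk shape of a
// long-running server, so the benchmark measures steady-state appends rather
// than appends into a brand new, empty log.)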
115 | preBatch := make([]*raft.Log, 0, 1000) 116 | fmt.Fprintf(r.output, "Preloading up to index %d\n", r.opts.preLoadN) 117 | for r.index <= uint64(r.opts.preLoadN) { 118 | preBatch = append(preBatch, &raft.Log{Index: r.index, Data: randomData[:1024]}) 119 | r.index++ 120 | if len(preBatch) == 1000 { 121 | err := r.store.StoreLogs(preBatch) 122 | if err != nil { 123 | return err 124 | } 125 | preBatch = preBatch[:0] 126 | } 127 | } 128 | if len(preBatch) > 0 { 129 | err := r.store.StoreLogs(preBatch) 130 | if err != nil { 131 | return err 132 | } 133 | } 134 | 135 | // Now truncate back to trailingLogs. 136 | fmt.Fprintf(r.output, "Truncating 1 - %d\n", r.index-uint64(r.opts.truncateTrailingLogs)) 137 | err := r.store.DeleteRange(1, r.index-uint64(r.opts.truncateTrailingLogs)) 138 | if err != nil { 139 | return err 140 | } 141 | r.dumpStats() 142 | } 143 | if r.opts.truncatePeriod > 0 { 144 | r.truncateTiming = hdrhistogram.New(1, 10_000_000, 3) 145 | fmt.Fprintf(r.output, "Starting Truncator every %s\n", r.opts.truncatePeriod) 146 | ctx, cancel := context.WithCancel(context.Background()) 147 | r.truncateStop = cancel 148 | go r.runTruncate(ctx) 149 | } else { 150 | fmt.Fprintf(r.output, "Truncation disabled\n") 151 | } 152 | 153 | return nil 154 | } 155 | 156 | func (r *appendRequester) runTruncate(ctx context.Context) { 157 | ticker := time.NewTicker(r.opts.truncatePeriod) 158 | for { 159 | select { 160 | case <-ticker.C: 161 | if atomic.LoadUint32(&r.closed) == 1 { 162 | return 163 | } 164 | first, err := r.store.FirstIndex() 165 | if err != nil { 166 | panic(err) 167 | } 168 | last, err := r.store.LastIndex() 169 | if err != nil { 170 | panic(err) 171 | } 172 | 173 | deleteMax := uint64(0) 174 | if last > uint64(r.opts.truncateTrailingLogs) { 175 | deleteMax = last - uint64(r.opts.truncateTrailingLogs) 176 | } 177 | if deleteMax >= first { 178 | st := time.Now() 179 | err := r.store.DeleteRange(first, deleteMax) 180 | elapsed := time.Since(st) 181 | r.truncateTiming.RecordValue(elapsed.Microseconds()) 182 | if err != nil { 183 | panic(err) 184 | } 185 | } 186 | 187 | case <-ctx.Done(): 188 | return 189 | } 190 | } 191 | } 192 | 193 | // Request performs a synchronous request to the system under test. 194 | func (r *appendRequester) Request() error { 195 | // Update log indexes 196 | for i := range r.batch { 197 | r.batch[i].Index = r.index 198 | r.index++ 199 | } 200 | return r.store.StoreLogs(r.batch) 201 | } 202 | 203 | type metricer interface { 204 | Metrics() map[string]uint64 205 | } 206 | 207 | func (r *appendRequester) dumpStats() { 208 | if m, ok := r.store.(metricer); ok { 209 | fmt.Fprintln(r.output, "\n== METRICS ==========") 210 | for k, v := range m.Metrics() { 211 | fmt.Fprintf(r.output, "% 25s: % 15d\n", k, v) 212 | } 213 | } 214 | if r.truncateTiming != nil { 215 | scaleFactor := 0.001 // Scale us to ms. 216 | if err := histwriter.WriteDistributionFile(r.truncateTiming, nil, scaleFactor, outFileName(r.opts, "truncate-lat")); err != nil { 217 | fmt.Fprintf(r.output, "ERROR writing truncate histogram: %s\n", err) 218 | } 219 | printHistogram(r.output, "Truncate Latency (ms)", r.truncateTiming, 1000) 220 | } 221 | } 222 | 223 | // Teardown is called upon benchmark completion. 
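// It swaps the atomic closed flag so the background truncator (which checks
// the flag on every tick) stops touching the store, and the old-value check
// ensures stats are dumped and the store closed at most once.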
224 | func (r *appendRequester) Teardown() error { 225 | old := atomic.SwapUint32(&r.closed, 1) 226 | if old == 0 { 227 | r.dumpStats() 228 | if c, ok := r.store.(io.Closer); ok { 229 | return c.Close() 230 | } 231 | } 232 | return nil 233 | } 234 | -------------------------------------------------------------------------------- /segment/format_test.go: -------------------------------------------------------------------------------- 1 | // Copyright IBM Corp. 2020, 2025 2 | // SPDX-License-Identifier: MPL-2.0 3 | 4 | package segment 5 | 6 | import ( 7 | "encoding/binary" 8 | "math" 9 | "strings" 10 | "testing" 11 | 12 | fuzz "github.com/google/gofuzz" 13 | "github.com/hashicorp/raft-wal/types" 14 | "github.com/stretchr/testify/require" 15 | ) 16 | 17 | func TestFileHeaderCodec(t *testing.T) { 18 | cases := []struct { 19 | name string 20 | info types.SegmentInfo 21 | bufSize int 22 | corrupt func([]byte) []byte 23 | wantWriteErr string 24 | wantReadErr string 25 | wantValidateErr string 26 | }{ 27 | { 28 | name: "basic encoding/decoding", 29 | info: types.SegmentInfo{ 30 | BaseIndex: 1234, 31 | ID: 4321, 32 | Codec: 1, 33 | }, 34 | }, 35 | { 36 | name: "short buf writing", 37 | info: types.SegmentInfo{ 38 | BaseIndex: 1234, 39 | ID: 4321, 40 | Codec: 1, 41 | }, 42 | bufSize: 10, 43 | wantWriteErr: "short buffer", 44 | }, 45 | { 46 | name: "short buf reading", 47 | info: types.SegmentInfo{ 48 | BaseIndex: 1234, 49 | ID: 4321, 50 | Codec: 1, 51 | }, 52 | corrupt: func(buf []byte) []byte { 53 | return buf[0:5] 54 | }, 55 | wantReadErr: "short buffer", 56 | }, 57 | { 58 | name: "bad magic reading", 59 | info: types.SegmentInfo{ 60 | BaseIndex: 1234, 61 | ID: 4321, 62 | Codec: 1, 63 | }, 64 | corrupt: func(buf []byte) []byte { 65 | buf[0] = 0xff 66 | return buf 67 | }, 68 | wantReadErr: "corrupt", 69 | }, 70 | { 71 | name: "bad BaseIndex reading", 72 | info: types.SegmentInfo{ 73 | BaseIndex: 1234, 74 | ID: 4321, 75 | Codec: 1, 76 | }, 77 | corrupt: func(buf []byte) []byte { 78 | buf[8] = 0xff 79 | return buf 80 | }, 81 | wantValidateErr: "corrupt", 82 | }, 83 | { 84 | name: "bad ID reading", 85 | info: types.SegmentInfo{ 86 | BaseIndex: 1234, 87 | ID: 4321, 88 | Codec: 1, 89 | }, 90 | corrupt: func(buf []byte) []byte { 91 | buf[16] = 0xff 92 | return buf 93 | }, 94 | wantValidateErr: "corrupt", 95 | }, 96 | { 97 | name: "bad Codec reading", 98 | info: types.SegmentInfo{ 99 | BaseIndex: 1234, 100 | ID: 4321, 101 | Codec: 1, 102 | }, 103 | corrupt: func(buf []byte) []byte { 104 | buf[24] = 0xff 105 | return buf 106 | }, 107 | wantValidateErr: "corrupt", 108 | }, 109 | } 110 | 111 | for _, tc := range cases { 112 | tc := tc 113 | t.Run(tc.name, func(t *testing.T) { 114 | len := fileHeaderLen 115 | if tc.bufSize > 0 { 116 | len = tc.bufSize 117 | } 118 | buf := make([]byte, len) 119 | 120 | err := writeFileHeader(buf, tc.info) 121 | 122 | if tc.wantWriteErr != "" { 123 | require.ErrorContains(t, err, tc.wantWriteErr) 124 | return 125 | } 126 | require.NoError(t, err) 127 | 128 | if tc.corrupt != nil { 129 | buf = tc.corrupt(buf) 130 | } 131 | 132 | got, err := readFileHeader(buf) 133 | if tc.wantReadErr != "" { 134 | require.ErrorContains(t, err, tc.wantReadErr) 135 | return 136 | } 137 | require.NoError(t, err) 138 | require.NotNil(t, got) 139 | 140 | err = validateFileHeader(*got, tc.info) 141 | if tc.wantValidateErr != "" { 142 | require.ErrorContains(t, err, tc.wantValidateErr) 143 | return 144 | } 145 | require.NoError(t, err) 146 | }) 147 | } 148 | } 149 | 150 | func TestFileHeaderCodecFuzz(t 
*testing.T) { 151 | fuzz := fuzz.New() 152 | 153 | var info types.SegmentInfo 154 | var buf [fileHeaderLen]byte 155 | for i := 0; i < 1000; i++ { 156 | fuzz.Fuzz(&info) 157 | err := writeFileHeader(buf[:], info) 158 | require.NoError(t, err) 159 | 160 | t.Logf("% x", buf[:]) 161 | 162 | got, err := readFileHeader(buf[:]) 163 | require.NoError(t, err) 164 | require.NotNil(t, got) 165 | 166 | err = validateFileHeader(*got, info) 167 | require.NoError(t, err) 168 | } 169 | } 170 | 171 | func TestFrameCodecFuzz(t *testing.T) { 172 | fuzz := fuzz.New() 173 | 174 | var len uint16 175 | // Allocate an extra frameHeaderLen here because some lengths might end up 176 | // needing padding which takes them just over the buffer size. 177 | var buf [math.MaxUint16 + frameHeaderLen + frameHeaderLen]byte 178 | var val = []byte(strings.Repeat("A Value!", math.MaxUint16/8)) 179 | var fh frameHeader 180 | for i := 0; i < 1000; i++ { 181 | fuzz.Fuzz(&len) 182 | 183 | fh.typ = FrameEntry 184 | fh.len = uint32(len) 185 | 186 | expectLen := encodedFrameSize(int(len)) 187 | 188 | // Note length of val is not the same as fh.len which is what should be 189 | // used. 190 | err := writeFrame(buf[:expectLen], fh, val) 191 | require.NoError(t, err) 192 | 193 | // We mostly care about the start and end... 194 | if expectLen > 64 { 195 | t.Logf("% x [...] % x (%d)", buf[0:16], buf[expectLen-16:expectLen], expectLen) 196 | } else { 197 | t.Logf("% x", buf[:expectLen]) 198 | } 199 | 200 | // Verify the last padLen bytes are zero 201 | for i := padLen(int(len)); i > 0; i-- { 202 | require.Equal(t, byte(0), buf[expectLen-i], 203 | "expected last %d bytes to be padding. Byte %d of %d isn't zero.", 204 | padLen(int(len)), expectLen-i, expectLen) 205 | } 206 | 207 | got, err := readFrameHeader(buf[:]) 208 | require.NoError(t, err) 209 | require.Equal(t, fh, got) 210 | } 211 | } 212 | 213 | func TestPadLen(t *testing.T) { 214 | fuzz := fuzz.New() 215 | var len uint32 216 | 217 | for i := 0; i < 1000; i++ { 218 | fuzz.Fuzz(&len) 219 | 220 | got := padLen(int(len)) 221 | 222 | t.Log("len", len) 223 | 224 | // Test basic properties of padLen 225 | require.Less(t, got, frameHeaderLen, "padding must be less than the whole header len") 226 | require.GreaterOrEqual(t, got, 0, "padding must be positive") 227 | require.Equal(t, 0, (got+int(len))%frameHeaderLen, "padding plus length must be a multiple of header len") 228 | } 229 | } 230 | 231 | func TestWriteIndexFrame(t *testing.T) { 232 | // TestFrameCodecFuzz covers most of the bases for the actual header encoding 233 | // etc. This just needs to test the index encoding. 234 | var index [1024]uint32 235 | 236 | for i := range index { 237 | // Write offsets as if each record is exactly 64 bytes 238 | index[i] = uint32(i * 64) 239 | } 240 | 241 | buf := make([]byte, indexFrameSize(len(index))) 242 | 243 | err := writeIndexFrame(buf, index[:]) 244 | require.NoError(t, err) 245 | 246 | //t.Log(index, buf) 247 | 248 | // Validate that the encoded index after the header is what we expect 249 | offset := frameHeaderLen 250 | for i := range index { 251 | got := binary.LittleEndian.Uint32(buf[offset:]) 252 | require.Equal(t, uint32(i*64), got, "unexpected index value at offset %d", i) 253 | offset += 4 254 | } 255 | } 256 | -------------------------------------------------------------------------------- /bench/bench_test.go: -------------------------------------------------------------------------------- 1 | // Copyright IBM Corp. 
2020, 2025 2 | // SPDX-License-Identifier: MPL-2.0 3 | 4 | package main 5 | 6 | import ( 7 | "fmt" 8 | "os" 9 | "path/filepath" 10 | "testing" 11 | "time" 12 | 13 | "github.com/hashicorp/raft" 14 | raftboltdb "github.com/hashicorp/raft-boltdb" 15 | wal "github.com/hashicorp/raft-wal" 16 | "github.com/stretchr/testify/require" 17 | "go.etcd.io/etcd/client/pkg/v3/fileutil" 18 | ) 19 | 20 | func BenchmarkAppend(b *testing.B) { 21 | sizes := []int{ 22 | 10, 23 | 1024, 24 | 100 * 1024, 25 | 1024 * 1024, 26 | } 27 | sizeNames := []string{ 28 | "10", 29 | "1k", 30 | "100k", 31 | "1m", 32 | } 33 | batchSizes := []int{1, 10} 34 | 35 | for i, s := range sizes { 36 | for _, bSize := range batchSizes { 37 | b.Run(fmt.Sprintf("entrySize=%s/batchSize=%d/v=WAL", sizeNames[i], bSize), func(b *testing.B) { 38 | ls, done := openWAL(b) 39 | defer done() 40 | // close _first_ (defers run in reverse order) before done() which will 41 | // delete since rotate could still be happening 42 | defer ls.Close() 43 | runAppendBench(b, ls, s, bSize) 44 | }) 45 | b.Run(fmt.Sprintf("entrySize=%s/batchSize=%d/v=Bolt", sizeNames[i], bSize), func(b *testing.B) { 46 | ls := openBolt(b) 47 | runAppendBench(b, ls, s, bSize) 48 | }) 49 | } 50 | } 51 | } 52 | 53 | func openWAL(b *testing.B) (*wal.WAL, func()) { 54 | tmpDir, err := os.MkdirTemp("", "raft-wal-bench-*") 55 | require.NoError(b, err) 56 | 57 | // Force every 1k append to create a new segment to profile segment rotation. 58 | ls, err := wal.Open(tmpDir, wal.WithSegmentSize(512)) 59 | require.NoError(b, err) 60 | 61 | return ls, func() { os.RemoveAll(tmpDir) } 62 | } 63 | 64 | func openBolt(b *testing.B) *raftboltdb.BoltStore { 65 | tmpDir, err := os.MkdirTemp("", "raft-wal-bench-*") 66 | require.NoError(b, err) 67 | defer os.RemoveAll(tmpDir) 68 | 69 | ls, err := raftboltdb.NewBoltStore(filepath.Join(tmpDir, "bolt-wal.db")) 70 | require.NoError(b, err) 71 | 72 | return ls 73 | } 74 | 75 | func runAppendBench(b *testing.B, ls raft.LogStore, s, n int) { 76 | // Pre-create batch, we'll just adjust the indexes in the loop 77 | batch := make([]*raft.Log, n) 78 | for i := range batch { 79 | batch[i] = &raft.Log{ 80 | Data: randomData[:s], 81 | AppendedAt: time.Now(), 82 | } 83 | } 84 | 85 | b.ResetTimer() 86 | idx := uint64(1) 87 | for i := 0; i < b.N; i++ { 88 | for j := range batch { 89 | batch[j].Index = idx 90 | idx++ 91 | } 92 | b.StartTimer() 93 | err := ls.StoreLogs(batch) 94 | b.StopTimer() 95 | if err != nil { 96 | b.Fatalf("error appending: %s", err) 97 | } 98 | } 99 | } 100 | 101 | func BenchmarkGetLogs(b *testing.B) { 102 | sizes := []int{ 103 | 1000, 104 | 1_000_000, 105 | } 106 | sizeNames := []string{ 107 | "1k", 108 | "1m", 109 | } 110 | for i, s := range sizes { 111 | wLs, done := openWAL(b) 112 | defer done() 113 | // close _first_ (defers run in reverse order) before done() which will 114 | // delete since rotate could still be happening 115 | defer wLs.Close() 116 | populateLogs(b, wLs, s, 128) // fixed 128 byte logs 117 | 118 | bLs := openBolt(b) 119 | populateLogs(b, bLs, s, 128) // fixed 128 byte logs 120 | 121 | b.Run(fmt.Sprintf("numLogs=%s/v=WAL", sizeNames[i]), func(b *testing.B) { 122 | runGetLogBench(b, wLs, s) 123 | }) 124 | b.Run(fmt.Sprintf("numLogs=%s/v=Bolt", sizeNames[i]), func(b *testing.B) { 125 | runGetLogBench(b, bLs, s) 126 | }) 127 | } 128 | } 129 | 130 | func populateLogs(b *testing.B, ls raft.LogStore, n, size int) { 131 | batchSize := 1000 132 | batch := make([]*raft.Log, 0, batchSize) 133 | start := time.Now() 134 | for i := 0; i 
< n; i++ { 135 | l := raft.Log{Index: uint64(i + 1), Data: randomData[:2], AppendedAt: time.Now()} 136 | batch = append(batch, &l) 137 | if len(batch) == batchSize { 138 | err := ls.StoreLogs(batch) 139 | require.NoError(b, err) 140 | batch = batch[:0] 141 | } 142 | } 143 | if len(batch) > 0 { 144 | err := ls.StoreLogs(batch) 145 | require.NoError(b, err) 146 | } 147 | b.Logf("populateTime=%s", time.Since(start)) 148 | } 149 | 150 | func runGetLogBench(b *testing.B, ls raft.LogStore, n int) { 151 | b.ResetTimer() 152 | var log raft.Log 153 | for i := 0; i < b.N; i++ { 154 | b.StartTimer() 155 | err := ls.GetLog(uint64((i+1)%n), &log) 156 | b.StopTimer() 157 | require.NoError(b, err) 158 | } 159 | } 160 | 161 | // These OS benchmarks showed that at least on my Mac Creating and preallocating 162 | // a file is not reliably quicker than renaming a file we already created and 163 | // preallocated so the extra work of doing that in the background ahead of time 164 | // and just renaming it during rotation seems unnecessary. We are not fsyncing 165 | // either the file or parent dir in either case which dominates cost of either 166 | // operation. Three random consecutive runs on my machine: 167 | // 168 | // BenchmarkOSCreateAndPreallocate-16 100 370304 ns/op 221 B/op 3 allocs/op 169 | // BenchmarkOSRename-16 100 876001 ns/op 570 B/op 5 allocs/op 170 | // 171 | // BenchmarkOSCreateAndPreallocate-16 100 353654 ns/op 221 B/op 3 allocs/op 172 | // BenchmarkOSRename-16 100 168558 ns/op 570 B/op 5 allocs/op 173 | // 174 | // BenchmarkOSCreateAndPreallocate-16 100 367360 ns/op 224 B/op 3 allocs/op 175 | // BenchmarkOSRename-16 100 1353014 ns/op 571 B/op 5 allocs/op 176 | 177 | func BenchmarkOSCreateAndPreallocate(b *testing.B) { 178 | tmpDir, err := os.MkdirTemp("", "raft-wal-bench-*") 179 | require.NoError(b, err) 180 | defer os.RemoveAll(tmpDir) 181 | 182 | b.ResetTimer() 183 | for i := 0; i < b.N; i++ { 184 | fname := filepath.Join(tmpDir, fmt.Sprintf("test-%d.txt", i)) 185 | b.StartTimer() 186 | f, err := os.OpenFile(fname, os.O_CREATE|os.O_EXCL|os.O_RDWR, os.FileMode(0644)) 187 | if err != nil { 188 | panic(err) // require is kinda slow in benchmarks 189 | } 190 | err = fileutil.Preallocate(f, int64(64*1024*1024), true) 191 | if err != nil { 192 | panic(err) 193 | } 194 | b.StopTimer() 195 | f.Close() 196 | } 197 | } 198 | 199 | func BenchmarkOSRename(b *testing.B) { 200 | tmpDir, err := os.MkdirTemp("", "raft-wal-bench-*") 201 | require.NoError(b, err) 202 | defer os.RemoveAll(tmpDir) 203 | 204 | b.ResetTimer() 205 | for i := 0; i < b.N; i++ { 206 | tmpName := filepath.Join(tmpDir, fmt.Sprintf("%d.tmp", i%2)) 207 | // Create the tmp file outside timer loop to simulate it happening in the 208 | // background 209 | f, err := os.OpenFile(tmpName, os.O_CREATE|os.O_EXCL|os.O_RDWR, os.FileMode(0644)) 210 | require.NoError(b, err) 211 | f.Close() 212 | 213 | fname := filepath.Join(tmpDir, fmt.Sprintf("test-%d.txt", i)) 214 | b.StartTimer() 215 | // Note we are not fsyncing parent dir in either case 216 | err = os.Rename(tmpName, fname) 217 | if err != nil { 218 | panic(err) 219 | } 220 | b.StopTimer() 221 | } 222 | } 223 | -------------------------------------------------------------------------------- /alice/checker/main.go: -------------------------------------------------------------------------------- 1 | // Copyright IBM Corp. 
2 | // SPDX-License-Identifier: MPL-2.0
3 | 
4 | package main
5 | 
6 | import (
7 | "bufio"
8 | "bytes"
9 | "flag"
10 | "fmt"
11 | "log"
12 | "os"
13 | "regexp"
14 | "strconv"
15 | 
16 | "github.com/hashicorp/raft"
17 | wal "github.com/hashicorp/raft-wal"
18 | )
19 | 
20 | func main() {
21 | flag.Parse()
22 | 
23 | if err := run(flag.Arg(0), flag.Arg(1)); err != nil {
24 | log.Fatal(err)
25 | }
26 | }
27 | 
28 | var re = regexp.MustCompile(`(\w+)=(\d+)`)
29 | 
30 | type runSummary struct {
31 | lastCommit uint64
32 | truncatedAfter uint64
33 | truncatedBefore uint64
34 | willTruncateAfter uint64
35 | willTruncateBefore uint64
36 | truncatedEntriesMaybeAfter uint64
37 | willTruncateHead bool
38 | willTruncateTail bool
39 | truncatedHead bool
40 | truncatedTail bool
41 | }
42 | 
43 | func readStdoutFile(stdoutFile string) (runSummary, error) {
44 | var sum runSummary
45 | 
46 | stdout, err := os.Open(stdoutFile)
47 | if err != nil {
48 | return sum, err
49 | }
50 | defer stdout.Close()
51 | 
52 | scanner := bufio.NewScanner(stdout)
53 | for scanner.Scan() {
54 | line := scanner.Text()
55 | if len(line) == 0 {
56 | continue
57 | }
58 | matches := re.FindStringSubmatch(line)
59 | if matches != nil {
60 | n, err := strconv.Atoi(matches[2])
61 | if err != nil {
62 | return sum, err
63 | }
64 | switch matches[1] {
65 | case "commitIdx":
66 | sum.lastCommit = uint64(n)
67 | case "truncatedBefore":
68 | sum.truncatedHead = true
69 | if n > int(sum.truncatedBefore) {
70 | sum.truncatedBefore = uint64(n)
71 | sum.truncatedEntriesMaybeAfter = sum.lastCommit
72 | }
73 | case "truncatedAfter":
74 | sum.truncatedTail = true
75 | if int(sum.truncatedAfter) == 0 || n < int(sum.truncatedAfter) {
76 | sum.truncatedAfter = uint64(n)
77 | sum.truncatedEntriesMaybeAfter = sum.truncatedAfter
78 | }
79 | if n < int(sum.lastCommit) {
80 | sum.lastCommit = uint64(n)
81 | }
82 | case "willTruncateAfter":
83 | sum.willTruncateAfter = uint64(n)
84 | sum.willTruncateTail = true
85 | sum.truncatedEntriesMaybeAfter = sum.willTruncateAfter
86 | case "willTruncateBefore":
87 | sum.willTruncateBefore = uint64(n)
88 | sum.willTruncateHead = true
89 | sum.truncatedEntriesMaybeAfter = sum.lastCommit
90 | default:
91 | // Skip unknown output KVs.
92 | }
93 | continue
94 | }
95 | return sum, fmt.Errorf("unrecognizable output line: %s", line)
96 | }
97 | return sum, nil
98 | }
99 | 
100 | func validateFirst(first uint64, expect runSummary) error {
101 | switch {
102 | case expect.willTruncateHead:
103 | if expect.truncatedHead {
104 | // We actually completed the truncation. First must now be the new index.
105 | if first != expect.truncatedBefore {
106 | return fmt.Errorf("Expected first to be %d after truncation. Got %d",
107 | expect.truncatedBefore, first)
108 | }
109 | 
110 | } else {
111 | // Not sure if the truncation completed or not, so allow either value.
112 | if first != 1 && first != expect.willTruncateBefore {
113 | return fmt.Errorf("Expected first to be 1 before truncation, %d after. Got %d",
114 | expect.willTruncateBefore, first)
115 | }
116 | }
117 | 
118 | case expect.willTruncateTail && expect.willTruncateAfter == 0:
119 | // Special case of an "everything" truncation, which is modelled as a tail
120 | // truncation (after=0). In this case first will either be 1 before, 0 right
121 | // after truncation, or 1 again after the next append.
122 | if first != 0 && first != 1 {
123 | return fmt.Errorf("Expected first to be 1 before truncation, 0 after or 1 after the next append. Got %d",
Got %d", 124 | first) 125 | } 126 | 127 | default: 128 | // No head truncations can have started yet. 129 | if first != 1 && first != 0 { 130 | return fmt.Errorf("Want first=1 or first=0 (if no writes yet) before any truncation. Got %d", first) 131 | } 132 | } 133 | return nil 134 | } 135 | 136 | func validateLast(last uint64, expect runSummary) error { 137 | switch { 138 | case expect.willTruncateTail: 139 | if expect.truncatedTail { 140 | // We actually completed the truncation. Last must now be the new index, 141 | // or the subsequent write if that's higher. 142 | if last != expect.truncatedAfter && last != expect.truncatedAfter+1 { 143 | return fmt.Errorf("Expected last to be %d after truncation or %d after subsequent append. Got %d", 144 | expect.truncatedAfter, expect.truncatedAfter+1, last) 145 | } 146 | 147 | } else { 148 | // Not sure if truncation completed or not so allow any last value greater 149 | // than the truncate after target (since we know the workload always 150 | // truncates after an index lower than commitIdx). 151 | if last < expect.willTruncateAfter { 152 | return fmt.Errorf("Expected last to be >= %d after before or after truncation. Got %d", 153 | expect.willTruncateAfter, last) 154 | } 155 | } 156 | 157 | default: 158 | // No tail truncations can have started yet. Just ensure we have everything committed. 159 | if last < expect.lastCommit { 160 | return fmt.Errorf("Want last >= lastCommit. Lost committed writes! last=%d commitIdx=%d", last, expect.lastCommit) 161 | } 162 | } 163 | return nil 164 | } 165 | 166 | func run(dir string, stdoutFile string) error { 167 | w, err := wal.Open(dir, wal.WithSegmentSize(32*1024)) 168 | if err != nil { 169 | return err 170 | } 171 | 172 | // Find the expected committed range 173 | expect, err := readStdoutFile(stdoutFile) 174 | if err != nil { 175 | return err 176 | } 177 | 178 | first, err := w.FirstIndex() 179 | if err != nil { 180 | return err 181 | } 182 | last, err := w.LastIndex() 183 | if err != nil { 184 | return err 185 | } 186 | 187 | if err := validateFirst(first, expect); err != nil { 188 | return err 189 | } 190 | if err := validateLast(last, expect); err != nil { 191 | return err 192 | } 193 | 194 | fmt.Printf("Found first=%d last=%d expected %v\n", first, last, expect) 195 | 196 | var i uint64 197 | var l raft.Log 198 | for i = first; i <= last; i++ { 199 | if i == 0 { 200 | // Everything was truncated so nothing to read! 201 | continue 202 | } 203 | if err := w.GetLog(i, &l); err != nil { 204 | return fmt.Errorf("error reading log [%d/%d] - %v", i, last, err) 205 | } 206 | // Verify contents match 207 | validPrefixes := []string{fmt.Sprintf("%03d|", i)} 208 | if (expect.willTruncateHead || expect.willTruncateTail) && i > expect.truncatedEntriesMaybeAfter { 209 | // If we will truncate but didn't yet either outcome is possible so 210 | // include both viable options. 211 | validPrefixes = append(validPrefixes, "Post Truncate Entry") 212 | } 213 | if (expect.truncatedTail || expect.truncatedHead) && i > expect.truncatedEntriesMaybeAfter { 214 | // Truncate completed so the original payload is no longer possible 215 | validPrefixes = validPrefixes[1:] 216 | } 217 | 218 | valid := false 219 | for _, vp := range validPrefixes { 220 | if bytes.HasPrefix(l.Data, []byte(vp)) { 221 | valid = true 222 | break 223 | } 224 | } 225 | if !valid { 226 | return fmt.Errorf("entry %d has unexpected payload. 
227 | i, validPrefixes, string(l.Data))
228 | }
229 | }
230 | 
231 | log.Printf("OK!\n")
232 | return nil
233 | }
234 | 
--------------------------------------------------------------------------------
/state.go:
--------------------------------------------------------------------------------
1 | // Copyright IBM Corp. 2020, 2025
2 | // SPDX-License-Identifier: MPL-2.0
3 | 
4 | package wal
5 | 
6 | import (
7 | "sync/atomic"
8 | 
9 | "github.com/benbjohnson/immutable"
10 | "github.com/hashicorp/raft-wal/types"
11 | )
12 | 
13 | // state is an immutable snapshot of the state of the log. Modifications must be
14 | // made by copying and modifying the copy. This is easy enough because segments
15 | // is an immutable map so changing and re-assigning to the clone won't impact
16 | // the original map, and tail is just a pointer that can be mutated in the
17 | // shallow clone. Note that methods called on the tail segmentWriter may mutate
18 | // its state, so they must only be called while holding the WAL's writeLock.
19 | type state struct {
20 | // refCount tracks readers that are reading segments based on this metadata.
21 | // It is accessed atomically and must be 64-bit aligned (i.e. leave it at the
22 | // start of the struct).
23 | refCount int32
24 | // finalizer is set at most once while the WAL is holding the write lock in
25 | // order to provide a func that must be called when all current readers are
26 | // done with this state. It's used for deferring closing and deleting old
27 | // segments until we can be sure no reads are still in progress on them.
28 | finalizer atomic.Value // func()
29 | 
30 | nextSegmentID uint64
31 | 
32 | // nextBaseIndex is used to signal which baseIndex to use next if there are no
33 | // segments or current tail.
34 | nextBaseIndex uint64
35 | segments *immutable.SortedMap[uint64, segmentState]
36 | tail types.SegmentWriter
37 | }
38 | 
39 | type segmentState struct {
40 | types.SegmentInfo
41 | 
42 | // r is the SegmentReader for our in-memory state.
43 | r types.SegmentReader
44 | }
45 | 
46 | // Persistent converts the in-memory state into a PersistentState.
47 | func (s *state) Persistent() types.PersistentState {
48 | segs := make([]types.SegmentInfo, 0, s.segments.Len())
49 | it := s.segments.Iterator()
50 | for !it.Done() {
51 | _, s, _ := it.Next()
52 | segs = append(segs, s.SegmentInfo)
53 | }
54 | return types.PersistentState{
55 | NextSegmentID: s.nextSegmentID,
56 | Segments: segs,
57 | }
58 | }
59 | 
60 | func (s *state) getLog(index uint64) (*types.PooledBuffer, error) {
61 | // Check the tail writer first.
62 | if s.tail != nil {
63 | raw, err := s.tail.GetLog(index)
64 | if err != nil && err != ErrNotFound {
65 | // Return real errors; swallowing them could mask the fact that the index
66 | // really is in the tail but failed to read for some other reason.
67 | return nil, err
68 | }
69 | if err == nil {
70 | // No error means we found it and just need to decode.
71 | return raw, nil
72 | }
73 | // Not in the tail segment, fall back to searching previous segments.
74 | }
75 | 
76 | seg, err := s.findSegmentReader(index)
77 | if err != nil {
78 | return nil, err
79 | }
80 | 
81 | return seg.GetLog(index)
82 | }
83 | 
84 | // findSegmentReader searches the segment tree for the segment that contains the
85 | // log at index idx. It may return the tail segment, which may not in fact
86 | // contain idx if idx is larger than the last written index. Typically this is
87 | // called after already checking with the tail writer whether the log is in
88 | // there, which means the caller can be sure it's not going to return the tail
89 | // segment.
90 | func (s *state) findSegmentReader(idx uint64) (types.SegmentReader, error) {
91 | 
92 | if s.segments.Len() == 0 {
93 | return nil, ErrNotFound
94 | }
95 | 
96 | // Search for a segment with baseIndex.
97 | it := s.segments.Iterator()
98 | 
99 | // The baseIndex we want is the first one lower than or equal to idx. Seek
100 | // gets us to the first result equal or greater, so we are either at it (if
101 | // equal) or on the one _after_ the one we need, hence we step back.
102 | it.Seek(idx)
103 | // The first call to Next/Prev actually returns the node the iterator is
104 | // currently on (which is probably the one after the one we want) but in some
105 | // edge cases we might actually want this one. Rather than reversing back and
106 | // coming forward again, just check both this and the one before it.
107 | _, seg, ok := it.Prev()
108 | if ok && seg.BaseIndex > idx {
109 | _, seg, ok = it.Prev()
110 | }
111 | 
112 | // We either have the right segment or it doesn't exist.
113 | if ok && seg.MinIndex <= idx && (seg.MaxIndex == 0 || seg.MaxIndex >= idx) {
114 | return seg.r, nil
115 | }
116 | 
117 | return nil, ErrNotFound
118 | }
119 | 
120 | func (s *state) getTailInfo() *segmentState {
121 | it := s.segments.Iterator()
122 | it.Last()
123 | _, tail, ok := it.Next()
124 | if !ok {
125 | return nil
126 | }
127 | return &tail
128 | }
129 | 
130 | func (s *state) append(entries []types.LogEntry) error {
131 | return s.tail.Append(entries)
132 | }
133 | 
134 | func (s *state) firstIndex() uint64 {
135 | it := s.segments.Iterator()
136 | _, seg, ok := it.Next()
137 | if !ok {
138 | return 0
139 | }
140 | if seg.SealTime.IsZero() {
141 | // The first segment is unsealed so it is also the tail. Check it actually
142 | // has at least one log in it, otherwise it doesn't matter what the
143 | // BaseIndex/MinIndex are.
144 | if s.tail.LastIndex() == 0 {
145 | // No logs in the WAL.
146 | return 0
147 | }
148 | // At least one log exists, return the MinIndex.
149 | }
150 | return seg.MinIndex
151 | }
152 | 
153 | func (s *state) lastIndex() uint64 {
154 | tailIdx := s.tail.LastIndex()
155 | if tailIdx > 0 {
156 | return tailIdx
157 | }
158 | // The current tail is empty. Check whether there are previous sealed segments.
159 | it := s.segments.Iterator()
160 | it.Last()
161 | _, _, ok := it.Prev()
162 | if !ok {
163 | // No tail! Shouldn't be possible, but it means there are no logs yet.
164 | return 0
165 | }
166 | // Go back to the segment before the tail.
167 | _, _, ok = it.Prev()
168 | if !ok {
169 | // No previous segment so the whole log is empty.
170 | return 0
171 | }
172 | 
173 | // There was a previous segment, so its MaxIndex will be one less than the
174 | // tail's BaseIndex.
175 | tailSeg := s.getTailInfo()
176 | if tailSeg == nil || tailSeg.BaseIndex == 0 {
177 | return 0
178 | }
179 | return tailSeg.BaseIndex - 1
180 | }
181 | 
182 | func (s *state) acquire() func() {
183 | atomic.AddInt32(&s.refCount, 1)
184 | return s.release
185 | }
186 | 
187 | func (s *state) release() {
188 | // Decrement on release.
189 | count := atomic.AddInt32(&s.refCount, -1)
190 | if count == 0 {
191 | // Clean up state associated with this version now that all refs have gone.
192 | // Since there are no more refs and we should not set a finalizer until this
193 | // state is no longer the active state, we can be sure this will happen only once.
194 | // Even so, let's swap the fn to ensure we only ever call the finalizer once!
195 | // We can't swap an actual nil as it's not the same type as func(), so we do
196 | // a dance with a nilFn below.
197 | var nilFn func()
198 | fnRaw := s.finalizer.Swap(nilFn)
199 | if fn, ok := fnRaw.(func()); ok && fn != nil {
200 | fn()
201 | }
202 | }
203 | }
204 | 
205 | // clone returns a new state which is a shallow copy of just the immutable parts
206 | // of s. This is safer than a simple assignment copy because that "reads" the
207 | // atomically modified state non-atomically. We never want to copy the refCount
208 | // or finalizer anyway.
209 | func (s *state) clone() state {
210 | return state{
211 | nextSegmentID: s.nextSegmentID,
212 | segments: s.segments,
213 | tail: s.tail,
214 | }
215 | }
216 | 
--------------------------------------------------------------------------------
/types/segment.go:
--------------------------------------------------------------------------------
1 | // Copyright IBM Corp. 2020, 2025
2 | // SPDX-License-Identifier: MPL-2.0
3 | 
4 | package types
5 | 
6 | import (
7 | "io"
8 | "time"
9 | )
10 | 
11 | // SegmentInfo is the metadata describing a single WAL segment.
12 | type SegmentInfo struct {
13 | // ID uniquely identifies this segment file.
14 | ID uint64
15 | 
16 | // BaseIndex is the raft index of the first entry that will be written to the
17 | // segment.
18 | BaseIndex uint64
19 | 
20 | // MinIndex is the logical lowest index that still exists in the segment. It
21 | // may be greater than BaseIndex if a head truncation has "deleted" a prefix
22 | // of the segment.
23 | MinIndex uint64
24 | 
25 | // MaxIndex is the logical highest index that still exists in the segment. It
26 | // may be lower than the actual highest index if a tail truncation has
27 | // "deleted" a suffix of the segment. It is zero for unsealed segments and
28 | // only set on seal.
29 | MaxIndex uint64
30 | 
31 | // Codec identifies the codec used to encode log entries. Codec values 0 to
32 | // 16k are reserved for internal future usage. Custom codecs must be
33 | // registered with an identifier higher than this, which the caller is
34 | // responsible for ensuring uniquely identifies the specific version of their
35 | // codec used in any given log. uint64 provides sufficient space that a
36 | // randomly generated identifier is almost certainly unique.
37 | Codec uint64
38 | 
39 | // IndexStart is the file offset where the index can be read from. It's 0 for
40 | // tail segments and only set after a segment is sealed.
41 | IndexStart uint64
42 | 
43 | // CreateTime records when the segment was first created.
44 | CreateTime time.Time
45 | 
46 | // SealTime records when the segment was sealed. Zero indicates that it's not
47 | // sealed yet.
48 | SealTime time.Time
49 | 
50 | // SizeLimit is the soft limit for the segment's size. The segment file may be
51 | // pre-allocated to this size on filesystems that support it. It is a soft
52 | // limit in the sense that the final Append usually takes the segment file
53 | // past this size before it is considered full and sealed.
54 | SizeLimit uint32
55 | }
56 | 
57 | // SegmentFiler is the interface that provides access to segments to the WAL. It
58 | // encapsulates creating and recovering segments and returning reader or writer
59 | // interfaces to interact with them. Its main purpose is to abstract the core
60 | // WAL logic from the actual encoding layer of segment files. You can think
61 | // of it as a layer of abstraction above the VFS, which abstracts actual file
62 | // system operations on files but knows nothing about the format. In tests, for
63 | // example, we can implement a SegmentFiler that is way simpler than the real
64 | // encoding/decoding layer on top of a VFS - even an in-memory VFS - which makes
65 | // tests much simpler to write and run.
66 | type SegmentFiler interface {
67 | // Create adds a new segment with the given info and returns a writer or an
68 | // error.
69 | Create(info SegmentInfo) (SegmentWriter, error)
70 | 
71 | // RecoverTail is called on an unsealed segment when re-opening the WAL; it
72 | // will attempt to recover from a possible crash. It will either return an
73 | // error, or return a valid segmentWriter that is ready for further appends.
74 | // If the expected tail segment doesn't exist it must return an error wrapping
75 | // os.ErrNotExist.
76 | RecoverTail(info SegmentInfo) (SegmentWriter, error)
77 | 
78 | // Open opens an already sealed segment for reading. Open may validate the
79 | // file's header and return an error if it doesn't match the expected info.
80 | Open(info SegmentInfo) (SegmentReader, error)
81 | 
82 | // List returns the set of segment IDs currently stored. It's used by the WAL
83 | // on recovery to find any segment files that need to be deleted following an
84 | // unclean shutdown. The returned map is a map of ID -> BaseIndex. BaseIndex
85 | // is returned to allow subsequent Delete calls to be made.
86 | List() (map[uint64]uint64, error)
87 | 
88 | // Delete removes the segment with the given baseIndex and id if it exists. Note
89 | // that baseIndex is technically redundant since ID is unique on its own. But
90 | // in practice we name files (or keys) with both so that they sort correctly.
91 | // This interface allows a simpler implementation where we can just delete
92 | // the file if it exists without having to scan the underlying storage for a match.
93 | Delete(baseIndex, ID uint64) error
94 | }
95 | 
96 | // SegmentWriter manages appending logs to the tail segment of the WAL. It's an
97 | // interface to make testing the core WAL simpler. Every SegmentWriter will have
98 | // either `init` or `recover` called once before any other methods. When either
99 | // returns, it must either return an error or be ready to accept new writes and
100 | // reads.
101 | type SegmentWriter interface {
102 | io.Closer
103 | SegmentReader
104 | 
105 | // Append adds one or more entries. It must not return until the entries are
106 | // durably stored, otherwise raft's guarantees will be compromised. Append must
107 | // not be called concurrently with any other call to Sealed, Append or
108 | // ForceSeal.
109 | Append(entries []LogEntry) error
110 | 
111 | // Sealed returns whether the segment is sealed or not. If it is, it returns
112 | // true and the file offset at which its index array starts, to be saved in
113 | // metadata. The WAL will call this after every append so it should be relatively
114 | // cheap in the common case. This design allows the final Append to write out
115 | // the index or any additional data needed at seal time in the same fsync.
116 | // Sealed must not be called concurrently with any other call to Sealed,
117 | // Append or ForceSeal.
118 | Sealed() (bool, uint64, error)
119 | 
120 | // ForceSeal causes the segment to become sealed by writing out an index
121 | // block. This is not used in the typical flow of append and rotation, but is
122 | // necessary during truncations where some suffix of the writer needs to be
123 | // truncated. Rather than manipulate what is on disk in a complex way, the WAL
124 | // will simply force seal it with whatever state it has already saved and then
125 | // open a new segment at the right offset for continued writing. ForceSeal may
126 | // be called on a segment that has already been sealed and should just return
127 | // the existing index offset in that case. (We don't actually rely on that
128 | // currently but it's easier not to assume we'll always call it at most once.)
129 | // ForceSeal must not be called concurrently with any other call to Sealed,
130 | // Append or ForceSeal.
131 | ForceSeal() (uint64, error)
132 | 
133 | // LastIndex returns the most recently persisted index in the log. It must
134 | // respond without blocking on Append since it's needed frequently by read
135 | // paths that may call it concurrently. Typically this will be loaded from an
136 | // atomic int. If the segment is empty, LastIndex should return zero.
137 | LastIndex() uint64
138 | }
139 | 
140 | // SegmentReader wraps a ReadableFile to allow lookup of logs in an existing
141 | // segment file. It's an interface to make testing the core WAL simpler. The first
142 | // call will always be validate, which passes in the ReaderAt to be used for
143 | // subsequent reads.
144 | type SegmentReader interface {
145 | io.Closer
146 | 
147 | // GetLog returns the raw log entry bytes associated with idx. If the log
148 | // doesn't exist in this segment, ErrNotFound must be returned.
149 | GetLog(idx uint64) (*PooledBuffer, error)
150 | }
151 | 
--------------------------------------------------------------------------------
/integration/integration_test.go:
--------------------------------------------------------------------------------
1 | // Copyright IBM Corp. 2020, 2025
2 | // SPDX-License-Identifier: MPL-2.0
3 | 
4 | package integration
5 | 
6 | import (
7 | "bytes"
8 | "fmt"
9 | "os"
10 | "strings"
11 | "testing"
12 | 
13 | "github.com/hashicorp/raft"
14 | wal "github.com/hashicorp/raft-wal"
15 | "github.com/hashicorp/raft-wal/metadb"
16 | "github.com/stretchr/testify/require"
17 | )
18 | 
19 | type step func(w *wal.WAL) error
20 | 
21 | func TestIntegrationScenarios(t *testing.T) {
22 | cases := []struct {
23 | name string
24 | steps []step
25 | expectFirstIdx, expectLastIdx int
26 | expectNumSegments int
27 | }{
28 | {
29 | name: "basic creation, appends, rotation",
30 | steps: []step{
31 | // ~256 bytes plus overhead per log; we want to write more than the 4K
32 | // segment size. Batches of 4 are ~1k, so 5 batches is enough to rotate once.
33 | appendLogsInBatches(5, 4),
34 | },
35 | expectFirstIdx: 1,
36 | expectLastIdx: 20,
37 | expectNumSegments: 2,
38 | },
39 | {
40 | name: "starting at high index, appends, rotation",
41 | steps: []step{
42 | appendFirstLogAt(1_000_000),
43 | // ~256 bytes plus overhead per log; we want to write more than the 4K
44 | // segment size. Batches of 4 are ~1k, so 5 batches is enough to rotate once.
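// (A starting index of 1,000,000 also exercises segments whose BaseIndex is
// far from 1, so nothing below can accidentally assume logs begin at index 1.)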
45 | appendLogsInBatches(5, 4),
46 | },
47 | expectFirstIdx: 1_000_000,
48 | expectLastIdx: 1_000_020,
49 | expectNumSegments: 2,
50 | },
51 | {
52 | name: "head truncation deleting no files",
53 | steps: []step{
54 | appendLogsInBatches(11, 4),
55 | deleteRange(1, 2),
56 | },
57 | expectFirstIdx: 3,
58 | expectLastIdx: 44,
59 | expectNumSegments: 3,
60 | },
61 | {
62 | name: "head truncation deleting multiple files",
63 | steps: []step{
64 | appendLogsInBatches(11, 4),
65 | deleteRange(1, 20),
66 | },
67 | expectFirstIdx: 21,
68 | expectLastIdx: 44,
69 | expectNumSegments: 2,
70 | },
71 | {
72 | name: "tail truncation in active segment",
73 | steps: []step{
74 | appendLogsInBatches(11, 4),
75 | deleteRange(44, 44), // Delete just the last log.
76 | },
77 | expectFirstIdx: 1,
78 | expectLastIdx: 43,
79 | expectNumSegments: 4,
80 | },
81 | {
82 | name: "tail truncation in active segment and write more",
83 | steps: []step{
84 | appendLogsInBatches(11, 4),
85 | deleteRange(44, 44), // Delete just the last log.
86 | appendLogsInBatches(1, 4),
87 | },
88 | expectFirstIdx: 1,
89 | expectLastIdx: 47,
90 | expectNumSegments: 4,
91 | },
92 | {
93 | name: "tail truncation deleting files",
94 | steps: []step{
95 | appendLogsInBatches(11, 4),
96 | deleteRange(20, 44),
97 | },
98 | expectFirstIdx: 1,
99 | expectLastIdx: 19,
100 | // Only need 2 segments but the truncation will rotate to a new tail.
101 | expectNumSegments: 3,
102 | },
103 | {
104 | name: "tail truncation deleting files and write more",
105 | steps: []step{
106 | appendLogsInBatches(11, 4),
107 | deleteRange(20, 44),
108 | appendLogsInBatches(1, 4),
109 | },
110 | expectFirstIdx: 1,
111 | expectLastIdx: 23,
112 | // Only need 2 segments but the truncation will rotate to a new tail.
113 | expectNumSegments: 3,
114 | },
115 | {
116 | name: "write some logs, truncate everything, restart logs from different index",
117 | steps: []step{
118 | appendLogsInBatches(11, 4),
119 | deleteRange(1, 44),
120 | appendFirstLogAt(1000),
121 | appendLogsInBatches(1, 4),
122 | },
123 | expectFirstIdx: 1000,
124 | expectLastIdx: 1004,
125 | expectNumSegments: 1,
126 | },
127 | }
128 | 
129 | for _, tc := range cases {
130 | tc := tc
131 | t.Run(tc.name, func(t *testing.T) {
132 | t.Parallel()
133 | 
134 | tmpDir, err := os.MkdirTemp("", tc.name)
135 | require.NoError(t, err)
136 | defer os.RemoveAll(tmpDir)
137 | 
138 | // Wrap the BoltDB meta store so we can peek into its values.
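// PeekingMetaStore is assumed to be the test helper in integration/meta.go
// from the tree above; it should wrap any types.MetaStore and expose
// PeekState so tests can assert on the most recently committed segment
// metadata.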
139 | meta := &PeekingMetaStore{ 140 | meta: &metadb.BoltMetaDB{}, 141 | } 142 | 143 | w, err := wal.Open(tmpDir, 144 | // 4k segments to test rotation quicker 145 | wal.WithSegmentSize(4096), 146 | wal.WithMetaStore(meta), 147 | ) 148 | require.NoError(t, err) 149 | 150 | // Execute initial operations 151 | for i, step := range tc.steps { 152 | require.NoError(t, step(w), "failed on step %d", i) 153 | } 154 | 155 | // Assert expected properties 156 | assertLogContents(t, w, tc.expectFirstIdx, tc.expectLastIdx) 157 | assertNumSegments(t, meta, tmpDir, tc.expectNumSegments) 158 | 159 | // Close WAL and re-open 160 | require.NoError(t, w.Close()) 161 | 162 | meta2 := &PeekingMetaStore{ 163 | meta: &metadb.BoltMetaDB{}, 164 | } 165 | 166 | w2, err := wal.Open(tmpDir, 167 | wal.WithSegmentSize(4096), 168 | wal.WithMetaStore(meta2), 169 | ) 170 | require.NoError(t, err) 171 | defer w2.Close() 172 | 173 | // Assert expected properties still hold 174 | assertLogContents(t, w2, tc.expectFirstIdx, tc.expectLastIdx) 175 | assertNumSegments(t, meta2, tmpDir, tc.expectNumSegments) 176 | }) 177 | } 178 | } 179 | 180 | func appendLogsInBatches(nBatches, nPerBatch int) step { 181 | return func(w *wal.WAL) error { 182 | lastIdx, err := w.LastIndex() 183 | if err != nil { 184 | return err 185 | } 186 | nextIdx := lastIdx + 1 187 | 188 | return appendLogsInBatchesStartingAt(w, nBatches, nPerBatch, int(nextIdx)) 189 | } 190 | } 191 | 192 | func appendFirstLogAt(index int) step { 193 | return func(w *wal.WAL) error { 194 | return appendLogsInBatchesStartingAt(w, 1, 1, index) 195 | } 196 | } 197 | 198 | func appendLogsInBatchesStartingAt(w *wal.WAL, nBatches, nPerBatch, firstIndex int) error { 199 | nextIdx := uint64(firstIndex) 200 | 201 | batch := make([]*raft.Log, 0, nPerBatch) 202 | for b := 0; b < nBatches; b++ { 203 | for i := 0; i < nPerBatch; i++ { 204 | log := raft.Log{ 205 | Index: nextIdx, 206 | Data: makeValue(nextIdx), 207 | } 208 | batch = append(batch, &log) 209 | nextIdx++ 210 | } 211 | if err := w.StoreLogs(batch); err != nil { 212 | return err 213 | } 214 | batch = batch[:0] 215 | } 216 | return nil 217 | } 218 | 219 | func makeValue(n uint64) []byte { 220 | // Values are 16 repetitions of a 16 byte string based on the index so 256 221 | // bytes total. 
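// For example, n=5 yields "val-00000000005\n" (16 bytes) repeated 16 times.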
222 | return bytes.Repeat([]byte(fmt.Sprintf("val-%011d\n", n)), 16)
223 | }
224 | 
225 | func deleteRange(min, max int) step {
226 | return func(w *wal.WAL) error {
227 | return w.DeleteRange(uint64(min), uint64(max))
228 | }
229 | }
230 | 
231 | func assertLogContents(t *testing.T, w *wal.WAL, first, last int) {
232 | t.Helper()
233 | 
234 | firstIdx, err := w.FirstIndex()
235 | require.NoError(t, err)
236 | lastIdx, err := w.LastIndex()
237 | require.NoError(t, err)
238 | 
239 | require.Equal(t, first, int(firstIdx))
240 | require.Equal(t, last, int(lastIdx))
241 | 
242 | var log raft.Log
243 | for i := first; i <= last; i++ {
244 | err := w.GetLog(uint64(i), &log)
245 | require.NoError(t, err, "log index %d", i)
246 | require.Equal(t, i, int(log.Index), "log index %d", i)
247 | require.Equal(t, string(makeValue(log.Index)), string(log.Data), "log index %d", i)
248 | }
249 | }
250 | 
251 | func assertNumSegments(t *testing.T, meta *PeekingMetaStore, dir string, numSegments int) {
252 | t.Helper()
253 | 
254 | state := meta.PeekState()
255 | require.Equal(t, numSegments, len(state.Segments))
256 | 
257 | // Check the right number of segment files is on disk too.
258 | des, err := os.ReadDir(dir)
259 | require.NoError(t, err)
260 | 
261 | segFiles := make([]string, 0, numSegments)
262 | for _, de := range des {
263 | if de.IsDir() {
264 | continue
265 | }
266 | if strings.HasSuffix(de.Name(), ".wal") {
267 | segFiles = append(segFiles, de.Name())
268 | }
269 | }
270 | require.Equal(t, numSegments, len(segFiles), "expected %d segment files, got %v", numSegments, segFiles)
271 | }
--------------------------------------------------------------------------------
/metadb/metadb.go:
--------------------------------------------------------------------------------
1 | // Copyright IBM Corp. 2020, 2025
2 | // SPDX-License-Identifier: MPL-2.0
3 | 
4 | package metadb
5 | 
6 | import (
7 | "encoding/json"
8 | "errors"
9 | "fmt"
10 | "os"
11 | "path/filepath"
12 | 
13 | "github.com/hashicorp/raft-wal/types"
14 | "go.etcd.io/bbolt"
15 | )
16 | 
17 | const (
18 | // FileName is the default file name for the bolt db file.
19 | FileName = "wal-meta.db"
20 | 
21 | // *Bucket are the names used for internal bolt buckets.
22 | MetaBucket = "wal-meta"
23 | StableBucket = "stable"
24 | 
25 | // We just need one key for now so use the byte 'm' for meta arbitrarily.
26 | MetaKey = "m"
27 | )
28 | 
29 | var (
30 | // ErrUnintialized is returned when any call is made before Load has opened
31 | // the DB file.
32 | ErrUnintialized = errors.New("uninitialized")
33 | )
34 | 
35 | // BoltMetaDB implements types.MetaStore using BoltDB as a reliable persistent
36 | // store. See the repo README for the reasons for this design choice and its
37 | // performance implications.
38 | type BoltMetaDB struct {
39 | dir string
40 | db *bbolt.DB
41 | }
42 | 
43 | func (db *BoltMetaDB) ensureOpen(dir string) error {
44 | if db.dir != "" && db.dir != dir {
45 | return fmt.Errorf("can't load dir %s, already open in dir %s", dir, db.dir)
46 | }
47 | if db.db != nil {
48 | return nil
49 | }
50 | 
51 | fileName := filepath.Join(dir, FileName)
52 | 
53 | open := func() error {
54 | bb, err := bbolt.Open(fileName, 0644, nil)
55 | if err != nil {
56 | return fmt.Errorf("failed to open %s: %w", FileName, err)
57 | }
58 | db.db = bb
59 | db.dir = dir
60 | return nil
61 | }
62 | 
63 | // BoltDB can get stuck in invalid states if we crash while it's initializing.
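// (For example, a crash after the DB file was created but before the initial
// buckets were committed.)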
64 | // We can't distinguish those as safe to just wipe the file and start again,
65 | // because we don't know for sure if it's failing due to bad init or later
66 | // corruption (which would lose data if we just wiped and started over). So to
67 | // ensure initial creation of the WAL is as crash-safe as possible, we use a
68 | // manual, atomic init procedure:
69 | // 1. Check if the file exists already. If yes, skip init and just open it.
70 | // 2. Delete any existing DB file with the tmp name.
71 | // 3. Create a new BoltDB that is empty and has the buckets, under a tmp name.
72 | // 4. Once that's committed, rename to the final name and fsync the parent dir.
73 | _, err := os.Stat(fileName)
74 | if err == nil {
75 | // File exists, just open it.
76 | return open()
77 | }
78 | if !errors.Is(err, os.ErrNotExist) {
79 | // Unknown err, just return that.
80 | return fmt.Errorf("failed to stat %s: %w", FileName, err)
81 | }
82 | 
83 | // File doesn't exist, initialize a new DB in a crash-safe way.
84 | if err := safeInitBoltDB(dir); err != nil {
85 | return fmt.Errorf("failed initializing meta DB: %w", err)
86 | }
87 | 
88 | // All good, now open it!
89 | return open()
90 | }
91 | 
92 | func safeInitBoltDB(dir string) error {
93 | tmpFileName := filepath.Join(dir, FileName+".tmp")
94 | 
95 | // Delete any old attempts to init that were unsuccessful.
96 | if err := os.RemoveAll(tmpFileName); err != nil {
97 | return err
98 | }
99 | 
100 | // Open a bolt DB at the tmp file name.
101 | bb, err := bbolt.Open(tmpFileName, 0644, nil)
102 | if err != nil {
103 | return err
104 | }
105 | 
106 | tx, err := bb.Begin(true)
107 | if err != nil {
108 | return err
109 | }
110 | // Only defer the rollback once we know tx is non-nil.
111 | defer tx.Rollback()
112 | _, err = tx.CreateBucket([]byte(MetaBucket))
113 | if err != nil {
114 | return err
115 | }
116 | _, err = tx.CreateBucket([]byte(StableBucket))
117 | if err != nil {
118 | return err
119 | }
120 | if err := tx.Commit(); err != nil {
121 | return err
122 | }
123 | // Close the file ready to rename into place and re-open. This probably isn't
124 | // necessary but it makes it easier to reason about this code path being
125 | // totally separate from the common case.
126 | if err := bb.Close(); err != nil {
127 | return err
128 | }
129 | 
130 | // We created the DB OK. Now rename it to the final name.
131 | if err := os.Rename(tmpFileName, filepath.Join(dir, FileName)); err != nil {
132 | return err
133 | }
134 | 
135 | // And fsync the parent dir to make sure the new file with its new name
136 | // is persisted!
137 | dirF, err := os.Open(dir)
138 | if err != nil {
139 | return err
140 | }
141 | err = dirF.Sync()
142 | closeErr := dirF.Close()
143 | if err != nil {
144 | return err
145 | }
146 | return closeErr
147 | }
148 | 
149 | // Load loads the existing persisted state. If there is no existing state,
150 | // implementations are expected to initialize new storage and return an
151 | // empty state.
152 | func (db *BoltMetaDB) Load(dir string) (types.PersistentState, error) {
153 | var state types.PersistentState
154 | 
155 | if err := db.ensureOpen(dir); err != nil {
156 | return state, err
157 | }
158 | 
159 | tx, err := db.db.Begin(false)
160 | if err != nil {
161 | return state, err
162 | }
163 | defer tx.Rollback()
164 | meta := tx.Bucket([]byte(MetaBucket))
165 | 
166 | // We just need one key for now so use the byte 'm' for meta arbitrarily.
167 | raw := meta.Get([]byte(MetaKey))
168 | if raw == nil {
169 | // This is valid: it's an "empty" log that will be initialized by the WAL.
170 | return state, nil 171 | } 172 | 173 | if err := json.Unmarshal(raw, &state); err != nil { 174 | return state, fmt.Errorf("%w: failed to parse persisted state: %s", types.ErrCorrupt, err) 175 | } 176 | return state, nil 177 | } 178 | 179 | // CommitState must atomically replace all persisted metadata in the current 180 | // store with the set provided. It must not return until the data is persisted 181 | // durably and in a crash-safe way otherwise the guarantees of the WAL will be 182 | // compromised. The WAL will only ever call this in a single thread at one 183 | // time and it will never be called concurrently with Load however it may be 184 | // called concurrently with Get/SetStable operations. 185 | func (db *BoltMetaDB) CommitState(state types.PersistentState) error { 186 | if db.db == nil { 187 | return ErrUnintialized 188 | } 189 | 190 | encoded, err := json.Marshal(state) 191 | if err != nil { 192 | return fmt.Errorf("failed to encode persisted state: %w", err) 193 | } 194 | 195 | tx, err := db.db.Begin(true) 196 | if err != nil { 197 | return err 198 | } 199 | defer tx.Rollback() 200 | meta := tx.Bucket([]byte(MetaBucket)) 201 | 202 | if err := meta.Put([]byte(MetaKey), encoded); err != nil { 203 | return err 204 | } 205 | 206 | return tx.Commit() 207 | } 208 | 209 | // GetStable returns a value from stable store or nil if it doesn't exist. May 210 | // be called concurrently by multiple threads. 211 | func (db *BoltMetaDB) GetStable(key []byte) ([]byte, error) { 212 | if db.db == nil { 213 | return nil, ErrUnintialized 214 | } 215 | 216 | tx, err := db.db.Begin(false) 217 | if err != nil { 218 | return nil, err 219 | } 220 | defer tx.Rollback() 221 | stable := tx.Bucket([]byte(StableBucket)) 222 | 223 | val := stable.Get(key) 224 | if val == nil { 225 | return nil, nil 226 | } 227 | 228 | // Need to copy the value since bolt only guarantees the slice is valid until 229 | // end of txn. 230 | ret := make([]byte, len(val)) 231 | copy(ret, val) 232 | return ret, nil 233 | } 234 | 235 | // SetStable stores a value from stable store. May be called concurrently with 236 | // GetStable. 237 | func (db *BoltMetaDB) SetStable(key []byte, value []byte) error { 238 | if db.db == nil { 239 | return ErrUnintialized 240 | } 241 | 242 | tx, err := db.db.Begin(true) 243 | if err != nil { 244 | return err 245 | } 246 | defer tx.Rollback() 247 | stable := tx.Bucket([]byte(StableBucket)) 248 | 249 | if value == nil { 250 | err = stable.Delete(key) 251 | } else { 252 | err = stable.Put(key, value) 253 | } 254 | if err != nil { 255 | return err 256 | } 257 | 258 | return tx.Commit() 259 | } 260 | 261 | // Close implements io.Closer 262 | func (db *BoltMetaDB) Close() error { 263 | if db.db == nil { 264 | return nil 265 | } 266 | err := db.db.Close() 267 | db.db = nil 268 | return err 269 | } 270 | -------------------------------------------------------------------------------- /segment/writer_test.go: -------------------------------------------------------------------------------- 1 | // Copyright IBM Corp. 2020, 2025 2 | // SPDX-License-Identifier: MPL-2.0 3 | 4 | package segment 5 | 6 | import ( 7 | "fmt" 8 | "sync/atomic" 9 | "testing" 10 | "time" 11 | 12 | "github.com/hashicorp/raft-wal/types" 13 | "github.com/stretchr/testify/require" 14 | ) 15 | 16 | // TestConcurrentReadersAndWriter is designed to be run with race detector 17 | // enabled to validate the concurrent behavior of the segment. 
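//
// A typical invocation (assuming the usual module layout, run from the repo
// root) would be something like:
//
//	go test -race -run TestConcurrentReadersAndWriter ./segment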
18 | func TestConcurrentReadersAndWriter(t *testing.T) {
19 | vfs := newTestVFS()
20 | f := NewFiler("test", vfs)
21 | 
22 | seg1 := testSegment(1)
23 | 
24 | // Increase the size limit so we keep going for a while. We don't want to make
25 | // this so large that we time out easily on slower machines or in CI, though.
26 | // 256KiB passes easily on my laptop (~5s) and is big enough to take a
27 | // while to test concurrent accesses.
28 | seg1.SizeLimit = 256 * 1024
29 | 
30 | wf, err := f.Create(seg1)
31 | require.NoError(t, err)
32 | 
33 | var lastIndexWritten uint64
34 | var sealedMaxIndex uint64
35 | var numReads uint64
36 | 
37 | writer := func() {
38 | idx := uint64(1)
39 | for {
40 | err := wf.Append([]types.LogEntry{{Index: idx, Data: []byte("test")}})
41 | if err != nil {
42 | panic("error during append: " + err.Error())
43 | }
44 | 
45 | sealed, _, err := wf.Sealed()
46 | if err != nil {
47 | panic("error during sealed: " + err.Error())
48 | }
49 | atomic.StoreUint64(&lastIndexWritten, idx)
50 | if sealed {
51 | atomic.StoreUint64(&sealedMaxIndex, idx)
52 | return
53 | }
54 | idx++
55 | }
56 | }
57 | 
58 | reader := func(doneCh chan<- struct{}) {
59 | // Follow the tail.
60 | idx := uint64(1)
61 | for {
62 | // Complete once the writer has stopped and we've read all of its written
63 | // entries.
64 | finalIdx := atomic.LoadUint64(&sealedMaxIndex)
65 | if finalIdx > 0 && idx > finalIdx {
66 | doneCh <- struct{}{}
67 | return
68 | }
69 | if idx > wf.LastIndex() {
70 | time.Sleep(time.Millisecond)
71 | continue
72 | }
73 | 
74 | log, err := wf.GetLog(idx)
75 | if err != nil {
76 | panic("error during GetLog: " + err.Error())
77 | }
78 | if string(log.Bs) != "test" {
79 | panic("bad log read: " + string(log.Bs))
80 | }
81 | atomic.AddUint64(&numReads, 1)
82 | idx++
83 | }
84 | }
85 | 
86 | // Start 10 readers and 1 writer in parallel.
87 | done := make(chan struct{}, 10)
88 | for i := 0; i < cap(done); i++ {
89 | go reader(done)
90 | }
91 | go writer()
92 | 
93 | complete := 0
94 | // Takes about 5 seconds on my laptop. Give it a really generous margin for CI
95 | // etc. though.
96 | timeoutCh := time.After(30 * time.Second)
97 | for complete < cap(done) {
98 | select {
99 | case <-timeoutCh:
100 | t.Fatalf("Took longer than 30 seconds to write and read the whole segment. w=%d, r=%d s=%d",
101 | atomic.LoadUint64(&lastIndexWritten),
102 | atomic.LoadUint64(&numReads),
103 | atomic.LoadUint64(&sealedMaxIndex),
104 | )
105 | case <-done:
106 | complete++
107 | }
108 | }
109 | 
110 | t.Logf("Written: %d, Read: %d, SealedMax: %d",
111 | atomic.LoadUint64(&lastIndexWritten),
112 | atomic.LoadUint64(&numReads),
113 | atomic.LoadUint64(&sealedMaxIndex),
114 | )
115 | 
116 | // Check we actually did something!
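// Rough arithmetic: each 4-byte "test" entry encodes to a 16-byte frame
// (8-byte header plus padded payload) before any commit-frame overhead, so
// sealing a 256KiB segment should take several thousand appends - comfortably
// above the 1000 asserted below.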
117 | require.Greater(t, int(atomic.LoadUint64(&lastIndexWritten)), 1000) 118 | require.Greater(t, int(atomic.LoadUint64(&numReads)), 1000) 119 | require.Greater(t, int(atomic.LoadUint64(&sealedMaxIndex)), 1000) 120 | } 121 | 122 | func TestWriterRecoversFromWriteFailure(t *testing.T) { 123 | cases := []struct { 124 | name string 125 | setupFailure func(f *testWritableFile, batch []types.LogEntry) 126 | fixFailure func(batch []types.LogEntry) 127 | }{ 128 | { 129 | name: "fwrite failure", 130 | setupFailure: func(f *testWritableFile, batch []types.LogEntry) { 131 | f.failNextWrite() 132 | }, 133 | }, 134 | { 135 | name: "fsync failure", 136 | setupFailure: func(f *testWritableFile, batch []types.LogEntry) { 137 | f.failNextSync() 138 | }, 139 | }, 140 | { 141 | name: "log append failure", 142 | setupFailure: func(f *testWritableFile, batch []types.LogEntry) { 143 | // Should cause monotonicity check to fail but only on last log after 144 | // other logs have been written and internal state updated. 145 | batch[len(batch)-1].Index = 123456 146 | }, 147 | fixFailure: func(batch []types.LogEntry) { 148 | batch[len(batch)-1].Index = batch[len(batch)-2].Index + 1 149 | }, 150 | }, 151 | } 152 | 153 | for _, tc := range cases { 154 | tc := tc 155 | 156 | testFn := func(t *testing.T, empty bool) { 157 | vfs := newTestVFS() 158 | 159 | f := NewFiler("test", vfs) 160 | 161 | seg0 := testSegment(1) 162 | 163 | w, err := f.Create(seg0) 164 | require.NoError(t, err) 165 | defer w.Close() 166 | 167 | batch := make([]types.LogEntry, 5) 168 | for i := range batch { 169 | batch[i].Index = uint64(i + 1) 170 | batch[i].Data = []byte(fmt.Sprintf("val-%d", i+1)) 171 | } 172 | maxIdx := len(batch) 173 | expectFirstIdx := 0 174 | expectLastIdx := 0 175 | 176 | if !empty { 177 | require.NoError(t, w.Append(batch)) 178 | expectFirstIdx = 1 179 | expectLastIdx = maxIdx 180 | for i := range batch { 181 | batch[i].Index = uint64(i + maxIdx + 1) 182 | batch[i].Data = []byte(fmt.Sprintf("val-%d", i+maxIdx+1)) 183 | } 184 | } 185 | 186 | tf := testFileFor(t, w) 187 | 188 | tc.setupFailure(tf, batch) 189 | 190 | require.Error(t, w.Append(batch)) 191 | assertExpectedLogs(t, w, expectFirstIdx, expectLastIdx) 192 | 193 | if tc.fixFailure != nil { 194 | tc.fixFailure(batch) 195 | } 196 | 197 | // Now retry that write, it should work! 198 | expectFirstIdx = 1 199 | expectLastIdx = int(batch[4].Index) 200 | require.NoError(t, w.Append(batch)) 201 | assertExpectedLogs(t, w, expectFirstIdx, expectLastIdx) 202 | 203 | // Also, re-open the file "from disk" to make sure what has been written 204 | // is correct and recoverable! 
205 | w2, err := f.RecoverTail(seg0)
206 | require.NoError(t, err)
207 | assertExpectedLogs(t, w2, expectFirstIdx, expectLastIdx)
208 | w2.Close()
209 | }
210 | 
211 | t.Run(tc.name+" empty", func(t *testing.T) {
212 | testFn(t, true)
213 | })
214 | t.Run(tc.name+" non-empty", func(t *testing.T) {
215 | testFn(t, false)
216 | })
217 | }
218 | }
219 | 
220 | func assertExpectedLogs(t *testing.T, w types.SegmentWriter, first, last int) {
221 | t.Helper()
222 | 
223 | require.Equal(t, uint64(last), w.LastIndex())
224 | if last == 0 {
225 | return
226 | }
227 | assertExpectedReaderLogs(t, w, first, last)
228 | }
229 | 
230 | func assertExpectedReaderLogs(t *testing.T, r types.SegmentReader, first, last int) {
231 | t.Helper()
232 | 
233 | for idx := first; idx <= last; idx++ {
234 | buf, err := r.GetLog(uint64(idx))
235 | require.NoError(t, err)
236 | require.Equal(t, fmt.Sprintf("val-%d", idx), string(buf.Bs))
237 | buf.Close()
238 | }
239 | }
240 | 
241 | func TestWriterForceSeal(t *testing.T) {
242 | vfs := newTestVFS()
243 | 
244 | f := NewFiler("test", vfs)
245 | 
246 | seg0 := testSegment(1)
247 | 
248 | w, err := f.Create(seg0)
249 | require.NoError(t, err)
250 | defer w.Close()
251 | 
252 | batch := make([]types.LogEntry, 5)
253 | for i := range batch {
254 | batch[i].Index = uint64(i + 1)
255 | batch[i].Data = []byte(fmt.Sprintf("val-%d", i+1))
256 | }
257 | require.NoError(t, w.Append(batch))
258 | 
259 | assertExpectedLogs(t, w, 1, 5)
260 | 
261 | // Should not have sealed after one append.
262 | sealed, indexStart, err := w.Sealed()
263 | require.NoError(t, err)
264 | require.False(t, sealed)
265 | require.Equal(t, 0, int(indexStart))
266 | 
267 | // Force seal it.
268 | indexStart, err = w.ForceSeal()
269 | require.NoError(t, err)
270 | require.Greater(t, int(indexStart), 0)
271 | 
272 | // It should be possible to open it with a reader now.
273 | seg0.IndexStart = indexStart
274 | r, err := f.Open(seg0)
275 | require.NoError(t, err)
276 | 
277 | assertExpectedReaderLogs(t, r, 1, 5)
278 | }
--------------------------------------------------------------------------------
/segment/format.go:
--------------------------------------------------------------------------------
1 | // Copyright IBM Corp. 2020, 2025
2 | // SPDX-License-Identifier: MPL-2.0
3 | 
4 | package segment
5 | 
6 | import (
7 | "bytes"
8 | "encoding/binary"
9 | "errors"
10 | "fmt"
11 | "io"
12 | 
13 | "github.com/hashicorp/raft-wal/types"
14 | )
15 | 
16 | const (
17 | // MaxEntrySize is the largest we allow any single raft log entry to be. This
18 | // is larger than our raft implementation ever allows, so it seems safe to
19 | // encode statically for now. We could make this configurable. Its main
20 | // purpose is to limit allocation when reading entries back if their lengths
21 | // are corrupted.
22 | MaxEntrySize = 64 * 1024 * 1024 // 64 MiB
23 | 
24 | // minBufSize is the size at which we allocate read and write buffers. Setting
25 | // it larger wastes more memory but increases the chances that we'll read the
26 | // whole frame in a single shot and not need a second allocation and trip to
27 | // the disk.
28 | minBufSize = 64 * 1024
29 | 
30 | fileHeaderLen = 32
31 | version = 0
32 | magic = 0x58eb6b0d
33 | 
34 | // Note that this must remain a power of 2 to ensure aligning to this also
35 | // aligns to sector boundaries.
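// For example, a 5-byte payload is followed by 3 bytes of zero padding (see
// padLen below), so the next frame header starts back on an 8-byte boundary
// and an aligned 8-byte header can never straddle a 512-byte sector edge.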
36 | frameHeaderLen = 8 37 | ) 38 | 39 | const ( // Start iota from 0 40 | FrameInvalid uint8 = iota 41 | FrameEntry 42 | FrameIndex 43 | FrameCommit 44 | ) 45 | 46 | var ( 47 | // ErrTooBig indicates that the caller tried to write a logEntry with a 48 | // payload that's larger than we are prepared to support. 49 | ErrTooBig = errors.New("entries larger than 64MiB are not supported") 50 | ) 51 | 52 | /* 53 | 54 | File Header functions 55 | 56 | 0 1 2 3 4 5 6 7 8 57 | +------+------+------+------+------+------+------+------+ 58 | | Magic | Reserved | Vsn | 59 | +------+------+------+------+------+------+------+------+ 60 | | BaseIndex | 61 | +------+------+------+------+------+------+------+------+ 62 | | SegmentID | 63 | +------+------+------+------+------+------+------+------+ 64 | | Codec | 65 | +------+------+------+------+------+------+------+------+ 66 | 67 | */ 68 | 69 | // writeFileHeader writes a file header into buf for the given file metadata. 70 | func writeFileHeader(buf []byte, info types.SegmentInfo) error { 71 | if len(buf) < fileHeaderLen { 72 | return io.ErrShortBuffer 73 | } 74 | 75 | binary.LittleEndian.PutUint32(buf[0:4], magic) 76 | // Explicitly zero Reserved bytes just in case 77 | buf[4] = 0 78 | buf[5] = 0 79 | buf[6] = 0 80 | buf[7] = version 81 | binary.LittleEndian.PutUint64(buf[8:16], info.BaseIndex) 82 | binary.LittleEndian.PutUint64(buf[16:24], info.ID) 83 | binary.LittleEndian.PutUint64(buf[24:32], info.Codec) 84 | return nil 85 | } 86 | 87 | // readFileHeader reads a file header from buf. 88 | func readFileHeader(buf []byte) (*types.SegmentInfo, error) { 89 | if len(buf) < fileHeaderLen { 90 | return nil, io.ErrShortBuffer 91 | } 92 | 93 | var i types.SegmentInfo 94 | m := binary.LittleEndian.Uint64(buf[0:8]) 95 | if m != magic { 96 | return nil, types.ErrCorrupt 97 | } 98 | if buf[7] != version { 99 | return nil, types.ErrCorrupt 100 | } 101 | i.BaseIndex = binary.LittleEndian.Uint64(buf[8:16]) 102 | i.ID = binary.LittleEndian.Uint64(buf[16:24]) 103 | i.Codec = binary.LittleEndian.Uint64(buf[24:32]) 104 | return &i, nil 105 | } 106 | 107 | func validateFileHeader(got, expect types.SegmentInfo) error { 108 | if expect.ID != got.ID { 109 | return fmt.Errorf("%w: segment header ID %x doesn't match metadata %x", 110 | types.ErrCorrupt, got.ID, expect.ID) 111 | } 112 | if expect.BaseIndex != got.BaseIndex { 113 | return fmt.Errorf("%w: segment header BaseIndex %d doesn't match metadata %d", 114 | types.ErrCorrupt, got.BaseIndex, expect.BaseIndex) 115 | } 116 | if expect.Codec != got.Codec { 117 | return fmt.Errorf("%w: segment header Codec %d doesn't match metadata %d", 118 | types.ErrCorrupt, got.Codec, expect.Codec) 119 | } 120 | 121 | return nil 122 | } 123 | 124 | /* 125 | Frame Functions 126 | 127 | 0 1 2 3 4 5 6 7 8 128 | +------+------+------+------+------+------+------+------+ 129 | | Type | Reserved | Length/CRC | 130 | +------+------+------+------+------+------+------+------+ 131 | */ 132 | 133 | type frameHeader struct { 134 | typ uint8 135 | len uint32 136 | crc uint32 137 | } 138 | 139 | func writeFrame(buf []byte, h frameHeader, payload []byte) error { 140 | if len(buf) < encodedFrameSize(int(h.len)) { 141 | return io.ErrShortBuffer 142 | } 143 | if err := writeFrameHeader(buf, h); err != nil { 144 | return err 145 | } 146 | copy(buf[frameHeaderLen:], payload[:h.len]) 147 | // Explicitly write null bytes for padding 148 | padBytes := padLen(int(h.len)) 149 | for i := 0; i < padBytes; i++ { 150 | buf[frameHeaderLen+int(h.len)+i] = 0x0 151 | } 152 
| return nil 153 | } 154 | 155 | func writeFrameHeader(buf []byte, h frameHeader) error { 156 | if len(buf) < frameHeaderLen { 157 | return io.ErrShortBuffer 158 | } 159 | buf[0] = h.typ 160 | buf[1] = 0 161 | buf[2] = 0 162 | buf[3] = 0 163 | lOrCRC := h.len 164 | if h.typ == FrameCommit { 165 | lOrCRC = h.crc 166 | } 167 | binary.LittleEndian.PutUint32(buf[4:8], lOrCRC) 168 | return nil 169 | } 170 | 171 | var zeroHeader [frameHeaderLen]byte 172 | 173 | func readFrameHeader(buf []byte) (frameHeader, error) { 174 | var h frameHeader 175 | if len(buf) < frameHeaderLen { 176 | return h, io.ErrShortBuffer 177 | } 178 | 179 | switch buf[0] { 180 | default: 181 | return h, fmt.Errorf("%w: corrupt frame header with unknown type %d", types.ErrCorrupt, buf[0]) 182 | 183 | case FrameInvalid: 184 | // Check if the whole header is zero and return a zero frame as this could 185 | // just indicate we've read right off the end of the written data during 186 | // recovery. 187 | if bytes.Equal(buf[:frameHeaderLen], zeroHeader[:]) { 188 | return h, nil 189 | } 190 | return h, fmt.Errorf("%w: corrupt frame header with type 0 but non-zero other fields", types.ErrCorrupt) 191 | 192 | case FrameEntry, FrameIndex: 193 | h.typ = buf[0] 194 | h.len = binary.LittleEndian.Uint32(buf[4:8]) 195 | 196 | case FrameCommit: 197 | h.typ = buf[0] 198 | h.crc = binary.LittleEndian.Uint32(buf[4:8]) 199 | } 200 | return h, nil 201 | } 202 | 203 | // padLen returns how many bytes of padding should be added to a frame of length 204 | // n to ensure it is a multiple of headerLen. We ensure frameHeaderLen is a 205 | // power of two so that it's always a multiple of a typical sector size (e.g. 206 | // 512 bytes) to reduce the risk that headers are torn by being written across 207 | // sector boundaries. It will return an int in the range [0, 7]. 208 | func padLen(n int) int { 209 | // This looks a bit awful but it's just doing (n % 8) and subtracting that 210 | // from 8 to get the number of bytes extra needed to get up to the next 8-byte 211 | // boundary. The extra & 7 is to handle the case where n is a multiple of 8 212 | // already and so n%8 is 0 and 8-0 is 8. By &ing 8 (0b1000) with 7 (0b111) we 213 | // effectively wrap it back around to 0. This only works as long as 214 | // frameHeaderLen is a power of 2 but that's necessary per comment above. 215 | return (frameHeaderLen - (n % frameHeaderLen)) & (frameHeaderLen - 1) 216 | } 217 | 218 | func encodedFrameSize(payloadLen int) int { 219 | return frameHeaderLen + payloadLen + padLen(payloadLen) 220 | } 221 | 222 | func indexFrameSize(numEntries int) int { 223 | // Index frames are completely unnecessary if the whole block is a 224 | // continuation with no new entries. 
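// For example, with 3 new entries: 8 (frame header) + 12 (3 offsets x 4
// bytes) + 4 (padding) = 24 bytes.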
225 | if numEntries == 0 { 226 | return 0 227 | } 228 | return encodedFrameSize(numEntries * 4) 229 | } 230 | 231 | func writeIndexFrame(buf []byte, offsets []uint32) error { 232 | if len(buf) < indexFrameSize(len(offsets)) { 233 | return io.ErrShortBuffer 234 | } 235 | fh := frameHeader{ 236 | typ: FrameIndex, 237 | len: uint32(len(offsets) * 4), 238 | } 239 | if err := writeFrameHeader(buf, fh); err != nil { 240 | return err 241 | } 242 | cursor := frameHeaderLen 243 | for _, o := range offsets { 244 | binary.LittleEndian.PutUint32(buf[cursor:], o) 245 | cursor += 4 246 | } 247 | if (len(offsets) % 2) == 1 { 248 | // Odd number of entries, zero pad to keep it 8-byte aligned 249 | binary.LittleEndian.PutUint32(buf[cursor:], 0) 250 | } 251 | return nil 252 | } 253 | -------------------------------------------------------------------------------- /segment/vfs_test.go: -------------------------------------------------------------------------------- 1 | // Copyright IBM Corp. 2020, 2025 2 | // SPDX-License-Identifier: MPL-2.0 3 | 4 | package segment 5 | 6 | import ( 7 | "bytes" 8 | "encoding/hex" 9 | "errors" 10 | "fmt" 11 | "io" 12 | "os" 13 | "sort" 14 | "sync/atomic" 15 | "testing" 16 | 17 | "github.com/hashicorp/raft-wal/types" 18 | ) 19 | 20 | // testVFS implements types.VFS for testing. 21 | type testVFS struct { 22 | dir string 23 | files map[string]*testWritableFile 24 | trash map[string]*testWritableFile 25 | 26 | listErr error 27 | createErr error 28 | deleteErr error 29 | openErr error 30 | } 31 | 32 | func newTestVFS() *testVFS { 33 | return &testVFS{ 34 | files: make(map[string]*testWritableFile), 35 | trash: make(map[string]*testWritableFile), 36 | } 37 | } 38 | 39 | // ListDir returns a list of all files in the specified dir in lexicographical 40 | // order. If the dir doesn't exist, it must return an error. Empty array with 41 | // nil error is assumed to mean that the directory exists and was readable, 42 | // but contains no files. 43 | func (fs *testVFS) ListDir(dir string) ([]string, error) { 44 | if fs.listErr != nil { 45 | return nil, fs.listErr 46 | } 47 | if err := fs.setDir(dir); err != nil { 48 | return nil, err 49 | } 50 | 51 | files := make([]string, 0, len(fs.files)) 52 | for name := range fs.files { 53 | files = append(files, name) 54 | } 55 | sort.Strings(files) 56 | return files, nil 57 | } 58 | 59 | func (fs *testVFS) setDir(dir string) error { 60 | if fs.dir == "" { 61 | fs.dir = dir 62 | return nil 63 | } 64 | if fs.dir != dir { 65 | return fmt.Errorf("VFS called for different dir. Prev=%s Current=%s", fs.dir, dir) 66 | } 67 | return nil 68 | } 69 | 70 | // Create creates a new file with the given name. If a file with the same name 71 | // already exists an error is returned. If a non-zero size is given, 72 | // implementations should make a best effort to pre-allocate the file to be 73 | // that size. The dir must already exist and be writable to the current 74 | // process. 75 | func (fs *testVFS) Create(dir string, name string, size uint64) (types.WritableFile, error) { 76 | if fs.createErr != nil { 77 | return nil, fs.createErr 78 | } 79 | if err := fs.setDir(dir); err != nil { 80 | return nil, err 81 | } 82 | _, ok := fs.files[name] 83 | if ok { 84 | return nil, fmt.Errorf("file already exists") 85 | } 86 | f := newTestWritableFile(int(size)) 87 | fs.files[name] = f 88 | return f, nil 89 | } 90 | 91 | // Delete indicates the file is no longer required. Typically it should be 92 | // deleted from the underlying system to free disk space. 
93 | func (fs *testVFS) Delete(dir string, name string) error { 94 | if fs.deleteErr != nil { 95 | return fs.deleteErr 96 | } 97 | if err := fs.setDir(dir); err != nil { 98 | return err 99 | } 100 | tf, ok := fs.files[name] 101 | if !ok { 102 | return nil 103 | } 104 | fs.trash[name] = tf 105 | delete(fs.files, name) 106 | return nil 107 | } 108 | 109 | // OpenReader opens an existing file in read-only mode. If the file doesn't 110 | // exist or permission is denied, an error is returned, otherwise no checks 111 | // are made about the well-formedness of the file, it may be empty, the wrong 112 | // size or corrupt in arbitrary ways. 113 | func (fs *testVFS) OpenReader(dir string, name string) (types.ReadableFile, error) { 114 | if fs.openErr != nil { 115 | return nil, fs.openErr 116 | } 117 | if err := fs.setDir(dir); err != nil { 118 | return nil, err 119 | } 120 | f, ok := fs.files[name] 121 | if !ok { 122 | return nil, os.ErrNotExist 123 | } 124 | return f, nil 125 | } 126 | 127 | // OpenWriter opens a file in read-write mode. If the file doesn't exist or 128 | // permission is denied, an error is returned, otherwise no checks are made 129 | // about the well-formedness of the file, it may be empty, the wrong size or 130 | // corrupt in arbitrary ways. 131 | func (fs *testVFS) OpenWriter(dir string, name string) (types.WritableFile, error) { 132 | if fs.openErr != nil { 133 | return nil, fs.openErr 134 | } 135 | if err := fs.setDir(dir); err != nil { 136 | return nil, err 137 | } 138 | f, ok := fs.files[name] 139 | if !ok { 140 | return nil, os.ErrNotExist 141 | } 142 | return f, nil 143 | } 144 | 145 | // testFileFor is a helper for reaching inside our interface types to access 146 | // the underlying "file". 147 | func testFileFor(t *testing.T, r types.SegmentReader) *testWritableFile { 148 | t.Helper() 149 | 150 | switch v := r.(type) { 151 | case *Reader: 152 | return v.rf.(*testWritableFile) 153 | case *Writer: 154 | return v.wf.(*testWritableFile) 155 | default: 156 | t.Fatalf("Invalid SegmentReader implementation passed: %T", r) 157 | return nil 158 | } 159 | } 160 | 161 | type testWritableFile struct { 162 | buf atomic.Value // []byte 163 | maxWritten int 164 | lastSyncStart int 165 | closed, dirty bool 166 | writeErr error 167 | syncErr error 168 | } 169 | 170 | func newTestWritableFile(size int) *testWritableFile { 171 | wf := &testWritableFile{} 172 | wf.buf.Store(make([]byte, 0, size)) 173 | return wf 174 | } 175 | 176 | func (f *testWritableFile) getBuf() []byte { 177 | return f.buf.Load().([]byte) 178 | } 179 | 180 | func (f *testWritableFile) failNextWrite() { 181 | f.writeErr = errors.New("IO error") 182 | } 183 | 184 | func (f *testWritableFile) failNextSync() { 185 | f.syncErr = errors.New("IO error") 186 | } 187 | 188 | // Truncate allows us to simulate the file being a different length than 189 | // expected, for example due to a crash. 190 | func (f *testWritableFile) Truncate(size int) { 191 | buf := f.getBuf() 192 | 193 | // We use buffer capacity as a proxy for "file size" so we need a new buffer 194 | // with the right capacity. We'll slice it to the minimum of the new len or 195 | // the current len. 
196 | l := len(buf) 197 | if size < l { 198 | l = size 199 | } 200 | newBuf := make([]byte, l, size) 201 | f.buf.Store(newBuf) 202 | f.maxWritten = l 203 | } 204 | 205 | func (f *testWritableFile) Dump() string { 206 | var buf bytes.Buffer 207 | d := hex.Dumper(&buf) 208 | bs := f.getBuf() 209 | max := 128 210 | if len(bs) < 128 { 211 | max = len(bs) 212 | } 213 | _, err := d.Write(bs[:max]) 214 | if err != nil { 215 | panic(err) 216 | } 217 | return buf.String() 218 | } 219 | 220 | func (f *testWritableFile) WriteAt(p []byte, off int64) (n int, err error) { 221 | if f.writeErr != nil { 222 | err := f.writeErr 223 | f.writeErr = nil 224 | return 0, err 225 | } 226 | if !f.dirty { 227 | f.lastSyncStart = int(off) 228 | } 229 | f.dirty = true 230 | maxOffset := int(off) + len(p) 231 | buf := f.getBuf() 232 | if maxOffset > len(buf) { 233 | // re-allocate to simulate appending additional bytes to end of a 234 | // pre-allocated file. 235 | nb := make([]byte, maxOffset) 236 | copy(nb, buf) 237 | buf = nb 238 | } else if off < int64(len(buf)) { 239 | // If this write is to an offset that was already visible to readers (less 240 | // than len(buf)) we can't mutate in place because that would be racy; we 241 | // need to copy the whole buffer to mutate it safely. 242 | nb := make([]byte, len(buf), cap(buf)) 243 | copy(nb, buf) 244 | buf = nb 245 | } 246 | copy(buf[off:], p) 247 | if maxOffset > f.maxWritten { 248 | f.maxWritten = maxOffset 249 | } 250 | // Atomically replace the slice to allow readers to see the new appended data 251 | // or new backing array if we reallocated. 252 | f.buf.Store(buf) 253 | return len(p), nil 254 | } 255 | 256 | func (f *testWritableFile) ReadAt(p []byte, off int64) (n int, err error) { 257 | buf := f.getBuf() 258 | // Note we treat the whole cap of buf as "in" the file 259 | if int(off) >= cap(buf) { 260 | return 0, io.EOF 261 | } 262 | // Work out how many bytes we have to read left in the "file" 263 | n = cap(buf) - int(off) 264 | if n < len(p) { 265 | // We can't fill p as there are not enough bytes left in the "file" so 266 | // whatever we do read, also return EOF like a real file does. 267 | err = io.EOF 268 | } 269 | if off >= int64(len(buf)) { 270 | // Offset is within capacity of "file" but after the maximum visible byte so 271 | // just return empty bytes. 272 | for i := 0; i < len(p); i++ { 273 | p[i] = 0 274 | } 275 | return n, err 276 | } 277 | n = copy(p, buf[off:]) 278 | return n, err 279 | } 280 | 281 | func (f *testWritableFile) Close() error { 282 | f.closed = true 283 | return nil 284 | } 285 | 286 | func (f *testWritableFile) Sync() error { 287 | if f.syncErr != nil { 288 | err := f.syncErr 289 | f.syncErr = nil 290 | return err 291 | } 292 | f.dirty = false 293 | return nil 294 | } 295 | -------------------------------------------------------------------------------- /verifier/store.go: -------------------------------------------------------------------------------- 1 | // Copyright IBM Corp. 2020, 2025 2 | // SPDX-License-Identifier: MPL-2.0 3 | 4 | package verifier 5 | 6 | import ( 7 | "encoding/binary" 8 | "errors" 9 | "fmt" 10 | "io" 11 | "sync/atomic" 12 | 13 | "github.com/hashicorp/go-hclog" 14 | "github.com/hashicorp/raft" 15 | "github.com/hashicorp/raft-wal/metrics" 16 | ) 17 | 18 | var _ raft.LogStore = &LogStore{} 19 | var _ raft.MonotonicLogStore = &LogStore{} 20 | 21 | // LogStore is a raft.LogStore that acts as middleware around an underlying 22 | // persistent store. 
It provides support for periodically verifying that ranges 23 | // of logs read back from the LogStore match the values written, and the values 24 | // read from the LogStores of other peers even though all peers will have 25 | // different actual log ranges due to independent snapshotting and truncation. 26 | // 27 | // Verification of the underlying log implementation may be performed as 28 | // follows: 29 | // 1. The application provides an implementation of `IsCheckpoint` that is 30 | // able to identify whether the encoded data represents a checkpoint 31 | // command. 32 | // 2. The application's raft leader then may periodically append such a 33 | // checkpoint log to be replicated out. 34 | // 3. When the LogStore has a log appended for which IsCheckpoint returns true, 35 | // it will write the current cumulative checksum over log entries since the 36 | // last checkpoint into the Extensions field. Since hashicorp/raft only 37 | // replicates to peers _after_ a trip through the LogStore, this checksum 38 | // will be replicated. 39 | // 4. When a follower has a log appended for which IsCheckpoint returns true, 40 | // but already has non-empty Extensions metadata, it will trigger a background 41 | // verification. 42 | // 5. Verification happens in the background and reads all logs from the 43 | // underlying store since the last checkpoint, calculating their checksums 44 | // cumulatively before calling the configured Report func with a summary of 45 | // what it found. 46 | type LogStore struct { 47 | checksum uint64 // accessed atomically 48 | sumStartIdx uint64 // accessed atomically 49 | 50 | s raft.LogStore 51 | 52 | metrics metrics.Collector 53 | log hclog.Logger 54 | 55 | verifyCh chan VerificationReport 56 | 57 | checkpointFn IsCheckpointFn 58 | reportFn ReportFn 59 | } 60 | 61 | // NewLogStore creates a verifying LogStore. The checkpointFn and reportFn must 62 | // be supplied when the store is created, _before_ it is passed to Raft; either 63 | // may be nil to bypass verification. Close must be called when the log store is 64 | // no longer useful to clean up background verification. 65 | func NewLogStore(store raft.LogStore, checkpointFn IsCheckpointFn, reportFn ReportFn, mc metrics.Collector) *LogStore { 66 | c := &LogStore{ 67 | s: store, 68 | metrics: mc, 69 | verifyCh: make(chan VerificationReport, 1), 70 | checkpointFn: checkpointFn, 71 | reportFn: reportFn, 72 | } 73 | go c.runVerifier() 74 | return c 75 | } 76 | 77 | // FirstIndex returns the first index written. 0 for no entries. 78 | func (s *LogStore) FirstIndex() (uint64, error) { 79 | return s.s.FirstIndex() 80 | } 81 | 82 | // LastIndex returns the last index written. 0 for no entries. 83 | func (s *LogStore) LastIndex() (uint64, error) { 84 | return s.s.LastIndex() 85 | } 86 | 87 | // GetLog gets a log entry at a given index. 88 | func (s *LogStore) GetLog(index uint64, log *raft.Log) error { 89 | return s.s.GetLog(index, log) 90 | } 91 | 92 | // StoreLog stores a log entry. 
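A minimal wiring sketch for the scheme described above. The checkpoint convention (a leading 'C' byte in Data) and the helper name newVerifiedStore are illustrative assumptions, not part of this library; only NewLogStore, IsCheckpointFn, ReportFn and VerificationReport come from this package.

import (
	"log"

	"github.com/hashicorp/raft"
	"github.com/hashicorp/raft-wal/metrics"
	"github.com/hashicorp/raft-wal/verifier"
)

// newVerifiedStore wraps an existing raft.LogStore with verification.
func newVerifiedStore(underlying raft.LogStore, mc metrics.Collector) *verifier.LogStore {
	isCheckpoint := func(l *raft.Log) (bool, error) {
		// Hypothetical application convention for checkpoint commands.
		return len(l.Data) > 0 && l.Data[0] == 'C', nil
	}
	report := func(r verifier.VerificationReport) {
		if r.Err != nil {
			log.Printf("log verification failed for %s: %v", r.Range, r.Err)
			return
		}
		log.Printf("log range %s verified in %s", r.Range, r.Elapsed)
	}
	// Pass the result to raft.NewRaft in place of the underlying store and
	// call Close on shutdown.
	return verifier.NewLogStore(underlying, isCheckpoint, report, mc)
}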
93 | func (s *LogStore) StoreLog(log *raft.Log) error { 94 | return s.StoreLogs([]*raft.Log{log}) 95 | } 96 | 97 | func encodeCheckpointMeta(startIdx, sum uint64) []byte { 98 | var buf [24]byte 99 | binary.LittleEndian.PutUint64(buf[0:8], ExtensionMagicPrefix) 100 | binary.LittleEndian.PutUint64(buf[8:16], startIdx) 101 | binary.LittleEndian.PutUint64(buf[16:24], sum) 102 | return buf[:] 103 | } 104 | 105 | func decodeCheckpointMeta(bs []byte) (startIdx, sum uint64, err error) { 106 | if len(bs) < 24 { 107 | return 0, 0, io.ErrShortBuffer 108 | } 109 | magic := binary.LittleEndian.Uint64(bs[0:8]) 110 | if magic != ExtensionMagicPrefix { 111 | return 0, 0, errors.New("invalid extension data") 112 | } 113 | startIdx = binary.LittleEndian.Uint64(bs[8:16]) 114 | sum = binary.LittleEndian.Uint64(bs[16:24]) 115 | return startIdx, sum, nil 116 | } 117 | 118 | func (s *LogStore) updateVerifyState(log *raft.Log, checksum, startIdx uint64) (newSum, newStartIdx uint64, r *VerificationReport, err error) { 119 | // Check if the log is a checkpoint. Note the caller already nil-checked 120 | // checkpointFn before calling this. 121 | isCP, err := s.checkpointFn(log) 122 | if err != nil { 123 | return 0, 0, nil, err 124 | } 125 | 126 | if startIdx == 0 { 127 | startIdx = log.Index 128 | } 129 | 130 | if isCP { 131 | r = &VerificationReport{ 132 | Range: LogRange{End: log.Index}, 133 | WrittenSum: checksum, 134 | } 135 | if len(log.Extensions) == 0 { 136 | // It's a new checkpoint and we must be the leader. Set our state. 137 | log.Extensions = encodeCheckpointMeta(startIdx, checksum) 138 | r.Range.Start = startIdx 139 | r.ExpectedSum = checksum 140 | } else { 141 | cpStartIdx, cpSum, err := decodeCheckpointMeta(log.Extensions) 142 | if err != nil { 143 | return 0, 0, nil, err 144 | } 145 | r.Range.Start = cpStartIdx 146 | r.ExpectedSum = cpSum 147 | 148 | // If we've calculated our own checksum over a different range than the 149 | // leader, e.g. because we just started and this is our first sum, there's 150 | // no point trying to verify, so zero out WrittenSum. 151 | if cpStartIdx != startIdx { 152 | r.WrittenSum = 0 153 | } 154 | } 155 | // Reset the checksum as we're now in the range of the next checkpoint. We 156 | // don't update the store state yet until we know these logs committed to 157 | // the underlying store. 158 | checksum = 0 159 | startIdx = log.Index 160 | } 161 | 162 | // Whether it's a checkpoint or not, hash the entry and return the updated 163 | // checksum. 164 | checksum = checksumLog(checksum, log) 165 | return checksum, startIdx, r, nil 166 | } 167 | 168 | // StoreLogs stores multiple log entries. 169 | func (s *LogStore) StoreLogs(logs []*raft.Log) error { 170 | if len(logs) < 1 { 171 | return nil 172 | } 173 | 174 | // Maintain a local copy of the checksum and sumStartIdx; we'll update the 175 | // state only once we know all these entries were stored. 176 | cs := atomic.LoadUint64(&s.checksum) 177 | startIdx := atomic.LoadUint64(&s.sumStartIdx) 178 | var triggeredReports []VerificationReport 179 | 180 | if s.checkpointFn != nil { 181 | var vr *VerificationReport 182 | var err error 183 | for _, log := range logs { 184 | cs, startIdx, vr, err = s.updateVerifyState(log, cs, startIdx) 185 | if err != nil { 186 | return fmt.Errorf("failed updating verifier state: %w", err) 187 | } 188 | if vr != nil { 189 | // We need to trigger a new checkpoint verification. But we can't until 190 | // after the logs are persisted below. 
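The checkpoint metadata written into Extensions is the fixed 24-byte little-endian layout encoded above; a package-internal round-trip sketch:

bs := encodeCheckpointMeta(100, 0xdeadbeef) // magic | startIdx | sum
start, sum, err := decodeCheckpointMeta(bs)
// start == 100, sum == 0xdeadbeef, err == nil. The leading
// ExtensionMagicPrefix lets decode reject foreign Extensions data.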
191 | triggeredReports = append(triggeredReports, *vr) 192 | } 193 | } 194 | } 195 | 196 | err := s.s.StoreLogs(logs) 197 | if err != nil { 198 | return err 199 | } 200 | 201 | // Update the checksum state now that the logs are committed. 202 | atomic.StoreUint64(&s.checksum, cs) 203 | atomic.StoreUint64(&s.sumStartIdx, startIdx) 204 | if len(triggeredReports) > 0 { 205 | s.metrics.IncrementCounter("checkpoints_written", uint64(len(triggeredReports))) 206 | } 207 | 208 | for _, r := range triggeredReports { 209 | s.triggerVerify(r) 210 | } 211 | return nil 212 | } 213 | 214 | // triggerVerify triggers a verification in the background. We won't block if 215 | // the verifier is busy. The chan has a buffer of one so there can be at most 216 | // one verification running and one waiting. If one is already waiting and the 217 | // chan would block, we drop r. 218 | func (s *LogStore) triggerVerify(r VerificationReport) { 219 | select { 220 | case s.verifyCh <- r: 221 | default: 222 | s.metrics.IncrementCounter("dropped_reports", 1) 223 | } 224 | } 225 | 226 | // DeleteRange deletes a range of log entries. The range is inclusive. 227 | func (s *LogStore) DeleteRange(min uint64, max uint64) error { 228 | return s.s.DeleteRange(min, max) 229 | } 230 | 231 | // Close cleans up the background verification routine and calls Close on the 232 | // underlying store if it is an io.Closer. 233 | func (s *LogStore) Close() error { 234 | if s.verifyCh == nil { 235 | return nil 236 | } 237 | close(s.verifyCh) 238 | // Don't set verifyCh to nil as that's racy - it's being accessed from other 239 | // routines. 240 | if closer, ok := s.s.(io.Closer); ok { 241 | return closer.Close() 242 | } 243 | return nil 244 | } 245 | 246 | // IsMonotonic implements the raft.MonotonicLogStore interface. This is a shim 247 | // to expose the underlying store as monotonically indexed or not. 248 | func (s *LogStore) IsMonotonic() bool { 249 | if store, ok := s.s.(raft.MonotonicLogStore); ok { 250 | return store.IsMonotonic() 251 | } 252 | return false 253 | } 254 | -------------------------------------------------------------------------------- /verifier/verifier.go: -------------------------------------------------------------------------------- 1 | // Copyright IBM Corp. 2020, 2025 2 | // SPDX-License-Identifier: MPL-2.0 3 | 4 | package verifier 5 | 6 | import ( 7 | "errors" 8 | "fmt" 9 | "time" 10 | 11 | "github.com/hashicorp/raft" 12 | "github.com/segmentio/fasthash/fnv1a" 13 | ) 14 | 15 | const ( 16 | // ExtensionMagicPrefix is the prefix we append to log Extensions fields to 17 | // disambiguate from other middleware that may use extensions. This value is 18 | // carefully constructed to be completely invalid as the beginning of a 19 | // protobuf (3) wire protocol message since the other known user of this field 20 | // encodes its data that way. If the first byte were 0xa8 this would be a 21 | // valid protobuf field encoding for an int field, however currently the 3 22 | // least significant bits encode the field type as 7, which is not a valid 23 | // type in the current spec. Even if this does change in the future, the 24 | // field's tag number encoded here is 123456789 so it's extremely unlikely 25 | // that any valid protobuf schema will ever have enough fields or arbitrarily 26 | // decide to assign field tags that large (though unrecognized tags would be 27 | // ignored on decode). 
Finally, the value of the field is the varint encoding 28 | // of the randomly chosen value 53906 so if type 7 is ever valid in the future 29 | // and used as a length-prefixed type, the length decoded would be way longer 30 | // than the buffer making it invalid. 31 | ExtensionMagicPrefix uint64 = 0xafd1f9d60392a503 32 | ) 33 | 34 | // IsCheckpointFn is a function that can decide whether the contents of a raft 35 | // log's Data represents a checkpoint message. It is called on every append so 36 | // it must be relatively fast in the common case. If it returns true for a log, 37 | // the log's Extensions field will be used to encode verification metadata and must 38 | // be empty - if it's not empty the append will fail and force the leader to 39 | // step down. If an error is returned the same will happen. 40 | type IsCheckpointFn func(*raft.Log) (bool, error) 41 | 42 | // ReportFn is a function that will be called after every checkpoint has been 43 | // verified. It will not be called concurrently. The VerificationReport may 44 | // represent a failure to report so its Err field should be checked. For 45 | // example, if checkpoints are arriving faster than they can be calculated, some 46 | // will be skipped and no report will be made for that range. The next report 47 | // that is delivered will contain the range missed for logging. Note that 48 | // ReportFn is called synchronously by the verifier so it should not block for 49 | // long otherwise it may cause the verifier to miss later checkpoints. 50 | type ReportFn func(VerificationReport) 51 | 52 | // ErrRangeMismatch is the error type returned in a VerificationReport where the 53 | // follower does not have enough logs on disk to fill the checkpoint's range and 54 | // so is bound to fail. This is a separate type from pure failures to read a log 55 | // because it's expected this could happen just after truncations or if the 56 | // interval is too large for the number of logs retained etc. Implementations may 57 | // choose to detect this and report it as a warning rather than a failure, as it 58 | // indicates only an inability to report correctly, not an actual error in 59 | // processing data. 60 | var ErrRangeMismatch = errors.New("range mismatch") 61 | 62 | // ErrChecksumMismatch is the error type returned in a VerificationReport where 63 | // the log range's checksum didn't match. 64 | type ErrChecksumMismatch string 65 | 66 | // Error implements error 67 | func (e ErrChecksumMismatch) Error() string { 68 | return string(e) 69 | } 70 | 71 | // LogRange describes the set of logs in the range [Start, End). That is End is 72 | // NOT inclusive. 73 | type LogRange struct { 74 | Start uint64 75 | End uint64 76 | } 77 | 78 | // String implements Stringer 79 | func (r LogRange) String() string { 80 | return fmt.Sprintf("[%d, %d)", r.Start, r.End) 81 | } 82 | 83 | // VerificationReport describes the result of attempting to verify the contents 84 | // of all logs in a range compared with the input the leader delivered for that 85 | // same range. 86 | type VerificationReport struct { 87 | // Range is the range of raft indexes over which the leader calculated its 88 | // checksum. In steady state it typically starts with the index of the 89 | // previous checkpoint command, but after an election it could be an arbitrary 90 | // point in the log. 
If the range is no longer in the server's log (due to not 91 | // seeing one yet or it being truncated too soon) this will be reported as an 92 | // Err - a longer log retention (`raft.Config.TrailingLogs`) or shorter 93 | // interval between checkpoints should be chosen if this happens often. 94 | Range LogRange 95 | 96 | // ExpectedSum is a uint64 checksum over the logs in the range as calculated 97 | // by the leader before appending to disk. 98 | ExpectedSum uint64 99 | 100 | // WrittenSum is the uint64 checksum calculated over the logs in the range of 101 | // a follower as it wrote them to its own LogStore. It might be zero to 102 | // indicate that the follower has not written all the logs in Range since 103 | // startup and so its written sum will be invalid. Risk of collision with 104 | // genuine zero sum is acceptable. If zero, the verifier will ignore it 105 | // and not raise an error if it doesn't match the expected sum. 106 | WrittenSum uint64 107 | 108 | // ReadSum is the uint64 checksum calculated over the logs in the range as 109 | // read from the underlying LogStore in the range [StartIndex, EndIndex). 110 | ReadSum uint64 111 | 112 | // Err indicates any error that prevented the report from being completed or 113 | // the result of the report. It will be set to ErrChecksumMismatch if the 114 | // report was conducted correctly, but the log data written or read checksum 115 | // did not match the leader's write checksum. The message in the error 116 | // describes the nature of the failure. 117 | Err error 118 | 119 | // SkippedRange indicates the range of logs covered by any checkpoints that 120 | // we skipped due to spending too much time verifying. If this is regularly 121 | // non-nil it likely indicates that the checkpoint frequency is too fast. 122 | SkippedRange *LogRange 123 | 124 | // Elapsed records how long it took to read the range and generate the report. 125 | Elapsed time.Duration 126 | } 127 | 128 | func (s *LogStore) runVerifier() { 129 | if s.reportFn == nil { 130 | // Nothing to do! 131 | return 132 | } 133 | 134 | var lastCheckPointIdx uint64 135 | for { 136 | report, ok := <-s.verifyCh 137 | if !ok { 138 | // Close was called 139 | return 140 | } 141 | 142 | // Detect skipped checkpoints 143 | if lastCheckPointIdx > 0 && lastCheckPointIdx != report.Range.Start { 144 | report.SkippedRange = &LogRange{ 145 | Start: lastCheckPointIdx, 146 | End: report.Range.Start, 147 | } 148 | } 149 | lastCheckPointIdx = report.Range.End 150 | 151 | st := time.Now() 152 | s.verify(&report) 153 | 154 | // Whatever state report ended up in, deliver it! 155 | report.Elapsed = time.Since(st) 156 | s.reportFn(report) 157 | s.metrics.IncrementCounter("ranges_verified", 1) 158 | } 159 | } 160 | 161 | func (s *LogStore) verify(report *VerificationReport) { 162 | // Attempt to read all the logs in the range from underlying store. 163 | var log raft.Log 164 | 165 | // If this is a follower but it _wrote_ different data to its log than the 166 | // leader in this range, then there's not much point verifying that we read it 167 | // back OK. 
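A sketch of a ReportFn that follows the guidance above, treating ErrRangeMismatch as a warning and checksum mismatches as hard errors; the log messages and function name are illustrative:

import (
	"errors"
	"log"

	"github.com/hashicorp/raft-wal/verifier"
)

func reportVerification(r verifier.VerificationReport) {
	var mismatch verifier.ErrChecksumMismatch
	switch {
	case errors.Is(r.Err, verifier.ErrRangeMismatch):
		// Expected just after truncation; not a data error.
		log.Printf("WARN: could not verify %s: range no longer on disk", r.Range)
	case errors.As(r.Err, &mismatch):
		log.Printf("ERROR: corruption detected: %v", mismatch)
	case r.Err != nil:
		log.Printf("ERROR: verification of %s failed: %v", r.Range, r.Err)
	default:
		log.Printf("verified %s (checksum %08x) in %s", r.Range, r.ReadSum, r.Elapsed)
	}
	if r.SkippedRange != nil {
		// Checkpoints may be arriving faster than they can be verified.
		log.Printf("WARN: skipped verification of %s", r.SkippedRange)
	}
}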
168 | if report.WrittenSum != 0 && report.WrittenSum != report.ExpectedSum { 169 | s.metrics.IncrementCounter("write_checksum_failures", 1) 170 | report.Err = ErrChecksumMismatch(fmt.Sprintf("log verification failed for range %s: "+ 171 | "in-flight corruption: follower wrote checksum=%08x, leader wrote checksum=%08x", 172 | report.Range, report.WrittenSum, report.ExpectedSum)) 173 | return 174 | } 175 | 176 | // Do we actually have enough logs to calculate the checksum? If not, indicate 177 | // that explicitly as it's an expected case rather than a real "error". Note 178 | // that we may get a racy false negative here if truncation happens right 179 | // between this check and the GetLog call below but there's not much we can do 180 | // about that and hopefully it's rare enough! 181 | first, err := s.s.FirstIndex() 182 | if err != nil { 183 | report.Err = fmt.Errorf("unable to verify log range %s: %w", report.Range, err) 184 | return 185 | } 186 | if first > report.Range.Start { 187 | // We don't have enough logs to calculate this correctly. 188 | report.Err = ErrRangeMismatch 189 | return 190 | } 191 | 192 | sum := uint64(0) 193 | for idx := report.Range.Start; idx < report.Range.End; idx++ { 194 | err := s.s.GetLog(idx, &log) 195 | if err != nil { 196 | report.Err = fmt.Errorf("unable to verify log range %s: %w", report.Range, err) 197 | return 198 | } 199 | sum = checksumLog(sum, &log) 200 | } 201 | report.ReadSum = sum 202 | 203 | if report.ReadSum != report.ExpectedSum { 204 | s.metrics.IncrementCounter("read_checksum_failures", 1) 205 | report.Err = ErrChecksumMismatch(fmt.Sprintf("log verification failed for range %s: "+ 206 | "storage corruption: node read checksum=%08x, leader wrote checksum=%08x", 207 | report.Range, report.ReadSum, report.ExpectedSum)) 208 | return 209 | } 210 | } 211 | 212 | func checksumLog(sum uint64, log *raft.Log) uint64 { 213 | // Special case for bootstrap config entries (index 1, type configuration) 214 | // since these are not replicated by raft and so may not be byte-for-byte 215 | // identical as long as they are logically the same on all peers. So just treat 216 | // them all as identical to avoid false positives on startup. 217 | if log.Index == 1 && log.Type == raft.LogConfiguration { 218 | return 0 219 | } 220 | sum = fnv1a.AddUint64(sum, log.Index) 221 | sum = fnv1a.AddUint64(sum, log.Term) 222 | sum = fnv1a.AddUint64(sum, uint64(log.Type)) 223 | sum = fnv1a.AddBytes64(sum, log.Data) 224 | if len(log.Extensions) > 0 { 225 | sum = fnv1a.AddBytes64(sum, log.Extensions) 226 | } 227 | return sum 228 | } 229 | -------------------------------------------------------------------------------- /migrate/migrate_test.go: -------------------------------------------------------------------------------- 1 | // Copyright IBM Corp. 2020, 2025 2 | // SPDX-License-Identifier: MPL-2.0 3 | 4 | package migrate 5 | 6 | import ( 7 | "context" 8 | "encoding/binary" 9 | "fmt" 10 | "strings" 11 | "testing" 12 | "time" 13 | 14 | "github.com/hashicorp/raft" 15 | "github.com/hashicorp/raft-wal/types" 16 | "github.com/stretchr/testify/require" 17 | ) 18 | 19 | func TestCopyLogs(t *testing.T) { 20 | cases := []struct { 21 | name string 22 | startIndex uint64 23 | numLogs int 24 | batchBytes int 25 | wantNumBatches int 26 | nilChan bool 27 | cancelCtx bool 28 | wantErr string 29 | }{ 30 | { 31 | name: "basic copy", 32 | startIndex: 1234, 33 | numLogs: 1000, 34 | // Each log is 26 bytes but we assume 32 bytes of overhead for encoding in 
So each log takes up 58 bytes of our batch size. 550 bytes is 36 | // not quite enough for 10 but we treat it as a soft limit so we'll get 10 37 | // per batch 38 | batchBytes: 550, 39 | wantNumBatches: 100, 40 | }, 41 | { 42 | name: "start from 1", 43 | startIndex: 1, 44 | numLogs: 1000, 45 | batchBytes: 580, // Exact fit for 10 entries 46 | wantNumBatches: 100, 47 | }, 48 | { 49 | name: "nil progress chan", 50 | startIndex: 1234, 51 | numLogs: 1000, 52 | batchBytes: 580, 53 | wantNumBatches: 100, 54 | // A nil progress chan shouldn't block the copy. 55 | nilChan: true, 56 | }, 57 | { 58 | name: "context cancel", 59 | startIndex: 1234, 60 | numLogs: 1000, 61 | batchBytes: 580, 62 | wantNumBatches: 0, 63 | cancelCtx: true, 64 | wantErr: "context canceled", 65 | }, 66 | } 67 | 68 | for _, tc := range cases { 69 | tc := tc 70 | t.Run(tc.name, func(t *testing.T) { 71 | src := populateTestLogStore(t, tc.startIndex, tc.numLogs) 72 | dst := &testLogStore{} 73 | var progress chan string 74 | if !tc.nilChan { 75 | // Buffer it more than enough so we won't have to read concurrently. 76 | progress = make(chan string, tc.wantNumBatches*3) 77 | } 78 | 79 | ctx := context.Background() 80 | if tc.cancelCtx { 81 | cancelledCtx, cancel := context.WithCancel(ctx) 82 | cancel() 83 | ctx = cancelledCtx 84 | } 85 | 86 | err := CopyLogs(ctx, dst, src, tc.batchBytes, progress) 87 | if tc.wantErr != "" { 88 | require.ErrorContains(t, err, tc.wantErr) 89 | return 90 | } 91 | require.NoError(t, err, "failed copy") 92 | 93 | if progress != nil { 94 | // This loop will not return if progress wasn't closed. 95 | for s := range progress { 96 | t.Log(s) 97 | } 98 | } 99 | 100 | // Verify the copy! 101 | wantFirst, _ := src.FirstIndex() 102 | wantLast, _ := src.LastIndex() 103 | gotFirst, _ := dst.FirstIndex() 104 | gotLast, _ := dst.LastIndex() 105 | require.Equal(t, int(wantFirst), int(gotFirst)) 106 | require.Equal(t, int(wantLast), int(gotLast)) 107 | 108 | var log raft.Log 109 | for idx := wantFirst; idx <= wantLast; idx++ { 110 | err := dst.GetLog(idx, &log) 111 | require.NoError(t, err) 112 | require.Equal(t, int(idx), int(log.Index)) 113 | require.Equal(t, string(logPayload(idx)), string(log.Data)) 114 | } 115 | 116 | // Validate that we actually split into chunks as expected. 
117 | require.Equal(t, tc.wantNumBatches, dst.appends) 118 | }) 119 | } 120 | } 121 | 122 | func TestCopyStable(t *testing.T) { 123 | cases := []struct { 124 | name string 125 | srcVals map[string]string 126 | srcIntVals map[string]uint64 127 | extraKeys [][]byte 128 | extraIntKeys [][]byte 129 | nilChan bool 130 | cancelCtx bool 131 | wantErr string 132 | }{ 133 | { 134 | name: "basic raft data", 135 | srcVals: map[string]string{ 136 | "LastVoteCand": "s1", 137 | }, 138 | srcIntVals: map[string]uint64{ 139 | "CurrentTerm": 1234, 140 | "LastVoteTerm": 1000, 141 | }, 142 | }, 143 | { 144 | name: "context cancelled", 145 | srcVals: map[string]string{ 146 | "LastVoteCand": "s1", 147 | }, 148 | srcIntVals: map[string]uint64{ 149 | "CurrentTerm": 1234, 150 | "LastVoteTerm": 1000, 151 | }, 152 | cancelCtx: true, 153 | wantErr: "context canceled", 154 | }, 155 | { 156 | name: "additional keys", 157 | srcVals: map[string]string{ 158 | "LastVoteCand": "s1", 159 | "my_app_key": "foo", 160 | "my_other_key": "baz", 161 | "no_copy": "bar", 162 | }, 163 | srcIntVals: map[string]uint64{ 164 | "CurrentTerm": 1234, 165 | "LastVoteTerm": 1000, 166 | "favorite_term_so_far": 569, 167 | "least_favorite_number_no_copy": 4321, 168 | }, 169 | extraKeys: [][]byte{[]byte("my_app_key"), []byte("my_other_key")}, 170 | extraIntKeys: [][]byte{[]byte("favorite_term_so_far")}, 171 | }, 172 | } 173 | 174 | for _, tc := range cases { 175 | tc := tc 176 | t.Run(tc.name, func(t *testing.T) { 177 | src := newTestStableStore() 178 | dst := newTestStableStore() 179 | 180 | // Insert src values: 181 | for k, v := range tc.srcIntVals { 182 | err := src.SetUint64([]byte(k), v) 183 | require.NoError(t, err) 184 | } 185 | for k, v := range tc.srcVals { 186 | err := src.Set([]byte(k), []byte(v)) 187 | require.NoError(t, err) 188 | } 189 | 190 | var progress chan string 191 | if !tc.nilChan { 192 | // Buffer it more than enough so we won't have to read concurrently. 193 | progress = make(chan string, (len(tc.srcIntVals)+len(tc.srcVals))*3) 194 | } 195 | 196 | ctx := context.Background() 197 | if tc.cancelCtx { 198 | cancelledCtx, cancel := context.WithCancel(ctx) 199 | cancel() 200 | ctx = cancelledCtx 201 | } 202 | err := CopyStable(ctx, dst, src, tc.extraKeys, tc.extraIntKeys, progress) 203 | if tc.wantErr != "" { 204 | require.ErrorContains(t, err, tc.wantErr) 205 | return 206 | } 207 | require.NoError(t, err, "failed copy") 208 | 209 | for s := range progress { 210 | // This loop will not return if progress wasn't closed. 211 | t.Log(s) 212 | } 213 | 214 | // Verify the copy! 
215 | for k, v := range tc.srcIntVals { 216 | if strings.HasSuffix(k, "no_copy") { 217 | continue 218 | } 219 | got, err := dst.GetUint64([]byte(k)) 220 | require.NoError(t, err) 221 | require.Equal(t, int(v), int(got), "wrong int value copied for key %s", k) 222 | } 223 | for k, v := range tc.srcVals { 224 | if strings.HasSuffix(k, "no_copy") { 225 | continue 226 | } 227 | got, err := dst.Get([]byte(k)) 228 | require.NoError(t, err) 229 | require.Equal(t, v, string(got), "wrong value copied for key %s", k) 230 | } 231 | }) 232 | } 233 | } 234 | 235 | func logPayload(idx uint64) string { 236 | return fmt.Sprintf("Log entry for index %6d", idx) 237 | } 238 | 239 | func populateTestLogStore(t *testing.T, startIdx uint64, n int) *testLogStore { 240 | t.Helper() 241 | ls := &testLogStore{} 242 | for idx := startIdx; idx < (startIdx + uint64(n)); idx++ { 243 | err := ls.StoreLog(&raft.Log{ 244 | Index: idx, 245 | Data: []byte(logPayload(idx)), 246 | AppendedAt: time.Now(), 247 | }) 248 | require.NoError(t, err) 249 | } 250 | return ls 251 | } 252 | 253 | type testLogStore struct { 254 | appends int 255 | logs []*raft.Log 256 | } 257 | 258 | // FirstIndex returns the first index written. 0 for no entries. 259 | func (s *testLogStore) FirstIndex() (uint64, error) { 260 | if len(s.logs) < 1 { 261 | return 0, nil 262 | } 263 | return s.logs[0].Index, nil 264 | } 265 | 266 | // LastIndex returns the last index written. 0 for no entries. 267 | func (s *testLogStore) LastIndex() (uint64, error) { 268 | if len(s.logs) < 1 { 269 | return 0, nil 270 | } 271 | return s.logs[len(s.logs)-1].Index, nil 272 | } 273 | 274 | // GetLog gets a log entry at a given index. 275 | func (s *testLogStore) GetLog(index uint64, log *raft.Log) error { 276 | first, _ := s.FirstIndex() 277 | last, _ := s.LastIndex() 278 | if first == 0 || index < first || index > last { 279 | return types.ErrNotFound 280 | } 281 | offset := index - first 282 | *log = *s.logs[offset] 283 | return nil 284 | } 285 | 286 | // StoreLog stores a log entry. 287 | func (s *testLogStore) StoreLog(log *raft.Log) error { 288 | return s.StoreLogs([]*raft.Log{log}) 289 | } 290 | 291 | // StoreLogs stores multiple log entries. 292 | func (s *testLogStore) StoreLogs(logs []*raft.Log) error { 293 | last, _ := s.LastIndex() 294 | prev := last 295 | for _, log := range logs { 296 | if prev > 0 && (prev+1) != log.Index { 297 | return fmt.Errorf("logs out of sequence got index=%d expecting index=%d", log.Index, prev+1) 298 | } 299 | s.logs = append(s.logs, log) 300 | prev = log.Index 301 | } 302 | s.appends++ 303 | return nil 304 | } 305 | 306 | // DeleteRange deletes a range of log entries. The range is inclusive. 307 | func (s *testLogStore) DeleteRange(min uint64, max uint64) error { 308 | panic("not implemented") // Don't need this in this package. 309 | } 310 | 311 | type testStableStore struct { 312 | d map[string][]byte 313 | } 314 | 315 | func newTestStableStore() *testStableStore { 316 | return &testStableStore{ 317 | d: make(map[string][]byte), 318 | } 319 | } 320 | 321 | func (s *testStableStore) Set(key []byte, val []byte) error { 322 | s.d[(string(key))] = val 323 | return nil 324 | } 325 | 326 | // Get returns the value for key, or an empty byte slice if key was not found. 
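The tests above exercise the package's copy helpers end to end. A hedged usage sketch for a real migration follows, assuming src and dst satisfy the raft LogStore/StableStore interfaces; the nil progress channel is explicitly supported per the nilChan case above, and the batch size and extra key are illustrative:

import (
	"context"

	"github.com/hashicorp/raft"
	"github.com/hashicorp/raft-wal/migrate"
)

func migrateStores(ctx context.Context, dstLogs, srcLogs raft.LogStore, dstStable, srcStable raft.StableStore) error {
	// Copy all logs in ~1MiB batches (a soft limit, as the tests show).
	if err := migrate.CopyLogs(ctx, dstLogs, srcLogs, 1024*1024, nil); err != nil {
		return err
	}
	// Copy raft's stable keys plus one hypothetical application key.
	extraKeys := [][]byte{[]byte("my_app_key")}
	return migrate.CopyStable(ctx, dstStable, srcStable, extraKeys, nil, nil)
}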
327 | func (s *testStableStore) Get(key []byte) ([]byte, error) { 328 | return s.d[string(key)], nil 329 | } 330 | 331 | func (s *testStableStore) SetUint64(key []byte, val uint64) error { 332 | var buf [8]byte 333 | binary.BigEndian.PutUint64(buf[:], val) 334 | s.d[(string(key))] = buf[:] 335 | return nil 336 | } 337 | 338 | // GetUint64 returns the uint64 value for key, or 0 if key was not found. 339 | func (s *testStableStore) GetUint64(key []byte) (uint64, error) { 340 | v, ok := s.d[string(key)] 341 | if !ok { 342 | return 0, nil 343 | } 344 | return binary.BigEndian.Uint64(v), nil 345 | } 346 | -------------------------------------------------------------------------------- /segment/filer.go: -------------------------------------------------------------------------------- 1 | // Copyright IBM Corp. 2020, 2025 2 | // SPDX-License-Identifier: MPL-2.0 3 | 4 | package segment 5 | 6 | import ( 7 | "errors" 8 | "fmt" 9 | "io" 10 | "strings" 11 | "sync" 12 | 13 | "github.com/hashicorp/raft-wal/types" 14 | ) 15 | 16 | const ( 17 | segmentFileSuffix = ".wal" 18 | segmentFileNamePattern = "%020d-%016x" + segmentFileSuffix 19 | ) 20 | 21 | // Filer implements the abstraction for managing a set of segment files in a 22 | // directory. It uses a VFS to abstract actual file system operations for easier 23 | // testing. 24 | type Filer struct { 25 | dir string 26 | vfs types.VFS 27 | bufPool sync.Pool 28 | } 29 | 30 | // NewFiler creates a Filer ready for use. 31 | func NewFiler(dir string, vfs types.VFS) *Filer { 32 | f := &Filer{ 33 | dir: dir, 34 | vfs: vfs, 35 | } 36 | f.bufPool.New = func() interface{} { 37 | return make([]byte, minBufSize) 38 | } 39 | return f 40 | } 41 | 42 | // FileName returns the formatted file name expected for this segment. 43 | // SegmentFiler implementations could choose to ignore this but it's here to keep file naming consistent across implementations. 44 | func FileName(i types.SegmentInfo) string { 45 | return fmt.Sprintf(segmentFileNamePattern, i.BaseIndex, i.ID) 46 | } 47 | 48 | // Create adds a new segment with the given info and returns a writer or an 49 | // error. 50 | func (f *Filer) Create(info types.SegmentInfo) (types.SegmentWriter, error) { 51 | if info.BaseIndex == 0 { 52 | return nil, fmt.Errorf("BaseIndex must be greater than zero") 53 | } 54 | fname := FileName(info) 55 | 56 | wf, err := f.vfs.Create(f.dir, fname, uint64(info.SizeLimit)) 57 | if err != nil { 58 | return nil, err 59 | } 60 | 61 | return createFile(info, wf, &f.bufPool) 62 | } 63 | 64 | // RecoverTail is called on an unsealed segment when re-opening the WAL; it will 65 | // attempt to recover from a possible crash. It will either return an error, or 66 | // return a valid segmentWriter that is ready for further appends. If the 67 | // expected tail segment doesn't exist it must return an error wrapping 68 | // os.ErrNotExist. 69 | func (f *Filer) RecoverTail(info types.SegmentInfo) (types.SegmentWriter, error) { 70 | fname := FileName(info) 71 | 72 | wf, err := f.vfs.OpenWriter(f.dir, fname) 73 | if err != nil { 74 | return nil, err 75 | } 76 | 77 | return recoverFile(info, wf, &f.bufPool) 78 | } 79 | 80 | // Open an already sealed segment for reading. Open may validate the file's 81 | // header and return an error if it doesn't match the expected info. 
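A small sketch of the naming scheme and Filer construction (the directory path is hypothetical; vfs is any types.VFS implementation, e.g. the real one from the fs package):

import (
	"fmt"

	"github.com/hashicorp/raft-wal/segment"
	"github.com/hashicorp/raft-wal/types"
)

func exampleFiler(vfs types.VFS) error {
	f := segment.NewFiler("/var/lib/myapp/wal", vfs)
	info := types.SegmentInfo{BaseIndex: 1, ID: 0xabcd}
	// Zero-padded decimal BaseIndex then hex ID, so segment files sort
	// lexicographically by base index:
	fmt.Println(segment.FileName(info)) // 00000000000000000001-000000000000abcd.wal
	// Create pre-allocates the file via the VFS and returns a SegmentWriter
	// ready for appends.
	_, err := f.Create(info)
	return err
}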
82 | func (f *Filer) Open(info types.SegmentInfo) (types.SegmentReader, error) { 83 | fname := FileName(info) 84 | 85 | rf, err := f.vfs.OpenReader(f.dir, fname) 86 | if err != nil { 87 | return nil, err 88 | } 89 | 90 | // Validate header here since openReader is re-used by writer where it's valid 91 | // for the file header not to be committed yet after a crash so we can't check 92 | // it there. 93 | var hdr [fileHeaderLen]byte 94 | 95 | if _, err := rf.ReadAt(hdr[:], 0); err != nil { 96 | if errors.Is(err, io.EOF) { 97 | // Treat failure to read a header as corruption since a sealed file should 98 | // never not have a valid header. (I.e. even if crashes happen it should 99 | // be impossible to seal a segment with no header written so this 100 | // indicates that something truncated the file after the fact) 101 | return nil, fmt.Errorf("%w: failed to read header: %s", types.ErrCorrupt, err) 102 | } 103 | return nil, err 104 | } 105 | 106 | gotInfo, err := readFileHeader(hdr[:]) 107 | if err != nil { 108 | return nil, err 109 | } 110 | 111 | if err := validateFileHeader(*gotInfo, info); err != nil { 112 | return nil, err 113 | } 114 | 115 | return openReader(info, rf, &f.bufPool) 116 | } 117 | 118 | // List returns the set of segment IDs currently stored. It's used by the WAL 119 | // on recovery to find any segment files that need to be deleted following an 120 | // unclean shutdown. The returned map is a map of ID -> BaseIndex. BaseIndex 121 | // is returned to allow subsequent Delete calls to be made. 122 | func (f *Filer) List() (map[uint64]uint64, error) { 123 | segs, _, err := f.listInternal() 124 | return segs, err 125 | } 126 | 127 | func (f *Filer) listInternal() (map[uint64]uint64, []uint64, error) { 128 | files, err := f.vfs.ListDir(f.dir) 129 | if err != nil { 130 | return nil, nil, err 131 | } 132 | 133 | segs := make(map[uint64]uint64) 134 | sorted := make([]uint64, 0) 135 | for _, file := range files { 136 | if !strings.HasSuffix(file, segmentFileSuffix) { 137 | continue 138 | } 139 | // Parse BaseIndex and ID from the file name 140 | var bIdx, id uint64 141 | n, err := fmt.Sscanf(file, segmentFileNamePattern, &bIdx, &id) 142 | if err != nil { 143 | return nil, nil, types.ErrCorrupt 144 | } 145 | if n != 2 { 146 | // Misnamed segment files with the right suffix indicate a bug or 147 | // tampering, we can't be sure what's happened to the data. 148 | return nil, nil, types.ErrCorrupt 149 | } 150 | segs[id] = bIdx 151 | sorted = append(sorted, id) 152 | } 153 | 154 | return segs, sorted, nil 155 | } 156 | 157 | // Delete removes the segment with given baseIndex and id if it exists. Note 158 | // that baseIndex is technically redundant since ID is unique on its own. But 159 | // in practice we name files (or keys) with both so that they sort correctly. 160 | // This interface allows a simpler implementation where we can just delete 161 | // the file if it exists without having to scan the underlying storage for a matching file. 162 | func (f *Filer) Delete(baseIndex uint64, ID uint64) error { 163 | fname := fmt.Sprintf(segmentFileNamePattern, baseIndex, ID) 164 | return f.vfs.Delete(f.dir, fname) 165 | } 166 | 167 | // DumpSegment attempts to read the segment file specified by the baseIndex and 168 | // ID. Its intended purpose is for debugging the contents of segment files and 169 | // unlike the SegmentFiler interface, it doesn't assume the caller has access to 170 | // the correct metadata. 
This allows dumping log segments in a WAL that is still 171 | // being written to by another process. Without metadata we don't know if the 172 | // file is sealed so always recover by reading through the whole file. If after 173 | // or before are non-zero, they specify an exclusive lower or upper bound on which 174 | // log entries should be emitted. No error checking is done on the read data. fn 175 | // is called for each entry passing the raft info read from the file header (so 176 | // that the caller knows which codec to use, for example), the raft index of the 177 | // entry, and the raw bytes of the entry itself. The callback must return true to 178 | // continue reading. The data slice is only valid for the lifetime of the call. 179 | func (f *Filer) DumpSegment(baseIndex uint64, ID uint64, after, before uint64, fn func(info types.SegmentInfo, e types.LogEntry) (bool, error)) error { 180 | fname := fmt.Sprintf(segmentFileNamePattern, baseIndex, ID) 181 | 182 | rf, err := f.vfs.OpenReader(f.dir, fname) 183 | if err != nil { 184 | return err 185 | } 186 | 187 | buf := make([]byte, 64*1024) 188 | idx := baseIndex 189 | 190 | type frameInfo struct { 191 | Index uint64 192 | Offset int64 193 | Len uint32 194 | } 195 | var batch []frameInfo 196 | 197 | _, err = readThroughSegment(rf, func(info types.SegmentInfo, fh frameHeader, offset int64) (bool, error) { 198 | if fh.typ == FrameCommit { 199 | // All the previous entries have been committed. Read them and send up to 200 | // caller. 201 | for _, frame := range batch { 202 | // Check the header is reasonable 203 | if frame.Len > MaxEntrySize { 204 | return false, fmt.Errorf("failed to read entry idx=%d, frame header length (%d) is larger than MaxEntrySize (%d)", 205 | frame.Index, frame.Len, MaxEntrySize) 206 | } 207 | 208 | if frame.Len > uint32(len(buf)) { 209 | buf = make([]byte, frame.Len) 210 | } 211 | 212 | n, err := rf.ReadAt(buf[:frame.Len], frame.Offset+frameHeaderLen) 213 | if err != nil { 214 | return false, err 215 | } 216 | if uint32(n) < frame.Len { 217 | return false, io.ErrUnexpectedEOF 218 | } 219 | 220 | ok, err := fn(info, types.LogEntry{Index: frame.Index, Data: buf[:n]}) 221 | if !ok || err != nil { 222 | return ok, err 223 | } 224 | } 225 | // Reset batch 226 | batch = batch[:0] 227 | return true, nil 228 | } 229 | 230 | if fh.typ != FrameEntry { 231 | return true, nil 232 | } 233 | 234 | if idx <= after { 235 | // Not in the range we care about, skip reading the entry. 236 | idx++ 237 | return true, nil 238 | } 239 | if before > 0 && idx >= before { 240 | // We're done 241 | return false, nil 242 | } 243 | 244 | batch = append(batch, frameInfo{idx, offset, fh.len}) 245 | idx++ 246 | return true, nil 247 | }) 248 | 249 | return err 250 | } 251 | 252 | // DumpLogs attempts to read all log entries from segment files in the directory 253 | // for debugging purposes. It does _not_ use the metadata and so may output log 254 | // entries that are uncommitted or already truncated as far as the writing 255 | // process is concerned. As such it should not be used for replication of data. 256 | // It is useful though to debug the contents of the log even while the writing 257 | // application is still running. After and before, if non-zero, specify exclusive 258 | // bounds on the logs that should be returned, which may allow the implementation 259 | // to skip reading entire segment files that are not in the range. 
260 | func (f *Filer) DumpLogs(after, before uint64, fn func(info types.SegmentInfo, e types.LogEntry) (bool, error)) error { 261 | baseIndexes, segIDsSorted, err := f.listInternal() 262 | if err != nil { 263 | return err 264 | } 265 | 266 | for i, id := range segIDsSorted { 267 | baseIndex := baseIndexes[id] 268 | nextBaseIndex := uint64(0) 269 | if i+1 < len(segIDsSorted) { 270 | // This is not the last segment, peek at the base index of that one and 271 | // assume that this segment won't contain indexes that high. 272 | nextBaseIndex = baseIndexes[segIDsSorted[i+1]] 273 | } 274 | // See if this file contains any indexes in the range 275 | if after > 0 && nextBaseIndex > 0 && after >= nextBaseIndex { 276 | // This segment is all indexes before the lower bound we care about 277 | continue 278 | } 279 | if before > 0 && before <= baseIndex { 280 | // This segment is all indexes higher than the upper bound. We've output 281 | // every log in the range at this point (barring edge cases where we race 282 | // with a truncation which leaves multiple generations of segment files on 283 | // disk which we are going to ignore for now). 284 | return nil 285 | } 286 | 287 | // We probably care about at least some of the entries in this segment 288 | err := f.DumpSegment(baseIndex, id, after, before, fn) 289 | if err != nil { 290 | return err 291 | } 292 | } 293 | 294 | return nil 295 | } 296 | --------------------------------------------------------------------------------
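Putting DumpSegment and DumpLogs to use: a hedged sketch of a debugging helper that prints every recoverable entry in a WAL directory, even one that another process is still writing (the vfs parameter is any types.VFS implementation):

import (
	"fmt"

	"github.com/hashicorp/raft-wal/segment"
	"github.com/hashicorp/raft-wal/types"
)

func dumpAll(dir string, vfs types.VFS) error {
	f := segment.NewFiler(dir, vfs)
	// after=0 and before=0 disable both bounds: emit everything on disk.
	return f.DumpLogs(0, 0, func(info types.SegmentInfo, e types.LogEntry) (bool, error) {
		fmt.Printf("idx=%d len=%d\n", e.Index, len(e.Data))
		return true, nil // keep reading
	})
}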