├── cmd └── waldump │ ├── .gitignore │ ├── waldump.go │ └── README.md ├── .gitignore ├── alice ├── .gitignore ├── run-workload.sh ├── Makefile ├── workload │ └── main.go └── checker │ └── main.go ├── bench ├── .gitignore ├── main.go ├── append_requestor.go └── bench_test.go ├── CHANGELOG.md ├── NOTICE.txt ├── segment ├── crc.go ├── reader_test.go ├── reader.go ├── format_test.go ├── writer_test.go ├── format.go ├── vfs_test.go └── filer.go ├── .copywrite.hcl ├── .github ├── pull_request_template.md ├── workflows │ ├── two-step-pr-approval.yml │ └── go-tests.yml └── dependabot.yml ├── CODEOWNERS ├── types ├── buffer.go ├── types.go ├── meta.go ├── vfs.go └── segment.go ├── fs ├── file.go ├── fs_test.go └── fs.go ├── metrics ├── atomic_collector_test.go ├── metrics.go ├── gometrics_collector_test.go ├── atomic_collector.go └── gometrics_collector.go ├── integration ├── meta.go └── integration_test.go ├── go.mod ├── verifier ├── metrics.go ├── store.go └── verifier.go ├── options.go ├── codec_test.go ├── metrics.go ├── metadb ├── metadb_test.go └── metadb.go ├── codec.go ├── migrate ├── migrate.go └── migrate_test.go └── state.go /cmd/waldump/.gitignore: -------------------------------------------------------------------------------- 1 | waldump -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /.idea/ 2 | bench.test 3 | profile.out -------------------------------------------------------------------------------- /alice/.gitignore: -------------------------------------------------------------------------------- 1 | bin 2 | *.gdb 3 | traces_dir 4 | workload_dir -------------------------------------------------------------------------------- /bench/.gitignore: -------------------------------------------------------------------------------- 1 | bench 2 | bench-result* 3 | uncorrected_bench-result* 4 | bench.test 5 | profile.out -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | ## Unreleased 2 | 3 | ### Improvements 4 | 5 | ### Changes 6 | 7 | ### Fixed 8 | 9 | ### Security 10 | -------------------------------------------------------------------------------- /NOTICE.txt: -------------------------------------------------------------------------------- 1 | HashiCorp Raft WAL Library 2 | https://www.hashicorp.com/ 3 | License: Mozilla Public License Version 2.0 4 | Copyright 2022 HashiCorp, Inc. -------------------------------------------------------------------------------- /segment/crc.go: -------------------------------------------------------------------------------- 1 | // Copyright IBM Corp. 2020, 2025 2 | // SPDX-License-Identifier: MPL-2.0 3 | 4 | package segment 5 | 6 | import ( 7 | "hash/crc32" 8 | ) 9 | 10 | var castagnoliTable *crc32.Table 11 | 12 | func init() { 13 | castagnoliTable = crc32.MakeTable(crc32.Castagnoli) 14 | } 15 | -------------------------------------------------------------------------------- /.copywrite.hcl: -------------------------------------------------------------------------------- 1 | schema_version = 1 2 | 3 | project { 4 | license = "MPL-2.0" 5 | copyright_year = 2022 6 | 7 | # (OPTIONAL) A list of globs that should not have copyright/license headers. 
8 | # Supports doublestar glob patterns for more flexibility in defining which 9 | # files or folders should be ignored (e.g., "vendors/**") 10 | header_ignore = [] 11 | } 12 | -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | 2 | ## Description 3 | 4 | 5 | 6 | ## Related Issue 7 | 8 | 9 | 10 | ## How Has This Been Tested? 11 | 12 | 13 | -------------------------------------------------------------------------------- /CODEOWNERS: -------------------------------------------------------------------------------- 1 | # Each line is a file pattern followed by one or more owners. 2 | # More on CODEOWNERS files: https://help.github.com/en/github/creating-cloning-and-archiving-repositories/about-code-owners 3 | 4 | # Default owner 5 | * @hashicorp/team-ip-compliance @hashicorp/consul-core-reviewers @hashicorp/raft-force 6 | 7 | # Add override rules below. Each line is a file/folder pattern followed by one or more owners. 8 | # Being an owner means those groups or individuals will be added as reviewers to PRs affecting 9 | # those areas of the code. 10 | # Examples: 11 | # /docs/ @docs-team 12 | # *.js @js-team 13 | # *.go @go-team 14 | -------------------------------------------------------------------------------- /.github/workflows/two-step-pr-approval.yml: -------------------------------------------------------------------------------- 1 | name: Two-Stage PR Review Process 2 | 3 | on: 4 | pull_request: 5 | types: [opened, synchronize, reopened, labeled, unlabeled, ready_for_review, converted_to_draft] 6 | pull_request_review: 7 | types: [submitted] 8 | 9 | jobs: 10 | manage-pr-status: 11 | runs-on: ubuntu-latest 12 | permissions: 13 | pull-requests: write 14 | contents: write 15 | steps: 16 | - name: Checkout code 17 | uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.0.0 18 | 19 | - name: Two stage PR review 20 | uses: hashicorp/two-stage-pr-approval@v0.1.0 21 | -------------------------------------------------------------------------------- /types/buffer.go: -------------------------------------------------------------------------------- 1 | // Copyright IBM Corp. 2020, 2025 2 | // SPDX-License-Identifier: MPL-2.0 3 | 4 | package types 5 | 6 | // PooledBuffer is a wrapper that allows WAL to return read buffers to segment 7 | // implementations when we're done decoding. 8 | type PooledBuffer struct { 9 | Bs []byte 10 | CloseFn func() 11 | } 12 | 13 | // Close implements io.Closer and returns the buffer to the pool. It should be 14 | // called exactly once for each buffer when it's no longer needed. It's no 15 | // longer safe to access Bs or any slice taken from it after the call. 16 | func (b *PooledBuffer) Close() error { 17 | if b.CloseFn != nil { 18 | b.CloseFn() 19 | } 20 | return nil 21 | } 22 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # Copyright (c) HashiCorp, Inc. 
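The PooledBuffer type in types/buffer.go above exists so segment readers can recycle read buffers once decoding is done. A minimal sketch of the intended lifecycle, assuming a sync.Pool-backed implementation (the pool and the readEntry helper here are hypothetical, not from this repo):

```go
package main

import (
	"fmt"
	"sync"
)

// PooledBuffer is re-declared here only to keep the sketch self-contained;
// the real type lives in github.com/hashicorp/raft-wal/types.
type PooledBuffer struct {
	Bs      []byte
	CloseFn func()
}

func (b *PooledBuffer) Close() error {
	if b.CloseFn != nil {
		b.CloseFn()
	}
	return nil
}

var bufPool = sync.Pool{New: func() any { return new([]byte) }}

// readEntry copies data into a pooled buffer; Close returns it to the pool.
func readEntry(data []byte) *PooledBuffer {
	bp := bufPool.Get().(*[]byte)
	*bp = append((*bp)[:0], data...)
	return &PooledBuffer{
		Bs:      *bp,
		CloseFn: func() { bufPool.Put(bp) },
	}
}

func main() {
	b := readEntry([]byte("log entry"))
	fmt.Println(string(b.Bs)) // safe before Close
	b.Close()                 // exactly once; b.Bs must not be used afterwards
}
```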
2 | # SPDX-License-Identifier: MPL-2.0 3 | 4 | version: 2 5 | 6 | updates: 7 | - package-ecosystem: "github-actions" 8 | directory: "/" 9 | schedule: 10 | interval: "weekly" 11 | day: "sunday" 12 | commit-message: 13 | prefix: "[chore] : " 14 | groups: 15 | actions: 16 | patterns: 17 | - "*" 18 | 19 | - package-ecosystem: "gomod" 20 | directory: "/" 21 | schedule: 22 | interval: "weekly" 23 | day: "sunday" 24 | commit-message: 25 | prefix: "[chore] : " 26 | groups: 27 | go: 28 | patterns: 29 | - "*" 30 | applies-to: "version-updates" 31 | go-security: 32 | patterns: 33 | - "*" 34 | applies-to: "security-updates" 35 | -------------------------------------------------------------------------------- /types/types.go: -------------------------------------------------------------------------------- 1 | // Copyright IBM Corp. 2020, 2025 2 | // SPDX-License-Identifier: MPL-2.0 3 | 4 | package types 5 | 6 | import ( 7 | "errors" 8 | 9 | "github.com/hashicorp/raft" 10 | ) 11 | 12 | var ( 13 | // ErrNotFound is our own version of raft's not found error. It's important 14 | // it's exactly the same because the raft lib checks for equality with it's 15 | // own type as a crucial part of replication processing (detecting end of logs 16 | // and that a snapshot is needed for a follower). 17 | ErrNotFound = raft.ErrLogNotFound 18 | ErrCorrupt = errors.New("WAL is corrupt") 19 | ErrSealed = errors.New("segment is sealed") 20 | ErrClosed = errors.New("closed") 21 | ) 22 | 23 | // LogEntry represents an entry that has already been encoded. 24 | type LogEntry struct { 25 | Index uint64 26 | Data []byte 27 | } 28 | -------------------------------------------------------------------------------- /alice/run-workload.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright IBM Corp. 2020, 2025 4 | # SPDX-License-Identifier: MPL-2.0 5 | 6 | set -e 7 | # trap 'error ${LINENO}' ERR 8 | 9 | WORKLOAD=$1 10 | 11 | bold=$(tput bold) 12 | green=$(tput setf 2) 13 | normal=$(tput sgr0) 14 | 15 | echo "${green}==> Running Workload ${bold}${WORKLOAD}${normal}" 16 | 17 | echo " -> Cleaning up dirs" 18 | rm -rf /workload_dir traces_dir 19 | mkdir /workload_dir traces_dir 20 | 21 | echo " -> Running init" 22 | bin/workload -dir /workload_dir -workload "$WORKLOAD" -init 23 | 24 | echo " -> Running alice-record" 25 | env GOMAXPROCS=1 alice-record --workload_dir /workload_dir \ 26 | --traces_dir traces_dir \ 27 | bin/workload -dir /workload_dir -workload "$WORKLOAD" 28 | 29 | echo " -> Running alice-check" 30 | alice-check --traces_dir=traces_dir --checker=bin/checker 31 | -------------------------------------------------------------------------------- /alice/Makefile: -------------------------------------------------------------------------------- 1 | # Copyright IBM Corp. 
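types.ErrNotFound in types/types.go above is deliberately an alias of raft.ErrLogNotFound rather than a freshly constructed error. A tiny check illustrating why identity (not just the message) matters to raft's replication path (a sketch; it only exercises the two exported variables):

```go
package main

import (
	"errors"
	"fmt"

	"github.com/hashicorp/raft"
	"github.com/hashicorp/raft-wal/types"
)

func main() {
	// Aliasing means both == and errors.Is hold, which is what the raft
	// library relies on when it probes for the end of the log.
	fmt.Println(types.ErrNotFound == raft.ErrLogNotFound)          // true
	fmt.Println(errors.Is(types.ErrNotFound, raft.ErrLogNotFound)) // true
}
```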
2020, 2025 2 | # SPDX-License-Identifier: MPL-2.0 3 | 4 | WORK_DIR=$(shell pwd) 5 | 6 | WORKLOAD ?= append 7 | 8 | .PHONY: test test-local 9 | 10 | test: bin/workload bin/checker 11 | docker run --privileged \ 12 | --cap-add=SYS_PTRACE \ 13 | --memory=8gb \ 14 | --shm-size=2gb \ 15 | -v $(WORK_DIR):/app \ 16 | -it ghcr.io/banks/alice:master \ 17 | ./run-workload.sh ${WORKLOAD} 18 | 19 | test-local: 20 | mkdir -p ${WORK_DIR}/workload_dir 21 | echo " -> Running init (${WORKLOAD})" 22 | go run ${WORK_DIR}/workload/main.go \ 23 | -dir ${WORK_DIR}/workload_dir \ 24 | -workload ${WORKLOAD} \ 25 | -init 26 | echo " -> Running workload (${WORKLOAD})" 27 | go run ${WORK_DIR}/workload/main.go \ 28 | -dir ${WORK_DIR}/workload_dir \ 29 | -workload ${WORKLOAD} | tee ${WORK_DIR}/workload_dir/stdout.txt 30 | echo " -> Running checker" 31 | go run ${WORK_DIR}/checker/main.go \ 32 | ${WORK_DIR}/workload_dir \ 33 | ${WORK_DIR}/workload_dir/stdout.txt 34 | 35 | bin/workload: workload/main.go 36 | GOOS=linux go build -o bin/workload workload/main.go 37 | 38 | bin/checker: checker/main.go 39 | GOOS=linux go build -o bin/checker checker/main.go 40 | -------------------------------------------------------------------------------- /fs/file.go: -------------------------------------------------------------------------------- 1 | // Copyright IBM Corp. 2020, 2025 2 | // SPDX-License-Identifier: MPL-2.0 3 | 4 | package fs 5 | 6 | import ( 7 | "os" 8 | "sync/atomic" 9 | 10 | "github.com/hashicorp/raft-wal/types" 11 | ) 12 | 13 | var _ types.WritableFile = &File{} 14 | 15 | // File wraps an os.File and implements types.WritableFile. It ensures that the 16 | // first time Sync is called on the file, that the parent directory is also 17 | // Fsynced to ensure a crash won't cause the FS to forget the file is there. 18 | // 19 | // Postponing this allows us to ensure that we do the minimum necessary fsyncs 20 | // but still ensure all required fsyncs are done by the time we acknowledge 21 | // committed data in the new file. 22 | type File struct { 23 | new uint32 // atomically accessed, keep it aligned! 24 | dir string 25 | os.File 26 | } 27 | 28 | // Sync calls fsync on the underlying file. If this is the first call to Sync 29 | // since creation it also fsyncs the parent dir. 30 | func (f *File) Sync() error { 31 | // Sync the underlying file 32 | if err := f.File.Sync(); err != nil { 33 | return err 34 | } 35 | new := atomic.SwapUint32(&f.new, 1) 36 | if new == 0 { 37 | return syncDir(f.dir) 38 | } 39 | return nil 40 | } 41 | -------------------------------------------------------------------------------- /metrics/atomic_collector_test.go: -------------------------------------------------------------------------------- 1 | // Copyright IBM Corp. 
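fs/file.go above defers the parent-directory fsync until the first Sync call on a newly created file. The syncDir helper it calls isn't shown in this dump; a plausible implementation follows the standard open-dir-and-fsync pattern (an assumption, not the library's verbatim code):

```go
package main

import "os"

// syncDir fsyncs a directory so that a newly created file's directory entry
// survives a crash. Without this, the file's data can be durable while the
// file itself is missing after power loss.
func syncDir(dir string) error {
	d, err := os.Open(dir)
	if err != nil {
		return err
	}
	defer d.Close()
	return d.Sync()
}

func main() {
	f, err := os.Create("/tmp/00001-example.wal")
	if err != nil {
		panic(err)
	}
	defer f.Close()
	if err := f.Sync(); err != nil { // file contents durable
		panic(err)
	}
	if err := syncDir("/tmp"); err != nil { // directory entry durable
		panic(err)
	}
}
```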
2020, 2025 2 | // SPDX-License-Identifier: MPL-2.0 3 | 4 | package metrics 5 | 6 | import ( 7 | "sync" 8 | "testing" 9 | 10 | "github.com/stretchr/testify/require" 11 | ) 12 | 13 | func TestAtomicCollector(t *testing.T) { 14 | defs := Definitions{ 15 | Counters: []Descriptor{ 16 | { 17 | Name: "c1", 18 | Desc: "counter one.", 19 | }, 20 | { 21 | Name: "c2", 22 | Desc: "counter two.", 23 | }, 24 | }, 25 | Gauges: []Descriptor{ 26 | { 27 | Name: "g1", 28 | Desc: "gauge one.", 29 | }, 30 | { 31 | Name: "g2", 32 | Desc: "gauge two.", 33 | }, 34 | }, 35 | } 36 | 37 | c := NewAtomicCollector(defs) 38 | 39 | var wg sync.WaitGroup 40 | 41 | for i := 0; i < 10; i++ { 42 | wg.Add(1) 43 | go func() { 44 | defer wg.Done() 45 | for j := 0; j < 10; j++ { 46 | c.IncrementCounter("c1", 1) 47 | c.IncrementCounter("c2", 2) 48 | c.SetGauge("g1", uint64(j)) 49 | c.SetGauge("g2", uint64(j*2)) 50 | } 51 | }() 52 | } 53 | 54 | wg.Wait() 55 | 56 | s := c.Summary() 57 | require.Equal(t, 100, int(s.Counters["c1"])) 58 | require.Equal(t, 200, int(s.Counters["c2"])) 59 | require.Equal(t, 9, int(s.Gauges["g1"])) 60 | require.Equal(t, 18, int(s.Gauges["g2"])) 61 | } 62 | -------------------------------------------------------------------------------- /metrics/metrics.go: -------------------------------------------------------------------------------- 1 | // Copyright IBM Corp. 2020, 2025 2 | // SPDX-License-Identifier: MPL-2.0 3 | 4 | package metrics 5 | 6 | // Collector provides a simple abstraction for counter type metrics that 7 | // the WAL and log verifier can use without depending on a specific metrics 8 | // collector implementation. 9 | type Collector interface { 10 | // IncrementCounter record val occurrences of the named event. Names will 11 | // follow prometheus conventions with lower_case_and_underscores. We don't 12 | // need any additional labels currently. 13 | IncrementCounter(name string, delta uint64) 14 | 15 | // SetGauge sets the value of the named gauge overriding any previous value. 16 | SetGauge(name string, val uint64) 17 | } 18 | 19 | // Definitions provides a simple description of a set of scalar metrics. 20 | type Definitions struct { 21 | Counters []Descriptor 22 | Gauges []Descriptor 23 | } 24 | 25 | // Descriptor describes a specific metric. 26 | type Descriptor struct { 27 | Name string 28 | Desc string 29 | } 30 | 31 | var _ Collector = &NoOpCollector{} 32 | 33 | // NoOpCollector is a Collector that does nothing. 34 | type NoOpCollector struct{} 35 | 36 | // IncrementCounter record val occurrences of the named event. Names will 37 | // follow prometheus conventions with lower_case_and_underscores. We don't 38 | // need any additional labels currently. 39 | func (c *NoOpCollector) IncrementCounter(name string, delta uint64) {} 40 | 41 | // SetGauge sets the value of the named gauge overriding any previous value. 42 | func (c *NoOpCollector) SetGauge(name string, val uint64) {} 43 | -------------------------------------------------------------------------------- /integration/meta.go: -------------------------------------------------------------------------------- 1 | // Copyright IBM Corp. 
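The Collector interface in metrics/metrics.go above is intentionally tiny, so wiring in a custom sink takes only a few lines. A hedged sketch of a collector that just logs observations (illustrative only; a real one would be handed to the WAL, e.g. via the WithMetricsCollector option shown later in options.go):

```go
package main

import "log"

// logCollector satisfies the same two-method shape as metrics.Collector.
type logCollector struct{}

func (logCollector) IncrementCounter(name string, delta uint64) {
	log.Printf("counter %s += %d", name, delta)
}

func (logCollector) SetGauge(name string, val uint64) {
	log.Printf("gauge %s = %d", name, val)
}

func main() {
	var c logCollector
	c.IncrementCounter("log_appends", 1)
	c.SetGauge("last_segment_age_seconds", 42)
}
```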
2020, 2025
2 | // SPDX-License-Identifier: MPL-2.0
3 | 
4 | package integration
5 | 
6 | import (
7 | 	"sync"
8 | 
9 | 	"github.com/hashicorp/raft-wal/types"
10 | )
11 | 
12 | type PeekingMetaStore struct {
13 | 	mu     sync.Mutex
14 | 	meta   types.MetaStore
15 | 	state  types.PersistentState
16 | 	stable map[string]string
17 | }
18 | 
19 | func (s *PeekingMetaStore) PeekState() types.PersistentState {
20 | 	s.mu.Lock()
21 | 	defer s.mu.Unlock()
22 | 	return s.state
23 | }
24 | 
25 | func (s *PeekingMetaStore) PeekStable(key string) (string, bool) {
26 | 	s.mu.Lock()
27 | 	defer s.mu.Unlock()
28 | 	v, ok := s.stable[key]
29 | 	return v, ok
30 | }
31 | 
32 | func (s *PeekingMetaStore) Load(dir string) (types.PersistentState, error) {
33 | 	state, err := s.meta.Load(dir)
34 | 	if err == nil {
35 | 		s.mu.Lock()
36 | 		s.state = state
37 | 		s.mu.Unlock()
38 | 	}
39 | 	return state, err
40 | }
41 | 
42 | func (s *PeekingMetaStore) CommitState(state types.PersistentState) error {
43 | 	err := s.meta.CommitState(state)
44 | 	if err == nil {
45 | 		s.mu.Lock()
46 | 		s.state = state
47 | 		s.mu.Unlock()
48 | 	}
49 | 	// Propagate the underlying error rather than swallowing it (the original
50 | 	// returned nil unconditionally, which hid CommitState failures).
	return err
}
51 | 
52 | func (s *PeekingMetaStore) GetStable(key []byte) ([]byte, error) {
53 | 	return s.meta.GetStable(key)
54 | }
55 | 
56 | func (s *PeekingMetaStore) SetStable(key, value []byte) error {
57 | 	err := s.meta.SetStable(key, value)
58 | 	if err == nil {
59 | 		s.mu.Lock()
60 | 		s.stable[string(key)] = string(value)
61 | 		s.mu.Unlock()
62 | 	}
63 | 	return err
64 | }
65 | 
66 | func (s *PeekingMetaStore) Close() error {
67 | 	return s.meta.Close()
68 | }
69 | 
-------------------------------------------------------------------------------- /types/meta.go: --------------------------------------------------------------------------------
1 | // Copyright IBM Corp. 2020, 2025
2 | // SPDX-License-Identifier: MPL-2.0
3 | 
4 | package types
5 | 
6 | import "io"
7 | 
8 | // MetaStore is the interface we need for a persistent, crash-safe backend. We
9 | // implement it with BoltDB for real usage but the interface allows alternatives
10 | // to be used, or tests to mock out FS access.
11 | type MetaStore interface {
12 | 	// Load loads the existing persisted state. If there is no existing state,
13 | 	// implementations are expected to initialize new storage and return an
14 | 	// empty state.
15 | 	Load(dir string) (PersistentState, error)
16 | 
17 | 	// CommitState must atomically replace all persisted metadata in the current
18 | 	// store with the set provided. It must not return until the data is persisted
19 | 	// durably and in a crash-safe way otherwise the guarantees of the WAL will be
20 | 	// compromised. The WAL will only ever call this in a single thread at one
21 | 	// time and it will never be called concurrently with Load, however it may be
22 | 	// called concurrently with Get/SetStable operations.
23 | 	CommitState(PersistentState) error
24 | 
25 | 	// GetStable returns a value from the stable store or nil if it doesn't exist.
26 | 	// May be called concurrently by multiple threads.
27 | 	GetStable(key []byte) ([]byte, error)
28 | 
29 | 	// SetStable stores a value in the stable store. May be called concurrently
30 | 	// with GetStable.
31 | 	SetStable(key, value []byte) error
32 | 
33 | 	io.Closer
34 | }
35 | 
36 | // PersistentState represents the WAL file metadata we need to store reliably to
37 | // recover on restart.
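// (Illustration, not from the source: a caller committing a new segment would
// round-trip this struct through MetaStore, e.g.
//
//	st, _ := db.Load(dir)
//	st.Segments = append(st.Segments, newInfo)
//	st.NextSegmentID++
//	if err := db.CommitState(st); err != nil { /* WAL must not ack writes */ }
//
// where db and newInfo are hypothetical names used only for this sketch.)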
38 | type PersistentState struct { 39 | NextSegmentID uint64 40 | Segments []SegmentInfo 41 | } 42 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/hashicorp/raft-wal 2 | 3 | go 1.23.0 4 | 5 | toolchain go1.23.5 6 | 7 | require ( 8 | github.com/HdrHistogram/hdrhistogram-go v1.1.2 9 | github.com/benbjohnson/immutable v0.4.3 10 | github.com/benmathews/bench v0.0.0-20210120214102-f7c75b9ef6e7 11 | github.com/benmathews/hdrhistogram-writer v0.0.0-20210120211942-3cb1c7c33f95 12 | github.com/google/gofuzz v1.2.0 13 | github.com/hashicorp/go-hclog v1.6.3 14 | github.com/hashicorp/go-metrics v0.5.4 15 | github.com/hashicorp/raft v1.7.3 16 | github.com/hashicorp/raft-boltdb v0.0.0-20230125174641-2a8082862702 17 | github.com/hashicorp/raft-boltdb/v2 v2.3.1 18 | github.com/segmentio/fasthash v1.0.3 19 | github.com/stretchr/testify v1.10.0 20 | go.etcd.io/bbolt v1.4.3 21 | go.etcd.io/etcd/client/pkg/v3 v3.6.4 22 | ) 23 | 24 | require ( 25 | github.com/armon/go-metrics v0.4.1 // indirect 26 | github.com/boltdb/bolt v1.3.1 // indirect 27 | github.com/davecgh/go-spew v1.1.1 // indirect 28 | github.com/fatih/color v1.13.0 // indirect 29 | github.com/hashicorp/go-immutable-radix v1.3.0 // indirect 30 | github.com/hashicorp/go-msgpack v1.1.5 // indirect 31 | github.com/hashicorp/go-msgpack/v2 v2.1.2 // indirect 32 | github.com/hashicorp/golang-lru v0.5.4 // indirect 33 | github.com/mattn/go-colorable v0.1.12 // indirect 34 | github.com/mattn/go-isatty v0.0.14 // indirect 35 | github.com/pmezard/go-difflib v1.0.0 // indirect 36 | go.uber.org/multierr v1.11.0 // indirect 37 | go.uber.org/zap v1.27.0 // indirect 38 | golang.org/x/exp v0.0.0-20220827204233-334a2380cb91 // indirect 39 | golang.org/x/sys v0.31.0 // indirect 40 | golang.org/x/time v0.1.0 // indirect 41 | gonum.org/v1/gonum v0.12.0 // indirect 42 | gopkg.in/yaml.v3 v3.0.1 // indirect 43 | ) 44 | -------------------------------------------------------------------------------- /cmd/waldump/waldump.go: -------------------------------------------------------------------------------- 1 | // Copyright IBM Corp. 
2020, 2025
2 | 
3 | package main
4 | 
5 | import (
6 | 	"encoding/json"
7 | 	"flag"
8 | 	"fmt"
9 | 	"os"
10 | 
11 | 	"github.com/hashicorp/raft"
12 | 	wal "github.com/hashicorp/raft-wal"
13 | 	"github.com/hashicorp/raft-wal/fs"
14 | 	"github.com/hashicorp/raft-wal/segment"
15 | 	"github.com/hashicorp/raft-wal/types"
16 | )
17 | 
18 | type opts struct {
19 | 	Dir    string
20 | 	After  uint64
21 | 	Before uint64
22 | }
23 | 
24 | func main() {
25 | 	var o opts
26 | 	flag.Uint64Var(&o.After, "after", 0, "specifies a raft index to use as an exclusive lower bound when dumping log entries.")
27 | 	flag.Uint64Var(&o.Before, "before", 0, "specifies a raft index to use as an exclusive upper bound when dumping log entries.")
28 | 
29 | 	flag.Parse()
30 | 
31 | 	// Accept dir as positional arg
32 | 	o.Dir = flag.Arg(0)
33 | 	if o.Dir == "" {
34 | 		fmt.Println("Usage: waldump [-after INDEX] [-before INDEX] <dir>")
35 | 		os.Exit(1)
36 | 	}
37 | 
38 | 	vfs := fs.New()
39 | 	f := segment.NewFiler(o.Dir, vfs)
40 | 
41 | 	codec := &wal.BinaryCodec{}
42 | 	var log raft.Log
43 | 	enc := json.NewEncoder(os.Stdout)
44 | 
45 | 	err := f.DumpLogs(o.After, o.Before, func(info types.SegmentInfo, e types.LogEntry) (bool, error) {
46 | 		if info.Codec != wal.CodecBinaryV1 {
47 | 			return false, fmt.Errorf("unsupported codec %d in file %s", info.Codec, segment.FileName(info))
48 | 		}
49 | 		if err := codec.Decode(e.Data, &log); err != nil {
50 | 			return false, err
51 | 		}
52 | 		// Output the raft Log struct as JSON
53 | 		if err := enc.Encode(log); err != nil {
54 | 			return false, err
55 | 		}
56 | 		return true, nil
57 | 	})
58 | 	if err != nil {
59 | 		fmt.Printf("ERROR: %s\n", err)
60 | 		os.Exit(1)
61 | 	}
62 | }
63 | 
-------------------------------------------------------------------------------- /metrics/gometrics_collector_test.go: --------------------------------------------------------------------------------
1 | // Copyright IBM Corp. 2020, 2025
2 | // SPDX-License-Identifier: MPL-2.0
3 | 
4 | package metrics
5 | 
6 | import (
7 | 	"testing"
8 | 	"time"
9 | 
10 | 	gometrics "github.com/hashicorp/go-metrics/compat"
11 | 	"github.com/stretchr/testify/require"
12 | )
13 | 
14 | func TestGoMetricsCollector(t *testing.T) {
15 | 	cfg := &gometrics.Config{
16 | 		EnableHostname:       false,
17 | 		EnableRuntimeMetrics: false,
18 | 		// FilterDefault is super weird and backwards but "true" means "don't
19 | 		// filter"!
20 | FilterDefault: true, 21 | } 22 | sink := gometrics.NewInmemSink(1*time.Second, 10*time.Second) 23 | gm, err := gometrics.New(cfg, sink) 24 | require.NoError(t, err) 25 | 26 | c := NewGoMetricsCollector( 27 | []string{"myapp", "wal"}, 28 | []gometrics.Label{{Name: "label", Value: "foo"}}, 29 | gm, 30 | ) 31 | 32 | c.IncrementCounter("counter_one", 1) 33 | c.IncrementCounter("counter_one", 1) 34 | c.IncrementCounter("counter_two", 10) 35 | 36 | c.SetGauge("g1", 12345) 37 | 38 | summary := flattenData(sink.Data()) 39 | 40 | require.Equal(t, 2, int(summary.Counters["myapp.wal.counter_one;label=foo"])) 41 | require.Equal(t, 10, int(summary.Counters["myapp.wal.counter_two;label=foo"])) 42 | 43 | require.Equal(t, 12345, int(summary.Gauges["myapp.wal.g1;label=foo"])) 44 | 45 | } 46 | 47 | func flattenData(ivs []*gometrics.IntervalMetrics) Summary { 48 | s := Summary{ 49 | Counters: make(map[string]uint64), 50 | Gauges: make(map[string]uint64), 51 | } 52 | for _, iv := range ivs { 53 | for name, v := range iv.Counters { 54 | s.Counters[name] += uint64(v.Sum) 55 | } 56 | for name, v := range iv.Gauges { 57 | s.Gauges[name] = uint64(v.Value) 58 | } 59 | } 60 | return s 61 | } 62 | -------------------------------------------------------------------------------- /verifier/metrics.go: -------------------------------------------------------------------------------- 1 | // Copyright IBM Corp. 2020, 2025 2 | // SPDX-License-Identifier: MPL-2.0 3 | 4 | package verifier 5 | 6 | import ( 7 | "github.com/hashicorp/raft-wal/metrics" 8 | ) 9 | 10 | var ( 11 | // MetricDefinitions describe the metrics emitted by this library via the 12 | // provided metrics.Collector implementation. It's public so that these can be 13 | // registered during init with metrics clients that support pre-defining 14 | // metrics. 15 | MetricDefinitions = metrics.Definitions{ 16 | Counters: []metrics.Descriptor{ 17 | { 18 | Name: "checkpoints_written", 19 | Desc: "checkpoints_written counts the number of checkpoint entries" + 20 | " written to the LogStore.", 21 | }, 22 | { 23 | Name: "ranges_verified", 24 | Desc: "ranges_verified counts the number of log ranges for which a" + 25 | " verification report has been completed.", 26 | }, 27 | { 28 | Name: "read_checksum_failures", 29 | Desc: "read_checksum_failures counts the number of times a range of" + 30 | " logs between two check points contained at least one corruption.", 31 | }, 32 | { 33 | Name: "write_checksum_failures", 34 | Desc: "write_checksum_failures counts the number of times a follower" + 35 | " has a different checksum to the leader at the point where it" + 36 | " writes to the log. This could be caused by either a disk-corruption" + 37 | " on the leader (unlikely) or some other corruption of the log" + 38 | " entries in-flight.", 39 | }, 40 | { 41 | Name: "dropped_reports", 42 | Desc: "dropped_reports counts how many times the verifier routine was" + 43 | " still busy when the next checksum came in and so verification for" + 44 | " a range was skipped. If you see this happen consider increasing" + 45 | " the interval between checkpoints.", 46 | }, 47 | }, 48 | } 49 | ) 50 | -------------------------------------------------------------------------------- /cmd/waldump/README.md: -------------------------------------------------------------------------------- 1 | # waldump 2 | 3 | A simple command for dumping the contents of WAL segment files to JSON for 4 | debugging. 5 | 6 | ## Usage 7 | 8 | ``` 9 | $ waldump [-after INDEX] [-before INDEX] /path/to/wal/dir 10 | ... 
11 | {"Index":227281,"Term":4,"Type":0,"Data":"hpGEpUNvb3JkhKpBZGp1c3RtZW50yz7pEPrkTc4tpUVycm9yyz/B4NJg87MZpkhlaWdodMs/ABkEWHeDZqNWZWOYyz8FyF63P/XOyz8Fe2fyqYpayz7eXgvdsOWVyz7xX/ARy9MByz7XZq0fmx5eyz7x8ic7zxhJy78EgvusSgKUy77xVfw2sEr5pE5vZGWiczGpUGFydGl0aW9uoKdTZWdtZW50oA==","Extensions":null,"AppendedAt":"2023-03-23T12:24:05.440317Z"} 12 | ... 13 | ``` 14 | 15 | Each `raft.Log` is written out as JSON followed by a newline. The `Data` and 16 | `Extensions` fields are opaque byte strings that will be base64 encoded. 17 | Decoding those requires knowledge of the encoding used by the writing 18 | application. 19 | 20 | ## Limitations 21 | 22 | This tool is designed for debugging only. It does _not_ inspect the wal-meta 23 | database. This has the nice property that you can safely dump the contexts of 24 | WAL files even while the application is still writing to the WAL since we don't 25 | have to take a lock on the meta database. 26 | 27 | The downside is that this tool might in some edge cases output logs that have 28 | already been deleted from the WAL. It's possible although extremely unlikely 29 | that the WAL could be in the process of truncating the tail which could result 30 | in there being both pre-truncate and post-truncate segment files present. This 31 | tool might possibly output duplicate and out-of-order log indexes from before 32 | and after the truncation. Or if `before` and `after` are used, it's possible we 33 | might skip records entirely because an older file that has already been removed 34 | was read instead of the newer one. These are all very unlikely in practice and 35 | if the application that writes the WAL is still up and running are likely to be 36 | resolved by the time you run the tool again. -------------------------------------------------------------------------------- /types/vfs.go: -------------------------------------------------------------------------------- 1 | // Copyright IBM Corp. 2020, 2025 2 | // SPDX-License-Identifier: MPL-2.0 3 | 4 | package types 5 | 6 | import "io" 7 | 8 | // VFS is the interface WAL needs to interact with the file system. In 9 | // production it would normally be implemented by RealFS which interacts with 10 | // the operating system FS using standard go os package. It's useful to allow 11 | // testing both to run quicker (by being in memory only) and to make it easy to 12 | // simulate all kinds of disk errors and failure modes without needing a more 13 | // elaborate external test harness like ALICE. 14 | type VFS interface { 15 | // ListDir returns a list of all files in the specified dir in lexicographical 16 | // order. If the dir doesn't exist, it must return an error. Empty array with 17 | // nil error is assumed to mean that the directory exists and was readable, 18 | // but contains no files. 19 | ListDir(dir string) ([]string, error) 20 | 21 | // Create creates a new file with the given name. If a file with the same name 22 | // already exists an error is returned. If a non-zero size is given, 23 | // implementations should make a best effort to pre-allocate the file to be 24 | // that size. The dir must already exist and be writable to the current 25 | // process. 26 | Create(dir, name string, size uint64) (WritableFile, error) 27 | 28 | // Delete indicates the file is no longer required. Typically it should be 29 | // deleted from the underlying system to free disk space. 30 | Delete(dir, name string) error 31 | 32 | // OpenReader opens an existing file in read-only mode. 
If the file doesn't 33 | // exist or permission is denied, an error is returned, otherwise no checks 34 | // are made about the well-formedness of the file, it may be empty, the wrong 35 | // size or corrupt in arbitrary ways. 36 | OpenReader(dir, name string) (ReadableFile, error) 37 | 38 | // OpenWriter opens a file in read-write mode. If the file doesn't exist or 39 | // permission is denied, an error is returned, otherwise no checks are made 40 | // about the well-formedness of the file, it may be empty, the wrong size or 41 | // corrupt in arbitrary ways. 42 | OpenWriter(dir, name string) (WritableFile, error) 43 | } 44 | 45 | // WritableFile provides random read-write access to a file as well as the 46 | // ability to fsync it to disk. 47 | type WritableFile interface { 48 | io.WriterAt 49 | io.ReaderAt 50 | io.Closer 51 | 52 | Sync() error 53 | } 54 | 55 | // ReadableFile provides random read access to a file. 56 | type ReadableFile interface { 57 | io.ReaderAt 58 | io.Closer 59 | } 60 | -------------------------------------------------------------------------------- /options.go: -------------------------------------------------------------------------------- 1 | // Copyright IBM Corp. 2020, 2025 2 | // SPDX-License-Identifier: MPL-2.0 3 | 4 | package wal 5 | 6 | import ( 7 | "fmt" 8 | 9 | "github.com/hashicorp/go-hclog" 10 | "github.com/hashicorp/raft-wal/fs" 11 | "github.com/hashicorp/raft-wal/metadb" 12 | "github.com/hashicorp/raft-wal/metrics" 13 | "github.com/hashicorp/raft-wal/segment" 14 | "github.com/hashicorp/raft-wal/types" 15 | ) 16 | 17 | // WithCodec is an option that allows a custom Codec to be provided to the WAL. 18 | // If not used the default Codec is used. 19 | func WithCodec(c Codec) walOpt { 20 | return func(w *WAL) { 21 | w.codec = c 22 | } 23 | } 24 | 25 | // WithMetaStore is an option that allows a custom MetaStore to be provided to 26 | // the WAL. If not used the default MetaStore is used. 27 | func WithMetaStore(db types.MetaStore) walOpt { 28 | return func(w *WAL) { 29 | w.metaDB = db 30 | } 31 | } 32 | 33 | // WithSegmentFiler is an option that allows a custom SegmentFiler (and hence 34 | // Segment Reader/Writer implementation) to be provided to the WAL. If not used 35 | // the default SegmentFiler is used. 36 | func WithSegmentFiler(sf types.SegmentFiler) walOpt { 37 | return func(w *WAL) { 38 | w.sf = sf 39 | } 40 | } 41 | 42 | // WithLogger is an option that allows a custom logger to be used. 43 | func WithLogger(logger hclog.Logger) walOpt { 44 | return func(w *WAL) { 45 | w.log = logger 46 | } 47 | } 48 | 49 | // WithSegmentSize is an option that allows a custom segmentSize to be set. 50 | func WithSegmentSize(size int) walOpt { 51 | return func(w *WAL) { 52 | w.segmentSize = size 53 | } 54 | } 55 | 56 | // WithMetricsCollector is an option that allows a custom segmentSize to be set. 57 | func WithMetricsCollector(c metrics.Collector) walOpt { 58 | return func(w *WAL) { 59 | w.metrics = c 60 | } 61 | } 62 | 63 | func (w *WAL) applyDefaultsAndValidate() error { 64 | // Check if an external codec has been used that it's not using a reserved ID. 
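	// (Illustrative aside, not part of the original file: options are applied
	// at construction time, so a typical caller looks roughly like
	//
	//	w, err := wal.Open(dir, wal.WithSegmentSize(64*1024*1024), wal.WithLogger(l))
	//
	// wal.Open's exact signature isn't shown in this dump, so treat the call
	// above as a hedged sketch.)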
65 | 	if w.codec != nil && w.codec.ID() < FirstExternalCodecID {
66 | 		return fmt.Errorf("codec is using a reserved ID (below %d)", FirstExternalCodecID)
67 | 	}
68 | 
69 | 	// Defaults
70 | 	if w.log == nil {
71 | 		w.log = hclog.Default().Named("wal")
72 | 	}
73 | 	if w.codec == nil {
74 | 		w.codec = &BinaryCodec{}
75 | 	}
76 | 	if w.sf == nil {
77 | 		// These are not actually swappable via options right now but we override
78 | 		// them in tests. Only load the default implementations if they are not set.
79 | 		vfs := fs.New()
80 | 		w.sf = segment.NewFiler(w.dir, vfs)
81 | 	}
82 | 	if w.metrics == nil {
83 | 		w.metrics = &metrics.NoOpCollector{}
84 | 	}
85 | 	if w.metaDB == nil {
86 | 		w.metaDB = &metadb.BoltMetaDB{}
87 | 	}
88 | 	if w.segmentSize == 0 {
89 | 		w.segmentSize = DefaultSegmentSize
90 | 	}
91 | 	return nil
92 | }
93 | 
-------------------------------------------------------------------------------- /codec_test.go: --------------------------------------------------------------------------------
1 | // Copyright IBM Corp. 2020, 2025
2 | // SPDX-License-Identifier: MPL-2.0
3 | 
4 | package wal
5 | 
6 | import (
7 | 	"bytes"
8 | 	"testing"
9 | 	"time"
10 | 
11 | 	fuzz "github.com/google/gofuzz"
12 | 	"github.com/hashicorp/raft"
13 | 	"github.com/stretchr/testify/require"
14 | )
15 | 
16 | // TestBinaryCodecFuzz tests that our codec can decode whatever it encoded.
17 | // Because we are using a reflection-based fuzzer to assign random values to all
18 | // fields this test will also catch any changes in a later version of raft that
19 | // add new fields since our codec will "lose" them.
20 | func TestBinaryCodecFuzz(t *testing.T) {
21 | 	rounds := 1000
22 | 
23 | 	f := fuzz.New().Funcs(
24 | 		// Stub time since gofuzz generates unencodable times depending on your
25 | 		// local timezone! On my computer in GMT timezone, it will generate Times
26 | 		// that are unencodable for some reason I don't understand. All it's doing
27 | 		// is picking a random UnixTimestamp but for some reason that is sometimes
28 | 		// unencodable?
29 | 		func(t *time.Time, c fuzz.Continue) {
30 | 			// This is copied from fuzzTime in gofuzz but with a fix until it's
31 | 			// accepted upstream.
32 | 			var sec, nsec int64
33 | 			// Allow for about 1000 years of random time values, which keeps things
34 | 			// like JSON parsing reasonably happy.
35 | 			sec = c.Rand.Int63n(1000 * 365 * 24 * 60 * 60)
36 | 			nsec = c.Rand.Int63n(999_999_999)
37 | 			*t = time.Unix(sec, nsec)
38 | 		},
39 | 	)
40 | 	c := BinaryCodec{}
41 | 
42 | 	require.Equal(t, CodecBinaryV1, c.ID())
43 | 
44 | 	var buf bytes.Buffer
45 | 
46 | 	for i := 0; i < rounds; i++ {
47 | 		var log, log2 raft.Log
48 | 		f.Fuzz(&log)
49 | 		buf.Reset()
50 | 
51 | 		err := c.Encode(&log, &buf)
52 | 		require.NoError(t, err)
53 | 
54 | 		err = c.Decode(buf.Bytes(), &log2)
55 | 		require.NoError(t, err)
56 | 
57 | 		t.Logf("log %#v.
Binary: % x", log, buf.Bytes()) 58 | 59 | require.Equal(t, log, log2) 60 | } 61 | } 62 | 63 | func TestBinaryCodecCopysOnDecode(t *testing.T) { 64 | var in, out raft.Log 65 | 66 | in.Index = 1234 67 | in.Term = 2 68 | in.Type = raft.LogCommand 69 | in.Data = []byte("foo") 70 | in.Extensions = []byte("ext") 71 | 72 | c := BinaryCodec{} 73 | var buf bytes.Buffer 74 | require.NoError(t, c.Encode(&in, &buf)) 75 | 76 | rawBytes := buf.Bytes() 77 | 78 | require.NoError(t, c.Decode(rawBytes, &out)) 79 | 80 | // Make sure the decoded data is the same 81 | require.Equal(t, string(out.Data), "foo") 82 | require.Equal(t, string(out.Extensions), "ext") 83 | 84 | // Intentionally mangle the buffer contents 85 | for i := 0; i < len(rawBytes); i++ { 86 | rawBytes[i] = 'x' 87 | } 88 | 89 | // Make sure the decoded data is still the same (i.e. didn't refer to the 90 | // underlying bytes) 91 | require.Equal(t, string(out.Data), "foo") 92 | require.Equal(t, string(out.Extensions), "ext") 93 | 94 | } 95 | -------------------------------------------------------------------------------- /metrics/atomic_collector.go: -------------------------------------------------------------------------------- 1 | // Copyright IBM Corp. 2020, 2025 2 | // SPDX-License-Identifier: MPL-2.0 3 | 4 | package metrics 5 | 6 | import "sync/atomic" 7 | 8 | var ( 9 | _ Collector = &AtomicCollector{} 10 | ) 11 | 12 | // AtomicCollector is a simple Collector that atomically stores 13 | // counters and gauges in memory. 14 | type AtomicCollector struct { 15 | counters []uint64 16 | gauges []uint64 17 | 18 | counterIndex, gaugeIndex map[string]int 19 | } 20 | 21 | // NewAtomicCollector creates a collector for the given set of Definitions. 22 | func NewAtomicCollector(defs Definitions) *AtomicCollector { 23 | c := &AtomicCollector{ 24 | counters: make([]uint64, len(defs.Counters)), 25 | gauges: make([]uint64, len(defs.Gauges)), 26 | counterIndex: make(map[string]int), 27 | gaugeIndex: make(map[string]int), 28 | } 29 | for i, d := range defs.Counters { 30 | if _, ok := c.counterIndex[d.Name]; ok { 31 | panic("duplicate metrics named " + d.Name) 32 | } 33 | c.counterIndex[d.Name] = i 34 | } 35 | for i, d := range defs.Gauges { 36 | if _, ok := c.counterIndex[d.Name]; ok { 37 | panic("duplicate metrics named " + d.Name) 38 | } 39 | if _, ok := c.gaugeIndex[d.Name]; ok { 40 | panic("duplicate metrics named " + d.Name) 41 | } 42 | c.gaugeIndex[d.Name] = i 43 | } 44 | return c 45 | } 46 | 47 | // IncrementCounter record val occurrences of the named event. Names will 48 | // follow prometheus conventions with lower_case_and_underscores. We don't 49 | // need any additional labels currently. 50 | func (c *AtomicCollector) IncrementCounter(name string, delta uint64) { 51 | id, ok := c.counterIndex[name] 52 | if !ok { 53 | panic("invalid metric name: " + name) 54 | } 55 | atomic.AddUint64(&c.counters[id], delta) 56 | } 57 | 58 | // SetGauge sets the value of the named gauge overriding any previous value. 59 | func (c *AtomicCollector) SetGauge(name string, val uint64) { 60 | id, ok := c.gaugeIndex[name] 61 | if !ok { 62 | panic("invalid metric name: " + name) 63 | } 64 | atomic.StoreUint64(&c.gauges[id], val) 65 | } 66 | 67 | // Summary returns a summary of the metrics since startup. Each value is 68 | // atomically loaded but the set is not atomic overall and may represent an 69 | // inconsistent snapshot e.g. with some metrics reflecting the most recent 70 | // operation while others don't. 
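//
// Hedged usage sketch (AtomicCollector and Summary as defined in this file;
// defs is a hypothetical metrics.Definitions value):
//
//	c := NewAtomicCollector(defs)
//	c.IncrementCounter("log_appends", 1)
//	fmt.Println(c.Summary().Counters["log_appends"]) // 1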
71 | func (c *AtomicCollector) Summary() Summary { 72 | s := Summary{ 73 | Counters: make(map[string]uint64, len(c.counters)), 74 | Gauges: make(map[string]uint64, len(c.gauges)), 75 | } 76 | for name, id := range c.counterIndex { 77 | s.Counters[name] = atomic.LoadUint64(&c.counters[id]) 78 | } 79 | for name, id := range c.gaugeIndex { 80 | s.Gauges[name] = atomic.LoadUint64(&c.gauges[id]) 81 | } 82 | return s 83 | } 84 | 85 | // Summary is a copy of the values recorded so far for each metric. 86 | type Summary struct { 87 | Counters map[string]uint64 88 | Gauges map[string]uint64 89 | } 90 | -------------------------------------------------------------------------------- /.github/workflows/go-tests.yml: -------------------------------------------------------------------------------- 1 | # Copyright (c) HashiCorp, Inc. 2 | 3 | name: go-tests 4 | 5 | on: [push] 6 | 7 | env: 8 | TEST_RESULTS: /tmp/test-results 9 | 10 | jobs: 11 | 12 | go-tests: 13 | runs-on: ubuntu-latest 14 | strategy: 15 | matrix: 16 | go-version: [ 1.22 ] 17 | 18 | steps: 19 | - name: Setup go 20 | uses: actions/setup-go@d35c59abb061a4a6fb18e82ac0862c26744d6ab5 # v5.5.0 21 | with: 22 | go-version: ${{ matrix.go-version }} 23 | 24 | - name: Checkout code 25 | uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 26 | 27 | - name: Create test directory 28 | run: | 29 | mkdir -p ${{ env.TEST_RESULTS }} 30 | 31 | - name: Download go modules 32 | run: go mod download 33 | 34 | - name: Cache / restore go modules 35 | uses: actions/cache@0400d5f644dc74513175e3cd8d07132dd4860809 # v4.2.4 36 | with: 37 | path: | 38 | ~/go/pkg/mod 39 | key: ${{ runner.os }}-go-${{ hashFiles('**/go.sum') }} 40 | restore-keys: | 41 | ${{ runner.os }}-go- 42 | 43 | # Check go fmt output because it does not report non-zero when there are fmt changes 44 | - name: Run gofmt 45 | run: | 46 | go fmt ./... 47 | files=$(go fmt ./...) 48 | if [ -n "$files" ]; then 49 | echo "The following file(s) do not conform to go fmt:" 50 | echo "$files" 51 | exit 1 52 | fi 53 | 54 | # Install gotestsum 55 | - name: Install gotestsum 56 | run: | 57 | GTS="gotest.tools/gotestsum@v1.8.2" 58 | # We use the same error message prefix in either failure case, so just define it once here. 59 | ERROR="Failed to install $GTS" 60 | # First try to 'go install', if that fails try 'go get'... 61 | go install "$GTS" || go get "$GTS" || { echo "$ERROR: both 'go install' and 'go get' failed"; exit 1; } 62 | # Check that the gotestsum command was actually installed in the path... 63 | command -v gotestsum > /dev/null 2>&1 || { echo "$ERROR: gotestsum command not installed"; exit 1; } 64 | echo "OK: Command 'gotestsum' installed ($GTS)" 65 | 66 | - name: Run go tests 67 | run: | 68 | PACKAGE_NAMES=$(go list ./...) 69 | gotestsum --format=short-verbose \ 70 | --junitfile $TEST_RESULTS/gotestsum-report.xml \ 71 | -- \ 72 | -coverprofile $TEST_RESULTS/coverage.out \ 73 | -race $PACKAGE_NAMES 74 | go tool cover -html=$TEST_RESULTS/coverage.out -o $TEST_RESULTS/coverage.html 75 | 76 | # Save coverage report parts 77 | - name: Upload and save artifacts 78 | uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 79 | with: 80 | name: Test Results 81 | path: ${{ env.TEST_RESULTS }} 82 | -------------------------------------------------------------------------------- /metrics.go: -------------------------------------------------------------------------------- 1 | // Copyright IBM Corp. 
2020, 2025
2 | // SPDX-License-Identifier: MPL-2.0
3 | 
4 | package wal
5 | 
6 | import (
7 | 	"github.com/hashicorp/raft-wal/metrics"
8 | )
9 | 
10 | var (
11 | 	// MetricDefinitions describe the metrics emitted by this library via the
12 | 	// provided metrics.Collector implementation. It's public so that these can be
13 | 	// registered during init with metrics clients that support pre-defining
14 | 	// metrics.
15 | 	MetricDefinitions = metrics.Definitions{
16 | 		Counters: []metrics.Descriptor{
17 | 			{
18 | 				Name: "log_entry_bytes_written",
19 | 				Desc: "log_entry_bytes_written counts the bytes of log entry after encoding" +
20 | 					" with Codec. Actual bytes written to disk might be slightly higher as it" +
21 | 					" includes headers and index entries.",
22 | 			},
23 | 			{
24 | 				Name: "log_entries_written",
25 | 				Desc: "log_entries_written counts the number of entries written.",
26 | 			},
27 | 			{
28 | 				Name: "log_appends",
29 | 				Desc: "log_appends counts the number of calls to StoreLog(s) i.e." +
30 | 					" number of batches of entries appended.",
31 | 			},
32 | 			{
33 | 				Name: "log_entry_bytes_read",
34 | 				Desc: "log_entry_bytes_read counts the bytes of log entry read from" +
35 | 					" segments before decoding. actual bytes read from disk might be higher" +
36 | 					" as it includes headers and index entries and possible secondary reads" +
37 | 					" for large entries that don't fit in buffers.",
38 | 			},
39 | 			{
40 | 				Name: "log_entries_read",
41 | 				Desc: "log_entries_read counts the number of calls to get_log.",
42 | 			},
43 | 			{
44 | 				Name: "segment_rotations",
45 | 				Desc: "segment_rotations counts how many times we move to a new segment file.",
46 | 			},
47 | 			{
48 | 				Name: "head_truncations",
49 | 				Desc: "head_truncations counts how many log entries have been truncated" +
50 | 					" from the head - i.e. the oldest entries. by graphing the rate of" +
51 | 					" change over time you can see individual truncate calls as spikes.",
52 | 			},
53 | 			{
54 | 				Name: "tail_truncations",
55 | 				Desc: "tail_truncations counts how many log entries have been truncated" +
56 | 					" from the tail - i.e. the newest entries. by graphing the rate of" +
57 | 					" change over time you can see individual truncate calls as spikes.",
58 | 			},
59 | 			{
60 | 				Name: "stable_gets",
61 | 				Desc: "stable_gets counts how many calls to StableStore.Get or GetUint64.",
62 | 			},
63 | 			{
64 | 				Name: "stable_sets",
65 | 				Desc: "stable_sets counts how many calls to StableStore.Set or SetUint64.",
66 | 			},
67 | 		},
68 | 		Gauges: []metrics.Descriptor{
69 | 			{
70 | 				Name: "last_segment_age_seconds",
71 | 				Desc: "last_segment_age_seconds is a gauge that is set each time we" +
72 | 					" rotate a segment and describes the number of seconds between when" +
73 | 					" that segment file was first created and when it was sealed. this" +
74 | 					" gives a rough estimate how quickly writes are filling the disk.",
75 | 			},
76 | 		},
77 | 	}
78 | )
79 | 
-------------------------------------------------------------------------------- /segment/reader_test.go: --------------------------------------------------------------------------------
1 | // Copyright IBM Corp.
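MetricDefinitions above is exported precisely so metric names can be pre-registered with clients that support declaring metrics up front. A minimal sketch that enumerates the definitions (the printing stands in for whatever registration API your metrics client offers):

```go
package main

import (
	"fmt"

	wal "github.com/hashicorp/raft-wal"
)

func main() {
	// Walk every counter and gauge this library can emit.
	for _, d := range wal.MetricDefinitions.Counters {
		fmt.Printf("counter %-28s %s\n", d.Name, d.Desc)
	}
	for _, d := range wal.MetricDefinitions.Gauges {
		fmt.Printf("gauge   %-28s %s\n", d.Name, d.Desc)
	}
}
```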
2020, 2025 2 | // SPDX-License-Identifier: MPL-2.0 3 | 4 | package segment 5 | 6 | import ( 7 | "fmt" 8 | "strings" 9 | "testing" 10 | "time" 11 | 12 | "github.com/hashicorp/raft-wal/types" 13 | "github.com/stretchr/testify/require" 14 | ) 15 | 16 | type entryDesc struct { 17 | len, num int 18 | } 19 | 20 | func TestReader(t *testing.T) { 21 | cases := []struct { 22 | name string 23 | firstIndex uint64 24 | entries []entryDesc 25 | corrupt func(twf *testWritableFile) error 26 | wantLastIndex uint64 27 | wantOpenErr string 28 | }{ 29 | { 30 | name: "basic sealed", 31 | firstIndex: 1, 32 | entries: []entryDesc{ 33 | // 28 * 128 bytes entries are all that will fit in a 4KiB segment after 34 | // headers and index size are accounted for. 35 | {len: 128, num: 28}, 36 | }, 37 | wantLastIndex: 28, 38 | }, 39 | { 40 | name: "value larger than minBufSize", 41 | firstIndex: 1, 42 | entries: []entryDesc{ 43 | {len: 128, num: 5}, 44 | {len: minBufSize + 10, num: 1}, 45 | }, 46 | wantLastIndex: 6, 47 | }, 48 | { 49 | name: "sealed file truncated", 50 | firstIndex: 1, 51 | entries: []entryDesc{ 52 | {len: 128, num: 28}, 53 | }, 54 | corrupt: func(twf *testWritableFile) error { 55 | twf.Truncate(0) 56 | return nil 57 | }, 58 | wantOpenErr: "corrupt", 59 | }, 60 | } 61 | 62 | for _, tc := range cases { 63 | tc := tc 64 | t.Run(tc.name, func(t *testing.T) { 65 | vfs := newTestVFS() 66 | 67 | f := NewFiler("test", vfs) 68 | 69 | seg0 := testSegment(1) 70 | 71 | w, err := f.Create(seg0) 72 | require.NoError(t, err) 73 | defer w.Close() 74 | 75 | // Append previous entries. We just pick a fixed size and format that's 76 | // easy to verify but generally fits in our test block size. 77 | idx := tc.firstIndex 78 | wantLength := make(map[uint64]int) 79 | for _, desc := range tc.entries { 80 | // Append individually, could do commit batches but this is all in 81 | // memory so no real benefit. 82 | padLen := 0 83 | if desc.len > 6 { 84 | padLen = desc.len - 6 85 | } 86 | padding := strings.Repeat("P", padLen) 87 | for i := 0; i < desc.num; i++ { 88 | v := fmt.Sprintf("%05d:%s", idx, padding) 89 | err := w.Append([]types.LogEntry{{Index: idx, Data: []byte(v)}}) 90 | require.NoError(t, err, "error appending entry idx=%d", idx) 91 | wantLength[idx] = desc.len 92 | idx++ 93 | } 94 | } 95 | 96 | // Should have sealed 97 | sealed, indexStart, err := w.Sealed() 98 | require.NoError(t, err) 99 | require.True(t, sealed) 100 | 101 | if tc.corrupt != nil { 102 | file := testFileFor(t, w) 103 | require.NoError(t, tc.corrupt(file)) 104 | } 105 | 106 | seg0.IndexStart = indexStart 107 | seg0.MaxIndex = w.LastIndex() 108 | seg0.SealTime = time.Now() 109 | 110 | // Now open the "file" with a reader. 
111 | r, err := f.Open(seg0) 112 | 113 | if tc.wantOpenErr != "" { 114 | require.ErrorContains(t, err, tc.wantOpenErr) 115 | return 116 | } 117 | require.NoError(t, err) 118 | 119 | // Make sure we can read every value 120 | for idx := tc.firstIndex; idx <= tc.wantLastIndex; idx++ { 121 | got, err := r.GetLog(idx) 122 | require.NoError(t, err, "error reading idx=%d", idx) 123 | require.True(t, strings.HasPrefix(string(got.Bs), fmt.Sprintf("%05d:", idx)), "bad value for idx=%d", idx) 124 | require.Len(t, string(got.Bs), wantLength[idx]) 125 | } 126 | 127 | // And we should _not_ read one either side 128 | if tc.firstIndex > 1 { 129 | _, err := r.GetLog(tc.firstIndex - 1) 130 | require.ErrorIs(t, err, types.ErrNotFound) 131 | } 132 | _, err = r.GetLog(tc.wantLastIndex + 1) 133 | require.ErrorIs(t, err, types.ErrNotFound) 134 | }) 135 | } 136 | } 137 | -------------------------------------------------------------------------------- /metrics/gometrics_collector.go: -------------------------------------------------------------------------------- 1 | // Copyright IBM Corp. 2020, 2025 2 | // SPDX-License-Identifier: MPL-2.0 3 | 4 | // # Metrics Configuration 5 | // 6 | // The raft-wal library is instrumented to be able to use different metrics collectors. There are currently two implemented within this package: 7 | // - atomic 8 | // - go-metrics 9 | // 10 | // # go-metrics Compatibility 11 | // 12 | // This library can emit metrics using either github.com/armon/go-metrics or github.com/hashicorp/go-metrics. Choosing between the libraries is controlled via build tags. 13 | // 14 | // Build Tags: 15 | // - armonmetrics - Using this tag will cause metrics to be routed to armon/go-metrics 16 | // - hashicorpmetrics - Using this tag will cause all metrics to be routed to hashicorp/go-metrics 17 | // 18 | // If no build tag is specified, the default behavior is to use armon/go-metrics. 19 | // 20 | // # Deprecating armon/go-metrics 21 | // 22 | // Emitting metrics to armon/go-metrics is officially deprecated. Usage of armon/go-metrics will remain the default until mid-2025 with opt-in support continuing to the end of 2025. 23 | // 24 | // Migration: 25 | // To migrate an application currently using the older armon/go-metrics to instead use hashicorp/go-metrics the following should be done. 26 | // 27 | // 1. Upgrade libraries using armon/go-metrics to consume hashicorp/go-metrics/compat instead. This should involve only changing import statements. All repositories within the hashicorp GitHub organization will be getting these updates in early 2025. 28 | // 29 | // 2. Update an applications library dependencies to those that have the compatibility layer configured. 30 | // 31 | // 3. Update the application to use hashicorp/go-metrics for configuring metrics export instead of armon/go-metrics 32 | // 33 | // - Replace all application imports of github.com/armon/go-metrics with github.com/hashicorp/go-metrics 34 | // 35 | // - Instrument your build system to build with the hashicorpmetrics tag. 36 | // 37 | // Eventually once the default behavior changes to use hashicorp/go-metrics by default (mid-2025), you can drop the hashicorpmetrics build tag. 38 | package metrics 39 | 40 | import gometrics "github.com/hashicorp/go-metrics/compat" 41 | 42 | // GoMetricsCollector implements a Collector that passes through observations to 43 | // a go-metrics instance. 
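//
// (Hedged construction sketch using only names from this file:
//
//	c := NewGoMetricsCollector([]string{"myapp", "wal"}, nil, nil)
//	c.IncrementCounter("log_appends", 1)
//
// nil labels and a nil gm fall back to no labels and the global instance.)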
The zero value works, writing metrics to the default
44 | // global instance; to set a prefix or a static set of labels to add to
45 | // each metric observed, or to use a non-global metrics instance, use
46 | // NewGoMetricsCollector.
47 | type GoMetricsCollector struct {
48 | 	gm     *gometrics.Metrics
49 | 	prefix []string
50 | 	labels []gometrics.Label
51 | }
52 | 
53 | // NewGoMetricsCollector returns a GoMetricsCollector that will attach the
54 | // specified name prefix and/or labels to each observation. If gm is nil the
55 | // global metrics instance is used.
56 | func NewGoMetricsCollector(prefix []string, labels []gometrics.Label, gm *gometrics.Metrics) *GoMetricsCollector {
57 | 	if gm == nil {
58 | 		gm = gometrics.Default()
59 | 	}
60 | 	return &GoMetricsCollector{
61 | 		gm:     gm,
62 | 		prefix: prefix,
63 | 		labels: labels,
64 | 	}
65 | }
66 | 
67 | // IncrementCounter records val occurrences of the named event. Names will
68 | // follow prometheus conventions with lower_case_and_underscores. We don't
69 | // need any additional labels currently.
70 | func (c *GoMetricsCollector) IncrementCounter(name string, delta uint64) {
71 | 	c.gm.IncrCounterWithLabels(c.name(name), float32(delta), c.labels)
72 | }
73 | 
74 | // SetGauge sets the value of the named gauge overriding any previous value.
75 | func (c *GoMetricsCollector) SetGauge(name string, val uint64) {
76 | 	c.gm.SetGaugeWithLabels(c.name(name), float32(val), c.labels)
77 | }
78 | 
79 | // name returns the metric name as a slice. We don't want to risk modifying the
80 | // prefix slice's backing array since this might be called concurrently, so we
81 | // always allocate a new slice.
82 | func (c *GoMetricsCollector) name(name string) []string {
83 | 	var ss []string
84 | 	return append(append(ss, c.prefix...), name)
85 | }
86 | 
-------------------------------------------------------------------------------- /bench/main.go: --------------------------------------------------------------------------------
1 | // Copyright IBM Corp. 2020, 2025
2 | // SPDX-License-Identifier: MPL-2.0
3 | 
4 | package main
5 | 
6 | import (
7 | 	"bytes"
8 | 	"flag"
9 | 	"fmt"
10 | 	"io"
11 | 	"io/ioutil"
12 | 	"os"
13 | 	"path/filepath"
14 | 	"strings"
15 | 	"time"
16 | 
17 | 	"github.com/HdrHistogram/hdrhistogram-go"
18 | 	"github.com/benmathews/bench"
19 | 	"github.com/hashicorp/raft-wal/metadb"
20 | )
21 | 
22 | type opts struct {
23 | 	// LogStore params
24 | 	version        string
25 | 	dir            string
26 | 	segSize        int
27 | 	noFreelistSync bool
28 | 
29 | 	// Common params
30 | 	preLoadN int
31 | 
32 | 	// Append params
33 | 	rate      int
34 | 	duration  time.Duration
35 | 	logSize   int
36 | 	batchSize int
37 | 
38 | 	// Truncate params
39 | 	truncateTrailingLogs int
40 | 	truncatePeriod       time.Duration
41 | }
42 | 
43 | func main() {
44 | 	var o opts
45 | 
46 | 	flag.StringVar(&o.version, "v", "wal", "version to test 'wal' or 'bolt'")
47 | 	flag.StringVar(&o.dir, "dir", "", "dir to write to. If empty will create a tmp dir.
If not empty the dir will delete any existing WAL files present!") 48 | flag.IntVar(&o.rate, "rate", 10, "append rate target per second") 49 | flag.DurationVar(&o.duration, "t", 10*time.Second, "duration of the test") 50 | flag.IntVar(&o.logSize, "s", 128, "size of each log entry appended") 51 | flag.IntVar(&o.batchSize, "n", 1, "number of logs per append batch") 52 | flag.IntVar(&o.segSize, "seg", 64, "segment size in MB") 53 | flag.IntVar(&o.truncateTrailingLogs, "trail", 10000, "number of trailing logs to leave on truncate") 54 | flag.DurationVar(&o.truncatePeriod, "tp", 0, "how often to head truncate back to 'trail' logs during append") 55 | flag.IntVar(&o.preLoadN, "preload", 0, "number of logs to append and then truncate before we start") 56 | flag.BoolVar(&o.noFreelistSync, "no-fl-sync", false, "used to disable freelist sync in boltdb for v=bolt") 57 | flag.Parse() 58 | 59 | var outBuf bytes.Buffer 60 | teeOut := io.MultiWriter(os.Stdout, &outBuf) 61 | 62 | if o.dir == "" { 63 | tmpDir, err := os.MkdirTemp("", "raft-wal-bench-*") 64 | if err != nil { 65 | panic(err) 66 | } 67 | 68 | defer os.RemoveAll(tmpDir) 69 | o.dir = tmpDir 70 | } else { 71 | // Delete metadb and any segment files present 72 | files, err := os.ReadDir(o.dir) 73 | if err != nil { 74 | panic(err) 75 | } 76 | for _, f := range files { 77 | if f.IsDir() { 78 | continue 79 | } 80 | if strings.HasSuffix(f.Name(), ".wal") || f.Name() == metadb.FileName || f.Name() == "raft.db" { 81 | os.RemoveAll(filepath.Join(o.dir, f.Name())) 82 | } 83 | } 84 | } 85 | 86 | // Make the results dir if it doesn't exist 87 | if err := os.MkdirAll(filepath.Join(o.dir, filepath.Dir(outFileName(o, "blah"))), 0755); err != nil { 88 | panic(err) 89 | } 90 | 91 | r := &appendRequesterFactory{ 92 | opts: o, 93 | output: teeOut, 94 | } 95 | benchmark := bench.NewBenchmark(r, uint64(o.rate), 1, o.duration, 0) 96 | summary, err := benchmark.Run() 97 | if err != nil { 98 | panic(err) 99 | } 100 | 101 | printHistogram(teeOut, "Good Append Latencies (ms)", summary.SuccessHistogram, 1_000_000) 102 | 103 | fmt.Fprintln(teeOut, summary) 104 | summary.GenerateLatencyDistribution(nil, outFileName(o, "append-lat")) 105 | ioutil.WriteFile(outFileName(o, "stdout"), outBuf.Bytes(), 0644) 106 | } 107 | 108 | func outFileName(o opts, suffix string) string { 109 | version := o.version 110 | if o.version == "bolt" && o.noFreelistSync { 111 | version += "-nfls" 112 | } 113 | return fmt.Sprintf("bench-result-%s-s%d-n%d-r%d-seg%dm-pre%d-trail%d-tp%s/%s-%s.txt", 114 | o.duration, o.logSize, o.batchSize, o.rate, o.segSize, o.preLoadN, 115 | o.truncateTrailingLogs, o.truncatePeriod, version, suffix) 116 | } 117 | 118 | func printHistogram(f io.Writer, name string, h *hdrhistogram.Histogram, scale int64) { 119 | fmt.Fprintf(f, "\n==> %s\n", name) 120 | fmt.Fprintf(f, " count mean p50 p99 p99.9 max\n") 121 | fmt.Fprintf(f, " %6d %6.0f %6d %6d %6d %6d\n", 122 | h.TotalCount(), 123 | h.Mean()/float64(scale), 124 | h.ValueAtPercentile(50)/scale, 125 | h.ValueAtPercentile(99)/scale, 126 | h.ValueAtPercentile(99.9)/scale, 127 | h.Max()/scale, 128 | ) 129 | } 130 | -------------------------------------------------------------------------------- /fs/fs_test.go: -------------------------------------------------------------------------------- 1 | // Copyright IBM Corp. 
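The fs tests below exercise the production VFS implementation. As a compact orientation, creating, writing, and syncing a segment file through it looks like the following (a sketch; the file name and sizes are arbitrary):

```go
package main

import (
	"fmt"

	"github.com/hashicorp/raft-wal/fs"
)

func main() {
	v := fs.New()
	// Pre-allocate a 64KiB file; Create fails if the file already exists.
	f, err := v.Create("/tmp", "00001-example.wal", 64*1024)
	if err != nil {
		panic(err)
	}
	defer f.Close()
	if _, err := f.WriteAt([]byte("frame"), 0); err != nil {
		panic(err)
	}
	// The first Sync also fsyncs the parent dir so the new file survives a crash.
	if err := f.Sync(); err != nil {
		panic(err)
	}
	fmt.Println("durable")
}
```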
2020, 2025 2 | // SPDX-License-Identifier: MPL-2.0 3 | 4 | package fs 5 | 6 | import ( 7 | "bytes" 8 | "io" 9 | "os" 10 | "path/filepath" 11 | "testing" 12 | 13 | "github.com/stretchr/testify/require" 14 | ) 15 | 16 | func TestFS(t *testing.T) { 17 | tmpDir, err := os.MkdirTemp("", "raft-wal-fs-test-*") 18 | require.NoError(t, err) 19 | defer os.RemoveAll(tmpDir) 20 | 21 | fs := New() 22 | 23 | // List should return nothing 24 | files, err := fs.ListDir(tmpDir) 25 | require.NoError(t, err) 26 | require.Len(t, files, 0) 27 | 28 | // Create a new file 29 | wf, err := fs.Create(tmpDir, "00001-abcd1234.wal", 512*1024) 30 | require.NoError(t, err) 31 | defer wf.Close() 32 | 33 | // Should be pre-allocated (on supported file systems). 34 | // TODO work out if this is reliable in CI or if we can detect supported FSs?) 35 | info, err := os.Stat(filepath.Join(tmpDir, "00001-abcd1234.wal")) 36 | require.NoError(t, err) 37 | require.Equal(t, int64(512*1024), info.Size()) 38 | 39 | // Should be able to write data in any order 40 | n, err := wf.WriteAt(bytes.Repeat([]byte{'2'}, 1024), 1024) 41 | require.NoError(t, err) 42 | require.Equal(t, 1024, n) 43 | 44 | n, err = wf.WriteAt(bytes.Repeat([]byte{'1'}, 1024), 0) 45 | require.NoError(t, err) 46 | require.Equal(t, 1024, n) 47 | 48 | // And past the preallocated end. 49 | n, err = wf.WriteAt(bytes.Repeat([]byte{'3'}, 1024), 512*1024) 50 | require.NoError(t, err) 51 | require.Equal(t, 1024, n) 52 | 53 | // And sync them 54 | require.NoError(t, wf.Sync()) 55 | 56 | // And read them back 57 | rf, err := fs.OpenReader(tmpDir, "00001-abcd1234.wal") 58 | require.NoError(t, err) 59 | defer rf.Close() 60 | 61 | var buf [1024]byte 62 | n, err = rf.ReadAt(buf[:], 1024) 63 | require.NoError(t, err) 64 | require.Equal(t, len(buf), n) 65 | require.Equal(t, byte('2'), buf[0]) 66 | 67 | n, err = rf.ReadAt(buf[:], 0) 68 | require.NoError(t, err) 69 | require.Equal(t, len(buf), n) 70 | require.Equal(t, byte('1'), buf[0]) 71 | 72 | n, err = rf.ReadAt(buf[:], 512*1024) 73 | require.NoError(t, err) 74 | require.Equal(t, len(buf), n) 75 | require.Equal(t, byte('3'), buf[0]) 76 | 77 | // Read off end is an error 78 | _, err = rf.ReadAt(buf[:], 513*1024) 79 | require.ErrorIs(t, err, io.EOF) 80 | 81 | // Should also be able to re-open writable file. 82 | wf.Close() 83 | wf, err = fs.OpenWriter(tmpDir, "00001-abcd1234.wal") 84 | require.NoError(t, err) 85 | 86 | // And write more 87 | n, err = wf.WriteAt(bytes.Repeat([]byte{'4'}, 1024), 2048) 88 | require.NoError(t, err) 89 | require.Equal(t, 1024, n) 90 | require.NoError(t, wf.Sync()) 91 | 92 | // And read back prior and new data through the writer. Read across the old 93 | // and new data written - first byte is old data rest is new. 
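// (From the writes above: bytes 1024..2047 hold the '2' data and bytes
// 2048..3071 hold the just-written '4' data, so a read starting at offset
// 2047 should see buf[0]=='2' and buf[1]=='4'.)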
94 | n, err = wf.ReadAt(buf[:], 2047) 95 | require.NoError(t, err) 96 | require.Equal(t, len(buf), n) 97 | require.Equal(t, byte('2'), buf[0]) 98 | require.Equal(t, byte('4'), buf[1]) 99 | 100 | // The already open reader should also be able to read that newly written data 101 | n, err = rf.ReadAt(buf[:], 2048) 102 | require.NoError(t, err) 103 | require.Equal(t, len(buf), n) 104 | require.Equal(t, byte('4'), buf[0]) 105 | 106 | // List should return file now 107 | files, err = fs.ListDir(tmpDir) 108 | require.NoError(t, err) 109 | require.Equal(t, []string{"00001-abcd1234.wal"}, files) 110 | 111 | // Delete should work 112 | require.NoError(t, fs.Delete(tmpDir, "00001-abcd1234.wal")) 113 | 114 | files, err = fs.ListDir(tmpDir) 115 | require.NoError(t, err) 116 | require.Equal(t, []string{}, files) 117 | } 118 | 119 | func TestRealFSNoDir(t *testing.T) { 120 | fs := New() 121 | 122 | _, err := fs.ListDir("/not-a-real-dir") 123 | require.Error(t, err) 124 | require.Contains(t, err.Error(), "no such file or directory") 125 | 126 | _, err = fs.Create("/not-a-real-dir", "foo", 1024) 127 | require.Error(t, err) 128 | require.Contains(t, err.Error(), "no such file or directory") 129 | 130 | _, err = fs.OpenReader("/not-a-real-dir", "foo") 131 | require.Error(t, err) 132 | require.Contains(t, err.Error(), "no such file or directory") 133 | 134 | _, err = fs.OpenWriter("/not-a-real-dir", "foo") 135 | require.Error(t, err) 136 | require.Contains(t, err.Error(), "no such file or directory") 137 | } 138 | -------------------------------------------------------------------------------- /metadb/metadb_test.go: -------------------------------------------------------------------------------- 1 | // Copyright IBM Corp. 2020, 2025 2 | // SPDX-License-Identifier: MPL-2.0 3 | 4 | package metadb 5 | 6 | import ( 7 | "io/ioutil" 8 | "os" 9 | "testing" 10 | "time" 11 | 12 | "github.com/hashicorp/raft-wal/types" 13 | "github.com/stretchr/testify/require" 14 | ) 15 | 16 | func TestMetaDB(t *testing.T) { 17 | cases := []struct { 18 | name string 19 | writeState *types.PersistentState 20 | writeStable map[string][]byte 21 | failSim func() 22 | }{ 23 | { 24 | name: "basic storage", 25 | writeState: makeState(4), 26 | writeStable: map[string][]byte{ 27 | "CurrentTerm": []byte{0, 0, 0, 0, 0, 0, 0, 5}, 28 | "LastVoteTerm": []byte{0, 0, 0, 0, 0, 0, 0, 5}, 29 | "LastVoteCand": []byte("server1"), 30 | }, 31 | }, 32 | } 33 | 34 | for _, tc := range cases { 35 | tc := tc 36 | t.Run(tc.name, func(t *testing.T) { 37 | tmpDir, err := os.MkdirTemp("", "raft-wal-meta-test-*") 38 | require.NoError(t, err) 39 | defer os.RemoveAll(tmpDir) 40 | 41 | { 42 | // Should be able to load the DB 43 | var db BoltMetaDB 44 | gotState, err := db.Load(tmpDir) 45 | require.NoError(t, err) 46 | defer db.Close() 47 | 48 | require.Equal(t, 0, int(gotState.NextSegmentID)) 49 | require.Empty(t, gotState.Segments) 50 | 51 | if tc.writeState != nil { 52 | require.NoError(t, db.CommitState(*tc.writeState)) 53 | } 54 | for k, v := range tc.writeStable { 55 | require.NoError(t, db.SetStable([]byte(k), v)) 56 | } 57 | 58 | // Close DB and re-open a new one to ensure persistence. 
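// (The explicit Close below matters: BoltMetaDB is backed by bbolt, which
// holds an exclusive lock on its file, so re-opening without closing first
// would be expected to block or fail rather than exercise persistence.)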
59 | db.Close() 60 | } 61 | 62 | var db BoltMetaDB 63 | gotState, err := db.Load(tmpDir) 64 | require.NoError(t, err) 65 | 66 | require.Equal(t, *tc.writeState, gotState) 67 | 68 | for k, v := range tc.writeStable { 69 | got, err := db.GetStable([]byte(k)) 70 | require.NoError(t, err) 71 | require.Equal(t, v, got) 72 | } 73 | }) 74 | } 75 | } 76 | 77 | func TestMetaDBErrors(t *testing.T) { 78 | tmpDir, err := os.MkdirTemp("", "raft-wal-meta-test-*") 79 | require.NoError(t, err) 80 | defer os.RemoveAll(tmpDir) 81 | 82 | var db BoltMetaDB 83 | 84 | // Calling anything before load is an error 85 | require.ErrorIs(t, db.CommitState(types.PersistentState{NextSegmentID: 1234}), ErrUnintialized) 86 | 87 | _, err = db.GetStable([]byte("foo")) 88 | require.ErrorIs(t, err, ErrUnintialized) 89 | 90 | err = db.SetStable([]byte("foo"), []byte("bar")) 91 | require.ErrorIs(t, err, ErrUnintialized) 92 | 93 | // Loading twice is OK from same dir 94 | _, err = db.Load(tmpDir) 95 | require.NoError(t, err) 96 | _, err = db.Load(tmpDir) 97 | require.NoError(t, err) 98 | 99 | // But not from a different (valid) one 100 | tmpDir2, err := ioutil.TempDir("", "wal-fs-test-*") 101 | require.NoError(t, err) 102 | defer os.RemoveAll(tmpDir2) 103 | 104 | _, err = db.Load(tmpDir2) 105 | require.ErrorContains(t, err, "already open in dir") 106 | 107 | // Loading from a non-existent dir is an error 108 | var db2 BoltMetaDB 109 | _, err = db2.Load("fake-dir-that-does-not-exist") 110 | require.ErrorContains(t, err, "no such file or directory") 111 | } 112 | 113 | func makeState(nSegs int) *types.PersistentState { 114 | startIdx := 1000 115 | perSegment := 100 116 | startID := 1234 117 | // Times are pesky. Remove as much stuff that doesn't survive serialisation as 118 | // possible, as we don't really care about it! 119 | startTime := time.Now().UTC().Round(time.Second).Add(time.Duration(-1*nSegs) * time.Minute) 120 | 121 | state := &types.PersistentState{ 122 | NextSegmentID: uint64(startID + nSegs), 123 | } 124 | 125 | for i := 0; i < (nSegs - 1); i++ { 126 | si := types.SegmentInfo{ 127 | ID: uint64(startID + i), 128 | BaseIndex: uint64(startIdx + (i * perSegment)), 129 | MinIndex: uint64(startIdx + (i * perSegment)), 130 | MaxIndex: uint64(startIdx + ((i + 1) * perSegment) - 1), 131 | Codec: 1, 132 | IndexStart: 123456, 133 | CreateTime: startTime.Add(time.Duration(i) * time.Minute), 134 | SealTime: startTime.Add(time.Duration(i+1) * time.Minute), 135 | SizeLimit: 64 * 1024 * 1024, 136 | } 137 | state.Segments = append(state.Segments, si) 138 | } 139 | if nSegs > 0 { 140 | // Append an unsealed tail 141 | i := nSegs - 1 142 | si := types.SegmentInfo{ 143 | ID: uint64(startID + i), 144 | BaseIndex: uint64(startIdx + (i * perSegment)), 145 | MinIndex: uint64(startIdx + (i * perSegment)), 146 | Codec: 1, 147 | CreateTime: startTime.Add(time.Duration(i) * time.Minute), 148 | SizeLimit: 64 * 1024 * 1024, 149 | } 150 | state.Segments = append(state.Segments, si) 151 | } 152 | return state 153 | } 154 | -------------------------------------------------------------------------------- /fs/fs.go: -------------------------------------------------------------------------------- 1 | // Copyright IBM Corp. 
2020, 2025 2 | // SPDX-License-Identifier: MPL-2.0 3 | 4 | package fs 5 | 6 | import ( 7 | "fmt" 8 | "io/ioutil" 9 | "math" 10 | "os" 11 | "path/filepath" 12 | 13 | "github.com/hashicorp/raft-wal/types" 14 | "go.etcd.io/etcd/client/pkg/v3/fileutil" 15 | ) 16 | 17 | // FS implements the wal.VFS interface using Go's built-in OS filesystem (and a 18 | // few helpers). 19 | // 20 | // TODO if we changed the interface to be Dir centric we could cache the open 21 | // dir handle and save some time opening it on each Create in order to fsync. 22 | type FS struct { 23 | } 24 | 25 | func New() *FS { 26 | return &FS{} 27 | } 28 | 29 | // ListDir returns a list of all files in the specified dir in lexicographical 30 | // order. If the dir doesn't exist, it must return an error. Empty array with 31 | // nil error is assumed to mean that the directory exists and was readable, 32 | // but contains no files. 33 | func (fs *FS) ListDir(dir string) ([]string, error) { 34 | files, err := ioutil.ReadDir(dir) 35 | if err != nil { 36 | return nil, err 37 | } 38 | names := make([]string, 0, len(files)) 39 | for _, f := range files { 40 | if f.IsDir() { 41 | continue 42 | } 43 | names = append(names, f.Name()) 44 | } 45 | return names, nil 46 | } 47 | 48 | // Create creates a new file with the given name. If a file with the same name 49 | // already exists an error is returned. If a non-zero size is given, 50 | // implementations should make a best effort to pre-allocate the file to be 51 | // that size. The dir must already exist and be writable to the current 52 | // process. 53 | func (fs *FS) Create(dir string, name string, size uint64) (types.WritableFile, error) { 54 | f, err := os.OpenFile(filepath.Join(dir, name), os.O_CREATE|os.O_EXCL|os.O_RDWR, os.FileMode(0644)) 55 | if err != nil { 56 | return nil, err 57 | } 58 | // We just created the file. Preallocate its size. 59 | if size > 0 { 60 | if size > math.MaxInt32 { 61 | return nil, fmt.Errorf("maximum file size is %d bytes", math.MaxInt32) 62 | } 63 | if err := fileutil.Preallocate(f, int64(size), true); err != nil { 64 | f.Close() 65 | return nil, err 66 | } 67 | } 68 | // We don't fsync here for performance reasons. Technically we need to fsync 69 | // the file itself to make sure it is really persisted to disk, and you always 70 | // need to fsync its parent dir after a creation because fsync doesn't ensure 71 | // the directory entry is persisted - a crash could make the file appear to be 72 | // missing as there is no directory entry. 73 | // 74 | // BUT, it doesn't actually matter if this file is crash safe, right up to the 75 | // point where we actually commit log data. Since we always fsync the file 76 | // when we commit logs, we don't need to again here. That does however leave 77 | // the parent dir fsync which must be done after the first fsync to a newly 78 | // created file to ensure it survives a crash. 79 | // 80 | // To handle that, we return a wrapped os.File that will fsync the parent dir 81 | // as well as the first time Sync is called (and only the first time). 82 | fi := &File{ 83 | new: 0, 84 | dir: dir, 85 | File: *f, 86 | } 87 | return fi, nil 88 | } 89 | 90 | // Delete indicates the file is no longer required. Typically it should be 91 | // deleted from the underlying system to free disk space. 92 | func (fs *FS) Delete(dir string, name string) error { 93 | if err := os.Remove(filepath.Join(dir, name)); err != nil { 94 | return err 95 | } 96 | // Make sure parent directory metadata is fsynced too before we call this 97 | // "done". 
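// Without the directory fsync, a crash shortly after os.Remove could leave
// the old directory entry intact, making the "deleted" file reappear after
// restart.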
98 | return syncDir(dir) 99 | } 100 | 101 | // OpenReader opens an existing file in read-only mode. If the file doesn't 102 | // exist or permission is denied, an error is returned; otherwise no checks 103 | // are made about the well-formedness of the file: it may be empty, the wrong 104 | // size or corrupt in arbitrary ways. 105 | func (fs *FS) OpenReader(dir string, name string) (types.ReadableFile, error) { 106 | return os.OpenFile(filepath.Join(dir, name), os.O_RDONLY, os.FileMode(0644)) 107 | } 108 | 109 | // OpenWriter opens a file in read-write mode. If the file doesn't exist or 110 | // permission is denied, an error is returned; otherwise no checks are made 111 | // about the well-formedness of the file: it may be empty, the wrong size or 112 | // corrupt in arbitrary ways. 113 | func (fs *FS) OpenWriter(dir string, name string) (types.WritableFile, error) { 114 | return os.OpenFile(filepath.Join(dir, name), os.O_RDWR, os.FileMode(0644)) 115 | } 116 | 117 | func syncDir(dir string) error { 118 | f, err := os.Open(dir) 119 | if err != nil { 120 | return err 121 | } 122 | err = f.Sync() 123 | closeErr := f.Close() 124 | if err != nil { 125 | return err 126 | } 127 | return closeErr 128 | } 129 | -------------------------------------------------------------------------------- /codec.go: -------------------------------------------------------------------------------- 1 | // Copyright IBM Corp. 2020, 2025 2 | // SPDX-License-Identifier: MPL-2.0 3 | 4 | package wal 5 | 6 | import ( 7 | "encoding/binary" 8 | "io" 9 | "time" 10 | 11 | "github.com/hashicorp/raft" 12 | ) 13 | 14 | const ( 15 | // FirstExternalCodecID is the lowest value an external codec may use to 16 | // identify itself. Values lower than this are reserved for future 17 | // internal use. 18 | FirstExternalCodecID = 1 << 16 19 | 20 | // Codec* constants identify internally-defined codec identifiers. 21 | CodecBinaryV1 uint64 = iota 22 | ) 23 | 24 | // Codec is the interface required for encoding/decoding log entries. Callers 25 | // can pass a custom one to manage their own serialization, or to add additional 26 | // layers like encryption or compression of records. Each codec is identified by a unique ID. 27 | type Codec interface { 28 | // ID returns the globally unique identifier for this codec version. This is 29 | // encoded into segment file headers and must remain consistent over the life 30 | // of the log. Values up to FirstExternalCodecID are reserved and will error 31 | // if specified externally. 32 | ID() uint64 33 | 34 | // Encode the log into the io.Writer. We pass a writer to allow the caller to 35 | // manage buffer allocation and re-use. 36 | Encode(l *raft.Log, w io.Writer) error 37 | 38 | // Decode a log from the passed byte slice into the log entry pointed to. This 39 | // allows the caller to manage allocation and re-use of the bytes and log 40 | // entry. The resulting raft.Log MUST NOT reference data in the input byte 41 | // slice since the input byte slice may be returned to a pool and re-used. 42 | Decode([]byte, *raft.Log) error 43 | } 44 | 45 | // BinaryCodec is a Codec that encodes raft.Log with a simple binary format. We 46 | // test that all fields are captured using reflection. 47 | // 48 | // For now we assume raft.Log is not likely to change too much. If it does we'll 49 | // use a new Codec ID for the later version and have to support decoding either. 50 | type BinaryCodec struct{} 51 | 52 | // ID returns the globally unique identifier for this codec version. 
This is 53 | // encoded into segment file headers and must remain consistent over the life 54 | // of the log. Values up to FirstExternalCodecID are reserved and will error 55 | // if specified externally. 56 | func (c *BinaryCodec) ID() uint64 { 57 | return CodecBinaryV1 58 | } 59 | 60 | // Encode the log into the io.Writer. We pass a writer to allow the caller to 61 | // manage buffer allocation and re-use. 62 | func (c *BinaryCodec) Encode(l *raft.Log, w io.Writer) error { 63 | enc := encoder{w: w} 64 | enc.varint(l.Index) 65 | enc.varint(l.Term) 66 | enc.varint(uint64(l.Type)) 67 | enc.bytes(l.Data) 68 | enc.bytes(l.Extensions) 69 | enc.time(l.AppendedAt) 70 | return enc.err 71 | } 72 | 73 | // Decode a log from the passed byte slice into the log entry pointed to. This 74 | // allows the caller to manage allocation and re-use of the bytes and log 75 | // entry. 76 | func (c *BinaryCodec) Decode(bs []byte, l *raft.Log) error { 77 | dec := decoder{buf: bs} 78 | l.Index = dec.varint() 79 | l.Term = dec.varint() 80 | l.Type = raft.LogType(dec.varint()) 81 | l.Data = dec.bytes() 82 | l.Extensions = dec.bytes() 83 | l.AppendedAt = dec.time() 84 | return dec.err 85 | } 86 | 87 | type encoder struct { 88 | w io.Writer 89 | err error 90 | scratch [10]byte 91 | } 92 | 93 | func (e *encoder) varint(v uint64) { 94 | if e.err != nil { 95 | return 96 | } 97 | 98 | // Varint encoding might use up to 9 bytes for a uint64 99 | n := binary.PutUvarint(e.scratch[:], v) 100 | _, e.err = e.w.Write(e.scratch[:n]) 101 | } 102 | 103 | func (e *encoder) bytes(bs []byte) { 104 | // Put a length prefix 105 | e.varint(uint64(len(bs))) 106 | if e.err != nil { 107 | return 108 | } 109 | // Copy the bytes to the writer 110 | _, e.err = e.w.Write(bs) 111 | } 112 | 113 | func (e *encoder) time(t time.Time) { 114 | if e.err != nil { 115 | return 116 | } 117 | bs, err := t.MarshalBinary() 118 | if err != nil { 119 | e.err = err 120 | return 121 | } 122 | _, e.err = e.w.Write(bs) 123 | } 124 | 125 | type decoder struct { 126 | buf []byte 127 | err error 128 | } 129 | 130 | func (d *decoder) varint() uint64 { 131 | if d.err != nil { 132 | return 0 133 | } 134 | v, n := binary.Uvarint(d.buf) 135 | d.buf = d.buf[n:] 136 | return v 137 | } 138 | 139 | func (d *decoder) bytes() []byte { 140 | // Get length prefix 141 | n := d.varint() 142 | if d.err != nil { 143 | return nil 144 | } 145 | if n == 0 { 146 | return nil 147 | } 148 | if n > uint64(len(d.buf)) { 149 | d.err = io.ErrShortBuffer 150 | return nil 151 | } 152 | bs := make([]byte, n) 153 | copy(bs, d.buf[:n]) 154 | d.buf = d.buf[n:] 155 | return bs 156 | } 157 | 158 | func (d *decoder) time() time.Time { 159 | var t time.Time 160 | if d.err != nil { 161 | return t 162 | } 163 | // Note that Unmarshal Binary updates d.buf to remove the bytes it read 164 | // already. 165 | d.err = t.UnmarshalBinary(d.buf) 166 | return t 167 | } 168 | -------------------------------------------------------------------------------- /segment/reader.go: -------------------------------------------------------------------------------- 1 | // Copyright IBM Corp. 2020, 2025 2 | // SPDX-License-Identifier: MPL-2.0 3 | 4 | package segment 5 | 6 | import ( 7 | "encoding/binary" 8 | "errors" 9 | "fmt" 10 | "io" 11 | "sync" 12 | 13 | "github.com/hashicorp/raft-wal/types" 14 | ) 15 | 16 | // Reader allows reading logs from a segment file. 
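//
// A caller-side sketch of the buffer contract (illustrative only; decodeLog
// is a hypothetical helper): the PooledBuffer returned by GetLog must be
// closed exactly once, and neither Bs nor any sub-slice of it may be used
// after Close:
//
//	buf, err := r.GetLog(idx)
//	if err != nil {
//		return err
//	}
//	defer buf.Close()
//	return decodeLog(buf.Bs, &log) // must copy, never alias, buf.Bs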
17 | type Reader struct { 18 | info types.SegmentInfo 19 | rf types.ReadableFile 20 | 21 | bufPool *sync.Pool 22 | 23 | // tail optionally provides an interface to the writer state when this is an 24 | // unsealed segment so we can fetch from its in-memory index. 25 | tail tailWriter 26 | } 27 | 28 | type tailWriter interface { 29 | OffsetForFrame(idx uint64) (uint32, error) 30 | } 31 | 32 | func openReader(info types.SegmentInfo, rf types.ReadableFile, bufPool *sync.Pool) (*Reader, error) { 33 | r := &Reader{ 34 | info: info, 35 | rf: rf, 36 | bufPool: bufPool, 37 | } 38 | 39 | return r, nil 40 | } 41 | 42 | // Close implements io.Closer 43 | func (r *Reader) Close() error { 44 | return r.rf.Close() 45 | } 46 | 47 | // GetLog returns the raw log entry bytes associated with idx. If the log 48 | // doesn't exist in this segment types.ErrNotFound must be returned. 49 | func (r *Reader) GetLog(idx uint64) (*types.PooledBuffer, error) { 50 | offset, err := r.findFrameOffset(idx) 51 | if err != nil { 52 | return nil, err 53 | } 54 | 55 | _, payload, err := r.readFrame(offset) 56 | if err != nil { 57 | return nil, err 58 | } 59 | return payload, err 60 | } 61 | 62 | func (r *Reader) readFrame(offset uint32) (frameHeader, *types.PooledBuffer, error) { 63 | buf := r.makeBuffer() 64 | 65 | n, err := r.rf.ReadAt(buf.Bs, int64(offset)) 66 | if errors.Is(err, io.EOF) && n >= frameHeaderLen { 67 | // We might have hit EOF just because our read buffer (at least 64KiB) might 68 | // be larger than the space left in the file (say if files are tiny or if we 69 | // are reading a frame near the end). So don't treat EOF as an error as 70 | // long as we have actually managed to read a frameHeader - we'll work out 71 | // if we got the whole thing or not below. 72 | err = nil 73 | 74 | // Re-slice buf.Bs so its len() reflects only what we actually managed to 75 | // read. Note this doesn't impact the buffer length when it's returned to 76 | // the pool which will still return the whole cap. 77 | buf.Bs = buf.Bs[:n] 78 | } 79 | if err != nil { 80 | return frameHeader{}, nil, err 81 | } 82 | fh, err := readFrameHeader(buf.Bs) 83 | if err != nil { 84 | return fh, nil, err 85 | } 86 | 87 | if (frameHeaderLen + int(fh.len)) <= len(buf.Bs) { 88 | // We already have all we need; just return it sliced to include only 89 | // the payload. 90 | buf.Bs = buf.Bs[frameHeaderLen : frameHeaderLen+fh.len] 91 | return fh, buf, nil 92 | } 93 | // Need to read again, with a bigger buffer, return this one 94 | buf.Close() 95 | 96 | // Need to read more bytes, validate that len is a sensible number 97 | if fh.len > MaxEntrySize { 98 | return fh, nil, fmt.Errorf("%w: frame header indicates a record larger than MaxEntrySize (%d bytes)", types.ErrCorrupt, MaxEntrySize) 99 | } 100 | 101 | buf = &types.PooledBuffer{ 102 | Bs: make([]byte, fh.len), 103 | // No closer, let outsized buffers be GCed in case they are massive and way 104 | // bigger than we need again. Could reconsider this if we find we need to 105 | // optimize for frequent > minBufSize reads. 
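// The tradeoff here: pooling outsized buffers too would save a reallocation
// on the next oversized read, but a single huge record could then pin a
// large allocation in the pool indefinitely.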
106 | } 107 | if _, err := r.rf.ReadAt(buf.Bs, int64(offset+frameHeaderLen)); err != nil { 108 | return fh, nil, err 109 | } 110 | return fh, buf, nil 111 | } 112 | 113 | func (r *Reader) makeBuffer() *types.PooledBuffer { 114 | if r.bufPool == nil { 115 | return &types.PooledBuffer{Bs: make([]byte, minBufSize)} 116 | } 117 | buf := r.bufPool.Get().([]byte) 118 | return &types.PooledBuffer{ 119 | Bs: buf, 120 | CloseFn: func() { 121 | // Note we always return the whole allocated buf regardless of what Bs 122 | // ended up being sliced to. 123 | r.bufPool.Put(buf) 124 | }, 125 | } 126 | 127 | } 128 | 129 | func (r *Reader) findFrameOffset(idx uint64) (uint32, error) { 130 | if r.tail != nil { 131 | // This is not a sealed segment. 132 | return r.tail.OffsetForFrame(idx) 133 | } 134 | 135 | // Sealed segment, read from the on-disk index block. 136 | if r.info.IndexStart == 0 { 137 | return 0, fmt.Errorf("sealed segment has no index block") 138 | } 139 | 140 | if idx < r.info.MinIndex || (r.info.MaxIndex > 0 && idx > r.info.MaxIndex) { 141 | return 0, types.ErrNotFound 142 | } 143 | 144 | // IndexStart is the offset to the first entry in the index array. We need to 145 | // find the byte offset to the Nth entry 146 | entryOffset := (idx - r.info.BaseIndex) 147 | byteOffset := r.info.IndexStart + (entryOffset * 4) 148 | 149 | var bs [4]byte 150 | n, err := r.rf.ReadAt(bs[:], int64(byteOffset)) 151 | if err == io.EOF && n == 4 { 152 | // Read all of it just happened to be at end of file, ignore 153 | err = nil 154 | } 155 | if err != nil { 156 | return 0, fmt.Errorf("failed to read segment index: %w", err) 157 | } 158 | offset := binary.LittleEndian.Uint32(bs[:]) 159 | return offset, nil 160 | } 161 | -------------------------------------------------------------------------------- /migrate/migrate.go: -------------------------------------------------------------------------------- 1 | // Copyright IBM Corp. 2020, 2025 2 | // SPDX-License-Identifier: MPL-2.0 3 | 4 | package migrate 5 | 6 | import ( 7 | "context" 8 | "fmt" 9 | "time" 10 | 11 | "github.com/hashicorp/raft" 12 | ) 13 | 14 | // CopyLogs takes an src and a dst raft.LogStore implementation and copies all 15 | // entries from src to dst. It assumes dst is empty. Neither LogStore may be in 16 | // use at the time. batchBytes is the target number of bytes of log data to 17 | // group into each append for efficiency. If progress is non-nil it will be 18 | // delivered updates during the copy since it could take a while. Updates will 19 | // be delivered best-effort with a short wait of 1 millisecond. If the channel 20 | // blocks for longer updates may be lost. The caller should sufficiently buffer 21 | // it and ensure it's being drained as fast as needed. If non-nil progress will 22 | // be closed when the function returns. 
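//
// A minimal usage sketch (illustrative only; src and dst stand for any two
// already-opened raft.LogStore implementations):
//
//	progress := make(chan string, 64)
//	go func() {
//		for msg := range progress {
//			log.Println(msg) // drain promptly so updates aren't dropped
//		}
//	}()
//	if err := CopyLogs(context.Background(), dst, src, 1024*1024, progress); err != nil {
//		log.Fatal(err)
//	}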
23 | func CopyLogs(ctx context.Context, dst, src raft.LogStore, batchBytes int, progress chan<- string) error { 24 | defer func() { 25 | if progress != nil { 26 | close(progress) 27 | } 28 | }() 29 | 30 | st := time.Now() 31 | update := func(message string, args ...interface{}) { 32 | if progress == nil { 33 | return 34 | } 35 | select { 36 | case progress <- fmt.Sprintf(message, args...): 37 | case <-time.After(time.Millisecond): 38 | } 39 | } 40 | 41 | first, err := src.FirstIndex() 42 | if err != nil { 43 | return fmt.Errorf("failed getting first index: %w", err) 44 | } 45 | last, err := src.LastIndex() 46 | if err != nil { 47 | return fmt.Errorf("failed getting last index: %w", err) 48 | } 49 | 50 | batch := make([]*raft.Log, 0, 4096) 51 | batchSize := 0 52 | n := 0 53 | batchN := 1 54 | total := int(last - first + 1) 55 | totalBytes := 0 56 | update("starting to copy %d log entries with indexes [%d, %d]", total, first, last) 57 | for idx := first; idx <= last; idx++ { 58 | if ctx.Err() != nil { 59 | return ctx.Err() 60 | } 61 | var log raft.Log 62 | n++ 63 | err := src.GetLog(idx, &log) 64 | if err != nil { 65 | return fmt.Errorf("failed copying log %d (%d/%d): %w", idx, n, total, err) 66 | } 67 | batch = append(batch, &log) 68 | // Fudge some overhead for headers and other fields; this is about right for 69 | // our WAL anyway. 70 | batchSize += len(log.Data) + 32 71 | if batchSize >= batchBytes { 72 | // Flush the batch 73 | batchSummary := fmt.Sprintf("batch %6d: %d entries ending at %d", batchN, len(batch), idx) 74 | err := dst.StoreLogs(batch) 75 | if err != nil { 76 | return fmt.Errorf("failed writing %s: %w", batchSummary, err) 77 | } 78 | update(" -> wrote %s (%3.0f%% complete)", batchSummary, (float32(n-1)/float32(total))*100.0) 79 | batchN++ 80 | batch = batch[:0] 81 | totalBytes += batchSize 82 | batchSize = 0 83 | } 84 | } 85 | if len(batch) > 0 { 86 | // Flush the batch 87 | batchSummary := fmt.Sprintf("batch %6d: %d entries ending at %d", batchN, len(batch), last) 88 | err := dst.StoreLogs(batch) 89 | if err != nil { 90 | return fmt.Errorf("failed writing %s: %w", batchSummary, err) 91 | } 92 | update(" -> wrote %s (%3.0f%% complete)", batchSummary, (float32(n-1)/float32(total))*100.0) 93 | batchN++ 94 | batch = batch[:0] 95 | totalBytes += batchSize 96 | batchSize = 0 97 | } 98 | update("DONE: took %s to copy %d entries (%d bytes)", time.Since(st), total, totalBytes) 99 | return nil 100 | } 101 | 102 | // CopyStable copies the keys the hashicorp/raft library is known to use from one stable 103 | // store to another. Since StableStore has no list method there is no general 104 | // way to copy all possibly stored keys, however this is sufficient for standard 105 | // uses of `hashicorp/raft` as of the current release since it only ever writes 106 | // these keys to StableStore. If other keys are written by another code path, 107 | // the caller can provide them in extraKeys and/or extraIntKeys depending on which 108 | // interface method they were written with - we don't assume all implementations 109 | // share a key space for Set and SetUint64. Both can be nil for just the 110 | // standard raft keys to be copied. 
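//
// A usage sketch (illustrative; "MyCustomKey" is a hypothetical extra key
// written outside hashicorp/raft, not something the library itself stores):
//
//	extra := [][]byte{[]byte("MyCustomKey")}
//	if err := CopyStable(context.Background(), dst, src, extra, nil, nil); err != nil {
//		log.Fatal(err)
//	}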
111 | func CopyStable(ctx context.Context, dst, src raft.StableStore, extraKeys, extraIntKeys [][]byte, progress chan<- string) error { 112 | // https://github.com/hashicorp/raft/blob/44124c28758b8cfb675e90c75a204a08a84f8d4f/raft.go#L22-L26 113 | knownIntKeys := [][]byte{ 114 | []byte("CurrentTerm"), 115 | []byte("LastVoteTerm"), 116 | } 117 | knownKeys := [][]byte{ 118 | []byte("LastVoteCand"), 119 | } 120 | 121 | defer func() { 122 | if progress != nil { 123 | close(progress) 124 | } 125 | }() 126 | 127 | update := func(message string, args ...interface{}) { 128 | if progress == nil { 129 | return 130 | } 131 | select { 132 | case progress <- fmt.Sprintf(message, args...): 133 | case <-time.After(time.Millisecond): 134 | } 135 | } 136 | 137 | st := time.Now() 138 | update("copying %d int, %d regular KVs", len(knownIntKeys)+len(extraIntKeys), 139 | len(knownKeys)+len(extraKeys)) 140 | for _, k := range append(knownIntKeys, extraIntKeys...) { 141 | if ctx.Err() != nil { 142 | return ctx.Err() 143 | } 144 | v, err := src.GetUint64(k) 145 | if err != nil { 146 | return fmt.Errorf("failed to read int key %s: %w", k, err) 147 | } 148 | err = dst.SetUint64(k, v) 149 | if err != nil { 150 | return fmt.Errorf("failed to set int key %s => %d: %w", k, v, err) 151 | } 152 | update(" copied int %s => %d", k, v) 153 | } 154 | for _, k := range append(knownKeys, extraKeys...) { 155 | if ctx.Err() != nil { 156 | return ctx.Err() 157 | } 158 | v, err := src.Get(k) 159 | if err != nil { 160 | return fmt.Errorf("failed to read key %s: %w", k, err) 161 | } 162 | err = dst.Set(k, v) 163 | if err != nil { 164 | return fmt.Errorf("failed to set key %s => %s: %w", k, v, err) 165 | } 166 | update(" copied %s => %q", k, v) 167 | } 168 | update("DONE: took %s to copy %d KVs", time.Since(st), 169 | len(knownIntKeys)+len(extraIntKeys)+len(knownKeys)+len(extraKeys)) 170 | return nil 171 | } 172 | -------------------------------------------------------------------------------- /alice/workload/main.go: -------------------------------------------------------------------------------- 1 | // Copyright IBM Corp. 
2020, 2025 2 | // SPDX-License-Identifier: MPL-2.0 3 | 4 | package main 5 | 6 | import ( 7 | "bytes" 8 | "flag" 9 | "fmt" 10 | "log" 11 | "os" 12 | "path/filepath" 13 | "strings" 14 | "time" 15 | 16 | "github.com/hashicorp/raft" 17 | wal "github.com/hashicorp/raft-wal" 18 | ) 19 | 20 | type opts struct { 21 | dir string 22 | workload string 23 | init bool 24 | truncType string 25 | } 26 | 27 | func main() { 28 | var o opts 29 | 30 | flag.StringVar(&o.dir, "dir", "./workload_dir", "path to directory for WAL files") 31 | flag.StringVar(&o.workload, "workload", "append", "workload to run, one of 'append', 'truncate-head', 'truncate-tail', 'truncate-all'") 32 | flag.BoolVar(&o.init, "init", false, "whether this is the init or actual recording") 33 | flag.Parse() 34 | 35 | var fn func(o opts) error 36 | var initFn func(o opts) error 37 | switch o.workload { 38 | case "append": 39 | fn = runAppend 40 | case "truncate-head": 41 | o.truncType = "head" 42 | fn = runTruncate 43 | initFn = runInitTruncate 44 | case "truncate-tail": 45 | o.truncType = "tail" 46 | fn = runTruncate 47 | initFn = runInitTruncate 48 | case "truncate-all": 49 | o.truncType = "all" 50 | fn = runTruncate 51 | initFn = runInitTruncate 52 | default: 53 | log.Fatalf("unsupported workload %q", o.workload) 54 | } 55 | 56 | if o.init { 57 | fn = initFn 58 | } 59 | if fn == nil { 60 | return 61 | } 62 | 63 | if err := fn(o); err != nil { 64 | log.Fatal(err) 65 | } 66 | } 67 | 68 | // runInitTruncate sets up a WAL with a bunch of segments ready to test 69 | // truncations. We set up ahead of the actual ALICE test to limit the IOs that 70 | // need to be explored when simulating different scenarios. 71 | // 72 | // The setup leaves us with a set of segments that contain the following ranges: 73 | // 74 | // [1..20] 75 | // [21..40] 76 | // [41..60] 77 | // [61..65] 78 | func runInitTruncate(o opts) error { 79 | return populate(o.dir, 80 | 16*1024, // 16 KiB segments 81 | 1024, // 1 KiB logs 82 | 20, // batchSize (20 * 1024 is bigger than segment size so each segment will have this many logs except the tail) 83 | 65, // 65 logs total 84 | ) 85 | } 86 | 87 | func runTruncate(o opts) error { 88 | w, err := wal.Open(o.dir, wal.WithSegmentSize(8*1024)) 89 | if err != nil { 90 | return err 91 | } 92 | 93 | // Output the initial commitIdx to get the checker in sync! 94 | last, err := w.LastIndex() 95 | if err != nil { 96 | return err 97 | } 98 | fmt.Printf("commitIdx=%d\n", last) 99 | 100 | switch o.truncType { 101 | case "head": 102 | // Remove the first two segments 103 | fmt.Printf("willTruncateBefore=46\n") 104 | err = w.DeleteRange(0, 45) 105 | fmt.Printf("truncatedBefore=46\n") 106 | case "tail": 107 | // Remove the last two segments 108 | fmt.Printf("willTruncateAfter=34\n") 109 | err = w.DeleteRange(35, 100) 110 | fmt.Printf("truncatedAfter=34\n") 111 | case "all": 112 | fmt.Printf("willTruncateAfter=0\n") 113 | err = w.DeleteRange(0, 100) 114 | fmt.Printf("truncatedAfter=0\n") 115 | } 116 | if err != nil { 117 | return err 118 | } 119 | 120 | last, err = w.LastIndex() 121 | if err != nil { 122 | return err 123 | } 124 | 125 | // Now append another entry to prove we are in a good state and can't lose 126 | // following writes in a crash. 
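// The checker treats this as the proof step: the entry lands at last+1 with
// a distinctive payload it can match on, demonstrating that the log accepts
// and retains appends immediately after truncation.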
127 | err = w.StoreLog(&raft.Log{ 128 | Index: last + 1, 129 | Term: 1, 130 | Type: raft.LogCommand, 131 | Data: []byte("Post Truncate Entry"), 132 | AppendedAt: time.Now(), 133 | }) 134 | if err != nil { 135 | return err 136 | } 137 | fmt.Printf("commitIdx=%d\n", last+1) 138 | 139 | return nil 140 | } 141 | 142 | func runAppend(o opts) error { 143 | // We want to limit the total disk IOs we do because ALICE takes forever to 144 | // explore the reordering state space if there are more than a few. To 145 | // exercise realistic enough code paths though we want at least a couple of 146 | // append batches in each segment and at least one segment rotation. We'll 147 | // just write large values which will be treated as a single disk op while 148 | // taking up more space. We'll use 16 KiB segments and write values that are 149 | // 4KiB each (which, with headers, will take us over the segment limit after 4 logs 150 | // appended in 2 batches). To make it easier to manually inspect hex dumps of 151 | // WAL files for debugging, we'll use printable chars rather than random 152 | // bytes, and make them deterministic so we can also confirm that we didn't 153 | // accidentally return the wrong payload or corrupt them too. 154 | return populate(o.dir, 155 | 16*1024, // 16 KiB segments 156 | 4096, // 4KiB logs 157 | 2, // batchSize 158 | 8, // Add 8 logs in total 159 | ) 160 | } 161 | 162 | func resetDir(dir string) error { 163 | entries, err := os.ReadDir(dir) 164 | if err != nil { 165 | return err 166 | } 167 | for _, e := range entries { 168 | if strings.HasSuffix(e.Name(), ".wal") || strings.HasSuffix(e.Name(), ".db") { 169 | if err := os.Remove(filepath.Join(dir, e.Name())); err != nil { 170 | return err 171 | } 172 | } 173 | } 174 | return nil 175 | } 176 | 177 | func populate(dir string, segmentSize, logSize, batchSize, num int) error { 178 | if err := resetDir(dir); err != nil { 179 | return err 180 | } 181 | 182 | w, err := wal.Open(dir, wal.WithSegmentSize(segmentSize)) 183 | if err != nil { 184 | return err 185 | } 186 | 187 | var logs []*raft.Log 188 | 189 | if logSize%4 != 0 { 190 | return fmt.Errorf("logSize must be a multiple of 4") 191 | } 192 | if num > 999 { 193 | return fmt.Errorf("num must be 999 or less") 194 | } 195 | 196 | numRepeats := logSize / 4 197 | 198 | commitBatch := func() error { 199 | if len(logs) == 0 { 200 | return nil 201 | } 202 | if err := w.StoreLogs(logs); err != nil { 203 | return err 204 | } 205 | // Log that we expect everything up to this index to be durable now so the 206 | // checker can assert that. 207 | fmt.Printf("commitIdx=%d\n", logs[len(logs)-1].Index) 208 | logs = logs[:0] 209 | return nil 210 | } 211 | 212 | for i := 1; i <= num; i++ { 213 | logs = append(logs, &raft.Log{ 214 | Index: uint64(i), 215 | Term: 1, 216 | Type: raft.LogCommand, 217 | Data: bytes.Repeat([]byte(fmt.Sprintf("%03d|", i)), numRepeats), 218 | AppendedAt: time.Now(), 219 | }) 220 | 221 | if len(logs) >= batchSize { 222 | if err := commitBatch(); err != nil { 223 | return err 224 | } 225 | } 226 | } 227 | 228 | // Commit the remainder 229 | return commitBatch() 230 | } 231 | -------------------------------------------------------------------------------- /bench/append_requestor.go: -------------------------------------------------------------------------------- 1 | // Copyright IBM Corp. 
2020, 2025 2 | // SPDX-License-Identifier: MPL-2.0 3 | 4 | package main 5 | 6 | import ( 7 | "context" 8 | "crypto/rand" 9 | "fmt" 10 | "io" 11 | "path/filepath" 12 | "sync/atomic" 13 | "time" 14 | 15 | "github.com/HdrHistogram/hdrhistogram-go" 16 | "github.com/benmathews/bench" 17 | histwriter "github.com/benmathews/hdrhistogram-writer" 18 | "github.com/hashicorp/raft" 19 | raftboltdb "github.com/hashicorp/raft-boltdb/v2" 20 | wal "github.com/hashicorp/raft-wal" 21 | "go.etcd.io/bbolt" 22 | ) 23 | 24 | var ( 25 | _ bench.RequesterFactory = &appendRequesterFactory{} 26 | 27 | randomData []byte 28 | ) 29 | 30 | func init() { 31 | randomData = make([]byte, 1024*1024) 32 | rand.Read(randomData) 33 | } 34 | 35 | // appendRequesterFactory implements bench.RequesterFactory 36 | type appendRequesterFactory struct { 37 | opts opts 38 | output io.Writer 39 | } 40 | 41 | // GetRequester returns a new Requester, called for each Benchmark 42 | // connection. 43 | func (f *appendRequesterFactory) GetRequester(number uint64) bench.Requester { 44 | if number > 0 { 45 | panic("wal only supports a single writer") 46 | } 47 | 48 | var fn func() (raft.LogStore, error) 49 | switch f.opts.version { 50 | case "wal": 51 | fn = func() (raft.LogStore, error) { 52 | return wal.Open(f.opts.dir, wal.WithSegmentSize(f.opts.segSize*1024*1024)) 53 | } 54 | case "bolt": 55 | fn = func() (raft.LogStore, error) { 56 | boltOpts := raftboltdb.Options{ 57 | Path: filepath.Join(f.opts.dir, "raft.db"), 58 | BoltOptions: &bbolt.Options{ 59 | NoFreelistSync: f.opts.noFreelistSync, 60 | }, 61 | } 62 | return raftboltdb.New(boltOpts) 63 | } 64 | default: 65 | panic("unknown LogStore version: " + f.opts.version) 66 | } 67 | 68 | return &appendRequester{ 69 | opts: f.opts, 70 | output: f.output, 71 | newStore: fn, 72 | } 73 | } 74 | 75 | // appendRequester implements bench.Requester for appending entries to the WAL. 76 | type appendRequester struct { 77 | closed uint32 78 | 79 | opts opts 80 | 81 | batch []*raft.Log 82 | index uint64 83 | newStore func() (raft.LogStore, error) 84 | store raft.LogStore 85 | truncateStop func() 86 | output io.Writer 87 | 88 | truncateTiming *hdrhistogram.Histogram 89 | } 90 | 91 | // Setup prepares the Requester for benchmarking. 92 | func (r *appendRequester) Setup() error { 93 | ls, err := r.newStore() 94 | if err != nil { 95 | return err 96 | } 97 | r.store = ls 98 | 99 | // Prebuild the batch of logs. There is no compression so we don't care that 100 | // they are all the same data. 101 | r.batch = make([]*raft.Log, r.opts.batchSize) 102 | for i := range r.batch { 103 | r.batch[i] = &raft.Log{ 104 | // We'll vary the indexes each time but save on setting this up the same 105 | // way every time to! 106 | Data: randomData[:r.opts.logSize], 107 | AppendedAt: time.Now(), 108 | } 109 | } 110 | r.index = 1 111 | 112 | if r.opts.preLoadN > 0 { 113 | // Write lots of big records and then delete them again. We'll use batches 114 | // of 1000 1024 byte records for now to speed things up a bit. 
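// (The preload-then-truncate step below reproduces the on-disk shape of a
// long-running server, so the benchmark measures steady-state appends rather
// than appends into a brand new, empty log.)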
115 | preBatch := make([]*raft.Log, 0, 1000) 116 | fmt.Fprintf(r.output, "Preloading up to index %d\n", r.opts.preLoadN) 117 | for r.index <= uint64(r.opts.preLoadN) { 118 | preBatch = append(preBatch, &raft.Log{Index: r.index, Data: randomData[:1024]}) 119 | r.index++ 120 | if len(preBatch) == 1000 { 121 | err := r.store.StoreLogs(preBatch) 122 | if err != nil { 123 | return err 124 | } 125 | preBatch = preBatch[:0] 126 | } 127 | } 128 | if len(preBatch) > 0 { 129 | err := r.store.StoreLogs(preBatch) 130 | if err != nil { 131 | return err 132 | } 133 | } 134 | 135 | // Now truncate back to trailingLogs. 136 | fmt.Fprintf(r.output, "Truncating 1 - %d\n", r.index-uint64(r.opts.truncateTrailingLogs)) 137 | err := r.store.DeleteRange(1, r.index-uint64(r.opts.truncateTrailingLogs)) 138 | if err != nil { 139 | return err 140 | } 141 | r.dumpStats() 142 | } 143 | if r.opts.truncatePeriod > 0 { 144 | r.truncateTiming = hdrhistogram.New(1, 10_000_000, 3) 145 | fmt.Fprintf(r.output, "Starting Truncator every %s\n", r.opts.truncatePeriod) 146 | ctx, cancel := context.WithCancel(context.Background()) 147 | r.truncateStop = cancel 148 | go r.runTruncate(ctx) 149 | } else { 150 | fmt.Fprintf(r.output, "Truncation disabled\n") 151 | } 152 | 153 | return nil 154 | } 155 | 156 | func (r *appendRequester) runTruncate(ctx context.Context) { 157 | ticker := time.NewTicker(r.opts.truncatePeriod) 158 | for { 159 | select { 160 | case <-ticker.C: 161 | if atomic.LoadUint32(&r.closed) == 1 { 162 | return 163 | } 164 | first, err := r.store.FirstIndex() 165 | if err != nil { 166 | panic(err) 167 | } 168 | last, err := r.store.LastIndex() 169 | if err != nil { 170 | panic(err) 171 | } 172 | 173 | deleteMax := uint64(0) 174 | if last > uint64(r.opts.truncateTrailingLogs) { 175 | deleteMax = last - uint64(r.opts.truncateTrailingLogs) 176 | } 177 | if deleteMax >= first { 178 | st := time.Now() 179 | err := r.store.DeleteRange(first, deleteMax) 180 | elapsed := time.Since(st) 181 | r.truncateTiming.RecordValue(elapsed.Microseconds()) 182 | if err != nil { 183 | panic(err) 184 | } 185 | } 186 | 187 | case <-ctx.Done(): 188 | return 189 | } 190 | } 191 | } 192 | 193 | // Request performs a synchronous request to the system under test. 194 | func (r *appendRequester) Request() error { 195 | // Update log indexes 196 | for i := range r.batch { 197 | r.batch[i].Index = r.index 198 | r.index++ 199 | } 200 | return r.store.StoreLogs(r.batch) 201 | } 202 | 203 | type metricer interface { 204 | Metrics() map[string]uint64 205 | } 206 | 207 | func (r *appendRequester) dumpStats() { 208 | if m, ok := r.store.(metricer); ok { 209 | fmt.Fprintln(r.output, "\n== METRICS ==========") 210 | for k, v := range m.Metrics() { 211 | fmt.Fprintf(r.output, "% 25s: % 15d\n", k, v) 212 | } 213 | } 214 | if r.truncateTiming != nil { 215 | scaleFactor := 0.001 // Scale us to ms. 216 | if err := histwriter.WriteDistributionFile(r.truncateTiming, nil, scaleFactor, outFileName(r.opts, "truncate-lat")); err != nil { 217 | fmt.Fprintf(r.output, "ERROR writing truncate histogram: %s\n", err) 218 | } 219 | printHistogram(r.output, "Truncate Latency (ms)", r.truncateTiming, 1000) 220 | } 221 | } 222 | 223 | // Teardown is called upon benchmark completion. 
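// It swaps the atomic closed flag so the background truncator (which checks
// the flag on every tick) stops touching the store, and the old-value check
// ensures stats are dumped and the store closed at most once.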
224 | func (r *appendRequester) Teardown() error { 225 | old := atomic.SwapUint32(&r.closed, 1) 226 | if old == 0 { 227 | r.dumpStats() 228 | if c, ok := r.store.(io.Closer); ok { 229 | return c.Close() 230 | } 231 | } 232 | return nil 233 | } 234 | -------------------------------------------------------------------------------- /segment/format_test.go: -------------------------------------------------------------------------------- 1 | // Copyright IBM Corp. 2020, 2025 2 | // SPDX-License-Identifier: MPL-2.0 3 | 4 | package segment 5 | 6 | import ( 7 | "encoding/binary" 8 | "math" 9 | "strings" 10 | "testing" 11 | 12 | fuzz "github.com/google/gofuzz" 13 | "github.com/hashicorp/raft-wal/types" 14 | "github.com/stretchr/testify/require" 15 | ) 16 | 17 | func TestFileHeaderCodec(t *testing.T) { 18 | cases := []struct { 19 | name string 20 | info types.SegmentInfo 21 | bufSize int 22 | corrupt func([]byte) []byte 23 | wantWriteErr string 24 | wantReadErr string 25 | wantValidateErr string 26 | }{ 27 | { 28 | name: "basic encoding/decoding", 29 | info: types.SegmentInfo{ 30 | BaseIndex: 1234, 31 | ID: 4321, 32 | Codec: 1, 33 | }, 34 | }, 35 | { 36 | name: "short buf writing", 37 | info: types.SegmentInfo{ 38 | BaseIndex: 1234, 39 | ID: 4321, 40 | Codec: 1, 41 | }, 42 | bufSize: 10, 43 | wantWriteErr: "short buffer", 44 | }, 45 | { 46 | name: "short buf reading", 47 | info: types.SegmentInfo{ 48 | BaseIndex: 1234, 49 | ID: 4321, 50 | Codec: 1, 51 | }, 52 | corrupt: func(buf []byte) []byte { 53 | return buf[0:5] 54 | }, 55 | wantReadErr: "short buffer", 56 | }, 57 | { 58 | name: "bad magic reading", 59 | info: types.SegmentInfo{ 60 | BaseIndex: 1234, 61 | ID: 4321, 62 | Codec: 1, 63 | }, 64 | corrupt: func(buf []byte) []byte { 65 | buf[0] = 0xff 66 | return buf 67 | }, 68 | wantReadErr: "corrupt", 69 | }, 70 | { 71 | name: "bad BaseIndex reading", 72 | info: types.SegmentInfo{ 73 | BaseIndex: 1234, 74 | ID: 4321, 75 | Codec: 1, 76 | }, 77 | corrupt: func(buf []byte) []byte { 78 | buf[8] = 0xff 79 | return buf 80 | }, 81 | wantValidateErr: "corrupt", 82 | }, 83 | { 84 | name: "bad ID reading", 85 | info: types.SegmentInfo{ 86 | BaseIndex: 1234, 87 | ID: 4321, 88 | Codec: 1, 89 | }, 90 | corrupt: func(buf []byte) []byte { 91 | buf[16] = 0xff 92 | return buf 93 | }, 94 | wantValidateErr: "corrupt", 95 | }, 96 | { 97 | name: "bad Codec reading", 98 | info: types.SegmentInfo{ 99 | BaseIndex: 1234, 100 | ID: 4321, 101 | Codec: 1, 102 | }, 103 | corrupt: func(buf []byte) []byte { 104 | buf[24] = 0xff 105 | return buf 106 | }, 107 | wantValidateErr: "corrupt", 108 | }, 109 | } 110 | 111 | for _, tc := range cases { 112 | tc := tc 113 | t.Run(tc.name, func(t *testing.T) { 114 | len := fileHeaderLen 115 | if tc.bufSize > 0 { 116 | len = tc.bufSize 117 | } 118 | buf := make([]byte, len) 119 | 120 | err := writeFileHeader(buf, tc.info) 121 | 122 | if tc.wantWriteErr != "" { 123 | require.ErrorContains(t, err, tc.wantWriteErr) 124 | return 125 | } 126 | require.NoError(t, err) 127 | 128 | if tc.corrupt != nil { 129 | buf = tc.corrupt(buf) 130 | } 131 | 132 | got, err := readFileHeader(buf) 133 | if tc.wantReadErr != "" { 134 | require.ErrorContains(t, err, tc.wantReadErr) 135 | return 136 | } 137 | require.NoError(t, err) 138 | require.NotNil(t, got) 139 | 140 | err = validateFileHeader(*got, tc.info) 141 | if tc.wantValidateErr != "" { 142 | require.ErrorContains(t, err, tc.wantValidateErr) 143 | return 144 | } 145 | require.NoError(t, err) 146 | }) 147 | } 148 | } 149 | 150 | func TestFileHeaderCodecFuzz(t 
*testing.T) { 151 | fuzz := fuzz.New() 152 | 153 | var info types.SegmentInfo 154 | var buf [fileHeaderLen]byte 155 | for i := 0; i < 1000; i++ { 156 | fuzz.Fuzz(&info) 157 | err := writeFileHeader(buf[:], info) 158 | require.NoError(t, err) 159 | 160 | t.Logf("% x", buf[:]) 161 | 162 | got, err := readFileHeader(buf[:]) 163 | require.NoError(t, err) 164 | require.NotNil(t, got) 165 | 166 | err = validateFileHeader(*got, info) 167 | require.NoError(t, err) 168 | } 169 | } 170 | 171 | func TestFrameCodecFuzz(t *testing.T) { 172 | fuzz := fuzz.New() 173 | 174 | var len uint16 175 | // Allocate an extra frameHeaderLen here because some lengths might end up 176 | // needing padding which takes them just over the buffer size. 177 | var buf [math.MaxUint16 + frameHeaderLen + frameHeaderLen]byte 178 | var val = []byte(strings.Repeat("A Value!", math.MaxUint16/8)) 179 | var fh frameHeader 180 | for i := 0; i < 1000; i++ { 181 | fuzz.Fuzz(&len) 182 | 183 | fh.typ = FrameEntry 184 | fh.len = uint32(len) 185 | 186 | expectLen := encodedFrameSize(int(len)) 187 | 188 | // Note length of val is not the same as fh.len which is what should be 189 | // used. 190 | err := writeFrame(buf[:expectLen], fh, val) 191 | require.NoError(t, err) 192 | 193 | // We mostly care about the start and end... 194 | if expectLen > 64 { 195 | t.Logf("% x [...] % x (%d)", buf[0:16], buf[expectLen-16:expectLen], expectLen) 196 | } else { 197 | t.Logf("% x", buf[:expectLen]) 198 | } 199 | 200 | // Verify the last padLen bytes are zero 201 | for i := padLen(int(len)); i > 0; i-- { 202 | require.Equal(t, byte(0), buf[expectLen-i], 203 | "expected last %d bytes to be padding. Byte %d of %d isn't zero.", 204 | padLen(int(len)), expectLen-i, expectLen) 205 | } 206 | 207 | got, err := readFrameHeader(buf[:]) 208 | require.NoError(t, err) 209 | require.Equal(t, fh, got) 210 | } 211 | } 212 | 213 | func TestPadLen(t *testing.T) { 214 | fuzz := fuzz.New() 215 | var len uint32 216 | 217 | for i := 0; i < 1000; i++ { 218 | fuzz.Fuzz(&len) 219 | 220 | got := padLen(int(len)) 221 | 222 | t.Log("len", len) 223 | 224 | // Test basic properties of padLen 225 | require.Less(t, got, frameHeaderLen, "padding must be less than the whole header len") 226 | require.GreaterOrEqual(t, got, 0, "padding must be positive") 227 | require.Equal(t, 0, (got+int(len))%frameHeaderLen, "padding plus length must be a multiple of header len") 228 | } 229 | } 230 | 231 | func TestWriteIndexFrame(t *testing.T) { 232 | // TestFrameCodecFuzz covers most of the bases for the actual header encoding 233 | // etc. This just needs to test the index encoding. 234 | var index [1024]uint32 235 | 236 | for i := range index { 237 | // Write offsets as if each record is exactly 64 bytes 238 | index[i] = uint32(i * 64) 239 | } 240 | 241 | buf := make([]byte, indexFrameSize(len(index))) 242 | 243 | err := writeIndexFrame(buf, index[:]) 244 | require.NoError(t, err) 245 | 246 | //t.Log(index, buf) 247 | 248 | // Validate that the encoded index after the header is what we expect 249 | offset := frameHeaderLen 250 | for i := range index { 251 | got := binary.LittleEndian.Uint32(buf[offset:]) 252 | require.Equal(t, uint32(i*64), got, "unexpected index value at offset %d", i) 253 | offset += 4 254 | } 255 | } 256 | -------------------------------------------------------------------------------- /bench/bench_test.go: -------------------------------------------------------------------------------- 1 | // Copyright IBM Corp. 
2020, 2025 2 | // SPDX-License-Identifier: MPL-2.0 3 | 4 | package main 5 | 6 | import ( 7 | "fmt" 8 | "os" 9 | "path/filepath" 10 | "testing" 11 | "time" 12 | 13 | "github.com/hashicorp/raft" 14 | raftboltdb "github.com/hashicorp/raft-boltdb" 15 | wal "github.com/hashicorp/raft-wal" 16 | "github.com/stretchr/testify/require" 17 | "go.etcd.io/etcd/client/pkg/v3/fileutil" 18 | ) 19 | 20 | func BenchmarkAppend(b *testing.B) { 21 | sizes := []int{ 22 | 10, 23 | 1024, 24 | 100 * 1024, 25 | 1024 * 1024, 26 | } 27 | sizeNames := []string{ 28 | "10", 29 | "1k", 30 | "100k", 31 | "1m", 32 | } 33 | batchSizes := []int{1, 10} 34 | 35 | for i, s := range sizes { 36 | for _, bSize := range batchSizes { 37 | b.Run(fmt.Sprintf("entrySize=%s/batchSize=%d/v=WAL", sizeNames[i], bSize), func(b *testing.B) { 38 | ls, done := openWAL(b) 39 | defer done() 40 | // close _first_ (defers run in reverse order) before done() which will 41 | // delete since rotate could still be happening 42 | defer ls.Close() 43 | runAppendBench(b, ls, s, bSize) 44 | }) 45 | b.Run(fmt.Sprintf("entrySize=%s/batchSize=%d/v=Bolt", sizeNames[i], bSize), func(b *testing.B) { 46 | ls := openBolt(b) 47 | runAppendBench(b, ls, s, bSize) 48 | }) 49 | } 50 | } 51 | } 52 | 53 | func openWAL(b *testing.B) (*wal.WAL, func()) { 54 | tmpDir, err := os.MkdirTemp("", "raft-wal-bench-*") 55 | require.NoError(b, err) 56 | 57 | // Force every 1k append to create a new segment to profile segment rotation. 58 | ls, err := wal.Open(tmpDir, wal.WithSegmentSize(512)) 59 | require.NoError(b, err) 60 | 61 | return ls, func() { os.RemoveAll(tmpDir) } 62 | } 63 | 64 | func openBolt(b *testing.B) *raftboltdb.BoltStore { 65 | tmpDir, err := os.MkdirTemp("", "raft-wal-bench-*") 66 | require.NoError(b, err) 67 | defer os.RemoveAll(tmpDir) 68 | 69 | ls, err := raftboltdb.NewBoltStore(filepath.Join(tmpDir, "bolt-wal.db")) 70 | require.NoError(b, err) 71 | 72 | return ls 73 | } 74 | 75 | func runAppendBench(b *testing.B, ls raft.LogStore, s, n int) { 76 | // Pre-create batch, we'll just adjust the indexes in the loop 77 | batch := make([]*raft.Log, n) 78 | for i := range batch { 79 | batch[i] = &raft.Log{ 80 | Data: randomData[:s], 81 | AppendedAt: time.Now(), 82 | } 83 | } 84 | 85 | b.ResetTimer() 86 | idx := uint64(1) 87 | for i := 0; i < b.N; i++ { 88 | for j := range batch { 89 | batch[j].Index = idx 90 | idx++ 91 | } 92 | b.StartTimer() 93 | err := ls.StoreLogs(batch) 94 | b.StopTimer() 95 | if err != nil { 96 | b.Fatalf("error appending: %s", err) 97 | } 98 | } 99 | } 100 | 101 | func BenchmarkGetLogs(b *testing.B) { 102 | sizes := []int{ 103 | 1000, 104 | 1_000_000, 105 | } 106 | sizeNames := []string{ 107 | "1k", 108 | "1m", 109 | } 110 | for i, s := range sizes { 111 | wLs, done := openWAL(b) 112 | defer done() 113 | // close _first_ (defers run in reverse order) before done() which will 114 | // delete since rotate could still be happening 115 | defer wLs.Close() 116 | populateLogs(b, wLs, s, 128) // fixed 128 byte logs 117 | 118 | bLs := openBolt(b) 119 | populateLogs(b, bLs, s, 128) // fixed 128 byte logs 120 | 121 | b.Run(fmt.Sprintf("numLogs=%s/v=WAL", sizeNames[i]), func(b *testing.B) { 122 | runGetLogBench(b, wLs, s) 123 | }) 124 | b.Run(fmt.Sprintf("numLogs=%s/v=Bolt", sizeNames[i]), func(b *testing.B) { 125 | runGetLogBench(b, bLs, s) 126 | }) 127 | } 128 | } 129 | 130 | func populateLogs(b *testing.B, ls raft.LogStore, n, size int) { 131 | batchSize := 1000 132 | batch := make([]*raft.Log, 0, batchSize) 133 | start := time.Now() 134 | for i := 0; i 
< n; i++ { 135 | l := raft.Log{Index: uint64(i + 1), Data: randomData[:2], AppendedAt: time.Now()} 136 | batch = append(batch, &l) 137 | if len(batch) == batchSize { 138 | err := ls.StoreLogs(batch) 139 | require.NoError(b, err) 140 | batch = batch[:0] 141 | } 142 | } 143 | if len(batch) > 0 { 144 | err := ls.StoreLogs(batch) 145 | require.NoError(b, err) 146 | } 147 | b.Logf("populateTime=%s", time.Since(start)) 148 | } 149 | 150 | func runGetLogBench(b *testing.B, ls raft.LogStore, n int) { 151 | b.ResetTimer() 152 | var log raft.Log 153 | for i := 0; i < b.N; i++ { 154 | b.StartTimer() 155 | err := ls.GetLog(uint64((i+1)%n), &log) 156 | b.StopTimer() 157 | require.NoError(b, err) 158 | } 159 | } 160 | 161 | // These OS benchmarks showed that at least on my Mac Creating and preallocating 162 | // a file is not reliably quicker than renaming a file we already created and 163 | // preallocated so the extra work of doing that in the background ahead of time 164 | // and just renaming it during rotation seems unnecessary. We are not fsyncing 165 | // either the file or parent dir in either case which dominates cost of either 166 | // operation. Three random consecutive runs on my machine: 167 | // 168 | // BenchmarkOSCreateAndPreallocate-16 100 370304 ns/op 221 B/op 3 allocs/op 169 | // BenchmarkOSRename-16 100 876001 ns/op 570 B/op 5 allocs/op 170 | // 171 | // BenchmarkOSCreateAndPreallocate-16 100 353654 ns/op 221 B/op 3 allocs/op 172 | // BenchmarkOSRename-16 100 168558 ns/op 570 B/op 5 allocs/op 173 | // 174 | // BenchmarkOSCreateAndPreallocate-16 100 367360 ns/op 224 B/op 3 allocs/op 175 | // BenchmarkOSRename-16 100 1353014 ns/op 571 B/op 5 allocs/op 176 | 177 | func BenchmarkOSCreateAndPreallocate(b *testing.B) { 178 | tmpDir, err := os.MkdirTemp("", "raft-wal-bench-*") 179 | require.NoError(b, err) 180 | defer os.RemoveAll(tmpDir) 181 | 182 | b.ResetTimer() 183 | for i := 0; i < b.N; i++ { 184 | fname := filepath.Join(tmpDir, fmt.Sprintf("test-%d.txt", i)) 185 | b.StartTimer() 186 | f, err := os.OpenFile(fname, os.O_CREATE|os.O_EXCL|os.O_RDWR, os.FileMode(0644)) 187 | if err != nil { 188 | panic(err) // require is kinda slow in benchmarks 189 | } 190 | err = fileutil.Preallocate(f, int64(64*1024*1024), true) 191 | if err != nil { 192 | panic(err) 193 | } 194 | b.StopTimer() 195 | f.Close() 196 | } 197 | } 198 | 199 | func BenchmarkOSRename(b *testing.B) { 200 | tmpDir, err := os.MkdirTemp("", "raft-wal-bench-*") 201 | require.NoError(b, err) 202 | defer os.RemoveAll(tmpDir) 203 | 204 | b.ResetTimer() 205 | for i := 0; i < b.N; i++ { 206 | tmpName := filepath.Join(tmpDir, fmt.Sprintf("%d.tmp", i%2)) 207 | // Create the tmp file outside timer loop to simulate it happening in the 208 | // background 209 | f, err := os.OpenFile(tmpName, os.O_CREATE|os.O_EXCL|os.O_RDWR, os.FileMode(0644)) 210 | require.NoError(b, err) 211 | f.Close() 212 | 213 | fname := filepath.Join(tmpDir, fmt.Sprintf("test-%d.txt", i)) 214 | b.StartTimer() 215 | // Note we are not fsyncing parent dir in either case 216 | err = os.Rename(tmpName, fname) 217 | if err != nil { 218 | panic(err) 219 | } 220 | b.StopTimer() 221 | } 222 | } 223 | -------------------------------------------------------------------------------- /alice/checker/main.go: -------------------------------------------------------------------------------- 1 | // Copyright IBM Corp. 
2 | // SPDX-License-Identifier: MPL-2.0
3 | 
4 | package main
5 | 
6 | import (
7 | "bufio"
8 | "bytes"
9 | "flag"
10 | "fmt"
11 | "log"
12 | "os"
13 | "regexp"
14 | "strconv"
15 | 
16 | "github.com/hashicorp/raft"
17 | wal "github.com/hashicorp/raft-wal"
18 | )
19 | 
20 | func main() {
21 | flag.Parse()
22 | 
23 | if err := run(flag.Arg(0), flag.Arg(1)); err != nil {
24 | log.Fatal(err)
25 | }
26 | }
27 | 
28 | var re = regexp.MustCompile(`(\w+)=(\d+)`)
29 | 
30 | type runSummary struct {
31 | lastCommit uint64
32 | truncatedAfter uint64
33 | truncatedBefore uint64
34 | willTruncateAfter uint64
35 | willTruncateBefore uint64
36 | truncatedEntriesMaybeAfter uint64
37 | willTruncateHead bool
38 | willTruncateTail bool
39 | truncatedHead bool
40 | truncatedTail bool
41 | }
42 | 
43 | func readStdoutFile(stdoutFile string) (runSummary, error) {
44 | var sum runSummary
45 | 
46 | stdout, err := os.Open(stdoutFile)
47 | if err != nil {
48 | return sum, err
49 | }
50 | defer stdout.Close()
51 | 
52 | scanner := bufio.NewScanner(stdout)
53 | for scanner.Scan() {
54 | line := scanner.Text()
55 | if len(line) == 0 {
56 | continue
57 | }
58 | matches := re.FindStringSubmatch(line)
59 | if matches != nil {
60 | n, err := strconv.Atoi(matches[2])
61 | if err != nil {
62 | return sum, err
63 | }
64 | switch matches[1] {
65 | case "commitIdx":
66 | sum.lastCommit = uint64(n)
67 | case "truncatedBefore":
68 | sum.truncatedHead = true
69 | if n > int(sum.truncatedBefore) {
70 | sum.truncatedBefore = uint64(n)
71 | sum.truncatedEntriesMaybeAfter = sum.lastCommit
72 | }
73 | case "truncatedAfter":
74 | sum.truncatedTail = true
75 | if int(sum.truncatedAfter) == 0 || n < int(sum.truncatedAfter) {
76 | sum.truncatedAfter = uint64(n)
77 | sum.truncatedEntriesMaybeAfter = sum.truncatedAfter
78 | }
79 | if n < int(sum.lastCommit) {
80 | sum.lastCommit = uint64(n)
81 | }
82 | case "willTruncateAfter":
83 | sum.willTruncateAfter = uint64(n)
84 | sum.willTruncateTail = true
85 | sum.truncatedEntriesMaybeAfter = sum.willTruncateAfter
86 | case "willTruncateBefore":
87 | sum.willTruncateBefore = uint64(n)
88 | sum.willTruncateHead = true
89 | sum.truncatedEntriesMaybeAfter = sum.lastCommit
90 | default:
91 | // Skip unknown output KVs.
92 | }
93 | continue
94 | }
95 | return sum, fmt.Errorf("unrecognizable output line: %s", line)
96 | }
97 | return sum, nil
98 | }
99 | 
100 | func validateFirst(first uint64, expect runSummary) error {
101 | switch {
102 | case expect.willTruncateHead:
103 | if expect.truncatedHead {
104 | // We actually completed the truncation. First must now be the new index.
105 | if first != expect.truncatedBefore {
106 | return fmt.Errorf("Expected first to be %d after truncation. Got %d",
107 | expect.truncatedBefore, first)
108 | }
109 | 
110 | } else {
111 | // Not sure if the truncation completed or not, so allow either value.
112 | if first != 1 && first != expect.willTruncateBefore {
113 | return fmt.Errorf("Expected first to be 1 before truncation, %d after. Got %d",
114 | expect.willTruncateBefore, first)
115 | }
116 | }
117 | 
118 | case expect.willTruncateTail && expect.willTruncateAfter == 0:
119 | // Special case of an "everything" truncation, which is modelled as a tail
120 | // truncation (after=0). In this case first will either be 1 before, 0 right
121 | // after truncation, or 1 again after the next append.
122 | if first != 0 && first != 1 {
123 | return fmt.Errorf("Expected first to be 1 before truncation, 0 after or 1 after the next append. Got %d",
Got %d", 124 | first) 125 | } 126 | 127 | default: 128 | // No head truncations can have started yet. 129 | if first != 1 && first != 0 { 130 | return fmt.Errorf("Want first=1 or first=0 (if no writes yet) before any truncation. Got %d", first) 131 | } 132 | } 133 | return nil 134 | } 135 | 136 | func validateLast(last uint64, expect runSummary) error { 137 | switch { 138 | case expect.willTruncateTail: 139 | if expect.truncatedTail { 140 | // We actually completed the truncation. Last must now be the new index, 141 | // or the subsequent write if that's higher. 142 | if last != expect.truncatedAfter && last != expect.truncatedAfter+1 { 143 | return fmt.Errorf("Expected last to be %d after truncation or %d after subsequent append. Got %d", 144 | expect.truncatedAfter, expect.truncatedAfter+1, last) 145 | } 146 | 147 | } else { 148 | // Not sure if truncation completed or not so allow any last value greater 149 | // than the truncate after target (since we know the workload always 150 | // truncates after an index lower than commitIdx). 151 | if last < expect.willTruncateAfter { 152 | return fmt.Errorf("Expected last to be >= %d after before or after truncation. Got %d", 153 | expect.willTruncateAfter, last) 154 | } 155 | } 156 | 157 | default: 158 | // No tail truncations can have started yet. Just ensure we have everything committed. 159 | if last < expect.lastCommit { 160 | return fmt.Errorf("Want last >= lastCommit. Lost committed writes! last=%d commitIdx=%d", last, expect.lastCommit) 161 | } 162 | } 163 | return nil 164 | } 165 | 166 | func run(dir string, stdoutFile string) error { 167 | w, err := wal.Open(dir, wal.WithSegmentSize(32*1024)) 168 | if err != nil { 169 | return err 170 | } 171 | 172 | // Find the expected committed range 173 | expect, err := readStdoutFile(stdoutFile) 174 | if err != nil { 175 | return err 176 | } 177 | 178 | first, err := w.FirstIndex() 179 | if err != nil { 180 | return err 181 | } 182 | last, err := w.LastIndex() 183 | if err != nil { 184 | return err 185 | } 186 | 187 | if err := validateFirst(first, expect); err != nil { 188 | return err 189 | } 190 | if err := validateLast(last, expect); err != nil { 191 | return err 192 | } 193 | 194 | fmt.Printf("Found first=%d last=%d expected %v\n", first, last, expect) 195 | 196 | var i uint64 197 | var l raft.Log 198 | for i = first; i <= last; i++ { 199 | if i == 0 { 200 | // Everything was truncated so nothing to read! 201 | continue 202 | } 203 | if err := w.GetLog(i, &l); err != nil { 204 | return fmt.Errorf("error reading log [%d/%d] - %v", i, last, err) 205 | } 206 | // Verify contents match 207 | validPrefixes := []string{fmt.Sprintf("%03d|", i)} 208 | if (expect.willTruncateHead || expect.willTruncateTail) && i > expect.truncatedEntriesMaybeAfter { 209 | // If we will truncate but didn't yet either outcome is possible so 210 | // include both viable options. 211 | validPrefixes = append(validPrefixes, "Post Truncate Entry") 212 | } 213 | if (expect.truncatedTail || expect.truncatedHead) && i > expect.truncatedEntriesMaybeAfter { 214 | // Truncate completed so the original payload is no longer possible 215 | validPrefixes = validPrefixes[1:] 216 | } 217 | 218 | valid := false 219 | for _, vp := range validPrefixes { 220 | if bytes.HasPrefix(l.Data, []byte(vp)) { 221 | valid = true 222 | break 223 | } 224 | } 225 | if !valid { 226 | return fmt.Errorf("entry %d has unexpected payload. 
227 | i, validPrefixes, string(l.Data))
228 | }
229 | }
230 | 
231 | log.Printf("OK!\n")
232 | return nil
233 | }
234 | 
--------------------------------------------------------------------------------
/state.go:
--------------------------------------------------------------------------------
1 | // Copyright IBM Corp. 2020, 2025
2 | // SPDX-License-Identifier: MPL-2.0
3 | 
4 | package wal
5 | 
6 | import (
7 | "sync/atomic"
8 | 
9 | "github.com/benbjohnson/immutable"
10 | "github.com/hashicorp/raft-wal/types"
11 | )
12 | 
13 | // state is an immutable snapshot of the state of the log. Modifications must be
14 | // made by copying and modifying the copy. This is easy enough because segments
15 | // is an immutable map so changing and re-assigning to the clone won't impact
16 | // the original map, and tail is just a pointer that can be mutated in the
17 | // shallow clone. Note that methods called on the tail segmentWriter may mutate
18 | // its state, so they must only be called while holding the WAL's writeLock.
19 | type state struct {
20 | // refCount tracks readers that are reading segments based on this metadata.
21 | // It is accessed atomically and must be 64-bit aligned (i.e. leave it at the
22 | // start of the struct).
23 | refCount int32
24 | // finalizer is set at most once while the WAL is holding the write lock in
25 | // order to provide a func that must be called when all current readers are
26 | // done with this state. It's used for deferring closing and deleting old
27 | // segments until we can be sure no reads are still in progress on them.
28 | finalizer atomic.Value // func()
29 | 
30 | nextSegmentID uint64
31 | 
32 | // nextBaseIndex is used to signal which baseIndex to use next if there are no
33 | // segments or current tail.
34 | nextBaseIndex uint64
35 | segments *immutable.SortedMap[uint64, segmentState]
36 | tail types.SegmentWriter
37 | }
38 | 
39 | type segmentState struct {
40 | types.SegmentInfo
41 | 
42 | // r is the SegmentReader for our in-memory state.
43 | r types.SegmentReader
44 | }
45 | 
46 | // Persistent converts the in-memory state into a PersistentState.
47 | func (s *state) Persistent() types.PersistentState {
48 | segs := make([]types.SegmentInfo, 0, s.segments.Len())
49 | it := s.segments.Iterator()
50 | for !it.Done() {
51 | _, s, _ := it.Next()
52 | segs = append(segs, s.SegmentInfo)
53 | }
54 | return types.PersistentState{
55 | NextSegmentID: s.nextSegmentID,
56 | Segments: segs,
57 | }
58 | }
59 | 
60 | func (s *state) getLog(index uint64) (*types.PooledBuffer, error) {
61 | // Check the tail writer first.
62 | if s.tail != nil {
63 | raw, err := s.tail.GetLog(index)
64 | if err != nil && err != ErrNotFound {
65 | // Return real errors; swallowing them could mask the fact that the index
66 | // really is in the tail but failed to read for some other reason.
67 | return nil, err
68 | }
69 | if err == nil {
70 | // No error means we found it and just need to decode.
71 | return raw, nil
72 | }
73 | // Not in the tail segment, fall back to searching previous segments.
74 | }
75 | 
76 | seg, err := s.findSegmentReader(index)
77 | if err != nil {
78 | return nil, err
79 | }
80 | 
81 | return seg.GetLog(index)
82 | }
83 | 
84 | // findSegmentReader searches the segment tree for the segment that contains the
85 | // log at index idx. It may return the tail segment, which may not in fact
86 | // contain idx if idx is larger than the last written index. Typically this is
87 | // called after already checking with the tail writer whether the log is in
88 | // there, which means the caller can be sure it's not going to return the tail
89 | // segment.
90 | func (s *state) findSegmentReader(idx uint64) (types.SegmentReader, error) {
91 | 
92 | if s.segments.Len() == 0 {
93 | return nil, ErrNotFound
94 | }
95 | 
96 | // Search for a segment with baseIndex.
97 | it := s.segments.Iterator()
98 | 
99 | // The baseIndex we want is the first one lower than or equal to idx. Seek
100 | // gets us to the first result equal or greater, so we are either at it (if
101 | // equal) or on the one _after_ the one we need, hence we step back.
102 | it.Seek(idx)
103 | // The first call to Next/Prev actually returns the node the iterator is
104 | // currently on (which is probably the one after the one we want) but in some
105 | // edge cases we might actually want this one. Rather than reversing back and
106 | // coming forward again, just check both this and the one before it.
107 | _, seg, ok := it.Prev()
108 | if ok && seg.BaseIndex > idx {
109 | _, seg, ok = it.Prev()
110 | }
111 | 
112 | // We either have the right segment or it doesn't exist.
113 | if ok && seg.MinIndex <= idx && (seg.MaxIndex == 0 || seg.MaxIndex >= idx) {
114 | return seg.r, nil
115 | }
116 | 
117 | return nil, ErrNotFound
118 | }
119 | 
120 | func (s *state) getTailInfo() *segmentState {
121 | it := s.segments.Iterator()
122 | it.Last()
123 | _, tail, ok := it.Next()
124 | if !ok {
125 | return nil
126 | }
127 | return &tail
128 | }
129 | 
130 | func (s *state) append(entries []types.LogEntry) error {
131 | return s.tail.Append(entries)
132 | }
133 | 
134 | func (s *state) firstIndex() uint64 {
135 | it := s.segments.Iterator()
136 | _, seg, ok := it.Next()
137 | if !ok {
138 | return 0
139 | }
140 | if seg.SealTime.IsZero() {
141 | // The first segment is unsealed so it is also the tail. Check it actually
142 | // has at least one log in it, otherwise it doesn't matter what the
143 | // BaseIndex/MinIndex are.
144 | if s.tail.LastIndex() == 0 {
145 | // No logs in the WAL.
146 | return 0
147 | }
148 | // At least one log exists, return the MinIndex.
149 | }
150 | return seg.MinIndex
151 | }
152 | 
153 | func (s *state) lastIndex() uint64 {
154 | tailIdx := s.tail.LastIndex()
155 | if tailIdx > 0 {
156 | return tailIdx
157 | }
158 | // The current tail is empty. Check whether there are previous sealed segments.
159 | it := s.segments.Iterator()
160 | it.Last()
161 | _, _, ok := it.Prev()
162 | if !ok {
163 | // No tail! Shouldn't be possible, but it means there are no logs yet.
164 | return 0
165 | }
166 | // Go back to the segment before the tail.
167 | _, _, ok = it.Prev()
168 | if !ok {
169 | // No previous segment so the whole log is empty.
170 | return 0
171 | }
172 | 
173 | // There was a previous segment, so its MaxIndex will be one less than the
174 | // tail's BaseIndex.
175 | tailSeg := s.getTailInfo()
176 | if tailSeg == nil || tailSeg.BaseIndex == 0 {
177 | return 0
178 | }
179 | return tailSeg.BaseIndex - 1
180 | }
181 | 
182 | func (s *state) acquire() func() {
183 | atomic.AddInt32(&s.refCount, 1)
184 | return s.release
185 | }
186 | 
187 | func (s *state) release() {
188 | // Decrement on release.
189 | count := atomic.AddInt32(&s.refCount, -1)
190 | if count == 0 {
191 | // Clean up state associated with this version now that all refs have gone.
192 | // Since there are no more refs and we should not set a finalizer until this
193 | // state is no longer the active state, we can be sure this will happen only once.
194 | // Even so, let's swap the fn to ensure we only ever call the finalizer once!
195 | // We can't swap an actual nil as it's not the same type as func(), so we do
196 | // a dance with a nilFn below.
197 | var nilFn func()
198 | fnRaw := s.finalizer.Swap(nilFn)
199 | if fn, ok := fnRaw.(func()); ok && fn != nil {
200 | fn()
201 | }
202 | }
203 | }
204 | 
205 | // clone returns a new state which is a shallow copy of just the immutable parts
206 | // of s. This is safer than a simple assignment copy because that "reads" the
207 | // atomically modified state non-atomically. We never want to copy the refCount
208 | // or finalizer anyway.
209 | func (s *state) clone() state {
210 | return state{
211 | nextSegmentID: s.nextSegmentID,
212 | segments: s.segments,
213 | tail: s.tail,
214 | }
215 | }
216 | 
--------------------------------------------------------------------------------
/types/segment.go:
--------------------------------------------------------------------------------
1 | // Copyright IBM Corp. 2020, 2025
2 | // SPDX-License-Identifier: MPL-2.0
3 | 
4 | package types
5 | 
6 | import (
7 | "io"
8 | "time"
9 | )
10 | 
11 | // SegmentInfo is the metadata describing a single WAL segment.
12 | type SegmentInfo struct {
13 | // ID uniquely identifies this segment file.
14 | ID uint64
15 | 
16 | // BaseIndex is the raft index of the first entry that will be written to the
17 | // segment.
18 | BaseIndex uint64
19 | 
20 | // MinIndex is the logical lowest index that still exists in the segment. It
21 | // may be greater than BaseIndex if a head truncation has "deleted" a prefix
22 | // of the segment.
23 | MinIndex uint64
24 | 
25 | // MaxIndex is the logical highest index that still exists in the segment. It
26 | // may be lower than the actual highest index if a tail truncation has
27 | // "deleted" a suffix of the segment. It is zero for unsealed segments and
28 | // only set on seal.
29 | MaxIndex uint64
30 | 
31 | // Codec identifies the codec used to encode log entries. Codec values 0 to
32 | // 16k are reserved for internal future usage. Custom codecs must be
33 | // registered with an identifier higher than this, which the caller is
34 | // responsible for ensuring uniquely identifies the specific version of their
35 | // codec used in any given log. uint64 provides sufficient space that a
36 | // randomly generated identifier is almost certainly unique.
37 | Codec uint64
38 | 
39 | // IndexStart is the file offset where the index can be read from. It's 0 for
40 | // tail segments and only set after a segment is sealed.
41 | IndexStart uint64
42 | 
43 | // CreateTime records when the segment was first created.
44 | CreateTime time.Time
45 | 
46 | // SealTime records when the segment was sealed. Zero indicates that it's not
47 | // sealed yet.
48 | SealTime time.Time
49 | 
50 | // SizeLimit is the soft limit for the segment's size. The segment file may be
51 | // pre-allocated to this size on filesystems that support it. It is a soft
52 | // limit in the sense that the final Append usually takes the segment file
53 | // past this size before it is considered full and sealed.
54 | SizeLimit uint32
55 | }
56 | 
57 | // SegmentFiler is the interface that provides access to segments to the WAL. It
58 | // encapsulates creating and recovering segments and returning reader or writer
59 | // interfaces to interact with them. Its main purpose is to abstract the core
60 | // WAL logic from the actual encoding layer of segment files. You can think
61 | // of it as a layer of abstraction above the VFS, which abstracts actual file
62 | // system operations on files but knows nothing about the format. In tests, for
63 | // example, we can implement a SegmentFiler that is way simpler than the real
64 | // encoding/decoding layer on top of a VFS - even an in-memory VFS - which makes
65 | // tests much simpler to write and run.
66 | type SegmentFiler interface {
67 | // Create adds a new segment with the given info and returns a writer or an
68 | // error.
69 | Create(info SegmentInfo) (SegmentWriter, error)
70 | 
71 | // RecoverTail is called on an unsealed segment when re-opening the WAL; it
72 | // will attempt to recover from a possible crash. It will either return an
73 | // error, or return a valid segmentWriter that is ready for further appends.
74 | // If the expected tail segment doesn't exist it must return an error wrapping
75 | // os.ErrNotExist.
76 | RecoverTail(info SegmentInfo) (SegmentWriter, error)
77 | 
78 | // Open opens an already sealed segment for reading. Open may validate the
79 | // file's header and return an error if it doesn't match the expected info.
80 | Open(info SegmentInfo) (SegmentReader, error)
81 | 
82 | // List returns the set of segment IDs currently stored. It's used by the WAL
83 | // on recovery to find any segment files that need to be deleted following an
84 | // unclean shutdown. The returned map is a map of ID -> BaseIndex. BaseIndex
85 | // is returned to allow subsequent Delete calls to be made.
86 | List() (map[uint64]uint64, error)
87 | 
88 | // Delete removes the segment with the given baseIndex and id if it exists. Note
89 | // that baseIndex is technically redundant since ID is unique on its own. But
90 | // in practice we name files (or keys) with both so that they sort correctly.
91 | // This interface allows a simpler implementation where we can just delete
92 | // the file if it exists without having to scan the underlying storage for a match.
93 | Delete(baseIndex, ID uint64) error
94 | }
95 | 
96 | // SegmentWriter manages appending logs to the tail segment of the WAL. It's an
97 | // interface to make testing the core WAL simpler. Every SegmentWriter will have
98 | // either `init` or `recover` called once before any other methods. When either
99 | // returns, it must either return an error or be ready to accept new writes and
100 | // reads.
101 | type SegmentWriter interface {
102 | io.Closer
103 | SegmentReader
104 | 
105 | // Append adds one or more entries. It must not return until the entries are
106 | // durably stored, otherwise raft's guarantees will be compromised. Append must
107 | // not be called concurrently with any other call to Sealed, Append or
108 | // ForceSeal.
109 | Append(entries []LogEntry) error
110 | 
111 | // Sealed returns whether the segment is sealed or not. If it is, it returns
112 | // true and the file offset at which its index array starts, to be saved in
113 | // metadata. The WAL will call this after every append so it should be relatively
114 | // cheap in the common case. This design allows the final Append to write out
115 | // the index or any additional data needed at seal time in the same fsync.
116 | // Sealed must not be called concurrently with any other call to Sealed,
117 | // Append or ForceSeal.
118 | Sealed() (bool, uint64, error)
119 | 
120 | // ForceSeal causes the segment to become sealed by writing out an index
121 | // block. This is not used in the typical flow of append and rotation, but is
122 | // necessary during truncations where some suffix of the writer needs to be
123 | // truncated. Rather than manipulate what is on disk in a complex way, the WAL
124 | // will simply force seal it with whatever state it has already saved and then
125 | // open a new segment at the right offset for continued writing. ForceSeal may
126 | // be called on a segment that has already been sealed and should just return
127 | // the existing index offset in that case. (We don't actually rely on that
128 | // currently but it's easier not to assume we'll always call it at most once.)
129 | // ForceSeal must not be called concurrently with any other call to Sealed,
130 | // Append or ForceSeal.
131 | ForceSeal() (uint64, error)
132 | 
133 | // LastIndex returns the most recently persisted index in the log. It must
134 | // respond without blocking on Append since it's needed frequently by read
135 | // paths that may call it concurrently. Typically this will be loaded from an
136 | // atomic int. If the segment is empty, LastIndex should return zero.
137 | LastIndex() uint64
138 | }
139 | 
140 | // SegmentReader wraps a ReadableFile to allow lookup of logs in an existing
141 | // segment file. It's an interface to make testing the core WAL simpler. The first
142 | // call will always be validate, which passes in the ReaderAt to be used for
143 | // subsequent reads.
144 | type SegmentReader interface {
145 | io.Closer
146 | 
147 | // GetLog returns the raw log entry bytes associated with idx. If the log
148 | // doesn't exist in this segment, ErrNotFound must be returned.
149 | GetLog(idx uint64) (*PooledBuffer, error)
150 | }
151 | 
--------------------------------------------------------------------------------
/integration/integration_test.go:
--------------------------------------------------------------------------------
1 | // Copyright IBM Corp. 2020, 2025
2 | // SPDX-License-Identifier: MPL-2.0
3 | 
4 | package integration
5 | 
6 | import (
7 | "bytes"
8 | "fmt"
9 | "os"
10 | "strings"
11 | "testing"
12 | 
13 | "github.com/hashicorp/raft"
14 | wal "github.com/hashicorp/raft-wal"
15 | "github.com/hashicorp/raft-wal/metadb"
16 | "github.com/stretchr/testify/require"
17 | )
18 | 
19 | type step func(w *wal.WAL) error
20 | 
21 | func TestIntegrationScenarios(t *testing.T) {
22 | cases := []struct {
23 | name string
24 | steps []step
25 | expectFirstIdx, expectLastIdx int
26 | expectNumSegments int
27 | }{
28 | {
29 | name: "basic creation, appends, rotation",
30 | steps: []step{
31 | // ~256 bytes plus overhead per log; we want to write more than the 4K
32 | // segment size. Batches of 4 are ~1k, so 5 batches is enough to rotate once.
33 | appendLogsInBatches(5, 4),
34 | },
35 | expectFirstIdx: 1,
36 | expectLastIdx: 20,
37 | expectNumSegments: 2,
38 | },
39 | {
40 | name: "starting at high index, appends, rotation",
41 | steps: []step{
42 | appendFirstLogAt(1_000_000),
43 | // ~256 bytes plus overhead per log; we want to write more than the 4K
44 | // segment size. Batches of 4 are ~1k, so 5 batches is enough to rotate once.
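// (A starting index of 1,000,000 also exercises segments whose BaseIndex is
// far from 1, so nothing below can accidentally assume logs begin at index 1.)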
45 | appendLogsInBatches(5, 4),
46 | },
47 | expectFirstIdx: 1_000_000,
48 | expectLastIdx: 1_000_020,
49 | expectNumSegments: 2,
50 | },
51 | {
52 | name: "head truncation deleting no files",
53 | steps: []step{
54 | appendLogsInBatches(11, 4),
55 | deleteRange(1, 2),
56 | },
57 | expectFirstIdx: 3,
58 | expectLastIdx: 44,
59 | expectNumSegments: 3,
60 | },
61 | {
62 | name: "head truncation deleting multiple files",
63 | steps: []step{
64 | appendLogsInBatches(11, 4),
65 | deleteRange(1, 20),
66 | },
67 | expectFirstIdx: 21,
68 | expectLastIdx: 44,
69 | expectNumSegments: 2,
70 | },
71 | {
72 | name: "tail truncation in active segment",
73 | steps: []step{
74 | appendLogsInBatches(11, 4),
75 | deleteRange(44, 44), // Delete just the last log.
76 | },
77 | expectFirstIdx: 1,
78 | expectLastIdx: 43,
79 | expectNumSegments: 4,
80 | },
81 | {
82 | name: "tail truncation in active segment and write more",
83 | steps: []step{
84 | appendLogsInBatches(11, 4),
85 | deleteRange(44, 44), // Delete just the last log.
86 | appendLogsInBatches(1, 4),
87 | },
88 | expectFirstIdx: 1,
89 | expectLastIdx: 47,
90 | expectNumSegments: 4,
91 | },
92 | {
93 | name: "tail truncation deleting files",
94 | steps: []step{
95 | appendLogsInBatches(11, 4),
96 | deleteRange(20, 44),
97 | },
98 | expectFirstIdx: 1,
99 | expectLastIdx: 19,
100 | // Only need 2 segments but the truncation will rotate to a new tail.
101 | expectNumSegments: 3,
102 | },
103 | {
104 | name: "tail truncation deleting files and write more",
105 | steps: []step{
106 | appendLogsInBatches(11, 4),
107 | deleteRange(20, 44),
108 | appendLogsInBatches(1, 4),
109 | },
110 | expectFirstIdx: 1,
111 | expectLastIdx: 23,
112 | // Only need 2 segments but the truncation will rotate to a new tail.
113 | expectNumSegments: 3,
114 | },
115 | {
116 | name: "write some logs, truncate everything, restart logs from different index",
117 | steps: []step{
118 | appendLogsInBatches(11, 4),
119 | deleteRange(1, 44),
120 | appendFirstLogAt(1000),
121 | appendLogsInBatches(1, 4),
122 | },
123 | expectFirstIdx: 1000,
124 | expectLastIdx: 1004,
125 | expectNumSegments: 1,
126 | },
127 | }
128 | 
129 | for _, tc := range cases {
130 | tc := tc
131 | t.Run(tc.name, func(t *testing.T) {
132 | t.Parallel()
133 | 
134 | tmpDir, err := os.MkdirTemp("", tc.name)
135 | require.NoError(t, err)
136 | defer os.RemoveAll(tmpDir)
137 | 
138 | // Wrap the BoltDB meta store so we can peek into its values.
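// PeekingMetaStore is assumed to be the test helper in integration/meta.go
// from the tree above; it should wrap any types.MetaStore and expose
// PeekState so tests can assert on the most recently committed segment
// metadata.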
139 | meta := &PeekingMetaStore{ 140 | meta: &metadb.BoltMetaDB{}, 141 | } 142 | 143 | w, err := wal.Open(tmpDir, 144 | // 4k segments to test rotation quicker 145 | wal.WithSegmentSize(4096), 146 | wal.WithMetaStore(meta), 147 | ) 148 | require.NoError(t, err) 149 | 150 | // Execute initial operations 151 | for i, step := range tc.steps { 152 | require.NoError(t, step(w), "failed on step %d", i) 153 | } 154 | 155 | // Assert expected properties 156 | assertLogContents(t, w, tc.expectFirstIdx, tc.expectLastIdx) 157 | assertNumSegments(t, meta, tmpDir, tc.expectNumSegments) 158 | 159 | // Close WAL and re-open 160 | require.NoError(t, w.Close()) 161 | 162 | meta2 := &PeekingMetaStore{ 163 | meta: &metadb.BoltMetaDB{}, 164 | } 165 | 166 | w2, err := wal.Open(tmpDir, 167 | wal.WithSegmentSize(4096), 168 | wal.WithMetaStore(meta2), 169 | ) 170 | require.NoError(t, err) 171 | defer w2.Close() 172 | 173 | // Assert expected properties still hold 174 | assertLogContents(t, w2, tc.expectFirstIdx, tc.expectLastIdx) 175 | assertNumSegments(t, meta2, tmpDir, tc.expectNumSegments) 176 | }) 177 | } 178 | } 179 | 180 | func appendLogsInBatches(nBatches, nPerBatch int) step { 181 | return func(w *wal.WAL) error { 182 | lastIdx, err := w.LastIndex() 183 | if err != nil { 184 | return err 185 | } 186 | nextIdx := lastIdx + 1 187 | 188 | return appendLogsInBatchesStartingAt(w, nBatches, nPerBatch, int(nextIdx)) 189 | } 190 | } 191 | 192 | func appendFirstLogAt(index int) step { 193 | return func(w *wal.WAL) error { 194 | return appendLogsInBatchesStartingAt(w, 1, 1, index) 195 | } 196 | } 197 | 198 | func appendLogsInBatchesStartingAt(w *wal.WAL, nBatches, nPerBatch, firstIndex int) error { 199 | nextIdx := uint64(firstIndex) 200 | 201 | batch := make([]*raft.Log, 0, nPerBatch) 202 | for b := 0; b < nBatches; b++ { 203 | for i := 0; i < nPerBatch; i++ { 204 | log := raft.Log{ 205 | Index: nextIdx, 206 | Data: makeValue(nextIdx), 207 | } 208 | batch = append(batch, &log) 209 | nextIdx++ 210 | } 211 | if err := w.StoreLogs(batch); err != nil { 212 | return err 213 | } 214 | batch = batch[:0] 215 | } 216 | return nil 217 | } 218 | 219 | func makeValue(n uint64) []byte { 220 | // Values are 16 repetitions of a 16 byte string based on the index so 256 221 | // bytes total. 
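// For example, n=5 yields "val-00000000005\n" (16 bytes) repeated 16 times.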
222 | return bytes.Repeat([]byte(fmt.Sprintf("val-%011d\n", n)), 16)
223 | }
224 | 
225 | func deleteRange(min, max int) step {
226 | return func(w *wal.WAL) error {
227 | return w.DeleteRange(uint64(min), uint64(max))
228 | }
229 | }
230 | 
231 | func assertLogContents(t *testing.T, w *wal.WAL, first, last int) {
232 | t.Helper()
233 | 
234 | firstIdx, err := w.FirstIndex()
235 | require.NoError(t, err)
236 | lastIdx, err := w.LastIndex()
237 | require.NoError(t, err)
238 | 
239 | require.Equal(t, first, int(firstIdx))
240 | require.Equal(t, last, int(lastIdx))
241 | 
242 | var log raft.Log
243 | for i := first; i <= last; i++ {
244 | err := w.GetLog(uint64(i), &log)
245 | require.NoError(t, err, "log index %d", i)
246 | require.Equal(t, i, int(log.Index), "log index %d", i)
247 | require.Equal(t, string(makeValue(log.Index)), string(log.Data), "log index %d", i)
248 | }
249 | }
250 | 
251 | func assertNumSegments(t *testing.T, meta *PeekingMetaStore, dir string, numSegments int) {
252 | t.Helper()
253 | 
254 | state := meta.PeekState()
255 | require.Equal(t, numSegments, len(state.Segments))
256 | 
257 | // Check the right number of segment files is on disk too.
258 | des, err := os.ReadDir(dir)
259 | require.NoError(t, err)
260 | 
261 | segFiles := make([]string, 0, numSegments)
262 | for _, de := range des {
263 | if de.IsDir() {
264 | continue
265 | }
266 | if strings.HasSuffix(de.Name(), ".wal") {
267 | segFiles = append(segFiles, de.Name())
268 | }
269 | }
270 | require.Equal(t, numSegments, len(segFiles), "expected %d segment files, got %v", numSegments, segFiles)
271 | }
--------------------------------------------------------------------------------
/metadb/metadb.go:
--------------------------------------------------------------------------------
1 | // Copyright IBM Corp. 2020, 2025
2 | // SPDX-License-Identifier: MPL-2.0
3 | 
4 | package metadb
5 | 
6 | import (
7 | "encoding/json"
8 | "errors"
9 | "fmt"
10 | "os"
11 | "path/filepath"
12 | 
13 | "github.com/hashicorp/raft-wal/types"
14 | "go.etcd.io/bbolt"
15 | )
16 | 
17 | const (
18 | // FileName is the default file name for the bolt db file.
19 | FileName = "wal-meta.db"
20 | 
21 | // *Bucket are the names used for internal bolt buckets.
22 | MetaBucket = "wal-meta"
23 | StableBucket = "stable"
24 | 
25 | // We just need one key for now so use the byte 'm' for meta arbitrarily.
26 | MetaKey = "m"
27 | )
28 | 
29 | var (
30 | // ErrUnintialized is returned when any call is made before Load has opened
31 | // the DB file.
32 | ErrUnintialized = errors.New("uninitialized")
33 | )
34 | 
35 | // BoltMetaDB implements types.MetaStore using BoltDB as a reliable persistent
36 | // store. See the repo README for the reasons for this design choice and its
37 | // performance implications.
38 | type BoltMetaDB struct {
39 | dir string
40 | db *bbolt.DB
41 | }
42 | 
43 | func (db *BoltMetaDB) ensureOpen(dir string) error {
44 | if db.dir != "" && db.dir != dir {
45 | return fmt.Errorf("can't load dir %s, already open in dir %s", dir, db.dir)
46 | }
47 | if db.db != nil {
48 | return nil
49 | }
50 | 
51 | fileName := filepath.Join(dir, FileName)
52 | 
53 | open := func() error {
54 | bb, err := bbolt.Open(fileName, 0644, nil)
55 | if err != nil {
56 | return fmt.Errorf("failed to open %s: %w", FileName, err)
57 | }
58 | db.db = bb
59 | db.dir = dir
60 | return nil
61 | }
62 | 
63 | // BoltDB can get stuck in invalid states if we crash while it's initializing.
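// (For example, a crash after the DB file was created but before the initial
// buckets were committed.)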
64 | // We can't distinguish those as safe to just wipe the file and start again,
65 | // because we don't know for sure if it's failing due to bad init or later
66 | // corruption (which would lose data if we just wiped and started over). So to
67 | // ensure initial creation of the WAL is as crash-safe as possible, we use a
68 | // manual, atomic init procedure:
69 | // 1. Check if the file exists already. If yes, skip init and just open it.
70 | // 2. Delete any existing DB file with the tmp name.
71 | // 3. Create a new BoltDB that is empty and has the buckets, under a tmp name.
72 | // 4. Once that's committed, rename to the final name and fsync the parent dir.
73 | _, err := os.Stat(fileName)
74 | if err == nil {
75 | // File exists, just open it.
76 | return open()
77 | }
78 | if !errors.Is(err, os.ErrNotExist) {
79 | // Unknown err, just return that.
80 | return fmt.Errorf("failed to stat %s: %w", FileName, err)
81 | }
82 | 
83 | // File doesn't exist, initialize a new DB in a crash-safe way.
84 | if err := safeInitBoltDB(dir); err != nil {
85 | return fmt.Errorf("failed initializing meta DB: %w", err)
86 | }
87 | 
88 | // All good, now open it!
89 | return open()
90 | }
91 | 
92 | func safeInitBoltDB(dir string) error {
93 | tmpFileName := filepath.Join(dir, FileName+".tmp")
94 | 
95 | // Delete any old attempts to init that were unsuccessful.
96 | if err := os.RemoveAll(tmpFileName); err != nil {
97 | return err
98 | }
99 | 
100 | // Open a bolt DB at the tmp file name.
101 | bb, err := bbolt.Open(tmpFileName, 0644, nil)
102 | if err != nil {
103 | return err
104 | }
105 | 
106 | tx, err := bb.Begin(true)
107 | if err != nil {
108 | return err
109 | }
110 | // Only defer the rollback once we know tx is non-nil.
111 | defer tx.Rollback()
112 | _, err = tx.CreateBucket([]byte(MetaBucket))
113 | if err != nil {
114 | return err
115 | }
116 | _, err = tx.CreateBucket([]byte(StableBucket))
117 | if err != nil {
118 | return err
119 | }
120 | if err := tx.Commit(); err != nil {
121 | return err
122 | }
123 | // Close the file ready to rename into place and re-open. This probably isn't
124 | // necessary but it makes it easier to reason about this code path being
125 | // totally separate from the common case.
126 | if err := bb.Close(); err != nil {
127 | return err
128 | }
129 | 
130 | // We created the DB OK. Now rename it to the final name.
131 | if err := os.Rename(tmpFileName, filepath.Join(dir, FileName)); err != nil {
132 | return err
133 | }
134 | 
135 | // And fsync the parent dir to make sure the new file with its new name
136 | // is persisted!
137 | dirF, err := os.Open(dir)
138 | if err != nil {
139 | return err
140 | }
141 | err = dirF.Sync()
142 | closeErr := dirF.Close()
143 | if err != nil {
144 | return err
145 | }
146 | return closeErr
147 | }
148 | 
149 | // Load loads the existing persisted state. If there is no existing state,
150 | // implementations are expected to initialize new storage and return an
151 | // empty state.
152 | func (db *BoltMetaDB) Load(dir string) (types.PersistentState, error) {
153 | var state types.PersistentState
154 | 
155 | if err := db.ensureOpen(dir); err != nil {
156 | return state, err
157 | }
158 | 
159 | tx, err := db.db.Begin(false)
160 | if err != nil {
161 | return state, err
162 | }
163 | defer tx.Rollback()
164 | meta := tx.Bucket([]byte(MetaBucket))
165 | 
166 | // We just need one key for now so use the byte 'm' for meta arbitrarily.
167 | raw := meta.Get([]byte(MetaKey))
168 | if raw == nil {
169 | // This is valid: it's an "empty" log that will be initialized by the WAL.
170 | return state, nil 171 | } 172 | 173 | if err := json.Unmarshal(raw, &state); err != nil { 174 | return state, fmt.Errorf("%w: failed to parse persisted state: %s", types.ErrCorrupt, err) 175 | } 176 | return state, nil 177 | } 178 | 179 | // CommitState must atomically replace all persisted metadata in the current 180 | // store with the set provided. It must not return until the data is persisted 181 | // durably and in a crash-safe way otherwise the guarantees of the WAL will be 182 | // compromised. The WAL will only ever call this in a single thread at one 183 | // time and it will never be called concurrently with Load however it may be 184 | // called concurrently with Get/SetStable operations. 185 | func (db *BoltMetaDB) CommitState(state types.PersistentState) error { 186 | if db.db == nil { 187 | return ErrUnintialized 188 | } 189 | 190 | encoded, err := json.Marshal(state) 191 | if err != nil { 192 | return fmt.Errorf("failed to encode persisted state: %w", err) 193 | } 194 | 195 | tx, err := db.db.Begin(true) 196 | if err != nil { 197 | return err 198 | } 199 | defer tx.Rollback() 200 | meta := tx.Bucket([]byte(MetaBucket)) 201 | 202 | if err := meta.Put([]byte(MetaKey), encoded); err != nil { 203 | return err 204 | } 205 | 206 | return tx.Commit() 207 | } 208 | 209 | // GetStable returns a value from stable store or nil if it doesn't exist. May 210 | // be called concurrently by multiple threads. 211 | func (db *BoltMetaDB) GetStable(key []byte) ([]byte, error) { 212 | if db.db == nil { 213 | return nil, ErrUnintialized 214 | } 215 | 216 | tx, err := db.db.Begin(false) 217 | if err != nil { 218 | return nil, err 219 | } 220 | defer tx.Rollback() 221 | stable := tx.Bucket([]byte(StableBucket)) 222 | 223 | val := stable.Get(key) 224 | if val == nil { 225 | return nil, nil 226 | } 227 | 228 | // Need to copy the value since bolt only guarantees the slice is valid until 229 | // end of txn. 230 | ret := make([]byte, len(val)) 231 | copy(ret, val) 232 | return ret, nil 233 | } 234 | 235 | // SetStable stores a value from stable store. May be called concurrently with 236 | // GetStable. 237 | func (db *BoltMetaDB) SetStable(key []byte, value []byte) error { 238 | if db.db == nil { 239 | return ErrUnintialized 240 | } 241 | 242 | tx, err := db.db.Begin(true) 243 | if err != nil { 244 | return err 245 | } 246 | defer tx.Rollback() 247 | stable := tx.Bucket([]byte(StableBucket)) 248 | 249 | if value == nil { 250 | err = stable.Delete(key) 251 | } else { 252 | err = stable.Put(key, value) 253 | } 254 | if err != nil { 255 | return err 256 | } 257 | 258 | return tx.Commit() 259 | } 260 | 261 | // Close implements io.Closer 262 | func (db *BoltMetaDB) Close() error { 263 | if db.db == nil { 264 | return nil 265 | } 266 | err := db.db.Close() 267 | db.db = nil 268 | return err 269 | } 270 | -------------------------------------------------------------------------------- /segment/writer_test.go: -------------------------------------------------------------------------------- 1 | // Copyright IBM Corp. 2020, 2025 2 | // SPDX-License-Identifier: MPL-2.0 3 | 4 | package segment 5 | 6 | import ( 7 | "fmt" 8 | "sync/atomic" 9 | "testing" 10 | "time" 11 | 12 | "github.com/hashicorp/raft-wal/types" 13 | "github.com/stretchr/testify/require" 14 | ) 15 | 16 | // TestConcurrentReadersAndWriter is designed to be run with race detector 17 | // enabled to validate the concurrent behavior of the segment. 
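//
// A typical invocation (assuming the usual module layout, run from the repo
// root) would be something like:
//
//	go test -race -run TestConcurrentReadersAndWriter ./segment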
18 | func TestConcurrentReadersAndWriter(t *testing.T) {
19 | vfs := newTestVFS()
20 | f := NewFiler("test", vfs)
21 | 
22 | seg1 := testSegment(1)
23 | 
24 | // Increase the size limit so we keep going for a while. We don't want to make
25 | // this so large that we time out easily on slower machines or in CI, though.
26 | // 256KiB passes easily on my laptop (~5s) and is big enough to take a
27 | // while to test concurrent accesses.
28 | seg1.SizeLimit = 256 * 1024
29 | 
30 | wf, err := f.Create(seg1)
31 | require.NoError(t, err)
32 | 
33 | var lastIndexWritten uint64
34 | var sealedMaxIndex uint64
35 | var numReads uint64
36 | 
37 | writer := func() {
38 | idx := uint64(1)
39 | for {
40 | err := wf.Append([]types.LogEntry{{Index: idx, Data: []byte("test")}})
41 | if err != nil {
42 | panic("error during append: " + err.Error())
43 | }
44 | 
45 | sealed, _, err := wf.Sealed()
46 | if err != nil {
47 | panic("error during sealed: " + err.Error())
48 | }
49 | atomic.StoreUint64(&lastIndexWritten, idx)
50 | if sealed {
51 | atomic.StoreUint64(&sealedMaxIndex, idx)
52 | return
53 | }
54 | idx++
55 | }
56 | }
57 | 
58 | reader := func(doneCh chan<- struct{}) {
59 | // Follow the tail.
60 | idx := uint64(1)
61 | for {
62 | // Complete once the writer has stopped and we've read all of its written
63 | // entries.
64 | finalIdx := atomic.LoadUint64(&sealedMaxIndex)
65 | if finalIdx > 0 && idx > finalIdx {
66 | doneCh <- struct{}{}
67 | return
68 | }
69 | if idx > wf.LastIndex() {
70 | time.Sleep(time.Millisecond)
71 | continue
72 | }
73 | 
74 | log, err := wf.GetLog(idx)
75 | if err != nil {
76 | panic("error during GetLog: " + err.Error())
77 | }
78 | if string(log.Bs) != "test" {
79 | panic("bad log read: " + string(log.Bs))
80 | }
81 | atomic.AddUint64(&numReads, 1)
82 | idx++
83 | }
84 | }
85 | 
86 | // Start 10 readers and 1 writer in parallel.
87 | done := make(chan struct{}, 10)
88 | for i := 0; i < cap(done); i++ {
89 | go reader(done)
90 | }
91 | go writer()
92 | 
93 | complete := 0
94 | // Takes about 5 seconds on my laptop. Give it a really generous margin for CI
95 | // etc. though.
96 | timeoutCh := time.After(30 * time.Second)
97 | for complete < cap(done) {
98 | select {
99 | case <-timeoutCh:
100 | t.Fatalf("Took longer than 30 seconds to write and read the whole segment. w=%d, r=%d s=%d",
101 | atomic.LoadUint64(&lastIndexWritten),
102 | atomic.LoadUint64(&numReads),
103 | atomic.LoadUint64(&sealedMaxIndex),
104 | )
105 | case <-done:
106 | complete++
107 | }
108 | }
109 | 
110 | t.Logf("Written: %d, Read: %d, SealedMax: %d",
111 | atomic.LoadUint64(&lastIndexWritten),
112 | atomic.LoadUint64(&numReads),
113 | atomic.LoadUint64(&sealedMaxIndex),
114 | )
115 | 
116 | // Check we actually did something!
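// Rough arithmetic: each 4-byte "test" entry encodes to a 16-byte frame
// (8-byte header plus padded payload) before any commit-frame overhead, so
// sealing a 256KiB segment should take several thousand appends - comfortably
// above the 1000 asserted below.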
117 | require.Greater(t, int(atomic.LoadUint64(&lastIndexWritten)), 1000) 118 | require.Greater(t, int(atomic.LoadUint64(&numReads)), 1000) 119 | require.Greater(t, int(atomic.LoadUint64(&sealedMaxIndex)), 1000) 120 | } 121 | 122 | func TestWriterRecoversFromWriteFailure(t *testing.T) { 123 | cases := []struct { 124 | name string 125 | setupFailure func(f *testWritableFile, batch []types.LogEntry) 126 | fixFailure func(batch []types.LogEntry) 127 | }{ 128 | { 129 | name: "fwrite failure", 130 | setupFailure: func(f *testWritableFile, batch []types.LogEntry) { 131 | f.failNextWrite() 132 | }, 133 | }, 134 | { 135 | name: "fsync failure", 136 | setupFailure: func(f *testWritableFile, batch []types.LogEntry) { 137 | f.failNextSync() 138 | }, 139 | }, 140 | { 141 | name: "log append failure", 142 | setupFailure: func(f *testWritableFile, batch []types.LogEntry) { 143 | // Should cause monotonicity check to fail but only on last log after 144 | // other logs have been written and internal state updated. 145 | batch[len(batch)-1].Index = 123456 146 | }, 147 | fixFailure: func(batch []types.LogEntry) { 148 | batch[len(batch)-1].Index = batch[len(batch)-2].Index + 1 149 | }, 150 | }, 151 | } 152 | 153 | for _, tc := range cases { 154 | tc := tc 155 | 156 | testFn := func(t *testing.T, empty bool) { 157 | vfs := newTestVFS() 158 | 159 | f := NewFiler("test", vfs) 160 | 161 | seg0 := testSegment(1) 162 | 163 | w, err := f.Create(seg0) 164 | require.NoError(t, err) 165 | defer w.Close() 166 | 167 | batch := make([]types.LogEntry, 5) 168 | for i := range batch { 169 | batch[i].Index = uint64(i + 1) 170 | batch[i].Data = []byte(fmt.Sprintf("val-%d", i+1)) 171 | } 172 | maxIdx := len(batch) 173 | expectFirstIdx := 0 174 | expectLastIdx := 0 175 | 176 | if !empty { 177 | require.NoError(t, w.Append(batch)) 178 | expectFirstIdx = 1 179 | expectLastIdx = maxIdx 180 | for i := range batch { 181 | batch[i].Index = uint64(i + maxIdx + 1) 182 | batch[i].Data = []byte(fmt.Sprintf("val-%d", i+maxIdx+1)) 183 | } 184 | } 185 | 186 | tf := testFileFor(t, w) 187 | 188 | tc.setupFailure(tf, batch) 189 | 190 | require.Error(t, w.Append(batch)) 191 | assertExpectedLogs(t, w, expectFirstIdx, expectLastIdx) 192 | 193 | if tc.fixFailure != nil { 194 | tc.fixFailure(batch) 195 | } 196 | 197 | // Now retry that write, it should work! 198 | expectFirstIdx = 1 199 | expectLastIdx = int(batch[4].Index) 200 | require.NoError(t, w.Append(batch)) 201 | assertExpectedLogs(t, w, expectFirstIdx, expectLastIdx) 202 | 203 | // Also, re-open the file "from disk" to make sure what has been written 204 | // is correct and recoverable! 
205 | w2, err := f.RecoverTail(seg0)
206 | require.NoError(t, err)
207 | assertExpectedLogs(t, w2, expectFirstIdx, expectLastIdx)
208 | w2.Close()
209 | }
210 | 
211 | t.Run(tc.name+" empty", func(t *testing.T) {
212 | testFn(t, true)
213 | })
214 | t.Run(tc.name+" non-empty", func(t *testing.T) {
215 | testFn(t, false)
216 | })
217 | }
218 | }
219 | 
220 | func assertExpectedLogs(t *testing.T, w types.SegmentWriter, first, last int) {
221 | t.Helper()
222 | 
223 | require.Equal(t, uint64(last), w.LastIndex())
224 | if last == 0 {
225 | return
226 | }
227 | assertExpectedReaderLogs(t, w, first, last)
228 | }
229 | 
230 | func assertExpectedReaderLogs(t *testing.T, r types.SegmentReader, first, last int) {
231 | t.Helper()
232 | 
233 | for idx := first; idx <= last; idx++ {
234 | buf, err := r.GetLog(uint64(idx))
235 | require.NoError(t, err)
236 | require.Equal(t, fmt.Sprintf("val-%d", idx), string(buf.Bs))
237 | buf.Close()
238 | }
239 | }
240 | 
241 | func TestWriterForceSeal(t *testing.T) {
242 | vfs := newTestVFS()
243 | 
244 | f := NewFiler("test", vfs)
245 | 
246 | seg0 := testSegment(1)
247 | 
248 | w, err := f.Create(seg0)
249 | require.NoError(t, err)
250 | defer w.Close()
251 | 
252 | batch := make([]types.LogEntry, 5)
253 | for i := range batch {
254 | batch[i].Index = uint64(i + 1)
255 | batch[i].Data = []byte(fmt.Sprintf("val-%d", i+1))
256 | }
257 | require.NoError(t, w.Append(batch))
258 | 
259 | assertExpectedLogs(t, w, 1, 5)
260 | 
261 | // Should not have sealed after one append.
262 | sealed, indexStart, err := w.Sealed()
263 | require.NoError(t, err)
264 | require.False(t, sealed)
265 | require.Equal(t, 0, int(indexStart))
266 | 
267 | // Force seal it.
268 | indexStart, err = w.ForceSeal()
269 | require.NoError(t, err)
270 | require.Greater(t, int(indexStart), 0)
271 | 
272 | // It should be possible to open it with a reader now.
273 | seg0.IndexStart = indexStart
274 | r, err := f.Open(seg0)
275 | require.NoError(t, err)
276 | 
277 | assertExpectedReaderLogs(t, r, 1, 5)
278 | }
--------------------------------------------------------------------------------
/segment/format.go:
--------------------------------------------------------------------------------
1 | // Copyright IBM Corp. 2020, 2025
2 | // SPDX-License-Identifier: MPL-2.0
3 | 
4 | package segment
5 | 
6 | import (
7 | "bytes"
8 | "encoding/binary"
9 | "errors"
10 | "fmt"
11 | "io"
12 | 
13 | "github.com/hashicorp/raft-wal/types"
14 | )
15 | 
16 | const (
17 | // MaxEntrySize is the largest we allow any single raft log entry to be. This
18 | // is larger than our raft implementation ever allows, so it seems safe to
19 | // encode statically for now. We could make this configurable. Its main
20 | // purpose is to limit allocation when reading entries back if their lengths
21 | // are corrupted.
22 | MaxEntrySize = 64 * 1024 * 1024 // 64 MiB
23 | 
24 | // minBufSize is the size at which we allocate read and write buffers. Setting
25 | // it larger wastes more memory but increases the chances that we'll read the
26 | // whole frame in a single shot and not need a second allocation and trip to
27 | // the disk.
28 | minBufSize = 64 * 1024
29 | 
30 | fileHeaderLen = 32
31 | version = 0
32 | magic = 0x58eb6b0d
33 | 
34 | // Note that this must remain a power of 2 to ensure aligning to this also
35 | // aligns to sector boundaries.
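// For example, a 5-byte payload is followed by 3 bytes of zero padding (see
// padLen below), so the next frame header starts back on an 8-byte boundary
// and an aligned 8-byte header can never straddle a 512-byte sector edge.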
36 | frameHeaderLen = 8 37 | ) 38 | 39 | const ( // Start iota from 0 40 | FrameInvalid uint8 = iota 41 | FrameEntry 42 | FrameIndex 43 | FrameCommit 44 | ) 45 | 46 | var ( 47 | // ErrTooBig indicates that the caller tried to write a logEntry with a 48 | // payload that's larger than we are prepared to support. 49 | ErrTooBig = errors.New("entries larger than 64MiB are not supported") 50 | ) 51 | 52 | /* 53 | 54 | File Header functions 55 | 56 | 0 1 2 3 4 5 6 7 8 57 | +------+------+------+------+------+------+------+------+ 58 | | Magic | Reserved | Vsn | 59 | +------+------+------+------+------+------+------+------+ 60 | | BaseIndex | 61 | +------+------+------+------+------+------+------+------+ 62 | | SegmentID | 63 | +------+------+------+------+------+------+------+------+ 64 | | Codec | 65 | +------+------+------+------+------+------+------+------+ 66 | 67 | */ 68 | 69 | // writeFileHeader writes a file header into buf for the given file metadata. 70 | func writeFileHeader(buf []byte, info types.SegmentInfo) error { 71 | if len(buf) < fileHeaderLen { 72 | return io.ErrShortBuffer 73 | } 74 | 75 | binary.LittleEndian.PutUint32(buf[0:4], magic) 76 | // Explicitly zero Reserved bytes just in case 77 | buf[4] = 0 78 | buf[5] = 0 79 | buf[6] = 0 80 | buf[7] = version 81 | binary.LittleEndian.PutUint64(buf[8:16], info.BaseIndex) 82 | binary.LittleEndian.PutUint64(buf[16:24], info.ID) 83 | binary.LittleEndian.PutUint64(buf[24:32], info.Codec) 84 | return nil 85 | } 86 | 87 | // readFileHeader reads a file header from buf. 88 | func readFileHeader(buf []byte) (*types.SegmentInfo, error) { 89 | if len(buf) < fileHeaderLen { 90 | return nil, io.ErrShortBuffer 91 | } 92 | 93 | var i types.SegmentInfo 94 | m := binary.LittleEndian.Uint64(buf[0:8]) 95 | if m != magic { 96 | return nil, types.ErrCorrupt 97 | } 98 | if buf[7] != version { 99 | return nil, types.ErrCorrupt 100 | } 101 | i.BaseIndex = binary.LittleEndian.Uint64(buf[8:16]) 102 | i.ID = binary.LittleEndian.Uint64(buf[16:24]) 103 | i.Codec = binary.LittleEndian.Uint64(buf[24:32]) 104 | return &i, nil 105 | } 106 | 107 | func validateFileHeader(got, expect types.SegmentInfo) error { 108 | if expect.ID != got.ID { 109 | return fmt.Errorf("%w: segment header ID %x doesn't match metadata %x", 110 | types.ErrCorrupt, got.ID, expect.ID) 111 | } 112 | if expect.BaseIndex != got.BaseIndex { 113 | return fmt.Errorf("%w: segment header BaseIndex %d doesn't match metadata %d", 114 | types.ErrCorrupt, got.BaseIndex, expect.BaseIndex) 115 | } 116 | if expect.Codec != got.Codec { 117 | return fmt.Errorf("%w: segment header Codec %d doesn't match metadata %d", 118 | types.ErrCorrupt, got.Codec, expect.Codec) 119 | } 120 | 121 | return nil 122 | } 123 | 124 | /* 125 | Frame Functions 126 | 127 | 0 1 2 3 4 5 6 7 8 128 | +------+------+------+------+------+------+------+------+ 129 | | Type | Reserved | Length/CRC | 130 | +------+------+------+------+------+------+------+------+ 131 | */ 132 | 133 | type frameHeader struct { 134 | typ uint8 135 | len uint32 136 | crc uint32 137 | } 138 | 139 | func writeFrame(buf []byte, h frameHeader, payload []byte) error { 140 | if len(buf) < encodedFrameSize(int(h.len)) { 141 | return io.ErrShortBuffer 142 | } 143 | if err := writeFrameHeader(buf, h); err != nil { 144 | return err 145 | } 146 | copy(buf[frameHeaderLen:], payload[:h.len]) 147 | // Explicitly write null bytes for padding 148 | padBytes := padLen(int(h.len)) 149 | for i := 0; i < padBytes; i++ { 150 | buf[frameHeaderLen+int(h.len)+i] = 0x0 151 | } 152 
| return nil 153 | } 154 | 155 | func writeFrameHeader(buf []byte, h frameHeader) error { 156 | if len(buf) < frameHeaderLen { 157 | return io.ErrShortBuffer 158 | } 159 | buf[0] = h.typ 160 | buf[1] = 0 161 | buf[2] = 0 162 | buf[3] = 0 163 | lOrCRC := h.len 164 | if h.typ == FrameCommit { 165 | lOrCRC = h.crc 166 | } 167 | binary.LittleEndian.PutUint32(buf[4:8], lOrCRC) 168 | return nil 169 | } 170 | 171 | var zeroHeader [frameHeaderLen]byte 172 | 173 | func readFrameHeader(buf []byte) (frameHeader, error) { 174 | var h frameHeader 175 | if len(buf) < frameHeaderLen { 176 | return h, io.ErrShortBuffer 177 | } 178 | 179 | switch buf[0] { 180 | default: 181 | return h, fmt.Errorf("%w: corrupt frame header with unknown type %d", types.ErrCorrupt, buf[0]) 182 | 183 | case FrameInvalid: 184 | // Check if the whole header is zero and return a zero frame as this could 185 | // just indicate we've read right off the end of the written data during 186 | // recovery. 187 | if bytes.Equal(buf[:frameHeaderLen], zeroHeader[:]) { 188 | return h, nil 189 | } 190 | return h, fmt.Errorf("%w: corrupt frame header with type 0 but non-zero other fields", types.ErrCorrupt) 191 | 192 | case FrameEntry, FrameIndex: 193 | h.typ = buf[0] 194 | h.len = binary.LittleEndian.Uint32(buf[4:8]) 195 | 196 | case FrameCommit: 197 | h.typ = buf[0] 198 | h.crc = binary.LittleEndian.Uint32(buf[4:8]) 199 | } 200 | return h, nil 201 | } 202 | 203 | // padLen returns how many bytes of padding should be added to a frame of length 204 | // n to ensure it is a multiple of headerLen. We ensure frameHeaderLen is a 205 | // power of two so that it's always a multiple of a typical sector size (e.g. 206 | // 512 bytes) to reduce the risk that headers are torn by being written across 207 | // sector boundaries. It will return an int in the range [0, 7]. 208 | func padLen(n int) int { 209 | // This looks a bit awful but it's just doing (n % 8) and subtracting that 210 | // from 8 to get the number of bytes extra needed to get up to the next 8-byte 211 | // boundary. The extra & 7 is to handle the case where n is a multiple of 8 212 | // already and so n%8 is 0 and 8-0 is 8. By &ing 8 (0b1000) with 7 (0b111) we 213 | // effectively wrap it back around to 0. This only works as long as 214 | // frameHeaderLen is a power of 2 but that's necessary per comment above. 215 | return (frameHeaderLen - (n % frameHeaderLen)) & (frameHeaderLen - 1) 216 | } 217 | 218 | func encodedFrameSize(payloadLen int) int { 219 | return frameHeaderLen + payloadLen + padLen(payloadLen) 220 | } 221 | 222 | func indexFrameSize(numEntries int) int { 223 | // Index frames are completely unnecessary if the whole block is a 224 | // continuation with no new entries. 
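// For example, with 3 new entries: 8 (frame header) + 12 (3 offsets x 4
// bytes) + 4 (padding) = 24 bytes.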
225 | if numEntries == 0 { 226 | return 0 227 | } 228 | return encodedFrameSize(numEntries * 4) 229 | } 230 | 231 | func writeIndexFrame(buf []byte, offsets []uint32) error { 232 | if len(buf) < indexFrameSize(len(offsets)) { 233 | return io.ErrShortBuffer 234 | } 235 | fh := frameHeader{ 236 | typ: FrameIndex, 237 | len: uint32(len(offsets) * 4), 238 | } 239 | if err := writeFrameHeader(buf, fh); err != nil { 240 | return err 241 | } 242 | cursor := frameHeaderLen 243 | for _, o := range offsets { 244 | binary.LittleEndian.PutUint32(buf[cursor:], o) 245 | cursor += 4 246 | } 247 | if (len(offsets) % 2) == 1 { 248 | // Odd number of entries, zero pad to keep it 8-byte aligned 249 | binary.LittleEndian.PutUint32(buf[cursor:], 0) 250 | } 251 | return nil 252 | } 253 | -------------------------------------------------------------------------------- /segment/vfs_test.go: -------------------------------------------------------------------------------- 1 | // Copyright IBM Corp. 2020, 2025 2 | // SPDX-License-Identifier: MPL-2.0 3 | 4 | package segment 5 | 6 | import ( 7 | "bytes" 8 | "encoding/hex" 9 | "errors" 10 | "fmt" 11 | "io" 12 | "os" 13 | "sort" 14 | "sync/atomic" 15 | "testing" 16 | 17 | "github.com/hashicorp/raft-wal/types" 18 | ) 19 | 20 | // testVFS implements types.VFS for testing. 21 | type testVFS struct { 22 | dir string 23 | files map[string]*testWritableFile 24 | trash map[string]*testWritableFile 25 | 26 | listErr error 27 | createErr error 28 | deleteErr error 29 | openErr error 30 | } 31 | 32 | func newTestVFS() *testVFS { 33 | return &testVFS{ 34 | files: make(map[string]*testWritableFile), 35 | trash: make(map[string]*testWritableFile), 36 | } 37 | } 38 | 39 | // ListDir returns a list of all files in the specified dir in lexicographical 40 | // order. If the dir doesn't exist, it must return an error. Empty array with 41 | // nil error is assumed to mean that the directory exists and was readable, 42 | // but contains no files. 43 | func (fs *testVFS) ListDir(dir string) ([]string, error) { 44 | if fs.listErr != nil { 45 | return nil, fs.listErr 46 | } 47 | if err := fs.setDir(dir); err != nil { 48 | return nil, err 49 | } 50 | 51 | files := make([]string, 0, len(fs.files)) 52 | for name := range fs.files { 53 | files = append(files, name) 54 | } 55 | sort.Strings(files) 56 | return files, nil 57 | } 58 | 59 | func (fs *testVFS) setDir(dir string) error { 60 | if fs.dir == "" { 61 | fs.dir = dir 62 | return nil 63 | } 64 | if fs.dir != dir { 65 | return fmt.Errorf("VFS called for different dir. Prev=%s Current=%s", fs.dir, dir) 66 | } 67 | return nil 68 | } 69 | 70 | // Create creates a new file with the given name. If a file with the same name 71 | // already exists an error is returned. If a non-zero size is given, 72 | // implementations should make a best effort to pre-allocate the file to be 73 | // that size. The dir must already exist and be writable to the current 74 | // process. 75 | func (fs *testVFS) Create(dir string, name string, size uint64) (types.WritableFile, error) { 76 | if fs.createErr != nil { 77 | return nil, fs.createErr 78 | } 79 | if err := fs.setDir(dir); err != nil { 80 | return nil, err 81 | } 82 | _, ok := fs.files[name] 83 | if ok { 84 | return nil, fmt.Errorf("file already exists") 85 | } 86 | f := newTestWritableFile(int(size)) 87 | fs.files[name] = f 88 | return f, nil 89 | } 90 | 91 | // Delete indicates the file is no longer required. Typically it should be 92 | // deleted from the underlying system to free disk space. 
93 | func (fs *testVFS) Delete(dir string, name string) error { 94 | if fs.deleteErr != nil { 95 | return fs.deleteErr 96 | } 97 | if err := fs.setDir(dir); err != nil { 98 | return err 99 | } 100 | tf, ok := fs.files[name] 101 | if !ok { 102 | return nil 103 | } 104 | fs.trash[name] = tf 105 | delete(fs.files, name) 106 | return nil 107 | } 108 | 109 | // OpenReader opens an existing file in read-only mode. If the file doesn't 110 | // exist or permission is denied, an error is returned, otherwise no checks 111 | // are made about the well-formedness of the file, it may be empty, the wrong 112 | // size or corrupt in arbitrary ways. 113 | func (fs *testVFS) OpenReader(dir string, name string) (types.ReadableFile, error) { 114 | if fs.openErr != nil { 115 | return nil, fs.openErr 116 | } 117 | if err := fs.setDir(dir); err != nil { 118 | return nil, err 119 | } 120 | f, ok := fs.files[name] 121 | if !ok { 122 | return nil, os.ErrNotExist 123 | } 124 | return f, nil 125 | } 126 | 127 | // OpenWriter opens a file in read-write mode. If the file doesn't exist or 128 | // permission is denied, an error is returned, otherwise no checks are made 129 | // about the well-formedness of the file, it may be empty, the wrong size or 130 | // corrupt in arbitrary ways. 131 | func (fs *testVFS) OpenWriter(dir string, name string) (types.WritableFile, error) { 132 | if fs.openErr != nil { 133 | return nil, fs.openErr 134 | } 135 | if err := fs.setDir(dir); err != nil { 136 | return nil, err 137 | } 138 | f, ok := fs.files[name] 139 | if !ok { 140 | return nil, os.ErrNotExist 141 | } 142 | return f, nil 143 | } 144 | 145 | // testFileFor is a helper for reaching inside our interface types to access 146 | // the underlying "file". 147 | func testFileFor(t *testing.T, r types.SegmentReader) *testWritableFile { 148 | t.Helper() 149 | 150 | switch v := r.(type) { 151 | case *Reader: 152 | return v.rf.(*testWritableFile) 153 | case *Writer: 154 | return v.wf.(*testWritableFile) 155 | default: 156 | t.Fatalf("Invalid SegmentReader implementation passed: %T", r) 157 | return nil 158 | } 159 | } 160 | 161 | type testWritableFile struct { 162 | buf atomic.Value // []byte 163 | maxWritten int 164 | lastSyncStart int 165 | closed, dirty bool 166 | writeErr error 167 | syncErr error 168 | } 169 | 170 | func newTestWritableFile(size int) *testWritableFile { 171 | wf := &testWritableFile{} 172 | wf.buf.Store(make([]byte, 0, size)) 173 | return wf 174 | } 175 | 176 | func (f *testWritableFile) getBuf() []byte { 177 | return f.buf.Load().([]byte) 178 | } 179 | 180 | func (f *testWritableFile) failNextWrite() { 181 | f.writeErr = errors.New("IO error") 182 | } 183 | 184 | func (f *testWritableFile) failNextSync() { 185 | f.syncErr = errors.New("IO error") 186 | } 187 | 188 | // Truncate allows us to simulate the file being a different length than 189 | // expected, for example due to a crash. 190 | func (f *testWritableFile) Truncate(size int) { 191 | buf := f.getBuf() 192 | 193 | // We use buffer capacity as a proxy for "file size" so we need a new buffer 194 | // with the right capacity. We'll slice it to the minimum of the new len or 195 | // the current len. 
196 | l := len(buf) 197 | if size < l { 198 | l = size 199 | } 200 | newBuf := make([]byte, l, size) 201 | f.buf.Store(newBuf) 202 | f.maxWritten = l 203 | } 204 | 205 | func (f *testWritableFile) Dump() string { 206 | var buf bytes.Buffer 207 | d := hex.Dumper(&buf) 208 | bs := f.getBuf() 209 | max := 128 210 | if len(bs) < 128 { 211 | max = len(bs) 212 | } 213 | _, err := d.Write(bs[:max]) 214 | if err != nil { 215 | panic(err) 216 | } 217 | return buf.String() 218 | } 219 | 220 | func (f *testWritableFile) WriteAt(p []byte, off int64) (n int, err error) { 221 | if f.writeErr != nil { 222 | err := f.writeErr 223 | f.writeErr = nil 224 | return 0, err 225 | } 226 | if !f.dirty { 227 | f.lastSyncStart = int(off) 228 | } 229 | f.dirty = true 230 | maxOffset := int(off) + len(p) 231 | buf := f.getBuf() 232 | if maxOffset > len(buf) { 233 | // re-allocate to simulate appending additional bytes to end of a 234 | // pre-allocated file. 235 | nb := make([]byte, maxOffset) 236 | copy(nb, buf) 237 | buf = nb 238 | } else if off < int64(len(buf)) { 239 | // If this write is to an offset that was already visible to readers (less 240 | // than len(buf)) we can't mutate in place because that would be racy; we 241 | // need to copy the whole buffer to mutate it safely. 242 | nb := make([]byte, len(buf), cap(buf)) 243 | copy(nb, buf) 244 | buf = nb 245 | } 246 | copy(buf[off:], p) 247 | if maxOffset > f.maxWritten { 248 | f.maxWritten = maxOffset 249 | } 250 | // Atomically replace the slice to allow readers to see the new appended data 251 | // or new backing array if we reallocated. 252 | f.buf.Store(buf) 253 | return len(p), nil 254 | } 255 | 256 | func (f *testWritableFile) ReadAt(p []byte, off int64) (n int, err error) { 257 | buf := f.getBuf() 258 | // Note we treat the whole cap of buf as "in" the file 259 | if int(off) >= cap(buf) { 260 | return 0, io.EOF 261 | } 262 | // Work out how many bytes we have to read left in the "file" 263 | n = cap(buf) - int(off) 264 | if n < len(p) { 265 | // We can't fill p as there are not enough bytes left in the "file" so 266 | // whatever we do read, also return EOF like a real file does. 267 | err = io.EOF 268 | } 269 | if off >= int64(len(buf)) { 270 | // Offset is within capacity of "file" but after the maximum visible byte so 271 | // just return empty bytes. 272 | for i := 0; i < len(p); i++ { 273 | p[i] = 0 274 | } 275 | return n, err 276 | } 277 | n = copy(p, buf[off:]) 278 | return n, err 279 | } 280 | 281 | func (f *testWritableFile) Close() error { 282 | f.closed = true 283 | return nil 284 | } 285 | 286 | func (f *testWritableFile) Sync() error { 287 | if f.syncErr != nil { 288 | err := f.syncErr 289 | f.syncErr = nil 290 | return err 291 | } 292 | f.dirty = false 293 | return nil 294 | } 295 | -------------------------------------------------------------------------------- /verifier/store.go: -------------------------------------------------------------------------------- 1 | // Copyright IBM Corp. 2020, 2025 2 | // SPDX-License-Identifier: MPL-2.0 3 | 4 | package verifier 5 | 6 | import ( 7 | "encoding/binary" 8 | "errors" 9 | "fmt" 10 | "io" 11 | "sync/atomic" 12 | 13 | "github.com/hashicorp/go-hclog" 14 | "github.com/hashicorp/raft" 15 | "github.com/hashicorp/raft-wal/metrics" 16 | ) 17 | 18 | var _ raft.LogStore = &LogStore{} 19 | var _ raft.MonotonicLogStore = &LogStore{} 20 | 21 | // LogStore is a raft.LogStore that acts as middleware around an underlying 22 | // persistent store. 
It provides support for periodically verifying that ranges 23 | // of logs read back from the LogStore match the values written, and the values 24 | // read from the LogStores of other peers even though all peers will have 25 | // different actual log ranges due to independent snapshotting and truncation. 26 | // 27 | // Verification of the underlying log implementation may be performed as 28 | // follows: 29 | // 1. The application provides an implementation of `IsCheckpoint` that is 30 | // able to identify whether the encoded data represents a checkpoint 31 | // command. 32 | // 2. The application's raft leader then may periodically append such a 33 | // checkpoint log to be replicated out. 34 | // 3. When the LogStore has a log appended for which IsCheckpoint returns true, 35 | // it will write the current cumulative checksum over log entries since the 36 | // last checkpoint into the Extensions field. Since hashicorp/raft only 37 | // replicates to peers _after_ a trip through the LogStore, this checksum 38 | // will be replicated. 39 | // 4. When a follower has a log appended for which IsCheckpoint returns true, 40 | // but already has non-empty Extensions metadata, it will trigger a background 41 | // verification. 42 | // 5. Verification happens in the background and reads all logs from the 43 | // underlying store since the last checkpoint, calculating their checksums 44 | // cumulatively before calling the configured Report func with a summary of 45 | // what it found. 46 | type LogStore struct { 47 | checksum uint64 // accessed atomically 48 | sumStartIdx uint64 // accessed atomically 49 | 50 | s raft.LogStore 51 | 52 | metrics metrics.Collector 53 | log hclog.Logger 54 | 55 | verifyCh chan VerificationReport 56 | 57 | checkpointFn IsCheckpointFn 58 | reportFn ReportFn 59 | } 60 | 61 | // NewLogStore creates a verifying LogStore. The checkpointFn and reportFn must 62 | // be supplied when the store is created, _before_ it is passed to Raft; either 63 | // may be nil to bypass verification. Close must be called when the log store is 64 | // no longer useful to clean up background verification. 65 | func NewLogStore(store raft.LogStore, checkpointFn IsCheckpointFn, reportFn ReportFn, mc metrics.Collector) *LogStore { 66 | c := &LogStore{ 67 | s: store, 68 | metrics: mc, 69 | verifyCh: make(chan VerificationReport, 1), 70 | checkpointFn: checkpointFn, 71 | reportFn: reportFn, 72 | } 73 | go c.runVerifier() 74 | return c 75 | } 76 | 77 | // FirstIndex returns the first index written. 0 for no entries. 78 | func (s *LogStore) FirstIndex() (uint64, error) { 79 | return s.s.FirstIndex() 80 | } 81 | 82 | // LastIndex returns the last index written. 0 for no entries. 83 | func (s *LogStore) LastIndex() (uint64, error) { 84 | return s.s.LastIndex() 85 | } 86 | 87 | // GetLog gets a log entry at a given index. 88 | func (s *LogStore) GetLog(index uint64, log *raft.Log) error { 89 | return s.s.GetLog(index, log) 90 | } 91 | 92 | // StoreLog stores a log entry. 
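A minimal wiring sketch for the scheme described above. The checkpoint convention (a leading 'C' byte in Data) and the helper name newVerifiedStore are illustrative assumptions, not part of this library; only NewLogStore, IsCheckpointFn, ReportFn and VerificationReport come from this package.

import (
	"log"

	"github.com/hashicorp/raft"
	"github.com/hashicorp/raft-wal/metrics"
	"github.com/hashicorp/raft-wal/verifier"
)

// newVerifiedStore wraps an existing raft.LogStore with verification.
func newVerifiedStore(underlying raft.LogStore, mc metrics.Collector) *verifier.LogStore {
	isCheckpoint := func(l *raft.Log) (bool, error) {
		// Hypothetical application convention for checkpoint commands.
		return len(l.Data) > 0 && l.Data[0] == 'C', nil
	}
	report := func(r verifier.VerificationReport) {
		if r.Err != nil {
			log.Printf("log verification failed for %s: %v", r.Range, r.Err)
			return
		}
		log.Printf("log range %s verified in %s", r.Range, r.Elapsed)
	}
	// Pass the result to raft.NewRaft in place of the underlying store and
	// call Close on shutdown.
	return verifier.NewLogStore(underlying, isCheckpoint, report, mc)
}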
93 | func (s *LogStore) StoreLog(log *raft.Log) error { 94 | return s.StoreLogs([]*raft.Log{log}) 95 | } 96 | 97 | func encodeCheckpointMeta(startIdx, sum uint64) []byte { 98 | var buf [24]byte 99 | binary.LittleEndian.PutUint64(buf[0:8], ExtensionMagicPrefix) 100 | binary.LittleEndian.PutUint64(buf[8:16], startIdx) 101 | binary.LittleEndian.PutUint64(buf[16:24], sum) 102 | return buf[:] 103 | } 104 | 105 | func decodeCheckpointMeta(bs []byte) (startIdx, sum uint64, err error) { 106 | if len(bs) < 24 { 107 | return 0, 0, io.ErrShortBuffer 108 | } 109 | magic := binary.LittleEndian.Uint64(bs[0:8]) 110 | if magic != ExtensionMagicPrefix { 111 | return 0, 0, errors.New("invalid extension data") 112 | } 113 | startIdx = binary.LittleEndian.Uint64(bs[8:16]) 114 | sum = binary.LittleEndian.Uint64(bs[16:24]) 115 | return startIdx, sum, nil 116 | } 117 | 118 | func (s *LogStore) updateVerifyState(log *raft.Log, checksum, startIdx uint64) (newSum, newStartIdx uint64, r *VerificationReport, err error) { 119 | // Check if the log is a checkpoint. Note the caller already nil-checked 120 | // checkpointFn before calling this. 121 | isCP, err := s.checkpointFn(log) 122 | if err != nil { 123 | return 0, 0, nil, err 124 | } 125 | 126 | if startIdx == 0 { 127 | startIdx = log.Index 128 | } 129 | 130 | if isCP { 131 | r = &VerificationReport{ 132 | Range: LogRange{End: log.Index}, 133 | WrittenSum: checksum, 134 | } 135 | if len(log.Extensions) == 0 { 136 | // It's a new checkpoint and we must be the leader. Set our state. 137 | log.Extensions = encodeCheckpointMeta(startIdx, checksum) 138 | r.Range.Start = startIdx 139 | r.ExpectedSum = checksum 140 | } else { 141 | cpStartIdx, cpSum, err := decodeCheckpointMeta(log.Extensions) 142 | if err != nil { 143 | return 0, 0, nil, err 144 | } 145 | r.Range.Start = cpStartIdx 146 | r.ExpectedSum = cpSum 147 | 148 | // If we've calculated our own checksum over a different range than the 149 | // leader, e.g. because we just started and this is our first sum, there's 150 | // no point trying to verify, so zero out WrittenSum. 151 | if cpStartIdx != startIdx { 152 | r.WrittenSum = 0 153 | } 154 | } 155 | // Reset the checksum as we're now in the range of the next checkpoint. We 156 | // don't update the store state yet until we know these logs committed to 157 | // the underlying store. 158 | checksum = 0 159 | startIdx = log.Index 160 | } 161 | 162 | // Whether it's a checkpoint or not, hash the entry and return the updated 163 | // checksum. 164 | checksum = checksumLog(checksum, log) 165 | return checksum, startIdx, r, nil 166 | } 167 | 168 | // StoreLogs stores multiple log entries. 169 | func (s *LogStore) StoreLogs(logs []*raft.Log) error { 170 | if len(logs) < 1 { 171 | return nil 172 | } 173 | 174 | // Maintain a local copy of the checksum and sumStartIdx; we'll update the 175 | // state only once we know all these entries were stored. 176 | cs := atomic.LoadUint64(&s.checksum) 177 | startIdx := atomic.LoadUint64(&s.sumStartIdx) 178 | var triggeredReports []VerificationReport 179 | 180 | if s.checkpointFn != nil { 181 | var vr *VerificationReport 182 | var err error 183 | for _, log := range logs { 184 | cs, startIdx, vr, err = s.updateVerifyState(log, cs, startIdx) 185 | if err != nil { 186 | return fmt.Errorf("failed updating verifier state: %w", err) 187 | } 188 | if vr != nil { 189 | // We need to trigger a new checkpoint verification. But we can't until 190 | // after the logs are persisted below. 
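The checkpoint metadata written into Extensions is the fixed 24-byte little-endian layout encoded above; a package-internal round-trip sketch:

bs := encodeCheckpointMeta(100, 0xdeadbeef) // magic | startIdx | sum
start, sum, err := decodeCheckpointMeta(bs)
// start == 100, sum == 0xdeadbeef, err == nil. The leading
// ExtensionMagicPrefix lets decode reject foreign Extensions data.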
191 | triggeredReports = append(triggeredReports, *vr) 192 | } 193 | } 194 | } 195 | 196 | err := s.s.StoreLogs(logs) 197 | if err != nil { 198 | return err 199 | } 200 | 201 | // Update the checksum state now that the logs are committed. 202 | atomic.StoreUint64(&s.checksum, cs) 203 | atomic.StoreUint64(&s.sumStartIdx, startIdx) 204 | if len(triggeredReports) > 0 { 205 | s.metrics.IncrementCounter("checkpoints_written", uint64(len(triggeredReports))) 206 | } 207 | 208 | for _, r := range triggeredReports { 209 | s.triggerVerify(r) 210 | } 211 | return nil 212 | } 213 | 214 | // triggerVerify triggers a verification in the background. We won't block if 215 | // the verifier is busy. The chan has a buffer of one so there can be at most 216 | // one verification running and one waiting. If one is already waiting and the 217 | // chan would block, we drop r. 218 | func (s *LogStore) triggerVerify(r VerificationReport) { 219 | select { 220 | case s.verifyCh <- r: 221 | default: 222 | s.metrics.IncrementCounter("dropped_reports", 1) 223 | } 224 | } 225 | 226 | // DeleteRange deletes a range of log entries. The range is inclusive. 227 | func (s *LogStore) DeleteRange(min uint64, max uint64) error { 228 | return s.s.DeleteRange(min, max) 229 | } 230 | 231 | // Close cleans up the background verification routine and calls Close on the 232 | // underlying store if it is an io.Closer. 233 | func (s *LogStore) Close() error { 234 | if s.verifyCh == nil { 235 | return nil 236 | } 237 | close(s.verifyCh) 238 | // Don't set verifyCh to nil as that's racy - it's being accessed from other 239 | // routines. 240 | if closer, ok := s.s.(io.Closer); ok { 241 | return closer.Close() 242 | } 243 | return nil 244 | } 245 | 246 | // IsMonotonic implements the raft.MonotonicLogStore interface. This is a shim 247 | // to expose the underlying store as monotonically indexed or not. 248 | func (s *LogStore) IsMonotonic() bool { 249 | if store, ok := s.s.(raft.MonotonicLogStore); ok { 250 | return store.IsMonotonic() 251 | } 252 | return false 253 | } 254 | -------------------------------------------------------------------------------- /verifier/verifier.go: -------------------------------------------------------------------------------- 1 | // Copyright IBM Corp. 2020, 2025 2 | // SPDX-License-Identifier: MPL-2.0 3 | 4 | package verifier 5 | 6 | import ( 7 | "errors" 8 | "fmt" 9 | "time" 10 | 11 | "github.com/hashicorp/raft" 12 | "github.com/segmentio/fasthash/fnv1a" 13 | ) 14 | 15 | const ( 16 | // ExtensionMagicPrefix is the prefix we append to log Extensions fields to 17 | // disambiguate from other middleware that may use extensions. This value is 18 | // carefully constructed to be completely invalid as the beginning of a 19 | // protobuf (3) wire protocol message since the other known user of this field 20 | // encodes its data that way. If the first byte were 0xa8 this would be a 21 | // valid protobuf field encoding for an int field, however currently the 3 22 | // least significant bits encode the field type as 7, which is not a valid 23 | // type in the current spec. Even if this does change in the future, the 24 | // field's tag number encoded here is 123456789 so it's extremely unlikely 25 | // that any valid protobuf schema will ever have enough fields or arbitrarily 26 | // decide to assign field tags that large (though unrecognized tags would be 27 | // ignored on decode). 
Finally, the value of the field is the varint encoding 28 | // of the randomly chosen value 53906 so if type 7 is ever valid in the future 29 | // and used as a length-prefixed type, the length decoded would be way longer 30 | // than the buffer making it invalid. 31 | ExtensionMagicPrefix uint64 = 0xafd1f9d60392a503 32 | ) 33 | 34 | // IsCheckpointFn is a function that can decide whether the contents of a raft 35 | // log's Data represents a checkpoint message. It is called on every append so 36 | // it must be relatively fast in the common case. If it returns true for a log, 37 | // the log's Extensions field will be used to encode verification metadata and must 38 | // be empty - if it's not empty the append will fail and force the leader to 39 | // step down. If an error is returned the same will happen. 40 | type IsCheckpointFn func(*raft.Log) (bool, error) 41 | 42 | // ReportFn is a function that will be called after every checkpoint has been 43 | // verified. It will not be called concurrently. The VerificationReport may 44 | // represent a failure to report so its Err field should be checked. For 45 | // example, if checkpoints are arriving faster than they can be calculated, some 46 | // will be skipped and no report will be made for that range. The next report 47 | // that is delivered will contain the range missed for logging. Note that 48 | // ReportFn is called synchronously by the verifier so it should not block for 49 | // long otherwise it may cause the verifier to miss later checkpoints. 50 | type ReportFn func(VerificationReport) 51 | 52 | // ErrRangeMismatch is the error type returned in a VerificationReport where the 53 | // follower does not have enough logs on disk to fill the checkpoint's range and 54 | // so is bound to fail. This is a separate type from pure failures to read a log 55 | // because it's expected this could happen just after truncations or if the 56 | // interval is too large for the number of logs retained etc. Implementations may 57 | // choose to detect this and report it as a warning rather than a failure, as it 58 | // indicates only an inability to report correctly, not an actual error in 59 | // processing data. 60 | var ErrRangeMismatch = errors.New("range mismatch") 61 | 62 | // ErrChecksumMismatch is the error type returned in a VerificationReport where 63 | // the log range's checksum didn't match. 64 | type ErrChecksumMismatch string 65 | 66 | // Error implements error 67 | func (e ErrChecksumMismatch) Error() string { 68 | return string(e) 69 | } 70 | 71 | // LogRange describes the set of logs in the range [Start, End). That is End is 72 | // NOT inclusive. 73 | type LogRange struct { 74 | Start uint64 75 | End uint64 76 | } 77 | 78 | // String implements Stringer 79 | func (r LogRange) String() string { 80 | return fmt.Sprintf("[%d, %d)", r.Start, r.End) 81 | } 82 | 83 | // VerificationReport describes the result of attempting to verify the contents 84 | // of all logs in a range compared with the input the leader delivered for that 85 | // same range. 86 | type VerificationReport struct { 87 | // Range is the range of raft indexes over which the leader calculated its 88 | // checksum. In steady state it typically starts with the index of the 89 | // previous checkpoint command, but after an election it could be an arbitrary 90 | // point in the log. 
If the range is no longer in the server's log (due to not 91 | // seeing one yet or it being truncated too soon) this will be reported as an 92 | // Err - a longer log retention (`raft.Config.TrailingLogs`) or shorter 93 | // interval between checkpoints should be chosen if this happens often. 94 | Range LogRange 95 | 96 | // ExpectedSum is a uint64 checksum over the logs in the range as calculated 97 | // by the leader before appending to disk. 98 | ExpectedSum uint64 99 | 100 | // WrittenSum is the uint64 checksum calculated over the logs in the range of 101 | // a follower as it wrote them to its own LogStore. It might be zero to 102 | // indicate that the follower has not written all the logs in Range since 103 | // startup and so its written sum will be invalid. Risk of collision with 104 | // genuine zero sum is acceptable. If zero, the verifier will ignore it 105 | // and not raise an error if it doesn't match the expected sum. 106 | WrittenSum uint64 107 | 108 | // ReadSum is the uint64 checksum calculated over the logs in the range as 109 | // read from the underlying LogStore in the range [StartIndex, EndIndex). 110 | ReadSum uint64 111 | 112 | // Err indicates any error that prevented the report from being completed or 113 | // the result of the report. It will be set to ErrChecksumMismatch if the 114 | // report was conducted correctly, but the log data written or read checksum 115 | // did not match the leader's write checksum. The message in the error 116 | // describes the nature of the failure. 117 | Err error 118 | 119 | // SkippedRange indicates the range of logs covered by any checkpoints that 120 | // we skipped due to spending too much time verifying. If this is regularly 121 | // non-nil it likely indicates that the checkpoint frequency is too fast. 122 | SkippedRange *LogRange 123 | 124 | // Elapsed records how long it took to read the range and generate the report. 125 | Elapsed time.Duration 126 | } 127 | 128 | func (s *LogStore) runVerifier() { 129 | if s.reportFn == nil { 130 | // Nothing to do! 131 | return 132 | } 133 | 134 | var lastCheckPointIdx uint64 135 | for { 136 | report, ok := <-s.verifyCh 137 | if !ok { 138 | // Close was called 139 | return 140 | } 141 | 142 | // Detect skipped checkpoints 143 | if lastCheckPointIdx > 0 && lastCheckPointIdx != report.Range.Start { 144 | report.SkippedRange = &LogRange{ 145 | Start: lastCheckPointIdx, 146 | End: report.Range.Start, 147 | } 148 | } 149 | lastCheckPointIdx = report.Range.End 150 | 151 | st := time.Now() 152 | s.verify(&report) 153 | 154 | // Whatever state report ended up in, deliver it! 155 | report.Elapsed = time.Since(st) 156 | s.reportFn(report) 157 | s.metrics.IncrementCounter("ranges_verified", 1) 158 | } 159 | } 160 | 161 | func (s *LogStore) verify(report *VerificationReport) { 162 | // Attempt to read all the logs in the range from underlying store. 163 | var log raft.Log 164 | 165 | // If this is a follower but it _wrote_ different data to its log than the 166 | // leader in this range, then there's not much point verifying that we read it 167 | // back OK. 
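A sketch of a ReportFn that follows the guidance above, treating ErrRangeMismatch as a warning and checksum mismatches as hard errors; the log messages and function name are illustrative:

import (
	"errors"
	"log"

	"github.com/hashicorp/raft-wal/verifier"
)

func reportVerification(r verifier.VerificationReport) {
	var mismatch verifier.ErrChecksumMismatch
	switch {
	case errors.Is(r.Err, verifier.ErrRangeMismatch):
		// Expected just after truncation; not a data error.
		log.Printf("WARN: could not verify %s: range no longer on disk", r.Range)
	case errors.As(r.Err, &mismatch):
		log.Printf("ERROR: corruption detected: %v", mismatch)
	case r.Err != nil:
		log.Printf("ERROR: verification of %s failed: %v", r.Range, r.Err)
	default:
		log.Printf("verified %s (checksum %08x) in %s", r.Range, r.ReadSum, r.Elapsed)
	}
	if r.SkippedRange != nil {
		// Checkpoints may be arriving faster than they can be verified.
		log.Printf("WARN: skipped verification of %s", r.SkippedRange)
	}
}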
168 | if report.WrittenSum != 0 && report.WrittenSum != report.ExpectedSum { 169 | s.metrics.IncrementCounter("write_checksum_failures", 1) 170 | report.Err = ErrChecksumMismatch(fmt.Sprintf("log verification failed for range %s: "+ 171 | "in-flight corruption: follower wrote checksum=%08x, leader wrote checksum=%08x", 172 | report.Range, report.WrittenSum, report.ExpectedSum)) 173 | return 174 | } 175 | 176 | // Do we actually have enough logs to calculate the checksum? If not, indicate 177 | // that explicitly as it's an expected case rather than a real "error". Note 178 | // that we may get a racy false negative here if truncation happens right 179 | // between this check and the GetLog call below but there's not much we can do 180 | // about that and hopefully it's rare enough! 181 | first, err := s.s.FirstIndex() 182 | if err != nil { 183 | report.Err = fmt.Errorf("unable to verify log range %s: %w", report.Range, err) 184 | return 185 | } 186 | if first > report.Range.Start { 187 | // We don't have enough logs to calculate this correctly. 188 | report.Err = ErrRangeMismatch 189 | return 190 | } 191 | 192 | sum := uint64(0) 193 | for idx := report.Range.Start; idx < report.Range.End; idx++ { 194 | err := s.s.GetLog(idx, &log) 195 | if err != nil { 196 | report.Err = fmt.Errorf("unable to verify log range %s: %w", report.Range, err) 197 | return 198 | } 199 | sum = checksumLog(sum, &log) 200 | } 201 | report.ReadSum = sum 202 | 203 | if report.ReadSum != report.ExpectedSum { 204 | s.metrics.IncrementCounter("read_checksum_failures", 1) 205 | report.Err = ErrChecksumMismatch(fmt.Sprintf("log verification failed for range %s: "+ 206 | "storage corruption: node read checksum=%08x, leader wrote checksum=%08x", 207 | report.Range, report.ReadSum, report.ExpectedSum)) 208 | return 209 | } 210 | } 211 | 212 | func checksumLog(sum uint64, log *raft.Log) uint64 { 213 | // Special case for bootstrap config entries (index 1, type configuration) 214 | // since these are not replicated by raft and so may not be byte-for-byte 215 | // identical as long as they are logically the same on all peers. So just treat 216 | // them all as identical to avoid false positives on startup. 217 | if log.Index == 1 && log.Type == raft.LogConfiguration { 218 | return 0 219 | } 220 | sum = fnv1a.AddUint64(sum, log.Index) 221 | sum = fnv1a.AddUint64(sum, log.Term) 222 | sum = fnv1a.AddUint64(sum, uint64(log.Type)) 223 | sum = fnv1a.AddBytes64(sum, log.Data) 224 | if len(log.Extensions) > 0 { 225 | sum = fnv1a.AddBytes64(sum, log.Extensions) 226 | } 227 | return sum 228 | } 229 | -------------------------------------------------------------------------------- /migrate/migrate_test.go: -------------------------------------------------------------------------------- 1 | // Copyright IBM Corp. 2020, 2025 2 | // SPDX-License-Identifier: MPL-2.0 3 | 4 | package migrate 5 | 6 | import ( 7 | "context" 8 | "encoding/binary" 9 | "fmt" 10 | "strings" 11 | "testing" 12 | "time" 13 | 14 | "github.com/hashicorp/raft" 15 | "github.com/hashicorp/raft-wal/types" 16 | "github.com/stretchr/testify/require" 17 | ) 18 | 19 | func TestCopyLogs(t *testing.T) { 20 | cases := []struct { 21 | name string 22 | startIndex uint64 23 | numLogs int 24 | batchBytes int 25 | wantNumBatches int 26 | nilChan bool 27 | cancelCtx bool 28 | wantErr string 29 | }{ 30 | { 31 | name: "basic copy", 32 | startIndex: 1234, 33 | numLogs: 1000, 34 | // Each log is 26 bytes but we assume 32 bytes of overhead for encoding in 
So each log takes up 58 bytes of our batch size. 550 bytes is 36 | // not quite enough for 10 but we treat it as a soft limit so we'll get 10 37 | // per batch 38 | batchBytes: 550, 39 | wantNumBatches: 100, 40 | }, 41 | { 42 | name: "start from 1", 43 | startIndex: 1, 44 | numLogs: 1000, 45 | batchBytes: 580, // Exact fit for 10 entries 46 | wantNumBatches: 100, 47 | }, 48 | { 49 | name: "nil progress chan", 50 | startIndex: 1234, 51 | numLogs: 1000, 52 | batchBytes: 580, 53 | wantNumBatches: 100, 54 | // A nil progress chan shouldn't block the copy. 55 | nilChan: true, 56 | }, 57 | { 58 | name: "context cancel", 59 | startIndex: 1234, 60 | numLogs: 1000, 61 | batchBytes: 580, 62 | wantNumBatches: 0, 63 | cancelCtx: true, 64 | wantErr: "context canceled", 65 | }, 66 | } 67 | 68 | for _, tc := range cases { 69 | tc := tc 70 | t.Run(tc.name, func(t *testing.T) { 71 | src := populateTestLogStore(t, tc.startIndex, tc.numLogs) 72 | dst := &testLogStore{} 73 | var progress chan string 74 | if !tc.nilChan { 75 | // Buffer it more than enough so we won't have to read concurrently. 76 | progress = make(chan string, tc.wantNumBatches*3) 77 | } 78 | 79 | ctx := context.Background() 80 | if tc.cancelCtx { 81 | cancelledCtx, cancel := context.WithCancel(ctx) 82 | cancel() 83 | ctx = cancelledCtx 84 | } 85 | 86 | err := CopyLogs(ctx, dst, src, tc.batchBytes, progress) 87 | if tc.wantErr != "" { 88 | require.ErrorContains(t, err, tc.wantErr) 89 | return 90 | } 91 | require.NoError(t, err, "failed copy") 92 | 93 | if progress != nil { 94 | // This loop will not return if progress wasn't closed. 95 | for s := range progress { 96 | t.Log(s) 97 | } 98 | } 99 | 100 | // Verify the copy! 101 | wantFirst, _ := src.FirstIndex() 102 | wantLast, _ := src.LastIndex() 103 | gotFirst, _ := dst.FirstIndex() 104 | gotLast, _ := dst.LastIndex() 105 | require.Equal(t, int(wantFirst), int(gotFirst)) 106 | require.Equal(t, int(wantLast), int(gotLast)) 107 | 108 | var log raft.Log 109 | for idx := wantFirst; idx <= wantLast; idx++ { 110 | err := dst.GetLog(idx, &log) 111 | require.NoError(t, err) 112 | require.Equal(t, int(idx), int(log.Index)) 113 | require.Equal(t, string(logPayload(idx)), string(log.Data)) 114 | } 115 | 116 | // Validate that we actually split into chunks as expected. 
117 | require.Equal(t, tc.wantNumBatches, dst.appends) 118 | }) 119 | } 120 | } 121 | 122 | func TestCopyStable(t *testing.T) { 123 | cases := []struct { 124 | name string 125 | srcVals map[string]string 126 | srcIntVals map[string]uint64 127 | extraKeys [][]byte 128 | extraIntKeys [][]byte 129 | nilChan bool 130 | cancelCtx bool 131 | wantErr string 132 | }{ 133 | { 134 | name: "basic raft data", 135 | srcVals: map[string]string{ 136 | "LastVoteCand": "s1", 137 | }, 138 | srcIntVals: map[string]uint64{ 139 | "CurrentTerm": 1234, 140 | "LastVoteTerm": 1000, 141 | }, 142 | }, 143 | { 144 | name: "context cancelled", 145 | srcVals: map[string]string{ 146 | "LastVoteCand": "s1", 147 | }, 148 | srcIntVals: map[string]uint64{ 149 | "CurrentTerm": 1234, 150 | "LastVoteTerm": 1000, 151 | }, 152 | cancelCtx: true, 153 | wantErr: "context canceled", 154 | }, 155 | { 156 | name: "additional keys", 157 | srcVals: map[string]string{ 158 | "LastVoteCand": "s1", 159 | "my_app_key": "foo", 160 | "my_other_key": "baz", 161 | "no_copy": "bar", 162 | }, 163 | srcIntVals: map[string]uint64{ 164 | "CurrentTerm": 1234, 165 | "LastVoteTerm": 1000, 166 | "favorite_term_so_far": 569, 167 | "least_favorite_number_no_copy": 4321, 168 | }, 169 | extraKeys: [][]byte{[]byte("my_app_key"), []byte("my_other_key")}, 170 | extraIntKeys: [][]byte{[]byte("favorite_term_so_far")}, 171 | }, 172 | } 173 | 174 | for _, tc := range cases { 175 | tc := tc 176 | t.Run(tc.name, func(t *testing.T) { 177 | src := newTestStableStore() 178 | dst := newTestStableStore() 179 | 180 | // Insert src values: 181 | for k, v := range tc.srcIntVals { 182 | err := src.SetUint64([]byte(k), v) 183 | require.NoError(t, err) 184 | } 185 | for k, v := range tc.srcVals { 186 | err := src.Set([]byte(k), []byte(v)) 187 | require.NoError(t, err) 188 | } 189 | 190 | var progress chan string 191 | if !tc.nilChan { 192 | // Buffer it more than enough so we won't have to read concurrently. 193 | progress = make(chan string, (len(tc.srcIntVals)+len(tc.srcVals))*3) 194 | } 195 | 196 | ctx := context.Background() 197 | if tc.cancelCtx { 198 | cancelledCtx, cancel := context.WithCancel(ctx) 199 | cancel() 200 | ctx = cancelledCtx 201 | } 202 | err := CopyStable(ctx, dst, src, tc.extraKeys, tc.extraIntKeys, progress) 203 | if tc.wantErr != "" { 204 | require.ErrorContains(t, err, tc.wantErr) 205 | return 206 | } 207 | require.NoError(t, err, "failed copy") 208 | 209 | for s := range progress { 210 | // This loop will not return if progress wasn't closed. 211 | t.Log(s) 212 | } 213 | 214 | // Verify the copy! 
215 | for k, v := range tc.srcIntVals { 216 | if strings.HasSuffix(k, "no_copy") { 217 | continue 218 | } 219 | got, err := dst.GetUint64([]byte(k)) 220 | require.NoError(t, err) 221 | require.Equal(t, int(v), int(got), "wrong int value copied for key %s", k) 222 | } 223 | for k, v := range tc.srcVals { 224 | if strings.HasSuffix(k, "no_copy") { 225 | continue 226 | } 227 | got, err := dst.Get([]byte(k)) 228 | require.NoError(t, err) 229 | require.Equal(t, v, string(got), "wrong value copied for key %s", k) 230 | } 231 | }) 232 | } 233 | } 234 | 235 | func logPayload(idx uint64) string { 236 | return fmt.Sprintf("Log entry for index %6d", idx) 237 | } 238 | 239 | func populateTestLogStore(t *testing.T, startIdx uint64, n int) *testLogStore { 240 | t.Helper() 241 | ls := &testLogStore{} 242 | for idx := startIdx; idx < (startIdx + uint64(n)); idx++ { 243 | err := ls.StoreLog(&raft.Log{ 244 | Index: idx, 245 | Data: []byte(logPayload(idx)), 246 | AppendedAt: time.Now(), 247 | }) 248 | require.NoError(t, err) 249 | } 250 | return ls 251 | } 252 | 253 | type testLogStore struct { 254 | appends int 255 | logs []*raft.Log 256 | } 257 | 258 | // FirstIndex returns the first index written. 0 for no entries. 259 | func (s *testLogStore) FirstIndex() (uint64, error) { 260 | if len(s.logs) < 1 { 261 | return 0, nil 262 | } 263 | return s.logs[0].Index, nil 264 | } 265 | 266 | // LastIndex returns the last index written. 0 for no entries. 267 | func (s *testLogStore) LastIndex() (uint64, error) { 268 | if len(s.logs) < 1 { 269 | return 0, nil 270 | } 271 | return s.logs[len(s.logs)-1].Index, nil 272 | } 273 | 274 | // GetLog gets a log entry at a given index. 275 | func (s *testLogStore) GetLog(index uint64, log *raft.Log) error { 276 | first, _ := s.FirstIndex() 277 | last, _ := s.LastIndex() 278 | if first == 0 || index < first || index > last { 279 | return types.ErrNotFound 280 | } 281 | offset := index - first 282 | *log = *s.logs[offset] 283 | return nil 284 | } 285 | 286 | // StoreLog stores a log entry. 287 | func (s *testLogStore) StoreLog(log *raft.Log) error { 288 | return s.StoreLogs([]*raft.Log{log}) 289 | } 290 | 291 | // StoreLogs stores multiple log entries. 292 | func (s *testLogStore) StoreLogs(logs []*raft.Log) error { 293 | last, _ := s.LastIndex() 294 | prev := last 295 | for _, log := range logs { 296 | if prev > 0 && (prev+1) != log.Index { 297 | return fmt.Errorf("logs out of sequence got index=%d expecting index=%d", log.Index, prev+1) 298 | } 299 | s.logs = append(s.logs, log) 300 | prev = log.Index 301 | } 302 | s.appends++ 303 | return nil 304 | } 305 | 306 | // DeleteRange deletes a range of log entries. The range is inclusive. 307 | func (s *testLogStore) DeleteRange(min uint64, max uint64) error { 308 | panic("not implemented") // Don't need this in this package. 309 | } 310 | 311 | type testStableStore struct { 312 | d map[string][]byte 313 | } 314 | 315 | func newTestStableStore() *testStableStore { 316 | return &testStableStore{ 317 | d: make(map[string][]byte), 318 | } 319 | } 320 | 321 | func (s *testStableStore) Set(key []byte, val []byte) error { 322 | s.d[(string(key))] = val 323 | return nil 324 | } 325 | 326 | // Get returns the value for key, or an empty byte slice if key was not found. 
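The tests above exercise the package's copy helpers end to end. A hedged usage sketch for a real migration follows, assuming src and dst satisfy the raft LogStore/StableStore interfaces; the nil progress channel is explicitly supported per the nilChan case above, and the batch size and extra key are illustrative:

import (
	"context"

	"github.com/hashicorp/raft"
	"github.com/hashicorp/raft-wal/migrate"
)

func migrateStores(ctx context.Context, dstLogs, srcLogs raft.LogStore, dstStable, srcStable raft.StableStore) error {
	// Copy all logs in ~1MiB batches (a soft limit, as the tests show).
	if err := migrate.CopyLogs(ctx, dstLogs, srcLogs, 1024*1024, nil); err != nil {
		return err
	}
	// Copy raft's stable keys plus one hypothetical application key.
	extraKeys := [][]byte{[]byte("my_app_key")}
	return migrate.CopyStable(ctx, dstStable, srcStable, extraKeys, nil, nil)
}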
327 | func (s *testStableStore) Get(key []byte) ([]byte, error) { 328 | return s.d[string(key)], nil 329 | } 330 | 331 | func (s *testStableStore) SetUint64(key []byte, val uint64) error { 332 | var buf [8]byte 333 | binary.BigEndian.PutUint64(buf[:], val) 334 | s.d[(string(key))] = buf[:] 335 | return nil 336 | } 337 | 338 | // GetUint64 returns the uint64 value for key, or 0 if key was not found. 339 | func (s *testStableStore) GetUint64(key []byte) (uint64, error) { 340 | v, ok := s.d[string(key)] 341 | if !ok { 342 | return 0, nil 343 | } 344 | return binary.BigEndian.Uint64(v), nil 345 | } 346 | -------------------------------------------------------------------------------- /segment/filer.go: -------------------------------------------------------------------------------- 1 | // Copyright IBM Corp. 2020, 2025 2 | // SPDX-License-Identifier: MPL-2.0 3 | 4 | package segment 5 | 6 | import ( 7 | "errors" 8 | "fmt" 9 | "io" 10 | "strings" 11 | "sync" 12 | 13 | "github.com/hashicorp/raft-wal/types" 14 | ) 15 | 16 | const ( 17 | segmentFileSuffix = ".wal" 18 | segmentFileNamePattern = "%020d-%016x" + segmentFileSuffix 19 | ) 20 | 21 | // Filer implements the abstraction for managing a set of segment files in a 22 | // directory. It uses a VFS to abstract actual file system operations for easier 23 | // testing. 24 | type Filer struct { 25 | dir string 26 | vfs types.VFS 27 | bufPool sync.Pool 28 | } 29 | 30 | // NewFiler creates a Filer ready for use. 31 | func NewFiler(dir string, vfs types.VFS) *Filer { 32 | f := &Filer{ 33 | dir: dir, 34 | vfs: vfs, 35 | } 36 | f.bufPool.New = func() interface{} { 37 | return make([]byte, minBufSize) 38 | } 39 | return f 40 | } 41 | 42 | // FileName returns the formatted file name expected for this segment. 43 | // SegmentFiler implementations could choose to ignore this but it's here to keep file naming consistent across implementations. 44 | func FileName(i types.SegmentInfo) string { 45 | return fmt.Sprintf(segmentFileNamePattern, i.BaseIndex, i.ID) 46 | } 47 | 48 | // Create adds a new segment with the given info and returns a writer or an 49 | // error. 50 | func (f *Filer) Create(info types.SegmentInfo) (types.SegmentWriter, error) { 51 | if info.BaseIndex == 0 { 52 | return nil, fmt.Errorf("BaseIndex must be greater than zero") 53 | } 54 | fname := FileName(info) 55 | 56 | wf, err := f.vfs.Create(f.dir, fname, uint64(info.SizeLimit)) 57 | if err != nil { 58 | return nil, err 59 | } 60 | 61 | return createFile(info, wf, &f.bufPool) 62 | } 63 | 64 | // RecoverTail is called on an unsealed segment when re-opening the WAL; it will 65 | // attempt to recover from a possible crash. It will either return an error, or 66 | // return a valid segmentWriter that is ready for further appends. If the 67 | // expected tail segment doesn't exist it must return an error wrapping 68 | // os.ErrNotExist. 69 | func (f *Filer) RecoverTail(info types.SegmentInfo) (types.SegmentWriter, error) { 70 | fname := FileName(info) 71 | 72 | wf, err := f.vfs.OpenWriter(f.dir, fname) 73 | if err != nil { 74 | return nil, err 75 | } 76 | 77 | return recoverFile(info, wf, &f.bufPool) 78 | } 79 | 80 | // Open an already sealed segment for reading. Open may validate the file's 81 | // header and return an error if it doesn't match the expected info. 
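A small sketch of the naming scheme and Filer construction (the directory path is hypothetical; vfs is any types.VFS implementation, e.g. the real one from the fs package):

import (
	"fmt"

	"github.com/hashicorp/raft-wal/segment"
	"github.com/hashicorp/raft-wal/types"
)

func exampleFiler(vfs types.VFS) error {
	f := segment.NewFiler("/var/lib/myapp/wal", vfs)
	info := types.SegmentInfo{BaseIndex: 1, ID: 0xabcd}
	// Zero-padded decimal BaseIndex then hex ID, so segment files sort
	// lexicographically by base index:
	fmt.Println(segment.FileName(info)) // 00000000000000000001-000000000000abcd.wal
	// Create pre-allocates the file via the VFS and returns a SegmentWriter
	// ready for appends.
	_, err := f.Create(info)
	return err
}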
82 | func (f *Filer) Open(info types.SegmentInfo) (types.SegmentReader, error) { 83 | fname := FileName(info) 84 | 85 | rf, err := f.vfs.OpenReader(f.dir, fname) 86 | if err != nil { 87 | return nil, err 88 | } 89 | 90 | // Validate header here since openReader is re-used by writer where it's valid 91 | // for the file header not to be committed yet after a crash so we can't check 92 | // it there. 93 | var hdr [fileHeaderLen]byte 94 | 95 | if _, err := rf.ReadAt(hdr[:], 0); err != nil { 96 | if errors.Is(err, io.EOF) { 97 | // Treat failure to read a header as corruption since a sealed file should 98 | // never not have a valid header. (I.e. even if crashes happen it should 99 | // be impossible to seal a segment with no header written so this 100 | // indicates that something truncated the file after the fact) 101 | return nil, fmt.Errorf("%w: failed to read header: %s", types.ErrCorrupt, err) 102 | } 103 | return nil, err 104 | } 105 | 106 | gotInfo, err := readFileHeader(hdr[:]) 107 | if err != nil { 108 | return nil, err 109 | } 110 | 111 | if err := validateFileHeader(*gotInfo, info); err != nil { 112 | return nil, err 113 | } 114 | 115 | return openReader(info, rf, &f.bufPool) 116 | } 117 | 118 | // List returns the set of segment IDs currently stored. It's used by the WAL 119 | // on recovery to find any segment files that need to be deleted following an 120 | // unclean shutdown. The returned map is a map of ID -> BaseIndex. BaseIndex 121 | // is returned to allow subsequent Delete calls to be made. 122 | func (f *Filer) List() (map[uint64]uint64, error) { 123 | segs, _, err := f.listInternal() 124 | return segs, err 125 | } 126 | 127 | func (f *Filer) listInternal() (map[uint64]uint64, []uint64, error) { 128 | files, err := f.vfs.ListDir(f.dir) 129 | if err != nil { 130 | return nil, nil, err 131 | } 132 | 133 | segs := make(map[uint64]uint64) 134 | sorted := make([]uint64, 0) 135 | for _, file := range files { 136 | if !strings.HasSuffix(file, segmentFileSuffix) { 137 | continue 138 | } 139 | // Parse BaseIndex and ID from the file name 140 | var bIdx, id uint64 141 | n, err := fmt.Sscanf(file, segmentFileNamePattern, &bIdx, &id) 142 | if err != nil { 143 | return nil, nil, types.ErrCorrupt 144 | } 145 | if n != 2 { 146 | // Misnamed segment files with the right suffix indicate a bug or 147 | // tampering, we can't be sure what's happened to the data. 148 | return nil, nil, types.ErrCorrupt 149 | } 150 | segs[id] = bIdx 151 | sorted = append(sorted, id) 152 | } 153 | 154 | return segs, sorted, nil 155 | } 156 | 157 | // Delete removes the segment with given baseIndex and id if it exists. Note 158 | // that baseIndex is technically redundant since ID is unique on its own. But 159 | // in practice we name files (or keys) with both so that they sort correctly. 160 | // This interface allows a simpler implementation where we can just delete 161 | // the file if it exists without having to scan the underlying storage for a matching file. 162 | func (f *Filer) Delete(baseIndex uint64, ID uint64) error { 163 | fname := fmt.Sprintf(segmentFileNamePattern, baseIndex, ID) 164 | return f.vfs.Delete(f.dir, fname) 165 | } 166 | 167 | // DumpSegment attempts to read the segment file specified by the baseIndex and 168 | // ID. Its intended purpose is for debugging the contents of segment files and 169 | // unlike the SegmentFiler interface, it doesn't assume the caller has access to 170 | // the correct metadata. 
This allows dumping log segments in a WAL that is still 171 | // being written to by another process. Without metadata we don't know if the 172 | // file is sealed so always recover by reading through the whole file. If after 173 | // or before are non-zero, they specify an exclusive lower or upper bound on which 174 | // log entries should be emitted. No error checking is done on the read data. fn 175 | // is called for each entry passing the raft info read from the file header (so 176 | // that the caller knows which codec to use, for example), the raft index of the 177 | // entry, and the raw bytes of the entry itself. The callback must return true to 178 | // continue reading. The data slice is only valid for the lifetime of the call. 179 | func (f *Filer) DumpSegment(baseIndex uint64, ID uint64, after, before uint64, fn func(info types.SegmentInfo, e types.LogEntry) (bool, error)) error { 180 | fname := fmt.Sprintf(segmentFileNamePattern, baseIndex, ID) 181 | 182 | rf, err := f.vfs.OpenReader(f.dir, fname) 183 | if err != nil { 184 | return err 185 | } 186 | 187 | buf := make([]byte, 64*1024) 188 | idx := baseIndex 189 | 190 | type frameInfo struct { 191 | Index uint64 192 | Offset int64 193 | Len uint32 194 | } 195 | var batch []frameInfo 196 | 197 | _, err = readThroughSegment(rf, func(info types.SegmentInfo, fh frameHeader, offset int64) (bool, error) { 198 | if fh.typ == FrameCommit { 199 | // All the previous entries have been committed. Read them and send up to 200 | // caller. 201 | for _, frame := range batch { 202 | // Check the header is reasonable 203 | if frame.Len > MaxEntrySize { 204 | return false, fmt.Errorf("failed to read entry idx=%d, frame header length (%d) is larger than MaxEntrySize (%d)", 205 | frame.Index, frame.Len, MaxEntrySize) 206 | } 207 | 208 | if frame.Len > uint32(len(buf)) { 209 | buf = make([]byte, frame.Len) 210 | } 211 | 212 | n, err := rf.ReadAt(buf[:frame.Len], frame.Offset+frameHeaderLen) 213 | if err != nil { 214 | return false, err 215 | } 216 | if uint32(n) < frame.Len { 217 | return false, io.ErrUnexpectedEOF 218 | } 219 | 220 | ok, err := fn(info, types.LogEntry{Index: frame.Index, Data: buf[:n]}) 221 | if !ok || err != nil { 222 | return ok, err 223 | } 224 | } 225 | // Reset batch 226 | batch = batch[:0] 227 | return true, nil 228 | } 229 | 230 | if fh.typ != FrameEntry { 231 | return true, nil 232 | } 233 | 234 | if idx <= after { 235 | // Not in the range we care about, skip reading the entry. 236 | idx++ 237 | return true, nil 238 | } 239 | if before > 0 && idx >= before { 240 | // We're done 241 | return false, nil 242 | } 243 | 244 | batch = append(batch, frameInfo{idx, offset, fh.len}) 245 | idx++ 246 | return true, nil 247 | }) 248 | 249 | return err 250 | } 251 | 252 | // DumpLogs attempts to read all log entries from segment files in the directory 253 | // for debugging purposes. It does _not_ use the metadata and so may output log 254 | // entries that are uncommitted or already truncated as far as the writing 255 | // process is concerned. As such it should not be used for replication of data. 256 | // It is useful though to debug the contents of the log even while the writing 257 | // application is still running. After and before, if non-zero, specify exclusive 258 | // bounds on the logs that should be returned, which may allow the implementation 259 | // to skip reading entire segment files that are not in the range. 
260 | func (f *Filer) DumpLogs(after, before uint64, fn func(info types.SegmentInfo, e types.LogEntry) (bool, error)) error { 261 | baseIndexes, segIDsSorted, err := f.listInternal() 262 | if err != nil { 263 | return err 264 | } 265 | 266 | for i, id := range segIDsSorted { 267 | baseIndex := baseIndexes[id] 268 | nextBaseIndex := uint64(0) 269 | if i+1 < len(segIDsSorted) { 270 | // This is not the last segment, peek at the base index of that one and 271 | // assume that this segment won't contain indexes that high. 272 | nextBaseIndex = baseIndexes[segIDsSorted[i+1]] 273 | } 274 | // See if this file contains any indexes in the range 275 | if after > 0 && nextBaseIndex > 0 && after >= nextBaseIndex { 276 | // This segment is all indexes before the lower bound we care about 277 | continue 278 | } 279 | if before > 0 && before <= baseIndex { 280 | // This segment is all indexes higher than the upper bound. We've output 281 | // every log in the range at this point (barring edge cases where we race 282 | // with a truncation which leaves multiple generations of segment files on 283 | // disk which we are going to ignore for now). 284 | return nil 285 | } 286 | 287 | // We probably care about at least some of the entries in this segment 288 | err := f.DumpSegment(baseIndex, id, after, before, fn) 289 | if err != nil { 290 | return err 291 | } 292 | } 293 | 294 | return nil 295 | } 296 | --------------------------------------------------------------------------------
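Putting DumpSegment and DumpLogs to use: a hedged sketch of a debugging helper that prints every recoverable entry in a WAL directory, even one that another process is still writing (the vfs parameter is any types.VFS implementation):

import (
	"fmt"

	"github.com/hashicorp/raft-wal/segment"
	"github.com/hashicorp/raft-wal/types"
)

func dumpAll(dir string, vfs types.VFS) error {
	f := segment.NewFiler(dir, vfs)
	// after=0 and before=0 disable both bounds: emit everything on disk.
	return f.DumpLogs(0, 0, func(info types.SegmentInfo, e types.LogEntry) (bool, error) {
		fmt.Printf("idx=%d len=%d\n", e.Index, len(e.Data))
		return true, nil // keep reading
	})
}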