├── block_reader
    ├── factory.go
    ├── factory_legacy.go
    ├── pread.go
    ├── interface.go
    └── iouring.go
├── consts.go
├── .gitignore
├── bench
    ├── benchmark3
    ├── benchmark1
    ├── bench_compaction_test.go
    ├── gc_overhead_test.go
    ├── bench_disk_usage_test.go
    └── bench_test.go
├── go.mod
├── wal_rewriter.go
├── .golangci.yml
├── batch.go
├── wal_iterator_test.go
├── db_test.go
├── meta.go
├── manifest_edit_test.go
├── hint_test.go
├── wal_iterator.go
├── manifest_txn.go
├── index_test.go
├── index.go
├── utils.go
├── go.sum
├── manifest_edit.go
├── record_test.go
├── manifest_txn_test.go
├── hint.go
├── deque.go
├── db.go
├── block_cache.go
├── manifest_test.go
├── deque_test.go
├── wal_test.go
├── record.go
├── map_test.go
├── README-CN.md
├── db_impl_test.go
├── README.md
├── compaction_test.go
├── map.go
├── compaction.go
├── manifest.go
└── wal.go


/block_reader/factory.go:
--------------------------------------------------------------------------------
1 | //go:build io_uring
2 | // +build io_uring
3 | 
4 | package block_reader
5 | 
6 | func NewDefaultBlockReader(concurrent uint64) (BlockReader, error) {
7 | 	return NewIOUringBlockReader(concurrent)
8 | }
9 | 


--------------------------------------------------------------------------------
/block_reader/factory_legacy.go:
--------------------------------------------------------------------------------
1 | //go:build !io_uring
2 | // +build !io_uring
3 | 
4 | package block_reader
5 | 
6 | func NewDefaultBlockReader(concurrent uint64) (BlockReader, error) {
7 | 	return NewPreadBlockReader(concurrent)
8 | }
9 | 


--------------------------------------------------------------------------------
/consts.go:
--------------------------------------------------------------------------------
 1 | package bitcask
 2 | 
 3 | const (
 4 | 	DefaultNsSize   = 20
 5 | 	DefaultEtagSize = 20
 6 | 
 7 | 	// trigger one compaction per 60 second
 8 | 	DefaultCompactionTriggerInterval = 60
 9 | 
10 | 	DefaultCheckDiskUsageInterval = 20
11 | 
12 | 	DefaultCompactionPickerRatio = 0.4
13 | 
14 | 	DefaultRecordBufferSize = 64 * 1024 // 64KB
15 | 
16 | 	DefaultLogMaxSize = 20 // 20MB
17 | 
18 | 	DefaultLogFile = "db.log"
19 | )
20 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # If you prefer the allow list template instead of the deny list, see community template:
 2 | # https://github.com/github/gitignore/blob/main/community/Golang/Go.AllowList.gitignore
 3 | #
 4 | # Binaries for programs and plugins
 5 | *.exe
 6 | *.exe~
 7 | *.dll
 8 | *.so
 9 | *.dylib
10 | 
11 | # Test binary, built with `go test -c`
12 | *.test
13 | 
14 | # Output of the go coverage tool, specifically when used with LiteIDE
15 | *.out
16 | 
17 | # Dependency directories (remove the comment below to include it)
18 | # vendor/
19 | 
20 | # Go workspace file
21 | go.work
22 | go.work.sum
23 | 
24 | # env file
25 | .env
26 | 


--------------------------------------------------------------------------------
/block_reader/pread.go:
--------------------------------------------------------------------------------
 1 | package block_reader
 2 | 
 3 | import (
 4 | 	"golang.org/x/sys/unix"
 5 | )
 6 | 
 7 | type PreadBlockReader struct{}
 8 | 
 9 | func NewPreadBlockReader(concurrent uint64) (*PreadBlockReader, error) {
10 | 	return &PreadBlockReader{}, nil
11 | }
12 | 
13 | func (r *PreadBlockReader) NewRequest(fd int, fid, offset uint64, blk []byte) *Request {
14 | 	return &Request{
15 | 		Fd:  fd,
16 | 		Fid: fid,
17 | 		Off: offset,
18 | 		Blk: blk,
19 | 		res: 0,
20 | 		err: nil,
21 | 	}
22 | }
23 | 
24 | func (r *PreadBlockReader) Submit(reqs Requests) error {
25 | 	for idx, req := range reqs {
26 | 		n, err := unix.Pread(req.Fd, req.Blk, int64(req.Off))
27 | 		reqs[idx].err = err
28 | 		reqs[idx].res = n
29 | 	}
30 | 
31 | 	return nil
32 | }
33 | 


--------------------------------------------------------------------------------
/bench/benchmark3:
--------------------------------------------------------------------------------
 1 | go test -bench=GcOverhead -benchtime=300s -count=1 -timeout=30m
 2 | goos: linux
 3 | goarch: amd64
 4 | pkg: github.com/wenzhang-dev/bitcaskDB/bench
 5 | cpu: Intel(R) Xeon(R) Gold 5318N CPU @ 2.10GHz
 6 | BenchmarkGcOverhead-8
 7 | GC pause for startup: gc=1.246146ms
 8 | GC pause for test: total=64.350396ms, gc=160.293µs, iter=1
 9 | GC pause for startup: gc=719.201µs
10 | GC pause for test: total=105.89436ms, gc=176.146µs, iter=100
11 | GC pause for startup: gc=202.888µs
12 | GC pause for test: total=150.224754ms, gc=63.252µs, iter=10000
13 | GC pause for startup: gc=323.495µs
14 | GC pause for test: total=5.88998191s, gc=1.734889ms, iter=1000000
15 | GC pause for startup: gc=181.486µs
16 | GC pause for test: total=14m31.795948219s, gc=86.133309ms, iter=59247226
17 | 59247226      14743 ns/op     1698 B/op       10 allocs/op
18 | PASS
19 | ok   github.com/wenzhang-dev/bitcaskDB/bench 884.982s
20 | 


--------------------------------------------------------------------------------
/go.mod:
--------------------------------------------------------------------------------
 1 | module github.com/wenzhang-dev/bitcaskDB
 2 | 
 3 | go 1.23.0
 4 | 
 5 | toolchain go1.24.2
 6 | 
 7 | require (
 8 | 	github.com/gofrs/flock v0.12.1
 9 | 	github.com/iceber/iouring-go v0.0.0
10 | 	github.com/rs/zerolog v1.34.0
11 | 	github.com/spaolacci/murmur3 v1.1.0
12 | 	github.com/stretchr/testify v1.9.0
13 | 	github.com/vmihailenco/msgpack/v5 v5.4.1
14 | 	golang.org/x/sys v0.32.0
15 | 	gopkg.in/natefinch/lumberjack.v2 v2.2.1
16 | )
17 | 
18 | require (
19 | 	github.com/davecgh/go-spew v1.1.1 // indirect
20 | 	github.com/kr/text v0.2.0 // indirect
21 | 	github.com/mattn/go-colorable v0.1.13 // indirect
22 | 	github.com/mattn/go-isatty v0.0.19 // indirect
23 | 	github.com/pmezard/go-difflib v1.0.0 // indirect
24 | 	github.com/rogpeppe/go-internal v1.14.1 // indirect
25 | 	github.com/vmihailenco/tagparser/v2 v2.0.0 // indirect
26 | 	gopkg.in/yaml.v3 v3.0.1 // indirect
27 | )
28 | 
29 | replace github.com/iceber/iouring-go => github.com/royalcat/iouring-go v0.0.0-20240925200811-286062ac1b23
30 | 


--------------------------------------------------------------------------------
/wal_rewriter.go:
--------------------------------------------------------------------------------
 1 | package bitcask
 2 | 
 3 | type WalRewriter struct {
 4 | 	wal *Wal
 5 | 
 6 | 	bufLen    int
 7 | 	threshold int
 8 | }
 9 | 
10 | func NewWalRewriter(wal *Wal, threshold int) *WalRewriter {
11 | 	if threshold < 4*1024 {
12 | 		threshold = 4 * 1024
13 | 	}
14 | 
15 | 	wal.Ref()
16 | 	return &WalRewriter{
17 | 		wal:       wal,
18 | 		bufLen:    0,
19 | 		threshold: threshold,
20 | 	}
21 | }
22 | 
23 | func (r *WalRewriter) Wal() *Wal {
24 | 	return r.wal
25 | }
26 | 
27 | func (r *WalRewriter) Close() error {
28 | 	if r.bufLen != 0 {
29 | 		if err := r.wal.Flush(); err != nil {
30 | 			return err
31 | 		}
32 | 	}
33 | 	r.wal.Unref()
34 | 	return nil
35 | }
36 | 
37 | func (r *WalRewriter) AppendRecord(record []byte) (off uint64, err error) {
38 | 	off, err = r.wal.WriteRecord(record)
39 | 	if err != nil {
40 | 		return 0, err
41 | 	}
42 | 
43 | 	r.bufLen += len(record)
44 | 	if r.bufLen >= r.threshold {
45 | 		err = r.Flush()
46 | 	}
47 | 
48 | 	return
49 | }
50 | 
51 | func (r *WalRewriter) Flush() error {
52 | 	if r.bufLen != 0 {
53 | 		if err := r.wal.Flush(); err != nil {
54 | 			return err
55 | 		}
56 | 		r.bufLen = 0
57 | 	}
58 | 	return nil
59 | }
60 | 


--------------------------------------------------------------------------------
/.golangci.yml:
--------------------------------------------------------------------------------
 1 | version: "2"
 2 | linters:
 3 |   default: none
 4 |   enable:
 5 |     - durationcheck
 6 |     - forcetypeassert
 7 |     - gocritic
 8 |     - gomodguard
 9 |     - govet
10 |     - ineffassign
11 |     - misspell
12 |     - revive
13 |     - staticcheck
14 |     - unconvert
15 |     - unused
16 |     - usetesting
17 |     - whitespace
18 |   settings:
19 |     misspell:
20 |       locale: US
21 |     staticcheck:
22 |       checks:
23 |         - all
24 |         - -SA1008
25 |         - -SA1019
26 |         - -SA4000
27 |         - -SA9004
28 |         - -ST1000
29 |         - -ST1005
30 |         - -ST1016
31 |         - -U1000
32 |   exclusions:
33 |     generated: lax
34 |     rules:
35 |       - linters:
36 |           - forcetypeassert
37 |         path: _test\.go
38 |       - path: (.+)\.go$
39 |         text: 'empty-block:'
40 |       - path: (.+)\.go$
41 |         text: 'unused-parameter:'
42 |       - path: (.+)\.go$
43 |         text: 'dot-imports:'
44 |       - path: (.+)\.go$
45 |         text: should have a package comment
46 |       - path: (.+)\.go$
47 |         text: error strings should not be capitalized or end with punctuation or a newline
48 | issues:
49 |   max-issues-per-linter: 100
50 |   max-same-issues: 100
51 | formatters:
52 |   enable:
53 |     - gofumpt
54 |     - goimports
55 | 


--------------------------------------------------------------------------------
/batch.go:
--------------------------------------------------------------------------------
 1 | package bitcask
 2 | 
 3 | type Batch struct {
 4 | 	records []*Record
 5 | 
 6 | 	// the total number of bytes of serialized records
 7 | 	byteSize int
 8 | }
 9 | 
10 | func NewBatch() *Batch {
11 | 	return &Batch{}
12 | }
13 | 
14 | func (b *Batch) Put(ns, key, val []byte, meta *Meta) {
15 | 	record := &Record{
16 | 		Ns:      ns,
17 | 		Key:     key,
18 | 		Meta:    meta,
19 | 		Value:   val,
20 | 		Deleted: false,
21 | 	}
22 | 	b.records = append(b.records, record)
23 | 	b.byteSize += record.ApproximateSize()
24 | }
25 | 
26 | func (b *Batch) Delete(ns, key []byte) {
27 | 	record := &Record{
28 | 		Ns:  ns,
29 | 		Key: key,
30 | 		// the deletion operation will carry tombstone flag, and store in database
31 | 		// at the same time, the related index in memory will be removed. so the key will
32 | 		// not be found. the record with tombstone flag will be removed in compaction
33 | 		Meta:    NewMetaWithTombstone(),
34 | 		Value:   nil,
35 | 		Deleted: true,
36 | 	}
37 | 	b.records = append(b.records, record)
38 | 	b.byteSize += record.ApproximateSize()
39 | }
40 | 
41 | func (b *Batch) Clear() {
42 | 	b.byteSize = 0
43 | 	b.records = nil
44 | }
45 | 
46 | func (b *Batch) Append(batch *Batch) {
47 | 	if batch == nil {
48 | 		return
49 | 	}
50 | 
51 | 	b.records = append(b.records, batch.records...)
52 | 	b.byteSize += batch.byteSize
53 | }
54 | 
55 | func (b *Batch) Size() int {
56 | 	return len(b.records)
57 | }
58 | 
59 | func (b *Batch) ByteSize() int {
60 | 	return b.byteSize
61 | }
62 | 


--------------------------------------------------------------------------------
/wal_iterator_test.go:
--------------------------------------------------------------------------------
 1 | package bitcask
 2 | 
 3 | import (
 4 | 	"errors"
 5 | 	"strconv"
 6 | 	"testing"
 7 | 
 8 | 	"github.com/stretchr/testify/assert"
 9 | )
10 | 
11 | func TestWalIterator_Basic(t *testing.T) {
12 | 	wal := setupWal("test_wal_it_basic.wal", t)
13 | 	defer wal.Unref()
14 | 
15 | 	for i := 0; i < 1000; i++ {
16 | 		data := []byte(strconv.Itoa(i))
17 | 		_, err := wal.WriteRecord(data)
18 | 		assert.Nil(t, err)
19 | 	}
20 | 
21 | 	wal.Flush()
22 | 
23 | 	// iterate
24 | 	it := NewWalIterator(wal)
25 | 	defer it.Close()
26 | 
27 | 	itNum := 0
28 | 	for {
29 | 		_, readData, err := it.Next()
30 | 		if errors.Is(err, ErrWalIteratorEOF) {
31 | 			break
32 | 		}
33 | 		assert.Nil(t, err)
34 | 		assert.Equal(t, readData, []byte(strconv.Itoa(itNum)))
35 | 
36 | 		itNum++
37 | 	}
38 | 
39 | 	assert.Equal(t, itNum, 1000)
40 | }
41 | 
42 | func TestWalIterator_LargeData(t *testing.T) {
43 | 	wal := setupWal("test_wal_it_large_data.wal", t)
44 | 	defer wal.Unref()
45 | 
46 | 	data5KB := GenNKBytes(5)
47 | 
48 | 	// total 5MB = 4KB * 1024
49 | 	for i := 0; i < 1024; i++ {
50 | 		_, err := wal.WriteRecord(data5KB)
51 | 		assert.Nil(t, err)
52 | 		assert.Nil(t, wal.Flush())
53 | 	}
54 | 
55 | 	// iterate
56 | 	it := NewWalIterator(wal)
57 | 	defer it.Close()
58 | 
59 | 	itNum := 0
60 | 	var err error
61 | 	var readData []byte
62 | 	for {
63 | 		_, readData, err = it.Next()
64 | 		if err != nil {
65 | 			break
66 | 		}
67 | 		assert.Equal(t, data5KB, readData)
68 | 
69 | 		itNum++
70 | 	}
71 | 
72 | 	assert.True(t, errors.Is(err, ErrWalIteratorEOF))
73 | 	assert.Equal(t, 1024, itNum)
74 | }
75 | 


--------------------------------------------------------------------------------
/db_test.go:
--------------------------------------------------------------------------------
 1 | package bitcask
 2 | 
 3 | import (
 4 | 	"testing"
 5 | 
 6 | 	"github.com/stretchr/testify/assert"
 7 | )
 8 | 
 9 | func TestDB_ParseFilename(t *testing.T) {
10 | 	hintName := "00001.hint"
11 | 	mergeName := "00002.merge"
12 | 	walName := "00003.wal"
13 | 	manifestName := "MANIFEST-000004"
14 | 	tmpName := "00005.tmp"
15 | 	lockName := "LOCK"
16 | 	currentName := "CURRENT"
17 | 	unknownName := "test"
18 | 
19 | 	ft, fid, err := ParseFilename(hintName)
20 | 	assert.Nil(t, err)
21 | 	assert.Equal(t, ft, HintFileType)
22 | 	assert.Equal(t, fid, uint64(1))
23 | 
24 | 	ft, fid, err = ParseFilename(mergeName)
25 | 	assert.Nil(t, err)
26 | 	assert.Equal(t, ft, MergeFileType)
27 | 	assert.Equal(t, fid, uint64(2))
28 | 
29 | 	ft, fid, err = ParseFilename(walName)
30 | 	assert.Nil(t, err)
31 | 	assert.Equal(t, ft, WalFileType)
32 | 	assert.Equal(t, fid, uint64(3))
33 | 
34 | 	ft, fid, err = ParseFilename(manifestName)
35 | 	assert.Nil(t, err)
36 | 	assert.Equal(t, ft, ManifestFileType)
37 | 	assert.Equal(t, fid, uint64(4))
38 | 
39 | 	ft, fid, err = ParseFilename(tmpName)
40 | 	assert.Nil(t, err)
41 | 	assert.Equal(t, ft, TmpFileType)
42 | 	assert.Equal(t, fid, uint64(5))
43 | 
44 | 	ft, fid, err = ParseFilename(lockName)
45 | 	assert.Nil(t, err)
46 | 	assert.Equal(t, ft, LockFileType)
47 | 	assert.Equal(t, fid, uint64(0))
48 | 
49 | 	ft, fid, err = ParseFilename(currentName)
50 | 	assert.Nil(t, err)
51 | 	assert.Equal(t, ft, CurrentFileType)
52 | 	assert.Equal(t, fid, uint64(0))
53 | 
54 | 	ft, fid, err = ParseFilename(unknownName)
55 | 	assert.Nil(t, err)
56 | 	assert.Equal(t, ft, UnknownFileType)
57 | 	assert.Equal(t, fid, uint64(0))
58 | }
59 | 


--------------------------------------------------------------------------------
/block_reader/interface.go:
--------------------------------------------------------------------------------
 1 | package block_reader
 2 | 
 3 | import (
 4 | 	"errors"
 5 | 	"golang.org/x/sys/unix"
 6 | 	"slices"
 7 | )
 8 | 
 9 | var ErrBlockReaderRequestFailed = errors.New("block reader request failed")
10 | 
11 | type Request struct {
12 | 	Fd  int
13 | 	Fid uint64
14 | 	Off uint64
15 | 	Blk []byte
16 | 
17 | 	res int
18 | 	err error
19 | }
20 | 
21 | func (r *Request) Err() error {
22 | 	if r.err != nil {
23 | 		return r.err
24 | 	}
25 | 
26 | 	if r.res < 0 {
27 | 		return errors.Join(ErrBlockReaderRequestFailed, unix.Errno(-r.res))
28 | 	}
29 | 	return nil
30 | }
31 | 
32 | func (r *Request) NBytes() int {
33 | 	return r.res
34 | }
35 | 
36 | type Requests []*Request
37 | 
38 | func (r Requests) Sort() {
39 | 	slices.SortFunc(r, func(a, b *Request) int {
40 | 		if a.Fid < b.Fid {
41 | 			return -1
42 | 		} else if a.Fid > b.Fid {
43 | 			return 1
44 | 		}
45 | 
46 | 		if a.Off < b.Off {
47 | 			return -1
48 | 		} else if a.Off > b.Off {
49 | 			return 1
50 | 		}
51 | 
52 | 		return 0
53 | 	})
54 | }
55 | 
56 | func (r Requests) BinarySearch(fid, off uint64) (*Request, bool) {
57 | 	idx, found := slices.BinarySearchFunc(r, &Request{Fid: fid, Off: off}, func(a, b *Request) int {
58 | 		if a.Fid < b.Fid {
59 | 			return -1
60 | 		} else if a.Fid > b.Fid {
61 | 			return 1
62 | 		}
63 | 
64 | 		if a.Off < b.Off {
65 | 			return -1
66 | 		} else if a.Off > b.Off {
67 | 			return 1
68 | 		}
69 | 
70 | 		return 0
71 | 	})
72 | 
73 | 	if !found {
74 | 		return nil, false
75 | 	}
76 | 
77 | 	return r[idx], true
78 | }
79 | 
80 | // thread safe
81 | type BlockReader interface {
82 | 	NewRequest(fd int, fid, offset uint64, blk []byte) *Request
83 | 	Submit(reqs Requests) error
84 | }
85 | 


--------------------------------------------------------------------------------
/meta.go:
--------------------------------------------------------------------------------
 1 | package bitcask
 2 | 
 3 | const (
 4 | 	TombstoneFlagBit = 0
 5 | 	MetaNoExpire     = 0
 6 | )
 7 | 
 8 | type Meta struct {
 9 | 	// user specified meta data
10 | 	AppMeta     map[string]string
11 | 	AppMetaSize int
12 | 
13 | 	// control meta data
14 | 	Expire uint64
15 | 	Etag   []byte
16 | 
17 | 	// bitmap flag
18 | 	Flags uint8
19 | }
20 | 
21 | func NewMeta(appMeta map[string]string) *Meta {
22 | 	meta := &Meta{
23 | 		AppMeta:     nil,
24 | 		AppMetaSize: 0,
25 | 		Expire:      MetaNoExpire,
26 | 		Etag:        nil,
27 | 		Flags:       0,
28 | 	}
29 | 
30 | 	return meta.SetAppMeta(appMeta)
31 | }
32 | 
33 | func NewMetaWithTombstone() *Meta {
34 | 	meta := NewMeta(nil)
35 | 	return meta.SetTombstone(true)
36 | }
37 | 
38 | func (m *Meta) SetAppMeta(appMeta map[string]string) *Meta {
39 | 	// FIXME: insufficient
40 | 	size := 0
41 | 	for k, v := range appMeta {
42 | 		size += len(k) + len(v)
43 | 	}
44 | 
45 | 	m.AppMeta = appMeta
46 | 	m.AppMetaSize = size
47 | 
48 | 	return m
49 | }
50 | 
51 | func (m *Meta) SetExpire(expire uint64) *Meta {
52 | 	m.Expire = expire
53 | 	return m
54 | }
55 | 
56 | func (m *Meta) SetEtag(etag []byte) *Meta {
57 | 	m.Etag = etag
58 | 	return m
59 | }
60 | 
61 | func (m *Meta) SetTombstone(enable bool) *Meta {
62 | 	if enable {
63 | 		m.Flags |= (uint8)(1 << TombstoneFlagBit)
64 | 	} else {
65 | 		m.Flags &= ^(uint8)(1 << TombstoneFlagBit)
66 | 	}
67 | 	return m
68 | }
69 | 
70 | func (m *Meta) AppMetadata() map[string]string {
71 | 	return m.AppMeta
72 | }
73 | 
74 | func (m *Meta) IsTombstone() bool {
75 | 	return m.Flags&(1<<TombstoneFlagBit) != 0
76 | }
77 | 
78 | func (m *Meta) AppMetaApproximateSize() int {
79 | 	return m.AppMetaSize
80 | }
81 | 


--------------------------------------------------------------------------------
/manifest_edit_test.go:
--------------------------------------------------------------------------------
 1 | package bitcask
 2 | 
 3 | import (
 4 | 	"testing"
 5 | 
 6 | 	"github.com/stretchr/testify/assert"
 7 | )
 8 | 
 9 | func TestManifestEdit_EncodeAndDecode(t *testing.T) {
10 | 	edit := &ManifestEdit{
11 | 		addFiles:    []LogFile{{fid: 5}},
12 | 		deleteFiles: []LogFile{{fid: 3}, {fid: 4}},
13 | 		hasNextFid:  true,
14 | 		nextFid:     6,
15 | 		freeBytes:   make(map[uint64]uint64),
16 | 	}
17 | 
18 | 	edit.freeBytes[2] = 567
19 | 
20 | 	bytes := edit.Encode()
21 | 
22 | 	decodeEdit := &ManifestEdit{}
23 | 	err := decodeEdit.DecodeFrom(bytes)
24 | 	assert.Nil(t, err)
25 | 	assert.Equal(t, edit, decodeEdit)
26 | }
27 | 
28 | func TestManifestEdit_DecodeCorruptData(t *testing.T) {
29 | 	edit := &ManifestEdit{}
30 | 	err := edit.DecodeFrom([]byte{255, 255, 255})
31 | 	assert.NotNil(t, err)
32 | }
33 | 
34 | func TestManifestEdit_Merge(t *testing.T) {
35 | 	edit1 := &ManifestEdit{
36 | 		addFiles:    []LogFile{{fid: 5}},
37 | 		deleteFiles: []LogFile{{fid: 3}, {fid: 4}},
38 | 		hasNextFid:  true,
39 | 		nextFid:     6,
40 | 		freeBytes:   make(map[uint64]uint64),
41 | 	}
42 | 
43 | 	edit1.freeBytes[2] = 123
44 | 
45 | 	edit2 := &ManifestEdit{
46 | 		addFiles:    []LogFile{{fid: 6}, {fid: 7}},
47 | 		deleteFiles: []LogFile{{fid: 5}},
48 | 		hasNextFid:  true,
49 | 		nextFid:     8,
50 | 		freeBytes:   make(map[uint64]uint64),
51 | 	}
52 | 
53 | 	edit2.freeBytes[2] = 123
54 | 
55 | 	edit1.Merge(edit2)
56 | 
57 | 	// check
58 | 	assert.Equal(t, edit1.addFiles, []LogFile{{fid: 5}, {fid: 6}, {fid: 7}})
59 | 	assert.Equal(t, edit1.deleteFiles, []LogFile{{fid: 3}, {fid: 4}, {fid: 5}})
60 | 	assert.Equal(t, edit1.nextFid, uint64(8))
61 | 	assert.Equal(t, edit1.freeBytes[2], uint64(246))
62 | 	assert.True(t, edit1.hasNextFid)
63 | }
64 | 


--------------------------------------------------------------------------------
/block_reader/iouring.go:
--------------------------------------------------------------------------------
 1 | package block_reader
 2 | 
 3 | import (
 4 | 	"errors"
 5 | 	"sync"
 6 | 
 7 | 	"github.com/iceber/iouring-go"
 8 | )
 9 | 
10 | var ErrBlockReaderConcurrent = errors.New("concurrent too large")
11 | 
12 | type IOUringBlockReader struct {
13 | 	reqPool    sync.Pool
14 | 	iour       *iouring.IOURing
15 | 	concurrent uint64
16 | }
17 | 
18 | func NewIOUringBlockReader(concurrent uint64) (*IOUringBlockReader, error) {
19 | 	iour, err := iouring.New(uint(concurrent))
20 | 	if err != nil {
21 | 		return nil, err
22 | 	}
23 | 
24 | 	return &IOUringBlockReader{
25 | 		reqPool: sync.Pool{
26 | 			New: func() any {
27 | 				b := make([]iouring.PrepRequest, concurrent)
28 | 				return &b
29 | 			},
30 | 		},
31 | 		iour:       iour,
32 | 		concurrent: concurrent,
33 | 	}, nil
34 | }
35 | 
36 | func (r *IOUringBlockReader) NewRequest(fd int, fid, offset uint64, blk []byte) *Request {
37 | 	return &Request{
38 | 		Fd:  fd,
39 | 		Fid: fid,
40 | 		Off: offset,
41 | 		Blk: blk,
42 | 		res: 0,
43 | 		err: nil,
44 | 	}
45 | }
46 | 
47 | func (r *IOUringBlockReader) Submit(reqs Requests) error {
48 | 	if len(reqs) > int(r.concurrent) {
49 | 		return ErrBlockReaderConcurrent
50 | 	}
51 | 
52 | 	prepReqsPtr := r.reqPool.Get().(*[]iouring.PrepRequest)
53 | 	defer r.reqPool.Put(prepReqsPtr)
54 | 
55 | 	prepReqs := *prepReqsPtr
56 | 
57 | 	for idx := range reqs {
58 | 		prepReqs[idx] = iouring.Pread(reqs[idx].Fd, reqs[idx].Blk, reqs[idx].Off)
59 | 	}
60 | 
61 | 	rset, err := r.iour.SubmitRequests(prepReqs[:len(reqs)], nil)
62 | 	if err != nil {
63 | 		return err
64 | 	}
65 | 
66 | 	// wait for completion
67 | 	<-rset.Done()
68 | 
69 | 	// the order of io_uring Request is the same to `reqs` arguments
70 | 	for idx, req := range rset.Requests() {
71 | 		res, _ := req.GetRes()
72 | 		reqs[idx].res = res
73 | 	}
74 | 
75 | 	return nil
76 | }
77 | 


--------------------------------------------------------------------------------
/bench/benchmark1:
--------------------------------------------------------------------------------
 1 | go test -bench=PutGet -benchtime=60s -count=3 -timeout=50m
 2 | goos: linux
 3 | goarch: amd64
 4 | pkg: github.com/wenzhang-dev/bitcaskDB/bench
 5 | cpu: Intel(R) Xeon(R) Gold 5318N CPU @ 2.10GHz
 6 | BenchmarkPutGet/put4K-8                  5331782   25259 ns/op   11795 B/op   21 allocs/op
 7 | BenchmarkPutGet/put4K-8                  5130870   25417 ns/op   11767 B/op   21 allocs/op
 8 | BenchmarkPutGet/put4K-8                  4898403   26676 ns/op   11742 B/op   21 allocs/op
 9 | BenchmarkPutGet/batchPut4K-8            10548615   15340 ns/op    1695 B/op   11 allocs/op
10 | BenchmarkPutGet/batchPut4K-8             9220388   14278 ns/op    1694 B/op   11 allocs/op
11 | BenchmarkPutGet/batchPut4K-8            10363459   15019 ns/op    1686 B/op   11 allocs/op
12 | BenchmarkPutGet/get4K-8                  8812342    8076 ns/op   10119 B/op   10 allocs/op
13 | BenchmarkPutGet/get4K-8                  7963098    7952 ns/op   10119 B/op   10 allocs/op
14 | BenchmarkPutGet/get4K-8                  8480240    7997 ns/op   10119 B/op   10 allocs/op
15 | BenchmarkPutGet/concurrentGet4K-8       17233309    4427 ns/op   10044 B/op    7 allocs/op
16 | BenchmarkPutGet/concurrentGet4K-8       26745726    3681 ns/op   10044 B/op    7 allocs/op
17 | BenchmarkPutGet/concurrentGet4K-8       29305041    3654 ns/op   10044 B/op    7 allocs/op
18 | BenchmarkPutGet/concurrentPut4K-8        4558645   19829 ns/op    8340 B/op   18 allocs/op
19 | BenchmarkPutGet/concurrentPut4K-8        4433334   18664 ns/op   10031 B/op   18 allocs/op
20 | BenchmarkPutGet/concurrentPut4K-8        4366149   17031 ns/op    8175 B/op   17 allocs/op
21 | BenchmarkPutGet/concurrentBatchPut4K-8   9443377   12520 ns/op    1527 B/op    9 allocs/op
22 | BenchmarkPutGet/concurrentBatchPut4K-8  11338162   12429 ns/op    1517 B/op    9 allocs/op
23 | BenchmarkPutGet/concurrentBatchPut4K-8  11394081   12101 ns/op    1510 B/op    9 allocs/op
24 | PASS
25 | ok   github.com/wenzhang-dev/bitcaskDB/bench 2310.401s
26 | 


--------------------------------------------------------------------------------
/bench/bench_compaction_test.go:
--------------------------------------------------------------------------------
 1 | package bench
 2 | 
 3 | import (
 4 | 	"fmt"
 5 | 	"os"
 6 | 	"testing"
 7 | 
 8 | 	"github.com/stretchr/testify/assert"
 9 | 	"github.com/wenzhang-dev/bitcaskDB"
10 | )
11 | 
12 | func newCompactionDB(b *testing.B) {
13 | 	dir = "./bitcaskDB"
14 | 	_ = os.RemoveAll(dir)
15 | 	_ = os.MkdirAll(dir, os.ModePerm)
16 | 
17 | 	opts := &bitcask.Options{
18 | 		Dir:                       dir,
19 | 		WalMaxSize:                1024 * 1024 * 1024, // 1GB
20 | 		ManifestMaxSize:           10 * 1024 * 1024,   // 10MB
21 | 		IndexCapacity:             10000000,           // 10 million
22 | 		IndexLimited:              8000000,
23 | 		IndexEvictionPoolCapacity: 64,
24 | 		IndexSampleKeys:           5,
25 | 		CompactionPicker:          nil, // default picker
26 | 		CompactionFilter:          nil, // default filter
27 | 		NsSize:                    0,
28 | 		EtagSize:                  0,
29 | 		DisableCompaction:         false,
30 | 		CompactionTriggerInterval: 10, // 10 seconds
31 | 	}
32 | 
33 | 	var err error
34 | 	db, err = bitcask.NewDB(opts)
35 | 	assert.Nil(b, err)
36 | }
37 | 
38 | func BenchmarkCompaction(b *testing.B) {
39 | 	newCompactionDB(b)
40 | 	defer db.Close()
41 | 
42 | 	b.Run("compaction", func(b *testing.B) {
43 | 		benchmarkCompaction(b, db)
44 | 	})
45 | }
46 | 
47 | func benchmarkCompaction(b *testing.B, db bitcask.DB) {
48 | 	threshold := 10000000
49 | 	meta := bitcask.NewMeta(nil)
50 | 	value4KB := bitcask.GenNKBytes(4)
51 | 	opts := &bitcask.WriteOptions{}
52 | 	batchSize := 50
53 | 
54 | 	newKey := func(hint, threshold int) []byte {
55 | 		hint %= threshold
56 | 		key := fmt.Sprintf("key=%10d,%10d", hint, hint) // 25 bytes
57 | 		return []byte(key)
58 | 	}
59 | 
60 | 	// repeat write 10 million keys
61 | 	b.RunParallel(func(pb *testing.PB) {
62 | 		iteration := 1
63 | 		batch := bitcask.NewBatch()
64 | 		for pb.Next() {
65 | 			batch.Put(nil, newKey(iteration, threshold), value4KB, meta)
66 | 
67 | 			if iteration%batchSize == 0 {
68 | 				err := db.Write(batch, opts)
69 | 				assert.Nil(b, err)
70 | 				batch.Clear()
71 | 			}
72 | 
73 | 			iteration++
74 | 		}
75 | 	})
76 | }
77 | 


--------------------------------------------------------------------------------
/hint_test.go:
--------------------------------------------------------------------------------
 1 | package bitcask
 2 | 
 3 | import (
 4 | 	"os"
 5 | 	"strconv"
 6 | 	"testing"
 7 | 	"time"
 8 | 
 9 | 	"github.com/stretchr/testify/assert"
10 | )
11 | 
12 | func TestHintEncodeAndDecode(t *testing.T) {
13 | 	ns := sha1Bytes("namespace")
14 | 	hintRecord := &HintRecord{
15 | 		ns:   ns[:],
16 | 		key:  []byte("test-key"),
17 | 		fid:  2,
18 | 		off:  123,
19 | 		size: 100,
20 | 	}
21 | 
22 | 	bytes, err := hintRecord.Encode()
23 | 	assert.Nil(t, err)
24 | 
25 | 	decodeRecord := &HintRecord{}
26 | 	err = decodeRecord.Decode(bytes)
27 | 	assert.Nil(t, err)
28 | 
29 | 	assert.Equal(t, hintRecord, decodeRecord)
30 | }
31 | 
32 | func TestHint_NewHintByWal(t *testing.T) {
33 | 	wal := setupWal("new_hint_by_wal", t)
34 | 	defer wal.Unref()
35 | 
36 | 	ns1 := sha1Bytes("namespace")
37 | 	backStore := make([]byte, DefaultRecordBufferSize)
38 | 	baseTime := uint64(time.Now().Unix())
39 | 	record := &Record{
40 | 		Ns:    ns1[:],
41 | 		Key:   []byte("test-key"),
42 | 		Meta:  NewMeta(nil),
43 | 		Value: []byte("hello world"),
44 | 	}
45 | 
46 | 	for i := 0; i < 1000; i++ {
47 | 		key := []byte("test-key" + strconv.Itoa(i))
48 | 		record.Key = key
49 | 
50 | 		bytes, err := record.Encode(backStore, baseTime)
51 | 		assert.Nil(t, err)
52 | 
53 | 		_, err = wal.WriteRecord(bytes)
54 | 		assert.Nil(t, err)
55 | 	}
56 | 
57 | 	wal.Flush()
58 | 
59 | 	// test hint
60 | 	fileSize, err := NewHintByWal(wal)
61 | 	assert.Nil(t, err)
62 | 	assert.True(t, fileSize > 0)
63 | 
64 | 	hintPath := HintPath(wal.Dir(), wal.Fid())
65 | 	hint, err := LoadWal(hintPath, wal.Fid())
66 | 	assert.Nil(t, err)
67 | 	defer hint.Close()
68 | 	defer os.Remove(hintPath)
69 | 
70 | 	itNum := 0
71 | 	err = IterateHint(hint, func(hintRecord *HintRecord) error {
72 | 		assert.Equal(t, hintRecord.ns, ns1[:])
73 | 		assert.Equal(t, hintRecord.key, []byte("test-key"+strconv.Itoa(itNum)))
74 | 
75 | 		recordBytes, err := wal.ReadRecord(hintRecord.off, hintRecord.size, true)
76 | 		assert.Nil(t, err)
77 | 
78 | 		readRecord, err := RecordFromBytes(recordBytes, wal.BaseTime())
79 | 		assert.Nil(t, err)
80 | 		assert.Equal(t, readRecord.Ns, hintRecord.ns)
81 | 		assert.Equal(t, readRecord.Key, hintRecord.key)
82 | 		assert.Equal(t, readRecord.Value, record.Value)
83 | 
84 | 		itNum++
85 | 		return nil
86 | 	})
87 | 	assert.Nil(t, err)
88 | 	assert.Equal(t, itNum, 1000)
89 | }
90 | 


--------------------------------------------------------------------------------
/bench/gc_overhead_test.go:
--------------------------------------------------------------------------------
 1 | package bench
 2 | 
 3 | import (
 4 | 	"fmt"
 5 | 	"os"
 6 | 	"runtime"
 7 | 	"runtime/debug"
 8 | 	"sync/atomic"
 9 | 	"testing"
10 | 	"time"
11 | 
12 | 	"github.com/stretchr/testify/assert"
13 | 	"github.com/wenzhang-dev/bitcaskDB"
14 | )
15 | 
16 | func newGcOverheadDB(b *testing.B) {
17 | 	dir = "./bitcaskDB"
18 | 	_ = os.RemoveAll(dir)
19 | 	_ = os.MkdirAll(dir, os.ModePerm)
20 | 
21 | 	opts := &bitcask.Options{
22 | 		Dir:                       dir,
23 | 		WalMaxSize:                1024 * 1024 * 1024, // 1GB
24 | 		ManifestMaxSize:           10 * 1024 * 1024,   // 10MB
25 | 		IndexCapacity:             10000000,           // 10 million
26 | 		IndexLimited:              8000000,
27 | 		IndexEvictionPoolCapacity: 64,
28 | 		IndexSampleKeys:           5,
29 | 		CompactionPicker:          nil, // default picker
30 | 		CompactionFilter:          nil, // default filter
31 | 		NsSize:                    bitcask.DefaultNsSize,
32 | 		EtagSize:                  bitcask.DefaultEtagSize,
33 | 		DisableCompaction:         false,
34 | 		DiskUsageLimited:          10 * 1024 * 1024 * 1024, // 10GB
35 | 	}
36 | 
37 | 	var err error
38 | 	db, err = bitcask.NewDB(opts)
39 | 	assert.Nil(b, err)
40 | }
41 | 
42 | var previousPause time.Duration
43 | 
44 | func gcPause() time.Duration {
45 | 	runtime.GC()
46 | 
47 | 	var stats debug.GCStats
48 | 	debug.ReadGCStats(&stats)
49 | 
50 | 	pause := stats.PauseTotal - previousPause
51 | 	previousPause = stats.PauseTotal
52 | 
53 | 	return pause
54 | }
55 | 
56 | func BenchmarkGcOverhead(b *testing.B) {
57 | 	newGcOverheadDB(b)
58 | 	defer db.Close()
59 | 
60 | 	meta := bitcask.NewMeta(nil)
61 | 	opts := &bitcask.WriteOptions{}
62 | 
63 | 	b.ResetTimer()
64 | 	b.ReportAllocs()
65 | 
66 | 	startTime := time.Now()
67 | 	fmt.Printf("GC pause for startup: gc=%s\n", gcPause())
68 | 
69 | 	var totalIteration int64
70 | 	b.RunParallel(func(pb *testing.PB) {
71 | 		var err error
72 | 		iteration := 0
73 | 		batch := bitcask.NewBatch()
74 | 		for pb.Next() {
75 | 			batch.Put(ns[:], genTestKey(iteration), bin4KB, meta)
76 | 
77 | 			if iteration%BatchSize == 0 {
78 | 				err = db.Write(batch, opts)
79 | 				assert.Nil(b, err)
80 | 				batch.Clear()
81 | 			}
82 | 
83 | 			iteration++
84 | 		}
85 | 
86 | 		atomic.AddInt64(&totalIteration, int64(iteration))
87 | 	})
88 | 
89 | 	diff := time.Since(startTime)
90 | 	fmt.Printf("GC pause for test: total=%s, gc=%s, iter=%d\n", diff, gcPause(), totalIteration)
91 | }
92 | 


--------------------------------------------------------------------------------
/wal_iterator.go:
--------------------------------------------------------------------------------
  1 | package bitcask
  2 | 
  3 | import (
  4 | 	"encoding/binary"
  5 | )
  6 | 
  7 | type WalIterator struct {
  8 | 	wal     *Wal
  9 | 	fileOff int
 10 | 
 11 | 	bufOff  int
 12 | 	bufSize int
 13 | 	buf     []byte
 14 | 
 15 | 	err error
 16 | }
 17 | 
 18 | func NewWalIterator(wal *Wal) *WalIterator {
 19 | 	wal.Ref()
 20 | 
 21 | 	return &WalIterator{
 22 | 		wal:     wal,
 23 | 		fileOff: int(wal.offset),
 24 | 		bufOff:  0,
 25 | 		bufSize: 0,
 26 | 		buf:     make([]byte, BlockSize),
 27 | 	}
 28 | }
 29 | 
 30 | func (i *WalIterator) Close() {
 31 | 	i.wal.Unref()
 32 | }
 33 | 
 34 | func (i *WalIterator) fd() int {
 35 | 	return int(i.wal.fp.Fd())
 36 | }
 37 | 
 38 | // try to read a block unless there is less than one block left
 39 | // the Next method will return the start offset of data in wal file, and the data itself
 40 | func (i *WalIterator) Next() (uint64, []byte, error) {
 41 | 	var off uint64
 42 | 	var record []byte
 43 | 
 44 | 	for i.err == nil {
 45 | 		if i.bufOff+RecordHeaderSize > i.bufSize {
 46 | 			i.fileOff += i.bufSize
 47 | 
 48 | 			// skip the padding
 49 | 			i.bufSize = min(BlockSize, int(i.wal.Size())-i.fileOff)
 50 | 			if i.bufSize == 0 {
 51 | 				i.err = ErrWalIteratorEOF
 52 | 				return 0, nil, i.err
 53 | 			}
 54 | 
 55 | 			if i.err = PreadFull(i.fd(), i.buf[:i.bufSize], int64(i.fileOff)); i.err != nil {
 56 | 				return 0, nil, i.err
 57 | 			}
 58 | 
 59 | 			i.bufOff = 0
 60 | 		}
 61 | 
 62 | 		header := i.buf[i.bufOff : i.bufOff+RecordHeaderSize]
 63 | 		i.bufOff += RecordHeaderSize
 64 | 
 65 | 		crc := binary.LittleEndian.Uint32(header[0:])
 66 | 		length := int(binary.LittleEndian.Uint16(header[4:]))
 67 | 		recordType := header[6]
 68 | 
 69 | 		// record the file offset
 70 | 		if len(record) == 0 {
 71 | 			off = uint64(i.fileOff + i.bufOff)
 72 | 		}
 73 | 
 74 | 		// avoid the corrupted data
 75 | 		length = min(length, i.bufSize-i.bufOff)
 76 | 		data := i.buf[i.bufOff : i.bufOff+length]
 77 | 		i.bufOff += length
 78 | 
 79 | 		if ComputeCRC32(data) != crc {
 80 | 			i.err = ErrWalMismatchCRC
 81 | 			return 0, nil, i.err
 82 | 		}
 83 | 
 84 | 		switch recordType {
 85 | 		case RecordFull:
 86 | 			// reference the backing store of slice
 87 | 			return off, data, nil
 88 | 		case RecordFirst, RecordMiddle:
 89 | 			// Continue reading next chunk
 90 | 			record = append(record, data...)
 91 | 		case RecordLast:
 92 | 			record = append(record, data...)
 93 | 			return off, record, nil
 94 | 		default:
 95 | 			i.err = ErrWalUnknownRecordType
 96 | 		}
 97 | 	}
 98 | 
 99 | 	return 0, nil, i.err
100 | }
101 | 
102 | // the functionality is same to Next, except that the offset does not contain wal header
103 | // it's useful for the hint generation
104 | func (i *WalIterator) NextWithoutHeaderOffset() (uint64, []byte, error) {
105 | 	off, data, err := i.Next()
106 | 	if err != nil {
107 | 		return 0, nil, err
108 | 	}
109 | 
110 | 	off -= RecordHeaderSize
111 | 	return off, data, nil
112 | }
113 | 


--------------------------------------------------------------------------------
/manifest_txn.go:
--------------------------------------------------------------------------------
  1 | package bitcask
  2 | 
  3 | import (
  4 | 	"errors"
  5 | 	"sync"
  6 | )
  7 | 
  8 | var (
  9 | 	ErrAbortedManifestTxn   = errors.New("aborted manifest txn")
 10 | 	ErrCommittedManfiestTxn = errors.New("committed manifest txn")
 11 | )
 12 | 
 13 | // this is a rough implementation of manifest transaction. its main purpose is to make the applied
 14 | // manifest edit visible to other operations. the pending edit of transaction will be persisted only
 15 | // after the transaction is committed.
 16 | //
 17 | // the design is rough because the deleted wals in the applied manifest edits may also be visible to
 18 | // others. also, only one running manifest transaction is supported currently.
 19 | //
 20 | // nevertheless, this design also works well, and avoid transitional design
 21 | type ManifestTxn struct {
 22 | 	manifest    *Manifest
 23 | 	pendingEdit *ManifestEdit
 24 | 
 25 | 	committed bool
 26 | 	aborted   bool
 27 | 
 28 | 	mu sync.RWMutex
 29 | }
 30 | 
 31 | func NewManifestTxn(manifest *Manifest) *ManifestTxn {
 32 | 	return &ManifestTxn{
 33 | 		aborted:     false,
 34 | 		committed:   false,
 35 | 		manifest:    manifest,
 36 | 		pendingEdit: NewManifestEdit(),
 37 | 	}
 38 | }
 39 | 
 40 | func (txn *ManifestTxn) IsDone() bool {
 41 | 	txn.mu.RLock()
 42 | 	defer txn.mu.RUnlock()
 43 | 
 44 | 	return txn.aborted || txn.committed
 45 | }
 46 | 
 47 | func (txn *ManifestTxn) toWalLocked(fid uint64) *Wal {
 48 | 	for idx := range txn.pendingEdit.addFiles {
 49 | 		if txn.pendingEdit.addFiles[idx].fid == fid {
 50 | 			return txn.pendingEdit.addFiles[idx].wal
 51 | 		}
 52 | 	}
 53 | 
 54 | 	return nil
 55 | }
 56 | 
 57 | func (txn *ManifestTxn) ToWal(fid uint64) *Wal {
 58 | 	txn.mu.RLock()
 59 | 	defer txn.mu.RUnlock()
 60 | 
 61 | 	return txn.toWalLocked(fid)
 62 | }
 63 | 
 64 | func (txn *ManifestTxn) ToWalWithRef(fid uint64) *Wal {
 65 | 	txn.mu.RLock()
 66 | 	defer txn.mu.RUnlock()
 67 | 
 68 | 	if wal := txn.toWalLocked(fid); wal != nil {
 69 | 		wal.Ref()
 70 | 		return wal
 71 | 	}
 72 | 
 73 | 	return nil
 74 | }
 75 | 
 76 | func (txn *ManifestTxn) NextFid() uint64 {
 77 | 	txn.mu.RLock()
 78 | 	defer txn.mu.RUnlock()
 79 | 
 80 | 	return txn.pendingEdit.nextFid
 81 | }
 82 | 
 83 | func (txn *ManifestTxn) Apply(edit *ManifestEdit) {
 84 | 	if edit == nil {
 85 | 		return
 86 | 	}
 87 | 
 88 | 	txn.mu.Lock()
 89 | 	defer txn.mu.Unlock()
 90 | 
 91 | 	txn.pendingEdit.Merge(edit)
 92 | }
 93 | 
 94 | func (txn *ManifestTxn) Commit(edit *ManifestEdit) error {
 95 | 	txn.mu.Lock()
 96 | 	defer txn.mu.Unlock()
 97 | 
 98 | 	if txn.committed {
 99 | 		return ErrCommittedManfiestTxn
100 | 	}
101 | 
102 | 	if txn.aborted {
103 | 		return ErrAbortedManifestTxn
104 | 	}
105 | 
106 | 	if edit != nil {
107 | 		txn.pendingEdit.Merge(edit)
108 | 	}
109 | 
110 | 	if err := txn.manifest.LogAndApply(txn.pendingEdit); err != nil {
111 | 		txn.aborted = true
112 | 		return err
113 | 	}
114 | 
115 | 	txn.committed = true
116 | 	return nil
117 | }
118 | 
119 | func (txn *ManifestTxn) Abort() {
120 | 	txn.mu.Lock()
121 | 	defer txn.mu.Unlock()
122 | 
123 | 	if txn.committed {
124 | 		return
125 | 	}
126 | 
127 | 	txn.aborted = true
128 | }
129 | 


--------------------------------------------------------------------------------
/index_test.go:
--------------------------------------------------------------------------------
  1 | package bitcask
  2 | 
  3 | import (
  4 | 	"errors"
  5 | 	"math/rand"
  6 | 	"strconv"
  7 | 	"testing"
  8 | 	"time"
  9 | 
 10 | 	"github.com/stretchr/testify/assert"
 11 | )
 12 | 
 13 | type mockIndexHelper struct{}
 14 | 
 15 | func (mock *mockIndexHelper) Rand(upper uint64) uint64 {
 16 | 	return uint64(rand.Int63n(int64(upper)))
 17 | }
 18 | 
 19 | func (mock *mockIndexHelper) WallTime() time.Time {
 20 | 	return time.Now()
 21 | }
 22 | 
 23 | func setupIndex(t *testing.T) *Index {
 24 | 	index, err := NewIndex(&IndexOptions{
 25 | 		Capacity:             1000,
 26 | 		Limited:              800,
 27 | 		EvictionPoolCapacity: 32,
 28 | 		SampleKeys:           5,
 29 | 		Helper:               &mockIndexHelper{},
 30 | 	})
 31 | 
 32 | 	assert.Nil(t, err)
 33 | 	return index
 34 | }
 35 | 
 36 | func TestIndexBasicOperations(t *testing.T) {
 37 | 	index := setupIndex(t)
 38 | 
 39 | 	ns1 := []byte("ns1")
 40 | 	key1, key2 := []byte("key1"), []byte("key2")
 41 | 
 42 | 	// Insert values
 43 | 	err := index.Put(ns1, key1, 1, 100, 100, nil)
 44 | 	assert.NoError(t, err)
 45 | 
 46 | 	err = index.Put(ns1, key2, 2, 200, 100, nil)
 47 | 	assert.NoError(t, err)
 48 | 
 49 | 	// Retrieve values
 50 | 	fid, off, _, err := index.Get(ns1, key1)
 51 | 	assert.NoError(t, err)
 52 | 	assert.Equal(t, uint64(1), fid)
 53 | 	assert.Equal(t, uint64(100), off)
 54 | 
 55 | 	fid, off, _, err = index.Get(ns1, key2)
 56 | 	assert.NoError(t, err)
 57 | 	assert.Equal(t, uint64(2), fid)
 58 | 	assert.Equal(t, uint64(200), off)
 59 | 
 60 | 	// Update an existing key
 61 | 	err = index.Put(ns1, key1, 3, 300, 100, nil)
 62 | 	assert.NoError(t, err)
 63 | 
 64 | 	fid, off, _, err = index.Get(ns1, key1)
 65 | 	assert.NoError(t, err)
 66 | 	assert.Equal(t, uint64(3), fid)
 67 | 	assert.Equal(t, uint64(300), off)
 68 | }
 69 | 
 70 | func TestIndexDeleteOperations(t *testing.T) {
 71 | 	index := setupIndex(t)
 72 | 
 73 | 	ns1 := []byte("ns1")
 74 | 	key1, key2 := []byte("key1"), []byte("key2")
 75 | 
 76 | 	// Insert and delete key
 77 | 	err := index.Put(ns1, key1, 1, 100, 100, nil)
 78 | 	assert.NoError(t, err)
 79 | 
 80 | 	err = index.Delete(ns1, key1, nil)
 81 | 	assert.NoError(t, err)
 82 | 
 83 | 	_, _, _, err = index.Get(ns1, key1)
 84 | 	assert.Error(t, err) // Should return error since key is deleted
 85 | 
 86 | 	// Soft delete (overwrite with invalid offset)
 87 | 	err = index.Put(ns1, key2, 2, 200, 100, nil)
 88 | 	assert.NoError(t, err)
 89 | 
 90 | 	err = index.SoftDelete(ns1, key2, nil)
 91 | 	assert.NoError(t, err)
 92 | 
 93 | 	_, _, _, err = index.Get(ns1, key2)
 94 | 	assert.NotNil(t, err)
 95 | 	assert.True(t, errors.Is(err, ErrKeySoftDeleted))
 96 | }
 97 | 
 98 | func TestIndexEviction(t *testing.T) {
 99 | 	index := setupIndex(t)
100 | 
101 | 	ns1 := []byte("ns1")
102 | 	totalFreeBytes := uint64(0)
103 | 
104 | 	// the Limited is 800, but 1 million keys has been written
105 | 	// the value size is 100 bytes, so the free bytes should equal to 100 * N
106 | 	for i := 1; i <= 1000000; i++ {
107 | 		key := []byte("key" + strconv.Itoa(i))
108 | 		stat := &WriteStat{}
109 | 		err := index.Put(ns1, key, 1, uint64(i*100), 100, stat)
110 | 		assert.Nil(t, err)
111 | 
112 | 		totalFreeBytes += stat.FreeBytes
113 | 	}
114 | 
115 | 	assert.Equal(t, totalFreeBytes, uint64(100*(1000000-800)))
116 | }
117 | 


--------------------------------------------------------------------------------
/index.go:
--------------------------------------------------------------------------------
  1 | package bitcask
  2 | 
  3 | import (
  4 | 	"bytes"
  5 | 	"sync"
  6 | 	"time"
  7 | 
  8 | 	"github.com/spaolacci/murmur3"
  9 | )
 10 | 
 11 | type IndexOperator struct {
 12 | 	helper MapOperatorBase
 13 | }
 14 | 
 15 | func (optr *IndexOperator) Hash(key *[]byte) uint64 {
 16 | 	hasher := murmur3.New64()
 17 | 	hasher.Write(*key)
 18 | 	return hasher.Sum64()
 19 | }
 20 | 
 21 | func (optr *IndexOperator) Equals(lhs, rhs *[]byte) bool {
 22 | 	return bytes.Equal(*lhs, *rhs)
 23 | }
 24 | 
 25 | func (optr *IndexOperator) Rand(upper uint64) uint64 {
 26 | 	return optr.helper.Rand(upper)
 27 | }
 28 | 
 29 | func (optr *IndexOperator) WallTime() time.Time {
 30 | 	return optr.helper.WallTime()
 31 | }
 32 | 
 33 | type IndexValue struct {
 34 | 	fid       uint64
 35 | 	valueOff  uint64
 36 | 	valueSize uint64
 37 | }
 38 | 
 39 | type Index struct {
 40 | 	ivPool sync.Pool
 41 | 	maps   *ShardMap[[]byte, IndexValue]
 42 | }
 43 | 
 44 | type IndexOptions struct {
 45 | 	Capacity             uint64
 46 | 	Limited              uint64
 47 | 	EvictionPoolCapacity uint64
 48 | 	SampleKeys           uint64
 49 | 
 50 | 	Helper MapOperatorBase
 51 | }
 52 | 
 53 | // TODO: batch optimization
 54 | func NewIndex(opts *IndexOptions) (*Index, error) {
 55 | 	mapOpts := &MapOptions{
 56 | 		Capacity:             opts.Capacity,
 57 | 		Limited:              opts.Limited,
 58 | 		EvictionPoolCapacity: opts.EvictionPoolCapacity,
 59 | 		SampleKeys:           opts.SampleKeys,
 60 | 	}
 61 | 
 62 | 	optr := &IndexOperator{
 63 | 		helper: opts.Helper,
 64 | 	}
 65 | 
 66 | 	maps, err := NewShardMap[[]byte, IndexValue](optr, mapOpts)
 67 | 	if err != nil {
 68 | 		return nil, err
 69 | 	}
 70 | 
 71 | 	return &Index{
 72 | 		maps: maps,
 73 | 		ivPool: sync.Pool{
 74 | 			New: func() any {
 75 | 				return &IndexValue{}
 76 | 			},
 77 | 		},
 78 | 	}, nil
 79 | }
 80 | 
 81 | func (i *Index) Get(ns, key []byte) (fid uint64, off uint64, sz uint64, err error) {
 82 | 	var val *IndexValue
 83 | 	mergedKey := MergedKey(ns, key)
 84 | 
 85 | 	if val, err = i.maps.Get(&mergedKey); err != nil {
 86 | 		return
 87 | 	}
 88 | 
 89 | 	fid = val.fid
 90 | 	off = val.valueOff
 91 | 	sz = val.valueSize
 92 | 
 93 | 	if off == 0 { // invalid offset
 94 | 		err = ErrKeySoftDeleted
 95 | 	}
 96 | 
 97 | 	return
 98 | }
 99 | 
100 | type WriteStat struct {
101 | 	// how much disk space is freed by this write
102 | 	FreeBytes uint64
103 | 
104 | 	// which wal is affected
105 | 	FreeWalFid uint64
106 | }
107 | 
108 | func (i *Index) Delete(ns, key []byte, stat *WriteStat) error {
109 | 	mergedKey := MergedKey(ns, key)
110 | 	old, err := i.maps.Delete(&mergedKey)
111 | 	if err != nil {
112 | 		return err
113 | 	}
114 | 
115 | 	if stat != nil && old != nil {
116 | 		stat.FreeBytes = old.valueSize
117 | 		stat.FreeWalFid = old.fid
118 | 
119 | 		i.ivPool.Put(old)
120 | 	}
121 | 
122 | 	return nil
123 | }
124 | 
125 | func (i *Index) SoftDelete(ns, key []byte, stat *WriteStat) error {
126 | 	mergedKey := MergedKey(ns, key)
127 | 	old, err := i.maps.Set(&mergedKey, &IndexValue{
128 | 		valueOff: 0, // invalid offset
129 | 	})
130 | 	if err != nil {
131 | 		return err
132 | 	}
133 | 
134 | 	if stat != nil && old != nil {
135 | 		stat.FreeBytes = old.valueSize
136 | 		stat.FreeWalFid = old.fid
137 | 
138 | 		i.ivPool.Put(old)
139 | 	}
140 | 
141 | 	return nil
142 | }
143 | 
144 | func (i *Index) Put(ns, key []byte, fid uint64, off uint64, sz uint64, stat *WriteStat) error {
145 | 	mergedKey := MergedKey(ns, key)
146 | 
147 | 	iv, _ := i.ivPool.Get().(*IndexValue)
148 | 	iv.fid = fid
149 | 	iv.valueOff = off
150 | 	iv.valueSize = sz
151 | 
152 | 	old, err := i.maps.Set(&mergedKey, iv)
153 | 	if err != nil {
154 | 		return err
155 | 	}
156 | 
157 | 	if stat != nil && old != nil {
158 | 		stat.FreeBytes = old.valueSize
159 | 		stat.FreeWalFid = old.fid
160 | 
161 | 		i.ivPool.Put(old)
162 | 	}
163 | 
164 | 	return nil
165 | }
166 | 
167 | func (i *Index) Capacity() uint64 {
168 | 	return i.maps.Capacity()
169 | }
170 | 


--------------------------------------------------------------------------------
/utils.go:
--------------------------------------------------------------------------------
  1 | package bitcask
  2 | 
  3 | import (
  4 | 	"crypto/sha1"
  5 | 	"encoding/binary"
  6 | 	"hash/crc32"
  7 | 	"io"
  8 | 	"os"
  9 | 	"path/filepath"
 10 | 	"strconv"
 11 | 	"strings"
 12 | 
 13 | 	"golang.org/x/sys/unix"
 14 | )
 15 | 
 16 | func PathExists(path string) bool {
 17 | 	_, err := os.Stat(path)
 18 | 	if err != nil && os.IsNotExist(err) {
 19 | 		return false
 20 | 	}
 21 | 	return true
 22 | }
 23 | 
 24 | func ComputeCRC32(data []byte) uint32 {
 25 | 	const castagnoliPoly = 0x82f63b78
 26 | 	table := crc32.MakeTable(castagnoliPoly)
 27 | 	checksum := crc32.Checksum(data, table)
 28 | 	return (checksum>>15 | checksum<<17) + 0xa282ead8
 29 | }
 30 | 
 31 | // pread does not modify the file pointer, so it has no effect on append write
 32 | func PreadFull(fd int, buf []byte, offset int64) error {
 33 | 	totalRead, expectRead := 0, len(buf)
 34 | 	for totalRead < expectRead {
 35 | 		// pread syscall try to read data with the specific buffer length
 36 | 		n, err := unix.Pread(fd, buf[totalRead:], offset+int64(totalRead))
 37 | 		if err != nil {
 38 | 			if err == io.EOF {
 39 | 				break
 40 | 			}
 41 | 			return err
 42 | 		}
 43 | 
 44 | 		totalRead += n
 45 | 	}
 46 | 
 47 | 	return nil
 48 | }
 49 | 
 50 | // return 0, 0 for all exceptions
 51 | func DecodeUvarint(data []byte) (uint64, int) {
 52 | 	v, size := binary.Uvarint(data)
 53 | 	if size <= 0 {
 54 | 		return 0, 0
 55 | 	}
 56 | 	return v, size
 57 | }
 58 | 
 59 | type Runners struct {
 60 | 	functors  []func()
 61 | 	committed bool
 62 | }
 63 | 
 64 | func NewRunner() *Runners {
 65 | 	return &Runners{
 66 | 		committed: true,
 67 | 	}
 68 | }
 69 | 
 70 | func NewReverseRunner() *Runners {
 71 | 	return &Runners{
 72 | 		committed: false,
 73 | 	}
 74 | }
 75 | 
 76 | func (r *Runners) Post(f func()) {
 77 | 	r.functors = append(r.functors, f)
 78 | }
 79 | 
 80 | func (r *Runners) Do() {
 81 | 	if !r.committed {
 82 | 		return
 83 | 	}
 84 | 
 85 | 	for idx := range r.functors {
 86 | 		r.functors[idx]()
 87 | 	}
 88 | }
 89 | 
 90 | func (r *Runners) Rollback() {
 91 | 	r.committed = false
 92 | }
 93 | 
 94 | func (r *Runners) Commit() {
 95 | 	r.committed = true
 96 | }
 97 | 
 98 | func ParseFilename(name string) (fileType int, fid uint64, err error) {
 99 | 	ext := filepath.Ext(name)
100 | 
101 | 	switch ext {
102 | 	case WalFileSuffix:
103 | 		fid, err := strconv.Atoi(name[:len(name)-len(ext)])
104 | 		return WalFileType, uint64(fid), err
105 | 	case HintFileSuffix:
106 | 		fid, err := strconv.Atoi(name[:len(name)-len(ext)])
107 | 		return HintFileType, uint64(fid), err
108 | 	case MergeFileSuffix:
109 | 		fid, err := strconv.Atoi(name[:len(name)-len(ext)])
110 | 		return MergeFileType, uint64(fid), err
111 | 	case TmpFileSuffix:
112 | 		fid, err := strconv.Atoi(name[:len(name)-len(ext)])
113 | 		return TmpFileType, uint64(fid), err
114 | 	}
115 | 
116 | 	if name == CurrentFile {
117 | 		return CurrentFileType, 0, nil
118 | 	}
119 | 
120 | 	if name == LockFile {
121 | 		return LockFileType, 0, nil
122 | 	}
123 | 
124 | 	if strings.HasPrefix(name, ManifestFilePrefix) {
125 | 		fid, err := strconv.Atoi(name[len(ManifestFilePrefix)+1:])
126 | 		return ManifestFileType, uint64(fid), err
127 | 	}
128 | 
129 | 	return UnknownFileType, 0, nil
130 | }
131 | 
132 | // namespace is fixed size
133 | func MergedKey(ns, key []byte) []byte {
134 | 	mergedKey := make([]byte, len(ns)+len(key))
135 | 	copy(mergedKey, ns)
136 | 	copy(mergedKey[len(ns):], key)
137 | 
138 | 	return mergedKey
139 | }
140 | 
141 | func GenSha1NS(ns string) []byte {
142 | 	hash := sha1.Sum([]byte(ns))
143 | 	return hash[:]
144 | }
145 | 
146 | func GenSha1Etag(data []byte) []byte {
147 | 	hash := sha1.Sum(data)
148 | 	return hash[:]
149 | }
150 | 
151 | func Gen1KBytes() []byte {
152 | 	buf := make([]byte, 1024)
153 | 	for i := 0; i < 128; i++ {
154 | 		copy(buf[i*8:], []byte("01234567"))
155 | 	}
156 | 	return buf
157 | }
158 | 
159 | func GenNKBytes(n int) []byte {
160 | 	bytes1KB := Gen1KBytes()
161 | 	buf := make([]byte, 1024*n)
162 | 	for i := 0; i < n; i++ {
163 | 		copy(buf[i*1024:], bytes1KB)
164 | 	}
165 | 	return buf
166 | }
167 | 
168 | func Map[T any, R any](input []T, f func(T) R) []R {
169 | 	result := make([]R, len(input))
170 | 	for i, v := range input {
171 | 		result[i] = f(v)
172 | 	}
173 | 	return result
174 | }
175 | 


--------------------------------------------------------------------------------
/go.sum:
--------------------------------------------------------------------------------
 1 | github.com/coreos/go-systemd/v22 v22.5.0/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc=
 2 | github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E=
 3 | github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
 4 | github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
 5 | github.com/godbus/dbus/v5 v5.0.4/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA=
 6 | github.com/gofrs/flock v0.12.1 h1:MTLVXXHf8ekldpJk3AKicLij9MdwOWkZ+a/jHHZby9E=
 7 | github.com/gofrs/flock v0.12.1/go.mod h1:9zxTsyu5xtJ9DK+1tFZyibEV7y3uwDxPPfbxeeHCoD0=
 8 | github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE=
 9 | github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk=
10 | github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
11 | github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
12 | github.com/mattn/go-colorable v0.1.13 h1:fFA4WZxdEF4tXPZVKMLwD8oUnCTTo08duU7wxecdEvA=
13 | github.com/mattn/go-colorable v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovkB8vQcUbaXHg=
14 | github.com/mattn/go-isatty v0.0.16/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM=
15 | github.com/mattn/go-isatty v0.0.19 h1:JITubQf0MOLdlGRuRq+jtsDlekdYPia9ZFsB8h/APPA=
16 | github.com/mattn/go-isatty v0.0.19/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
17 | github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
18 | github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
19 | github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
20 | github.com/rogpeppe/go-internal v1.14.1 h1:UQB4HGPB6osV0SQTLymcB4TgvyWu6ZyliaW0tI/otEQ=
21 | github.com/rogpeppe/go-internal v1.14.1/go.mod h1:MaRKkUm5W0goXpeCfT7UZI6fk/L7L7so1lCWt35ZSgc=
22 | github.com/royalcat/iouring-go v0.0.0-20240925200811-286062ac1b23 h1:3yOlLKYd6iSGkRUOCPuBQibjjvZyrGB/4sm0fh3nNuQ=
23 | github.com/royalcat/iouring-go v0.0.0-20240925200811-286062ac1b23/go.mod h1:LEzdaZarZ5aqROlLIwJ4P7h3+4o71008fSy6wpaEB+s=
24 | github.com/rs/xid v1.6.0/go.mod h1:7XoLgs4eV+QndskICGsho+ADou8ySMSjJKDIan90Nz0=
25 | github.com/rs/zerolog v1.34.0 h1:k43nTLIwcTVQAncfCw4KZ2VY6ukYoZaBPNOE8txlOeY=
26 | github.com/rs/zerolog v1.34.0/go.mod h1:bJsvje4Z08ROH4Nhs5iH600c3IkWhwp44iRc54W6wYQ=
27 | github.com/spaolacci/murmur3 v1.1.0 h1:7c1g84S4BPRrfL5Xrdp6fOJ206sU9y293DDHaoy0bLI=
28 | github.com/spaolacci/murmur3 v1.1.0/go.mod h1:JwIasOWyU6f++ZhiEuf87xNszmSA2myDM2Kzu9HwQUA=
29 | github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg=
30 | github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
31 | github.com/vmihailenco/msgpack/v5 v5.4.1 h1:cQriyiUvjTwOHg8QZaPihLWeRAAVoCpE00IUPn0Bjt8=
32 | github.com/vmihailenco/msgpack/v5 v5.4.1/go.mod h1:GaZTsDaehaPpQVyxrf5mtQlH+pc21PIudVV/E3rRQok=
33 | github.com/vmihailenco/tagparser/v2 v2.0.0 h1:y09buUbR+b5aycVFQs/g70pqKVZNBmxwAhO7/IwNM9g=
34 | github.com/vmihailenco/tagparser/v2 v2.0.0/go.mod h1:Wri+At7QHww0WTrCBeu4J6bNtoV6mEfg5OIWRZA9qds=
35 | golang.org/x/sys v0.0.0-20200923182605-d9f96fdee20d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
36 | golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
37 | golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
38 | golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
39 | golang.org/x/sys v0.32.0 h1:s77OFDvIQeibCmezSnk/q6iAfkdiQaJi4VzroCFrN20=
40 | golang.org/x/sys v0.32.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k=
41 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
42 | gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk=
43 | gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q=
44 | gopkg.in/natefinch/lumberjack.v2 v2.2.1 h1:bBRl1b0OH9s/DuPhuXpNl+VtCaJXFZ5/uEFST95x9zc=
45 | gopkg.in/natefinch/lumberjack.v2 v2.2.1/go.mod h1:YD8tP3GAjkrDg1eZH7EGmyESg/lsYskCTPBJVb9jqSc=
46 | gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
47 | gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
48 | 


--------------------------------------------------------------------------------
/bench/bench_disk_usage_test.go:
--------------------------------------------------------------------------------
  1 | package bench
  2 | 
  3 | import (
  4 | 	"bytes"
  5 | 	"fmt"
  6 | 	"os"
  7 | 	"os/exec"
  8 | 	"strconv"
  9 | 	"strings"
 10 | 	"sync/atomic"
 11 | 	"testing"
 12 | 	"time"
 13 | 
 14 | 	"github.com/stretchr/testify/assert"
 15 | 	"github.com/wenzhang-dev/bitcaskDB"
 16 | )
 17 | 
 18 | func newDiskUsageDB(b *testing.B) {
 19 | 	dir = "./bitcaskDB"
 20 | 	_ = os.RemoveAll(dir)
 21 | 	_ = os.MkdirAll(dir, os.ModePerm)
 22 | 
 23 | 	opts := &bitcask.Options{
 24 | 		Dir:                       dir,
 25 | 		WalMaxSize:                1024 * 1024 * 1024, // 1GB
 26 | 		ManifestMaxSize:           10 * 1024 * 1024,   // 10MB
 27 | 		IndexCapacity:             10000000,           // 10 million
 28 | 		IndexLimited:              8000000,
 29 | 		IndexEvictionPoolCapacity: 64,
 30 | 		IndexSampleKeys:           5,
 31 | 		CompactionPicker:          nil, // default picker
 32 | 		CompactionFilter:          nil, // default filter
 33 | 		NsSize:                    bitcask.DefaultNsSize,
 34 | 		EtagSize:                  bitcask.DefaultEtagSize,
 35 | 		DisableCompaction:         false,
 36 | 		DiskUsageLimited:          10 * 1024 * 1024 * 1024, // 10GB
 37 | 	}
 38 | 
 39 | 	var err error
 40 | 	db, err = bitcask.NewDB(opts)
 41 | 	assert.Nil(b, err)
 42 | }
 43 | 
 44 | func BenchmarkDiskUsage(b *testing.B) {
 45 | 	b.Run("batchPut4K", benchmarkDiskUsageBatchPut)
 46 | 
 47 | 	b.Run("concurrentBatchPut4K", benchmarkDiskUsageConcurrentBatchPut)
 48 | }
 49 | 
 50 | func getActualDiskUsage(path string) int64 {
 51 | 	cmd := exec.Command("du", "-sb", path)
 52 | 
 53 | 	var out bytes.Buffer
 54 | 	cmd.Stdout = &out
 55 | 	if err := cmd.Run(); err != nil {
 56 | 		return 0
 57 | 	}
 58 | 
 59 | 	parts := strings.Fields(out.String())
 60 | 	if len(parts) < 1 {
 61 | 		return 0
 62 | 	}
 63 | 
 64 | 	size, err := strconv.ParseInt(parts[0], 10, 64)
 65 | 	if err != nil {
 66 | 		return 0
 67 | 	}
 68 | 
 69 | 	return size
 70 | }
 71 | 
 72 | // print disk usgae per three seconds
 73 | var (
 74 | 	totalBytesWritten int64
 75 | 	stopCh            chan struct{}
 76 | )
 77 | 
 78 | func printDiskUsageStat() {
 79 | 	fmt.Printf("\n")
 80 | 
 81 | 	ticker := time.NewTicker(3 * time.Second)
 82 | 	defer ticker.Stop()
 83 | 
 84 | 	var lastTotal int64
 85 | 	for {
 86 | 		select {
 87 | 		case <-ticker.C:
 88 | 			current := atomic.LoadInt64(&totalBytesWritten)
 89 | 			speed := current - lastTotal
 90 | 			lastTotal = current
 91 | 			fmt.Printf(
 92 | 				"Write Speed %.2f MB/s | Write Total: %.2f GB | Disk Usage: %.2f GB\n",
 93 | 				float64(speed)/1024/1024/3,
 94 | 				float64(current)/1024/1024/1024,
 95 | 				float64(getActualDiskUsage(dir))/1024/1024/1024,
 96 | 			)
 97 | 		case <-stopCh:
 98 | 			return
 99 | 		}
100 | 	}
101 | }
102 | 
103 | func benchmarkDiskUsageBatchPut(b *testing.B) {
104 | 	newDiskUsageDB(b)
105 | 	defer db.Close()
106 | 
107 | 	totalBytesWritten = 0
108 | 	stopCh = make(chan struct{})
109 | 	defer close(stopCh)
110 | 
111 | 	go printDiskUsageStat()
112 | 
113 | 	meta := bitcask.NewMeta(nil)
114 | 	opts := &bitcask.WriteOptions{}
115 | 
116 | 	b.ResetTimer()
117 | 	b.ReportAllocs()
118 | 
119 | 	batch := bitcask.NewBatch()
120 | 	for i := 0; i < b.N; i++ {
121 | 		batch.Put(ns[:], genTestKey(i), bin4KB, meta)
122 | 
123 | 		if i%BatchSize == 0 {
124 | 			err := db.Write(batch, opts)
125 | 			assert.Nil(b, err)
126 | 			atomic.AddInt64(&totalBytesWritten, int64(batch.ByteSize()))
127 | 
128 | 			batch.Clear()
129 | 		}
130 | 	}
131 | }
132 | 
133 | func benchmarkDiskUsageConcurrentBatchPut(b *testing.B) {
134 | 	newDiskUsageDB(b)
135 | 	defer db.Close()
136 | 
137 | 	totalBytesWritten = 0
138 | 	stopCh = make(chan struct{})
139 | 	defer close(stopCh)
140 | 
141 | 	go printDiskUsageStat()
142 | 
143 | 	meta := bitcask.NewMeta(nil)
144 | 	opts := &bitcask.WriteOptions{}
145 | 
146 | 	b.ResetTimer()
147 | 	b.ReportAllocs()
148 | 
149 | 	b.RunParallel(func(pb *testing.PB) {
150 | 		var err error
151 | 		iteration := 0
152 | 		batch := bitcask.NewBatch()
153 | 		for pb.Next() {
154 | 			batch.Put(ns[:], genTestKey(iteration), bin4KB, meta)
155 | 
156 | 			if iteration%BatchSize == 0 {
157 | 				err = db.Write(batch, opts)
158 | 				assert.Nil(b, err)
159 | 				atomic.AddInt64(&totalBytesWritten, int64(batch.ByteSize()))
160 | 
161 | 				batch.Clear()
162 | 			}
163 | 
164 | 			iteration++
165 | 		}
166 | 	})
167 | }
168 | 


--------------------------------------------------------------------------------
/manifest_edit.go:
--------------------------------------------------------------------------------
  1 | package bitcask
  2 | 
  3 | import (
  4 | 	"bytes"
  5 | 	"encoding/binary"
  6 | 	"errors"
  7 | )
  8 | 
  9 | const (
 10 | 	manifestEditDeleteFileTag = 1
 11 | 	manifestEditAddFileTag    = 2
 12 | 	manifestEditNextFidTag    = 3
 13 | 	manifestEditFreeBytesTag  = 4
 14 | )
 15 | 
 16 | var (
 17 | 	ErrCorruptedManifest      = errors.New("corrupted manifest file")
 18 | 	ErrUnknownManifestEditTag = errors.New("unknown manifest tag")
 19 | )
 20 | 
 21 | type LogFile struct {
 22 | 	wal *Wal
 23 | 
 24 | 	fid uint64
 25 | }
 26 | 
 27 | // the manifest edit will be persist one by one, and append to
 28 | // MANIFEST file
 29 | type ManifestEdit struct {
 30 | 	// the delete list of wal file
 31 | 	deleteFiles []LogFile
 32 | 
 33 | 	// the add list of wal file
 34 | 	addFiles []LogFile
 35 | 
 36 | 	// free bytes for each wal file
 37 | 	freeBytes map[uint64]uint64
 38 | 
 39 | 	// the available file number
 40 | 	nextFid    uint64
 41 | 	hasNextFid bool
 42 | }
 43 | 
 44 | func NewManifestEdit() *ManifestEdit {
 45 | 	return &ManifestEdit{
 46 | 		hasNextFid: false,
 47 | 		freeBytes:  make(map[uint64]uint64),
 48 | 	}
 49 | }
 50 | 
 51 | func (edit *ManifestEdit) Merge(other *ManifestEdit) {
 52 | 	if len(other.addFiles) > 0 {
 53 | 		edit.addFiles = append(edit.addFiles, other.addFiles...)
 54 | 	}
 55 | 
 56 | 	if len(other.deleteFiles) > 0 {
 57 | 		edit.deleteFiles = append(edit.deleteFiles, other.deleteFiles...)
 58 | 	}
 59 | 
 60 | 	if other.hasNextFid {
 61 | 		edit.hasNextFid = true
 62 | 		edit.nextFid = max(edit.nextFid, other.nextFid)
 63 | 	}
 64 | 
 65 | 	if len(other.freeBytes) > 0 {
 66 | 		for fid, freeBytes := range other.freeBytes {
 67 | 			edit.freeBytes[fid] += freeBytes
 68 | 		}
 69 | 	}
 70 | }
 71 | 
 72 | func (edit *ManifestEdit) Encode() []byte {
 73 | 	var buf bytes.Buffer
 74 | 	encodeVarint := func(v uint64) {
 75 | 		var tmp [binary.MaxVarintLen64]byte
 76 | 		n := binary.PutUvarint(tmp[:], v)
 77 | 		buf.Write(tmp[:n])
 78 | 	}
 79 | 
 80 | 	if edit.hasNextFid {
 81 | 		encodeVarint(manifestEditNextFidTag)
 82 | 		encodeVarint(edit.nextFid)
 83 | 	}
 84 | 
 85 | 	for _, file := range edit.addFiles {
 86 | 		encodeVarint(manifestEditAddFileTag)
 87 | 		encodeVarint(file.fid)
 88 | 	}
 89 | 
 90 | 	for _, file := range edit.deleteFiles {
 91 | 		encodeVarint(manifestEditDeleteFileTag)
 92 | 		encodeVarint(file.fid)
 93 | 	}
 94 | 
 95 | 	for fid, freeBytes := range edit.freeBytes {
 96 | 		encodeVarint(manifestEditFreeBytesTag)
 97 | 		encodeVarint(fid)
 98 | 		encodeVarint(freeBytes)
 99 | 	}
100 | 
101 | 	return buf.Bytes()
102 | }
103 | 
104 | func (edit *ManifestEdit) Clear() {
105 | 	edit.nextFid = 0
106 | 	edit.hasNextFid = false
107 | 
108 | 	edit.addFiles = nil
109 | 	edit.deleteFiles = nil
110 | 	edit.freeBytes = make(map[uint64]uint64)
111 | }
112 | 
113 | func (edit *ManifestEdit) DecodeFrom(data []byte) error {
114 | 	offset := 0
115 | 	edit.Clear()
116 | 	for offset < len(data) {
117 | 		tag, nbytes := binary.Uvarint(data[offset:])
118 | 		if nbytes <= 0 {
119 | 			return ErrCorruptedManifest
120 | 		}
121 | 		offset += nbytes
122 | 
123 | 		switch tag {
124 | 		case manifestEditDeleteFileTag:
125 | 			fid, nbytes := binary.Uvarint(data[offset:])
126 | 			if nbytes <= 0 {
127 | 				return ErrCorruptedManifest
128 | 			}
129 | 			offset += nbytes
130 | 			edit.deleteFiles = append(edit.deleteFiles, LogFile{fid: fid})
131 | 		case manifestEditAddFileTag:
132 | 			fid, nbytes := binary.Uvarint(data[offset:])
133 | 			if nbytes <= 0 {
134 | 				return ErrCorruptedManifest
135 | 			}
136 | 			offset += nbytes
137 | 			edit.addFiles = append(edit.addFiles, LogFile{fid: fid})
138 | 		case manifestEditNextFidTag:
139 | 			fid, nbytes := binary.Uvarint(data[offset:])
140 | 			if nbytes <= 0 {
141 | 				return ErrCorruptedManifest
142 | 			}
143 | 			offset += nbytes
144 | 			edit.hasNextFid = true
145 | 			edit.nextFid = max(edit.nextFid, fid)
146 | 		case manifestEditFreeBytesTag:
147 | 			fid, nbytes := binary.Uvarint(data[offset:])
148 | 			if nbytes <= 0 {
149 | 				return ErrCorruptedManifest
150 | 			}
151 | 			offset += nbytes
152 | 
153 | 			freeBytes, nbytes := binary.Uvarint(data[offset:])
154 | 			if nbytes <= 0 {
155 | 				return ErrCorruptedManifest
156 | 			}
157 | 			offset += nbytes
158 | 			edit.freeBytes[fid] += freeBytes
159 | 		default:
160 | 			return ErrUnknownManifestEditTag
161 | 		}
162 | 	}
163 | 
164 | 	return nil
165 | }
166 | 


--------------------------------------------------------------------------------
/record_test.go:
--------------------------------------------------------------------------------
  1 | package bitcask
  2 | 
  3 | import (
  4 | 	"testing"
  5 | 	"time"
  6 | 
  7 | 	"github.com/stretchr/testify/assert"
  8 | )
  9 | 
 10 | func TestRecord_EmptyNs(t *testing.T) {
 11 | 	// mock global options
 12 | 	oldOpts := gOpts
 13 | 	gOpts = &Options{
 14 | 		NsSize:   0,
 15 | 		EtagSize: 0,
 16 | 	}
 17 | 	defer func() {
 18 | 		gOpts = oldOpts
 19 | 	}()
 20 | 
 21 | 	// testcase
 22 | 	record := &Record{
 23 | 		Ns:    nil,
 24 | 		Key:   []byte("test-key"),
 25 | 		Value: []byte("test-value"),
 26 | 		Meta:  NewMeta(nil),
 27 | 	}
 28 | 
 29 | 	backStore := make([]byte, DefaultRecordBufferSize)
 30 | 	baseTime := uint64(time.Now().Unix())
 31 | 	encoded, err := record.Encode(backStore, baseTime)
 32 | 	assert.Nil(t, err)
 33 | 
 34 | 	decoded, err := RecordFromBytes(encoded, baseTime)
 35 | 	assert.Nil(t, err)
 36 | 
 37 | 	// check
 38 | 	assert.Equal(t, len(decoded.Ns), 0)
 39 | 	assert.Equal(t, decoded.Key, record.Key)
 40 | 	assert.Equal(t, decoded.Value, record.Value)
 41 | }
 42 | 
 43 | func TestRecord_EncodingDecoding(t *testing.T) {
 44 | 	// mock global options
 45 | 	oldOpts := gOpts
 46 | 	gOpts = &Options{
 47 | 		NsSize:   DefaultNsSize,
 48 | 		EtagSize: DefaultEtagSize,
 49 | 	}
 50 | 	defer func() {
 51 | 		gOpts = oldOpts
 52 | 	}()
 53 | 
 54 | 	// testcase
 55 | 	ns := sha1Bytes("test-ns")
 56 | 	etag := sha1Bytes("etag")
 57 | 	baseTime := uint64(time.Now().Unix())
 58 | 
 59 | 	tests := []struct {
 60 | 		name    string
 61 | 		record  *Record
 62 | 		wantErr bool
 63 | 	}{
 64 | 		{
 65 | 			name: "Normal case with full metadata and value",
 66 | 			record: &Record{
 67 | 				Ns:    ns[:],
 68 | 				Key:   []byte("test-key"),
 69 | 				Meta:  NewMeta(map[string]string{"foo": "bar"}).SetExpire(baseTime + 60).SetEtag(etag[:]).SetTombstone(true),
 70 | 				Value: []byte("hello world"),
 71 | 			},
 72 | 			wantErr: false,
 73 | 		},
 74 | 		{
 75 | 			name: "Nil APP Meta",
 76 | 			record: &Record{
 77 | 				Ns:    ns[:],
 78 | 				Key:   []byte("test-key"),
 79 | 				Meta:  NewMeta(nil).SetExpire(baseTime + 61).SetEtag(etag[:]).SetTombstone(false),
 80 | 				Value: []byte("hello world"),
 81 | 			},
 82 | 			wantErr: false,
 83 | 		},
 84 | 		{
 85 | 			name: "Nil Value",
 86 | 			record: &Record{
 87 | 				Ns:    ns[:],
 88 | 				Key:   []byte("test-key"),
 89 | 				Meta:  NewMeta(map[string]string{"foo": "bar"}).SetExpire(baseTime + 62).SetEtag(etag[:]).SetTombstone(true),
 90 | 				Value: []byte{},
 91 | 			},
 92 | 			wantErr: false,
 93 | 		},
 94 | 		{
 95 | 			name: "Nil APP Meta, Nil Value, Nil Etag and No Expire",
 96 | 			record: &Record{
 97 | 				Ns:    ns[:],
 98 | 				Key:   []byte("test-key"),
 99 | 				Meta:  NewMeta(nil),
100 | 				Value: []byte{},
101 | 			},
102 | 			wantErr: false,
103 | 		},
104 | 	}
105 | 
106 | 	backStore := make([]byte, DefaultRecordBufferSize)
107 | 	for _, tt := range tests {
108 | 		t.Run(tt.name, func(t *testing.T) {
109 | 			encoded, err := tt.record.Encode(backStore, baseTime)
110 | 			if (err != nil) != tt.wantErr {
111 | 				t.Errorf("Encode() error = %v, wantErr %v", err, tt.wantErr)
112 | 				return
113 | 			}
114 | 
115 | 			if tt.wantErr {
116 | 				return // No need to proceed if encoding is expected to fail
117 | 			}
118 | 
119 | 			decodedRecord, err := RecordFromBytes(encoded, baseTime)
120 | 			assert.NoError(t, err, "RecordFromBytes should not fail")
121 | 
122 | 			// Ensure Namespace is correctly restored
123 | 			assert.Equal(t, tt.record.Ns, decodedRecord.Ns, "Namespace mismatch")
124 | 
125 | 			// Ensure Key is correctly restored
126 | 			assert.Equal(t, tt.record.Key, decodedRecord.Key, "Key mismatch")
127 | 
128 | 			// Ensure Value is correctly restored
129 | 			assert.Equal(t, tt.record.Value, decodedRecord.Value, "Value mismatch")
130 | 
131 | 			// Check Meta field
132 | 			assert.NotNil(t, decodedRecord.Meta, "Meta should not be nil")
133 | 			assert.Equal(t, tt.record.Meta.Flags, decodedRecord.Meta.Flags, "Flags mismatch")
134 | 			assert.Equal(t, tt.record.Meta.Expire, decodedRecord.Meta.Expire, "Expire mismatch")
135 | 
136 | 			if len(tt.record.Meta.Etag) != 0 {
137 | 				assert.Equal(t, tt.record.Meta.Etag, decodedRecord.Meta.Etag, "Etag mismatch")
138 | 			}
139 | 
140 | 			if tt.record.Meta.AppMeta == nil {
141 | 				assert.Nil(t, decodedRecord.Meta.AppMeta, "AppMeta should be nil")
142 | 			} else {
143 | 				assert.Equal(t, tt.record.Meta.AppMeta, decodedRecord.Meta.AppMeta, "AppMeta mismatch")
144 | 			}
145 | 		})
146 | 	}
147 | }
148 | 


--------------------------------------------------------------------------------
/manifest_txn_test.go:
--------------------------------------------------------------------------------
  1 | package bitcask
  2 | 
  3 | import (
  4 | 	"errors"
  5 | 	"os"
  6 | 	"testing"
  7 | 
  8 | 	"github.com/stretchr/testify/assert"
  9 | )
 10 | 
 11 | func setupManifestTxn(t *testing.T) (*ManifestTxn, func()) {
 12 | 	dir := "./test_bitcask_db"
 13 | 	_ = os.RemoveAll(dir)
 14 | 
 15 | 	assert.Nil(t, os.MkdirAll(dir, os.ModePerm))
 16 | 
 17 | 	manifest, err := NewManifest(dir)
 18 | 	assert.Nil(t, err)
 19 | 
 20 | 	txn, err := manifest.NewTxn()
 21 | 	assert.Nil(t, err)
 22 | 	return txn, func() {
 23 | 		os.RemoveAll(dir)
 24 | 	}
 25 | }
 26 | 
 27 | func TestManifestTxn_Commit(t *testing.T) {
 28 | 	txn, closer := setupManifestTxn(t)
 29 | 	defer closer()
 30 | 
 31 | 	manifest := txn.manifest
 32 | 	dir := manifest.dir
 33 | 
 34 | 	wal3, _ := NewWal(WalPath(dir, 3), 3, -1)
 35 | 	wal4, _ := NewWal(WalPath(dir, 4), 4, -1)
 36 | 	wal5, _ := NewWal(WalPath(dir, 5), 5, -1)
 37 | 	wal6, _ := NewWal(WalPath(dir, 6), 6, -1)
 38 | 	wal7, _ := NewWal(WalPath(dir, 7), 7, -1)
 39 | 
 40 | 	edit1 := &ManifestEdit{
 41 | 		addFiles: []LogFile{
 42 | 			{fid: 3, wal: wal3},
 43 | 			{fid: 4, wal: wal4},
 44 | 			{fid: 5, wal: wal5},
 45 | 		},
 46 | 		deleteFiles: nil,
 47 | 		hasNextFid:  true,
 48 | 		nextFid:     6,
 49 | 		freeBytes:   make(map[uint64]uint64),
 50 | 	}
 51 | 	edit1.freeBytes[2] = 123
 52 | 
 53 | 	// apply
 54 | 	txn.Apply(edit1)
 55 | 
 56 | 	// check
 57 | 	assert.False(t, txn.aborted)
 58 | 	assert.False(t, txn.committed)
 59 | 	assert.False(t, txn.IsDone())
 60 | 	assert.Equal(t, wal3, txn.ToWal(3))
 61 | 	assert.Equal(t, wal4, txn.ToWal(4))
 62 | 	assert.Equal(t, wal5, txn.ToWal(5))
 63 | 	assert.Equal(t, txn.NextFid(), uint64(6))
 64 | 
 65 | 	assert.Equal(t, wal3, manifest.ToWal(3))
 66 | 	assert.Equal(t, wal4, manifest.ToWal(4))
 67 | 	assert.Equal(t, wal5, manifest.ToWal(5))
 68 | 	assert.Equal(t, manifest.NextFid(), uint64(6))
 69 | 
 70 | 	// commit
 71 | 	edit2 := &ManifestEdit{
 72 | 		addFiles: []LogFile{
 73 | 			{fid: 6, wal: wal6},
 74 | 			{fid: 7, wal: wal7},
 75 | 		},
 76 | 		deleteFiles: []LogFile{
 77 | 			{fid: 3, wal: wal3},
 78 | 			{fid: 4, wal: wal4},
 79 | 			{fid: 5, wal: wal5},
 80 | 		},
 81 | 		hasNextFid: true,
 82 | 		nextFid:    8,
 83 | 		freeBytes:  make(map[uint64]uint64),
 84 | 	}
 85 | 
 86 | 	edit2.freeBytes[2] = 123
 87 | 
 88 | 	assert.Nil(t, txn.Commit(edit2))
 89 | 
 90 | 	// check
 91 | 	assert.True(t, txn.committed)
 92 | 	assert.False(t, txn.aborted)
 93 | 	assert.True(t, txn.IsDone())
 94 | 
 95 | 	assert.Nil(t, manifest.ToWal(3))
 96 | 	assert.Nil(t, manifest.ToWal(4))
 97 | 	assert.Nil(t, manifest.ToWal(5))
 98 | 
 99 | 	assert.Equal(t, wal6, manifest.ToWal(6))
100 | 	assert.Equal(t, wal7, manifest.ToWal(7))
101 | 	assert.Equal(t, manifest.NextFid(), uint64(8))
102 | 
103 | 	// commit again
104 | 	err := txn.Commit(nil)
105 | 	assert.True(t, errors.Is(err, ErrCommittedManfiestTxn))
106 | }
107 | 
108 | func TestManifestTxn_Abort(t *testing.T) {
109 | 	txn, closer := setupManifestTxn(t)
110 | 	defer closer()
111 | 
112 | 	manifest := txn.manifest
113 | 	dir := manifest.dir
114 | 
115 | 	wal3, _ := NewWal(WalPath(dir, 3), 3, -1)
116 | 	wal4, _ := NewWal(WalPath(dir, 4), 4, -1)
117 | 	wal5, _ := NewWal(WalPath(dir, 5), 5, -1)
118 | 
119 | 	edit1 := &ManifestEdit{
120 | 		addFiles: []LogFile{
121 | 			{fid: 3, wal: wal3},
122 | 			{fid: 4, wal: wal4},
123 | 			{fid: 5, wal: wal5},
124 | 		},
125 | 		deleteFiles: nil,
126 | 		hasNextFid:  true,
127 | 		nextFid:     6,
128 | 		freeBytes:   make(map[uint64]uint64),
129 | 	}
130 | 	edit1.freeBytes[2] = 123
131 | 
132 | 	// apply
133 | 	txn.Apply(edit1)
134 | 
135 | 	// check
136 | 	assert.False(t, txn.aborted)
137 | 	assert.False(t, txn.committed)
138 | 	assert.False(t, txn.IsDone())
139 | 	assert.Equal(t, wal3, txn.ToWal(3))
140 | 	assert.Equal(t, wal4, txn.ToWal(4))
141 | 	assert.Equal(t, wal5, txn.ToWal(5))
142 | 	assert.Equal(t, txn.NextFid(), uint64(6))
143 | 
144 | 	assert.Equal(t, wal3, manifest.ToWal(3))
145 | 	assert.Equal(t, wal4, manifest.ToWal(4))
146 | 	assert.Equal(t, wal5, manifest.ToWal(5))
147 | 	assert.Equal(t, manifest.NextFid(), uint64(6))
148 | 
149 | 	// abort
150 | 	txn.Abort()
151 | 
152 | 	// check
153 | 	assert.True(t, txn.aborted)
154 | 	assert.False(t, txn.committed)
155 | 	assert.True(t, txn.IsDone())
156 | 
157 | 	assert.Nil(t, manifest.ToWal(3))
158 | 	assert.Nil(t, manifest.ToWal(4))
159 | 	assert.Nil(t, manifest.ToWal(5))
160 | 	assert.Equal(t, manifest.NextFid(), uint64(3))
161 | 
162 | 	// abort again
163 | 	txn.Abort()
164 | }
165 | 


--------------------------------------------------------------------------------
/hint.go:
--------------------------------------------------------------------------------
  1 | package bitcask
  2 | 
  3 | import (
  4 | 	"bytes"
  5 | 	"encoding/binary"
  6 | 	"errors"
  7 | )
  8 | 
  9 | type HintRecord struct {
 10 | 	ns  []byte
 11 | 	key []byte
 12 | 
 13 | 	fid  uint64
 14 | 	off  uint64
 15 | 	size uint64
 16 | }
 17 | 
 18 | const (
 19 | 	hintWalRewriterThrehold = 1024 * 1024 // 1MB
 20 | )
 21 | 
 22 | var ErrCorruptedHintRecord = errors.New("corrupted hint record")
 23 | 
 24 | // format:
 25 | // | ns | key-size | key | fid | off | size |
 26 | //
 27 | // ns: fixed-size string
 28 | // key-size: varint64
 29 | // fid: varint64
 30 | // off: varint64
 31 | // size: varint64
 32 | func (r *HintRecord) Encode() ([]byte, error) {
 33 | 	var buf bytes.Buffer
 34 | 	encodeVarint := func(v uint64) {
 35 | 		var tmp [binary.MaxVarintLen64]byte
 36 | 		n := binary.PutUvarint(tmp[:], v)
 37 | 		buf.Write(tmp[:n])
 38 | 	}
 39 | 
 40 | 	buf.Write(r.ns)
 41 | 	encodeVarint(uint64(len(r.key)))
 42 | 	buf.Write(r.key)
 43 | 	encodeVarint(r.fid)
 44 | 	encodeVarint(r.off)
 45 | 	encodeVarint(r.size)
 46 | 
 47 | 	return buf.Bytes(), nil
 48 | }
 49 | 
 50 | func (r *HintRecord) Decode(data []byte) error {
 51 | 	nsSize := int(GetOptions().NsSize)
 52 | 	minHintRecordSize := nsSize + 1 + 1 + 1*3
 53 | 	if len(data) < minHintRecordSize {
 54 | 		return ErrCorruptedHintRecord
 55 | 	}
 56 | 
 57 | 	offset := 0
 58 | 
 59 | 	r.ns = data[:nsSize]
 60 | 	offset += nsSize
 61 | 
 62 | 	keyLen, nbytes := DecodeUvarint(data[offset:])
 63 | 	offset += nbytes
 64 | 
 65 | 	keyOffset := offset
 66 | 	offset += int(keyLen)
 67 | 
 68 | 	r.fid, nbytes = DecodeUvarint(data[offset:])
 69 | 	offset += nbytes
 70 | 
 71 | 	r.off, nbytes = DecodeUvarint(data[offset:])
 72 | 	offset += nbytes
 73 | 
 74 | 	r.size, nbytes = DecodeUvarint(data[offset:])
 75 | 	offset += nbytes
 76 | 
 77 | 	if offset != len(data) {
 78 | 		return ErrCorruptedHintRecord
 79 | 	}
 80 | 
 81 | 	r.key = data[keyOffset : keyOffset+int(keyLen)]
 82 | 
 83 | 	return nil
 84 | }
 85 | 
 86 | type HintWriter struct {
 87 | 	rewriter *WalRewriter
 88 | }
 89 | 
 90 | func NewHintWriter(path string, fid uint64, baseTime int64) (*HintWriter, error) {
 91 | 	hint, err := NewWal(path, fid, baseTime)
 92 | 	if err != nil {
 93 | 		return nil, err
 94 | 	}
 95 | 
 96 | 	return &HintWriter{
 97 | 		rewriter: NewWalRewriter(hint, hintWalRewriterThrehold),
 98 | 	}, nil
 99 | }
100 | 
101 | func (w *HintWriter) AppendRecord(record *HintRecord) error {
102 | 	recordBytes, err := record.Encode()
103 | 	if err != nil {
104 | 		return err
105 | 	}
106 | 
107 | 	_, err = w.rewriter.AppendRecord(recordBytes)
108 | 	return err
109 | }
110 | 
111 | func (w *HintWriter) Wal() *Wal {
112 | 	return w.rewriter.wal
113 | }
114 | 
115 | func (w *HintWriter) Close() error {
116 | 	return w.rewriter.Close()
117 | }
118 | 
119 | func (w *HintWriter) Flush() error {
120 | 	return w.rewriter.Flush()
121 | }
122 | 
123 | func NewHintByWal(wal *Wal) (uint64, error) {
124 | 	// hint wal use the same fid and base time
125 | 	hintPath := TmpPath(wal.Dir(), wal.fid)
126 | 	writer, err := NewHintWriter(hintPath, wal.fid, int64(wal.BaseTime()))
127 | 	if err != nil {
128 | 		return 0, err
129 | 	}
130 | 
131 | 	defer writer.Close()
132 | 
133 | 	if err = IterateRecord(wal, func(record *Record, foff, size uint64) error {
134 | 		// the foff points to the start offset of data in the wal
135 | 		// however, the offset used by ReadRecord of wal expects the start offset of data header
136 | 		foff -= RecordHeaderSize
137 | 
138 | 		hintRecord := &HintRecord{
139 | 			ns:   record.Ns,
140 | 			key:  record.Key,
141 | 			fid:  wal.fid,
142 | 			off:  foff,
143 | 			size: size,
144 | 		}
145 | 
146 | 		return writer.AppendRecord(hintRecord)
147 | 	}); err != nil {
148 | 		return 0, err
149 | 	}
150 | 
151 | 	if err = writer.Flush(); err != nil {
152 | 		return 0, err
153 | 	}
154 | 
155 | 	// rename hint file
156 | 	if err = writer.Wal().Rename(HintFilename(wal.fid)); err != nil {
157 | 		return 0, err
158 | 	}
159 | 
160 | 	return writer.Wal().Size(), nil
161 | }
162 | 
163 | func IterateHint(hint *Wal, cb func(record *HintRecord) error) error {
164 | 	it := NewWalIterator(hint)
165 | 	defer it.Close()
166 | 
167 | 	var err error
168 | 	var recordBytes []byte
169 | 	record := &HintRecord{}
170 | 	for {
171 | 		if _, recordBytes, err = it.Next(); err != nil {
172 | 			if errors.Is(err, ErrWalIteratorEOF) {
173 | 				break
174 | 			}
175 | 			return err
176 | 		}
177 | 
178 | 		if err = record.Decode(recordBytes); err != nil {
179 | 			return err
180 | 		}
181 | 
182 | 		if err = cb(record); err != nil {
183 | 			return err
184 | 		}
185 | 	}
186 | 
187 | 	return nil
188 | }
189 | 


--------------------------------------------------------------------------------
/deque.go:
--------------------------------------------------------------------------------
  1 | package bitcask
  2 | 
  3 | import "errors"
  4 | 
  5 | const (
  6 | 	DequeChunkSize        = 512
  7 | 	DequeInitSize         = 64
  8 | 	DequeFrontReserveSize = 3
  9 | )
 10 | 
 11 | var (
 12 | 	ErrDequeEmpty      = errors.New("deque empty")
 13 | 	ErrDequeOutOfRange = errors.New("deque out of range")
 14 | )
 15 | 
 16 | type dequeChunk[T any] struct {
 17 | 	buf [DequeChunkSize]T
 18 | 
 19 | 	// the [start, end) is actual data range
 20 | 	start int32
 21 | 	end   int32
 22 | }
 23 | 
 24 | func (c *dequeChunk[T]) size() int {
 25 | 	return int(c.end - c.start)
 26 | }
 27 | 
 28 | func (c *dequeChunk[T]) rightFull() bool {
 29 | 	return c.end == DequeChunkSize
 30 | }
 31 | 
 32 | func (c *dequeChunk[T]) leftFull() bool {
 33 | 	return c.start == 0
 34 | }
 35 | 
 36 | func (c *dequeChunk[T]) empty() bool {
 37 | 	return c.start == c.end
 38 | }
 39 | 
 40 | type Deque[T any] struct {
 41 | 	// buffer
 42 | 	chunks []*dequeChunk[T]
 43 | 
 44 | 	// the [start, end) is actual data range
 45 | 	start int64
 46 | 	end   int64 // end-1 index always points valid chunk
 47 | 
 48 | 	// the total size of elements
 49 | 	size int64
 50 | }
 51 | 
 52 | func NewDeque[T any]() *Deque[T] {
 53 | 	// the init start end end range: [2, 4)
 54 | 	d := &Deque[T]{
 55 | 		chunks: make([]*dequeChunk[T], DequeInitSize),
 56 | 		start:  DequeFrontReserveSize - 1,
 57 | 		end:    DequeFrontReserveSize + 1,
 58 | 		size:   0,
 59 | 	}
 60 | 
 61 | 	// the start index(2) is writable
 62 | 	d.chunks[d.start] = &dequeChunk[T]{
 63 | 		start: DequeChunkSize,
 64 | 		end:   DequeChunkSize,
 65 | 	}
 66 | 
 67 | 	// the end-1 index(3) is writable
 68 | 	d.chunks[d.start+1] = &dequeChunk[T]{
 69 | 		start: 0,
 70 | 		end:   0,
 71 | 	}
 72 | 
 73 | 	return d
 74 | }
 75 | 
 76 | func (d *Deque[T]) Back() (*T, error) {
 77 | 	if d.Empty() {
 78 | 		return nil, ErrDequeEmpty
 79 | 	}
 80 | 
 81 | 	return d.At(d.Len() - 1)
 82 | }
 83 | 
 84 | func (d *Deque[T]) Front() (*T, error) {
 85 | 	if d.Empty() {
 86 | 		return nil, ErrDequeEmpty
 87 | 	}
 88 | 
 89 | 	return d.At(0)
 90 | }
 91 | 
 92 | func (d *Deque[T]) grow() {
 93 | 	if int(d.end) == len(d.chunks) || d.start == 0 {
 94 | 		// copy pointers
 95 | 		chunks := make([]*dequeChunk[T], 2*d.end)
 96 | 		for i := d.start; i < d.end; i++ {
 97 | 			chunks[DequeFrontReserveSize+(i-d.start)] = d.chunks[i]
 98 | 		}
 99 | 
100 | 		// order matters
101 | 		d.end = DequeFrontReserveSize + (d.end - d.start)
102 | 		d.start = DequeFrontReserveSize
103 | 		d.chunks = chunks
104 | 	}
105 | }
106 | 
107 | func (d *Deque[T]) PushBack(v T) {
108 | 	// the end-1 chunk always writable
109 | 	chunk := d.chunks[d.end-1]
110 | 
111 | 	chunk.buf[chunk.end] = v
112 | 	chunk.end++
113 | 
114 | 	if chunk.rightFull() {
115 | 		d.grow()
116 | 
117 | 		d.chunks[d.end] = &dequeChunk[T]{
118 | 			start: 0,
119 | 			end:   0,
120 | 		}
121 | 		d.end++
122 | 	}
123 | 
124 | 	d.size++
125 | }
126 | 
127 | func (d *Deque[T]) PushFront(v T) {
128 | 	// the start chunk always writable
129 | 	chunk := d.chunks[d.start]
130 | 
131 | 	chunk.buf[chunk.start-1] = v
132 | 	chunk.start--
133 | 
134 | 	if chunk.leftFull() {
135 | 		d.grow()
136 | 
137 | 		d.chunks[d.start-1] = &dequeChunk[T]{
138 | 			start: DequeChunkSize,
139 | 			end:   DequeChunkSize,
140 | 		}
141 | 		d.start--
142 | 	}
143 | 
144 | 	d.size++
145 | }
146 | 
147 | func (d *Deque[T]) PopBack() error {
148 | 	if d.Empty() {
149 | 		return ErrDequeEmpty
150 | 	}
151 | 
152 | 	chunk := d.chunks[d.end-1]
153 | 	if chunk.empty() {
154 | 		d.chunks[d.end-1] = nil
155 | 		d.end--
156 | 		chunk = d.chunks[d.end-1]
157 | 	}
158 | 
159 | 	// make sure the chunk always writable
160 | 	chunk.end--
161 | 
162 | 	d.size--
163 | 	if d.size == 0 {
164 | 		d.Clear()
165 | 	}
166 | 
167 | 	return nil
168 | }
169 | 
170 | func (d *Deque[T]) PopFront() error {
171 | 	if d.Empty() {
172 | 		return ErrDequeEmpty
173 | 	}
174 | 
175 | 	chunk := d.chunks[d.start]
176 | 	if chunk.empty() {
177 | 		d.chunks[d.start] = nil
178 | 		d.start++
179 | 		chunk = d.chunks[d.start]
180 | 	}
181 | 
182 | 	// make sure the chunk always writable
183 | 	chunk.start++
184 | 
185 | 	d.size--
186 | 	if d.size == 0 {
187 | 		d.Clear()
188 | 	}
189 | 
190 | 	return nil
191 | }
192 | 
193 | func (d *Deque[T]) Len() int {
194 | 	return int(d.size)
195 | }
196 | 
197 | func (d *Deque[T]) Empty() bool {
198 | 	return d.size == 0
199 | }
200 | 
201 | func (d *Deque[T]) At(idx int) (*T, error) {
202 | 	if idx >= d.Len() {
203 | 		return nil, ErrDequeOutOfRange
204 | 	}
205 | 
206 | 	if idx < d.chunks[d.start].size() {
207 | 		chunk := d.chunks[d.start]
208 | 		return &chunk.buf[int(chunk.start)+idx], nil
209 | 	}
210 | 
211 | 	idx -= d.chunks[d.start].size()
212 | 	pos := int(d.start+1) + idx/DequeChunkSize
213 | 	off := idx % DequeChunkSize
214 | 	chunk := d.chunks[pos]
215 | 	return &chunk.buf[int(chunk.start)+off], nil
216 | }
217 | 
218 | func (d *Deque[T]) Clear() {
219 | 	for i := d.start; i < d.end; i++ {
220 | 		d.chunks[i] = nil
221 | 	}
222 | 
223 | 	d.start = DequeFrontReserveSize - 1
224 | 	d.end = DequeFrontReserveSize + 1
225 | 
226 | 	// the start index(2) is writable
227 | 	d.chunks[d.start] = &dequeChunk[T]{
228 | 		start: DequeChunkSize,
229 | 		end:   DequeChunkSize,
230 | 	}
231 | 
232 | 	// the end-1 index(3) is writable
233 | 	d.chunks[d.start+1] = &dequeChunk[T]{
234 | 		start: 0,
235 | 		end:   0,
236 | 	}
237 | }
238 | 


--------------------------------------------------------------------------------
/db.go:
--------------------------------------------------------------------------------
  1 | package bitcask
  2 | 
  3 | import (
  4 | 	"errors"
  5 | 	"fmt"
  6 | 	"path/filepath"
  7 | 	"sort"
  8 | )
  9 | 
 10 | const (
 11 | 	WalFileSuffix      = ".wal"
 12 | 	HintFileSuffix     = ".hint"
 13 | 	MergeFileSuffix    = ".merge"
 14 | 	TmpFileSuffix      = ".tmp"
 15 | 	LockFile           = "LOCK"
 16 | 	ManifestFilePrefix = "MANIFEST"
 17 | 	CurrentFile        = "CURRENT"
 18 | )
 19 | 
 20 | const (
 21 | 	UnknownFileType = iota
 22 | 
 23 | 	WalFileType
 24 | 	HintFileType
 25 | 	MergeFileType
 26 | 	TmpFileType
 27 | 	LockFileType
 28 | 	ManifestFileType
 29 | 	CurrentFileType
 30 | )
 31 | 
 32 | var (
 33 | 	ErrKeyNotFound    = errors.New("key not found")
 34 | 	ErrIncompleteRead = errors.New("incomplete block read")
 35 | 	ErrKeySoftDeleted = errors.New("key soft delete")
 36 | )
 37 | 
 38 | type WriteOptions struct {
 39 | 	Sync bool
 40 | }
 41 | 
 42 | type ReadOptions struct {
 43 | 	VerifyChecksum bool
 44 | }
 45 | 
 46 | type PickerWalInfo struct {
 47 | 	CreateTime uint64
 48 | 	FreeBytes  uint64
 49 | 	WalSize    uint64
 50 | 	Fid        uint64
 51 | }
 52 | 
 53 | type (
 54 | 	CompactionPicker func([]PickerWalInfo) []uint64
 55 | 	CompactionFilter func(ns, key, val []byte, meta *Meta) bool
 56 | )
 57 | 
 58 | type Options struct {
 59 | 	Dir string
 60 | 
 61 | 	LogFile       string
 62 | 	LogDir        string
 63 | 	LogLevel      int8
 64 | 	LogMaxSize    uint64
 65 | 	LogMaxBackups uint64
 66 | 
 67 | 	WalMaxSize      uint64
 68 | 	ManifestMaxSize uint64
 69 | 
 70 | 	IndexCapacity             uint64
 71 | 	IndexLimited              uint64
 72 | 	IndexEvictionPoolCapacity uint64
 73 | 	IndexSampleKeys           uint64
 74 | 
 75 | 	BlockCacheCapacity             uint64
 76 | 	BlockCacheLimited              uint64
 77 | 	BlockCacheEvictionPoolCapacity uint64
 78 | 	BlockCacheSampleKeys           uint64
 79 | 
 80 | 	BlockReaderConcurrent uint64
 81 | 
 82 | 	CompactionPicker CompactionPicker
 83 | 	CompactionFilter CompactionFilter
 84 | 
 85 | 	DiskUsageLimited uint64
 86 | 
 87 | 	NsSize   uint64
 88 | 	EtagSize uint64
 89 | 
 90 | 	CompactionTriggerInterval uint64
 91 | 	CheckDiskUsageInterval    uint64
 92 | 
 93 | 	CompactionPickerRatio float64
 94 | 
 95 | 	DisableCompaction bool
 96 | 
 97 | 	RecordBufferSize uint64
 98 | }
 99 | 
100 | func (o *Options) Init() {
101 | 	if o.LogDir == "" {
102 | 		o.LogDir = o.Dir
103 | 	}
104 | 
105 | 	if o.LogFile == "" {
106 | 		o.LogFile = DefaultLogFile
107 | 	}
108 | 
109 | 	if o.LogMaxSize == 0 {
110 | 		o.LogMaxSize = DefaultLogMaxSize
111 | 	}
112 | 
113 | 	if o.CompactionPicker == nil {
114 | 		o.CompactionPicker = DefaultCompactionPicker
115 | 	}
116 | 
117 | 	if o.CompactionTriggerInterval <= 0 {
118 | 		o.CompactionTriggerInterval = DefaultCompactionTriggerInterval
119 | 	}
120 | 
121 | 	if o.CheckDiskUsageInterval <= 0 {
122 | 		o.CheckDiskUsageInterval = DefaultCheckDiskUsageInterval
123 | 	}
124 | 
125 | 	if o.CompactionPickerRatio <= 0 {
126 | 		o.CompactionPickerRatio = DefaultCompactionPickerRatio
127 | 	}
128 | 
129 | 	if o.RecordBufferSize <= 0 {
130 | 		o.RecordBufferSize = DefaultRecordBufferSize
131 | 	}
132 | 
133 | 	gOpts = o
134 | }
135 | 
136 | var gOpts *Options
137 | 
138 | // read-only
139 | func GetOptions() *Options {
140 | 	return gOpts
141 | }
142 | 
143 | type DB interface {
144 | 	Get(ns, key []byte, opts *ReadOptions) (val []byte, meta *Meta, err error)
145 | 	Put(ns, key, val []byte, meta *Meta, opts *WriteOptions) error
146 | 
147 | 	Write(batch *Batch, opts *WriteOptions) error
148 | 	Delete(ns, key []byte, opts *WriteOptions) error
149 | 	Close()
150 | }
151 | 
152 | func TmpFilename(fid uint64) string {
153 | 	return fmt.Sprintf("%06d%s", fid, TmpFileSuffix)
154 | }
155 | 
156 | func WalFilename(fid uint64) string {
157 | 	return fmt.Sprintf("%06d%s", fid, WalFileSuffix)
158 | }
159 | 
160 | func HintFilename(fid uint64) string {
161 | 	return fmt.Sprintf("%06d%s", fid, HintFileSuffix)
162 | }
163 | 
164 | func MergeFilename(fid uint64) string {
165 | 	return fmt.Sprintf("%06d%s", fid, MergeFileSuffix)
166 | }
167 | 
168 | func ManifestFilename(fid uint64) string {
169 | 	return fmt.Sprintf("%s-%06d", ManifestFilePrefix, fid)
170 | }
171 | 
172 | func TmpPath(dir string, fid uint64) string {
173 | 	return filepath.Join(dir, TmpFilename(fid))
174 | }
175 | 
176 | func WalPath(dir string, fid uint64) string {
177 | 	return filepath.Join(dir, WalFilename(fid))
178 | }
179 | 
180 | func HintPath(dir string, fid uint64) string {
181 | 	return filepath.Join(dir, HintFilename(fid))
182 | }
183 | 
184 | func ManifestPath(dir string, fid uint64) string {
185 | 	return filepath.Join(dir, ManifestFilename(fid))
186 | }
187 | 
188 | func MergePath(dir string, fid uint64) string {
189 | 	return filepath.Join(dir, MergeFilename(fid))
190 | }
191 | 
192 | func LockPath(dir string) string {
193 | 	return filepath.Join(dir, LockFile)
194 | }
195 | 
196 | func CurrentPath(dir string) string {
197 | 	return filepath.Join(dir, CurrentFile)
198 | }
199 | 
200 | func DefaultCompactionPicker(wals []PickerWalInfo) []uint64 {
201 | 	compactionPickerRatio := GetOptions().CompactionPickerRatio
202 | 
203 | 	// reverse order
204 | 	sort.Slice(wals, func(i, j int) bool {
205 | 		return wals[i].FreeBytes > wals[j].FreeBytes
206 | 	})
207 | 
208 | 	var res []uint64
209 | 	for idx := range wals {
210 | 		size := float64(wals[idx].WalSize)
211 | 		free := float64(wals[idx].FreeBytes)
212 | 
213 | 		if free/size < compactionPickerRatio {
214 | 			break
215 | 		}
216 | 
217 | 		res = append(res, wals[idx].Fid)
218 | 		if len(res) >= 2 {
219 | 			break
220 | 		}
221 | 	}
222 | 
223 | 	return res
224 | }
225 | 


--------------------------------------------------------------------------------
/block_cache.go:
--------------------------------------------------------------------------------
  1 | package bitcask
  2 | 
  3 | import (
  4 | 	"errors"
  5 | 	"sync"
  6 | 	"time"
  7 | )
  8 | 
  9 | const (
 10 | 	// fid use 5 bytes
 11 | 	BlockCacheFidBits  = 40
 12 | 	BlockCacheFidMask  = (1 << BlockCacheFidBits) - 1
 13 | 	BlockCacheFidShift = (64 - 40) // high 40-bits
 14 | 
 15 | 	// each data block is 32KB, and 22 bits can locate 128 GB
 16 | 	BlockCacheIdxBits  = 22
 17 | 	BlockCacheIdxMask  = (1 << BlockCacheIdxBits) - 1
 18 | 	BlockCacheIdxShift = 0 // low 22-bits
 19 | 
 20 | 	// the extra 2 bits reserved
 21 | )
 22 | 
 23 | var (
 24 | 	ErrBlockCacheMiss          = errors.New("block cache miss")
 25 | 	ErrBlockCacheFidOutOfRange = errors.New("block cache fid out of range")
 26 | 	ErrBlockCacheIdxOutOfRange = errors.New("block cache idx out of range")
 27 | )
 28 | 
 29 | func BlockCacheKey(fid, blkIdx uint64) (uint64, error) {
 30 | 	if fid > BlockCacheFidMask {
 31 | 		return 0, ErrBlockCacheFidOutOfRange
 32 | 	}
 33 | 
 34 | 	if blkIdx > BlockCacheIdxMask {
 35 | 		return 0, ErrBlockCacheIdxOutOfRange
 36 | 	}
 37 | 
 38 | 	return (fid << BlockCacheFidShift) | blkIdx, nil
 39 | }
 40 | 
 41 | type BlockCacheOperator struct {
 42 | 	helper MapOperatorBase
 43 | }
 44 | 
 45 | func (optr *BlockCacheOperator) Hash(key *uint64) uint64 {
 46 | 	return *key
 47 | }
 48 | 
 49 | func (optr *BlockCacheOperator) Equals(lhs, rhs *uint64) bool {
 50 | 	return *lhs == *rhs
 51 | }
 52 | 
 53 | func (optr *BlockCacheOperator) Rand(upper uint64) uint64 {
 54 | 	return optr.helper.Rand(upper)
 55 | }
 56 | 
 57 | func (optr *BlockCacheOperator) WallTime() time.Time {
 58 | 	return optr.helper.WallTime()
 59 | }
 60 | 
 61 | type BlockCache struct {
 62 | 	maps *ShardMap[uint64, []byte]
 63 | 
 64 | 	blkPool *sync.Pool
 65 | }
 66 | 
 67 | type BlockCacheOptions struct {
 68 | 	Capacity             uint64
 69 | 	Limited              uint64
 70 | 	EvictionPoolCapacity uint64
 71 | 	SampleKeys           uint64
 72 | 
 73 | 	Helper MapOperatorBase
 74 | }
 75 | 
 76 | func NewBlockCache(opts *BlockCacheOptions) (*BlockCache, error) {
 77 | 	blkPool := &sync.Pool{
 78 | 		New: func() any {
 79 | 			b := make([]byte, BlockSize)
 80 | 			return &b
 81 | 		},
 82 | 	}
 83 | 
 84 | 	if opts == nil || opts.Capacity == 0 {
 85 | 		return &BlockCache{
 86 | 			maps:    nil,
 87 | 			blkPool: blkPool,
 88 | 		}, nil
 89 | 	}
 90 | 
 91 | 	mapOpts := &MapOptions{
 92 | 		Capacity:             opts.Capacity,
 93 | 		Limited:              opts.Limited,
 94 | 		EvictionPoolCapacity: opts.EvictionPoolCapacity,
 95 | 		SampleKeys:           opts.SampleKeys,
 96 | 	}
 97 | 
 98 | 	optr := &BlockCacheOperator{
 99 | 		helper: opts.Helper,
100 | 	}
101 | 
102 | 	maps, err := NewShardMap[uint64, []byte](optr, mapOpts)
103 | 	if err != nil {
104 | 		return nil, err
105 | 	}
106 | 
107 | 	return &BlockCache{
108 | 		maps:    maps,
109 | 		blkPool: blkPool,
110 | 	}, nil
111 | }
112 | 
113 | func (c *BlockCache) key(fid, blkIdx uint64) (uint64, error) {
114 | 	if fid > BlockCacheFidMask {
115 | 		return 0, ErrBlockCacheFidOutOfRange
116 | 	}
117 | 
118 | 	if blkIdx > BlockCacheIdxMask {
119 | 		return 0, ErrBlockCacheIdxOutOfRange
120 | 	}
121 | 
122 | 	return (fid << BlockCacheFidShift) | blkIdx, nil
123 | }
124 | 
125 | func (c *BlockCache) Get(fid, blkIdx uint64) ([]byte, error) {
126 | 	if c.maps == nil {
127 | 		return nil, ErrBlockCacheMiss
128 | 	}
129 | 
130 | 	key, err := c.key(fid, blkIdx)
131 | 	if err != nil {
132 | 		return nil, err
133 | 	}
134 | 
135 | 	if blkPtr, err := c.maps.Get(&key); err == nil {
136 | 		return *blkPtr, nil
137 | 	}
138 | 
139 | 	return nil, ErrBlockCacheMiss
140 | }
141 | 
142 | func (c *BlockCache) BatchGet(fid, blkStartIdx, blkNum uint64) ([][]byte, error) {
143 | 	if c.maps == nil {
144 | 		return nil, ErrBlockCacheMiss
145 | 	}
146 | 
147 | 	blks := make([][]byte, blkNum)
148 | 	for i := uint64(0); i < blkNum; i++ {
149 | 		key, err := c.key(fid, blkStartIdx+i)
150 | 		if err != nil {
151 | 			return nil, err
152 | 		}
153 | 
154 | 		blkPtr, err := c.maps.Get(&key)
155 | 		if err != nil {
156 | 			return nil, err
157 | 		}
158 | 
159 | 		blks[i] = *blkPtr
160 | 	}
161 | 
162 | 	return blks, nil
163 | }
164 | 
165 | func (c *BlockCache) Put(fid, blkIdx, length uint64, blk []byte) error {
166 | 	if c.maps == nil {
167 | 		c.blkPool.Put(&blk)
168 | 		return nil
169 | 	}
170 | 
171 | 	// the actual length is less than len(blk), and we should not put it into cache
172 | 	if int(length) != len(blk) {
173 | 		c.blkPool.Put(&blk)
174 | 		return nil
175 | 	}
176 | 
177 | 	key, err := c.key(fid, blkIdx)
178 | 	if err != nil {
179 | 		c.blkPool.Put(&blk)
180 | 		return err
181 | 	}
182 | 
183 | 	oldBlkPtr, err := c.maps.Set(&key, &blk)
184 | 	if err != nil {
185 | 		c.blkPool.Put(&blk)
186 | 		return err
187 | 	}
188 | 
189 | 	// maybe no eviction
190 | 	if oldBlkPtr != nil {
191 | 		c.blkPool.Put(oldBlkPtr)
192 | 	}
193 | 
194 | 	return nil
195 | }
196 | 
197 | func (c *BlockCache) BatchPut(fid, blkIdx, length uint64, blks [][]byte) error {
198 | 	if c.maps == nil {
199 | 		for idx := range blks {
200 | 			c.blkPool.Put(&blks[idx])
201 | 		}
202 | 		return nil
203 | 	}
204 | 
205 | 	// blks must include at least one elements
206 | 	blkNum := uint64(len(blks))
207 | 	if length != blkNum*uint64(len(blks[0])) {
208 | 		blkNum--
209 | 		c.blkPool.Put(&blks[len(blks)-1])
210 | 	}
211 | 
212 | 	for i := uint64(0); i < blkNum; i++ {
213 | 		key, err := c.key(fid, blkIdx+i)
214 | 		if err != nil {
215 | 			c.blkPool.Put(&blks[i])
216 | 			continue
217 | 		}
218 | 
219 | 		oldBlkPtr, err := c.maps.Set(&key, &blks[i])
220 | 		if err != nil {
221 | 			c.blkPool.Put(&blks[i])
222 | 			continue
223 | 		}
224 | 
225 | 		if oldBlkPtr != nil {
226 | 			c.blkPool.Put(oldBlkPtr)
227 | 		}
228 | 	}
229 | 
230 | 	return nil
231 | }
232 | 
233 | func (c *BlockCache) NewBlock() []byte {
234 | 	blkPtr := c.blkPool.Get().(*[]byte)
235 | 	return *blkPtr
236 | }
237 | 


--------------------------------------------------------------------------------
/manifest_test.go:
--------------------------------------------------------------------------------
  1 | package bitcask
  2 | 
  3 | import (
  4 | 	"os"
  5 | 	"testing"
  6 | 
  7 | 	"github.com/stretchr/testify/assert"
  8 | )
  9 | 
 10 | func createTmpdir(t *testing.T) (dir string, closer func()) {
 11 | 	dir = t.TempDir()
 12 | 	closer = func() {
 13 | 		os.RemoveAll(dir)
 14 | 	}
 15 | 	return
 16 | }
 17 | 
 18 | func TestManifest_NewManifest(t *testing.T) {
 19 | 	dir, closer := createTmpdir(t)
 20 | 	defer closer()
 21 | 
 22 | 	manifest, err := NewManifest(dir)
 23 | 	assert.Nil(t, err)
 24 | 
 25 | 	defer manifest.Close()
 26 | 
 27 | 	assert.Equal(t, manifest.fid, uint64(1))
 28 | 	assert.Equal(t, manifest.nextFid, uint64(3))
 29 | 	assert.Equal(t, manifest.NextFid(), uint64(3))
 30 | 
 31 | 	active := manifest.ActiveWal()
 32 | 	assert.NotNil(t, active)
 33 | 	assert.Equal(t, active.Fid(), uint64(2))
 34 | 
 35 | 	assert.True(t, manifest.FileSize() > 0)
 36 | }
 37 | 
 38 | func TestManifest_LoadManifest(t *testing.T) {
 39 | 	dir, closer := createTmpdir(t)
 40 | 	defer closer()
 41 | 
 42 | 	manifest, err := NewManifest(dir)
 43 | 	assert.Nil(t, err)
 44 | 
 45 | 	manifest.Close()
 46 | 
 47 | 	// load the previous manifest
 48 | 	manifest, err = LoadManifest(dir)
 49 | 	assert.Nil(t, err)
 50 | 
 51 | 	defer manifest.Close()
 52 | 
 53 | 	assert.Equal(t, manifest.fid, uint64(1))
 54 | 	assert.Equal(t, manifest.nextFid, uint64(3))
 55 | 
 56 | 	active := manifest.ActiveWal()
 57 | 	assert.NotNil(t, active)
 58 | 	assert.Equal(t, active.Fid(), uint64(2))
 59 | 
 60 | 	assert.True(t, manifest.FileSize() > 0)
 61 | }
 62 | 
 63 | func TestManifest_RotateWal(t *testing.T) {
 64 | 	dir, closer := createTmpdir(t)
 65 | 	defer closer()
 66 | 
 67 | 	manifest, err := NewManifest(dir)
 68 | 	assert.Nil(t, err)
 69 | 
 70 | 	defer manifest.Close()
 71 | 
 72 | 	old, err := manifest.RotateWal()
 73 | 	assert.Nil(t, err)
 74 | 	assert.Equal(t, old.Fid(), uint64(2))
 75 | 	assert.Equal(t, old.refs.Load(), int64(1))
 76 | 
 77 | 	active := manifest.ActiveWal()
 78 | 	assert.NotNil(t, active)
 79 | 	assert.Equal(t, active.refs.Load(), int64(1))
 80 | 
 81 | 	assert.Equal(t, manifest.nextFid, uint64(4))
 82 | }
 83 | 
 84 | func TestManifest_RotateManifest(t *testing.T) {
 85 | 	dir, closer := createTmpdir(t)
 86 | 	defer closer()
 87 | 
 88 | 	manifest, err := NewManifest(dir)
 89 | 	assert.Nil(t, err)
 90 | 
 91 | 	defer manifest.Close()
 92 | 
 93 | 	assert.Equal(t, manifest.fid, uint64(1))
 94 | 
 95 | 	err = manifest.RotateManifest()
 96 | 	assert.Nil(t, err)
 97 | 
 98 | 	assert.Equal(t, manifest.fid, uint64(3))
 99 | 	assert.Equal(t, manifest.nextFid, uint64(4))
100 | }
101 | 
102 | func TestManifest_Apply(t *testing.T) {
103 | 	dir, closer := createTmpdir(t)
104 | 	defer closer()
105 | 
106 | 	manifest, err := NewManifest(dir)
107 | 	assert.Nil(t, err)
108 | 
109 | 	active := manifest.ActiveWal()
110 | 	assert.Equal(t, active.refs.Load(), int64(1))
111 | 	assert.Equal(t, active.Fid(), uint64(2))
112 | 
113 | 	defer manifest.Close()
114 | 
115 | 	wal3, err := NewWal(WalPath(dir, 3), 3, -1)
116 | 	assert.Nil(t, err)
117 | 
118 | 	wal4, err := NewWal(WalPath(dir, 4), 4, -1)
119 | 	assert.Nil(t, err)
120 | 
121 | 	wal5, err := NewWal(WalPath(dir, 5), 5, -1)
122 | 	assert.Nil(t, err)
123 | 
124 | 	// apply
125 | 	edit1 := &ManifestEdit{
126 | 		addFiles: []LogFile{
127 | 			{fid: 3, wal: wal3},
128 | 			{fid: 4, wal: wal4},
129 | 			{fid: 5, wal: wal5},
130 | 		},
131 | 		hasNextFid: true,
132 | 		nextFid:    6,
133 | 	}
134 | 
135 | 	assert.Nil(t, manifest.Apply(edit1))
136 | 	assert.Equal(t, wal3.refs.Load(), int64(2))
137 | 	assert.Equal(t, wal4.refs.Load(), int64(2))
138 | 	assert.Equal(t, wal5.refs.Load(), int64(2))
139 | 
140 | 	edit2 := &ManifestEdit{
141 | 		deleteFiles: []LogFile{{fid: 4}, {fid: 5}},
142 | 	}
143 | 
144 | 	assert.Nil(t, manifest.Apply(edit2))
145 | 	assert.Equal(t, wal3.refs.Load(), int64(2))
146 | 	assert.Equal(t, wal4.refs.Load(), int64(1))
147 | 	assert.Equal(t, wal5.refs.Load(), int64(1))
148 | 
149 | 	assert.Nil(t, manifest.ToWal(4))
150 | 	assert.Nil(t, manifest.ToWal(5))
151 | 	assert.NotNil(t, manifest.ToWal(3))
152 | 	assert.NotNil(t, manifest.ToWal(2))
153 | 	assert.Equal(t, manifest.nextFid, uint64(6))
154 | }
155 | 
156 | func TestManifest_LogAndApply(t *testing.T) {
157 | 	dir, closer := createTmpdir(t)
158 | 	defer closer()
159 | 
160 | 	manifest, err := NewManifest(dir)
161 | 	assert.Nil(t, err)
162 | 
163 | 	wal3, err := NewWal(WalPath(dir, 3), 3, -1)
164 | 	assert.Nil(t, err)
165 | 
166 | 	wal4, err := NewWal(WalPath(dir, 4), 4, -1)
167 | 	assert.Nil(t, err)
168 | 
169 | 	wal5, err := NewWal(WalPath(dir, 5), 5, -1)
170 | 	assert.Nil(t, err)
171 | 
172 | 	// apply
173 | 	edit1 := &ManifestEdit{
174 | 		addFiles: []LogFile{
175 | 			{fid: 3, wal: wal3},
176 | 			{fid: 4, wal: wal4},
177 | 			{fid: 5, wal: wal5},
178 | 		},
179 | 		hasNextFid: true,
180 | 		nextFid:    6,
181 | 	}
182 | 
183 | 	assert.Nil(t, manifest.LogAndApply(edit1))
184 | 	assert.Equal(t, wal3.refs.Load(), int64(2))
185 | 	assert.Equal(t, wal4.refs.Load(), int64(2))
186 | 	assert.Equal(t, wal5.refs.Load(), int64(2))
187 | 
188 | 	edit2 := &ManifestEdit{
189 | 		deleteFiles: []LogFile{{fid: 4}, {fid: 5}},
190 | 	}
191 | 
192 | 	assert.Nil(t, manifest.LogAndApply(edit2))
193 | 	assert.Equal(t, wal3.refs.Load(), int64(2))
194 | 	assert.Equal(t, wal4.refs.Load(), int64(1))
195 | 	assert.Equal(t, wal5.refs.Load(), int64(1))
196 | 
197 | 	// re-open manifest
198 | 	manifest.Close() // all referenced wals will be closed
199 | 
200 | 	manifest, err = LoadManifest(dir)
201 | 	assert.Nil(t, err)
202 | 
203 | 	// check
204 | 	assert.Nil(t, manifest.ToWal(4))
205 | 	assert.Nil(t, manifest.ToWal(5))
206 | 
207 | 	wal2 := manifest.ToWal(2)
208 | 	assert.NotNil(t, wal2)
209 | 	assert.Equal(t, wal2.refs.Load(), int64(1))
210 | 
211 | 	wal3 = manifest.ToWal(3)
212 | 	assert.NotNil(t, wal3)
213 | 	assert.Equal(t, wal3.refs.Load(), int64(1))
214 | 
215 | 	assert.Equal(t, manifest.nextFid, uint64(6))
216 | }
217 | 


--------------------------------------------------------------------------------
/bench/bench_test.go:
--------------------------------------------------------------------------------
  1 | package bench
  2 | 
  3 | import (
  4 | 	"crypto/sha1"
  5 | 	"os"
  6 | 	"strconv"
  7 | 	"testing"
  8 | 
  9 | 	"github.com/stretchr/testify/assert"
 10 | 	"github.com/wenzhang-dev/bitcaskDB"
 11 | )
 12 | 
 13 | var (
 14 | 	dir string
 15 | 	db  *bitcask.DBImpl
 16 | )
 17 | 
 18 | const BatchSize = 50
 19 | 
 20 | func genTestKey(i int) []byte {
 21 | 	return []byte(strconv.Itoa(i))
 22 | }
 23 | 
 24 | var (
 25 | 	bin4KB = bitcask.GenNKBytes(4)
 26 | 	ns     = sha1.Sum([]byte("benchmark"))
 27 | )
 28 | 
 29 | func newDB(b *testing.B) {
 30 | 	dir = "./bitcaskDB"
 31 | 	_ = os.RemoveAll(dir)
 32 | 	_ = os.MkdirAll(dir, os.ModePerm)
 33 | 
 34 | 	opts := &bitcask.Options{
 35 | 		Dir:                            dir,
 36 | 		WalMaxSize:                     1024 * 1024 * 1024, // 1GB
 37 | 		ManifestMaxSize:                10 * 1024 * 1024,   // 10MB
 38 | 		IndexCapacity:                  10000000,           // 10 million
 39 | 		IndexLimited:                   8000000,
 40 | 		IndexEvictionPoolCapacity:      64,
 41 | 		IndexSampleKeys:                5,
 42 | 		BlockCacheCapacity:             8192, // 256MB
 43 | 		BlockCacheLimited:              8192,
 44 | 		BlockCacheSampleKeys:           5,
 45 | 		BlockCacheEvictionPoolCapacity: 32,
 46 | 		BlockReaderConcurrent:          64,
 47 | 		CompactionPicker:               nil, // default picker
 48 | 		CompactionFilter:               nil, // default filter
 49 | 		NsSize:                         bitcask.DefaultNsSize,
 50 | 		EtagSize:                       bitcask.DefaultEtagSize,
 51 | 		DisableCompaction:              true,
 52 | 	}
 53 | 
 54 | 	var err error
 55 | 	db, err = bitcask.NewDB(opts)
 56 | 	assert.Nil(b, err)
 57 | }
 58 | 
 59 | func BenchmarkPutGet(b *testing.B) {
 60 | 	b.Run("put_4K", benchmarkPut)
 61 | 	b.Run("batchPut_4K", benchmarkBatchPut)
 62 | 	b.Run("get_4K", benchmarkGet)
 63 | 	b.Run("concurrentGet_4K", benchmarkConcurrentGet)
 64 | 	b.Run("concurrentGetV2_4K", benchmarkConcurrentGetV2)
 65 | 	b.Run("concurrentPut_4K", benchmarkConcurrentPut)
 66 | 	b.Run("concurrentBatchPut_4K", benchmarkConcurrentBatchPut)
 67 | }
 68 | 
 69 | func benchmarkPut(b *testing.B) {
 70 | 	newDB(b)
 71 | 	defer db.Close()
 72 | 
 73 | 	meta := bitcask.NewMeta(nil)
 74 | 	opts := &bitcask.WriteOptions{}
 75 | 
 76 | 	b.ResetTimer()
 77 | 	b.ReportAllocs()
 78 | 
 79 | 	for i := 0; i < b.N; i++ {
 80 | 		err := db.Put(ns[:], genTestKey(i), bin4KB, meta, opts)
 81 | 		assert.Nil(b, err)
 82 | 	}
 83 | }
 84 | 
 85 | func benchmarkConcurrentPut(b *testing.B) {
 86 | 	newDB(b)
 87 | 	defer db.Close()
 88 | 
 89 | 	meta := bitcask.NewMeta(nil)
 90 | 	opts := &bitcask.WriteOptions{}
 91 | 
 92 | 	b.ResetTimer()
 93 | 	b.ReportAllocs()
 94 | 
 95 | 	b.RunParallel(func(pb *testing.PB) {
 96 | 		iteration := 0
 97 | 		for pb.Next() {
 98 | 			err := db.Put(ns[:], genTestKey(iteration), bin4KB, meta, opts)
 99 | 			assert.Nil(b, err)
100 | 
101 | 			iteration++
102 | 		}
103 | 	})
104 | }
105 | 
106 | func benchmarkBatchPut(b *testing.B) {
107 | 	newDB(b)
108 | 	defer db.Close()
109 | 
110 | 	meta := bitcask.NewMeta(nil)
111 | 	opts := &bitcask.WriteOptions{}
112 | 
113 | 	b.ResetTimer()
114 | 	b.ReportAllocs()
115 | 
116 | 	batch := bitcask.NewBatch()
117 | 	for i := 0; i < b.N; i++ {
118 | 		batch.Put(ns[:], genTestKey(i), bin4KB, meta)
119 | 
120 | 		if i%BatchSize == 0 {
121 | 			err := db.Write(batch, opts)
122 | 			assert.Nil(b, err)
123 | 			batch.Clear()
124 | 		}
125 | 	}
126 | 
127 | 	if batch.Size() != 0 {
128 | 		err := db.Write(batch, opts)
129 | 		assert.Nil(b, err)
130 | 	}
131 | }
132 | 
133 | func getPrepare(b *testing.B) {
134 | 	meta := bitcask.NewMeta(nil)
135 | 	wOpts := &bitcask.WriteOptions{}
136 | 
137 | 	batch := bitcask.NewBatch()
138 | 	for i := 0; i < 200001; i++ {
139 | 		batch.Put(ns[:], genTestKey(i), bin4KB, meta)
140 | 
141 | 		if i%BatchSize == 0 {
142 | 			err := db.Write(batch, wOpts)
143 | 			assert.Nil(b, err)
144 | 			batch.Clear()
145 | 		}
146 | 	}
147 | }
148 | 
149 | func benchmarkGet(b *testing.B) {
150 | 	newDB(b)
151 | 	defer db.Close()
152 | 
153 | 	getPrepare(b)
154 | 
155 | 	rOpts := &bitcask.ReadOptions{}
156 | 
157 | 	b.ResetTimer()
158 | 	b.ReportAllocs()
159 | 
160 | 	for i := 0; i < b.N; i++ {
161 | 		_, _, err := db.Get(ns[:], genTestKey(i%100000), rOpts)
162 | 		assert.Nilf(b, err, "i: %v, err: %v", i, err)
163 | 	}
164 | }
165 | 
166 | func benchmarkConcurrentGet(b *testing.B) {
167 | 	newDB(b)
168 | 	defer db.Close()
169 | 
170 | 	getPrepare(b)
171 | 
172 | 	rOpts := &bitcask.ReadOptions{}
173 | 
174 | 	b.ResetTimer()
175 | 	b.ReportAllocs()
176 | 
177 | 	b.RunParallel(func(pb *testing.PB) {
178 | 		iteration := 0
179 | 		for pb.Next() {
180 | 			_, _, err := db.Get(ns[:], genTestKey(iteration%200000), rOpts)
181 | 			assert.Nil(b, err)
182 | 
183 | 			iteration++
184 | 		}
185 | 	})
186 | }
187 | 
188 | func benchmarkConcurrentGetV2(b *testing.B) {
189 | 	newDB(b)
190 | 	defer db.Close()
191 | 
192 | 	getPrepare(b)
193 | 
194 | 	rOpts := &bitcask.ReadOptions{}
195 | 
196 | 	b.ResetTimer()
197 | 	b.ReportAllocs()
198 | 
199 | 	b.RunParallel(func(pb *testing.PB) {
200 | 		iteration := 0
201 | 		for pb.Next() {
202 | 			_, _, err := db.GetV2(ns[:], genTestKey(iteration%200000), rOpts)
203 | 			assert.Nil(b, err)
204 | 
205 | 			iteration++
206 | 		}
207 | 	})
208 | }
209 | 
210 | func benchmarkConcurrentBatchPut(b *testing.B) {
211 | 	newDB(b)
212 | 	defer db.Close()
213 | 
214 | 	meta := bitcask.NewMeta(nil)
215 | 	opts := &bitcask.WriteOptions{}
216 | 
217 | 	b.ResetTimer()
218 | 	b.ReportAllocs()
219 | 
220 | 	b.RunParallel(func(pb *testing.PB) {
221 | 		iteration := 0
222 | 		batch := bitcask.NewBatch()
223 | 		for pb.Next() {
224 | 			batch.Put(ns[:], genTestKey(iteration), bin4KB, meta)
225 | 
226 | 			if iteration%BatchSize == 0 {
227 | 				err := db.Write(batch, opts)
228 | 				assert.Nil(b, err)
229 | 				batch.Clear()
230 | 			}
231 | 
232 | 			iteration++
233 | 		}
234 | 	})
235 | }
236 | 


--------------------------------------------------------------------------------
/deque_test.go:
--------------------------------------------------------------------------------
  1 | package bitcask
  2 | 
  3 | import (
  4 | 	"testing"
  5 | 
  6 | 	"github.com/stretchr/testify/assert"
  7 | )
  8 | 
  9 | func TestDequeBasicOperations(t *testing.T) {
 10 | 	d := NewDeque[int]()
 11 | 
 12 | 	// Initially, deque should be empty
 13 | 	if !d.Empty() {
 14 | 		t.Errorf("Deque should be empty initially")
 15 | 	}
 16 | 
 17 | 	// Insert elements
 18 | 	d.PushBack(10)
 19 | 	d.PushBack(20)
 20 | 	d.PushFront(5)
 21 | 
 22 | 	if d.Len() != 3 {
 23 | 		t.Errorf("Expected length 3, got %d", d.Len())
 24 | 	}
 25 | 
 26 | 	// Check Front and Back
 27 | 	front, _ := d.Front()
 28 | 	if *front != 5 {
 29 | 		t.Errorf("Expected front to be 5, got %d", *front)
 30 | 	}
 31 | 
 32 | 	back, _ := d.Back()
 33 | 	if *back != 20 {
 34 | 		t.Errorf("Expected back to be 20, got %d", *back)
 35 | 	}
 36 | 
 37 | 	// Test At()
 38 | 	val, _ := d.At(1)
 39 | 	if *val != 10 {
 40 | 		t.Errorf("Expected At(1) to be 10, got %d", *val)
 41 | 	}
 42 | 
 43 | 	// Test PopFront and PopBack
 44 | 	_ = d.PopFront() // Remove 5
 45 | 	front, _ = d.Front()
 46 | 	if *front != 10 {
 47 | 		t.Errorf("Expected front to be 10, got %d", *front)
 48 | 	}
 49 | 
 50 | 	_ = d.PopBack() // Remove 20
 51 | 	back, _ = d.Back()
 52 | 	if *back != 10 {
 53 | 		t.Errorf("Expected back to be 10, got %d", *back)
 54 | 	}
 55 | 
 56 | 	_ = d.PopBack() // Remove 10, should become empty
 57 | 	if !d.Empty() {
 58 | 		t.Errorf("Deque should be empty after popping all elements")
 59 | 	}
 60 | }
 61 | 
 62 | func TestDequeBounds(t *testing.T) {
 63 | 	d := NewDeque[int]()
 64 | 
 65 | 	// Calling Front/Back on an empty deque
 66 | 	if _, err := d.Front(); err != ErrDequeEmpty {
 67 | 		t.Errorf("Expected DequeEmptyErr for Front() on empty deque")
 68 | 	}
 69 | 	if _, err := d.Back(); err != ErrDequeEmpty {
 70 | 		t.Errorf("Expected DequeEmptyErr for Back() on empty deque")
 71 | 	}
 72 | 
 73 | 	// PopFront / PopBack on an empty deque
 74 | 	if err := d.PopFront(); err != ErrDequeEmpty {
 75 | 		t.Errorf("Expected DequeEmptyErr for PopFront() on empty deque")
 76 | 	}
 77 | 	if err := d.PopBack(); err != ErrDequeEmpty {
 78 | 		t.Errorf("Expected DequeEmptyErr for PopBack() on empty deque")
 79 | 	}
 80 | 
 81 | 	// Accessing out-of-range index
 82 | 	d.PushBack(1)
 83 | 	d.PushBack(2)
 84 | 	if _, err := d.At(3); err != ErrDequeOutOfRange {
 85 | 		t.Errorf("Expected DequeOutOfRangeErr for At(3)")
 86 | 	}
 87 | }
 88 | 
 89 | func TestDequeAutoGrow(t *testing.T) {
 90 | 	d := NewDeque[int]()
 91 | 
 92 | 	// PushBack elements until deque expands
 93 | 	for i := 0; i < 2000; i++ {
 94 | 		d.PushBack(i)
 95 | 	}
 96 | 
 97 | 	if d.Len() != 2000 {
 98 | 		t.Errorf("Expected length 2000, got %d", d.Len())
 99 | 	}
100 | 
101 | 	// Ensure first 10 and last 10 elements are correct
102 | 	for i := 0; i < 10; i++ {
103 | 		val, _ := d.At(i)
104 | 		if *val != i {
105 | 			t.Errorf("At(%d) expected %d, got %d", i, i, *val)
106 | 		}
107 | 	}
108 | 
109 | 	for i := 1990; i < 2000; i++ {
110 | 		val, _ := d.At(i)
111 | 		if *val != i {
112 | 			t.Errorf("At(%d) expected %d, got %d", i, i, *val)
113 | 		}
114 | 	}
115 | 
116 | 	// Reverse PopBack, deque should become empty
117 | 	for i := 0; i < 2000; i++ {
118 | 		_ = d.PopBack()
119 | 	}
120 | 
121 | 	if !d.Empty() {
122 | 		t.Errorf("Expected empty deque after popping all elements")
123 | 	}
124 | }
125 | 
126 | func TestDequePushFrontPopBack(t *testing.T) {
127 | 	d := NewDeque[int]()
128 | 
129 | 	// Insert into the front
130 | 	for i := 0; i < 100; i++ {
131 | 		d.PushFront(i)
132 | 	}
133 | 
134 | 	if d.Len() != 100 {
135 | 		t.Errorf("Expected length 100, got %d", d.Len())
136 | 	}
137 | 
138 | 	// Remove elements from back, should be 0,1,2,...99
139 | 	for i := 0; i < 100; i++ {
140 | 		val, _ := d.Back()
141 | 		if *val != i {
142 | 			t.Errorf("Expected back %d, got %d", i, *val)
143 | 		}
144 | 		_ = d.PopBack()
145 | 	}
146 | 
147 | 	if !d.Empty() {
148 | 		t.Errorf("Expected empty deque after popping all elements")
149 | 	}
150 | }
151 | 
152 | func TestDequeLargeData(t *testing.T) {
153 | 	d := NewDeque[int]()
154 | 	num := 1_000_000
155 | 
156 | 	// Performance test: Insert 1M elements
157 | 	for i := 0; i < num; i++ {
158 | 		d.PushBack(i)
159 | 	}
160 | 
161 | 	if d.Len() != num {
162 | 		t.Errorf("Expected length %d, got %d", num, d.Len())
163 | 	}
164 | 
165 | 	// Check first and last 10 elements
166 | 	for i := 0; i < 10; i++ {
167 | 		val, _ := d.At(i)
168 | 		if *val != i {
169 | 			t.Errorf("At(%d) expected %d, got %d", i, i, *val)
170 | 		}
171 | 	}
172 | 
173 | 	for i := num - 10; i < num; i++ {
174 | 		val, _ := d.At(i)
175 | 		if *val != i {
176 | 			t.Errorf("At(%d) expected %d, got %d", i, i, *val)
177 | 		}
178 | 	}
179 | 
180 | 	// Remove all elements
181 | 	for i := 0; i < num; i++ {
182 | 		_ = d.PopFront()
183 | 	}
184 | 
185 | 	if !d.Empty() {
186 | 		t.Errorf("Expected empty deque after popping all elements")
187 | 	}
188 | }
189 | 
190 | func TestDequeCornerCase1(t *testing.T) {
191 | 	d := NewDeque[int]()
192 | 
193 | 	for i := 0; i < DequeChunkSize; i++ {
194 | 		d.PushBack(i)
195 | 	}
196 | 
197 | 	d.PushBack(100)
198 | 
199 | 	for i := 0; i < DequeChunkSize; i++ {
200 | 		err := d.PopFront()
201 | 		assert.Nil(t, err)
202 | 	}
203 | 
204 | 	assert.Equal(t, d.Len(), 1)
205 | 	num, err := d.Front()
206 | 	assert.Nil(t, err)
207 | 	assert.Equal(t, *num, 100)
208 | 
209 | 	err = d.PopFront()
210 | 	assert.Nil(t, err)
211 | 	assert.True(t, d.Empty())
212 | }
213 | 
214 | func TestDequeCornerCase2(t *testing.T) {
215 | 	d := NewDeque[int]()
216 | 
217 | 	for i := 0; i < DequeChunkSize*(DequeFrontReserveSize+1); i++ {
218 | 		d.PushFront(i)
219 | 	}
220 | 
221 | 	for i := 0; i < DequeChunkSize; i++ {
222 | 		d.PushBack(i)
223 | 	}
224 | 
225 | 	assert.Equal(t, d.Len(), int(DequeChunkSize*(DequeFrontReserveSize+2)))
226 | 
227 | 	for i := 0; i < DequeChunkSize; i++ {
228 | 		err := d.PopFront()
229 | 		assert.Nil(t, err)
230 | 	}
231 | 
232 | 	for i := 0; i < DequeChunkSize*(DequeFrontReserveSize+1); i++ {
233 | 		err := d.PopBack()
234 | 		assert.Nil(t, err)
235 | 	}
236 | 
237 | 	assert.True(t, d.Empty())
238 | }
239 | 
240 | func TestDequeMemoryInGrow(t *testing.T) {
241 | 	d := NewDeque[int]()
242 | 
243 | 	d.PushFront(1000)
244 | 	addr1, err := d.Front()
245 | 	assert.Nil(t, err)
246 | 
247 | 	for i := 0; i < DequeChunkSize*(DequeFrontReserveSize+1); i++ {
248 | 		d.PushFront(i)
249 | 	}
250 | 
251 | 	addr2, err := d.Back()
252 | 	assert.Nil(t, err)
253 | 	assert.Equal(t, addr1, addr2)
254 | 
255 | 	for i := 0; i < DequeChunkSize*(DequeFrontReserveSize+1); i++ {
256 | 		err = d.PopFront()
257 | 		assert.Nil(t, err)
258 | 	}
259 | 
260 | 	addr3, err := d.Front()
261 | 	assert.Nil(t, err)
262 | 	assert.Equal(t, addr2, addr3)
263 | 	assert.Equal(t, *addr3, 1000)
264 | }
265 | 


--------------------------------------------------------------------------------
/wal_test.go:
--------------------------------------------------------------------------------
  1 | package bitcask
  2 | 
  3 | import (
  4 | 	"os"
  5 | 	"testing"
  6 | 
  7 | 	"github.com/stretchr/testify/assert"
  8 | )
  9 | 
 10 | func setupWal(name string, t *testing.T) *Wal {
 11 | 	wal, err := NewWal(name, 0, -1)
 12 | 	assert.Nil(t, err)
 13 | 	return wal
 14 | }
 15 | 
 16 | // Test basic WAL operations: writing and reading records
 17 | func TestWal_BasicOperations(t *testing.T) {
 18 | 	wal := setupWal("test_wal_basic.wal", t)
 19 | 	defer wal.Unref()
 20 | 
 21 | 	data := []byte("hello world")
 22 | 	offset, err := wal.WriteRecord(data)
 23 | 	if err != nil {
 24 | 		t.Fatalf("Failed to write record: %v", err)
 25 | 	}
 26 | 	assert.Nil(t, wal.Flush())
 27 | 
 28 | 	readData, err := wal.ReadRecord(offset, uint64(len(data)), true)
 29 | 	if err != nil {
 30 | 		t.Fatalf("Failed to read record: %v", err)
 31 | 	}
 32 | 
 33 | 	if string(readData) != string(data) {
 34 | 		t.Fatalf("Data mismatch: expected %s, got %s", string(data), string(readData))
 35 | 	}
 36 | 
 37 | 	readData, err = wal.ReadRecord(offset, uint64(len(data)), false)
 38 | 	assert.Nil(t, err)
 39 | 	assert.Equal(t, data, readData)
 40 | }
 41 | 
 42 | // Test WAL behavior with multiple writes and reads
 43 | func TestWal_MultipleRecords(t *testing.T) {
 44 | 	wal := setupWal("test_wal_multiple.wal", t)
 45 | 	defer wal.Unref()
 46 | 
 47 | 	records := [][]byte{
 48 | 		[]byte("first record"),
 49 | 		[]byte("second record"),
 50 | 		[]byte("third record"),
 51 | 	}
 52 | 
 53 | 	var offsets []uint64
 54 | 	for _, record := range records {
 55 | 		offset, err := wal.WriteRecord(record)
 56 | 		if err != nil {
 57 | 			t.Fatalf("Failed to write record: %v", err)
 58 | 		}
 59 | 		offsets = append(offsets, offset)
 60 | 	}
 61 | 	assert.Nil(t, wal.Flush())
 62 | 
 63 | 	for i, offset := range offsets {
 64 | 		readData, err := wal.ReadRecord(offset, uint64(len(records[i])), true)
 65 | 		if err != nil {
 66 | 			t.Fatalf("Failed to read record at offset %d: %v", offset, err)
 67 | 		}
 68 | 		assert.Equal(t, readData, records[i])
 69 | 	}
 70 | }
 71 | 
 72 | // Test WAL record spanning multiple blocks
 73 | func TestWal_LargeRecord(t *testing.T) {
 74 | 	wal := setupWal("test_wal_large.wal", t)
 75 | 	defer wal.Unref()
 76 | 
 77 | 	largeData := make([]byte, BlockSize*2) // A record spanning multiple blocks
 78 | 	for i := range largeData {
 79 | 		largeData[i] = byte(i % 256)
 80 | 	}
 81 | 
 82 | 	offset, err := wal.WriteRecord(largeData)
 83 | 	if err != nil {
 84 | 		t.Fatalf("Failed to write large record: %v", err)
 85 | 	}
 86 | 	assert.Nil(t, wal.Flush())
 87 | 
 88 | 	readData, err := wal.ReadRecord(offset, uint64(len(largeData)), true)
 89 | 	if err != nil {
 90 | 		t.Fatalf("Failed to read large record: %v", err)
 91 | 	}
 92 | 
 93 | 	assert.Equal(t, readData, largeData)
 94 | }
 95 | 
 96 | func TestWal_LargeRecord2(t *testing.T) {
 97 | 	wal := setupWal("test_wal_large2.wal", t)
 98 | 	defer wal.Unref()
 99 | 
100 | 	data := GenNKBytes(5)
101 | 	offsets := make([]uint64, 1000)
102 | 	for i := 0; i < 1000; i++ {
103 | 		off, err := wal.WriteRecord(data)
104 | 		assert.Nil(t, err)
105 | 		assert.Nil(t, wal.Flush())
106 | 		offsets[i] = off
107 | 	}
108 | 
109 | 	// check
110 | 	for i := 0; i < 1000; i++ {
111 | 		readData, err := wal.ReadRecord(offsets[i], uint64(len(data)), true)
112 | 		assert.Nil(t, err)
113 | 		assert.Equal(t, readData, data)
114 | 	}
115 | }
116 | 
117 | // Test handling of corrupted WAL records
118 | func TestWal_CorruptedRead(t *testing.T) {
119 | 	filename := "test_wal_corrupt.wal"
120 | 	wal := setupWal(filename, t)
121 | 	defer os.Remove(wal.Path())
122 | 
123 | 	data := []byte("valid record")
124 | 	offset, err := wal.WriteRecord(data)
125 | 	if err != nil {
126 | 		t.Fatalf("Failed to write record: %v", err)
127 | 	}
128 | 	assert.Nil(t, wal.Flush())
129 | 
130 | 	// Close WAL before corrupting the file
131 | 	wal.Close()
132 | 
133 | 	// Manually corrupt the file
134 | 	file, err := os.OpenFile(filename, os.O_RDWR, 0o644)
135 | 	if err != nil {
136 | 		t.Fatalf("Failed to open WAL file for corruption: %v", err)
137 | 	}
138 | 
139 | 	_, err = file.WriteAt([]byte{0xFF, 0xFF}, int64(offset+2)) // Corrupt part of the record
140 | 	if err != nil {
141 | 		t.Fatalf("Failed to corrupt WAL file: %v", err)
142 | 	}
143 | 
144 | 	// Reopen WAL and try to read
145 | 	wal, err = LoadWal(filename, 0)
146 | 	if err != nil {
147 | 		t.Fatalf("Failed to reopen WAL: %v", err)
148 | 	}
149 | 	defer wal.Close()
150 | 
151 | 	_, err = wal.ReadRecord(offset, uint64(len(data)), true)
152 | 	if err == nil {
153 | 		t.Fatalf("Expected error when reading corrupted record, but got none")
154 | 	}
155 | }
156 | 
157 | // Test padding when block space is insufficient
158 | func TestWal_BlockPadding(t *testing.T) {
159 | 	wal := setupWal("test_wal_padding.wal", t)
160 | 	defer wal.Unref()
161 | 
162 | 	// Write a record that nearly fills a block
163 | 	data := make([]byte, BlockSize-RecordHeaderSize)
164 | 	offset, err := wal.WriteRecord(data)
165 | 	if err != nil {
166 | 		t.Fatalf("Failed to write record: %v", err)
167 | 	}
168 | 	assert.Nil(t, wal.Flush())
169 | 
170 | 	// Write another record that should go into the next block due to padding
171 | 	secondData := []byte("new block record")
172 | 	secondOffset, err := wal.WriteRecord(secondData)
173 | 	if err != nil {
174 | 		t.Fatalf("Failed to write second record: %v", err)
175 | 	}
176 | 	assert.Nil(t, wal.Flush())
177 | 
178 | 	// Ensure both records can be read correctly
179 | 	readData, err := wal.ReadRecord(offset, uint64(len(data)), true)
180 | 	if err != nil {
181 | 		t.Fatalf("Failed to read first record: %v", err)
182 | 	}
183 | 	assert.Equal(t, readData, data)
184 | 
185 | 	readData, err = wal.ReadRecord(secondOffset, uint64(len(secondData)), true)
186 | 	if err != nil {
187 | 		t.Fatalf("Failed to read second record: %v", err)
188 | 	}
189 | 	assert.Equal(t, readData, secondData)
190 | }
191 | 
192 | // Test reopening WAL and ensuring persistence
193 | func TestWal_ReopenPersistence(t *testing.T) {
194 | 	filename := "test_wal_persistence.wal"
195 | 	wal := setupWal(filename, t)
196 | 	defer os.Remove(filename)
197 | 
198 | 	data := []byte("persistent data")
199 | 	offset, err := wal.WriteRecord(data)
200 | 	assert.Nil(t, err)
201 | 	assert.Nil(t, wal.Flush())
202 | 
203 | 	// Close and reopen WAL
204 | 	wal.Close()
205 | 	wal, err = LoadWal(filename, 0)
206 | 	assert.Nil(t, err)
207 | 
208 | 	// write one record
209 | 	data1 := []byte("one record")
210 | 	offset1, err := wal.WriteRecord(data1)
211 | 	assert.Nil(t, err)
212 | 	assert.Nil(t, wal.Flush())
213 | 
214 | 	// check
215 | 	readData, err := wal.ReadRecord(offset, uint64(len(data)), true)
216 | 	assert.Nil(t, err)
217 | 	assert.Equal(t, readData, data)
218 | 
219 | 	readData, err = wal.ReadRecord(offset1, uint64(len(data1)), true)
220 | 	assert.Nil(t, err)
221 | 	assert.Equal(t, readData, data1)
222 | 
223 | 	// repeat check
224 | 	wal.Close()
225 | 	wal, err = LoadWal(filename, 0)
226 | 	assert.Nil(t, err)
227 | 
228 | 	defer wal.Close()
229 | 
230 | 	readData, err = wal.ReadRecord(offset, uint64(len(data)), true)
231 | 	assert.Nil(t, err)
232 | 	assert.Equal(t, readData, data)
233 | 
234 | 	readData, err = wal.ReadRecord(offset1, uint64(len(data1)), true)
235 | 	assert.Nil(t, err)
236 | 	assert.Equal(t, readData, data1)
237 | }
238 | 


--------------------------------------------------------------------------------
/record.go:
--------------------------------------------------------------------------------
  1 | package bitcask
  2 | 
  3 | import (
  4 | 	"encoding/binary"
  5 | 	"errors"
  6 | 
  7 | 	"github.com/vmihailenco/msgpack/v5"
  8 | )
  9 | 
 10 | type Record struct {
 11 | 	Ns    []byte
 12 | 	Key   []byte
 13 | 	Value []byte
 14 | 	Meta  *Meta
 15 | 
 16 | 	// mark whether it's a delete operation
 17 | 	// it's fundamentally different from the meta tombstone.
 18 | 	// the tombstone indicates s soft deletion, but in fact, the key still exists in database
 19 | 	// the deleted tag indicates that the key will be removed directly in the database
 20 | 	// this deleted tag would not be serialized
 21 | 	Deleted bool
 22 | }
 23 | 
 24 | // serialization format:
 25 | // | header size | header | key | value | meta |
 26 | //
 27 | // the header including:
 28 | // - header size: 1B
 29 | // - ns: fixed size
 30 | // - flags: 1B
 31 | // - key size: varint32 1~5B
 32 | // - value size: varint32 1~5B
 33 | // - meta size: varint32 1~5B
 34 | // - etag: optional, fixed size
 35 | // - expire: optional, varint32 1~5B
 36 | // - other optional fields if need
 37 | //
 38 | // for small record:
 39 | // if key=16B, value=128B, meta=64B, its header is about 50B, and effective space usage is about 80%
 40 | //
 41 | // for medium record:
 42 | // if key=64B, value=128KB, meta=1KB, it's header is about 50B, and effective space usage is abort 99%
 43 | 
 44 | const (
 45 | 	noEtagFieldBit    = 0
 46 | 	noExpireFieldBit  = 1
 47 | 	tombstoneFieldBit = 2
 48 | )
 49 | 
 50 | func (r *Record) ApproximateSize() int {
 51 | 	// 1B header size + len(ns) + 1B flags + 2B varint * 3 + len(meta) + 2B expire
 52 | 	approximateRecordHeaderSize := 1 + len(r.Ns) + 1 + 2*3 + len(r.Meta.Etag) + 2
 53 | 
 54 | 	return approximateRecordHeaderSize + len(r.Key) + len(r.Value) + r.Meta.AppMetaApproximateSize()
 55 | }
 56 | 
 57 | func (r *Record) Encode(backStore []byte, baseTime uint64) ([]byte, error) {
 58 | 	flag := byte(0)
 59 | 	if len(r.Meta.Etag) == 0 {
 60 | 		flag |= byte(1 << noEtagFieldBit)
 61 | 	}
 62 | 
 63 | 	if r.Meta.IsTombstone() {
 64 | 		flag |= byte(1 << tombstoneFieldBit)
 65 | 	}
 66 | 
 67 | 	expireSize := 0
 68 | 	var expireBytes [binary.MaxVarintLen32]byte
 69 | 	switch {
 70 | 	case r.Meta.Expire == MetaNoExpire:
 71 | 		flag |= byte(1 << noExpireFieldBit)
 72 | 
 73 | 	case r.Meta.Expire < baseTime:
 74 | 		return nil, errors.New("invalid expire")
 75 | 
 76 | 	default: // expire > base time
 77 | 		expireSize = binary.PutUvarint(expireBytes[:], r.Meta.Expire-baseTime)
 78 | 	}
 79 | 
 80 | 	var err error
 81 | 	var metaEncoded []byte
 82 | 	if r.Meta.AppMetaSize != 0 {
 83 | 		if metaEncoded, err = msgpack.Marshal(r.Meta.AppMeta); err != nil {
 84 | 			return nil, err
 85 | 		}
 86 | 	}
 87 | 
 88 | 	// try to encode the varint32 fields
 89 | 	offset := 0
 90 | 	var tmp [3 * binary.MaxVarintLen32]byte
 91 | 	offset += binary.PutUvarint(tmp[offset:], uint64(len(r.Key)))
 92 | 	offset += binary.PutUvarint(tmp[offset:], uint64(len(r.Value)))
 93 | 	offset += binary.PutUvarint(tmp[offset:], uint64(len(metaEncoded)))
 94 | 	tmpSize := offset
 95 | 
 96 | 	// plus 2 bytes: flag and header size
 97 | 	headerSize := offset + expireSize + len(r.Ns) + len(r.Meta.Etag) + 2
 98 | 	totalSize := headerSize + len(r.Key) + len(r.Value) + len(metaEncoded)
 99 | 
100 | 	// prefer use the backing store
101 | 	buf := backStore[0:]
102 | 	if totalSize > cap(backStore) {
103 | 		buf = make([]byte, totalSize)
104 | 	}
105 | 
106 | 	offset = 0
107 | 
108 | 	// header size
109 | 	buf[0] = byte(headerSize)
110 | 	offset++
111 | 
112 | 	// namespace
113 | 	offset += copy(buf[offset:], r.Ns)
114 | 
115 | 	// flag
116 | 	buf[offset] = flag
117 | 	offset++
118 | 
119 | 	// varint
120 | 	offset += copy(buf[offset:], tmp[:tmpSize])
121 | 
122 | 	// optional etag
123 | 	offset += copy(buf[offset:], r.Meta.Etag)
124 | 
125 | 	// optional ttl
126 | 	offset += copy(buf[offset:], expireBytes[:expireSize])
127 | 
128 | 	// key
129 | 	offset += copy(buf[offset:], r.Key)
130 | 
131 | 	// value
132 | 	offset += copy(buf[offset:], r.Value)
133 | 
134 | 	// meta
135 | 	offset += copy(buf[offset:], metaEncoded)
136 | 
137 | 	return buf[:offset], nil
138 | }
139 | 
140 | func RecordFromBytes(data []byte, baseTime uint64) (*Record, error) {
141 | 	nsSize := int(GetOptions().NsSize)
142 | 
143 | 	// 1B header size + len(ns) + 1B flags + 1B variant * 3
144 | 	minRecordHeaderSize := 1 + nsSize + 1 + 1*3
145 | 
146 | 	if len(data) < minRecordHeaderSize {
147 | 		return nil, errors.New("invalid data")
148 | 	}
149 | 
150 | 	offset := 0
151 | 
152 | 	// header size
153 | 	headerSize := int(data[0])
154 | 	offset++
155 | 
156 | 	// namespace
157 | 	ns := data[offset : offset+nsSize]
158 | 	offset += nsSize
159 | 
160 | 	// flag
161 | 	flag := data[offset]
162 | 	offset++
163 | 
164 | 	// key size
165 | 	keyLen, keySize := DecodeUvarint(data[offset:])
166 | 	offset += keySize
167 | 
168 | 	// value size
169 | 	valLen, valSize := DecodeUvarint(data[offset:])
170 | 	offset += valSize
171 | 
172 | 	// meta size
173 | 	metaLen, metaSize := DecodeUvarint(data[offset:])
174 | 	offset += metaSize
175 | 
176 | 	// validation
177 | 	// avoid out of range of data buffer
178 | 	etagLen := int(GetOptions().EtagSize)
179 | 	if flag&(1<<noEtagFieldBit) != 0 {
180 | 		etagLen = 0
181 | 	}
182 | 
183 | 	expireSize := 0
184 | 	expire := uint64(MetaNoExpire)
185 | 	if flag&(1<<noExpireFieldBit) == 0 {
186 | 		expire, expireSize = DecodeUvarint(data[offset+etagLen:])
187 | 		expire += baseTime
188 | 	}
189 | 
190 | 	currentHeaderSize := offset + etagLen + expireSize
191 | 	currentTotalSize := currentHeaderSize + int(keyLen+valLen+metaLen)
192 | 	if currentHeaderSize != headerSize || currentTotalSize != len(data) {
193 | 		return nil, errors.New("invalid data")
194 | 	}
195 | 
196 | 	// etag
197 | 	etag := data[offset : offset+etagLen]
198 | 	offset += etagLen
199 | 
200 | 	// ttl
201 | 	offset += expireSize
202 | 
203 | 	// key
204 | 	key := data[offset : offset+int(keyLen)]
205 | 	offset += int(keyLen)
206 | 
207 | 	// value
208 | 	value := data[offset : offset+int(valLen)]
209 | 	offset += int(valLen)
210 | 
211 | 	// meta
212 | 	meta := data[offset : offset+int(metaLen)]
213 | 	// offset += int(metaLen)
214 | 
215 | 	// app meta deserialization
216 | 	serverMeta := NewMeta(nil)
217 | 	if metaLen > 0 {
218 | 		var appMeta map[string]string
219 | 		if err := msgpack.Unmarshal(meta, &appMeta); err != nil {
220 | 			return nil, err
221 | 		}
222 | 		serverMeta.SetAppMeta(appMeta)
223 | 	}
224 | 
225 | 	serverMeta.SetEtag(
226 | 		etag,
227 | 	).SetTombstone(
228 | 		flag&(1<<tombstoneFieldBit) != 0,
229 | 	).SetExpire(
230 | 		expire,
231 | 	)
232 | 
233 | 	return &Record{
234 | 		Ns:    ns,
235 | 		Key:   key,
236 | 		Meta:  serverMeta,
237 | 		Value: value,
238 | 	}, nil
239 | }
240 | 
241 | // Iterate the specific wal and the callback will pass record, offset and data size
242 | func IterateRecord(wal *Wal, cb func(record *Record, foff, size uint64) error) error {
243 | 	it := NewWalIterator(wal)
244 | 	defer it.Close()
245 | 
246 | 	for {
247 | 		foff, recordBytes, err := it.Next()
248 | 		if err != nil {
249 | 			if errors.Is(err, ErrWalIteratorEOF) {
250 | 				break
251 | 			}
252 | 			return err
253 | 		}
254 | 
255 | 		record, err := RecordFromBytes(recordBytes, wal.BaseTime())
256 | 		if err != nil {
257 | 			return err
258 | 		}
259 | 
260 | 		if err = cb(record, foff, uint64(len(recordBytes))); err != nil {
261 | 			return err
262 | 		}
263 | 	}
264 | 
265 | 	return nil
266 | }
267 | 


--------------------------------------------------------------------------------
/map_test.go:
--------------------------------------------------------------------------------
  1 | package bitcask
  2 | 
  3 | import (
  4 | 	"bytes"
  5 | 	"errors"
  6 | 	"math/rand"
  7 | 	"strconv"
  8 | 	"sync"
  9 | 	"testing"
 10 | 	"time"
 11 | 
 12 | 	"github.com/spaolacci/murmur3"
 13 | 
 14 | 	"github.com/stretchr/testify/assert"
 15 | )
 16 | 
 17 | type mockSimpleMapOperator struct {
 18 | 	fixedValues []uint64
 19 | 	index       int
 20 | }
 21 | 
 22 | func (op *mockSimpleMapOperator) Hash(key *uint64) uint64 {
 23 | 	return *key
 24 | }
 25 | 
 26 | func (op *mockSimpleMapOperator) Equals(lhs, rhs *uint64) bool {
 27 | 	return *lhs == *rhs
 28 | }
 29 | 
 30 | func (op *mockSimpleMapOperator) Rand(n uint64) uint64 {
 31 | 	if len(op.fixedValues) == 0 {
 32 | 		return 0 // Default to first bucket if no fixed values provided
 33 | 	}
 34 | 	if op.index >= len(op.fixedValues) {
 35 | 		op.index = 0
 36 | 	}
 37 | 	val := op.fixedValues[op.index] % n
 38 | 	op.index++
 39 | 	return val
 40 | }
 41 | 
 42 | func (op *mockSimpleMapOperator) WallTime() time.Time {
 43 | 	return time.Now()
 44 | }
 45 | 
 46 | func TestMap_SimpleMapBasicOperations(t *testing.T) {
 47 | 	evictionOrder := []uint64{1, 2, 3} // Define a fixed eviction order
 48 | 	optr := &mockSimpleMapOperator{fixedValues: evictionOrder}
 49 | 	opts := &MapOptions{
 50 | 		Capacity:             100,
 51 | 		Limited:              80,
 52 | 		EvictionPoolCapacity: 16,
 53 | 		SampleKeys:           3,
 54 | 	}
 55 | 
 56 | 	m, err := NewMap[uint64, uint64](optr, opts)
 57 | 	assert.Nil(t, err)
 58 | 
 59 | 	key1, val1 := uint64(1), uint64(1)
 60 | 	key2, val2 := uint64(2), uint64(2)
 61 | 	key3, val3 := uint64(3), uint64(3)
 62 | 
 63 | 	// Test Set and Get
 64 | 	_, _ = m.Set(&key1, &val1)
 65 | 	_, _ = m.Set(&key2, &val2)
 66 | 	_, _ = m.Set(&key3, &val3)
 67 | 
 68 | 	res, err := m.Get(&key1)
 69 | 	assert.Nil(t, err)
 70 | 	assert.Equal(t, val1, *res)
 71 | 
 72 | 	res, err = m.Get(&key2)
 73 | 	assert.Nil(t, err)
 74 | 	assert.Equal(t, val2, *res)
 75 | 
 76 | 	// Update existing element
 77 | 	val1Updated := uint64(11)
 78 | 	old, err := m.Set(&key1, &val1Updated)
 79 | 	assert.Nil(t, err)
 80 | 	assert.Equal(t, *old, val1)
 81 | 
 82 | 	res, err = m.Get(&key1)
 83 | 	assert.Nil(t, err)
 84 | 	assert.Equal(t, val1Updated, *res)
 85 | 
 86 | 	// Exceed Limited and trigger eviction
 87 | 	for i := 4; i <= 81; i++ {
 88 | 		key, val := uint64(i), uint64(i)
 89 | 		_, err = m.Set(&key, &val)
 90 | 		assert.Nil(t, err)
 91 | 	}
 92 | 
 93 | 	_, err = m.Get(&evictionOrder[0])
 94 | 	assert.NotNil(t, err) // Since eviction occurs, the first evicted key should be removed
 95 | 
 96 | 	// Test Delete
 97 | 	old, err = m.Delete(&key2)
 98 | 	assert.Nil(t, err)
 99 | 	assert.Equal(t, *old, val2)
100 | 
101 | 	_, err = m.Get(&key2)
102 | 	assert.NotNil(t, err)
103 | 	assert.True(t, errors.Is(err, ErrKeyNotFound))
104 | }
105 | 
106 | func TestMap_SimpleMapEvictionOrder(t *testing.T) {
107 | 	evictionOrder := []uint64{1, 2, 3, 4, 5, 6} // Fixed eviction order
108 | 	optr := &mockSimpleMapOperator{fixedValues: evictionOrder}
109 | 	opts := &MapOptions{
110 | 		Capacity:             100,
111 | 		Limited:              80,
112 | 		EvictionPoolCapacity: 16,
113 | 		SampleKeys:           3,
114 | 	}
115 | 
116 | 	m, err := NewMap[uint64, uint64](optr, opts)
117 | 	assert.Nil(t, err)
118 | 
119 | 	// reach the limit
120 | 	for i := 1; i <= 80; i++ {
121 | 		key, val := uint64(i), uint64(i)
122 | 
123 | 		old, err := m.Set(&key, &val)
124 | 		assert.Nil(t, err)
125 | 		assert.Nil(t, old)
126 | 	}
127 | 
128 | 	// insert a new key, triggering eviction
129 | 	key81, val81 := uint64(81), uint64(81)
130 | 	old, err := m.Set(&key81, &val81)
131 | 	assert.Nil(t, err)
132 | 	assert.Equal(t, *old, uint64(1))
133 | 
134 | 	// the first evicted key should be 1
135 | 	key1 := uint64(1)
136 | 	_, err = m.Get(&key1)
137 | 	assert.NotNil(t, err)
138 | 	assert.True(t, errors.Is(err, ErrKeyNotFound))
139 | 
140 | 	// the remaining keys should still exist
141 | 	for i := 2; i <= 81; i++ {
142 | 		key, val := uint64(i), uint64(i)
143 | 		res, err := m.Get(&key)
144 | 		assert.Nil(t, err)
145 | 		assert.Equal(t, *res, val)
146 | 	}
147 | 
148 | 	// insert more keys to trigger further eviction
149 | 	key82, val82 := uint64(82), uint64(82)
150 | 	old, err = m.Set(&key82, &val82)
151 | 	assert.Nil(t, err)
152 | 	assert.Equal(t, *old, uint64(2))
153 | 
154 | 	// the second evicted key should be 2
155 | 	key2 := uint64(2)
156 | 	_, err = m.Get(&key2)
157 | 	assert.NotNil(t, err)
158 | 	assert.True(t, errors.Is(err, ErrKeyNotFound))
159 | 
160 | 	// ensure remaining keys are still available
161 | 	for i := 3; i <= 82; i++ {
162 | 		key, val := uint64(i), uint64(i)
163 | 		res, err := m.Get(&key)
164 | 		assert.Nil(t, err)
165 | 		assert.Equal(t, *res, val)
166 | 	}
167 | }
168 | 
169 | type mockShardMapOperator struct{}
170 | 
171 | func (op *mockShardMapOperator) Hash(key *[]byte) uint64 {
172 | 	hasher := murmur3.New64()
173 | 	hasher.Write(*key)
174 | 	return hasher.Sum64()
175 | }
176 | 
177 | func (op *mockShardMapOperator) Equals(lhs, rhs *[]byte) bool {
178 | 	return bytes.Equal(*lhs, *rhs)
179 | }
180 | 
181 | func (op *mockShardMapOperator) Rand(n uint64) uint64 {
182 | 	return uint64(rand.Int63n(int64(n)))
183 | }
184 | 
185 | func (op *mockShardMapOperator) WallTime() time.Time {
186 | 	return time.Now()
187 | }
188 | 
189 | func TestMap_ShardMapBasic(t *testing.T) {
190 | 	opts := &MapOptions{
191 | 		Capacity:             1000,
192 | 		Limited:              800,
193 | 		EvictionPoolCapacity: 16,
194 | 		SampleKeys:           3,
195 | 	}
196 | 
197 | 	m, err := NewShardMap[[]byte, []byte](&mockShardMapOperator{}, opts)
198 | 	assert.Nil(t, err)
199 | 
200 | 	key1, val1 := []byte("123"), []byte("123")
201 | 	key2, val2 := []byte("456"), []byte("456")
202 | 	key3, val3 := []byte("789"), []byte("789")
203 | 
204 | 	// Test Set and Get
205 | 	_, _ = m.Set(&key1, &val1)
206 | 	_, _ = m.Set(&key2, &val2)
207 | 	_, _ = m.Set(&key3, &val3)
208 | 
209 | 	res, err := m.Get(&key1)
210 | 	assert.Nil(t, err)
211 | 	assert.Equal(t, val1, *res)
212 | 
213 | 	res, err = m.Get(&key2)
214 | 	assert.Nil(t, err)
215 | 	assert.Equal(t, val2, *res)
216 | 
217 | 	// Update existing element
218 | 	val1Updated := []byte("111")
219 | 	old, err := m.Set(&key1, &val1Updated)
220 | 	assert.Nil(t, err)
221 | 	assert.Equal(t, *old, val1)
222 | 
223 | 	res, err = m.Get(&key1)
224 | 	assert.Nil(t, err)
225 | 	assert.Equal(t, val1Updated, *res)
226 | 
227 | 	// Test Delete
228 | 	old, err = m.Delete(&key2)
229 | 	assert.Nil(t, err)
230 | 	assert.Equal(t, *old, val2)
231 | 
232 | 	_, err = m.Get(&key2)
233 | 	assert.NotNil(t, err)
234 | 	assert.True(t, errors.Is(err, ErrKeyNotFound))
235 | }
236 | 
237 | func TestMap_ShardMapLRUEviction(t *testing.T) {
238 | 	opts := &MapOptions{
239 | 		Capacity:             1000000,
240 | 		Limited:              800000,
241 | 		EvictionPoolCapacity: 32,
242 | 		SampleKeys:           5,
243 | 	}
244 | 
245 | 	m, err := NewShardMap[[]byte, []byte](&mockShardMapOperator{}, opts)
246 | 	assert.Nil(t, err)
247 | 
248 | 	for i := 1; i < 1000000; i++ {
249 | 		numStr := strconv.Itoa(i)
250 | 		key, val := []byte(numStr), []byte(numStr)
251 | 
252 | 		_, err := m.Set(&key, &val)
253 | 		assert.Nil(t, err)
254 | 	}
255 | 
256 | 	// the first half of the elements are evicted more
257 | 	num := 0
258 | 	for i := 1; i <= 500000; i++ {
259 | 		numStr := strconv.Itoa(i)
260 | 		key, val := []byte(numStr), []byte(numStr)
261 | 		res, err := m.Get(&key)
262 | 		if err != nil && errors.Is(err, ErrKeyNotFound) {
263 | 			num++
264 | 		}
265 | 
266 | 		if err == nil {
267 | 			assert.Equal(t, *res, val)
268 | 		}
269 | 	}
270 | 
271 | 	// total eviction elements should be 200000 (capacity - limited)
272 | 	assert.True(t, num > 100000)
273 | }
274 | 
275 | func TestMap_ShardMapConcurrentReadAndWrite(t *testing.T) {
276 | 	opts := &MapOptions{
277 | 		Capacity:             1000000,
278 | 		Limited:              800000,
279 | 		EvictionPoolCapacity: 32,
280 | 		SampleKeys:           5,
281 | 	}
282 | 
283 | 	genBytes := func(i, j int) []byte {
284 | 		return []byte("i" + strconv.Itoa(i) + "j" + strconv.Itoa(j))
285 | 	}
286 | 
287 | 	m, err := NewShardMap[[]byte, []byte](&mockShardMapOperator{}, opts)
288 | 	assert.Nil(t, err)
289 | 
290 | 	var wg sync.WaitGroup
291 | 
292 | 	for i := 0; i < 20; i++ {
293 | 		wg.Add(1)
294 | 		go func(i int) {
295 | 			defer wg.Done()
296 | 			for j := 0; j < 1000; j++ {
297 | 				key := genBytes(i, j)
298 | 				val := genBytes(i, j)
299 | 
300 | 				_, err := m.Set(&key, &val)
301 | 				assert.Nil(t, err)
302 | 			}
303 | 
304 | 			for j := 0; j < 1000; j++ {
305 | 				key := genBytes(i, j)
306 | 				val, err := m.Get(&key)
307 | 				assert.Nil(t, err)
308 | 				assert.Equal(t, key, *val)
309 | 			}
310 | 		}(i)
311 | 	}
312 | 
313 | 	wg.Wait()
314 | }
315 | 


--------------------------------------------------------------------------------
/README-CN.md:
--------------------------------------------------------------------------------
  1 | # bitcaskDB 是什么？
  2 | 
  3 | bitcaskDB是一个基于bitcask存储模型的轻量级、快速、固定容量的键值对存储引擎。
  4 | 
  5 | 它最大的特点是在内存中缓存键值对的索引，每次查询只需要单次 disk seek。按照 100 字节 key，4KB value 的小对象计算，缓存 10 million 个对象，大约需要 1GB 内存，40GB 磁盘空间。相反，如果采用类似 redis，memcached 全内存的缓存方案，相比之下，内存的开销很大。
  6 | 
  7 | # 动机
  8 | 
  9 | - 硬件资源受限，如 4C8G 100G 磁盘
 10 | - 缓存数以千万的小对象
 11 | 
 12 | 
 13 | # 特性
 14 | 
 15 | - 追加写
 16 | - 固定长度的 namespace
 17 | - 固定磁盘容量和内存用量
 18 | - 细粒度的合并
 19 | - 近似 LRU 淘汰策略
 20 | - 自定义记录的元数据
 21 | - 自定义合并策略
 22 | - 自定义挑选策略
 23 | - 批量写
 24 | - 允许过期时间和数据指纹 Etag
 25 | - 基于 hint 的快速恢复
 26 | - 软删除
 27 | 
 28 | # 对比分析
 29 | 
 30 | ## LSM
 31 | - 追加写
 32 | - 读操作可能需要多次随机寻址
 33 | - 写放大
 34 |   - 链式合并
 35 | - 范围查询
 36 | - 有序性
 37 | - 回收磁盘空间较慢
 38 |   - 多个数据版本
 39 | 
 40 | 
 41 | ## B+Tree
 42 | - 原地更新
 43 | - 有序性
 44 | - 范围查询
 45 | - 很难回收磁盘空间
 46 | 
 47 | 
 48 | ## Bitcask
 49 | - 追加写
 50 | - 明确的查询和插入性能
 51 | - 查询仅需要单次寻址
 52 | - 快速的回收磁盘空间
 53 |   - 内存仅保留最新的数据版本
 54 | - 内存可使用多种数据模型，如 btree，hashtable
 55 |   - hashtable 更加紧凑，但无序，不支持范围查询
 56 |   - btree 支持范围查询，顺序迭代，但内存开销更大
 57 | 
 58 | 
 59 | # 快速开始
 60 | 
 61 | 
 62 | ```golang
 63 | import "github.com/wenzhang-dev/bitcaskDB"
 64 | 
 65 | const data = `
 66 | <!DOCTYPE html>
 67 | <html>
 68 | <head>
 69 |     <title>Hello Page</title>
 70 | </head>
 71 | <body>
 72 |     <h1>Hello, BitcaskDB!</h1>
 73 | </body>
 74 | </html>
 75 | `
 76 | 
 77 | func main() {
 78 |     opts := &bitcask.Options{
 79 |         Dir:                       "./bitcaskDB",
 80 |         WalMaxSize:                1024 * 1024 * 1024, // 1GB
 81 |         ManifestMaxSize:           1024 * 1024, // 1MB
 82 |         IndexCapacity:             10000000, // 10 million
 83 |         IndexLimited:              8000000,
 84 |         IndexEvictionPoolCapacity: 32,
 85 |         IndexSampleKeys:           5,
 86 |         DiskUsageLimited:          1024 * 1024 * 1024 * 100, // 100GB
 87 |         NsSize:                    DefaultNsSize,
 88 |         EtagSize:                  DefaultEtagSize,
 89 |     }
 90 | 
 91 |     db, err := bitcask.NewDB(opts)
 92 |     if err != nil {
 93 |         panic(err)
 94 |     }
 95 |     defer func() {
 96 |         _ = db.Close()
 97 |     }()
 98 | 
 99 |     ns := GenSha1NS("ns") // fixed-size ns
100 |     key := []byte("testKey")
101 |     value := []byte(data)
102 |     now := uint64(db.WallTime().Unix())
103 | 
104 |     // customized metadata
105 |     appMeta := make(map[string]string)
106 |     appMeta["type"] = "html"
107 |     meta := NewMeta(appMeta).SetExpire(now+60).SetEtag(GenSha1Etag(value))
108 | 
109 |     // set a key
110 |     err = db.Put(ns, key, value, meta, &WriteOptions{})
111 |     if err != nil {
112 |         panic(err)
113 |     }
114 | 
115 |     // get a key
116 |     readVal, readMeta, err := db.Get(ns, key, &ReadOptions{})
117 |     if err != nil {
118 |         panic(err)
119 |     }
120 | 
121 |     println(readVal)
122 |     println(readMeta)
123 | 
124 |     // delete a key
125 |     err = db.Delete(ns, key, &WriteOptions{})
126 |     if err != nil {
127 |         panic(err)
128 |     }
129 | }
130 | ```
131 | 
132 | 如果你想简单使用一个 database CRUD http server，可以考虑这个[仓库](https://github.com/wenzhang-dev/bitcaskDB-server)。
133 | 
134 | http server 以 docker 容器运行。顺便说，读写 bitcaskDB 的开销，相比网络通信的开销而言，可以忽略不计。
135 | 
136 | 
137 | # 性能测试
138 | 
139 | 读写 4KB 的压测报告如下：
140 | 
141 | ```
142 | go test -bench=PutGet -benchtime=60s -count=3 -timeout=50m
143 | goos: linux
144 | goarch: amd64
145 | pkg: github.com/wenzhang-dev/bitcaskDB/bench
146 | cpu: Intel(R) Xeon(R) Gold 5318N CPU @ 2.10GHz
147 | BenchmarkPutGet/put4K-8                  5331782   25259 ns/op   11795 B/op   21 allocs/op
148 | BenchmarkPutGet/put4K-8                  5130870   25417 ns/op   11767 B/op   21 allocs/op
149 | BenchmarkPutGet/put4K-8                  4898403   26676 ns/op   11742 B/op   21 allocs/op
150 | BenchmarkPutGet/batchPut4K-8            10548615   15340 ns/op    1695 B/op   11 allocs/op
151 | BenchmarkPutGet/batchPut4K-8             9220388   14278 ns/op    1694 B/op   11 allocs/op
152 | BenchmarkPutGet/batchPut4K-8            10363459   15019 ns/op    1686 B/op   11 allocs/op
153 | BenchmarkPutGet/get4K-8                  8812342    8076 ns/op   10119 B/op   10 allocs/op
154 | BenchmarkPutGet/get4K-8                  7963098    7952 ns/op   10119 B/op   10 allocs/op
155 | BenchmarkPutGet/get4K-8                  8480240    7997 ns/op   10119 B/op   10 allocs/op
156 | BenchmarkPutGet/concurrentGet4K-8       17233309    4427 ns/op   10044 B/op    7 allocs/op
157 | BenchmarkPutGet/concurrentGet4K-8       26745726    3681 ns/op   10044 B/op    7 allocs/op
158 | BenchmarkPutGet/concurrentGet4K-8       29305041    3654 ns/op   10044 B/op    7 allocs/op
159 | BenchmarkPutGet/concurrentPut4K-8        4558645   19829 ns/op    8340 B/op   18 allocs/op
160 | BenchmarkPutGet/concurrentPut4K-8        4433334   18664 ns/op   10031 B/op   18 allocs/op
161 | BenchmarkPutGet/concurrentPut4K-8        4366149   17031 ns/op    8175 B/op   17 allocs/op
162 | BenchmarkPutGet/concurrentBatchPut4K-8   9443377   12520 ns/op    1527 B/op    9 allocs/op
163 | BenchmarkPutGet/concurrentBatchPut4K-8  11338162   12429 ns/op    1517 B/op    9 allocs/op
164 | BenchmarkPutGet/concurrentBatchPut4K-8  11394081   12101 ns/op    1510 B/op    9 allocs/op
165 | PASS
166 | ok   github.com/wenzhang-dev/bitcaskDB/bench 2310.401s
167 | ```
168 | 
169 | 同时，也测试了几个主流的 KV 存储引擎在读写 4KB 的性能，并记录了它们在测试过程中的 RSS 占用。
170 | 性能测试仓库为：[codebase](https://github.com/wenzhang-dev/bitcaskDB-benchmark)
171 | 
172 | ```shell
173 | go test -bench=Read -benchtime=60s -timeout=30m -count=3
174 | goos: linux
175 | goarch: amd64
176 | pkg: github.com/wenzhang-dev/bitcaskDB-benchmark
177 | cpu: Intel(R) Xeon(R) Gold 5318N CPU @ 2.10GHz
178 | BenchmarkReadWithBitcaskDB/read4K-8  11459024   6313 ns/op  1.217 AvgRSS(GB)  1.275 PeakRSS(GB)  10120 B/op  10 allocs/op
179 | BenchmarkReadWithBitcaskDB/read4K-8  12512324   6522 ns/op  1.220 AvgRSS(GB)  1.234 PeakRSS(GB)  10120 B/op  10 allocs/op
180 | BenchmarkReadWithBitcaskDB/read4K-8  12414660   6468 ns/op  1.206 AvgRSS(GB)  1.231 PeakRSS(GB)  10120 B/op  10 allocs/op
181 | BenchmarkReadWithBadger/read4K-8      4575487  13526 ns/op  2.716 AvgRSS(GB)  4.350 PeakRSS(GB)  19416 B/op  43 allocs/op
182 | BenchmarkReadWithBadger/read4K-8      4960239  13741 ns/op  1.629 AvgRSS(GB)  1.681 PeakRSS(GB)  19406 B/op  43 allocs/op
183 | BenchmarkReadWithBadger/read4K-8      4851144  14429 ns/op  1.591 AvgRSS(GB)  1.650 PeakRSS(GB)  19422 B/op  44 allocs/op
184 | BenchmarkReadWithLevelDB/read4K-8     1569663  50710 ns/op  0.111 AvgRSS(GB)  0.134 PeakRSS(GB)  55021 B/op  35 allocs/op
185 | BenchmarkReadWithLevelDB/read4K-8     1000000  63066 ns/op  0.113 AvgRSS(GB)  0.129 PeakRSS(GB)  54264 B/op  35 allocs/op
186 | BenchmarkReadWithLevelDB/read4K-8     1236408  57268 ns/op  0.114 AvgRSS(GB)  0.138 PeakRSS(GB)  54624 B/op  35 allocs/op
187 | BenchmarkReadWithBoltDB/read4K-8     12587562   5269 ns/op  5.832 AvgRSS(GB)  5.838 PeakRSS(GB)    832 B/op  13 allocs/op
188 | BenchmarkReadWithBoltDB/read4K-8     16920481   4482 ns/op  5.832 AvgRSS(GB)  5.833 PeakRSS(GB)    832 B/op  13 allocs/op
189 | BenchmarkReadWithBoltDB/read4K-8     19141418   5276 ns/op  5.832 AvgRSS(GB)  5.835 PeakRSS(GB)    832 B/op  13 allocs/op
190 | PASS
191 | ok   github.com/wenzhang-dev/bitcaskDB-benchmark 1475.172s
192 | ```
193 | 
194 | 
195 | ```shell
196 | go test -bench=Write -benchtime=60s -timeout=30m -count=3
197 | goos: linux
198 | goarch: amd64
199 | pkg: github.com/wenzhang-dev/bitcaskDB-benchmark
200 | cpu: Intel(R) Xeon(R) Gold 5318N CPU @ 2.10GHz
201 | BenchmarkWriteWithBitcaskDB/write4K-8  8334304  13217 ns/op  0.7905 AvgRSS(GB)   0.934 PeakRSS(GB)    1666 B/op   11 allocs/op
202 | BenchmarkWriteWithBitcaskDB/write4K-8  5323338  14976 ns/op  0.9732 AvgRSS(GB)   1.058 PeakRSS(GB)    1727 B/op   12 allocs/op
203 | BenchmarkWriteWithBitcaskDB/write4K-8  5435398  13929 ns/op  0.9639 AvgRSS(GB)   1.122 PeakRSS(GB)    1756 B/op   12 allocs/op
204 | BenchmarkWriteWithLevelDB/write4K-8    1047753  68691 ns/op  0.0615 AvgRSS(GB)  0.0636 PeakRSS(GB)    2946 B/op   16 allocs/op
205 | BenchmarkWriteWithLevelDB/write4K-8    1179555  71497 ns/op  0.0617 AvgRSS(GB)  0.0634 PeakRSS(GB)    3250 B/op   18 allocs/op
206 | BenchmarkWriteWithLevelDB/write4K-8     992488  74130 ns/op  0.0613 AvgRSS(GB)  0.0625 PeakRSS(GB)    3444 B/op   19 allocs/op
207 | BenchmarkWriteWithBadger/write4K-8     3776720  20036 ns/op   6.409 AvgRSS(GB)   7.534 PeakRSS(GB)   30062 B/op   68 allocs/op
208 | BenchmarkWriteWithBadger/write4K-8     4106070  50959 ns/op   10.77 AvgRSS(GB)   13.63 PeakRSS(GB)  115442 B/op  152 allocs/op
209 | BenchmarkWriteWithBadger/write4K-8     1491906  49955 ns/op   11.45 AvgRSS(GB)   13.72 PeakRSS(GB)   88941 B/op  130 allocs/op
210 | BenchmarkWriteWithBoltDB/write4K-8     2808206  23131 ns/op   0.626 AvgRSS(GB)   0.999 PeakRSS(GB)    7579 B/op   11 allocs/op
211 | BenchmarkWriteWithBoltDB/write4K-8     4303538  22836 ns/op   1.713 AvgRSS(GB)   2.971 PeakRSS(GB)    7765 B/op   11 allocs/op
212 | BenchmarkWriteWithBoltDB/write4K-8     3755002  19385 ns/op   2.481 AvgRSS(GB)   2.872 PeakRSS(GB)    7896 B/op   12 allocs/op
213 | PASS
214 | ok   github.com/wenzhang-dev/bitcaskDB-benchmark 1541.068s
215 | ```
216 | 
217 | 指定磁盘容量的压测报告: [benchmark2](https://github.com/wenzhang-dev/bitcaskDB/blob/main/bench/benchmark2)
218 | 


--------------------------------------------------------------------------------
/db_impl_test.go:
--------------------------------------------------------------------------------
  1 | package bitcask
  2 | 
  3 | import (
  4 | 	"crypto/sha1"
  5 | 	"os"
  6 | 	"strconv"
  7 | 	"sync"
  8 | 	"testing"
  9 | 
 10 | 	"github.com/stretchr/testify/assert"
 11 | )
 12 | 
 13 | func sha1Bytes(input string) [20]byte {
 14 | 	return sha1.Sum([]byte(input))
 15 | }
 16 | 
 17 | func setupDB(t *testing.T) *DBImpl {
 18 | 	dir := "./test_bitcask_db"
 19 | 	_ = os.RemoveAll(dir)
 20 | 
 21 | 	assert.Nil(t, os.MkdirAll(dir, os.ModePerm))
 22 | 
 23 | 	opts := &Options{
 24 | 		Dir:                            dir,
 25 | 		WalMaxSize:                     1024 * 1024, // 1MB
 26 | 		ManifestMaxSize:                1024 * 1024, // 1MB
 27 | 		IndexCapacity:                  1000000,
 28 | 		IndexLimited:                   800000,
 29 | 		IndexEvictionPoolCapacity:      32,
 30 | 		IndexSampleKeys:                5,
 31 | 		BlockCacheCapacity:             8192, // 256MB
 32 | 		BlockCacheLimited:              8192,
 33 | 		BlockCacheSampleKeys:           5,
 34 | 		BlockCacheEvictionPoolCapacity: 32,
 35 | 		BlockReaderConcurrent:          32,
 36 | 		NsSize:                         DefaultNsSize,
 37 | 		EtagSize:                       DefaultEtagSize,
 38 | 	}
 39 | 
 40 | 	db, err := NewDB(opts)
 41 | 	assert.Nil(t, err)
 42 | 	assert.NotNil(t, db)
 43 | 	return db
 44 | }
 45 | 
 46 | func loadDB(t *testing.T) *DBImpl {
 47 | 	dir := "./test_bitcask_db"
 48 | 
 49 | 	opts := &Options{
 50 | 		Dir:                            dir,
 51 | 		WalMaxSize:                     1024 * 1024, // 1MB
 52 | 		ManifestMaxSize:                1024 * 1024, // 1MB
 53 | 		IndexCapacity:                  1000000,
 54 | 		IndexLimited:                   800000,
 55 | 		IndexEvictionPoolCapacity:      32,
 56 | 		IndexSampleKeys:                5,
 57 | 		BlockCacheCapacity:             8192, // 256MB
 58 | 		BlockCacheLimited:              8192,
 59 | 		BlockCacheSampleKeys:           5,
 60 | 		BlockCacheEvictionPoolCapacity: 32,
 61 | 		BlockReaderConcurrent:          32,
 62 | 		NsSize:                         DefaultNsSize,
 63 | 		EtagSize:                       DefaultEtagSize,
 64 | 	}
 65 | 
 66 | 	db, err := NewDB(opts)
 67 | 	assert.Nil(t, err)
 68 | 	assert.NotNil(t, db)
 69 | 	return db
 70 | }
 71 | 
 72 | func teardownDB(db *DBImpl) {
 73 | 	db.Close()
 74 | 	_ = os.RemoveAll(db.opts.Dir)
 75 | }
 76 | 
 77 | func TestDBImplBasicWriteRead(t *testing.T) {
 78 | 	db := setupDB(t)
 79 | 	defer teardownDB(db)
 80 | 
 81 | 	now := uint64(db.WallTime().Unix())
 82 | 	ns := sha1Bytes("namespace")
 83 | 	key := []byte("testKey")
 84 | 	value := []byte("testValue")
 85 | 	etag := sha1Bytes(string(value))
 86 | 	meta := NewMeta(nil).SetExpire(now + 60).SetEtag(etag[:])
 87 | 
 88 | 	err := db.Put(ns[:], key, value, meta, &WriteOptions{})
 89 | 	assert.NoError(t, err)
 90 | 
 91 | 	// write without options
 92 | 	err = db.Put(ns[:], key, value, meta, nil)
 93 | 	assert.Nil(t, err)
 94 | 
 95 | 	readVal, readMeta, err := db.Get(ns[:], key, &ReadOptions{})
 96 | 	assert.NoError(t, err)
 97 | 	assert.Equal(t, value, readVal)
 98 | 	assert.Equal(t, meta, readMeta)
 99 | 
100 | 	// read without options
101 | 	readVal, readMeta, err = db.Get(ns[:], key, nil)
102 | 	assert.NoError(t, err)
103 | 	assert.Equal(t, value, readVal)
104 | 	assert.Equal(t, meta, readMeta)
105 | }
106 | 
107 | func TestDBImplBasicWriteReadV2(t *testing.T) {
108 | 	db := setupDB(t)
109 | 	defer teardownDB(db)
110 | 
111 | 	now := uint64(db.WallTime().Unix())
112 | 	ns := sha1Bytes("namespace")
113 | 	key := []byte("testKey")
114 | 	value := []byte("testValue")
115 | 	etag := sha1Bytes(string(value))
116 | 	meta := NewMeta(nil).SetExpire(now + 60).SetEtag(etag[:])
117 | 
118 | 	err := db.Put(ns[:], key, value, meta, &WriteOptions{})
119 | 	assert.NoError(t, err)
120 | 
121 | 	// write without options
122 | 	err = db.Put(ns[:], key, value, meta, nil)
123 | 	assert.Nil(t, err)
124 | 
125 | 	readVal, readMeta, err := db.GetV2(ns[:], key, &ReadOptions{})
126 | 	assert.NoError(t, err)
127 | 	assert.Equal(t, value, readVal)
128 | 	assert.Equal(t, meta, readMeta)
129 | 
130 | 	// read without options
131 | 	readVal, readMeta, err = db.GetV2(ns[:], key, nil)
132 | 	assert.NoError(t, err)
133 | 	assert.Equal(t, value, readVal)
134 | 	assert.Equal(t, meta, readMeta)
135 | }
136 | 
137 | func TestDBImplWriteEmptyValue(t *testing.T) {
138 | 	db := setupDB(t)
139 | 	defer teardownDB(db)
140 | 
141 | 	ns := sha1Bytes("namespace")
142 | 	key := []byte("testKey")
143 | 	meta := NewMeta(nil)
144 | 
145 | 	// no etag, no expire and no value
146 | 	err := db.Put(ns[:], key, nil, meta, &WriteOptions{})
147 | 	assert.NoError(t, err)
148 | 
149 | 	readVal, _, err := db.Get(ns[:], key, &ReadOptions{})
150 | 	assert.NoError(t, err)
151 | 	assert.Equal(t, len(readVal), 0)
152 | }
153 | 
154 | func TestDBImplWriteDeleteRead(t *testing.T) {
155 | 	db := setupDB(t)
156 | 	defer teardownDB(db)
157 | 
158 | 	ns := sha1Bytes("namespace")
159 | 	key := []byte("testKey")
160 | 	value := []byte("testValue")
161 | 	meta := NewMeta(nil)
162 | 
163 | 	_ = db.Put(ns[:], key, value, meta, &WriteOptions{})
164 | 	_ = db.Delete(ns[:], key, &WriteOptions{})
165 | 
166 | 	readVal, _, err := db.Get(ns[:], key, &ReadOptions{})
167 | 	assert.ErrorIs(t, err, ErrKeyNotFound)
168 | 	assert.Nil(t, readVal)
169 | }
170 | 
171 | func TestDBImplWALRotate(t *testing.T) {
172 | 	db := setupDB(t)
173 | 	defer teardownDB(db)
174 | 
175 | 	ns := sha1Bytes("wal-rotation")
176 | 	meta := NewMeta(nil)
177 | 	opts := &WriteOptions{}
178 | 
179 | 	initFid := db.manifest.active.Fid()
180 | 
181 | 	// 50000 > (1MB Wal / min 50B per record = 20000)
182 | 	for i := 0; i < 50000; i++ {
183 | 		key := sha1Bytes("key" + strconv.Itoa(i))
184 | 		value := sha1Bytes("val" + strconv.Itoa(i))
185 | 		err := db.Put(ns[:], key[:], value[:], meta, opts)
186 | 		assert.Nil(t, err)
187 | 	}
188 | 
189 | 	assert.NotEqual(t, initFid, db.manifest.active.Fid())
190 | }
191 | 
192 | func TestDBImplPersistence(t *testing.T) {
193 | 	db := setupDB(t)
194 | 
195 | 	bin4K := GenNKBytes(4)
196 | 	appMeta := make(map[string]string)
197 | 	appMeta["test"] = string(bin4K)
198 | 
199 | 	ns := sha1Bytes("persistence")
200 | 	meta := NewMeta(appMeta)
201 | 	opts := &WriteOptions{}
202 | 
203 | 	// write 10000 keys
204 | 	for i := 0; i < 10000; i++ {
205 | 		key := sha1Bytes("key" + strconv.Itoa(i))
206 | 		value := sha1Bytes("val" + strconv.Itoa(i))
207 | 		err := db.Put(ns[:], key[:], value[:], meta, opts)
208 | 
209 | 		assert.Nil(t, err)
210 | 	}
211 | 
212 | 	// check
213 | 	for i := 0; i < 10000; i++ {
214 | 		key := sha1Bytes("key" + strconv.Itoa(i))
215 | 		value := sha1Bytes("val" + strconv.Itoa(i))
216 | 		readVal, readMeta, err := db.Get(ns[:], key[:], &ReadOptions{})
217 | 
218 | 		assert.Nil(t, err)
219 | 		assert.Equal(t, readVal, value[:])
220 | 		assert.Equal(t, readMeta.AppMeta, meta.AppMeta)
221 | 	}
222 | 
223 | 	// re-open db
224 | 	db.Close()
225 | 
226 | 	db = loadDB(t)
227 | 	defer teardownDB(db)
228 | 
229 | 	// check again
230 | 	for i := 0; i < 10000; i++ {
231 | 		key := sha1Bytes("key" + strconv.Itoa(i))
232 | 		value := sha1Bytes("val" + strconv.Itoa(i))
233 | 		readVal, readMeta, err := db.Get(ns[:], key[:], &ReadOptions{})
234 | 
235 | 		assert.Nil(t, err)
236 | 		assert.Equal(t, readVal, value[:])
237 | 		assert.Equal(t, readMeta.AppMeta, meta.AppMeta)
238 | 	}
239 | }
240 | 
241 | /*
242 | func TestDBImplBatchWrite(t *testing.T) {
243 | }
244 | */
245 | 
246 | func TestDBImplConcurrentWriteAndRead(t *testing.T) {
247 | 	db := setupDB(t)
248 | 	defer teardownDB(db)
249 | 
250 | 	var wg sync.WaitGroup
251 | 	ns := sha1Bytes("concurrent")
252 | 	meta := NewMeta(nil)
253 | 
254 | 	genBytes := func(i, j int) []byte {
255 | 		return []byte("i" + strconv.Itoa(i) + "j" + strconv.Itoa(j))
256 | 	}
257 | 
258 | 	// total: 25000 keys
259 | 	for i := 0; i < 50; i++ {
260 | 		wg.Add(1)
261 | 		// each goroutine writes 500 keys
262 | 		go func(i int) {
263 | 			defer wg.Done()
264 | 			for j := 0; j < 500; j++ {
265 | 				// key equals it's value
266 | 				key := genBytes(i, j)
267 | 				value := genBytes(i, j)
268 | 				err := db.Put(ns[:], key, value, meta, &WriteOptions{})
269 | 				assert.Nil(t, err)
270 | 			}
271 | 
272 | 			// check
273 | 			for j := 0; j < 500; j++ {
274 | 				key := genBytes(i, j)
275 | 				val, _, err := db.Get(ns[:], key, &ReadOptions{})
276 | 				if err != nil {
277 | 					print(err.Error())
278 | 				}
279 | 				assert.Nil(t, err)
280 | 				assert.Equal(t, val, key)
281 | 			}
282 | 		}(i)
283 | 	}
284 | 
285 | 	wg.Wait()
286 | }
287 | 
288 | func TestDBImplConcurrentWriteAndReadV2(t *testing.T) {
289 | 	db := setupDB(t)
290 | 	defer teardownDB(db)
291 | 
292 | 	var wg sync.WaitGroup
293 | 	ns := sha1Bytes("concurrent")
294 | 	meta := NewMeta(nil)
295 | 
296 | 	genBytes := func(i, j int) []byte {
297 | 		return []byte("i" + strconv.Itoa(i) + "j" + strconv.Itoa(j))
298 | 	}
299 | 
300 | 	// total: 25000 keys
301 | 	for i := 0; i < 50; i++ {
302 | 		wg.Add(1)
303 | 		// each goroutine writes 500 keys
304 | 		go func(i int) {
305 | 			defer wg.Done()
306 | 			for j := 0; j < 500; j++ {
307 | 				// key equals it's value
308 | 				key := genBytes(i, j)
309 | 				value := genBytes(i, j)
310 | 				err := db.Put(ns[:], key, value, meta, &WriteOptions{})
311 | 				assert.Nil(t, err)
312 | 			}
313 | 
314 | 			// check
315 | 			for j := 0; j < 500; j++ {
316 | 				key := genBytes(i, j)
317 | 				val, _, err := db.GetV2(ns[:], key, &ReadOptions{})
318 | 				if err != nil {
319 | 					print(err.Error())
320 | 				}
321 | 				assert.Nil(t, err)
322 | 				assert.Equal(t, val, key)
323 | 			}
324 | 		}(i)
325 | 	}
326 | 
327 | 	wg.Wait()
328 | }
329 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | <div align="center">
  2 | <strong>
  3 | <samp>
  4 | 
  5 | [English](https://github.com/wenzhang-dev/bitcaskDB/blob/main/README.md) · [简体中文](https://github.com/wenzhang-dev/bitcaskDB/blob/main/README-CN.md)
  6 | 
  7 | </samp>
  8 | </strong>
  9 | </div>
 10 | 
 11 | # What is bitcaskDB
 12 | 
 13 | bitcaskDB is a light-weight, fast, fixed capacity key/value storage engine base on bitcask storage model.
 14 | 
 15 | Its biggest feature is that it caches the index of key-value pairs in memory, and each query only requires a single disk seek. Based on the calculation of small objects with 100 bytes of key and 4KB of value, caching 10 million objects requires about 1GB of memory and 40GB of disk space. On the contrary, if a full-memory caching solution like redis or memcached is used, the memory overhead is very high.
 16 | 
 17 | 
 18 | # Motivation
 19 | 
 20 | - limited hardware resources
 21 | - cache tens of millions of small objects
 22 | 
 23 | 
 24 | # Features
 25 | - append-only
 26 | - fixed-size namespace
 27 | - fixed memory and disk usage
 28 | - fine-grained compaction
 29 | - LRU-like eviction policy in memory
 30 | - customized record metadata
 31 | - customized compaction filter
 32 | - customized compaction picker
 33 | - bulk writes
 34 | - allow expire and value fingerprint(etag)
 35 | - fast recover based on hint wal
 36 | - soft deletion
 37 | 
 38 | 
 39 | # Comparation
 40 | 
 41 | ## LSM
 42 | - append-only
 43 | - multiple disk seek in worst case
 44 | - write amplification
 45 |   - chained compaction
 46 | - range search
 47 | - ordered
 48 | - slow to reclaim disk space
 49 |   - multiple data version
 50 | 
 51 | 
 52 | ## B+Tree
 53 | - update-inplace
 54 | - ordered
 55 | - range search
 56 | - hard to reclaim disk space
 57 | 
 58 | 
 59 | ## Bitcask
 60 | - append-only
 61 | - predictable lookup and insert performance
 62 | - single seek to retrieve any value
 63 | - fast to reclaim disk space
 64 |   - only one data version in memory
 65 | - multiple data model in memory, such as btree, hashtable
 66 |   - hashtable is more compact, but un-ordered and un-support range search
 67 | 
 68 | 
 69 | # Getting started
 70 | 
 71 | 
 72 | ```golang
 73 | import "github.com/wenzhang-dev/bitcaskDB"
 74 | 
 75 | const data = `
 76 | <!DOCTYPE html>
 77 | <html>
 78 | <head>
 79 |     <title>Hello Page</title>
 80 | </head>
 81 | <body>
 82 |     <h1>Hello, BitcaskDB!</h1>
 83 | </body>
 84 | </html>
 85 | `
 86 | 
 87 | func main() {
 88 |     opts := &bitcask.Options{
 89 |         Dir:                       "./bitcaskDB",
 90 |         WalMaxSize:                1024 * 1024 * 1024, // 1GB
 91 |         ManifestMaxSize:           1024 * 1024, // 1MB
 92 |         IndexCapacity:             10000000, // 10 million
 93 |         IndexLimited:              8000000,
 94 |         IndexEvictionPoolCapacity: 32,
 95 |         IndexSampleKeys:           5,
 96 |         DiskUsageLimited:          1024 * 1024 * 1024 * 100, // 100GB
 97 |         NsSize:                    DefaultNsSize,
 98 |         EtagSize:                  DefaultEtagSize,
 99 |     }
100 | 
101 |     db, err := bitcask.NewDB(opts)
102 |     if err != nil {
103 |         panic(err)
104 |     }
105 |     defer func() {
106 |         _ = db.Close()
107 |     }()
108 | 
109 |     ns := GenSha1NS("ns") // fixed-size ns
110 |     key := []byte("testKey")
111 |     value := []byte(data)
112 |     now := uint64(db.WallTime().Unix())
113 | 
114 |     // customized metadata
115 |     appMeta := make(map[string]string)
116 |     appMeta["type"] = "html"
117 |     meta := NewMeta(appMeta).SetExpire(now+60).SetEtag(GenSha1Etag(value))
118 | 
119 |     // set a key
120 |     err = db.Put(ns, key, value, meta, &WriteOptions{})
121 |     if err != nil {
122 |         panic(err)
123 |     }
124 | 
125 |     // get a key
126 |     readVal, readMeta, err := db.Get(ns, key, &ReadOptions{})
127 |     if err != nil {
128 |         panic(err)
129 |     }
130 | 
131 |     println(readVal)
132 |     println(readMeta)
133 | 
134 |     // delete a key
135 |     err = db.Delete(ns, key, &WriteOptions{})
136 |     if err != nil {
137 |         panic(err)
138 |     }
139 | }
140 | ```
141 | 
142 | If you want to simply use a database CRUD http server, consider this [repository](https://github.com/wenzhang-dev/bitcaskDB-server).
143 | 
144 | The http server runs as a docker container. By the way, the overhead of reading and writing bitcaskDB is negligible compared to the overhead of network communication.
145 | 
146 | 
147 | # Benchmark
148 | 
149 | Here are the benchmarks for reading and writing 4KB:
150 | 
151 | ```
152 | go test -bench=PutGet -benchtime=60s -count=3 -timeout=50m
153 | goos: linux
154 | goarch: amd64
155 | pkg: github.com/wenzhang-dev/bitcaskDB/bench
156 | cpu: Intel(R) Xeon(R) Gold 5318N CPU @ 2.10GHz
157 | BenchmarkPutGet/put4K-8                  5331782   25259 ns/op   11795 B/op   21 allocs/op
158 | BenchmarkPutGet/put4K-8                  5130870   25417 ns/op   11767 B/op   21 allocs/op
159 | BenchmarkPutGet/put4K-8                  4898403   26676 ns/op   11742 B/op   21 allocs/op
160 | BenchmarkPutGet/batchPut4K-8            10548615   15340 ns/op    1695 B/op   11 allocs/op
161 | BenchmarkPutGet/batchPut4K-8             9220388   14278 ns/op    1694 B/op   11 allocs/op
162 | BenchmarkPutGet/batchPut4K-8            10363459   15019 ns/op    1686 B/op   11 allocs/op
163 | BenchmarkPutGet/get4K-8                  8812342    8076 ns/op   10119 B/op   10 allocs/op
164 | BenchmarkPutGet/get4K-8                  7963098    7952 ns/op   10119 B/op   10 allocs/op
165 | BenchmarkPutGet/get4K-8                  8480240    7997 ns/op   10119 B/op   10 allocs/op
166 | BenchmarkPutGet/concurrentGet4K-8       17233309    4427 ns/op   10044 B/op    7 allocs/op
167 | BenchmarkPutGet/concurrentGet4K-8       26745726    3681 ns/op   10044 B/op    7 allocs/op
168 | BenchmarkPutGet/concurrentGet4K-8       29305041    3654 ns/op   10044 B/op    7 allocs/op
169 | BenchmarkPutGet/concurrentPut4K-8        4558645   19829 ns/op    8340 B/op   18 allocs/op
170 | BenchmarkPutGet/concurrentPut4K-8        4433334   18664 ns/op   10031 B/op   18 allocs/op
171 | BenchmarkPutGet/concurrentPut4K-8        4366149   17031 ns/op    8175 B/op   17 allocs/op
172 | BenchmarkPutGet/concurrentBatchPut4K-8   9443377   12520 ns/op    1527 B/op    9 allocs/op
173 | BenchmarkPutGet/concurrentBatchPut4K-8  11338162   12429 ns/op    1517 B/op    9 allocs/op
174 | BenchmarkPutGet/concurrentBatchPut4K-8  11394081   12101 ns/op    1510 B/op    9 allocs/op
175 | PASS
176 | ok   github.com/wenzhang-dev/bitcaskDB/bench 2310.401s
177 | ```
178 | 
179 | Here, several popular KV storage engines are tested for reading and writing 4KB, and their RSS usages are recorded.
180 | The repository for this benchmark is: [codebase](https://github.com/wenzhang-dev/bitcaskDB-benchmark)
181 | 
182 | ```shell
183 | go test -bench=Read -benchtime=60s -timeout=30m -count=3
184 | goos: linux
185 | goarch: amd64
186 | pkg: github.com/wenzhang-dev/bitcaskDB-benchmark
187 | cpu: Intel(R) Xeon(R) Gold 5318N CPU @ 2.10GHz
188 | BenchmarkReadWithBitcaskDB/read4K-8  11459024   6313 ns/op  1.217 AvgRSS(GB)  1.275 PeakRSS(GB)  10120 B/op  10 allocs/op
189 | BenchmarkReadWithBitcaskDB/read4K-8  12512324   6522 ns/op  1.220 AvgRSS(GB)  1.234 PeakRSS(GB)  10120 B/op  10 allocs/op
190 | BenchmarkReadWithBitcaskDB/read4K-8  12414660   6468 ns/op  1.206 AvgRSS(GB)  1.231 PeakRSS(GB)  10120 B/op  10 allocs/op
191 | BenchmarkReadWithBadger/read4K-8      4575487  13526 ns/op  2.716 AvgRSS(GB)  4.350 PeakRSS(GB)  19416 B/op  43 allocs/op
192 | BenchmarkReadWithBadger/read4K-8      4960239  13741 ns/op  1.629 AvgRSS(GB)  1.681 PeakRSS(GB)  19406 B/op  43 allocs/op
193 | BenchmarkReadWithBadger/read4K-8      4851144  14429 ns/op  1.591 AvgRSS(GB)  1.650 PeakRSS(GB)  19422 B/op  44 allocs/op
194 | BenchmarkReadWithLevelDB/read4K-8     1569663  50710 ns/op  0.111 AvgRSS(GB)  0.134 PeakRSS(GB)  55021 B/op  35 allocs/op
195 | BenchmarkReadWithLevelDB/read4K-8     1000000  63066 ns/op  0.113 AvgRSS(GB)  0.129 PeakRSS(GB)  54264 B/op  35 allocs/op
196 | BenchmarkReadWithLevelDB/read4K-8     1236408  57268 ns/op  0.114 AvgRSS(GB)  0.138 PeakRSS(GB)  54624 B/op  35 allocs/op
197 | BenchmarkReadWithBoltDB/read4K-8     12587562   5269 ns/op  5.832 AvgRSS(GB)  5.838 PeakRSS(GB)    832 B/op  13 allocs/op
198 | BenchmarkReadWithBoltDB/read4K-8     16920481   4482 ns/op  5.832 AvgRSS(GB)  5.833 PeakRSS(GB)    832 B/op  13 allocs/op
199 | BenchmarkReadWithBoltDB/read4K-8     19141418   5276 ns/op  5.832 AvgRSS(GB)  5.835 PeakRSS(GB)    832 B/op  13 allocs/op
200 | PASS
201 | ok   github.com/wenzhang-dev/bitcaskDB-benchmark 1475.172s
202 | ```
203 | 
204 | 
205 | ```shell
206 | go test -bench=Write -benchtime=60s -timeout=30m -count=3
207 | goos: linux
208 | goarch: amd64
209 | pkg: github.com/wenzhang-dev/bitcaskDB-benchmark
210 | cpu: Intel(R) Xeon(R) Gold 5318N CPU @ 2.10GHz
211 | BenchmarkWriteWithBitcaskDB/write4K-8  8334304  13217 ns/op  0.7905 AvgRSS(GB)   0.934 PeakRSS(GB)    1666 B/op   11 allocs/op
212 | BenchmarkWriteWithBitcaskDB/write4K-8  5323338  14976 ns/op  0.9732 AvgRSS(GB)   1.058 PeakRSS(GB)    1727 B/op   12 allocs/op
213 | BenchmarkWriteWithBitcaskDB/write4K-8  5435398  13929 ns/op  0.9639 AvgRSS(GB)   1.122 PeakRSS(GB)    1756 B/op   12 allocs/op
214 | BenchmarkWriteWithLevelDB/write4K-8    1047753  68691 ns/op  0.0615 AvgRSS(GB)  0.0636 PeakRSS(GB)    2946 B/op   16 allocs/op
215 | BenchmarkWriteWithLevelDB/write4K-8    1179555  71497 ns/op  0.0617 AvgRSS(GB)  0.0634 PeakRSS(GB)    3250 B/op   18 allocs/op
216 | BenchmarkWriteWithLevelDB/write4K-8     992488  74130 ns/op  0.0613 AvgRSS(GB)  0.0625 PeakRSS(GB)    3444 B/op   19 allocs/op
217 | BenchmarkWriteWithBadger/write4K-8     3776720  20036 ns/op   6.409 AvgRSS(GB)   7.534 PeakRSS(GB)   30062 B/op   68 allocs/op
218 | BenchmarkWriteWithBadger/write4K-8     4106070  50959 ns/op   10.77 AvgRSS(GB)   13.63 PeakRSS(GB)  115442 B/op  152 allocs/op
219 | BenchmarkWriteWithBadger/write4K-8     1491906  49955 ns/op   11.45 AvgRSS(GB)   13.72 PeakRSS(GB)   88941 B/op  130 allocs/op
220 | BenchmarkWriteWithBoltDB/write4K-8     2808206  23131 ns/op   0.626 AvgRSS(GB)   0.999 PeakRSS(GB)    7579 B/op   11 allocs/op
221 | BenchmarkWriteWithBoltDB/write4K-8     4303538  22836 ns/op   1.713 AvgRSS(GB)   2.971 PeakRSS(GB)    7765 B/op   11 allocs/op
222 | BenchmarkWriteWithBoltDB/write4K-8     3755002  19385 ns/op   2.481 AvgRSS(GB)   2.872 PeakRSS(GB)    7896 B/op   12 allocs/op
223 | PASS
224 | ok   github.com/wenzhang-dev/bitcaskDB-benchmark 1541.068s
225 | ```
226 | 
227 | The benchmarks with specified disk capacity: [benchmark2](https://github.com/wenzhang-dev/bitcaskDB/blob/main/bench/benchmark2)
228 | 


--------------------------------------------------------------------------------
/compaction_test.go:
--------------------------------------------------------------------------------
  1 | package bitcask
  2 | 
  3 | import (
  4 | 	"strconv"
  5 | 	"testing"
  6 | 	"time"
  7 | 
  8 | 	"github.com/stretchr/testify/assert"
  9 | )
 10 | 
 11 | func TestCompaction_OneFullRewriteWal(t *testing.T) {
 12 | 	db := setupDB(t)
 13 | 	defer teardownDB(db)
 14 | 
 15 | 	// the wal capacity is 1MB
 16 | 	// one wal can store up to 256 elements
 17 | 	meta := NewMeta(nil)
 18 | 	bin4K := GenNKBytes(4)
 19 | 	ns := sha1Bytes("compaction")
 20 | 	opts := &WriteOptions{}
 21 | 
 22 | 	for i := 0; i < 100; i++ {
 23 | 		key := sha1Bytes(strconv.Itoa(i))
 24 | 		err := db.Put(ns[:], key[:], bin4K, meta, opts)
 25 | 		assert.Nil(t, err)
 26 | 	}
 27 | 
 28 | 	// only one wal
 29 | 	assert.Equal(t, len(db.manifest.wals), 1)
 30 | 
 31 | 	// manual rotate wal
 32 | 	_, err := db.manifest.RotateWal()
 33 | 	assert.Nil(t, err)
 34 | 	assert.Equal(t, len(db.manifest.wals), 2)
 35 | 
 36 | 	// re-write the data in active
 37 | 	// the data in the previous wal should be evicted
 38 | 	for i := 0; i < 100; i++ {
 39 | 		key := sha1Bytes(strconv.Itoa(i))
 40 | 		err := db.Put(ns[:], key[:], bin4K, meta, opts)
 41 | 		assert.Nil(t, err)
 42 | 	}
 43 | 
 44 | 	// reach here: only two wals
 45 | 	// manual trigger compaction
 46 | 	db.maybeScheduleCompaction()
 47 | 	assert.True(t, db.compacting.Load())
 48 | 
 49 | 	// wait up to 10 seconds
 50 | 	waitTimes := 0
 51 | 	for {
 52 | 		if !db.compacting.Load() {
 53 | 			break
 54 | 		}
 55 | 
 56 | 		waitTimes++
 57 | 		if waitTimes > 10 {
 58 | 			break
 59 | 		}
 60 | 		time.Sleep(time.Second)
 61 | 	}
 62 | 
 63 | 	// the compaction should be finished
 64 | 	assert.True(t, waitTimes <= 10)
 65 | 	assert.False(t, db.compacting.Load())
 66 | 
 67 | 	// after compaction, only one wal lefts
 68 | 	assert.Equal(t, len(db.manifest.wals), 1)
 69 | 
 70 | 	// after compaction, we can get the data
 71 | 	for i := 0; i < 100; i++ {
 72 | 		key := sha1Bytes(strconv.Itoa(i))
 73 | 		readVal, _, err := db.Get(ns[:], key[:], &ReadOptions{})
 74 | 
 75 | 		assert.Nil(t, err)
 76 | 		assert.Equal(t, bin4K, readVal)
 77 | 	}
 78 | }
 79 | 
 80 | func TestCompaction_OneNonFullRewriteWal(t *testing.T) {
 81 | 	db := setupDB(t)
 82 | 	defer teardownDB(db)
 83 | 
 84 | 	// the wal capacity is 1MB
 85 | 	// one wal can store up to 256 elements
 86 | 	meta := NewMeta(nil)
 87 | 	bin4K := GenNKBytes(4)
 88 | 	ns := sha1Bytes("compaction")
 89 | 	opts := &WriteOptions{}
 90 | 
 91 | 	for i := 0; i < 100; i++ {
 92 | 		key := sha1Bytes(strconv.Itoa(i))
 93 | 		err := db.Put(ns[:], key[:], bin4K, meta, opts)
 94 | 		assert.Nil(t, err)
 95 | 	}
 96 | 
 97 | 	// only one wal
 98 | 	assert.Equal(t, len(db.manifest.wals), 1)
 99 | 
100 | 	// manual rotate wal
101 | 	_, err := db.manifest.RotateWal()
102 | 	assert.Nil(t, err)
103 | 	assert.Equal(t, len(db.manifest.wals), 2)
104 | 
105 | 	// re-write the data in active
106 | 	for i := 0; i < 90; i++ {
107 | 		key := sha1Bytes(strconv.Itoa(i))
108 | 		err := db.Put(ns[:], key[:], bin4K, meta, opts)
109 | 		assert.Nil(t, err)
110 | 	}
111 | 
112 | 	// reach here: only two wals
113 | 	// manual trigger compaction
114 | 	db.maybeScheduleCompaction()
115 | 	assert.True(t, db.compacting.Load())
116 | 
117 | 	// wait up to 10 seconds
118 | 	waitTimes := 0
119 | 	for {
120 | 		if !db.compacting.Load() {
121 | 			break
122 | 		}
123 | 
124 | 		waitTimes++
125 | 		if waitTimes > 10 {
126 | 			break
127 | 		}
128 | 		time.Sleep(time.Second)
129 | 	}
130 | 
131 | 	// the compaction should be finished
132 | 	assert.True(t, waitTimes <= 10)
133 | 	assert.False(t, db.compacting.Load())
134 | 
135 | 	// after compaction, two wals left
136 | 	assert.Equal(t, len(db.manifest.wals), 2)
137 | 
138 | 	// after compaction, we can get the data
139 | 	for i := 0; i < 100; i++ {
140 | 		key := sha1Bytes(strconv.Itoa(i))
141 | 		readVal, _, err := db.Get(ns[:], key[:], &ReadOptions{})
142 | 
143 | 		assert.Nil(t, err)
144 | 		assert.Equal(t, bin4K, readVal)
145 | 	}
146 | }
147 | 
148 | func TestCompaction_TwoFullRewriteWals(t *testing.T) {
149 | 	db := setupDB(t)
150 | 	defer teardownDB(db)
151 | 
152 | 	// the wal capacity is 1MB
153 | 	// one wal can store up to 256 elements
154 | 	meta := NewMeta(nil)
155 | 	bin4K := GenNKBytes(4)
156 | 	ns := sha1Bytes("compaction")
157 | 	opts := &WriteOptions{}
158 | 
159 | 	for i := 0; i < 100; i++ {
160 | 		key := sha1Bytes(strconv.Itoa(i))
161 | 		err := db.Put(ns[:], key[:], bin4K, meta, opts)
162 | 		assert.Nil(t, err)
163 | 	}
164 | 
165 | 	// only one wal
166 | 	assert.Equal(t, len(db.manifest.wals), 1)
167 | 
168 | 	// manual rotate wal
169 | 	_, err := db.manifest.RotateWal()
170 | 	assert.Nil(t, err)
171 | 	assert.Equal(t, len(db.manifest.wals), 2)
172 | 
173 | 	// re-write the data in active
174 | 	for i := 0; i < 100; i++ {
175 | 		key := sha1Bytes(strconv.Itoa(i))
176 | 		err := db.Put(ns[:], key[:], bin4K, meta, opts)
177 | 		assert.Nil(t, err)
178 | 	}
179 | 
180 | 	// manual rorate wal again
181 | 	_, err = db.manifest.RotateWal()
182 | 	assert.Nil(t, err)
183 | 	assert.Equal(t, len(db.manifest.wals), 3)
184 | 
185 | 	// re-write the data in active
186 | 	for i := 0; i < 100; i++ {
187 | 		key := sha1Bytes(strconv.Itoa(i))
188 | 		err := db.Put(ns[:], key[:], bin4K, meta, opts)
189 | 		assert.Nil(t, err)
190 | 	}
191 | 
192 | 	// reach here: only three wals
193 | 	// manual trigger compaction
194 | 	db.maybeScheduleCompaction()
195 | 	assert.True(t, db.compacting.Load())
196 | 
197 | 	// wait up to 10 seconds
198 | 	waitTimes := 0
199 | 	for {
200 | 		if !db.compacting.Load() {
201 | 			break
202 | 		}
203 | 
204 | 		waitTimes++
205 | 		if waitTimes > 10 {
206 | 			break
207 | 		}
208 | 		time.Sleep(time.Second)
209 | 	}
210 | 
211 | 	// the compaction should be finished
212 | 	assert.True(t, waitTimes <= 10)
213 | 	assert.False(t, db.compacting.Load())
214 | 
215 | 	// after compaction, only one wal lefts
216 | 	assert.Equal(t, len(db.manifest.wals), 1)
217 | 
218 | 	// after compaction, we can get the data
219 | 	for i := 0; i < 100; i++ {
220 | 		key := sha1Bytes(strconv.Itoa(i))
221 | 		readVal, _, err := db.Get(ns[:], key[:], &ReadOptions{})
222 | 
223 | 		assert.Nil(t, err)
224 | 		assert.Equal(t, bin4K, readVal)
225 | 	}
226 | }
227 | 
228 | func TestCompaction_TwoNonFullRewriteWals(t *testing.T) {
229 | 	db := setupDB(t)
230 | 	defer teardownDB(db)
231 | 
232 | 	// the wal capacity is 1MB
233 | 	// one wal can store up to 256 elements
234 | 	meta := NewMeta(nil)
235 | 	bin4K := GenNKBytes(4)
236 | 	ns := sha1Bytes("compaction")
237 | 	opts := &WriteOptions{}
238 | 
239 | 	for i := 0; i < 100; i++ {
240 | 		key := sha1Bytes(strconv.Itoa(i))
241 | 		err := db.Put(ns[:], key[:], bin4K, meta, opts)
242 | 		assert.Nil(t, err)
243 | 	}
244 | 
245 | 	// only one wal
246 | 	assert.Equal(t, len(db.manifest.wals), 1)
247 | 
248 | 	// manual rotate wal
249 | 	_, err := db.manifest.RotateWal()
250 | 	assert.Nil(t, err)
251 | 	assert.Equal(t, len(db.manifest.wals), 2)
252 | 
253 | 	// re-write the data in active
254 | 	for i := 0; i < 90; i++ {
255 | 		key := sha1Bytes(strconv.Itoa(i))
256 | 		err := db.Put(ns[:], key[:], bin4K, meta, opts)
257 | 		assert.Nil(t, err)
258 | 	}
259 | 
260 | 	// manual rorate wal again
261 | 	_, err = db.manifest.RotateWal()
262 | 	assert.Nil(t, err)
263 | 	assert.Equal(t, len(db.manifest.wals), 3)
264 | 
265 | 	// re-write the data in active
266 | 	for i := 0; i < 90; i++ {
267 | 		key := sha1Bytes(strconv.Itoa(i))
268 | 		err := db.Put(ns[:], key[:], bin4K, meta, opts)
269 | 		assert.Nil(t, err)
270 | 	}
271 | 
272 | 	// reach here: only three wals
273 | 	// manual trigger compaction
274 | 	db.maybeScheduleCompaction()
275 | 	assert.True(t, db.compacting.Load())
276 | 
277 | 	// wait up to 10 seconds
278 | 	waitTimes := 0
279 | 	for {
280 | 		if !db.compacting.Load() {
281 | 			break
282 | 		}
283 | 
284 | 		waitTimes++
285 | 		if waitTimes > 10 {
286 | 			break
287 | 		}
288 | 		time.Sleep(time.Second)
289 | 	}
290 | 
291 | 	// the compaction should be finished
292 | 	assert.True(t, waitTimes <= 10)
293 | 	assert.False(t, db.compacting.Load())
294 | 
295 | 	// after compaction, two wals left
296 | 	assert.Equal(t, len(db.manifest.wals), 2)
297 | 
298 | 	// after compaction, we can get the data
299 | 	for i := 0; i < 100; i++ {
300 | 		key := sha1Bytes(strconv.Itoa(i))
301 | 		readVal, _, err := db.Get(ns[:], key[:], &ReadOptions{})
302 | 
303 | 		assert.Nil(t, err)
304 | 		assert.Equal(t, bin4K, readVal)
305 | 	}
306 | }
307 | 
308 | func TestCompaction_TwoNonFullRewriteWals2(t *testing.T) {
309 | 	db := setupDB(t)
310 | 	defer teardownDB(db)
311 | 
312 | 	// the wal capacity is 1MB
313 | 	// one wal can store up to 256 elements
314 | 	meta := NewMeta(nil)
315 | 	bin4K := GenNKBytes(4)
316 | 	ns := sha1Bytes("compaction")
317 | 	opts := &WriteOptions{}
318 | 
319 | 	for i := 0; i < 100; i++ {
320 | 		key := sha1Bytes(strconv.Itoa(i))
321 | 		err := db.Put(ns[:], key[:], bin4K, meta, opts)
322 | 		assert.Nil(t, err)
323 | 	}
324 | 
325 | 	// only one wal
326 | 	assert.Equal(t, len(db.manifest.wals), 1)
327 | 
328 | 	// manual rotate wal
329 | 	_, err := db.manifest.RotateWal()
330 | 	assert.Nil(t, err)
331 | 	assert.Equal(t, len(db.manifest.wals), 2)
332 | 
333 | 	// re-write the data in active
334 | 	bin2K := GenNKBytes(2)
335 | 	for i := 0; i < 100; i++ {
336 | 		// skip half elements
337 | 		if i%2 == 0 {
338 | 			continue
339 | 		}
340 | 
341 | 		key := sha1Bytes(strconv.Itoa(i))
342 | 		err := db.Put(ns[:], key[:], bin2K, meta, opts)
343 | 		assert.Nil(t, err)
344 | 	}
345 | 
346 | 	// manual rorate wal again
347 | 	_, err = db.manifest.RotateWal()
348 | 	assert.Nil(t, err)
349 | 	assert.Equal(t, len(db.manifest.wals), 3)
350 | 
351 | 	// reach here: only three wals
352 | 	// manual trigger compaction
353 | 	db.maybeScheduleCompaction()
354 | 	assert.True(t, db.compacting.Load())
355 | 
356 | 	// wait up to 10 seconds
357 | 	waitTimes := 0
358 | 	for {
359 | 		if !db.compacting.Load() {
360 | 			break
361 | 		}
362 | 
363 | 		waitTimes++
364 | 		if waitTimes > 10 {
365 | 			break
366 | 		}
367 | 		time.Sleep(time.Second)
368 | 	}
369 | 
370 | 	// the compaction should be finished
371 | 	assert.True(t, waitTimes <= 10)
372 | 	assert.False(t, db.compacting.Load())
373 | 
374 | 	// after compaction, three wals left
375 | 	assert.Equal(t, len(db.manifest.wals), 3)
376 | 
377 | 	// check the data
378 | 	for i := 0; i < 100; i++ {
379 | 		key := sha1Bytes(strconv.Itoa(i))
380 | 		readVal, _, err := db.Get(ns[:], key[:], &ReadOptions{})
381 | 		assert.Nil(t, err)
382 | 
383 | 		if i%2 == 0 {
384 | 			assert.Equal(t, bin4K, readVal)
385 | 		} else {
386 | 			assert.Equal(t, bin2K, readVal)
387 | 		}
388 | 	}
389 | }
390 | 
391 | func TestCompaction_ReclaimDiskUsage(t *testing.T) {
392 | 	db := setupDB(t)
393 | 	defer teardownDB(db)
394 | 
395 | 	// the wal capacity is 1MB
396 | 	// one wal can store up to 256 elements
397 | 	meta := NewMeta(nil)
398 | 	bin4K := GenNKBytes(4)
399 | 	ns := sha1Bytes("compaction")
400 | 	opts := &WriteOptions{}
401 | 
402 | 	for i := 0; i < 100; i++ {
403 | 		key := sha1Bytes(strconv.Itoa(i))
404 | 		err := db.Put(ns[:], key[:], bin4K, meta, opts)
405 | 		assert.Nil(t, err)
406 | 	}
407 | 
408 | 	// only one wal
409 | 	assert.Equal(t, len(db.manifest.wals), 1)
410 | 
411 | 	// manual rotate wal
412 | 	_, err := db.manifest.RotateWal()
413 | 	assert.Nil(t, err)
414 | 	assert.Equal(t, len(db.manifest.wals), 2)
415 | 
416 | 	// trigger reclaim disk usage
417 | 	expect := int64(db.manifest.ActiveWal().Size() - 1)
418 | 	db.reclaimDiskUsage(expect)
419 | 
420 | 	// one wal has been removed
421 | 	assert.Equal(t, len(db.manifest.wals), 1)
422 | }
423 | 


--------------------------------------------------------------------------------
/map.go:
--------------------------------------------------------------------------------
  1 | package bitcask
  2 | 
  3 | import (
  4 | 	"errors"
  5 | 	"sort"
  6 | 	"sync"
  7 | 	"time"
  8 | )
  9 | 
 10 | var ErrMapOptions = errors.New("invalid map options")
 11 | 
 12 | // the reason why we use the specific hashtable implementation:
 13 | // - it's hard to control golang map capacity and disable autoscale
 14 | // - support any type as hashable key
 15 | // - support the customized hash function
 16 | //
 17 | // comparation:
 18 | // - for open addressing hashtable, delete marker makes query performance decrease.
 19 | //   so it's not suitable for frequent deletion
 20 | // - for linked-base hashtable, memory overhead is larger because of more pointers,
 21 | //   which is less cpu cache-friendly
 22 | 
 23 | type MapOperatorBase interface {
 24 | 	// used to generate random slot
 25 | 	Rand(uint64) uint64
 26 | 
 27 | 	// second level clock
 28 | 	WallTime() time.Time
 29 | }
 30 | 
 31 | type MapOperator[K any] interface {
 32 | 	MapOperatorBase
 33 | 
 34 | 	// used to map key to slot
 35 | 	Hash(key *K) uint64
 36 | 
 37 | 	// used to compare key equalization
 38 | 	Equals(lhs, rhs *K) bool
 39 | }
 40 | 
 41 | type Bucket[K any, V any] struct {
 42 | 	key    *K
 43 | 	val    *V
 44 | 	next   *Bucket[K, V]
 45 | 	expire uint32
 46 | }
 47 | 
 48 | // the eviction poll is fixed size, which store some keys of random buckets
 49 | // when the number of entries in map reaches the limit, it will trigger eviction
 50 | // the sample random keys will be added to this eviction pool, and evict the
 51 | // entry of minimum expire
 52 | //
 53 | // notes: the eviction is based on the expire value. but the cached key of
 54 | // pool maybe be updated, the expire maybe not accurate
 55 | // we allow this scenario, and the map only is an approximate LRU
 56 | type EvictionPoolEntry[K any] struct {
 57 | 	slot   uint64
 58 | 	key    *K
 59 | 	expire uint32
 60 | }
 61 | 
 62 | // it's thread-safe
 63 | type SimpleMap[K any, V any] struct {
 64 | 	optr     MapOperator[K]
 65 | 	capacity uint64
 66 | 
 67 | 	// the length and capacity of this slice are fixed
 68 | 	// it reduce the overhead of rehash
 69 | 	buckets []Bucket[K, V]
 70 | 
 71 | 	// the actual used number of buckets
 72 | 	used uint64
 73 | 
 74 | 	// the maximum number of used buckets
 75 | 	// the prefer limited is less than capacity * 0.75, and the reservation
 76 | 	// enhance hashtable performance when hash collision
 77 | 	limited uint64
 78 | 
 79 | 	// the eviction pool for fixed size
 80 | 	evictPoolSize     uint64
 81 | 	evictPoolCapacity uint64
 82 | 	sampleKeys        uint64
 83 | 	evictPool         []EvictionPoolEntry[K]
 84 | 
 85 | 	// all expire is relative time of initial time
 86 | 	initTime time.Time
 87 | 
 88 | 	mu sync.Mutex
 89 | }
 90 | 
 91 | type MapOptions struct {
 92 | 	// the number of bucket
 93 | 	Capacity uint64
 94 | 
 95 | 	// the number of elements
 96 | 	Limited uint64
 97 | 
 98 | 	EvictionPoolCapacity uint64
 99 | 	SampleKeys           uint64
100 | }
101 | 
102 | func (opt *MapOptions) validate() error {
103 | 	if opt.Limited > opt.Capacity {
104 | 		return ErrMapOptions
105 | 	}
106 | 
107 | 	if opt.EvictionPoolCapacity > opt.Limited {
108 | 		return ErrMapOptions
109 | 	}
110 | 
111 | 	if opt.SampleKeys < 1 {
112 | 		return ErrMapOptions
113 | 	}
114 | 
115 | 	if opt.EvictionPoolCapacity < 16 {
116 | 		return ErrMapOptions
117 | 	}
118 | 
119 | 	return nil
120 | }
121 | 
122 | func NewMap[K any, V any](optr MapOperator[K], opts *MapOptions) (*SimpleMap[K, V], error) {
123 | 	if err := opts.validate(); err != nil {
124 | 		return nil, err
125 | 	}
126 | 
127 | 	return &SimpleMap[K, V]{
128 | 		optr:              optr,
129 | 		capacity:          opts.Capacity,
130 | 		buckets:           make([]Bucket[K, V], opts.Capacity),
131 | 		used:              0,
132 | 		limited:           opts.Limited,
133 | 		evictPoolCapacity: opts.EvictionPoolCapacity,
134 | 		evictPool:         make([]EvictionPoolEntry[K], opts.EvictionPoolCapacity),
135 | 		evictPoolSize:     0,
136 | 		sampleKeys:        opts.SampleKeys,
137 | 		initTime:          optr.WallTime(),
138 | 	}, nil
139 | }
140 | 
141 | func (m *SimpleMap[K, V]) Size() uint64 {
142 | 	return m.used
143 | }
144 | 
145 | func (m *SimpleMap[K, V]) Capacity() uint64 {
146 | 	return m.capacity
147 | }
148 | 
149 | func (m *SimpleMap[K, V]) genExpire() uint32 {
150 | 	now := m.optr.WallTime()
151 | 	if now.Before(m.initTime) {
152 | 		return 0
153 | 	}
154 | 
155 | 	return uint32(now.Sub(m.initTime).Seconds())
156 | }
157 | 
158 | // the Set method should always work and return nil
159 | // and it will return the previous value
160 | func (m *SimpleMap[K, V]) Set(key *K, value *V) (*V, error) {
161 | 	slot := m.optr.Hash(key)
162 | 	return m.setWithSlot(key, value, slot)
163 | }
164 | 
165 | func (m *SimpleMap[K, V]) setWithSlot(key *K, value *V, slot uint64) (*V, error) {
166 | 	var old *V
167 | 	var err error
168 | 	var entry *Bucket[K, V]
169 | 
170 | 	slot %= m.capacity
171 | 
172 | 	m.mu.Lock()
173 | 	defer m.mu.Unlock()
174 | 
175 | 	entry, err = m.getEntryWithSlot(key, slot)
176 | 	if err == nil {
177 | 		// found the key
178 | 		old = entry.val
179 | 
180 | 		entry.val = value
181 | 		entry.expire = m.genExpire()
182 | 		return old, nil
183 | 	}
184 | 
185 | 	if m.used+1 > m.limited {
186 | 		old = m.evict()
187 | 	}
188 | 
189 | 	// insert always wokrs
190 | 	m.used++
191 | 
192 | 	// empty slot
193 | 	if m.buckets[slot].key == nil {
194 | 		m.buckets[slot].key = key
195 | 		m.buckets[slot].val = value
196 | 		m.buckets[slot].expire = m.genExpire()
197 | 		return old, nil
198 | 	}
199 | 
200 | 	// insert slot at head
201 | 	entry = &Bucket[K, V]{
202 | 		key:    key,
203 | 		val:    value,
204 | 		next:   m.buckets[slot].next,
205 | 		expire: m.genExpire(),
206 | 	}
207 | 
208 | 	m.buckets[slot].next = entry
209 | 	return old, nil
210 | }
211 | 
212 | func (m *SimpleMap[K, V]) Get(key *K) (*V, error) {
213 | 	slot := m.optr.Hash(key)
214 | 	return m.getWithSlot(key, slot)
215 | }
216 | 
217 | func (m *SimpleMap[K, V]) getWithSlot(key *K, slot uint64) (*V, error) {
218 | 	m.mu.Lock()
219 | 	defer m.mu.Unlock()
220 | 
221 | 	entry, err := m.getEntryWithSlot(key, slot)
222 | 	if err != nil {
223 | 		return nil, err
224 | 	}
225 | 
226 | 	return entry.val, nil
227 | }
228 | 
229 | func (m *SimpleMap[K, V]) getEntryWithSlot(key *K, slot uint64) (*Bucket[K, V], error) {
230 | 	slot %= m.capacity
231 | 	if m.buckets[slot].key == nil {
232 | 		return nil, ErrKeyNotFound
233 | 	}
234 | 
235 | 	entry := &m.buckets[slot]
236 | 	for entry != nil {
237 | 		if m.optr.Equals(key, entry.key) {
238 | 			return entry, nil
239 | 		}
240 | 
241 | 		entry = entry.next
242 | 	}
243 | 
244 | 	return nil, ErrKeyNotFound
245 | }
246 | 
247 | // delete the key and return the previous value
248 | func (m *SimpleMap[K, V]) Delete(key *K) (*V, error) {
249 | 	slot := m.optr.Hash(key)
250 | 	return m.deleteWithSlot(key, slot)
251 | }
252 | 
253 | func (m *SimpleMap[K, V]) deleteWithSlot(key *K, slot uint64) (*V, error) {
254 | 	m.mu.Lock()
255 | 	defer m.mu.Unlock()
256 | 
257 | 	return m.deleteWithSlotInternal(key, slot)
258 | }
259 | 
260 | func (m *SimpleMap[K, V]) deleteWithSlotInternal(key *K, slot uint64) (old *V, err error) {
261 | 	slot %= m.capacity
262 | 	if m.buckets[slot].key == nil {
263 | 		return nil, ErrKeyNotFound
264 | 	}
265 | 
266 | 	if m.optr.Equals(m.buckets[slot].key, key) {
267 | 		old = m.buckets[slot].val
268 | 
269 | 		if m.buckets[slot].next != nil {
270 | 			m.buckets[slot] = *m.buckets[slot].next
271 | 		} else {
272 | 			m.buckets[slot].key = nil
273 | 			m.buckets[slot].val = nil
274 | 		}
275 | 		m.used--
276 | 		return
277 | 	}
278 | 
279 | 	entry := &m.buckets[slot]
280 | 	for entry.next != nil {
281 | 		if m.optr.Equals(entry.next.key, key) {
282 | 			old = entry.next.val
283 | 
284 | 			entry.next = entry.next.next
285 | 			m.used--
286 | 			return
287 | 		}
288 | 		entry = entry.next
289 | 	}
290 | 
291 | 	return nil, ErrKeyNotFound
292 | }
293 | 
294 | func (m *SimpleMap[K, V]) insertEvictionEntry(entry EvictionPoolEntry[K]) {
295 | 	// ascending order by expire
296 | 	// find the upper bound position
297 | 	idx := sort.Search(int(m.evictPoolSize), func(i int) bool {
298 | 		return entry.expire < m.evictPool[i].expire
299 | 	})
300 | 
301 | 	// not found
302 | 	if idx == int(m.evictPoolSize) {
303 | 		idx = int(m.evictPoolSize - 1)
304 | 		if m.evictPoolSize != m.evictPoolCapacity {
305 | 			idx = int(m.evictPoolSize)
306 | 		}
307 | 	}
308 | 
309 | 	if m.evictPoolSize != m.evictPoolCapacity {
310 | 		m.evictPoolSize++
311 | 	}
312 | 
313 | 	// move [idx, size-1) to [idx+1, size)
314 | 	copy(m.evictPool[idx+1:], m.evictPool[idx:m.evictPoolSize-1])
315 | 	m.evictPool[idx] = entry
316 | }
317 | 
318 | // return the eviction entry
319 | func (m *SimpleMap[K, V]) evictMinExpireEntry() *V {
320 | 	var err error
321 | 	var old *V
322 | 	var pos uint64
323 | 
324 | 	for pos < m.evictPoolSize {
325 | 		key := m.evictPool[pos].key
326 | 		slot := m.evictPool[pos].slot
327 | 
328 | 		// ignore the deletion error
329 | 		if old, err = m.deleteWithSlotInternal(key, slot); err == nil {
330 | 			break
331 | 		}
332 | 
333 | 		pos++
334 | 	}
335 | 
336 | 	// remove the unused entries
337 | 	// the range [0, pos] will be removed
338 | 	copy(m.evictPool, m.evictPool[pos+1:m.evictPoolSize])
339 | 	m.evictPoolSize -= pos
340 | 
341 | 	return old
342 | }
343 | 
344 | // the eviction should always work and return the eviction value
345 | //
346 | // since each eviction adds up to the sample keys, at least one of the sample keys
347 | // is guaranteed to be evicted, even if all the keys in previous eviction pool are
348 | // removed
349 | func (m *SimpleMap[K, V]) evict() *V {
350 | 	sampleKeys := m.sampleKeys
351 | 	for sampleKeys > 0 {
352 | 		slot := m.optr.Rand(m.capacity)
353 | 		entry := &m.buckets[slot]
354 | 		if entry.key == nil {
355 | 			continue
356 | 		}
357 | 
358 | 		for sampleKeys > 0 && entry != nil {
359 | 			m.insertEvictionEntry(EvictionPoolEntry[K]{
360 | 				expire: entry.expire,
361 | 				key:    entry.key,
362 | 				slot:   slot,
363 | 			})
364 | 
365 | 			entry = entry.next
366 | 			sampleKeys--
367 | 		}
368 | 	}
369 | 
370 | 	return m.evictMinExpireEntry()
371 | }
372 | 
373 | const (
374 | 	MapShardNum = 16
375 | )
376 | 
377 | // it's thread-safe
378 | type ShardMap[K any, V any] struct {
379 | 	opts   *MapOptions
380 | 	optr   MapOperator[K]
381 | 	shards [MapShardNum]*SimpleMap[K, V]
382 | }
383 | 
384 | func NewShardMap[K any, V any](optr MapOperator[K], opts *MapOptions) (*ShardMap[K, V], error) {
385 | 	shardOpts := &MapOptions{
386 | 		Capacity:             opts.Capacity / MapShardNum,
387 | 		Limited:              opts.Limited / MapShardNum,
388 | 		EvictionPoolCapacity: opts.EvictionPoolCapacity,
389 | 		SampleKeys:           opts.SampleKeys,
390 | 	}
391 | 
392 | 	shardMap := &ShardMap[K, V]{
393 | 		opts: opts,
394 | 		optr: optr,
395 | 	}
396 | 
397 | 	var err error
398 | 	for idx := range shardMap.shards {
399 | 		shardMap.shards[idx], err = NewMap[K, V](optr, shardOpts)
400 | 		if err != nil {
401 | 			return nil, err
402 | 		}
403 | 	}
404 | 
405 | 	return shardMap, nil
406 | }
407 | 
408 | func (s *ShardMap[K, V]) Get(key *K) (*V, error) {
409 | 	slot := s.optr.Hash(key)
410 | 	shard := s.shards[slot%MapShardNum]
411 | 	return shard.getWithSlot(key, slot)
412 | }
413 | 
414 | func (s *ShardMap[K, V]) Set(key *K, value *V) (*V, error) {
415 | 	slot := s.optr.Hash(key)
416 | 	shard := s.shards[slot%MapShardNum]
417 | 	return shard.setWithSlot(key, value, slot)
418 | }
419 | 
420 | func (s *ShardMap[K, V]) Delete(key *K) (*V, error) {
421 | 	slot := s.optr.Hash(key)
422 | 	shard := s.shards[slot%MapShardNum]
423 | 	return shard.deleteWithSlot(key, slot)
424 | }
425 | 
426 | func (s *ShardMap[K, V]) Capacity() uint64 {
427 | 	return s.opts.Capacity
428 | }
429 | 


--------------------------------------------------------------------------------
/compaction.go:
--------------------------------------------------------------------------------
  1 | package bitcask
  2 | 
  3 | import (
  4 | 	"errors"
  5 | 	"os"
  6 | 	"slices"
  7 | 	"sort"
  8 | 	"sync"
  9 | )
 10 | 
 11 | type Compaction struct {
 12 | 	inputs []*Wal
 13 | 
 14 | 	output     *Wal
 15 | 	writer     *WalRewriter
 16 | 	hintWriter *HintWriter
 17 | 
 18 | 	edit *ManifestEdit
 19 | 
 20 | 	mu sync.RWMutex
 21 | }
 22 | 
 23 | func NewCompaction(inputs []*Wal, outputFid uint64) (*Compaction, error) {
 24 | 	dir := inputs[0].Dir()
 25 | 	baseTime := inputs[0].BaseTime()
 26 | 	deleteFiles := make([]LogFile, len(inputs))
 27 | 	for idx := range inputs {
 28 | 		inputs[idx].Ref()
 29 | 		baseTime = min(baseTime, inputs[idx].BaseTime())
 30 | 		deleteFiles[idx] = LogFile{
 31 | 			wal: inputs[idx],
 32 | 			fid: inputs[idx].Fid(),
 33 | 		}
 34 | 	}
 35 | 
 36 | 	outputWal, err := NewWal(MergePath(dir, outputFid), outputFid, int64(baseTime))
 37 | 	if err != nil {
 38 | 		return nil, err
 39 | 	}
 40 | 
 41 | 	writer, err := NewHintWriter(TmpPath(dir, outputFid), outputFid, int64(baseTime))
 42 | 	if err != nil {
 43 | 		return nil, err
 44 | 	}
 45 | 
 46 | 	// the compaction may generate an empty wal, which don't have to keep it
 47 | 	edit := &ManifestEdit{
 48 | 		addFiles:    nil,
 49 | 		deleteFiles: deleteFiles,
 50 | 		hasNextFid:  true,
 51 | 		nextFid:     outputFid + 1,
 52 | 	}
 53 | 
 54 | 	return &Compaction{
 55 | 		inputs:     inputs,
 56 | 		output:     outputWal,
 57 | 		hintWriter: writer,
 58 | 		writer:     NewWalRewriter(outputWal, 1024*1024), // 1MB
 59 | 		edit:       edit,
 60 | 	}, nil
 61 | }
 62 | 
 63 | func (c *Compaction) Finalize() error {
 64 | 	c.mu.Lock()
 65 | 	defer c.mu.Unlock()
 66 | 
 67 | 	var err error
 68 | 	if err = c.writer.Flush(); err != nil {
 69 | 		return err
 70 | 	}
 71 | 
 72 | 	if err = c.hintWriter.Flush(); err != nil {
 73 | 		return err
 74 | 	}
 75 | 
 76 | 	// corner case: empty output wal
 77 | 	// otherwise, we should add the output wal to manifest
 78 | 	if c.output.Empty() {
 79 | 		return nil
 80 | 	}
 81 | 
 82 | 	c.edit.addFiles = append(c.edit.addFiles, LogFile{
 83 | 		wal: c.output,
 84 | 		fid: c.output.Fid(),
 85 | 	})
 86 | 
 87 | 	walName := WalFilename(c.output.Fid())
 88 | 	hintName := HintFilename(c.output.Fid())
 89 | 
 90 | 	if err = c.output.Rename(walName); err != nil {
 91 | 		return err
 92 | 	}
 93 | 
 94 | 	return c.hintWriter.Wal().Rename(hintName)
 95 | }
 96 | 
 97 | func (c *Compaction) Destroy() {
 98 | 	c.mu.Lock()
 99 | 	defer c.mu.Unlock()
100 | 
101 | 	// corner case: empty hint file
102 | 	if c.hintWriter.Wal().Empty() {
103 | 		c.hintWriter.Wal().Unref()
104 | 	}
105 | 
106 | 	_ = c.hintWriter.Close()
107 | 	_ = c.writer.Close()
108 | 
109 | 	c.output.Unref()
110 | 
111 | 	for idx := range c.inputs {
112 | 		c.inputs[idx].Unref()
113 | 	}
114 | }
115 | 
116 | func (db *DBImpl) maybeScheduleCompaction() {
117 | 	if db.reclaiming.Load() {
118 | 		return
119 | 	}
120 | 
121 | 	if !db.compacting.CompareAndSwap(false, true) {
122 | 		return
123 | 	}
124 | 
125 | 	// only one reach here
126 | 
127 | 	db.mu.Lock()
128 | 	defer db.mu.Unlock()
129 | 
130 | 	if db.bgErr != nil {
131 | 		db.compacting.Store(false)
132 | 		return
133 | 	}
134 | 
135 | 	candidateWals := make([]PickerWalInfo, 0, len(db.manifest.wals))
136 | 	for fid := range db.manifest.wals {
137 | 		// skip the active wal
138 | 		if fid == db.manifest.active.fid {
139 | 			continue
140 | 		}
141 | 
142 | 		candidateWals = append(candidateWals, PickerWalInfo{
143 | 			Fid:        fid,
144 | 			WalSize:    db.manifest.wals[fid].wal.Size(),
145 | 			CreateTime: db.manifest.wals[fid].wal.CreateTime(),
146 | 			FreeBytes:  db.manifest.wals[fid].freeBytes + db.manifest.wals[fid].deltaFreeBytes,
147 | 		})
148 | 	}
149 | 
150 | 	filterdWals := db.opts.CompactionPicker(candidateWals)
151 | 	if len(filterdWals) == 0 {
152 | 		db.compacting.Store(false)
153 | 		return
154 | 	}
155 | 
156 | 	db.backgroundCompactionLocked(filterdWals)
157 | }
158 | 
159 | func (db *DBImpl) backgroundCompactionLocked(wals []uint64) {
160 | 	inputs := make([]*Wal, len(wals))
161 | 	for idx := range wals {
162 | 		inputs[idx] = db.manifest.wals[wals[idx]].wal
163 | 	}
164 | 
165 | 	fid := db.manifest.GenFid()
166 | 	compaction, err := NewCompaction(inputs, fid)
167 | 	if err != nil {
168 | 		db.bgErr = err
169 | 		db.compacting.Store(false)
170 | 		return
171 | 	}
172 | 
173 | 	db.compaction = compaction
174 | 
175 | 	db.logger.Info().Uints64("input wals", wals).Uint64("output wal", fid).Msg("new compaction")
176 | 
177 | 	// run compaction without any lock
178 | 	go db.doCompaction(compaction)
179 | }
180 | 
181 | func (db *DBImpl) doCompaction(compaction *Compaction) {
182 | 	var err error
183 | 
184 | 	defer func() {
185 | 		db.compacting.Store(false)
186 | 		compaction.Destroy()
187 | 	}()
188 | 
189 | 	if err = db.doCompactionWork(compaction); err == nil {
190 | 		db.logger.Info().Msg("compaction finished")
191 | 		return
192 | 	}
193 | 
194 | 	db.mu.Lock()
195 | 	defer db.mu.Unlock()
196 | 	db.bgErr = err
197 | 
198 | 	db.logger.Err(err).Msg("failed compaction")
199 | }
200 | 
201 | func (db *DBImpl) doCompactionWork(compaction *Compaction) error {
202 | 	var err error
203 | 	for idx := range compaction.inputs {
204 | 		if err = db.compactOneWal(
205 | 			compaction.writer, compaction.hintWriter, compaction.inputs[idx],
206 | 		); err != nil {
207 | 			return err
208 | 		}
209 | 
210 | 		db.logger.Info().Uint64("wal", compaction.inputs[idx].Fid()).Msg("part compaction")
211 | 	}
212 | 
213 | 	if err = compaction.Finalize(); err != nil {
214 | 		return err
215 | 	}
216 | 
217 | 	db.logger.Info().Msg("prepare to submit the compaction")
218 | 
219 | 	// here, we should update the manifest and index synchronously and atomicly
220 | 	// otherwise, whether the index or manifest update first, the query will not find the
221 | 	// related wal, and return key not found
222 | 	//
223 | 	// at the same time, we don't want to hold the mutex for a long time, especially updating
224 | 	// the index via hint wal, which maybe time consuming
225 | 	var txn *ManifestTxn
226 | 	onePhase := func() error {
227 | 		var err error
228 | 
229 | 		db.mu.Lock()
230 | 		txn, err = db.manifest.NewTxn()
231 | 		db.mu.Unlock()
232 | 
233 | 		if err != nil {
234 | 			return err
235 | 		}
236 | 
237 | 		// let the edit visible
238 | 		edit := &ManifestEdit{
239 | 			addFiles:   compaction.edit.addFiles,
240 | 			hasNextFid: compaction.edit.hasNextFid,
241 | 			nextFid:    compaction.edit.nextFid,
242 | 		}
243 | 		txn.Apply(edit)
244 | 		db.logger.Info().Msg("one phase: apply the edit")
245 | 
246 | 		// update the index without any lock and ignore any error
247 | 		// FIXME: put operations may evict some keys, we should put it into an edit
248 | 		_ = IterateHint(compaction.hintWriter.Wal(), func(record *HintRecord) error {
249 | 			_ = db.index.Put(record.ns, record.key, record.fid, record.off, record.size, nil)
250 | 			return nil
251 | 		})
252 | 		db.logger.Info().Msg("one phase: gen hint wal")
253 | 		return nil
254 | 	}
255 | 
256 | 	twoPhase := func() error {
257 | 		db.mu.Lock()
258 | 		defer db.mu.Unlock()
259 | 
260 | 		// commit the txn
261 | 		edit := &ManifestEdit{
262 | 			deleteFiles: compaction.edit.deleteFiles,
263 | 		}
264 | 		if err := txn.Commit(edit); err != nil {
265 | 			return err
266 | 		}
267 | 
268 | 		db.logger.Info().Msg("two phase: commit the txn")
269 | 
270 | 		// clean un-used files
271 | 		_ = db.manifest.CleanFiles(false)
272 | 
273 | 		db.compaction = nil
274 | 
275 | 		// cache the hint file size
276 | 		hintWal := compaction.hintWriter.Wal()
277 | 		db.hintSizeCache[hintWal.Fid()] = int64(hintWal.Size())
278 | 
279 | 		for idx := range compaction.edit.deleteFiles {
280 | 			logFile := compaction.edit.deleteFiles[idx]
281 | 			delete(db.hintSizeCache, logFile.fid)
282 | 		}
283 | 
284 | 		return nil
285 | 	}
286 | 
287 | 	if err = onePhase(); err != nil {
288 | 		return err
289 | 	}
290 | 
291 | 	return twoPhase()
292 | }
293 | 
294 | func (db *DBImpl) compactOneWal(dst *WalRewriter, hintWriter *HintWriter, src *Wal) error {
295 | 	bufPtr, _ := db.recordPool.Get().(*[]byte)
296 | 	defer db.recordPool.Put(bufPtr)
297 | 
298 | 	var hintRecord HintRecord
299 | 	return IterateRecord(src, func(record *Record, foff, _ uint64) error {
300 | 		// the foff points to the start offset of data in the wal
301 | 		// however, the offset used by ReadRecord of wal expects the start offset of data header
302 | 		foff -= RecordHeaderSize
303 | 
304 | 		if db.doFilter(record, src.fid, foff) {
305 | 			return nil
306 | 		}
307 | 
308 | 		recordBytes, err := record.Encode(*bufPtr, dst.Wal().BaseTime())
309 | 		if err != nil {
310 | 			return err
311 | 		}
312 | 
313 | 		// write dst wal
314 | 		if foff, err = dst.AppendRecord(recordBytes); err != nil {
315 | 			return err
316 | 		}
317 | 
318 | 		// write dst hint wal
319 | 		hintRecord.ns = record.Ns
320 | 		hintRecord.key = record.Key
321 | 		hintRecord.fid = dst.Wal().Fid()
322 | 		hintRecord.off = foff
323 | 		hintRecord.size = uint64(len(recordBytes))
324 | 
325 | 		return hintWriter.AppendRecord(&hintRecord)
326 | 	})
327 | }
328 | 
329 | func (db *DBImpl) doFilter(srcRecord *Record, srcFid, srcOff uint64) bool {
330 | 	fid, off, _, err := db.index.Get(srcRecord.Ns, srcRecord.Key)
331 | 	if err != nil { // the key has been deleted or evicted
332 | 		return true
333 | 	}
334 | 
335 | 	if fid != srcFid || off != srcOff { // the key has been updated
336 | 		return true
337 | 	}
338 | 
339 | 	if db.opts.CompactionFilter != nil {
340 | 		if db.opts.CompactionFilter(srcRecord.Ns, srcRecord.Key, srcRecord.Value, srcRecord.Meta) {
341 | 			// compaction filter failed, the key should be deleted
342 | 			return true
343 | 		}
344 | 	}
345 | 
346 | 	// the key should be retained
347 | 	return false
348 | }
349 | 
350 | func (db *DBImpl) getCompactionWalsLocked() []uint64 {
351 | 	if !db.compacting.Load() || db.compaction == nil {
352 | 		return nil
353 | 	}
354 | 
355 | 	c := db.compaction
356 | 	c.mu.RLock()
357 | 	defer c.mu.RUnlock()
358 | 
359 | 	wals := make([]uint64, 0, len(c.inputs)+1)
360 | 	wals = append(wals, c.output.Fid())
361 | 
362 | 	for idx := range c.inputs {
363 | 		wals = append(wals, c.inputs[idx].Fid())
364 | 	}
365 | 
366 | 	return wals
367 | }
368 | 
369 | func (db *DBImpl) reclaimDiskUsage(expect int64) {
370 | 	if !db.reclaiming.CompareAndSwap(false, true) {
371 | 		return
372 | 	}
373 | 
374 | 	// only one reach here
375 | 
376 | 	defer func() {
377 | 		db.reclaiming.Store(false)
378 | 	}()
379 | 
380 | 	db.mu.Lock()
381 | 	defer db.mu.Unlock()
382 | 
383 | 	if db.bgErr != nil {
384 | 		return
385 | 	}
386 | 
387 | 	usage, err := db.approximateDiskUsageLocked()
388 | 	if err != nil {
389 | 		db.bgErr = errors.Join(err, ErrDiskOutOfLimit)
390 | 		return
391 | 	}
392 | 
393 | 	db.logger.Info().Int64("expect", expect).Int64("usage", usage).Msg("reclaim disk usage")
394 | 	if usage <= expect {
395 | 		return
396 | 	}
397 | 
398 | 	compactionWals := db.getCompactionWalsLocked()
399 | 	files := make([]LogFile, 0, len(db.manifest.wals))
400 | 	for fid := range db.manifest.wals {
401 | 		// exclude the compation wals
402 | 		if slices.Contains(compactionWals, fid) {
403 | 			continue
404 | 		}
405 | 
406 | 		// skip the active wal
407 | 		if fid == db.manifest.ActiveWal().Fid() {
408 | 			continue
409 | 		}
410 | 
411 | 		files = append(files, LogFile{
412 | 			fid: fid,
413 | 			wal: db.manifest.wals[fid].wal,
414 | 		})
415 | 	}
416 | 
417 | 	db.mu.Unlock()
418 | 
419 | 	// sort by create time in positive order
420 | 	sort.Slice(files, func(i, j int) bool {
421 | 		return files[i].wal.CreateTime() < files[j].wal.CreateTime()
422 | 	})
423 | 
424 | 	idx := 0
425 | 	deleteFiles := make([]LogFile, 0, 3)
426 | 
427 | 	// reclaim the old wals
428 | 	for usage > expect && idx < len(files) {
429 | 		usage -= int64(files[idx].wal.Size())
430 | 		deleteFiles = append(deleteFiles, files[idx])
431 | 
432 | 		idx++
433 | 	}
434 | 
435 | 	db.logger.Info().Uints64("wals", Map(deleteFiles, func(f LogFile) uint64 {
436 | 		return f.fid
437 | 	})).Msg("prepare to reclaim wals")
438 | 
439 | 	db.mu.Lock()
440 | 
441 | 	if len(deleteFiles) == 0 {
442 | 		db.bgErr = ErrDiskOutOfLimit
443 | 		db.logger.Err(db.bgErr).Msg("failed to reclaim disk usage")
444 | 		return
445 | 	}
446 | 
447 | 	// apply the edit
448 | 	edit := &ManifestEdit{
449 | 		deleteFiles: deleteFiles,
450 | 	}
451 | 
452 | 	if err = db.manifest.LogAndApply(edit); err != nil {
453 | 		db.bgErr = errors.Join(err, ErrDiskOutOfLimit)
454 | 		db.logger.Err(db.bgErr).Msg("failed to apply")
455 | 	}
456 | 
457 | 	db.logger.Info().Msg("reclaim successfully")
458 | 
459 | 	// delete the related hint wals
460 | 	for idx := range deleteFiles {
461 | 		// ignore errors
462 | 		delete(db.hintSizeCache, deleteFiles[idx].fid)
463 | 		_ = os.Remove(HintPath(db.opts.Dir, deleteFiles[idx].fid))
464 | 	}
465 | }
466 | 
467 | // the method estimates total size of database
468 | // warning: the return size includes the total size of database reference files
469 | func (db *DBImpl) approximateDiskUsageLocked() (int64, error) {
470 | 	var usage int64
471 | 
472 | 	// manifest file size
473 | 	usage += int64(db.manifest.FileSize())
474 | 
475 | 	// hint and wal file size
476 | 	for fid, info := range db.manifest.wals {
477 | 		usage += int64(info.wal.Size())
478 | 		usage += db.hintSizeCache[fid]
479 | 	}
480 | 
481 | 	// remove the un-used hint cache items
482 | 	for fid := range db.hintSizeCache {
483 | 		if _, exists := db.manifest.wals[fid]; !exists {
484 | 			delete(db.hintSizeCache, fid)
485 | 		}
486 | 	}
487 | 
488 | 	return usage, nil
489 | }
490 | 


--------------------------------------------------------------------------------
/manifest.go:
--------------------------------------------------------------------------------
  1 | package bitcask
  2 | 
  3 | import (
  4 | 	"errors"
  5 | 	"fmt"
  6 | 	"io"
  7 | 	"os"
  8 | 	"path/filepath"
  9 | 	"slices"
 10 | 	"strings"
 11 | 	"time"
 12 | )
 13 | 
 14 | var (
 15 | 	ErrUnknownManifestFile   = errors.New("unknown manifest file")
 16 | 	ErrConcurrentManifestTxn = errors.New("concurrent manifest txn")
 17 | )
 18 | 
 19 | // it's not thread-safe
 20 | // the MANIFEST is append-only file
 21 | // it includes multiple edits, which record how the database changes
 22 | type Manifest struct {
 23 | 	// the base directory
 24 | 	dir string
 25 | 
 26 | 	// manifest file
 27 | 	fp *os.File
 28 | 
 29 | 	// the manifest fid
 30 | 	fid uint64
 31 | 
 32 | 	// manifest file size
 33 | 	size uint64
 34 | 
 35 | 	// the active wal log
 36 | 	// active and hint log file use the same log number
 37 | 	// usually, the log number is largest
 38 | 	active *Wal
 39 | 
 40 | 	// the next allocatable file number
 41 | 	nextFid uint64
 42 | 
 43 | 	// all wal files
 44 | 	// mapping from fid to walInfo
 45 | 	wals map[uint64]*WalInfo
 46 | 
 47 | 	// TODO: support multiple transactions
 48 | 	txn *ManifestTxn
 49 | }
 50 | 
 51 | type WalInfo struct {
 52 | 	wal *Wal
 53 | 
 54 | 	// the total unuse size of wal file
 55 | 	// if the original data is updated or deleted, the related wal
 56 | 	// can free some disk space
 57 | 	freeBytes uint64
 58 | 
 59 | 	// delta free bytes of wal file
 60 | 	// it indicates the unuse size of wal file, which is not persisted to manifest yet
 61 | 	deltaFreeBytes uint64
 62 | }
 63 | 
 64 | func NewManifest(dir string) (*Manifest, error) {
 65 | 	runners := NewRunner()
 66 | 	defer runners.Do()
 67 | 
 68 | 	manifestDir := ManifestPath(dir, 1)
 69 | 	fp, err := os.OpenFile(manifestDir, os.O_RDWR|os.O_CREATE, 0o644)
 70 | 	if err != nil {
 71 | 		return nil, err
 72 | 	}
 73 | 
 74 | 	runners.Post(func() {
 75 | 		fp.Close()
 76 | 		os.Remove(manifestDir)
 77 | 	})
 78 | 
 79 | 	active, err := NewWal(WalPath(dir, 2), 2, time.Now().Unix())
 80 | 	if err != nil {
 81 | 		return nil, err
 82 | 	}
 83 | 
 84 | 	defer active.Unref()
 85 | 
 86 | 	manifest := &Manifest{
 87 | 		dir:     dir,
 88 | 		fp:      fp,
 89 | 		fid:     1,
 90 | 		nextFid: 3,
 91 | 		wals:    make(map[uint64]*WalInfo),
 92 | 		size:    0,
 93 | 		active:  active,
 94 | 	}
 95 | 
 96 | 	edit := &ManifestEdit{
 97 | 		addFiles:   []LogFile{{active, 2}},
 98 | 		hasNextFid: true,
 99 | 		nextFid:    3,
100 | 	}
101 | 
102 | 	// write the MANIFEST file
103 | 	if err = manifest.LogAndApply(edit); err != nil {
104 | 		return nil, err
105 | 	}
106 | 
107 | 	// write the CURRENT file
108 | 	if err = os.WriteFile(CurrentPath(dir), []byte(ManifestFilename(1)), 0o644); err != nil {
109 | 		return nil, err
110 | 	}
111 | 
112 | 	// abort all functors
113 | 	runners.Rollback()
114 | 
115 | 	return manifest, nil
116 | }
117 | 
118 | func NewManifestIfNotExists(dir string) (*Manifest, error) {
119 | 	if PathExists(CurrentPath(dir)) {
120 | 		return LoadManifest(dir)
121 | 	}
122 | 
123 | 	return NewManifest(dir)
124 | }
125 | 
126 | // load the MANIFEST file according to CURRENT file
127 | func LoadManifest(dir string) (*Manifest, error) {
128 | 	runners := NewRunner()
129 | 	defer runners.Do()
130 | 
131 | 	data, err := os.ReadFile(CurrentPath(dir))
132 | 	if err != nil {
133 | 		return nil, fmt.Errorf("failed to read CURRENT file: %w", err)
134 | 	}
135 | 
136 | 	ft, fid, err := ParseFilename(string(data))
137 | 	if err != nil || ft != ManifestFileType {
138 | 		return nil, ErrUnknownManifestFile
139 | 	}
140 | 
141 | 	manifestPath := filepath.Join(dir, strings.TrimSpace(string(data)))
142 | 	fp, err := os.OpenFile(manifestPath, os.O_RDWR|os.O_APPEND, 0o644)
143 | 	if err != nil {
144 | 		return nil, fmt.Errorf("failed to open manifest file: %w", err)
145 | 	}
146 | 
147 | 	runners.Post(func() {
148 | 		fp.Close()
149 | 	})
150 | 
151 | 	fileInfo, err := fp.Stat()
152 | 	if err != nil {
153 | 		return nil, err
154 | 	}
155 | 
156 | 	manifest := &Manifest{
157 | 		dir:     dir,
158 | 		fid:     fid,
159 | 		fp:      fp,
160 | 		wals:    make(map[uint64]*WalInfo),
161 | 		nextFid: 0,
162 | 		size:    uint64(fileInfo.Size()),
163 | 	}
164 | 
165 | 	if err = manifest.recoverFromManifest(); err != nil {
166 | 		return nil, err
167 | 	}
168 | 
169 | 	if len(manifest.wals) > 0 {
170 | 		// find the max fid
171 | 		maxFid := uint64(0)
172 | 		for fid := range manifest.wals {
173 | 			maxFid = max(maxFid, fid)
174 | 		}
175 | 
176 | 		// freeze older wals
177 | 		for fid := range manifest.wals {
178 | 			if fid != maxFid {
179 | 				manifest.wals[fid].wal.Freeze()
180 | 			}
181 | 		}
182 | 
183 | 		// the max fid as the active wal
184 | 		// in some failure cases, perhaps this is not true. for example, when a compaction has just finished
185 | 		// its wal fid is highest at the time, and then a restart happens
186 | 		manifest.active = manifest.wals[maxFid].wal
187 | 	}
188 | 
189 | 	// abort all functors
190 | 	runners.Rollback()
191 | 
192 | 	return manifest, nil
193 | }
194 | 
195 | func (m *Manifest) recoverFromManifest() error {
196 | 	buf, err := io.ReadAll(m.fp)
197 | 	if err != nil {
198 | 		return err
199 | 	}
200 | 
201 | 	edit := NewManifestEdit()
202 | 	if err = edit.DecodeFrom(buf); err != nil {
203 | 		return err
204 | 	}
205 | 
206 | 	// positive order
207 | 	deleteFids := make([]uint64, 0, len(edit.deleteFiles))
208 | 	for idx := range edit.deleteFiles {
209 | 		deleteFids = append(deleteFids, edit.deleteFiles[idx].fid)
210 | 	}
211 | 	slices.Sort(deleteFids)
212 | 
213 | 	var addFiles []LogFile
214 | 	for idx := range edit.addFiles {
215 | 		if !slices.Contains(deleteFids, edit.addFiles[idx].fid) {
216 | 			addFiles = append(addFiles, edit.addFiles[idx])
217 | 		}
218 | 	}
219 | 
220 | 	// in recover, all delete files should be included in edit.addFiles
221 | 	if len(deleteFids)+len(addFiles) != len(edit.addFiles) {
222 | 		return ErrCorruptedManifest
223 | 	}
224 | 
225 | 	// load related wals
226 | 	for idx := range addFiles {
227 | 		wal, err := LoadWal(WalPath(m.dir, addFiles[idx].fid), addFiles[idx].fid)
228 | 		if err != nil {
229 | 			return err
230 | 		}
231 | 		defer wal.Unref()
232 | 		addFiles[idx].wal = wal
233 | 	}
234 | 
235 | 	// optimize the edit
236 | 	// deleteFiles should be empty in recover
237 | 	edit.addFiles = addFiles
238 | 	edit.deleteFiles = nil
239 | 
240 | 	return m.Apply(edit)
241 | }
242 | 
243 | // return the active wal file
244 | func (m *Manifest) ActiveWal() *Wal {
245 | 	return m.active
246 | }
247 | 
248 | // rotate the active wal
249 | func (m *Manifest) RotateWal() (old *Wal, err error) {
250 | 	fid := m.GenFid()
251 | 	walPath := WalPath(m.dir, fid)
252 | 
253 | 	// FIXME: use wall time of database
254 | 	wal, err := NewWal(walPath, fid, time.Now().Unix())
255 | 	if err != nil {
256 | 		return nil, err
257 | 	}
258 | 
259 | 	defer wal.Unref()
260 | 
261 | 	edit := &ManifestEdit{
262 | 		addFiles:   []LogFile{{wal, fid}},
263 | 		hasNextFid: true,
264 | 		nextFid:    fid + 1,
265 | 	}
266 | 
267 | 	if err = m.LogAndApply(edit); err != nil {
268 | 		return
269 | 	}
270 | 
271 | 	old = m.active
272 | 	old.Freeze()
273 | 	m.active = wal
274 | 
275 | 	return
276 | }
277 | 
278 | // rotate the manifest
279 | func (m *Manifest) RotateManifest() error {
280 | 	runners := NewRunner()
281 | 	defer runners.Do()
282 | 
283 | 	fid := m.GenFid()
284 | 	manifestPath := ManifestPath(m.dir, fid)
285 | 	fp, err := os.Create(manifestPath)
286 | 	if err != nil {
287 | 		return err
288 | 	}
289 | 
290 | 	runners.Post(func() {
291 | 		fp.Close()
292 | 		os.Remove(manifestPath)
293 | 	})
294 | 
295 | 	edit := &ManifestEdit{
296 | 		hasNextFid: true,
297 | 		nextFid:    fid + 1,
298 | 	}
299 | 
300 | 	// all wals will be written the new manifest file
301 | 	for tfid := range m.wals {
302 | 		edit.addFiles = append(edit.addFiles, LogFile{fid: tfid})
303 | 	}
304 | 
305 | 	nbytes, err := m.persistManifestEdit(fp, edit)
306 | 	if err != nil {
307 | 		return err
308 | 	}
309 | 
310 | 	newManifest := ManifestFilename(fid)
311 | 	if err = os.WriteFile(CurrentPath(m.dir), []byte(newManifest), 0o644); err != nil {
312 | 		return err
313 | 	}
314 | 
315 | 	// delete old manifest file
316 | 	_ = m.fp.Close()
317 | 	oldMainfestPath := ManifestPath(m.dir, m.fid)
318 | 	_ = os.Remove(oldMainfestPath)
319 | 
320 | 	m.fp = fp
321 | 	m.fid = fid
322 | 	m.size = nbytes
323 | 
324 | 	// abort all functors
325 | 	runners.Rollback()
326 | 
327 | 	return nil
328 | }
329 | 
330 | // return the size of MANIFEST file
331 | func (m *Manifest) FileSize() uint64 {
332 | 	return m.size
333 | }
334 | 
335 | // clean the un-used files
336 | // if force is true, all un-reference files will be removed
337 | //
338 | // usually, when the database bootstrap, force can be true
339 | // for other situations, the force should be false
340 | func (m *Manifest) CleanFiles(force bool) error {
341 | 	files, err := os.ReadDir(m.dir)
342 | 	if err != nil {
343 | 		return err
344 | 	}
345 | 
346 | 	for _, file := range files {
347 | 		name := file.Name()
348 | 		filetype, fid, err := ParseFilename(name)
349 | 		if err != nil {
350 | 			continue
351 | 		}
352 | 
353 | 		needDelete := false
354 | 
355 | 		switch filetype {
356 | 		case LockFileType:
357 | 			// skip
358 | 		case CurrentFileType:
359 | 			// skip
360 | 		case WalFileType:
361 | 			// wal not found, maybe others are in use
362 | 			if _, exists := m.wals[fid]; !exists {
363 | 				needDelete = force
364 | 			}
365 | 		case HintFileType:
366 | 			// wal not found, hint should be removed
367 | 			if _, exists := m.wals[fid]; !exists {
368 | 				needDelete = true
369 | 			}
370 | 		case TmpFileType:
371 | 			fallthrough
372 | 		case MergeFileType:
373 | 			// tmp and merge file maybe in use
374 | 			needDelete = force
375 | 		case ManifestFileType:
376 | 			// old manifest should be deleted
377 | 			needDelete = (fid != m.fid) && force
378 | 		default:
379 | 			// skip unknown file type
380 | 		}
381 | 
382 | 		if needDelete {
383 | 			_ = os.Remove(filepath.Join(m.dir, name))
384 | 		}
385 | 	}
386 | 
387 | 	return nil
388 | }
389 | 
390 | func (m *Manifest) NewTxn() (*ManifestTxn, error) {
391 | 	if m.txn != nil && !m.txn.IsDone() {
392 | 		return nil, ErrConcurrentManifestTxn
393 | 	}
394 | 
395 | 	m.txn = NewManifestTxn(m)
396 | 	return m.txn, nil
397 | }
398 | 
399 | func (m *Manifest) ToWal(fid uint64) *Wal {
400 | 	if info, exists := m.wals[fid]; exists {
401 | 		return info.wal
402 | 	}
403 | 
404 | 	if m.txn != nil && !m.txn.IsDone() {
405 | 		return m.txn.ToWal(fid)
406 | 	}
407 | 
408 | 	m.txn = nil
409 | 	return nil
410 | }
411 | 
412 | func (m *Manifest) ToWalWithRef(fid uint64) *Wal {
413 | 	if info, exists := m.wals[fid]; exists {
414 | 		info.wal.Ref()
415 | 		return info.wal
416 | 	}
417 | 
418 | 	if m.txn != nil && !m.txn.IsDone() {
419 | 		return m.txn.ToWalWithRef(fid)
420 | 	}
421 | 
422 | 	m.txn = nil
423 | 	return nil
424 | }
425 | 
426 | func (m *Manifest) GenFid() uint64 {
427 | 	nextFid := m.NextFid()
428 | 	m.nextFid = nextFid + 1
429 | 
430 | 	return nextFid
431 | }
432 | 
433 | func (m *Manifest) NextFid() uint64 {
434 | 	nextFid := m.nextFid
435 | 
436 | 	if m.txn != nil && !m.txn.IsDone() {
437 | 		nextFid = max(nextFid, m.txn.NextFid())
438 | 	} else {
439 | 		m.txn = nil
440 | 	}
441 | 
442 | 	return nextFid
443 | }
444 | 
445 | func (m *Manifest) prepareApply(edit *ManifestEdit) error {
446 | 	wals := make(map[uint64]struct{}, len(m.wals))
447 | 	for k := range m.wals {
448 | 		wals[k] = struct{}{}
449 | 	}
450 | 
451 | 	// validate the add files
452 | 	for idx := range edit.addFiles {
453 | 		if _, exists := wals[edit.addFiles[idx].fid]; exists {
454 | 			return errors.New("add the existed file")
455 | 		}
456 | 		wals[edit.addFiles[idx].fid] = struct{}{}
457 | 	}
458 | 
459 | 	// validate the delete files
460 | 	for idx := range edit.deleteFiles {
461 | 		if _, exists := wals[edit.deleteFiles[idx].fid]; !exists {
462 | 			return errors.New("unknown delete file")
463 | 		}
464 | 	}
465 | 
466 | 	return nil
467 | }
468 | 
469 | // apply one edit, but don't persist
470 | func (m *Manifest) Apply(edit *ManifestEdit) error {
471 | 	if err := m.prepareApply(edit); err != nil {
472 | 		return err
473 | 	}
474 | 
475 | 	// reach here: this edit should apply without any error
476 | 
477 | 	m.apply(edit)
478 | 
479 | 	return nil
480 | }
481 | 
482 | // apply the manifest without any error
483 | func (m *Manifest) apply(edit *ManifestEdit) {
484 | 	// add wals
485 | 	for _, add := range edit.addFiles {
486 | 		add.wal.Ref()
487 | 		m.wals[add.fid] = &WalInfo{
488 | 			wal:            add.wal,
489 | 			freeBytes:      0,
490 | 			deltaFreeBytes: 0,
491 | 		}
492 | 	}
493 | 
494 | 	// delete wals
495 | 	for _, del := range edit.deleteFiles {
496 | 		m.wals[del.fid].wal.Unref()
497 | 		delete(m.wals, del.fid)
498 | 	}
499 | 
500 | 	// update next file number
501 | 	if edit.hasNextFid {
502 | 		m.nextFid = max(m.nextFid, edit.nextFid)
503 | 	}
504 | 
505 | 	// update delta free bytes of wal
506 | 	for fid := range edit.freeBytes {
507 | 		if _, exists := m.wals[fid]; !exists {
508 | 			continue
509 | 		}
510 | 
511 | 		m.wals[fid].deltaFreeBytes += edit.freeBytes[fid]
512 | 	}
513 | }
514 | 
515 | func (m *Manifest) applyFreeBytes(delta map[uint64]uint64) {
516 | 	for fid := range delta {
517 | 		if _, exists := m.wals[fid]; !exists {
518 | 			continue
519 | 		}
520 | 
521 | 		m.wals[fid].freeBytes += delta[fid]
522 | 		m.wals[fid].deltaFreeBytes = 0
523 | 	}
524 | }
525 | 
526 | // apply one edit and persist it
527 | func (m *Manifest) LogAndApply(edit *ManifestEdit) error {
528 | 	var err error
529 | 	if err = m.prepareApply(edit); err != nil {
530 | 		return err
531 | 	}
532 | 
533 | 	// try to append delta free bytes of other wals
534 | 	// TODO: only append delta free bytes large enough
535 | 	deltaBytes := make(map[uint64]uint64)
536 | 	for fid := range edit.freeBytes {
537 | 		deltaBytes[fid] = edit.freeBytes[fid]
538 | 	}
539 | 	for fid := range m.wals {
540 | 		deltaBytes[fid] += m.wals[fid].deltaFreeBytes
541 | 	}
542 | 
543 | 	// persist the edit
544 | 	edit.freeBytes = deltaBytes
545 | 	nbytes, err := m.persistManifestEdit(m.fp, edit)
546 | 	if err != nil {
547 | 		return err
548 | 	}
549 | 
550 | 	m.size += nbytes
551 | 
552 | 	// the delta free bytes have persisted, so don't apply them
553 | 	edit.freeBytes = nil
554 | 	m.apply(edit)
555 | 
556 | 	// update the free bytes
557 | 	m.applyFreeBytes(deltaBytes)
558 | 
559 | 	return nil
560 | }
561 | 
562 | func (m *Manifest) persistManifestEdit(fp *os.File, edit *ManifestEdit) (uint64, error) {
563 | 	var err error
564 | 	var nbytes int
565 | 
566 | 	content := edit.Encode()
567 | 	currentBytes := 0
568 | 	expectBytes := len(content)
569 | 
570 | 	for currentBytes < expectBytes && err == nil {
571 | 		nbytes, err = fp.Write(content[currentBytes:])
572 | 		currentBytes += nbytes
573 | 	}
574 | 
575 | 	if err == nil {
576 | 		err = fp.Sync()
577 | 	}
578 | 
579 | 	return uint64(currentBytes), err
580 | }
581 | 
582 | func (m *Manifest) Close() error {
583 | 	for _, info := range m.wals {
584 | 		if info != nil && info.wal != nil {
585 | 			info.wal.Close()
586 | 		}
587 | 	}
588 | 
589 | 	return m.fp.Close()
590 | }
591 | 


--------------------------------------------------------------------------------
/wal.go:
--------------------------------------------------------------------------------
  1 | package bitcask
  2 | 
  3 | import (
  4 | 	"bytes"
  5 | 	"encoding/binary"
  6 | 	"errors"
  7 | 	"os"
  8 | 	"path"
  9 | 	"path/filepath"
 10 | 	"sync"
 11 | 	"sync/atomic"
 12 | 	"time"
 13 | )
 14 | 
 15 | var (
 16 | 	ErrWalIteratorEOF       = errors.New("eof")
 17 | 	ErrWalMismatchCRC       = errors.New("CRC mismatch, corrupted data")
 18 | 	ErrWalMismatchSize      = errors.New("size mismatch, corrupted data")
 19 | 	ErrWalCorruptedData     = errors.New("corrupted data")
 20 | 	ErrWalIncompleteRecord  = errors.New("incomplete record")
 21 | 	ErrWalMismatchMagic     = errors.New("magic number mismatch")
 22 | 	ErrWalMismatchBlockSize = errors.New("block size mismatch")
 23 | 	ErrWalUnknownRecordType = errors.New("invalid record type")
 24 | 	ErrWalUnavailable       = errors.New("wal unavailable")
 25 | 	ErrWalFrozen            = errors.New("wal frozen")
 26 | )
 27 | 
 28 | // wal super block at head
 29 | type superBlock struct {
 30 | 	magic     uint64
 31 | 	blockSize uint64
 32 | 
 33 | 	// start position of actual data
 34 | 	// that points the next byte of checksum
 35 | 	startOff uint32
 36 | 
 37 | 	// maybe have other fields
 38 | 	createTime uint64
 39 | 	baseTime   uint64
 40 | 
 41 | 	// checksum as footer
 42 | 	crc32 uint32
 43 | }
 44 | 
 45 | const (
 46 | 	BlockSize = 32 * 1024 // 32KB per block
 47 | 
 48 | 	RecordFull   = 1 // Record fits entirely in a single block
 49 | 	RecordFirst  = 2 // First chunk of a record
 50 | 	RecordMiddle = 3 // Middle chunk of a record
 51 | 	RecordLast   = 4 // Last chunk of a record
 52 | 
 53 | 	RecordHeaderSize = 7 // 4 bytes CRC + 2 bytes Length + 1 byte Type
 54 | 
 55 | 	MagicNumber           = 0x77616C64 // magic number
 56 | 	SuperBlockSize        = 40         // 8 + 8 + 4 + 8 + 8 + 4 bytes
 57 | 	SuperBlockCRC32Offset = SuperBlockSize - 4
 58 | )
 59 | 
 60 | // calculate the physical footprint of the record based on its actual size and offset
 61 | func WalRecordSize(offset, size uint64) uint64 {
 62 | 	left := size
 63 | 	phySize := uint64(0)
 64 | 
 65 | 	// skip the super block
 66 | 	offset -= SuperBlockSize
 67 | 
 68 | 	for left > 0 {
 69 | 		leftover := BlockSize - (offset % BlockSize)
 70 | 		if leftover < RecordHeaderSize {
 71 | 			phySize += leftover
 72 | 			offset += leftover
 73 | 			leftover = BlockSize
 74 | 		}
 75 | 
 76 | 		avail := leftover - RecordHeaderSize
 77 | 		fragmentLength := min(left, avail)
 78 | 
 79 | 		phySize += (RecordHeaderSize + fragmentLength)
 80 | 		offset += (RecordHeaderSize + fragmentLength)
 81 | 
 82 | 		left -= fragmentLength
 83 | 	}
 84 | 
 85 | 	return phySize
 86 | }
 87 | 
 88 | func WalBlockIndexRange(offset, size uint64) (firstBlkIdx, firstBlkOffset, blkNum uint64) {
 89 | 	recordSize := WalRecordSize(offset, size)
 90 | 
 91 | 	firstBlkIdx = (offset - SuperBlockSize) / BlockSize
 92 | 	firstBlkOffset = firstBlkIdx*BlockSize + SuperBlockSize
 93 | 
 94 | 	lastBlkIdx := (offset - SuperBlockSize + recordSize) / BlockSize
 95 | 	blkNum = lastBlkIdx - firstBlkIdx + 1
 96 | 	return
 97 | }
 98 | 
 99 | func WalBlockOffset(blkIdx uint64) (off uint64) {
100 | 	off = blkIdx*BlockSize + SuperBlockSize
101 | 	return
102 | }
103 | 
104 | func WalBlockIdx(offset uint64) (blkIdx uint64) {
105 | 	blkIdx = (offset - SuperBlockSize) / BlockSize
106 | 	return
107 | }
108 | 
109 | // parse the record from the given buffers
110 | //
111 | // there are two sources of `blks`.
112 | //   - one is created from block cache. This part of cache maybe reused after cache eviction.
113 | //     so it's nessary to new a buffer for the return binary record, and the `blks` has one
114 | //     or more blocks
115 | //   - the second is from pread-at-once. in this case, the value is usually large and contains
116 | //     multiple data blocks. therefore, the return binary record cannot be directly a slice of
117 | //     buffer and the `blks` has only one block
118 | //
119 | // the `size` means the number of bytes of record
120 | // the `blkOff` means the start offset of actual data in blks
121 | func WalParseRecord(size uint64, blkOff uint64, blks [][]byte, verifyChecksum bool) ([]byte, error) {
122 | 	record := make([]byte, 0, size)
123 | 
124 | 	blkSize := uint64(len(blks[0]))
125 | 
126 | 	// iterate all input blocks
127 | 	for i := 0; i < len(blks); i++ {
128 | 		// iterate record
129 | 		for {
130 | 			header := blks[i][blkOff : blkOff+RecordHeaderSize]
131 | 			blkOff += RecordHeaderSize
132 | 
133 | 			crc := binary.LittleEndian.Uint32(header[0:])
134 | 			length := uint64(binary.LittleEndian.Uint16(header[4:]))
135 | 			recordType := header[6]
136 | 
137 | 			// avoid the corrupted data triggering out of range
138 | 			if length > blkSize-blkOff {
139 | 				return nil, ErrWalCorruptedData
140 | 			}
141 | 
142 | 			data := blks[i][blkOff : blkOff+length]
143 | 			blkOff += length
144 | 
145 | 			if verifyChecksum && ComputeCRC32(data) != crc {
146 | 				return nil, ErrWalMismatchCRC
147 | 			}
148 | 
149 | 			record = append(record, data...)
150 | 
151 | 			switch recordType {
152 | 			case RecordFull, RecordLast:
153 | 				if len(record) != int(size) {
154 | 					return nil, ErrWalMismatchSize
155 | 				}
156 | 				return record, nil
157 | 			case RecordFirst, RecordMiddle:
158 | 				// Continue reading next chunk
159 | 			default:
160 | 				return nil, ErrWalUnknownRecordType
161 | 			}
162 | 
163 | 			// blks[i][blkOff:] has no space to store more records
164 | 			leftover := blkSize - blkOff
165 | 			if leftover <= RecordHeaderSize {
166 | 				blkOff = 0
167 | 				break
168 | 			}
169 | 		}
170 | 	}
171 | 
172 | 	return nil, ErrWalIncompleteRecord
173 | }
174 | 
175 | // it's not thread safe
176 | // usually, only one writer can operate the Wal. no race condition
177 | type Wal struct {
178 | 	fp    *os.File
179 | 	super *superBlock
180 | 
181 | 	// reference count
182 | 	refs        *atomic.Int64
183 | 	deleterOnce sync.Once
184 | 
185 | 	// internal buffer
186 | 	buf bytes.Buffer
187 | 
188 | 	// file name
189 | 	path string
190 | 
191 | 	// file id
192 | 	fid uint64
193 | 
194 | 	// file size
195 | 	size uint64
196 | 
197 | 	// the data start position
198 | 	offset uint32
199 | 
200 | 	// when the wal is marked immutable, which is not writable
201 | 	immutable bool
202 | 
203 | 	// indicate whether the wal can operate
204 | 	invalid bool
205 | }
206 | 
207 | func (wal *Wal) Rename(newName string) error {
208 | 	newPath := filepath.Join(wal.Dir(), newName)
209 | 	if err := os.Rename(wal.path, newPath); err != nil {
210 | 		return err
211 | 	}
212 | 
213 | 	wal.path = newPath
214 | 	return nil
215 | }
216 | 
217 | // load the existed wal file
218 | func LoadWal(path string, fid uint64) (*Wal, error) {
219 | 	runners := NewRunner()
220 | 	defer runners.Do()
221 | 
222 | 	file, err := os.OpenFile(path, os.O_RDWR|os.O_APPEND, 0o644)
223 | 	if err != nil {
224 | 		return nil, err
225 | 	}
226 | 
227 | 	runners.Post(func() {
228 | 		file.Close()
229 | 	})
230 | 
231 | 	stat, err := file.Stat()
232 | 	if err != nil {
233 | 		return nil, err
234 | 	}
235 | 
236 | 	wal := &Wal{
237 | 		fp:        file,
238 | 		path:      path,
239 | 		fid:       fid,
240 | 		size:      uint64(stat.Size()),
241 | 		immutable: false,
242 | 		invalid:   false,
243 | 		refs:      new(atomic.Int64),
244 | 	}
245 | 
246 | 	wal.refs.Store(1)
247 | 
248 | 	wal.super, err = wal.loadSuperBlock()
249 | 	if err != nil {
250 | 		return nil, err
251 | 	}
252 | 
253 | 	wal.offset = wal.super.startOff
254 | 
255 | 	// abort all functors
256 | 	runners.Rollback()
257 | 
258 | 	return wal, nil
259 | }
260 | 
261 | // create the wal with specific path
262 | func NewWal(path string, fid uint64, baseTime int64) (*Wal, error) {
263 | 	runners := NewRunner()
264 | 	defer runners.Do()
265 | 
266 | 	file, err := os.OpenFile(path, os.O_RDWR|os.O_CREATE, 0o644)
267 | 	if err != nil {
268 | 		return nil, err
269 | 	}
270 | 
271 | 	runners.Post(func() {
272 | 		file.Close()
273 | 	})
274 | 
275 | 	if baseTime < 0 {
276 | 		baseTime = time.Now().Unix()
277 | 	}
278 | 
279 | 	wal := &Wal{
280 | 		fp:        file,
281 | 		path:      path,
282 | 		fid:       fid,
283 | 		size:      0,
284 | 		immutable: false,
285 | 		invalid:   false,
286 | 		refs:      new(atomic.Int64),
287 | 	}
288 | 
289 | 	wal.refs.Store(1)
290 | 
291 | 	wal.super, err = wal.writeSuperBlock(uint64(baseTime))
292 | 	if err != nil {
293 | 		return nil, err
294 | 	}
295 | 
296 | 	wal.size += SuperBlockSize
297 | 	wal.offset = wal.super.startOff
298 | 
299 | 	// abort all functors
300 | 	runners.Rollback()
301 | 
302 | 	return wal, nil
303 | }
304 | 
305 | func (wal *Wal) deleteSelf() {
306 | 	wal.invalid = true
307 | 	_ = wal.fp.Close()
308 | 	_ = os.Remove(wal.path)
309 | }
310 | 
311 | // thread-safe
312 | // decrease the reference count
313 | // delete self when reference count equals zero
314 | func (wal *Wal) Unref() {
315 | 	if wal.refs.Add(-1) == 0 {
316 | 		wal.deleterOnce.Do(wal.deleteSelf)
317 | 	}
318 | }
319 | 
320 | // thread-safe
321 | // increase the reference count
322 | func (wal *Wal) Ref() {
323 | 	wal.refs.Add(1)
324 | }
325 | 
326 | // before read and write wal, usually incr the reference count
327 | // it makes sure the wal is always valid
328 | func (wal *Wal) Valid() bool {
329 | 	return !wal.invalid
330 | }
331 | 
332 | func (wal *Wal) writeSuperBlock(baseTime uint64) (*superBlock, error) {
333 | 	super := &superBlock{
334 | 		magic:      MagicNumber,
335 | 		blockSize:  BlockSize,
336 | 		startOff:   SuperBlockSize,
337 | 		createTime: uint64(time.Now().Unix()),
338 | 		baseTime:   baseTime,
339 | 	}
340 | 
341 | 	buf := make([]byte, SuperBlockSize)
342 | 	binary.LittleEndian.PutUint64(buf[0:], super.magic)
343 | 	binary.LittleEndian.PutUint64(buf[8:], super.blockSize)
344 | 	binary.LittleEndian.PutUint32(buf[16:], super.startOff)
345 | 	binary.LittleEndian.PutUint64(buf[20:], super.createTime)
346 | 	binary.LittleEndian.PutUint64(buf[28:], super.baseTime)
347 | 
348 | 	crc := ComputeCRC32(buf[:SuperBlockCRC32Offset])
349 | 	binary.LittleEndian.PutUint32(buf[SuperBlockCRC32Offset:], crc)
350 | 
351 | 	if _, err := wal.fp.Write(buf); err != nil {
352 | 		return nil, err
353 | 	}
354 | 
355 | 	if err := wal.fp.Sync(); err != nil {
356 | 		return nil, err
357 | 	}
358 | 
359 | 	return super, nil
360 | }
361 | 
362 | func (wal *Wal) loadSuperBlock() (*superBlock, error) {
363 | 	buf := make([]byte, SuperBlockSize)
364 | 	_, err := wal.fp.ReadAt(buf, 0)
365 | 	if err != nil {
366 | 		return nil, err
367 | 	}
368 | 
369 | 	crc := ComputeCRC32(buf[:SuperBlockCRC32Offset])
370 | 	crcExpect := binary.LittleEndian.Uint32(buf[SuperBlockCRC32Offset:])
371 | 	if crc != crcExpect {
372 | 		return nil, ErrWalMismatchCRC
373 | 	}
374 | 
375 | 	magic := binary.LittleEndian.Uint64(buf[0:])
376 | 	if magic != MagicNumber {
377 | 		return nil, ErrWalMismatchMagic
378 | 	}
379 | 
380 | 	blockSize := binary.LittleEndian.Uint64(buf[8:])
381 | 	startOff := binary.LittleEndian.Uint32(buf[16:])
382 | 
383 | 	if blockSize != BlockSize {
384 | 		return nil, ErrWalMismatchBlockSize
385 | 	}
386 | 
387 | 	createTime := binary.LittleEndian.Uint64(buf[20:])
388 | 	baseTime := binary.LittleEndian.Uint64(buf[28:])
389 | 
390 | 	return &superBlock{
391 | 		magic:      magic,
392 | 		blockSize:  blockSize,
393 | 		startOff:   startOff,
394 | 		createTime: createTime,
395 | 		baseTime:   baseTime,
396 | 		crc32:      crc,
397 | 	}, nil
398 | }
399 | 
400 | func (wal *Wal) CreateTime() uint64 {
401 | 	return wal.super.createTime
402 | }
403 | 
404 | func (wal *Wal) BaseTime() uint64 {
405 | 	return wal.super.baseTime
406 | }
407 | 
408 | func (wal *Wal) Sync() error {
409 | 	return wal.fp.Sync()
410 | }
411 | 
412 | func (wal *Wal) Freeze() {
413 | 	wal.immutable = true
414 | }
415 | 
416 | func (wal *Wal) Immutable() bool {
417 | 	return !wal.immutable
418 | }
419 | 
420 | func (wal *Wal) Close() error {
421 | 	wal.Flush()
422 | 	return wal.fp.Close()
423 | }
424 | 
425 | func (wal *Wal) Path() string {
426 | 	return wal.path
427 | }
428 | 
429 | func (wal *Wal) Dir() string {
430 | 	return path.Dir(wal.path)
431 | }
432 | 
433 | func (wal *Wal) Fid() uint64 {
434 | 	return wal.fid
435 | }
436 | 
437 | func (wal *Wal) Fd() int {
438 | 	return int(wal.fp.Fd())
439 | }
440 | 
441 | // return the current file size
442 | func (wal *Wal) Size() uint64 {
443 | 	return wal.size
444 | }
445 | 
446 | func (wal *Wal) Empty() bool {
447 | 	return wal.size == SuperBlockSize && wal.buf.Len() == 0
448 | }
449 | 
450 | // flush the internal buffer
451 | func (wal *Wal) Flush() error {
452 | 	if wal.buf.Len() == 0 {
453 | 		return nil
454 | 	}
455 | 
456 | 	data := wal.buf.Bytes()
457 | 	defer wal.buf.Reset()
458 | 
459 | 	n, err := wal.fp.Write(data)
460 | 
461 | 	// FIXME: maybe parital write
462 | 	wal.size += uint64(n)
463 | 
464 | 	return err
465 | }
466 | 
467 | // clean the internal buffer
468 | func (wal *Wal) ResetBuffer() {
469 | 	wal.buf.Reset()
470 | }
471 | 
472 | // append data to internal buffer
473 | func (wal *Wal) appendFile(data []byte) error {
474 | 	_, err := wal.buf.Write(data)
475 | 	return err
476 | }
477 | 
478 | // the record offset should include super block, which is the pysical offset of wal file
479 | // but the splitted blocks should exclude the super block. in other words, the file layout:
480 | // |  super block  |  block  |  block  | ... |
481 | // |<-    40B    ->|<- 32K ->|<- 32K ->| ... |
482 | func (wal *Wal) writeOffset(skipSuperBlock bool) uint64 {
483 | 	if !skipSuperBlock {
484 | 		return wal.size + uint64(wal.buf.Len())
485 | 	}
486 | 	return wal.size + uint64(wal.buf.Len()) - SuperBlockSize
487 | }
488 | 
489 | // write one record, and return the start offset of the record in wal file
490 | func (wal *Wal) WriteRecord(record []byte) (uint64, error) {
491 | 	if !wal.Valid() {
492 | 		return 0, ErrWalUnavailable
493 | 	}
494 | 
495 | 	if !wal.Immutable() {
496 | 		return 0, ErrWalFrozen
497 | 	}
498 | 
499 | 	var err error
500 | 	var offset uint64
501 | 	begin := true
502 | 	left := uint64(len(record))
503 | 	padding := [...]byte{0, 0, 0, 0, 0, 0}
504 | 
505 | 	for left > 0 {
506 | 		leftover := BlockSize - (wal.writeOffset(true) % BlockSize)
507 | 		if leftover < RecordHeaderSize {
508 | 			if err = wal.appendFile(padding[:leftover]); err != nil {
509 | 				return 0, err
510 | 			}
511 | 			leftover = BlockSize
512 | 		}
513 | 
514 | 		if begin {
515 | 			offset = wal.writeOffset(false)
516 | 		}
517 | 
518 | 		avail := leftover - RecordHeaderSize
519 | 		fragmentLength := min(left, avail)
520 | 
521 | 		var recordType byte
522 | 		end := (left == fragmentLength)
523 | 		switch {
524 | 		case begin && end:
525 | 			recordType = RecordFull
526 | 		case begin:
527 | 			recordType = RecordFirst
528 | 		case end:
529 | 			recordType = RecordLast
530 | 		default:
531 | 			recordType = RecordMiddle
532 | 		}
533 | 
534 | 		var header [RecordHeaderSize]byte
535 | 		binary.LittleEndian.PutUint32(header[0:], ComputeCRC32(record[:fragmentLength]))
536 | 		binary.LittleEndian.PutUint16(header[4:], uint16(fragmentLength))
537 | 		header[6] = recordType
538 | 
539 | 		if err = wal.appendFile(header[:]); err != nil {
540 | 			return 0, err
541 | 		}
542 | 
543 | 		if err = wal.appendFile(record[:fragmentLength]); err != nil {
544 | 			return 0, err
545 | 		}
546 | 
547 | 		record = record[fragmentLength:]
548 | 		left -= fragmentLength
549 | 		begin = false
550 | 	}
551 | 
552 | 	return offset, nil
553 | }
554 | 
555 | // read one record from specifc offset and size
556 | func (wal *Wal) ReadRecord(offset, size uint64, verifyChecksum bool) (record []byte, err error) {
557 | 	if !wal.Valid() {
558 | 		return nil, ErrWalUnavailable
559 | 	}
560 | 
561 | 	recordSize := WalRecordSize(offset, size)
562 | 	if offset+recordSize > wal.size {
563 | 		return nil, errors.New("read beyond file size")
564 | 	}
565 | 
566 | 	buffer := make([]byte, recordSize)
567 | 	// read all related data using only one disk read operation
568 | 	if err = PreadFull(wal.Fd(), buffer, int64(offset)); err != nil {
569 | 		return
570 | 	}
571 | 
572 | 	return WalParseRecord(size, 0, [][]byte{buffer}, verifyChecksum)
573 | }
574 | 


--------------------------------------------------------------------------------