├── block_reader ├── factory.go ├── factory_legacy.go ├── pread.go ├── interface.go └── iouring.go ├── consts.go ├── .gitignore ├── bench ├── benchmark3 ├── benchmark1 ├── bench_compaction_test.go ├── gc_overhead_test.go ├── bench_disk_usage_test.go └── bench_test.go ├── go.mod ├── wal_rewriter.go ├── .golangci.yml ├── batch.go ├── wal_iterator_test.go ├── db_test.go ├── meta.go ├── manifest_edit_test.go ├── hint_test.go ├── wal_iterator.go ├── manifest_txn.go ├── index_test.go ├── index.go ├── utils.go ├── go.sum ├── manifest_edit.go ├── record_test.go ├── manifest_txn_test.go ├── hint.go ├── deque.go ├── db.go ├── block_cache.go ├── manifest_test.go ├── deque_test.go ├── wal_test.go ├── record.go ├── map_test.go ├── README-CN.md ├── db_impl_test.go ├── README.md ├── compaction_test.go ├── map.go ├── compaction.go ├── manifest.go └── wal.go /block_reader/factory.go: -------------------------------------------------------------------------------- 1 | //go:build io_uring 2 | // +build io_uring 3 | 4 | package block_reader 5 | 6 | func NewDefaultBlockReader(concurrent uint64) (BlockReader, error) { 7 | return NewIOUringBlockReader(concurrent) 8 | } 9 | -------------------------------------------------------------------------------- /block_reader/factory_legacy.go: -------------------------------------------------------------------------------- 1 | //go:build !io_uring 2 | // +build !io_uring 3 | 4 | package block_reader 5 | 6 | func NewDefaultBlockReader(concurrent uint64) (BlockReader, error) { 7 | return NewPreadBlockReader(concurrent) 8 | } 9 | -------------------------------------------------------------------------------- /consts.go: -------------------------------------------------------------------------------- 1 | package bitcask 2 | 3 | const ( 4 | DefaultNsSize = 20 5 | DefaultEtagSize = 20 6 | 7 | // trigger one compaction per 60 second 8 | DefaultCompactionTriggerInterval = 60 9 | 10 | DefaultCheckDiskUsageInterval = 20 11 | 12 | DefaultCompactionPickerRatio = 0.4 13 | 14 | DefaultRecordBufferSize = 64 * 1024 // 64KB 15 | 16 | DefaultLogMaxSize = 20 // 20MB 17 | 18 | DefaultLogFile = "db.log" 19 | ) 20 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # If you prefer the allow list template instead of the deny list, see community template: 2 | # https://github.com/github/gitignore/blob/main/community/Golang/Go.AllowList.gitignore 3 | # 4 | # Binaries for programs and plugins 5 | *.exe 6 | *.exe~ 7 | *.dll 8 | *.so 9 | *.dylib 10 | 11 | # Test binary, built with `go test -c` 12 | *.test 13 | 14 | # Output of the go coverage tool, specifically when used with LiteIDE 15 | *.out 16 | 17 | # Dependency directories (remove the comment below to include it) 18 | # vendor/ 19 | 20 | # Go workspace file 21 | go.work 22 | go.work.sum 23 | 24 | # env file 25 | .env 26 | -------------------------------------------------------------------------------- /block_reader/pread.go: -------------------------------------------------------------------------------- 1 | package block_reader 2 | 3 | import ( 4 | "golang.org/x/sys/unix" 5 | ) 6 | 7 | type PreadBlockReader struct{} 8 | 9 | func NewPreadBlockReader(concurrent uint64) (*PreadBlockReader, error) { 10 | return &PreadBlockReader{}, nil 11 | } 12 | 13 | func (r *PreadBlockReader) NewRequest(fd int, fid, offset uint64, blk []byte) *Request { 14 | return &Request{ 15 | Fd: fd, 16 | Fid: fid, 17 | Off: offset, 18 | Blk: 
blk, 19 | res: 0, 20 | err: nil, 21 | } 22 | } 23 | 24 | func (r *PreadBlockReader) Submit(reqs Requests) error { 25 | for idx, req := range reqs { 26 | n, err := unix.Pread(req.Fd, req.Blk, int64(req.Off)) 27 | reqs[idx].err = err 28 | reqs[idx].res = n 29 | } 30 | 31 | return nil 32 | } 33 | -------------------------------------------------------------------------------- /bench/benchmark3: -------------------------------------------------------------------------------- 1 | go test -bench=GcOverhead -benchtime=300s -count=1 -timeout=30m 2 | goos: linux 3 | goarch: amd64 4 | pkg: github.com/wenzhang-dev/bitcaskDB/bench 5 | cpu: Intel(R) Xeon(R) Gold 5318N CPU @ 2.10GHz 6 | BenchmarkGcOverhead-8 7 | GC pause for startup: gc=1.246146ms 8 | GC pause for test: total=64.350396ms, gc=160.293µs, iter=1 9 | GC pause for startup: gc=719.201µs 10 | GC pause for test: total=105.89436ms, gc=176.146µs, iter=100 11 | GC pause for startup: gc=202.888µs 12 | GC pause for test: total=150.224754ms, gc=63.252µs, iter=10000 13 | GC pause for startup: gc=323.495µs 14 | GC pause for test: total=5.88998191s, gc=1.734889ms, iter=1000000 15 | GC pause for startup: gc=181.486µs 16 | GC pause for test: total=14m31.795948219s, gc=86.133309ms, iter=59247226 17 | 59247226 14743 ns/op 1698 B/op 10 allocs/op 18 | PASS 19 | ok github.com/wenzhang-dev/bitcaskDB/bench 884.982s 20 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/wenzhang-dev/bitcaskDB 2 | 3 | go 1.23.0 4 | 5 | toolchain go1.24.2 6 | 7 | require ( 8 | github.com/gofrs/flock v0.12.1 9 | github.com/iceber/iouring-go v0.0.0 10 | github.com/rs/zerolog v1.34.0 11 | github.com/spaolacci/murmur3 v1.1.0 12 | github.com/stretchr/testify v1.9.0 13 | github.com/vmihailenco/msgpack/v5 v5.4.1 14 | golang.org/x/sys v0.32.0 15 | gopkg.in/natefinch/lumberjack.v2 v2.2.1 16 | ) 17 | 18 | require ( 19 | github.com/davecgh/go-spew v1.1.1 // indirect 20 | github.com/kr/text v0.2.0 // indirect 21 | github.com/mattn/go-colorable v0.1.13 // indirect 22 | github.com/mattn/go-isatty v0.0.19 // indirect 23 | github.com/pmezard/go-difflib v1.0.0 // indirect 24 | github.com/rogpeppe/go-internal v1.14.1 // indirect 25 | github.com/vmihailenco/tagparser/v2 v2.0.0 // indirect 26 | gopkg.in/yaml.v3 v3.0.1 // indirect 27 | ) 28 | 29 | replace github.com/iceber/iouring-go => github.com/royalcat/iouring-go v0.0.0-20240925200811-286062ac1b23 30 | -------------------------------------------------------------------------------- /wal_rewriter.go: -------------------------------------------------------------------------------- 1 | package bitcask 2 | 3 | type WalRewriter struct { 4 | wal *Wal 5 | 6 | bufLen int 7 | threshold int 8 | } 9 | 10 | func NewWalRewriter(wal *Wal, threshold int) *WalRewriter { 11 | if threshold < 4*1024 { 12 | threshold = 4 * 1024 13 | } 14 | 15 | wal.Ref() 16 | return &WalRewriter{ 17 | wal: wal, 18 | bufLen: 0, 19 | threshold: threshold, 20 | } 21 | } 22 | 23 | func (r *WalRewriter) Wal() *Wal { 24 | return r.wal 25 | } 26 | 27 | func (r *WalRewriter) Close() error { 28 | if r.bufLen != 0 { 29 | if err := r.wal.Flush(); err != nil { 30 | return err 31 | } 32 | } 33 | r.wal.Unref() 34 | return nil 35 | } 36 | 37 | func (r *WalRewriter) AppendRecord(record []byte) (off uint64, err error) { 38 | off, err = r.wal.WriteRecord(record) 39 | if err != nil { 40 | return 0, err 41 | } 42 | 43 | r.bufLen += len(record) 44 | if r.bufLen >= 
r.threshold { 45 | err = r.Flush() 46 | } 47 | 48 | return 49 | } 50 | 51 | func (r *WalRewriter) Flush() error { 52 | if r.bufLen != 0 { 53 | if err := r.wal.Flush(); err != nil { 54 | return err 55 | } 56 | r.bufLen = 0 57 | } 58 | return nil 59 | } 60 | -------------------------------------------------------------------------------- /.golangci.yml: -------------------------------------------------------------------------------- 1 | version: "2" 2 | linters: 3 | default: none 4 | enable: 5 | - durationcheck 6 | - forcetypeassert 7 | - gocritic 8 | - gomodguard 9 | - govet 10 | - ineffassign 11 | - misspell 12 | - revive 13 | - staticcheck 14 | - unconvert 15 | - unused 16 | - usetesting 17 | - whitespace 18 | settings: 19 | misspell: 20 | locale: US 21 | staticcheck: 22 | checks: 23 | - all 24 | - -SA1008 25 | - -SA1019 26 | - -SA4000 27 | - -SA9004 28 | - -ST1000 29 | - -ST1005 30 | - -ST1016 31 | - -U1000 32 | exclusions: 33 | generated: lax 34 | rules: 35 | - linters: 36 | - forcetypeassert 37 | path: _test\.go 38 | - path: (.+)\.go$ 39 | text: 'empty-block:' 40 | - path: (.+)\.go$ 41 | text: 'unused-parameter:' 42 | - path: (.+)\.go$ 43 | text: 'dot-imports:' 44 | - path: (.+)\.go$ 45 | text: should have a package comment 46 | - path: (.+)\.go$ 47 | text: error strings should not be capitalized or end with punctuation or a newline 48 | issues: 49 | max-issues-per-linter: 100 50 | max-same-issues: 100 51 | formatters: 52 | enable: 53 | - gofumpt 54 | - goimports 55 | -------------------------------------------------------------------------------- /batch.go: -------------------------------------------------------------------------------- 1 | package bitcask 2 | 3 | type Batch struct { 4 | records []*Record 5 | 6 | // the total number of bytes of serialized records 7 | byteSize int 8 | } 9 | 10 | func NewBatch() *Batch { 11 | return &Batch{} 12 | } 13 | 14 | func (b *Batch) Put(ns, key, val []byte, meta *Meta) { 15 | record := &Record{ 16 | Ns: ns, 17 | Key: key, 18 | Meta: meta, 19 | Value: val, 20 | Deleted: false, 21 | } 22 | b.records = append(b.records, record) 23 | b.byteSize += record.ApproximateSize() 24 | } 25 | 26 | func (b *Batch) Delete(ns, key []byte) { 27 | record := &Record{ 28 | Ns: ns, 29 | Key: key, 30 | // the deletion operation will carry tombstone flag, and store in database 31 | // at the same time, the related index in memory will be removed. so the key will 32 | // not be found. the record with tombstone flag will be removed in compaction 33 | Meta: NewMetaWithTombstone(), 34 | Value: nil, 35 | Deleted: true, 36 | } 37 | b.records = append(b.records, record) 38 | b.byteSize += record.ApproximateSize() 39 | } 40 | 41 | func (b *Batch) Clear() { 42 | b.byteSize = 0 43 | b.records = nil 44 | } 45 | 46 | func (b *Batch) Append(batch *Batch) { 47 | if batch == nil { 48 | return 49 | } 50 | 51 | b.records = append(b.records, batch.records...) 
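// note: only the slice of *Record pointers is copied here; after Append the two
// batches share the same underlying Record values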
52 | b.byteSize += batch.byteSize 53 | } 54 | 55 | func (b *Batch) Size() int { 56 | return len(b.records) 57 | } 58 | 59 | func (b *Batch) ByteSize() int { 60 | return b.byteSize 61 | } 62 | -------------------------------------------------------------------------------- /wal_iterator_test.go: -------------------------------------------------------------------------------- 1 | package bitcask 2 | 3 | import ( 4 | "errors" 5 | "strconv" 6 | "testing" 7 | 8 | "github.com/stretchr/testify/assert" 9 | ) 10 | 11 | func TestWalIterator_Basic(t *testing.T) { 12 | wal := setupWal("test_wal_it_basic.wal", t) 13 | defer wal.Unref() 14 | 15 | for i := 0; i < 1000; i++ { 16 | data := []byte(strconv.Itoa(i)) 17 | _, err := wal.WriteRecord(data) 18 | assert.Nil(t, err) 19 | } 20 | 21 | wal.Flush() 22 | 23 | // iterate 24 | it := NewWalIterator(wal) 25 | defer it.Close() 26 | 27 | itNum := 0 28 | for { 29 | _, readData, err := it.Next() 30 | if errors.Is(err, ErrWalIteratorEOF) { 31 | break 32 | } 33 | assert.Nil(t, err) 34 | assert.Equal(t, readData, []byte(strconv.Itoa(itNum))) 35 | 36 | itNum++ 37 | } 38 | 39 | assert.Equal(t, itNum, 1000) 40 | } 41 | 42 | func TestWalIterator_LargeData(t *testing.T) { 43 | wal := setupWal("test_wal_it_large_data.wal", t) 44 | defer wal.Unref() 45 | 46 | data5KB := GenNKBytes(5) 47 | 48 | // total 5MB = 4KB * 1024 49 | for i := 0; i < 1024; i++ { 50 | _, err := wal.WriteRecord(data5KB) 51 | assert.Nil(t, err) 52 | assert.Nil(t, wal.Flush()) 53 | } 54 | 55 | // iterate 56 | it := NewWalIterator(wal) 57 | defer it.Close() 58 | 59 | itNum := 0 60 | var err error 61 | var readData []byte 62 | for { 63 | _, readData, err = it.Next() 64 | if err != nil { 65 | break 66 | } 67 | assert.Equal(t, data5KB, readData) 68 | 69 | itNum++ 70 | } 71 | 72 | assert.True(t, errors.Is(err, ErrWalIteratorEOF)) 73 | assert.Equal(t, 1024, itNum) 74 | } 75 | -------------------------------------------------------------------------------- /db_test.go: -------------------------------------------------------------------------------- 1 | package bitcask 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/stretchr/testify/assert" 7 | ) 8 | 9 | func TestDB_ParseFilename(t *testing.T) { 10 | hintName := "00001.hint" 11 | mergeName := "00002.merge" 12 | walName := "00003.wal" 13 | manifestName := "MANIFEST-000004" 14 | tmpName := "00005.tmp" 15 | lockName := "LOCK" 16 | currentName := "CURRENT" 17 | unknownName := "test" 18 | 19 | ft, fid, err := ParseFilename(hintName) 20 | assert.Nil(t, err) 21 | assert.Equal(t, ft, HintFileType) 22 | assert.Equal(t, fid, uint64(1)) 23 | 24 | ft, fid, err = ParseFilename(mergeName) 25 | assert.Nil(t, err) 26 | assert.Equal(t, ft, MergeFileType) 27 | assert.Equal(t, fid, uint64(2)) 28 | 29 | ft, fid, err = ParseFilename(walName) 30 | assert.Nil(t, err) 31 | assert.Equal(t, ft, WalFileType) 32 | assert.Equal(t, fid, uint64(3)) 33 | 34 | ft, fid, err = ParseFilename(manifestName) 35 | assert.Nil(t, err) 36 | assert.Equal(t, ft, ManifestFileType) 37 | assert.Equal(t, fid, uint64(4)) 38 | 39 | ft, fid, err = ParseFilename(tmpName) 40 | assert.Nil(t, err) 41 | assert.Equal(t, ft, TmpFileType) 42 | assert.Equal(t, fid, uint64(5)) 43 | 44 | ft, fid, err = ParseFilename(lockName) 45 | assert.Nil(t, err) 46 | assert.Equal(t, ft, LockFileType) 47 | assert.Equal(t, fid, uint64(0)) 48 | 49 | ft, fid, err = ParseFilename(currentName) 50 | assert.Nil(t, err) 51 | assert.Equal(t, ft, CurrentFileType) 52 | assert.Equal(t, fid, uint64(0)) 53 | 54 | ft, fid, err = 
ParseFilename(unknownName) 55 | assert.Nil(t, err) 56 | assert.Equal(t, ft, UnknownFileType) 57 | assert.Equal(t, fid, uint64(0)) 58 | } 59 | -------------------------------------------------------------------------------- /block_reader/interface.go: -------------------------------------------------------------------------------- 1 | package block_reader 2 | 3 | import ( 4 | "errors" 5 | "golang.org/x/sys/unix" 6 | "slices" 7 | ) 8 | 9 | var ErrBlockReaderRequestFailed = errors.New("block reader request failed") 10 | 11 | type Request struct { 12 | Fd int 13 | Fid uint64 14 | Off uint64 15 | Blk []byte 16 | 17 | res int 18 | err error 19 | } 20 | 21 | func (r *Request) Err() error { 22 | if r.err != nil { 23 | return r.err 24 | } 25 | 26 | if r.res < 0 { 27 | return errors.Join(ErrBlockReaderRequestFailed, unix.Errno(-r.res)) 28 | } 29 | return nil 30 | } 31 | 32 | func (r *Request) NBytes() int { 33 | return r.res 34 | } 35 | 36 | type Requests []*Request 37 | 38 | func (r Requests) Sort() { 39 | slices.SortFunc(r, func(a, b *Request) int { 40 | if a.Fid < b.Fid { 41 | return -1 42 | } else if a.Fid > b.Fid { 43 | return 1 44 | } 45 | 46 | if a.Off < b.Off { 47 | return -1 48 | } else if a.Off > b.Off { 49 | return 1 50 | } 51 | 52 | return 0 53 | }) 54 | } 55 | 56 | func (r Requests) BinarySearch(fid, off uint64) (*Request, bool) { 57 | idx, found := slices.BinarySearchFunc(r, &Request{Fid: fid, Off: off}, func(a, b *Request) int { 58 | if a.Fid < b.Fid { 59 | return -1 60 | } else if a.Fid > b.Fid { 61 | return 1 62 | } 63 | 64 | if a.Off < b.Off { 65 | return -1 66 | } else if a.Off > b.Off { 67 | return 1 68 | } 69 | 70 | return 0 71 | }) 72 | 73 | if !found { 74 | return nil, false 75 | } 76 | 77 | return r[idx], true 78 | } 79 | 80 | // thread safe 81 | type BlockReader interface { 82 | NewRequest(fd int, fid, offset uint64, blk []byte) *Request 83 | Submit(reqs Requests) error 84 | } 85 | -------------------------------------------------------------------------------- /meta.go: -------------------------------------------------------------------------------- 1 | package bitcask 2 | 3 | const ( 4 | TombstoneFlagBit = 0 5 | MetaNoExpire = 0 6 | ) 7 | 8 | type Meta struct { 9 | // user specified meta data 10 | AppMeta map[string]string 11 | AppMetaSize int 12 | 13 | // control meta data 14 | Expire uint64 15 | Etag []byte 16 | 17 | // bitmap flag 18 | Flags uint8 19 | } 20 | 21 | func NewMeta(appMeta map[string]string) *Meta { 22 | meta := &Meta{ 23 | AppMeta: nil, 24 | AppMetaSize: 0, 25 | Expire: MetaNoExpire, 26 | Etag: nil, 27 | Flags: 0, 28 | } 29 | 30 | return meta.SetAppMeta(appMeta) 31 | } 32 | 33 | func NewMetaWithTombstone() *Meta { 34 | meta := NewMeta(nil) 35 | return meta.SetTombstone(true) 36 | } 37 | 38 | func (m *Meta) SetAppMeta(appMeta map[string]string) *Meta { 39 | // FIXME: insufficient 40 | size := 0 41 | for k, v := range appMeta { 42 | size += len(k) + len(v) 43 | } 44 | 45 | m.AppMeta = appMeta 46 | m.AppMetaSize = size 47 | 48 | return m 49 | } 50 | 51 | func (m *Meta) SetExpire(expire uint64) *Meta { 52 | m.Expire = expire 53 | return m 54 | } 55 | 56 | func (m *Meta) SetEtag(etag []byte) *Meta { 57 | m.Etag = etag 58 | return m 59 | } 60 | 61 | func (m *Meta) SetTombstone(enable bool) *Meta { 62 | if enable { 63 | m.Flags |= (uint8)(1 << TombstoneFlagBit) 64 | } else { 65 | m.Flags &= ^(uint8)(1 << TombstoneFlagBit) 66 | } 67 | return m 68 | } 69 | 70 | func (m *Meta) AppMetadata() map[string]string { 71 | return m.AppMeta 72 | } 73 | 74 | func (m *Meta) 
IsTombstone() bool { 75 | return m.Flags&(1< int(r.concurrent) { 49 | return ErrBlockReaderConcurrent 50 | } 51 | 52 | prepReqsPtr := r.reqPool.Get().(*[]iouring.PrepRequest) 53 | defer r.reqPool.Put(prepReqsPtr) 54 | 55 | prepReqs := *prepReqsPtr 56 | 57 | for idx := range reqs { 58 | prepReqs[idx] = iouring.Pread(reqs[idx].Fd, reqs[idx].Blk, reqs[idx].Off) 59 | } 60 | 61 | rset, err := r.iour.SubmitRequests(prepReqs[:len(reqs)], nil) 62 | if err != nil { 63 | return err 64 | } 65 | 66 | // wait for completion 67 | <-rset.Done() 68 | 69 | // the order of io_uring Request is the same to `reqs` arguments 70 | for idx, req := range rset.Requests() { 71 | res, _ := req.GetRes() 72 | reqs[idx].res = res 73 | } 74 | 75 | return nil 76 | } 77 | -------------------------------------------------------------------------------- /bench/benchmark1: -------------------------------------------------------------------------------- 1 | go test -bench=PutGet -benchtime=60s -count=3 -timeout=50m 2 | goos: linux 3 | goarch: amd64 4 | pkg: github.com/wenzhang-dev/bitcaskDB/bench 5 | cpu: Intel(R) Xeon(R) Gold 5318N CPU @ 2.10GHz 6 | BenchmarkPutGet/put4K-8 5331782 25259 ns/op 11795 B/op 21 allocs/op 7 | BenchmarkPutGet/put4K-8 5130870 25417 ns/op 11767 B/op 21 allocs/op 8 | BenchmarkPutGet/put4K-8 4898403 26676 ns/op 11742 B/op 21 allocs/op 9 | BenchmarkPutGet/batchPut4K-8 10548615 15340 ns/op 1695 B/op 11 allocs/op 10 | BenchmarkPutGet/batchPut4K-8 9220388 14278 ns/op 1694 B/op 11 allocs/op 11 | BenchmarkPutGet/batchPut4K-8 10363459 15019 ns/op 1686 B/op 11 allocs/op 12 | BenchmarkPutGet/get4K-8 8812342 8076 ns/op 10119 B/op 10 allocs/op 13 | BenchmarkPutGet/get4K-8 7963098 7952 ns/op 10119 B/op 10 allocs/op 14 | BenchmarkPutGet/get4K-8 8480240 7997 ns/op 10119 B/op 10 allocs/op 15 | BenchmarkPutGet/concurrentGet4K-8 17233309 4427 ns/op 10044 B/op 7 allocs/op 16 | BenchmarkPutGet/concurrentGet4K-8 26745726 3681 ns/op 10044 B/op 7 allocs/op 17 | BenchmarkPutGet/concurrentGet4K-8 29305041 3654 ns/op 10044 B/op 7 allocs/op 18 | BenchmarkPutGet/concurrentPut4K-8 4558645 19829 ns/op 8340 B/op 18 allocs/op 19 | BenchmarkPutGet/concurrentPut4K-8 4433334 18664 ns/op 10031 B/op 18 allocs/op 20 | BenchmarkPutGet/concurrentPut4K-8 4366149 17031 ns/op 8175 B/op 17 allocs/op 21 | BenchmarkPutGet/concurrentBatchPut4K-8 9443377 12520 ns/op 1527 B/op 9 allocs/op 22 | BenchmarkPutGet/concurrentBatchPut4K-8 11338162 12429 ns/op 1517 B/op 9 allocs/op 23 | BenchmarkPutGet/concurrentBatchPut4K-8 11394081 12101 ns/op 1510 B/op 9 allocs/op 24 | PASS 25 | ok github.com/wenzhang-dev/bitcaskDB/bench 2310.401s 26 | -------------------------------------------------------------------------------- /bench/bench_compaction_test.go: -------------------------------------------------------------------------------- 1 | package bench 2 | 3 | import ( 4 | "fmt" 5 | "os" 6 | "testing" 7 | 8 | "github.com/stretchr/testify/assert" 9 | "github.com/wenzhang-dev/bitcaskDB" 10 | ) 11 | 12 | func newCompactionDB(b *testing.B) { 13 | dir = "./bitcaskDB" 14 | _ = os.RemoveAll(dir) 15 | _ = os.MkdirAll(dir, os.ModePerm) 16 | 17 | opts := &bitcask.Options{ 18 | Dir: dir, 19 | WalMaxSize: 1024 * 1024 * 1024, // 1GB 20 | ManifestMaxSize: 10 * 1024 * 1024, // 10MB 21 | IndexCapacity: 10000000, // 10 million 22 | IndexLimited: 8000000, 23 | IndexEvictionPoolCapacity: 64, 24 | IndexSampleKeys: 5, 25 | CompactionPicker: nil, // default picker 26 | CompactionFilter: nil, // default filter 27 | NsSize: 0, 28 | EtagSize: 0, 29 | DisableCompaction: false, 30 | 
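// the package default is DefaultCompactionTriggerInterval (60s, per consts.go);
// the benchmark shortens it so compactions actually fire during the run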
CompactionTriggerInterval: 10, // 10 seconds 31 | } 32 | 33 | var err error 34 | db, err = bitcask.NewDB(opts) 35 | assert.Nil(b, err) 36 | } 37 | 38 | func BenchmarkCompaction(b *testing.B) { 39 | newCompactionDB(b) 40 | defer db.Close() 41 | 42 | b.Run("compaction", func(b *testing.B) { 43 | benchmarkCompaction(b, db) 44 | }) 45 | } 46 | 47 | func benchmarkCompaction(b *testing.B, db bitcask.DB) { 48 | threshold := 10000000 49 | meta := bitcask.NewMeta(nil) 50 | value4KB := bitcask.GenNKBytes(4) 51 | opts := &bitcask.WriteOptions{} 52 | batchSize := 50 53 | 54 | newKey := func(hint, threshold int) []byte { 55 | hint %= threshold 56 | key := fmt.Sprintf("key=%10d,%10d", hint, hint) // 25 bytes 57 | return []byte(key) 58 | } 59 | 60 | // repeat write 10 million keys 61 | b.RunParallel(func(pb *testing.PB) { 62 | iteration := 1 63 | batch := bitcask.NewBatch() 64 | for pb.Next() { 65 | batch.Put(nil, newKey(iteration, threshold), value4KB, meta) 66 | 67 | if iteration%batchSize == 0 { 68 | err := db.Write(batch, opts) 69 | assert.Nil(b, err) 70 | batch.Clear() 71 | } 72 | 73 | iteration++ 74 | } 75 | }) 76 | } 77 | -------------------------------------------------------------------------------- /hint_test.go: -------------------------------------------------------------------------------- 1 | package bitcask 2 | 3 | import ( 4 | "os" 5 | "strconv" 6 | "testing" 7 | "time" 8 | 9 | "github.com/stretchr/testify/assert" 10 | ) 11 | 12 | func TestHintEncodeAndDecode(t *testing.T) { 13 | ns := sha1Bytes("namespace") 14 | hintRecord := &HintRecord{ 15 | ns: ns[:], 16 | key: []byte("test-key"), 17 | fid: 2, 18 | off: 123, 19 | size: 100, 20 | } 21 | 22 | bytes, err := hintRecord.Encode() 23 | assert.Nil(t, err) 24 | 25 | decodeRecord := &HintRecord{} 26 | err = decodeRecord.Decode(bytes) 27 | assert.Nil(t, err) 28 | 29 | assert.Equal(t, hintRecord, decodeRecord) 30 | } 31 | 32 | func TestHint_NewHintByWal(t *testing.T) { 33 | wal := setupWal("new_hint_by_wal", t) 34 | defer wal.Unref() 35 | 36 | ns1 := sha1Bytes("namespace") 37 | backStore := make([]byte, DefaultRecordBufferSize) 38 | baseTime := uint64(time.Now().Unix()) 39 | record := &Record{ 40 | Ns: ns1[:], 41 | Key: []byte("test-key"), 42 | Meta: NewMeta(nil), 43 | Value: []byte("hello world"), 44 | } 45 | 46 | for i := 0; i < 1000; i++ { 47 | key := []byte("test-key" + strconv.Itoa(i)) 48 | record.Key = key 49 | 50 | bytes, err := record.Encode(backStore, baseTime) 51 | assert.Nil(t, err) 52 | 53 | _, err = wal.WriteRecord(bytes) 54 | assert.Nil(t, err) 55 | } 56 | 57 | wal.Flush() 58 | 59 | // test hint 60 | fileSize, err := NewHintByWal(wal) 61 | assert.Nil(t, err) 62 | assert.True(t, fileSize > 0) 63 | 64 | hintPath := HintPath(wal.Dir(), wal.Fid()) 65 | hint, err := LoadWal(hintPath, wal.Fid()) 66 | assert.Nil(t, err) 67 | defer hint.Close() 68 | defer os.Remove(hintPath) 69 | 70 | itNum := 0 71 | err = IterateHint(hint, func(hintRecord *HintRecord) error { 72 | assert.Equal(t, hintRecord.ns, ns1[:]) 73 | assert.Equal(t, hintRecord.key, []byte("test-key"+strconv.Itoa(itNum))) 74 | 75 | recordBytes, err := wal.ReadRecord(hintRecord.off, hintRecord.size, true) 76 | assert.Nil(t, err) 77 | 78 | readRecord, err := RecordFromBytes(recordBytes, wal.BaseTime()) 79 | assert.Nil(t, err) 80 | assert.Equal(t, readRecord.Ns, hintRecord.ns) 81 | assert.Equal(t, readRecord.Key, hintRecord.key) 82 | assert.Equal(t, readRecord.Value, record.Value) 83 | 84 | itNum++ 85 | return nil 86 | }) 87 | assert.Nil(t, err) 88 | assert.Equal(t, itNum, 1000) 89 | } 
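The hint file records only (namespace, key, fid, offset, size) tuples, so the in-memory index can be rebuilt at startup without re-reading record values out of the wal. Below is a minimal sketch of that recovery path, written in package bitcask like the test above; the helper name rebuildIndexFromHint is an illustrative assumption, not the database's actual startup code.

func rebuildIndexFromHint(dir string, fid uint64, index *Index) error {
	// a hint file is itself a wal, so it can be opened with LoadWal
	hint, err := LoadWal(HintPath(dir, fid), fid)
	if err != nil {
		return err
	}
	defer hint.Close()

	// replay every hint record: only the value's location goes into the index;
	// the value bytes stay on disk until a Get actually needs them
	return IterateHint(hint, func(r *HintRecord) error {
		return index.Put(r.ns, r.key, r.fid, r.off, r.size, nil)
	})
}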
90 | -------------------------------------------------------------------------------- /bench/gc_overhead_test.go: -------------------------------------------------------------------------------- 1 | package bench 2 | 3 | import ( 4 | "fmt" 5 | "os" 6 | "runtime" 7 | "runtime/debug" 8 | "sync/atomic" 9 | "testing" 10 | "time" 11 | 12 | "github.com/stretchr/testify/assert" 13 | "github.com/wenzhang-dev/bitcaskDB" 14 | ) 15 | 16 | func newGcOverheadDB(b *testing.B) { 17 | dir = "./bitcaskDB" 18 | _ = os.RemoveAll(dir) 19 | _ = os.MkdirAll(dir, os.ModePerm) 20 | 21 | opts := &bitcask.Options{ 22 | Dir: dir, 23 | WalMaxSize: 1024 * 1024 * 1024, // 1GB 24 | ManifestMaxSize: 10 * 1024 * 1024, // 10MB 25 | IndexCapacity: 10000000, // 10 million 26 | IndexLimited: 8000000, 27 | IndexEvictionPoolCapacity: 64, 28 | IndexSampleKeys: 5, 29 | CompactionPicker: nil, // default picker 30 | CompactionFilter: nil, // default filter 31 | NsSize: bitcask.DefaultNsSize, 32 | EtagSize: bitcask.DefaultEtagSize, 33 | DisableCompaction: false, 34 | DiskUsageLimited: 10 * 1024 * 1024 * 1024, // 10GB 35 | } 36 | 37 | var err error 38 | db, err = bitcask.NewDB(opts) 39 | assert.Nil(b, err) 40 | } 41 | 42 | var previousPause time.Duration 43 | 44 | func gcPause() time.Duration { 45 | runtime.GC() 46 | 47 | var stats debug.GCStats 48 | debug.ReadGCStats(&stats) 49 | 50 | pause := stats.PauseTotal - previousPause 51 | previousPause = stats.PauseTotal 52 | 53 | return pause 54 | } 55 | 56 | func BenchmarkGcOverhead(b *testing.B) { 57 | newGcOverheadDB(b) 58 | defer db.Close() 59 | 60 | meta := bitcask.NewMeta(nil) 61 | opts := &bitcask.WriteOptions{} 62 | 63 | b.ResetTimer() 64 | b.ReportAllocs() 65 | 66 | startTime := time.Now() 67 | fmt.Printf("GC pause for startup: gc=%s\n", gcPause()) 68 | 69 | var totalIteration int64 70 | b.RunParallel(func(pb *testing.PB) { 71 | var err error 72 | iteration := 0 73 | batch := bitcask.NewBatch() 74 | for pb.Next() { 75 | batch.Put(ns[:], genTestKey(iteration), bin4KB, meta) 76 | 77 | if iteration%BatchSize == 0 { 78 | err = db.Write(batch, opts) 79 | assert.Nil(b, err) 80 | batch.Clear() 81 | } 82 | 83 | iteration++ 84 | } 85 | 86 | atomic.AddInt64(&totalIteration, int64(iteration)) 87 | }) 88 | 89 | diff := time.Since(startTime) 90 | fmt.Printf("GC pause for test: total=%s, gc=%s, iter=%d\n", diff, gcPause(), totalIteration) 91 | } 92 | -------------------------------------------------------------------------------- /wal_iterator.go: -------------------------------------------------------------------------------- 1 | package bitcask 2 | 3 | import ( 4 | "encoding/binary" 5 | ) 6 | 7 | type WalIterator struct { 8 | wal *Wal 9 | fileOff int 10 | 11 | bufOff int 12 | bufSize int 13 | buf []byte 14 | 15 | err error 16 | } 17 | 18 | func NewWalIterator(wal *Wal) *WalIterator { 19 | wal.Ref() 20 | 21 | return &WalIterator{ 22 | wal: wal, 23 | fileOff: int(wal.offset), 24 | bufOff: 0, 25 | bufSize: 0, 26 | buf: make([]byte, BlockSize), 27 | } 28 | } 29 | 30 | func (i *WalIterator) Close() { 31 | i.wal.Unref() 32 | } 33 | 34 | func (i *WalIterator) fd() int { 35 | return int(i.wal.fp.Fd()) 36 | } 37 | 38 | // try to read a block unless there is less than one block left 39 | // the Next method will return the start offset of data in wal file, and the data itself 40 | func (i *WalIterator) Next() (uint64, []byte, error) { 41 | var off uint64 42 | var record []byte 43 | 44 | for i.err == nil { 45 | if i.bufOff+RecordHeaderSize > i.bufSize { 46 | i.fileOff += i.bufSize 47 | 48 | // skip the 
padding 49 | i.bufSize = min(BlockSize, int(i.wal.Size())-i.fileOff) 50 | if i.bufSize == 0 { 51 | i.err = ErrWalIteratorEOF 52 | return 0, nil, i.err 53 | } 54 | 55 | if i.err = PreadFull(i.fd(), i.buf[:i.bufSize], int64(i.fileOff)); i.err != nil { 56 | return 0, nil, i.err 57 | } 58 | 59 | i.bufOff = 0 60 | } 61 | 62 | header := i.buf[i.bufOff : i.bufOff+RecordHeaderSize] 63 | i.bufOff += RecordHeaderSize 64 | 65 | crc := binary.LittleEndian.Uint32(header[0:]) 66 | length := int(binary.LittleEndian.Uint16(header[4:])) 67 | recordType := header[6] 68 | 69 | // record the file offset 70 | if len(record) == 0 { 71 | off = uint64(i.fileOff + i.bufOff) 72 | } 73 | 74 | // avoid the corrupted data 75 | length = min(length, i.bufSize-i.bufOff) 76 | data := i.buf[i.bufOff : i.bufOff+length] 77 | i.bufOff += length 78 | 79 | if ComputeCRC32(data) != crc { 80 | i.err = ErrWalMismatchCRC 81 | return 0, nil, i.err 82 | } 83 | 84 | switch recordType { 85 | case RecordFull: 86 | // reference the backing store of slice 87 | return off, data, nil 88 | case RecordFirst, RecordMiddle: 89 | // Continue reading next chunk 90 | record = append(record, data...) 91 | case RecordLast: 92 | record = append(record, data...) 93 | return off, record, nil 94 | default: 95 | i.err = ErrWalUnknownRecordType 96 | } 97 | } 98 | 99 | return 0, nil, i.err 100 | } 101 | 102 | // the functionality is same to Next, except that the offset does not contain wal header 103 | // it's useful for the hint generation 104 | func (i *WalIterator) NextWithoutHeaderOffset() (uint64, []byte, error) { 105 | off, data, err := i.Next() 106 | if err != nil { 107 | return 0, nil, err 108 | } 109 | 110 | off -= RecordHeaderSize 111 | return off, data, nil 112 | } 113 | -------------------------------------------------------------------------------- /manifest_txn.go: -------------------------------------------------------------------------------- 1 | package bitcask 2 | 3 | import ( 4 | "errors" 5 | "sync" 6 | ) 7 | 8 | var ( 9 | ErrAbortedManifestTxn = errors.New("aborted manifest txn") 10 | ErrCommittedManfiestTxn = errors.New("committed manifest txn") 11 | ) 12 | 13 | // this is a rough implementation of manifest transaction. its main purpose is to make the applied 14 | // manifest edit visible to other operations. the pending edit of transaction will be persisted only 15 | // after the transaction is committed. 16 | // 17 | // the design is rough because the deleted wals in the applied manifest edits may also be visible to 18 | // others. also, only one running manifest transaction is supported currently. 
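//
// a hypothetical usage sketch of the flow described above (the edit values are
// assumptions for illustration; the methods are the ones defined below):
//
//	txn, _ := manifest.NewTxn()
//	defer txn.Abort()             // no-op once Commit has succeeded
//	txn.Apply(editAddingNewWals)  // applied edits become visible to readers immediately
//	return txn.Commit(finalEdit)  // only here is the pending edit persisted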
19 | // 20 | // nevertheless, this design also works well, and avoid transitional design 21 | type ManifestTxn struct { 22 | manifest *Manifest 23 | pendingEdit *ManifestEdit 24 | 25 | committed bool 26 | aborted bool 27 | 28 | mu sync.RWMutex 29 | } 30 | 31 | func NewManifestTxn(manifest *Manifest) *ManifestTxn { 32 | return &ManifestTxn{ 33 | aborted: false, 34 | committed: false, 35 | manifest: manifest, 36 | pendingEdit: NewManifestEdit(), 37 | } 38 | } 39 | 40 | func (txn *ManifestTxn) IsDone() bool { 41 | txn.mu.RLock() 42 | defer txn.mu.RUnlock() 43 | 44 | return txn.aborted || txn.committed 45 | } 46 | 47 | func (txn *ManifestTxn) toWalLocked(fid uint64) *Wal { 48 | for idx := range txn.pendingEdit.addFiles { 49 | if txn.pendingEdit.addFiles[idx].fid == fid { 50 | return txn.pendingEdit.addFiles[idx].wal 51 | } 52 | } 53 | 54 | return nil 55 | } 56 | 57 | func (txn *ManifestTxn) ToWal(fid uint64) *Wal { 58 | txn.mu.RLock() 59 | defer txn.mu.RUnlock() 60 | 61 | return txn.toWalLocked(fid) 62 | } 63 | 64 | func (txn *ManifestTxn) ToWalWithRef(fid uint64) *Wal { 65 | txn.mu.RLock() 66 | defer txn.mu.RUnlock() 67 | 68 | if wal := txn.toWalLocked(fid); wal != nil { 69 | wal.Ref() 70 | return wal 71 | } 72 | 73 | return nil 74 | } 75 | 76 | func (txn *ManifestTxn) NextFid() uint64 { 77 | txn.mu.RLock() 78 | defer txn.mu.RUnlock() 79 | 80 | return txn.pendingEdit.nextFid 81 | } 82 | 83 | func (txn *ManifestTxn) Apply(edit *ManifestEdit) { 84 | if edit == nil { 85 | return 86 | } 87 | 88 | txn.mu.Lock() 89 | defer txn.mu.Unlock() 90 | 91 | txn.pendingEdit.Merge(edit) 92 | } 93 | 94 | func (txn *ManifestTxn) Commit(edit *ManifestEdit) error { 95 | txn.mu.Lock() 96 | defer txn.mu.Unlock() 97 | 98 | if txn.committed { 99 | return ErrCommittedManfiestTxn 100 | } 101 | 102 | if txn.aborted { 103 | return ErrAbortedManifestTxn 104 | } 105 | 106 | if edit != nil { 107 | txn.pendingEdit.Merge(edit) 108 | } 109 | 110 | if err := txn.manifest.LogAndApply(txn.pendingEdit); err != nil { 111 | txn.aborted = true 112 | return err 113 | } 114 | 115 | txn.committed = true 116 | return nil 117 | } 118 | 119 | func (txn *ManifestTxn) Abort() { 120 | txn.mu.Lock() 121 | defer txn.mu.Unlock() 122 | 123 | if txn.committed { 124 | return 125 | } 126 | 127 | txn.aborted = true 128 | } 129 | -------------------------------------------------------------------------------- /index_test.go: -------------------------------------------------------------------------------- 1 | package bitcask 2 | 3 | import ( 4 | "errors" 5 | "math/rand" 6 | "strconv" 7 | "testing" 8 | "time" 9 | 10 | "github.com/stretchr/testify/assert" 11 | ) 12 | 13 | type mockIndexHelper struct{} 14 | 15 | func (mock *mockIndexHelper) Rand(upper uint64) uint64 { 16 | return uint64(rand.Int63n(int64(upper))) 17 | } 18 | 19 | func (mock *mockIndexHelper) WallTime() time.Time { 20 | return time.Now() 21 | } 22 | 23 | func setupIndex(t *testing.T) *Index { 24 | index, err := NewIndex(&IndexOptions{ 25 | Capacity: 1000, 26 | Limited: 800, 27 | EvictionPoolCapacity: 32, 28 | SampleKeys: 5, 29 | Helper: &mockIndexHelper{}, 30 | }) 31 | 32 | assert.Nil(t, err) 33 | return index 34 | } 35 | 36 | func TestIndexBasicOperations(t *testing.T) { 37 | index := setupIndex(t) 38 | 39 | ns1 := []byte("ns1") 40 | key1, key2 := []byte("key1"), []byte("key2") 41 | 42 | // Insert values 43 | err := index.Put(ns1, key1, 1, 100, 100, nil) 44 | assert.NoError(t, err) 45 | 46 | err = index.Put(ns1, key2, 2, 200, 100, nil) 47 | assert.NoError(t, err) 48 | 49 | // Retrieve 
values 50 | fid, off, _, err := index.Get(ns1, key1) 51 | assert.NoError(t, err) 52 | assert.Equal(t, uint64(1), fid) 53 | assert.Equal(t, uint64(100), off) 54 | 55 | fid, off, _, err = index.Get(ns1, key2) 56 | assert.NoError(t, err) 57 | assert.Equal(t, uint64(2), fid) 58 | assert.Equal(t, uint64(200), off) 59 | 60 | // Update an existing key 61 | err = index.Put(ns1, key1, 3, 300, 100, nil) 62 | assert.NoError(t, err) 63 | 64 | fid, off, _, err = index.Get(ns1, key1) 65 | assert.NoError(t, err) 66 | assert.Equal(t, uint64(3), fid) 67 | assert.Equal(t, uint64(300), off) 68 | } 69 | 70 | func TestIndexDeleteOperations(t *testing.T) { 71 | index := setupIndex(t) 72 | 73 | ns1 := []byte("ns1") 74 | key1, key2 := []byte("key1"), []byte("key2") 75 | 76 | // Insert and delete key 77 | err := index.Put(ns1, key1, 1, 100, 100, nil) 78 | assert.NoError(t, err) 79 | 80 | err = index.Delete(ns1, key1, nil) 81 | assert.NoError(t, err) 82 | 83 | _, _, _, err = index.Get(ns1, key1) 84 | assert.Error(t, err) // Should return error since key is deleted 85 | 86 | // Soft delete (overwrite with invalid offset) 87 | err = index.Put(ns1, key2, 2, 200, 100, nil) 88 | assert.NoError(t, err) 89 | 90 | err = index.SoftDelete(ns1, key2, nil) 91 | assert.NoError(t, err) 92 | 93 | _, _, _, err = index.Get(ns1, key2) 94 | assert.NotNil(t, err) 95 | assert.True(t, errors.Is(err, ErrKeySoftDeleted)) 96 | } 97 | 98 | func TestIndexEviction(t *testing.T) { 99 | index := setupIndex(t) 100 | 101 | ns1 := []byte("ns1") 102 | totalFreeBytes := uint64(0) 103 | 104 | // the Limited is 800, but 1 million keys has been written 105 | // the value size is 100 bytes, so the free bytes should equal to 100 * N 106 | for i := 1; i <= 1000000; i++ { 107 | key := []byte("key" + strconv.Itoa(i)) 108 | stat := &WriteStat{} 109 | err := index.Put(ns1, key, 1, uint64(i*100), 100, stat) 110 | assert.Nil(t, err) 111 | 112 | totalFreeBytes += stat.FreeBytes 113 | } 114 | 115 | assert.Equal(t, totalFreeBytes, uint64(100*(1000000-800))) 116 | } 117 | -------------------------------------------------------------------------------- /index.go: -------------------------------------------------------------------------------- 1 | package bitcask 2 | 3 | import ( 4 | "bytes" 5 | "sync" 6 | "time" 7 | 8 | "github.com/spaolacci/murmur3" 9 | ) 10 | 11 | type IndexOperator struct { 12 | helper MapOperatorBase 13 | } 14 | 15 | func (optr *IndexOperator) Hash(key *[]byte) uint64 { 16 | hasher := murmur3.New64() 17 | hasher.Write(*key) 18 | return hasher.Sum64() 19 | } 20 | 21 | func (optr *IndexOperator) Equals(lhs, rhs *[]byte) bool { 22 | return bytes.Equal(*lhs, *rhs) 23 | } 24 | 25 | func (optr *IndexOperator) Rand(upper uint64) uint64 { 26 | return optr.helper.Rand(upper) 27 | } 28 | 29 | func (optr *IndexOperator) WallTime() time.Time { 30 | return optr.helper.WallTime() 31 | } 32 | 33 | type IndexValue struct { 34 | fid uint64 35 | valueOff uint64 36 | valueSize uint64 37 | } 38 | 39 | type Index struct { 40 | ivPool sync.Pool 41 | maps *ShardMap[[]byte, IndexValue] 42 | } 43 | 44 | type IndexOptions struct { 45 | Capacity uint64 46 | Limited uint64 47 | EvictionPoolCapacity uint64 48 | SampleKeys uint64 49 | 50 | Helper MapOperatorBase 51 | } 52 | 53 | // TODO: batch optimization 54 | func NewIndex(opts *IndexOptions) (*Index, error) { 55 | mapOpts := &MapOptions{ 56 | Capacity: opts.Capacity, 57 | Limited: opts.Limited, 58 | EvictionPoolCapacity: opts.EvictionPoolCapacity, 59 | SampleKeys: opts.SampleKeys, 60 | } 61 | 62 | optr := &IndexOperator{ 
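// hashing (murmur3) and key comparison are fixed by IndexOperator itself; only
// the clock and RNG come from the caller-supplied helper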
63 | helper: opts.Helper, 64 | } 65 | 66 | maps, err := NewShardMap[[]byte, IndexValue](optr, mapOpts) 67 | if err != nil { 68 | return nil, err 69 | } 70 | 71 | return &Index{ 72 | maps: maps, 73 | ivPool: sync.Pool{ 74 | New: func() any { 75 | return &IndexValue{} 76 | }, 77 | }, 78 | }, nil 79 | } 80 | 81 | func (i *Index) Get(ns, key []byte) (fid uint64, off uint64, sz uint64, err error) { 82 | var val *IndexValue 83 | mergedKey := MergedKey(ns, key) 84 | 85 | if val, err = i.maps.Get(&mergedKey); err != nil { 86 | return 87 | } 88 | 89 | fid = val.fid 90 | off = val.valueOff 91 | sz = val.valueSize 92 | 93 | if off == 0 { // invalid offset 94 | err = ErrKeySoftDeleted 95 | } 96 | 97 | return 98 | } 99 | 100 | type WriteStat struct { 101 | // how much disk space is freed by this write 102 | FreeBytes uint64 103 | 104 | // which wal is affected 105 | FreeWalFid uint64 106 | } 107 | 108 | func (i *Index) Delete(ns, key []byte, stat *WriteStat) error { 109 | mergedKey := MergedKey(ns, key) 110 | old, err := i.maps.Delete(&mergedKey) 111 | if err != nil { 112 | return err 113 | } 114 | 115 | if stat != nil && old != nil { 116 | stat.FreeBytes = old.valueSize 117 | stat.FreeWalFid = old.fid 118 | 119 | i.ivPool.Put(old) 120 | } 121 | 122 | return nil 123 | } 124 | 125 | func (i *Index) SoftDelete(ns, key []byte, stat *WriteStat) error { 126 | mergedKey := MergedKey(ns, key) 127 | old, err := i.maps.Set(&mergedKey, &IndexValue{ 128 | valueOff: 0, // invalid offset 129 | }) 130 | if err != nil { 131 | return err 132 | } 133 | 134 | if stat != nil && old != nil { 135 | stat.FreeBytes = old.valueSize 136 | stat.FreeWalFid = old.fid 137 | 138 | i.ivPool.Put(old) 139 | } 140 | 141 | return nil 142 | } 143 | 144 | func (i *Index) Put(ns, key []byte, fid uint64, off uint64, sz uint64, stat *WriteStat) error { 145 | mergedKey := MergedKey(ns, key) 146 | 147 | iv, _ := i.ivPool.Get().(*IndexValue) 148 | iv.fid = fid 149 | iv.valueOff = off 150 | iv.valueSize = sz 151 | 152 | old, err := i.maps.Set(&mergedKey, iv) 153 | if err != nil { 154 | return err 155 | } 156 | 157 | if stat != nil && old != nil { 158 | stat.FreeBytes = old.valueSize 159 | stat.FreeWalFid = old.fid 160 | 161 | i.ivPool.Put(old) 162 | } 163 | 164 | return nil 165 | } 166 | 167 | func (i *Index) Capacity() uint64 { 168 | return i.maps.Capacity() 169 | } 170 | -------------------------------------------------------------------------------- /utils.go: -------------------------------------------------------------------------------- 1 | package bitcask 2 | 3 | import ( 4 | "crypto/sha1" 5 | "encoding/binary" 6 | "hash/crc32" 7 | "io" 8 | "os" 9 | "path/filepath" 10 | "strconv" 11 | "strings" 12 | 13 | "golang.org/x/sys/unix" 14 | ) 15 | 16 | func PathExists(path string) bool { 17 | _, err := os.Stat(path) 18 | if err != nil && os.IsNotExist(err) { 19 | return false 20 | } 21 | return true 22 | } 23 | 24 | func ComputeCRC32(data []byte) uint32 { 25 | const castagnoliPoly = 0x82f63b78 26 | table := crc32.MakeTable(castagnoliPoly) 27 | checksum := crc32.Checksum(data, table) 28 | return (checksum>>15 | checksum<<17) + 0xa282ead8 29 | } 30 | 31 | // pread does not modify the file pointer, so it has no effect on append write 32 | func PreadFull(fd int, buf []byte, offset int64) error { 33 | totalRead, expectRead := 0, len(buf) 34 | for totalRead < expectRead { 35 | // pread syscall try to read data with the specific buffer length 36 | n, err := unix.Pread(fd, buf[totalRead:], offset+int64(totalRead)) 37 | if err != nil { 38 | if err == 
io.EOF { 39 | break 40 | } 41 | return err 42 | } 43 | 44 | totalRead += n 45 | } 46 | 47 | return nil 48 | } 49 | 50 | // return 0, 0 for all exceptions 51 | func DecodeUvarint(data []byte) (uint64, int) { 52 | v, size := binary.Uvarint(data) 53 | if size <= 0 { 54 | return 0, 0 55 | } 56 | return v, size 57 | } 58 | 59 | type Runners struct { 60 | functors []func() 61 | committed bool 62 | } 63 | 64 | func NewRunner() *Runners { 65 | return &Runners{ 66 | committed: true, 67 | } 68 | } 69 | 70 | func NewReverseRunner() *Runners { 71 | return &Runners{ 72 | committed: false, 73 | } 74 | } 75 | 76 | func (r *Runners) Post(f func()) { 77 | r.functors = append(r.functors, f) 78 | } 79 | 80 | func (r *Runners) Do() { 81 | if !r.committed { 82 | return 83 | } 84 | 85 | for idx := range r.functors { 86 | r.functors[idx]() 87 | } 88 | } 89 | 90 | func (r *Runners) Rollback() { 91 | r.committed = false 92 | } 93 | 94 | func (r *Runners) Commit() { 95 | r.committed = true 96 | } 97 | 98 | func ParseFilename(name string) (fileType int, fid uint64, err error) { 99 | ext := filepath.Ext(name) 100 | 101 | switch ext { 102 | case WalFileSuffix: 103 | fid, err := strconv.Atoi(name[:len(name)-len(ext)]) 104 | return WalFileType, uint64(fid), err 105 | case HintFileSuffix: 106 | fid, err := strconv.Atoi(name[:len(name)-len(ext)]) 107 | return HintFileType, uint64(fid), err 108 | case MergeFileSuffix: 109 | fid, err := strconv.Atoi(name[:len(name)-len(ext)]) 110 | return MergeFileType, uint64(fid), err 111 | case TmpFileSuffix: 112 | fid, err := strconv.Atoi(name[:len(name)-len(ext)]) 113 | return TmpFileType, uint64(fid), err 114 | } 115 | 116 | if name == CurrentFile { 117 | return CurrentFileType, 0, nil 118 | } 119 | 120 | if name == LockFile { 121 | return LockFileType, 0, nil 122 | } 123 | 124 | if strings.HasPrefix(name, ManifestFilePrefix) { 125 | fid, err := strconv.Atoi(name[len(ManifestFilePrefix)+1:]) 126 | return ManifestFileType, uint64(fid), err 127 | } 128 | 129 | return UnknownFileType, 0, nil 130 | } 131 | 132 | // namespace is fixed size 133 | func MergedKey(ns, key []byte) []byte { 134 | mergedKey := make([]byte, len(ns)+len(key)) 135 | copy(mergedKey, ns) 136 | copy(mergedKey[len(ns):], key) 137 | 138 | return mergedKey 139 | } 140 | 141 | func GenSha1NS(ns string) []byte { 142 | hash := sha1.Sum([]byte(ns)) 143 | return hash[:] 144 | } 145 | 146 | func GenSha1Etag(data []byte) []byte { 147 | hash := sha1.Sum(data) 148 | return hash[:] 149 | } 150 | 151 | func Gen1KBytes() []byte { 152 | buf := make([]byte, 1024) 153 | for i := 0; i < 128; i++ { 154 | copy(buf[i*8:], []byte("01234567")) 155 | } 156 | return buf 157 | } 158 | 159 | func GenNKBytes(n int) []byte { 160 | bytes1KB := Gen1KBytes() 161 | buf := make([]byte, 1024*n) 162 | for i := 0; i < n; i++ { 163 | copy(buf[i*1024:], bytes1KB) 164 | } 165 | return buf 166 | } 167 | 168 | func Map[T any, R any](input []T, f func(T) R) []R { 169 | result := make([]R, len(input)) 170 | for i, v := range input { 171 | result[i] = f(v) 172 | } 173 | return result 174 | } 175 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | github.com/coreos/go-systemd/v22 v22.5.0/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc= 2 | github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= 3 | github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= 4 | github.com/davecgh/go-spew 
v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 5 | github.com/godbus/dbus/v5 v5.0.4/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA= 6 | github.com/gofrs/flock v0.12.1 h1:MTLVXXHf8ekldpJk3AKicLij9MdwOWkZ+a/jHHZby9E= 7 | github.com/gofrs/flock v0.12.1/go.mod h1:9zxTsyu5xtJ9DK+1tFZyibEV7y3uwDxPPfbxeeHCoD0= 8 | github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= 9 | github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= 10 | github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= 11 | github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= 12 | github.com/mattn/go-colorable v0.1.13 h1:fFA4WZxdEF4tXPZVKMLwD8oUnCTTo08duU7wxecdEvA= 13 | github.com/mattn/go-colorable v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovkB8vQcUbaXHg= 14 | github.com/mattn/go-isatty v0.0.16/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM= 15 | github.com/mattn/go-isatty v0.0.19 h1:JITubQf0MOLdlGRuRq+jtsDlekdYPia9ZFsB8h/APPA= 16 | github.com/mattn/go-isatty v0.0.19/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y= 17 | github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= 18 | github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= 19 | github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= 20 | github.com/rogpeppe/go-internal v1.14.1 h1:UQB4HGPB6osV0SQTLymcB4TgvyWu6ZyliaW0tI/otEQ= 21 | github.com/rogpeppe/go-internal v1.14.1/go.mod h1:MaRKkUm5W0goXpeCfT7UZI6fk/L7L7so1lCWt35ZSgc= 22 | github.com/royalcat/iouring-go v0.0.0-20240925200811-286062ac1b23 h1:3yOlLKYd6iSGkRUOCPuBQibjjvZyrGB/4sm0fh3nNuQ= 23 | github.com/royalcat/iouring-go v0.0.0-20240925200811-286062ac1b23/go.mod h1:LEzdaZarZ5aqROlLIwJ4P7h3+4o71008fSy6wpaEB+s= 24 | github.com/rs/xid v1.6.0/go.mod h1:7XoLgs4eV+QndskICGsho+ADou8ySMSjJKDIan90Nz0= 25 | github.com/rs/zerolog v1.34.0 h1:k43nTLIwcTVQAncfCw4KZ2VY6ukYoZaBPNOE8txlOeY= 26 | github.com/rs/zerolog v1.34.0/go.mod h1:bJsvje4Z08ROH4Nhs5iH600c3IkWhwp44iRc54W6wYQ= 27 | github.com/spaolacci/murmur3 v1.1.0 h1:7c1g84S4BPRrfL5Xrdp6fOJ206sU9y293DDHaoy0bLI= 28 | github.com/spaolacci/murmur3 v1.1.0/go.mod h1:JwIasOWyU6f++ZhiEuf87xNszmSA2myDM2Kzu9HwQUA= 29 | github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg= 30 | github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= 31 | github.com/vmihailenco/msgpack/v5 v5.4.1 h1:cQriyiUvjTwOHg8QZaPihLWeRAAVoCpE00IUPn0Bjt8= 32 | github.com/vmihailenco/msgpack/v5 v5.4.1/go.mod h1:GaZTsDaehaPpQVyxrf5mtQlH+pc21PIudVV/E3rRQok= 33 | github.com/vmihailenco/tagparser/v2 v2.0.0 h1:y09buUbR+b5aycVFQs/g70pqKVZNBmxwAhO7/IwNM9g= 34 | github.com/vmihailenco/tagparser/v2 v2.0.0/go.mod h1:Wri+At7QHww0WTrCBeu4J6bNtoV6mEfg5OIWRZA9qds= 35 | golang.org/x/sys v0.0.0-20200923182605-d9f96fdee20d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 36 | golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 37 | golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 38 | golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 39 | golang.org/x/sys v0.32.0 h1:s77OFDvIQeibCmezSnk/q6iAfkdiQaJi4VzroCFrN20= 40 | golang.org/x/sys v0.32.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= 41 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= 42 | gopkg.in/check.v1 
v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= 43 | gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= 44 | gopkg.in/natefinch/lumberjack.v2 v2.2.1 h1:bBRl1b0OH9s/DuPhuXpNl+VtCaJXFZ5/uEFST95x9zc= 45 | gopkg.in/natefinch/lumberjack.v2 v2.2.1/go.mod h1:YD8tP3GAjkrDg1eZH7EGmyESg/lsYskCTPBJVb9jqSc= 46 | gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= 47 | gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= 48 | -------------------------------------------------------------------------------- /bench/bench_disk_usage_test.go: -------------------------------------------------------------------------------- 1 | package bench 2 | 3 | import ( 4 | "bytes" 5 | "fmt" 6 | "os" 7 | "os/exec" 8 | "strconv" 9 | "strings" 10 | "sync/atomic" 11 | "testing" 12 | "time" 13 | 14 | "github.com/stretchr/testify/assert" 15 | "github.com/wenzhang-dev/bitcaskDB" 16 | ) 17 | 18 | func newDiskUsageDB(b *testing.B) { 19 | dir = "./bitcaskDB" 20 | _ = os.RemoveAll(dir) 21 | _ = os.MkdirAll(dir, os.ModePerm) 22 | 23 | opts := &bitcask.Options{ 24 | Dir: dir, 25 | WalMaxSize: 1024 * 1024 * 1024, // 1GB 26 | ManifestMaxSize: 10 * 1024 * 1024, // 10MB 27 | IndexCapacity: 10000000, // 10 million 28 | IndexLimited: 8000000, 29 | IndexEvictionPoolCapacity: 64, 30 | IndexSampleKeys: 5, 31 | CompactionPicker: nil, // default picker 32 | CompactionFilter: nil, // default filter 33 | NsSize: bitcask.DefaultNsSize, 34 | EtagSize: bitcask.DefaultEtagSize, 35 | DisableCompaction: false, 36 | DiskUsageLimited: 10 * 1024 * 1024 * 1024, // 10GB 37 | } 38 | 39 | var err error 40 | db, err = bitcask.NewDB(opts) 41 | assert.Nil(b, err) 42 | } 43 | 44 | func BenchmarkDiskUsage(b *testing.B) { 45 | b.Run("batchPut4K", benchmarkDiskUsageBatchPut) 46 | 47 | b.Run("concurrentBatchPut4K", benchmarkDiskUsageConcurrentBatchPut) 48 | } 49 | 50 | func getActualDiskUsage(path string) int64 { 51 | cmd := exec.Command("du", "-sb", path) 52 | 53 | var out bytes.Buffer 54 | cmd.Stdout = &out 55 | if err := cmd.Run(); err != nil { 56 | return 0 57 | } 58 | 59 | parts := strings.Fields(out.String()) 60 | if len(parts) < 1 { 61 | return 0 62 | } 63 | 64 | size, err := strconv.ParseInt(parts[0], 10, 64) 65 | if err != nil { 66 | return 0 67 | } 68 | 69 | return size 70 | } 71 | 72 | // print disk usgae per three seconds 73 | var ( 74 | totalBytesWritten int64 75 | stopCh chan struct{} 76 | ) 77 | 78 | func printDiskUsageStat() { 79 | fmt.Printf("\n") 80 | 81 | ticker := time.NewTicker(3 * time.Second) 82 | defer ticker.Stop() 83 | 84 | var lastTotal int64 85 | for { 86 | select { 87 | case <-ticker.C: 88 | current := atomic.LoadInt64(&totalBytesWritten) 89 | speed := current - lastTotal 90 | lastTotal = current 91 | fmt.Printf( 92 | "Write Speed %.2f MB/s | Write Total: %.2f GB | Disk Usage: %.2f GB\n", 93 | float64(speed)/1024/1024/3, 94 | float64(current)/1024/1024/1024, 95 | float64(getActualDiskUsage(dir))/1024/1024/1024, 96 | ) 97 | case <-stopCh: 98 | return 99 | } 100 | } 101 | } 102 | 103 | func benchmarkDiskUsageBatchPut(b *testing.B) { 104 | newDiskUsageDB(b) 105 | defer db.Close() 106 | 107 | totalBytesWritten = 0 108 | stopCh = make(chan struct{}) 109 | defer close(stopCh) 110 | 111 | go printDiskUsageStat() 112 | 113 | meta := bitcask.NewMeta(nil) 114 | opts := &bitcask.WriteOptions{} 115 | 116 | b.ResetTimer() 117 | b.ReportAllocs() 118 | 119 | batch := bitcask.NewBatch() 120 | for i := 0; i < b.N; i++ 
{ 121 | batch.Put(ns[:], genTestKey(i), bin4KB, meta) 122 | 123 | if i%BatchSize == 0 { 124 | err := db.Write(batch, opts) 125 | assert.Nil(b, err) 126 | atomic.AddInt64(&totalBytesWritten, int64(batch.ByteSize())) 127 | 128 | batch.Clear() 129 | } 130 | } 131 | } 132 | 133 | func benchmarkDiskUsageConcurrentBatchPut(b *testing.B) { 134 | newDiskUsageDB(b) 135 | defer db.Close() 136 | 137 | totalBytesWritten = 0 138 | stopCh = make(chan struct{}) 139 | defer close(stopCh) 140 | 141 | go printDiskUsageStat() 142 | 143 | meta := bitcask.NewMeta(nil) 144 | opts := &bitcask.WriteOptions{} 145 | 146 | b.ResetTimer() 147 | b.ReportAllocs() 148 | 149 | b.RunParallel(func(pb *testing.PB) { 150 | var err error 151 | iteration := 0 152 | batch := bitcask.NewBatch() 153 | for pb.Next() { 154 | batch.Put(ns[:], genTestKey(iteration), bin4KB, meta) 155 | 156 | if iteration%BatchSize == 0 { 157 | err = db.Write(batch, opts) 158 | assert.Nil(b, err) 159 | atomic.AddInt64(&totalBytesWritten, int64(batch.ByteSize())) 160 | 161 | batch.Clear() 162 | } 163 | 164 | iteration++ 165 | } 166 | }) 167 | } 168 | -------------------------------------------------------------------------------- /manifest_edit.go: -------------------------------------------------------------------------------- 1 | package bitcask 2 | 3 | import ( 4 | "bytes" 5 | "encoding/binary" 6 | "errors" 7 | ) 8 | 9 | const ( 10 | manifestEditDeleteFileTag = 1 11 | manifestEditAddFileTag = 2 12 | manifestEditNextFidTag = 3 13 | manifestEditFreeBytesTag = 4 14 | ) 15 | 16 | var ( 17 | ErrCorruptedManifest = errors.New("corrupted manifest file") 18 | ErrUnknownManifestEditTag = errors.New("unknown manifest tag") 19 | ) 20 | 21 | type LogFile struct { 22 | wal *Wal 23 | 24 | fid uint64 25 | } 26 | 27 | // the manifest edit will be persist one by one, and append to 28 | // MANIFEST file 29 | type ManifestEdit struct { 30 | // the delete list of wal file 31 | deleteFiles []LogFile 32 | 33 | // the add list of wal file 34 | addFiles []LogFile 35 | 36 | // free bytes for each wal file 37 | freeBytes map[uint64]uint64 38 | 39 | // the available file number 40 | nextFid uint64 41 | hasNextFid bool 42 | } 43 | 44 | func NewManifestEdit() *ManifestEdit { 45 | return &ManifestEdit{ 46 | hasNextFid: false, 47 | freeBytes: make(map[uint64]uint64), 48 | } 49 | } 50 | 51 | func (edit *ManifestEdit) Merge(other *ManifestEdit) { 52 | if len(other.addFiles) > 0 { 53 | edit.addFiles = append(edit.addFiles, other.addFiles...) 54 | } 55 | 56 | if len(other.deleteFiles) > 0 { 57 | edit.deleteFiles = append(edit.deleteFiles, other.deleteFiles...) 
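// for illustration: a merged edit such as {nextFid: 8, addFiles: [6, 7], freeBytes: {2: 123}}
// is later serialized by Encode as the single-byte uvarint stream 03 08 02 06 02 07 04 02 7b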
58 | } 59 | 60 | if other.hasNextFid { 61 | edit.hasNextFid = true 62 | edit.nextFid = max(edit.nextFid, other.nextFid) 63 | } 64 | 65 | if len(other.freeBytes) > 0 { 66 | for fid, freeBytes := range other.freeBytes { 67 | edit.freeBytes[fid] += freeBytes 68 | } 69 | } 70 | } 71 | 72 | func (edit *ManifestEdit) Encode() []byte { 73 | var buf bytes.Buffer 74 | encodeVarint := func(v uint64) { 75 | var tmp [binary.MaxVarintLen64]byte 76 | n := binary.PutUvarint(tmp[:], v) 77 | buf.Write(tmp[:n]) 78 | } 79 | 80 | if edit.hasNextFid { 81 | encodeVarint(manifestEditNextFidTag) 82 | encodeVarint(edit.nextFid) 83 | } 84 | 85 | for _, file := range edit.addFiles { 86 | encodeVarint(manifestEditAddFileTag) 87 | encodeVarint(file.fid) 88 | } 89 | 90 | for _, file := range edit.deleteFiles { 91 | encodeVarint(manifestEditDeleteFileTag) 92 | encodeVarint(file.fid) 93 | } 94 | 95 | for fid, freeBytes := range edit.freeBytes { 96 | encodeVarint(manifestEditFreeBytesTag) 97 | encodeVarint(fid) 98 | encodeVarint(freeBytes) 99 | } 100 | 101 | return buf.Bytes() 102 | } 103 | 104 | func (edit *ManifestEdit) Clear() { 105 | edit.nextFid = 0 106 | edit.hasNextFid = false 107 | 108 | edit.addFiles = nil 109 | edit.deleteFiles = nil 110 | edit.freeBytes = make(map[uint64]uint64) 111 | } 112 | 113 | func (edit *ManifestEdit) DecodeFrom(data []byte) error { 114 | offset := 0 115 | edit.Clear() 116 | for offset < len(data) { 117 | tag, nbytes := binary.Uvarint(data[offset:]) 118 | if nbytes <= 0 { 119 | return ErrCorruptedManifest 120 | } 121 | offset += nbytes 122 | 123 | switch tag { 124 | case manifestEditDeleteFileTag: 125 | fid, nbytes := binary.Uvarint(data[offset:]) 126 | if nbytes <= 0 { 127 | return ErrCorruptedManifest 128 | } 129 | offset += nbytes 130 | edit.deleteFiles = append(edit.deleteFiles, LogFile{fid: fid}) 131 | case manifestEditAddFileTag: 132 | fid, nbytes := binary.Uvarint(data[offset:]) 133 | if nbytes <= 0 { 134 | return ErrCorruptedManifest 135 | } 136 | offset += nbytes 137 | edit.addFiles = append(edit.addFiles, LogFile{fid: fid}) 138 | case manifestEditNextFidTag: 139 | fid, nbytes := binary.Uvarint(data[offset:]) 140 | if nbytes <= 0 { 141 | return ErrCorruptedManifest 142 | } 143 | offset += nbytes 144 | edit.hasNextFid = true 145 | edit.nextFid = max(edit.nextFid, fid) 146 | case manifestEditFreeBytesTag: 147 | fid, nbytes := binary.Uvarint(data[offset:]) 148 | if nbytes <= 0 { 149 | return ErrCorruptedManifest 150 | } 151 | offset += nbytes 152 | 153 | freeBytes, nbytes := binary.Uvarint(data[offset:]) 154 | if nbytes <= 0 { 155 | return ErrCorruptedManifest 156 | } 157 | offset += nbytes 158 | edit.freeBytes[fid] += freeBytes 159 | default: 160 | return ErrUnknownManifestEditTag 161 | } 162 | } 163 | 164 | return nil 165 | } 166 | -------------------------------------------------------------------------------- /record_test.go: -------------------------------------------------------------------------------- 1 | package bitcask 2 | 3 | import ( 4 | "testing" 5 | "time" 6 | 7 | "github.com/stretchr/testify/assert" 8 | ) 9 | 10 | func TestRecord_EmptyNs(t *testing.T) { 11 | // mock global options 12 | oldOpts := gOpts 13 | gOpts = &Options{ 14 | NsSize: 0, 15 | EtagSize: 0, 16 | } 17 | defer func() { 18 | gOpts = oldOpts 19 | }() 20 | 21 | // testcase 22 | record := &Record{ 23 | Ns: nil, 24 | Key: []byte("test-key"), 25 | Value: []byte("test-value"), 26 | Meta: NewMeta(nil), 27 | } 28 | 29 | backStore := make([]byte, DefaultRecordBufferSize) 30 | baseTime := 
uint64(time.Now().Unix()) 31 | encoded, err := record.Encode(backStore, baseTime) 32 | assert.Nil(t, err) 33 | 34 | decoded, err := RecordFromBytes(encoded, baseTime) 35 | assert.Nil(t, err) 36 | 37 | // check 38 | assert.Equal(t, len(decoded.Ns), 0) 39 | assert.Equal(t, decoded.Key, record.Key) 40 | assert.Equal(t, decoded.Value, record.Value) 41 | } 42 | 43 | func TestRecord_EncodingDecoding(t *testing.T) { 44 | // mock global options 45 | oldOpts := gOpts 46 | gOpts = &Options{ 47 | NsSize: DefaultNsSize, 48 | EtagSize: DefaultEtagSize, 49 | } 50 | defer func() { 51 | gOpts = oldOpts 52 | }() 53 | 54 | // testcase 55 | ns := sha1Bytes("test-ns") 56 | etag := sha1Bytes("etag") 57 | baseTime := uint64(time.Now().Unix()) 58 | 59 | tests := []struct { 60 | name string 61 | record *Record 62 | wantErr bool 63 | }{ 64 | { 65 | name: "Normal case with full metadata and value", 66 | record: &Record{ 67 | Ns: ns[:], 68 | Key: []byte("test-key"), 69 | Meta: NewMeta(map[string]string{"foo": "bar"}).SetExpire(baseTime + 60).SetEtag(etag[:]).SetTombstone(true), 70 | Value: []byte("hello world"), 71 | }, 72 | wantErr: false, 73 | }, 74 | { 75 | name: "Nil APP Meta", 76 | record: &Record{ 77 | Ns: ns[:], 78 | Key: []byte("test-key"), 79 | Meta: NewMeta(nil).SetExpire(baseTime + 61).SetEtag(etag[:]).SetTombstone(false), 80 | Value: []byte("hello world"), 81 | }, 82 | wantErr: false, 83 | }, 84 | { 85 | name: "Nil Value", 86 | record: &Record{ 87 | Ns: ns[:], 88 | Key: []byte("test-key"), 89 | Meta: NewMeta(map[string]string{"foo": "bar"}).SetExpire(baseTime + 62).SetEtag(etag[:]).SetTombstone(true), 90 | Value: []byte{}, 91 | }, 92 | wantErr: false, 93 | }, 94 | { 95 | name: "Nil APP Meta, Nil Value, Nil Etag and No Expire", 96 | record: &Record{ 97 | Ns: ns[:], 98 | Key: []byte("test-key"), 99 | Meta: NewMeta(nil), 100 | Value: []byte{}, 101 | }, 102 | wantErr: false, 103 | }, 104 | } 105 | 106 | backStore := make([]byte, DefaultRecordBufferSize) 107 | for _, tt := range tests { 108 | t.Run(tt.name, func(t *testing.T) { 109 | encoded, err := tt.record.Encode(backStore, baseTime) 110 | if (err != nil) != tt.wantErr { 111 | t.Errorf("Encode() error = %v, wantErr %v", err, tt.wantErr) 112 | return 113 | } 114 | 115 | if tt.wantErr { 116 | return // No need to proceed if encoding is expected to fail 117 | } 118 | 119 | decodedRecord, err := RecordFromBytes(encoded, baseTime) 120 | assert.NoError(t, err, "RecordFromBytes should not fail") 121 | 122 | // Ensure Namespace is correctly restored 123 | assert.Equal(t, tt.record.Ns, decodedRecord.Ns, "Namespace mismatch") 124 | 125 | // Ensure Key is correctly restored 126 | assert.Equal(t, tt.record.Key, decodedRecord.Key, "Key mismatch") 127 | 128 | // Ensure Value is correctly restored 129 | assert.Equal(t, tt.record.Value, decodedRecord.Value, "Value mismatch") 130 | 131 | // Check Meta field 132 | assert.NotNil(t, decodedRecord.Meta, "Meta should not be nil") 133 | assert.Equal(t, tt.record.Meta.Flags, decodedRecord.Meta.Flags, "Flags mismatch") 134 | assert.Equal(t, tt.record.Meta.Expire, decodedRecord.Meta.Expire, "Expire mismatch") 135 | 136 | if len(tt.record.Meta.Etag) != 0 { 137 | assert.Equal(t, tt.record.Meta.Etag, decodedRecord.Meta.Etag, "Etag mismatch") 138 | } 139 | 140 | if tt.record.Meta.AppMeta == nil { 141 | assert.Nil(t, decodedRecord.Meta.AppMeta, "AppMeta should be nil") 142 | } else { 143 | assert.Equal(t, tt.record.Meta.AppMeta, decodedRecord.Meta.AppMeta, "AppMeta mismatch") 144 | } 145 | }) 146 | } 147 | } 148 | 
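As a companion to the record round-trip tests above, the ManifestEdit encoding defined in manifest_edit.go can be exercised the same way. The snippet below is only an illustrative sketch — the helper name is invented, and it has to live inside the bitcask package because the struct fields are unexported. It also shows that only fids, nextFid and freeBytes are persisted; wal pointers never survive a round trip.

package bitcask

import "fmt"

// manifestEditRoundTrip is a hypothetical helper: build an edit, encode it,
// decode it into a fresh edit, and print the fields that were persisted.
func manifestEditRoundTrip() error {
	edit := NewManifestEdit()
	edit.addFiles = append(edit.addFiles, LogFile{fid: 7})
	edit.deleteFiles = append(edit.deleteFiles, LogFile{fid: 3})
	edit.freeBytes[2] = 123
	edit.hasNextFid = true
	edit.nextFid = 8

	decoded := NewManifestEdit()
	if err := decoded.DecodeFrom(edit.Encode()); err != nil {
		return err
	}

	// wal pointers are not encoded, so decoded.addFiles[0].wal stays nil
	fmt.Printf("add=%d del=%d nextFid=%d free[2]=%d\n",
		decoded.addFiles[0].fid, decoded.deleteFiles[0].fid,
		decoded.nextFid, decoded.freeBytes[2])
	return nil
}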
-------------------------------------------------------------------------------- /manifest_txn_test.go: -------------------------------------------------------------------------------- 1 | package bitcask 2 | 3 | import ( 4 | "errors" 5 | "os" 6 | "testing" 7 | 8 | "github.com/stretchr/testify/assert" 9 | ) 10 | 11 | func setupManifestTxn(t *testing.T) (*ManifestTxn, func()) { 12 | dir := "./test_bitcask_db" 13 | _ = os.RemoveAll(dir) 14 | 15 | assert.Nil(t, os.MkdirAll(dir, os.ModePerm)) 16 | 17 | manifest, err := NewManifest(dir) 18 | assert.Nil(t, err) 19 | 20 | txn, err := manifest.NewTxn() 21 | assert.Nil(t, err) 22 | return txn, func() { 23 | os.RemoveAll(dir) 24 | } 25 | } 26 | 27 | func TestManifestTxn_Commit(t *testing.T) { 28 | txn, closer := setupManifestTxn(t) 29 | defer closer() 30 | 31 | manifest := txn.manifest 32 | dir := manifest.dir 33 | 34 | wal3, _ := NewWal(WalPath(dir, 3), 3, -1) 35 | wal4, _ := NewWal(WalPath(dir, 4), 4, -1) 36 | wal5, _ := NewWal(WalPath(dir, 5), 5, -1) 37 | wal6, _ := NewWal(WalPath(dir, 6), 6, -1) 38 | wal7, _ := NewWal(WalPath(dir, 7), 7, -1) 39 | 40 | edit1 := &ManifestEdit{ 41 | addFiles: []LogFile{ 42 | {fid: 3, wal: wal3}, 43 | {fid: 4, wal: wal4}, 44 | {fid: 5, wal: wal5}, 45 | }, 46 | deleteFiles: nil, 47 | hasNextFid: true, 48 | nextFid: 6, 49 | freeBytes: make(map[uint64]uint64), 50 | } 51 | edit1.freeBytes[2] = 123 52 | 53 | // apply 54 | txn.Apply(edit1) 55 | 56 | // check 57 | assert.False(t, txn.aborted) 58 | assert.False(t, txn.committed) 59 | assert.False(t, txn.IsDone()) 60 | assert.Equal(t, wal3, txn.ToWal(3)) 61 | assert.Equal(t, wal4, txn.ToWal(4)) 62 | assert.Equal(t, wal5, txn.ToWal(5)) 63 | assert.Equal(t, txn.NextFid(), uint64(6)) 64 | 65 | assert.Equal(t, wal3, manifest.ToWal(3)) 66 | assert.Equal(t, wal4, manifest.ToWal(4)) 67 | assert.Equal(t, wal5, manifest.ToWal(5)) 68 | assert.Equal(t, manifest.NextFid(), uint64(6)) 69 | 70 | // commit 71 | edit2 := &ManifestEdit{ 72 | addFiles: []LogFile{ 73 | {fid: 6, wal: wal6}, 74 | {fid: 7, wal: wal7}, 75 | }, 76 | deleteFiles: []LogFile{ 77 | {fid: 3, wal: wal3}, 78 | {fid: 4, wal: wal4}, 79 | {fid: 5, wal: wal5}, 80 | }, 81 | hasNextFid: true, 82 | nextFid: 8, 83 | freeBytes: make(map[uint64]uint64), 84 | } 85 | 86 | edit2.freeBytes[2] = 123 87 | 88 | assert.Nil(t, txn.Commit(edit2)) 89 | 90 | // check 91 | assert.True(t, txn.committed) 92 | assert.False(t, txn.aborted) 93 | assert.True(t, txn.IsDone()) 94 | 95 | assert.Nil(t, manifest.ToWal(3)) 96 | assert.Nil(t, manifest.ToWal(4)) 97 | assert.Nil(t, manifest.ToWal(5)) 98 | 99 | assert.Equal(t, wal6, manifest.ToWal(6)) 100 | assert.Equal(t, wal7, manifest.ToWal(7)) 101 | assert.Equal(t, manifest.NextFid(), uint64(8)) 102 | 103 | // commit again 104 | err := txn.Commit(nil) 105 | assert.True(t, errors.Is(err, ErrCommittedManfiestTxn)) 106 | } 107 | 108 | func TestManifestTxn_Abort(t *testing.T) { 109 | txn, closer := setupManifestTxn(t) 110 | defer closer() 111 | 112 | manifest := txn.manifest 113 | dir := manifest.dir 114 | 115 | wal3, _ := NewWal(WalPath(dir, 3), 3, -1) 116 | wal4, _ := NewWal(WalPath(dir, 4), 4, -1) 117 | wal5, _ := NewWal(WalPath(dir, 5), 5, -1) 118 | 119 | edit1 := &ManifestEdit{ 120 | addFiles: []LogFile{ 121 | {fid: 3, wal: wal3}, 122 | {fid: 4, wal: wal4}, 123 | {fid: 5, wal: wal5}, 124 | }, 125 | deleteFiles: nil, 126 | hasNextFid: true, 127 | nextFid: 6, 128 | freeBytes: make(map[uint64]uint64), 129 | } 130 | edit1.freeBytes[2] = 123 131 | 132 | // apply 133 | txn.Apply(edit1) 134 | 135 | // check 136 
| assert.False(t, txn.aborted) 137 | assert.False(t, txn.committed) 138 | assert.False(t, txn.IsDone()) 139 | assert.Equal(t, wal3, txn.ToWal(3)) 140 | assert.Equal(t, wal4, txn.ToWal(4)) 141 | assert.Equal(t, wal5, txn.ToWal(5)) 142 | assert.Equal(t, txn.NextFid(), uint64(6)) 143 | 144 | assert.Equal(t, wal3, manifest.ToWal(3)) 145 | assert.Equal(t, wal4, manifest.ToWal(4)) 146 | assert.Equal(t, wal5, manifest.ToWal(5)) 147 | assert.Equal(t, manifest.NextFid(), uint64(6)) 148 | 149 | // abort 150 | txn.Abort() 151 | 152 | // check 153 | assert.True(t, txn.aborted) 154 | assert.False(t, txn.committed) 155 | assert.True(t, txn.IsDone()) 156 | 157 | assert.Nil(t, manifest.ToWal(3)) 158 | assert.Nil(t, manifest.ToWal(4)) 159 | assert.Nil(t, manifest.ToWal(5)) 160 | assert.Equal(t, manifest.NextFid(), uint64(3)) 161 | 162 | // abort again 163 | txn.Abort() 164 | } 165 | -------------------------------------------------------------------------------- /hint.go: -------------------------------------------------------------------------------- 1 | package bitcask 2 | 3 | import ( 4 | "bytes" 5 | "encoding/binary" 6 | "errors" 7 | ) 8 | 9 | type HintRecord struct { 10 | ns []byte 11 | key []byte 12 | 13 | fid uint64 14 | off uint64 15 | size uint64 16 | } 17 | 18 | const ( 19 | hintWalRewriterThrehold = 1024 * 1024 // 1MB 20 | ) 21 | 22 | var ErrCorruptedHintRecord = errors.New("corrupted hint record") 23 | 24 | // format: 25 | // | ns | key-size | key | fid | off | size | 26 | // 27 | // ns: fixed-size string 28 | // key-size: varint64 29 | // fid: varint64 30 | // off: varint64 31 | // size: varint64 32 | func (r *HintRecord) Encode() ([]byte, error) { 33 | var buf bytes.Buffer 34 | encodeVarint := func(v uint64) { 35 | var tmp [binary.MaxVarintLen64]byte 36 | n := binary.PutUvarint(tmp[:], v) 37 | buf.Write(tmp[:n]) 38 | } 39 | 40 | buf.Write(r.ns) 41 | encodeVarint(uint64(len(r.key))) 42 | buf.Write(r.key) 43 | encodeVarint(r.fid) 44 | encodeVarint(r.off) 45 | encodeVarint(r.size) 46 | 47 | return buf.Bytes(), nil 48 | } 49 | 50 | func (r *HintRecord) Decode(data []byte) error { 51 | nsSize := int(GetOptions().NsSize) 52 | minHintRecordSize := nsSize + 1 + 1 + 1*3 53 | if len(data) < minHintRecordSize { 54 | return ErrCorruptedHintRecord 55 | } 56 | 57 | offset := 0 58 | 59 | r.ns = data[:nsSize] 60 | offset += nsSize 61 | 62 | keyLen, nbytes := DecodeUvarint(data[offset:]) 63 | offset += nbytes 64 | 65 | keyOffset := offset 66 | offset += int(keyLen) 67 | 68 | r.fid, nbytes = DecodeUvarint(data[offset:]) 69 | offset += nbytes 70 | 71 | r.off, nbytes = DecodeUvarint(data[offset:]) 72 | offset += nbytes 73 | 74 | r.size, nbytes = DecodeUvarint(data[offset:]) 75 | offset += nbytes 76 | 77 | if offset != len(data) { 78 | return ErrCorruptedHintRecord 79 | } 80 | 81 | r.key = data[keyOffset : keyOffset+int(keyLen)] 82 | 83 | return nil 84 | } 85 | 86 | type HintWriter struct { 87 | rewriter *WalRewriter 88 | } 89 | 90 | func NewHintWriter(path string, fid uint64, baseTime int64) (*HintWriter, error) { 91 | hint, err := NewWal(path, fid, baseTime) 92 | if err != nil { 93 | return nil, err 94 | } 95 | 96 | return &HintWriter{ 97 | rewriter: NewWalRewriter(hint, hintWalRewriterThrehold), 98 | }, nil 99 | } 100 | 101 | func (w *HintWriter) AppendRecord(record *HintRecord) error { 102 | recordBytes, err := record.Encode() 103 | if err != nil { 104 | return err 105 | } 106 | 107 | _, err = w.rewriter.AppendRecord(recordBytes) 108 | return err 109 | } 110 | 111 | func (w *HintWriter) Wal() *Wal { 112 | 
return w.rewriter.wal 113 | } 114 | 115 | func (w *HintWriter) Close() error { 116 | return w.rewriter.Close() 117 | } 118 | 119 | func (w *HintWriter) Flush() error { 120 | return w.rewriter.Flush() 121 | } 122 | 123 | func NewHintByWal(wal *Wal) (uint64, error) { 124 | // hint wal use the same fid and base time 125 | hintPath := TmpPath(wal.Dir(), wal.fid) 126 | writer, err := NewHintWriter(hintPath, wal.fid, int64(wal.BaseTime())) 127 | if err != nil { 128 | return 0, err 129 | } 130 | 131 | defer writer.Close() 132 | 133 | if err = IterateRecord(wal, func(record *Record, foff, size uint64) error { 134 | // the foff points to the start offset of data in the wal 135 | // however, the offset used by ReadRecord of wal expects the start offset of data header 136 | foff -= RecordHeaderSize 137 | 138 | hintRecord := &HintRecord{ 139 | ns: record.Ns, 140 | key: record.Key, 141 | fid: wal.fid, 142 | off: foff, 143 | size: size, 144 | } 145 | 146 | return writer.AppendRecord(hintRecord) 147 | }); err != nil { 148 | return 0, err 149 | } 150 | 151 | if err = writer.Flush(); err != nil { 152 | return 0, err 153 | } 154 | 155 | // rename hint file 156 | if err = writer.Wal().Rename(HintFilename(wal.fid)); err != nil { 157 | return 0, err 158 | } 159 | 160 | return writer.Wal().Size(), nil 161 | } 162 | 163 | func IterateHint(hint *Wal, cb func(record *HintRecord) error) error { 164 | it := NewWalIterator(hint) 165 | defer it.Close() 166 | 167 | var err error 168 | var recordBytes []byte 169 | record := &HintRecord{} 170 | for { 171 | if _, recordBytes, err = it.Next(); err != nil { 172 | if errors.Is(err, ErrWalIteratorEOF) { 173 | break 174 | } 175 | return err 176 | } 177 | 178 | if err = record.Decode(recordBytes); err != nil { 179 | return err 180 | } 181 | 182 | if err = cb(record); err != nil { 183 | return err 184 | } 185 | } 186 | 187 | return nil 188 | } 189 | -------------------------------------------------------------------------------- /deque.go: -------------------------------------------------------------------------------- 1 | package bitcask 2 | 3 | import "errors" 4 | 5 | const ( 6 | DequeChunkSize = 512 7 | DequeInitSize = 64 8 | DequeFrontReserveSize = 3 9 | ) 10 | 11 | var ( 12 | ErrDequeEmpty = errors.New("deque empty") 13 | ErrDequeOutOfRange = errors.New("deque out of range") 14 | ) 15 | 16 | type dequeChunk[T any] struct { 17 | buf [DequeChunkSize]T 18 | 19 | // the [start, end) is actual data range 20 | start int32 21 | end int32 22 | } 23 | 24 | func (c *dequeChunk[T]) size() int { 25 | return int(c.end - c.start) 26 | } 27 | 28 | func (c *dequeChunk[T]) rightFull() bool { 29 | return c.end == DequeChunkSize 30 | } 31 | 32 | func (c *dequeChunk[T]) leftFull() bool { 33 | return c.start == 0 34 | } 35 | 36 | func (c *dequeChunk[T]) empty() bool { 37 | return c.start == c.end 38 | } 39 | 40 | type Deque[T any] struct { 41 | // buffer 42 | chunks []*dequeChunk[T] 43 | 44 | // the [start, end) is actual data range 45 | start int64 46 | end int64 // end-1 index always points valid chunk 47 | 48 | // the total size of elements 49 | size int64 50 | } 51 | 52 | func NewDeque[T any]() *Deque[T] { 53 | // the init start end end range: [2, 4) 54 | d := &Deque[T]{ 55 | chunks: make([]*dequeChunk[T], DequeInitSize), 56 | start: DequeFrontReserveSize - 1, 57 | end: DequeFrontReserveSize + 1, 58 | size: 0, 59 | } 60 | 61 | // the start index(2) is writable 62 | d.chunks[d.start] = &dequeChunk[T]{ 63 | start: DequeChunkSize, 64 | end: DequeChunkSize, 65 | } 66 | 67 | // the end-1 
index(3) is writable 68 | d.chunks[d.start+1] = &dequeChunk[T]{ 69 | start: 0, 70 | end: 0, 71 | } 72 | 73 | return d 74 | } 75 | 76 | func (d *Deque[T]) Back() (*T, error) { 77 | if d.Empty() { 78 | return nil, ErrDequeEmpty 79 | } 80 | 81 | return d.At(d.Len() - 1) 82 | } 83 | 84 | func (d *Deque[T]) Front() (*T, error) { 85 | if d.Empty() { 86 | return nil, ErrDequeEmpty 87 | } 88 | 89 | return d.At(0) 90 | } 91 | 92 | func (d *Deque[T]) grow() { 93 | if int(d.end) == len(d.chunks) || d.start == 0 { 94 | // copy pointers 95 | chunks := make([]*dequeChunk[T], 2*d.end) 96 | for i := d.start; i < d.end; i++ { 97 | chunks[DequeFrontReserveSize+(i-d.start)] = d.chunks[i] 98 | } 99 | 100 | // order matters 101 | d.end = DequeFrontReserveSize + (d.end - d.start) 102 | d.start = DequeFrontReserveSize 103 | d.chunks = chunks 104 | } 105 | } 106 | 107 | func (d *Deque[T]) PushBack(v T) { 108 | // the end-1 chunk always writable 109 | chunk := d.chunks[d.end-1] 110 | 111 | chunk.buf[chunk.end] = v 112 | chunk.end++ 113 | 114 | if chunk.rightFull() { 115 | d.grow() 116 | 117 | d.chunks[d.end] = &dequeChunk[T]{ 118 | start: 0, 119 | end: 0, 120 | } 121 | d.end++ 122 | } 123 | 124 | d.size++ 125 | } 126 | 127 | func (d *Deque[T]) PushFront(v T) { 128 | // the start chunk always writable 129 | chunk := d.chunks[d.start] 130 | 131 | chunk.buf[chunk.start-1] = v 132 | chunk.start-- 133 | 134 | if chunk.leftFull() { 135 | d.grow() 136 | 137 | d.chunks[d.start-1] = &dequeChunk[T]{ 138 | start: DequeChunkSize, 139 | end: DequeChunkSize, 140 | } 141 | d.start-- 142 | } 143 | 144 | d.size++ 145 | } 146 | 147 | func (d *Deque[T]) PopBack() error { 148 | if d.Empty() { 149 | return ErrDequeEmpty 150 | } 151 | 152 | chunk := d.chunks[d.end-1] 153 | if chunk.empty() { 154 | d.chunks[d.end-1] = nil 155 | d.end-- 156 | chunk = d.chunks[d.end-1] 157 | } 158 | 159 | // make sure the chunk always writable 160 | chunk.end-- 161 | 162 | d.size-- 163 | if d.size == 0 { 164 | d.Clear() 165 | } 166 | 167 | return nil 168 | } 169 | 170 | func (d *Deque[T]) PopFront() error { 171 | if d.Empty() { 172 | return ErrDequeEmpty 173 | } 174 | 175 | chunk := d.chunks[d.start] 176 | if chunk.empty() { 177 | d.chunks[d.start] = nil 178 | d.start++ 179 | chunk = d.chunks[d.start] 180 | } 181 | 182 | // make sure the chunk always writable 183 | chunk.start++ 184 | 185 | d.size-- 186 | if d.size == 0 { 187 | d.Clear() 188 | } 189 | 190 | return nil 191 | } 192 | 193 | func (d *Deque[T]) Len() int { 194 | return int(d.size) 195 | } 196 | 197 | func (d *Deque[T]) Empty() bool { 198 | return d.size == 0 199 | } 200 | 201 | func (d *Deque[T]) At(idx int) (*T, error) { 202 | if idx >= d.Len() { 203 | return nil, ErrDequeOutOfRange 204 | } 205 | 206 | if idx < d.chunks[d.start].size() { 207 | chunk := d.chunks[d.start] 208 | return &chunk.buf[int(chunk.start)+idx], nil 209 | } 210 | 211 | idx -= d.chunks[d.start].size() 212 | pos := int(d.start+1) + idx/DequeChunkSize 213 | off := idx % DequeChunkSize 214 | chunk := d.chunks[pos] 215 | return &chunk.buf[int(chunk.start)+off], nil 216 | } 217 | 218 | func (d *Deque[T]) Clear() { 219 | for i := d.start; i < d.end; i++ { 220 | d.chunks[i] = nil 221 | } 222 | 223 | d.start = DequeFrontReserveSize - 1 224 | d.end = DequeFrontReserveSize + 1 225 | 226 | // the start index(2) is writable 227 | d.chunks[d.start] = &dequeChunk[T]{ 228 | start: DequeChunkSize, 229 | end: DequeChunkSize, 230 | } 231 | 232 | // the end-1 index(3) is writable 233 | d.chunks[d.start+1] = &dequeChunk[T]{ 234 | start: 0, 
235 | end: 0, 236 | } 237 | } 238 | -------------------------------------------------------------------------------- /db.go: -------------------------------------------------------------------------------- 1 | package bitcask 2 | 3 | import ( 4 | "errors" 5 | "fmt" 6 | "path/filepath" 7 | "sort" 8 | ) 9 | 10 | const ( 11 | WalFileSuffix = ".wal" 12 | HintFileSuffix = ".hint" 13 | MergeFileSuffix = ".merge" 14 | TmpFileSuffix = ".tmp" 15 | LockFile = "LOCK" 16 | ManifestFilePrefix = "MANIFEST" 17 | CurrentFile = "CURRENT" 18 | ) 19 | 20 | const ( 21 | UnknownFileType = iota 22 | 23 | WalFileType 24 | HintFileType 25 | MergeFileType 26 | TmpFileType 27 | LockFileType 28 | ManifestFileType 29 | CurrentFileType 30 | ) 31 | 32 | var ( 33 | ErrKeyNotFound = errors.New("key not found") 34 | ErrIncompleteRead = errors.New("incomplete block read") 35 | ErrKeySoftDeleted = errors.New("key soft delete") 36 | ) 37 | 38 | type WriteOptions struct { 39 | Sync bool 40 | } 41 | 42 | type ReadOptions struct { 43 | VerifyChecksum bool 44 | } 45 | 46 | type PickerWalInfo struct { 47 | CreateTime uint64 48 | FreeBytes uint64 49 | WalSize uint64 50 | Fid uint64 51 | } 52 | 53 | type ( 54 | CompactionPicker func([]PickerWalInfo) []uint64 55 | CompactionFilter func(ns, key, val []byte, meta *Meta) bool 56 | ) 57 | 58 | type Options struct { 59 | Dir string 60 | 61 | LogFile string 62 | LogDir string 63 | LogLevel int8 64 | LogMaxSize uint64 65 | LogMaxBackups uint64 66 | 67 | WalMaxSize uint64 68 | ManifestMaxSize uint64 69 | 70 | IndexCapacity uint64 71 | IndexLimited uint64 72 | IndexEvictionPoolCapacity uint64 73 | IndexSampleKeys uint64 74 | 75 | BlockCacheCapacity uint64 76 | BlockCacheLimited uint64 77 | BlockCacheEvictionPoolCapacity uint64 78 | BlockCacheSampleKeys uint64 79 | 80 | BlockReaderConcurrent uint64 81 | 82 | CompactionPicker CompactionPicker 83 | CompactionFilter CompactionFilter 84 | 85 | DiskUsageLimited uint64 86 | 87 | NsSize uint64 88 | EtagSize uint64 89 | 90 | CompactionTriggerInterval uint64 91 | CheckDiskUsageInterval uint64 92 | 93 | CompactionPickerRatio float64 94 | 95 | DisableCompaction bool 96 | 97 | RecordBufferSize uint64 98 | } 99 | 100 | func (o *Options) Init() { 101 | if o.LogDir == "" { 102 | o.LogDir = o.Dir 103 | } 104 | 105 | if o.LogFile == "" { 106 | o.LogFile = DefaultLogFile 107 | } 108 | 109 | if o.LogMaxSize == 0 { 110 | o.LogMaxSize = DefaultLogMaxSize 111 | } 112 | 113 | if o.CompactionPicker == nil { 114 | o.CompactionPicker = DefaultCompactionPicker 115 | } 116 | 117 | if o.CompactionTriggerInterval <= 0 { 118 | o.CompactionTriggerInterval = DefaultCompactionTriggerInterval 119 | } 120 | 121 | if o.CheckDiskUsageInterval <= 0 { 122 | o.CheckDiskUsageInterval = DefaultCheckDiskUsageInterval 123 | } 124 | 125 | if o.CompactionPickerRatio <= 0 { 126 | o.CompactionPickerRatio = DefaultCompactionPickerRatio 127 | } 128 | 129 | if o.RecordBufferSize <= 0 { 130 | o.RecordBufferSize = DefaultRecordBufferSize 131 | } 132 | 133 | gOpts = o 134 | } 135 | 136 | var gOpts *Options 137 | 138 | // read-only 139 | func GetOptions() *Options { 140 | return gOpts 141 | } 142 | 143 | type DB interface { 144 | Get(ns, key []byte, opts *ReadOptions) (val []byte, meta *Meta, err error) 145 | Put(ns, key, val []byte, meta *Meta, opts *WriteOptions) error 146 | 147 | Write(batch *Batch, opts *WriteOptions) error 148 | Delete(ns, key []byte, opts *WriteOptions) error 149 | Close() 150 | } 151 | 152 | func TmpFilename(fid uint64) string { 153 | return fmt.Sprintf("%06d%s", fid, 
TmpFileSuffix) 154 | } 155 | 156 | func WalFilename(fid uint64) string { 157 | return fmt.Sprintf("%06d%s", fid, WalFileSuffix) 158 | } 159 | 160 | func HintFilename(fid uint64) string { 161 | return fmt.Sprintf("%06d%s", fid, HintFileSuffix) 162 | } 163 | 164 | func MergeFilename(fid uint64) string { 165 | return fmt.Sprintf("%06d%s", fid, MergeFileSuffix) 166 | } 167 | 168 | func ManifestFilename(fid uint64) string { 169 | return fmt.Sprintf("%s-%06d", ManifestFilePrefix, fid) 170 | } 171 | 172 | func TmpPath(dir string, fid uint64) string { 173 | return filepath.Join(dir, TmpFilename(fid)) 174 | } 175 | 176 | func WalPath(dir string, fid uint64) string { 177 | return filepath.Join(dir, WalFilename(fid)) 178 | } 179 | 180 | func HintPath(dir string, fid uint64) string { 181 | return filepath.Join(dir, HintFilename(fid)) 182 | } 183 | 184 | func ManifestPath(dir string, fid uint64) string { 185 | return filepath.Join(dir, ManifestFilename(fid)) 186 | } 187 | 188 | func MergePath(dir string, fid uint64) string { 189 | return filepath.Join(dir, MergeFilename(fid)) 190 | } 191 | 192 | func LockPath(dir string) string { 193 | return filepath.Join(dir, LockFile) 194 | } 195 | 196 | func CurrentPath(dir string) string { 197 | return filepath.Join(dir, CurrentFile) 198 | } 199 | 200 | func DefaultCompactionPicker(wals []PickerWalInfo) []uint64 { 201 | compactionPickerRatio := GetOptions().CompactionPickerRatio 202 | 203 | // reverse order 204 | sort.Slice(wals, func(i, j int) bool { 205 | return wals[i].FreeBytes > wals[j].FreeBytes 206 | }) 207 | 208 | var res []uint64 209 | for idx := range wals { 210 | size := float64(wals[idx].WalSize) 211 | free := float64(wals[idx].FreeBytes) 212 | 213 | if free/size < compactionPickerRatio { 214 | break 215 | } 216 | 217 | res = append(res, wals[idx].Fid) 218 | if len(res) >= 2 { 219 | break 220 | } 221 | } 222 | 223 | return res 224 | } 225 | -------------------------------------------------------------------------------- /block_cache.go: -------------------------------------------------------------------------------- 1 | package bitcask 2 | 3 | import ( 4 | "errors" 5 | "sync" 6 | "time" 7 | ) 8 | 9 | const ( 10 | // fid use 5 bytes 11 | BlockCacheFidBits = 40 12 | BlockCacheFidMask = (1 << BlockCacheFidBits) - 1 13 | BlockCacheFidShift = (64 - 40) // high 40-bits 14 | 15 | // each data block is 32KB, and 22 bits can locate 128 GB 16 | BlockCacheIdxBits = 22 17 | BlockCacheIdxMask = (1 << BlockCacheIdxBits) - 1 18 | BlockCacheIdxShift = 0 // low 22-bits 19 | 20 | // the extra 2 bits reserved 21 | ) 22 | 23 | var ( 24 | ErrBlockCacheMiss = errors.New("block cache miss") 25 | ErrBlockCacheFidOutOfRange = errors.New("block cache fid out of range") 26 | ErrBlockCacheIdxOutOfRange = errors.New("block cache idx out of range") 27 | ) 28 | 29 | func BlockCacheKey(fid, blkIdx uint64) (uint64, error) { 30 | if fid > BlockCacheFidMask { 31 | return 0, ErrBlockCacheFidOutOfRange 32 | } 33 | 34 | if blkIdx > BlockCacheIdxMask { 35 | return 0, ErrBlockCacheIdxOutOfRange 36 | } 37 | 38 | return (fid << BlockCacheFidShift) | blkIdx, nil 39 | } 40 | 41 | type BlockCacheOperator struct { 42 | helper MapOperatorBase 43 | } 44 | 45 | func (optr *BlockCacheOperator) Hash(key *uint64) uint64 { 46 | return *key 47 | } 48 | 49 | func (optr *BlockCacheOperator) Equals(lhs, rhs *uint64) bool { 50 | return *lhs == *rhs 51 | } 52 | 53 | func (optr *BlockCacheOperator) Rand(upper uint64) uint64 { 54 | return optr.helper.Rand(upper) 55 | } 56 | 57 | func (optr *BlockCacheOperator) 
WallTime() time.Time { 58 | return optr.helper.WallTime() 59 | } 60 | 61 | type BlockCache struct { 62 | maps *ShardMap[uint64, []byte] 63 | 64 | blkPool *sync.Pool 65 | } 66 | 67 | type BlockCacheOptions struct { 68 | Capacity uint64 69 | Limited uint64 70 | EvictionPoolCapacity uint64 71 | SampleKeys uint64 72 | 73 | Helper MapOperatorBase 74 | } 75 | 76 | func NewBlockCache(opts *BlockCacheOptions) (*BlockCache, error) { 77 | blkPool := &sync.Pool{ 78 | New: func() any { 79 | b := make([]byte, BlockSize) 80 | return &b 81 | }, 82 | } 83 | 84 | if opts == nil || opts.Capacity == 0 { 85 | return &BlockCache{ 86 | maps: nil, 87 | blkPool: blkPool, 88 | }, nil 89 | } 90 | 91 | mapOpts := &MapOptions{ 92 | Capacity: opts.Capacity, 93 | Limited: opts.Limited, 94 | EvictionPoolCapacity: opts.EvictionPoolCapacity, 95 | SampleKeys: opts.SampleKeys, 96 | } 97 | 98 | optr := &BlockCacheOperator{ 99 | helper: opts.Helper, 100 | } 101 | 102 | maps, err := NewShardMap[uint64, []byte](optr, mapOpts) 103 | if err != nil { 104 | return nil, err 105 | } 106 | 107 | return &BlockCache{ 108 | maps: maps, 109 | blkPool: blkPool, 110 | }, nil 111 | } 112 | 113 | func (c *BlockCache) key(fid, blkIdx uint64) (uint64, error) { 114 | if fid > BlockCacheFidMask { 115 | return 0, ErrBlockCacheFidOutOfRange 116 | } 117 | 118 | if blkIdx > BlockCacheIdxMask { 119 | return 0, ErrBlockCacheIdxOutOfRange 120 | } 121 | 122 | return (fid << BlockCacheFidShift) | blkIdx, nil 123 | } 124 | 125 | func (c *BlockCache) Get(fid, blkIdx uint64) ([]byte, error) { 126 | if c.maps == nil { 127 | return nil, ErrBlockCacheMiss 128 | } 129 | 130 | key, err := c.key(fid, blkIdx) 131 | if err != nil { 132 | return nil, err 133 | } 134 | 135 | if blkPtr, err := c.maps.Get(&key); err == nil { 136 | return *blkPtr, nil 137 | } 138 | 139 | return nil, ErrBlockCacheMiss 140 | } 141 | 142 | func (c *BlockCache) BatchGet(fid, blkStartIdx, blkNum uint64) ([][]byte, error) { 143 | if c.maps == nil { 144 | return nil, ErrBlockCacheMiss 145 | } 146 | 147 | blks := make([][]byte, blkNum) 148 | for i := uint64(0); i < blkNum; i++ { 149 | key, err := c.key(fid, blkStartIdx+i) 150 | if err != nil { 151 | return nil, err 152 | } 153 | 154 | blkPtr, err := c.maps.Get(&key) 155 | if err != nil { 156 | return nil, err 157 | } 158 | 159 | blks[i] = *blkPtr 160 | } 161 | 162 | return blks, nil 163 | } 164 | 165 | func (c *BlockCache) Put(fid, blkIdx, length uint64, blk []byte) error { 166 | if c.maps == nil { 167 | c.blkPool.Put(&blk) 168 | return nil 169 | } 170 | 171 | // the actual length is less than len(blk), and we should not put it into cache 172 | if int(length) != len(blk) { 173 | c.blkPool.Put(&blk) 174 | return nil 175 | } 176 | 177 | key, err := c.key(fid, blkIdx) 178 | if err != nil { 179 | c.blkPool.Put(&blk) 180 | return err 181 | } 182 | 183 | oldBlkPtr, err := c.maps.Set(&key, &blk) 184 | if err != nil { 185 | c.blkPool.Put(&blk) 186 | return err 187 | } 188 | 189 | // maybe no eviction 190 | if oldBlkPtr != nil { 191 | c.blkPool.Put(oldBlkPtr) 192 | } 193 | 194 | return nil 195 | } 196 | 197 | func (c *BlockCache) BatchPut(fid, blkIdx, length uint64, blks [][]byte) error { 198 | if c.maps == nil { 199 | for idx := range blks { 200 | c.blkPool.Put(&blks[idx]) 201 | } 202 | return nil 203 | } 204 | 205 | // blks must include at least one elements 206 | blkNum := uint64(len(blks)) 207 | if length != blkNum*uint64(len(blks[0])) { 208 | blkNum-- 209 | c.blkPool.Put(&blks[len(blks)-1]) 210 | } 211 | 212 | for i := uint64(0); i < blkNum; i++ { 
213 | key, err := c.key(fid, blkIdx+i) 214 | if err != nil { 215 | c.blkPool.Put(&blks[i]) 216 | continue 217 | } 218 | 219 | oldBlkPtr, err := c.maps.Set(&key, &blks[i]) 220 | if err != nil { 221 | c.blkPool.Put(&blks[i]) 222 | continue 223 | } 224 | 225 | if oldBlkPtr != nil { 226 | c.blkPool.Put(oldBlkPtr) 227 | } 228 | } 229 | 230 | return nil 231 | } 232 | 233 | func (c *BlockCache) NewBlock() []byte { 234 | blkPtr := c.blkPool.Get().(*[]byte) 235 | return *blkPtr 236 | } 237 | -------------------------------------------------------------------------------- /manifest_test.go: -------------------------------------------------------------------------------- 1 | package bitcask 2 | 3 | import ( 4 | "os" 5 | "testing" 6 | 7 | "github.com/stretchr/testify/assert" 8 | ) 9 | 10 | func createTmpdir(t *testing.T) (dir string, closer func()) { 11 | dir = t.TempDir() 12 | closer = func() { 13 | os.RemoveAll(dir) 14 | } 15 | return 16 | } 17 | 18 | func TestManifest_NewManifest(t *testing.T) { 19 | dir, closer := createTmpdir(t) 20 | defer closer() 21 | 22 | manifest, err := NewManifest(dir) 23 | assert.Nil(t, err) 24 | 25 | defer manifest.Close() 26 | 27 | assert.Equal(t, manifest.fid, uint64(1)) 28 | assert.Equal(t, manifest.nextFid, uint64(3)) 29 | assert.Equal(t, manifest.NextFid(), uint64(3)) 30 | 31 | active := manifest.ActiveWal() 32 | assert.NotNil(t, active) 33 | assert.Equal(t, active.Fid(), uint64(2)) 34 | 35 | assert.True(t, manifest.FileSize() > 0) 36 | } 37 | 38 | func TestManifest_LoadManifest(t *testing.T) { 39 | dir, closer := createTmpdir(t) 40 | defer closer() 41 | 42 | manifest, err := NewManifest(dir) 43 | assert.Nil(t, err) 44 | 45 | manifest.Close() 46 | 47 | // load the previous manifest 48 | manifest, err = LoadManifest(dir) 49 | assert.Nil(t, err) 50 | 51 | defer manifest.Close() 52 | 53 | assert.Equal(t, manifest.fid, uint64(1)) 54 | assert.Equal(t, manifest.nextFid, uint64(3)) 55 | 56 | active := manifest.ActiveWal() 57 | assert.NotNil(t, active) 58 | assert.Equal(t, active.Fid(), uint64(2)) 59 | 60 | assert.True(t, manifest.FileSize() > 0) 61 | } 62 | 63 | func TestManifest_RotateWal(t *testing.T) { 64 | dir, closer := createTmpdir(t) 65 | defer closer() 66 | 67 | manifest, err := NewManifest(dir) 68 | assert.Nil(t, err) 69 | 70 | defer manifest.Close() 71 | 72 | old, err := manifest.RotateWal() 73 | assert.Nil(t, err) 74 | assert.Equal(t, old.Fid(), uint64(2)) 75 | assert.Equal(t, old.refs.Load(), int64(1)) 76 | 77 | active := manifest.ActiveWal() 78 | assert.NotNil(t, active) 79 | assert.Equal(t, active.refs.Load(), int64(1)) 80 | 81 | assert.Equal(t, manifest.nextFid, uint64(4)) 82 | } 83 | 84 | func TestManifest_RotateManifest(t *testing.T) { 85 | dir, closer := createTmpdir(t) 86 | defer closer() 87 | 88 | manifest, err := NewManifest(dir) 89 | assert.Nil(t, err) 90 | 91 | defer manifest.Close() 92 | 93 | assert.Equal(t, manifest.fid, uint64(1)) 94 | 95 | err = manifest.RotateManifest() 96 | assert.Nil(t, err) 97 | 98 | assert.Equal(t, manifest.fid, uint64(3)) 99 | assert.Equal(t, manifest.nextFid, uint64(4)) 100 | } 101 | 102 | func TestManifest_Apply(t *testing.T) { 103 | dir, closer := createTmpdir(t) 104 | defer closer() 105 | 106 | manifest, err := NewManifest(dir) 107 | assert.Nil(t, err) 108 | 109 | active := manifest.ActiveWal() 110 | assert.Equal(t, active.refs.Load(), int64(1)) 111 | assert.Equal(t, active.Fid(), uint64(2)) 112 | 113 | defer manifest.Close() 114 | 115 | wal3, err := NewWal(WalPath(dir, 3), 3, -1) 116 | assert.Nil(t, err) 117 | 118 | 
wal4, err := NewWal(WalPath(dir, 4), 4, -1) 119 | assert.Nil(t, err) 120 | 121 | wal5, err := NewWal(WalPath(dir, 5), 5, -1) 122 | assert.Nil(t, err) 123 | 124 | // apply 125 | edit1 := &ManifestEdit{ 126 | addFiles: []LogFile{ 127 | {fid: 3, wal: wal3}, 128 | {fid: 4, wal: wal4}, 129 | {fid: 5, wal: wal5}, 130 | }, 131 | hasNextFid: true, 132 | nextFid: 6, 133 | } 134 | 135 | assert.Nil(t, manifest.Apply(edit1)) 136 | assert.Equal(t, wal3.refs.Load(), int64(2)) 137 | assert.Equal(t, wal4.refs.Load(), int64(2)) 138 | assert.Equal(t, wal5.refs.Load(), int64(2)) 139 | 140 | edit2 := &ManifestEdit{ 141 | deleteFiles: []LogFile{{fid: 4}, {fid: 5}}, 142 | } 143 | 144 | assert.Nil(t, manifest.Apply(edit2)) 145 | assert.Equal(t, wal3.refs.Load(), int64(2)) 146 | assert.Equal(t, wal4.refs.Load(), int64(1)) 147 | assert.Equal(t, wal5.refs.Load(), int64(1)) 148 | 149 | assert.Nil(t, manifest.ToWal(4)) 150 | assert.Nil(t, manifest.ToWal(5)) 151 | assert.NotNil(t, manifest.ToWal(3)) 152 | assert.NotNil(t, manifest.ToWal(2)) 153 | assert.Equal(t, manifest.nextFid, uint64(6)) 154 | } 155 | 156 | func TestManifest_LogAndApply(t *testing.T) { 157 | dir, closer := createTmpdir(t) 158 | defer closer() 159 | 160 | manifest, err := NewManifest(dir) 161 | assert.Nil(t, err) 162 | 163 | wal3, err := NewWal(WalPath(dir, 3), 3, -1) 164 | assert.Nil(t, err) 165 | 166 | wal4, err := NewWal(WalPath(dir, 4), 4, -1) 167 | assert.Nil(t, err) 168 | 169 | wal5, err := NewWal(WalPath(dir, 5), 5, -1) 170 | assert.Nil(t, err) 171 | 172 | // apply 173 | edit1 := &ManifestEdit{ 174 | addFiles: []LogFile{ 175 | {fid: 3, wal: wal3}, 176 | {fid: 4, wal: wal4}, 177 | {fid: 5, wal: wal5}, 178 | }, 179 | hasNextFid: true, 180 | nextFid: 6, 181 | } 182 | 183 | assert.Nil(t, manifest.LogAndApply(edit1)) 184 | assert.Equal(t, wal3.refs.Load(), int64(2)) 185 | assert.Equal(t, wal4.refs.Load(), int64(2)) 186 | assert.Equal(t, wal5.refs.Load(), int64(2)) 187 | 188 | edit2 := &ManifestEdit{ 189 | deleteFiles: []LogFile{{fid: 4}, {fid: 5}}, 190 | } 191 | 192 | assert.Nil(t, manifest.LogAndApply(edit2)) 193 | assert.Equal(t, wal3.refs.Load(), int64(2)) 194 | assert.Equal(t, wal4.refs.Load(), int64(1)) 195 | assert.Equal(t, wal5.refs.Load(), int64(1)) 196 | 197 | // re-open manifest 198 | manifest.Close() // all referenced wals will be closed 199 | 200 | manifest, err = LoadManifest(dir) 201 | assert.Nil(t, err) 202 | 203 | // check 204 | assert.Nil(t, manifest.ToWal(4)) 205 | assert.Nil(t, manifest.ToWal(5)) 206 | 207 | wal2 := manifest.ToWal(2) 208 | assert.NotNil(t, wal2) 209 | assert.Equal(t, wal2.refs.Load(), int64(1)) 210 | 211 | wal3 = manifest.ToWal(3) 212 | assert.NotNil(t, wal3) 213 | assert.Equal(t, wal3.refs.Load(), int64(1)) 214 | 215 | assert.Equal(t, manifest.nextFid, uint64(6)) 216 | } 217 | -------------------------------------------------------------------------------- /bench/bench_test.go: -------------------------------------------------------------------------------- 1 | package bench 2 | 3 | import ( 4 | "crypto/sha1" 5 | "os" 6 | "strconv" 7 | "testing" 8 | 9 | "github.com/stretchr/testify/assert" 10 | "github.com/wenzhang-dev/bitcaskDB" 11 | ) 12 | 13 | var ( 14 | dir string 15 | db *bitcask.DBImpl 16 | ) 17 | 18 | const BatchSize = 50 19 | 20 | func genTestKey(i int) []byte { 21 | return []byte(strconv.Itoa(i)) 22 | } 23 | 24 | var ( 25 | bin4KB = bitcask.GenNKBytes(4) 26 | ns = sha1.Sum([]byte("benchmark")) 27 | ) 28 | 29 | func newDB(b *testing.B) { 30 | dir = "./bitcaskDB" 31 | _ = os.RemoveAll(dir) 32 | _ = 
os.MkdirAll(dir, os.ModePerm) 33 | 34 | opts := &bitcask.Options{ 35 | Dir: dir, 36 | WalMaxSize: 1024 * 1024 * 1024, // 1GB 37 | ManifestMaxSize: 10 * 1024 * 1024, // 10MB 38 | IndexCapacity: 10000000, // 10 million 39 | IndexLimited: 8000000, 40 | IndexEvictionPoolCapacity: 64, 41 | IndexSampleKeys: 5, 42 | BlockCacheCapacity: 8192, // 256MB 43 | BlockCacheLimited: 8192, 44 | BlockCacheSampleKeys: 5, 45 | BlockCacheEvictionPoolCapacity: 32, 46 | BlockReaderConcurrent: 64, 47 | CompactionPicker: nil, // default picker 48 | CompactionFilter: nil, // default filter 49 | NsSize: bitcask.DefaultNsSize, 50 | EtagSize: bitcask.DefaultEtagSize, 51 | DisableCompaction: true, 52 | } 53 | 54 | var err error 55 | db, err = bitcask.NewDB(opts) 56 | assert.Nil(b, err) 57 | } 58 | 59 | func BenchmarkPutGet(b *testing.B) { 60 | b.Run("put_4K", benchmarkPut) 61 | b.Run("batchPut_4K", benchmarkBatchPut) 62 | b.Run("get_4K", benchmarkGet) 63 | b.Run("concurrentGet_4K", benchmarkConcurrentGet) 64 | b.Run("concurrentGetV2_4K", benchmarkConcurrentGetV2) 65 | b.Run("concurrentPut_4K", benchmarkConcurrentPut) 66 | b.Run("concurrentBatchPut_4K", benchmarkConcurrentBatchPut) 67 | } 68 | 69 | func benchmarkPut(b *testing.B) { 70 | newDB(b) 71 | defer db.Close() 72 | 73 | meta := bitcask.NewMeta(nil) 74 | opts := &bitcask.WriteOptions{} 75 | 76 | b.ResetTimer() 77 | b.ReportAllocs() 78 | 79 | for i := 0; i < b.N; i++ { 80 | err := db.Put(ns[:], genTestKey(i), bin4KB, meta, opts) 81 | assert.Nil(b, err) 82 | } 83 | } 84 | 85 | func benchmarkConcurrentPut(b *testing.B) { 86 | newDB(b) 87 | defer db.Close() 88 | 89 | meta := bitcask.NewMeta(nil) 90 | opts := &bitcask.WriteOptions{} 91 | 92 | b.ResetTimer() 93 | b.ReportAllocs() 94 | 95 | b.RunParallel(func(pb *testing.PB) { 96 | iteration := 0 97 | for pb.Next() { 98 | err := db.Put(ns[:], genTestKey(iteration), bin4KB, meta, opts) 99 | assert.Nil(b, err) 100 | 101 | iteration++ 102 | } 103 | }) 104 | } 105 | 106 | func benchmarkBatchPut(b *testing.B) { 107 | newDB(b) 108 | defer db.Close() 109 | 110 | meta := bitcask.NewMeta(nil) 111 | opts := &bitcask.WriteOptions{} 112 | 113 | b.ResetTimer() 114 | b.ReportAllocs() 115 | 116 | batch := bitcask.NewBatch() 117 | for i := 0; i < b.N; i++ { 118 | batch.Put(ns[:], genTestKey(i), bin4KB, meta) 119 | 120 | if i%BatchSize == 0 { 121 | err := db.Write(batch, opts) 122 | assert.Nil(b, err) 123 | batch.Clear() 124 | } 125 | } 126 | 127 | if batch.Size() != 0 { 128 | err := db.Write(batch, opts) 129 | assert.Nil(b, err) 130 | } 131 | } 132 | 133 | func getPrepare(b *testing.B) { 134 | meta := bitcask.NewMeta(nil) 135 | wOpts := &bitcask.WriteOptions{} 136 | 137 | batch := bitcask.NewBatch() 138 | for i := 0; i < 200001; i++ { 139 | batch.Put(ns[:], genTestKey(i), bin4KB, meta) 140 | 141 | if i%BatchSize == 0 { 142 | err := db.Write(batch, wOpts) 143 | assert.Nil(b, err) 144 | batch.Clear() 145 | } 146 | } 147 | } 148 | 149 | func benchmarkGet(b *testing.B) { 150 | newDB(b) 151 | defer db.Close() 152 | 153 | getPrepare(b) 154 | 155 | rOpts := &bitcask.ReadOptions{} 156 | 157 | b.ResetTimer() 158 | b.ReportAllocs() 159 | 160 | for i := 0; i < b.N; i++ { 161 | _, _, err := db.Get(ns[:], genTestKey(i%100000), rOpts) 162 | assert.Nilf(b, err, "i: %v, err: %v", i, err) 163 | } 164 | } 165 | 166 | func benchmarkConcurrentGet(b *testing.B) { 167 | newDB(b) 168 | defer db.Close() 169 | 170 | getPrepare(b) 171 | 172 | rOpts := &bitcask.ReadOptions{} 173 | 174 | b.ResetTimer() 175 | b.ReportAllocs() 176 | 177 | b.RunParallel(func(pb 
*testing.PB) { 178 | iteration := 0 179 | for pb.Next() { 180 | _, _, err := db.Get(ns[:], genTestKey(iteration%200000), rOpts) 181 | assert.Nil(b, err) 182 | 183 | iteration++ 184 | } 185 | }) 186 | } 187 | 188 | func benchmarkConcurrentGetV2(b *testing.B) { 189 | newDB(b) 190 | defer db.Close() 191 | 192 | getPrepare(b) 193 | 194 | rOpts := &bitcask.ReadOptions{} 195 | 196 | b.ResetTimer() 197 | b.ReportAllocs() 198 | 199 | b.RunParallel(func(pb *testing.PB) { 200 | iteration := 0 201 | for pb.Next() { 202 | _, _, err := db.GetV2(ns[:], genTestKey(iteration%200000), rOpts) 203 | assert.Nil(b, err) 204 | 205 | iteration++ 206 | } 207 | }) 208 | } 209 | 210 | func benchmarkConcurrentBatchPut(b *testing.B) { 211 | newDB(b) 212 | defer db.Close() 213 | 214 | meta := bitcask.NewMeta(nil) 215 | opts := &bitcask.WriteOptions{} 216 | 217 | b.ResetTimer() 218 | b.ReportAllocs() 219 | 220 | b.RunParallel(func(pb *testing.PB) { 221 | iteration := 0 222 | batch := bitcask.NewBatch() 223 | for pb.Next() { 224 | batch.Put(ns[:], genTestKey(iteration), bin4KB, meta) 225 | 226 | if iteration%BatchSize == 0 { 227 | err := db.Write(batch, opts) 228 | assert.Nil(b, err) 229 | batch.Clear() 230 | } 231 | 232 | iteration++ 233 | } 234 | }) 235 | } 236 | -------------------------------------------------------------------------------- /deque_test.go: -------------------------------------------------------------------------------- 1 | package bitcask 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/stretchr/testify/assert" 7 | ) 8 | 9 | func TestDequeBasicOperations(t *testing.T) { 10 | d := NewDeque[int]() 11 | 12 | // Initially, deque should be empty 13 | if !d.Empty() { 14 | t.Errorf("Deque should be empty initially") 15 | } 16 | 17 | // Insert elements 18 | d.PushBack(10) 19 | d.PushBack(20) 20 | d.PushFront(5) 21 | 22 | if d.Len() != 3 { 23 | t.Errorf("Expected length 3, got %d", d.Len()) 24 | } 25 | 26 | // Check Front and Back 27 | front, _ := d.Front() 28 | if *front != 5 { 29 | t.Errorf("Expected front to be 5, got %d", *front) 30 | } 31 | 32 | back, _ := d.Back() 33 | if *back != 20 { 34 | t.Errorf("Expected back to be 20, got %d", *back) 35 | } 36 | 37 | // Test At() 38 | val, _ := d.At(1) 39 | if *val != 10 { 40 | t.Errorf("Expected At(1) to be 10, got %d", *val) 41 | } 42 | 43 | // Test PopFront and PopBack 44 | _ = d.PopFront() // Remove 5 45 | front, _ = d.Front() 46 | if *front != 10 { 47 | t.Errorf("Expected front to be 10, got %d", *front) 48 | } 49 | 50 | _ = d.PopBack() // Remove 20 51 | back, _ = d.Back() 52 | if *back != 10 { 53 | t.Errorf("Expected back to be 10, got %d", *back) 54 | } 55 | 56 | _ = d.PopBack() // Remove 10, should become empty 57 | if !d.Empty() { 58 | t.Errorf("Deque should be empty after popping all elements") 59 | } 60 | } 61 | 62 | func TestDequeBounds(t *testing.T) { 63 | d := NewDeque[int]() 64 | 65 | // Calling Front/Back on an empty deque 66 | if _, err := d.Front(); err != ErrDequeEmpty { 67 | t.Errorf("Expected DequeEmptyErr for Front() on empty deque") 68 | } 69 | if _, err := d.Back(); err != ErrDequeEmpty { 70 | t.Errorf("Expected DequeEmptyErr for Back() on empty deque") 71 | } 72 | 73 | // PopFront / PopBack on an empty deque 74 | if err := d.PopFront(); err != ErrDequeEmpty { 75 | t.Errorf("Expected DequeEmptyErr for PopFront() on empty deque") 76 | } 77 | if err := d.PopBack(); err != ErrDequeEmpty { 78 | t.Errorf("Expected DequeEmptyErr for PopBack() on empty deque") 79 | } 80 | 81 | // Accessing out-of-range index 82 | d.PushBack(1) 83 | 
d.PushBack(2) 84 | if _, err := d.At(3); err != ErrDequeOutOfRange { 85 | t.Errorf("Expected DequeOutOfRangeErr for At(3)") 86 | } 87 | } 88 | 89 | func TestDequeAutoGrow(t *testing.T) { 90 | d := NewDeque[int]() 91 | 92 | // PushBack elements until deque expands 93 | for i := 0; i < 2000; i++ { 94 | d.PushBack(i) 95 | } 96 | 97 | if d.Len() != 2000 { 98 | t.Errorf("Expected length 2000, got %d", d.Len()) 99 | } 100 | 101 | // Ensure first 10 and last 10 elements are correct 102 | for i := 0; i < 10; i++ { 103 | val, _ := d.At(i) 104 | if *val != i { 105 | t.Errorf("At(%d) expected %d, got %d", i, i, *val) 106 | } 107 | } 108 | 109 | for i := 1990; i < 2000; i++ { 110 | val, _ := d.At(i) 111 | if *val != i { 112 | t.Errorf("At(%d) expected %d, got %d", i, i, *val) 113 | } 114 | } 115 | 116 | // Reverse PopBack, deque should become empty 117 | for i := 0; i < 2000; i++ { 118 | _ = d.PopBack() 119 | } 120 | 121 | if !d.Empty() { 122 | t.Errorf("Expected empty deque after popping all elements") 123 | } 124 | } 125 | 126 | func TestDequePushFrontPopBack(t *testing.T) { 127 | d := NewDeque[int]() 128 | 129 | // Insert into the front 130 | for i := 0; i < 100; i++ { 131 | d.PushFront(i) 132 | } 133 | 134 | if d.Len() != 100 { 135 | t.Errorf("Expected length 100, got %d", d.Len()) 136 | } 137 | 138 | // Remove elements from back, should be 0,1,2,...99 139 | for i := 0; i < 100; i++ { 140 | val, _ := d.Back() 141 | if *val != i { 142 | t.Errorf("Expected back %d, got %d", i, *val) 143 | } 144 | _ = d.PopBack() 145 | } 146 | 147 | if !d.Empty() { 148 | t.Errorf("Expected empty deque after popping all elements") 149 | } 150 | } 151 | 152 | func TestDequeLargeData(t *testing.T) { 153 | d := NewDeque[int]() 154 | num := 1_000_000 155 | 156 | // Performance test: Insert 1M elements 157 | for i := 0; i < num; i++ { 158 | d.PushBack(i) 159 | } 160 | 161 | if d.Len() != num { 162 | t.Errorf("Expected length %d, got %d", num, d.Len()) 163 | } 164 | 165 | // Check first and last 10 elements 166 | for i := 0; i < 10; i++ { 167 | val, _ := d.At(i) 168 | if *val != i { 169 | t.Errorf("At(%d) expected %d, got %d", i, i, *val) 170 | } 171 | } 172 | 173 | for i := num - 10; i < num; i++ { 174 | val, _ := d.At(i) 175 | if *val != i { 176 | t.Errorf("At(%d) expected %d, got %d", i, i, *val) 177 | } 178 | } 179 | 180 | // Remove all elements 181 | for i := 0; i < num; i++ { 182 | _ = d.PopFront() 183 | } 184 | 185 | if !d.Empty() { 186 | t.Errorf("Expected empty deque after popping all elements") 187 | } 188 | } 189 | 190 | func TestDequeCornerCase1(t *testing.T) { 191 | d := NewDeque[int]() 192 | 193 | for i := 0; i < DequeChunkSize; i++ { 194 | d.PushBack(i) 195 | } 196 | 197 | d.PushBack(100) 198 | 199 | for i := 0; i < DequeChunkSize; i++ { 200 | err := d.PopFront() 201 | assert.Nil(t, err) 202 | } 203 | 204 | assert.Equal(t, d.Len(), 1) 205 | num, err := d.Front() 206 | assert.Nil(t, err) 207 | assert.Equal(t, *num, 100) 208 | 209 | err = d.PopFront() 210 | assert.Nil(t, err) 211 | assert.True(t, d.Empty()) 212 | } 213 | 214 | func TestDequeCornerCase2(t *testing.T) { 215 | d := NewDeque[int]() 216 | 217 | for i := 0; i < DequeChunkSize*(DequeFrontReserveSize+1); i++ { 218 | d.PushFront(i) 219 | } 220 | 221 | for i := 0; i < DequeChunkSize; i++ { 222 | d.PushBack(i) 223 | } 224 | 225 | assert.Equal(t, d.Len(), int(DequeChunkSize*(DequeFrontReserveSize+2))) 226 | 227 | for i := 0; i < DequeChunkSize; i++ { 228 | err := d.PopFront() 229 | assert.Nil(t, err) 230 | } 231 | 232 | for i := 0; i < 
DequeChunkSize*(DequeFrontReserveSize+1); i++ { 233 | err := d.PopBack() 234 | assert.Nil(t, err) 235 | } 236 | 237 | assert.True(t, d.Empty()) 238 | } 239 | 240 | func TestDequeMemoryInGrow(t *testing.T) { 241 | d := NewDeque[int]() 242 | 243 | d.PushFront(1000) 244 | addr1, err := d.Front() 245 | assert.Nil(t, err) 246 | 247 | for i := 0; i < DequeChunkSize*(DequeFrontReserveSize+1); i++ { 248 | d.PushFront(i) 249 | } 250 | 251 | addr2, err := d.Back() 252 | assert.Nil(t, err) 253 | assert.Equal(t, addr1, addr2) 254 | 255 | for i := 0; i < DequeChunkSize*(DequeFrontReserveSize+1); i++ { 256 | err = d.PopFront() 257 | assert.Nil(t, err) 258 | } 259 | 260 | addr3, err := d.Front() 261 | assert.Nil(t, err) 262 | assert.Equal(t, addr2, addr3) 263 | assert.Equal(t, *addr3, 1000) 264 | } 265 | -------------------------------------------------------------------------------- /wal_test.go: -------------------------------------------------------------------------------- 1 | package bitcask 2 | 3 | import ( 4 | "os" 5 | "testing" 6 | 7 | "github.com/stretchr/testify/assert" 8 | ) 9 | 10 | func setupWal(name string, t *testing.T) *Wal { 11 | wal, err := NewWal(name, 0, -1) 12 | assert.Nil(t, err) 13 | return wal 14 | } 15 | 16 | // Test basic WAL operations: writing and reading records 17 | func TestWal_BasicOperations(t *testing.T) { 18 | wal := setupWal("test_wal_basic.wal", t) 19 | defer wal.Unref() 20 | 21 | data := []byte("hello world") 22 | offset, err := wal.WriteRecord(data) 23 | if err != nil { 24 | t.Fatalf("Failed to write record: %v", err) 25 | } 26 | assert.Nil(t, wal.Flush()) 27 | 28 | readData, err := wal.ReadRecord(offset, uint64(len(data)), true) 29 | if err != nil { 30 | t.Fatalf("Failed to read record: %v", err) 31 | } 32 | 33 | if string(readData) != string(data) { 34 | t.Fatalf("Data mismatch: expected %s, got %s", string(data), string(readData)) 35 | } 36 | 37 | readData, err = wal.ReadRecord(offset, uint64(len(data)), false) 38 | assert.Nil(t, err) 39 | assert.Equal(t, data, readData) 40 | } 41 | 42 | // Test WAL behavior with multiple writes and reads 43 | func TestWal_MultipleRecords(t *testing.T) { 44 | wal := setupWal("test_wal_multiple.wal", t) 45 | defer wal.Unref() 46 | 47 | records := [][]byte{ 48 | []byte("first record"), 49 | []byte("second record"), 50 | []byte("third record"), 51 | } 52 | 53 | var offsets []uint64 54 | for _, record := range records { 55 | offset, err := wal.WriteRecord(record) 56 | if err != nil { 57 | t.Fatalf("Failed to write record: %v", err) 58 | } 59 | offsets = append(offsets, offset) 60 | } 61 | assert.Nil(t, wal.Flush()) 62 | 63 | for i, offset := range offsets { 64 | readData, err := wal.ReadRecord(offset, uint64(len(records[i])), true) 65 | if err != nil { 66 | t.Fatalf("Failed to read record at offset %d: %v", offset, err) 67 | } 68 | assert.Equal(t, readData, records[i]) 69 | } 70 | } 71 | 72 | // Test WAL record spanning multiple blocks 73 | func TestWal_LargeRecord(t *testing.T) { 74 | wal := setupWal("test_wal_large.wal", t) 75 | defer wal.Unref() 76 | 77 | largeData := make([]byte, BlockSize*2) // A record spanning multiple blocks 78 | for i := range largeData { 79 | largeData[i] = byte(i % 256) 80 | } 81 | 82 | offset, err := wal.WriteRecord(largeData) 83 | if err != nil { 84 | t.Fatalf("Failed to write large record: %v", err) 85 | } 86 | assert.Nil(t, wal.Flush()) 87 | 88 | readData, err := wal.ReadRecord(offset, uint64(len(largeData)), true) 89 | if err != nil { 90 | t.Fatalf("Failed to read large record: %v", err) 91 | } 92 | 
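// Note: largeData is BlockSize*2 bytes, so this single record necessarily
// crosses block boundaries; the WAL read path has to reassemble the full
// payload before the equality check just below can pass.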
93 | assert.Equal(t, readData, largeData) 94 | } 95 | 96 | func TestWal_LargeRecord2(t *testing.T) { 97 | wal := setupWal("test_wal_large2.wal", t) 98 | defer wal.Unref() 99 | 100 | data := GenNKBytes(5) 101 | offsets := make([]uint64, 1000) 102 | for i := 0; i < 1000; i++ { 103 | off, err := wal.WriteRecord(data) 104 | assert.Nil(t, err) 105 | assert.Nil(t, wal.Flush()) 106 | offsets[i] = off 107 | } 108 | 109 | // check 110 | for i := 0; i < 1000; i++ { 111 | readData, err := wal.ReadRecord(offsets[i], uint64(len(data)), true) 112 | assert.Nil(t, err) 113 | assert.Equal(t, readData, data) 114 | } 115 | } 116 | 117 | // Test handling of corrupted WAL records 118 | func TestWal_CorruptedRead(t *testing.T) { 119 | filename := "test_wal_corrupt.wal" 120 | wal := setupWal(filename, t) 121 | defer os.Remove(wal.Path()) 122 | 123 | data := []byte("valid record") 124 | offset, err := wal.WriteRecord(data) 125 | if err != nil { 126 | t.Fatalf("Failed to write record: %v", err) 127 | } 128 | assert.Nil(t, wal.Flush()) 129 | 130 | // Close WAL before corrupting the file 131 | wal.Close() 132 | 133 | // Manually corrupt the file 134 | file, err := os.OpenFile(filename, os.O_RDWR, 0o644) 135 | if err != nil { 136 | t.Fatalf("Failed to open WAL file for corruption: %v", err) 137 | } 138 | 139 | _, err = file.WriteAt([]byte{0xFF, 0xFF}, int64(offset+2)) // Corrupt part of the record 140 | if err != nil { 141 | t.Fatalf("Failed to corrupt WAL file: %v", err) 142 | } 143 | 144 | // Reopen WAL and try to read 145 | wal, err = LoadWal(filename, 0) 146 | if err != nil { 147 | t.Fatalf("Failed to reopen WAL: %v", err) 148 | } 149 | defer wal.Close() 150 | 151 | _, err = wal.ReadRecord(offset, uint64(len(data)), true) 152 | if err == nil { 153 | t.Fatalf("Expected error when reading corrupted record, but got none") 154 | } 155 | } 156 | 157 | // Test padding when block space is insufficient 158 | func TestWal_BlockPadding(t *testing.T) { 159 | wal := setupWal("test_wal_padding.wal", t) 160 | defer wal.Unref() 161 | 162 | // Write a record that nearly fills a block 163 | data := make([]byte, BlockSize-RecordHeaderSize) 164 | offset, err := wal.WriteRecord(data) 165 | if err != nil { 166 | t.Fatalf("Failed to write record: %v", err) 167 | } 168 | assert.Nil(t, wal.Flush()) 169 | 170 | // Write another record that should go into the next block due to padding 171 | secondData := []byte("new block record") 172 | secondOffset, err := wal.WriteRecord(secondData) 173 | if err != nil { 174 | t.Fatalf("Failed to write second record: %v", err) 175 | } 176 | assert.Nil(t, wal.Flush()) 177 | 178 | // Ensure both records can be read correctly 179 | readData, err := wal.ReadRecord(offset, uint64(len(data)), true) 180 | if err != nil { 181 | t.Fatalf("Failed to read first record: %v", err) 182 | } 183 | assert.Equal(t, readData, data) 184 | 185 | readData, err = wal.ReadRecord(secondOffset, uint64(len(secondData)), true) 186 | if err != nil { 187 | t.Fatalf("Failed to read second record: %v", err) 188 | } 189 | assert.Equal(t, readData, secondData) 190 | } 191 | 192 | // Test reopening WAL and ensuring persistence 193 | func TestWal_ReopenPersistence(t *testing.T) { 194 | filename := "test_wal_persistence.wal" 195 | wal := setupWal(filename, t) 196 | defer os.Remove(filename) 197 | 198 | data := []byte("persistent data") 199 | offset, err := wal.WriteRecord(data) 200 | assert.Nil(t, err) 201 | assert.Nil(t, wal.Flush()) 202 | 203 | // Close and reopen WAL 204 | wal.Close() 205 | wal, err = LoadWal(filename, 0) 206 | 
assert.Nil(t, err) 207 | 208 | // write one record 209 | data1 := []byte("one record") 210 | offset1, err := wal.WriteRecord(data1) 211 | assert.Nil(t, err) 212 | assert.Nil(t, wal.Flush()) 213 | 214 | // check 215 | readData, err := wal.ReadRecord(offset, uint64(len(data)), true) 216 | assert.Nil(t, err) 217 | assert.Equal(t, readData, data) 218 | 219 | readData, err = wal.ReadRecord(offset1, uint64(len(data1)), true) 220 | assert.Nil(t, err) 221 | assert.Equal(t, readData, data1) 222 | 223 | // repeat check 224 | wal.Close() 225 | wal, err = LoadWal(filename, 0) 226 | assert.Nil(t, err) 227 | 228 | defer wal.Close() 229 | 230 | readData, err = wal.ReadRecord(offset, uint64(len(data)), true) 231 | assert.Nil(t, err) 232 | assert.Equal(t, readData, data) 233 | 234 | readData, err = wal.ReadRecord(offset1, uint64(len(data1)), true) 235 | assert.Nil(t, err) 236 | assert.Equal(t, readData, data1) 237 | } 238 | -------------------------------------------------------------------------------- /record.go: -------------------------------------------------------------------------------- 1 | package bitcask 2 | 3 | import ( 4 | "encoding/binary" 5 | "errors" 6 | 7 | "github.com/vmihailenco/msgpack/v5" 8 | ) 9 | 10 | type Record struct { 11 | Ns []byte 12 | Key []byte 13 | Value []byte 14 | Meta *Meta 15 | 16 | // mark whether it's a delete operation 17 | // it's fundamentally different from the meta tombstone. 18 | // the tombstone indicates s soft deletion, but in fact, the key still exists in database 19 | // the deleted tag indicates that the key will be removed directly in the database 20 | // this deleted tag would not be serialized 21 | Deleted bool 22 | } 23 | 24 | // serialization format: 25 | // | header size | header | key | value | meta | 26 | // 27 | // the header including: 28 | // - header size: 1B 29 | // - ns: fixed size 30 | // - flags: 1B 31 | // - key size: varint32 1~5B 32 | // - value size: varint32 1~5B 33 | // - meta size: varint32 1~5B 34 | // - etag: optional, fixed size 35 | // - expire: optional, varint32 1~5B 36 | // - other optional fields if need 37 | // 38 | // for small record: 39 | // if key=16B, value=128B, meta=64B, its header is about 50B, and effective space usage is about 80% 40 | // 41 | // for medium record: 42 | // if key=64B, value=128KB, meta=1KB, it's header is about 50B, and effective space usage is abort 99% 43 | 44 | const ( 45 | noEtagFieldBit = 0 46 | noExpireFieldBit = 1 47 | tombstoneFieldBit = 2 48 | ) 49 | 50 | func (r *Record) ApproximateSize() int { 51 | // 1B header size + len(ns) + 1B flags + 2B varint * 3 + len(meta) + 2B expire 52 | approximateRecordHeaderSize := 1 + len(r.Ns) + 1 + 2*3 + len(r.Meta.Etag) + 2 53 | 54 | return approximateRecordHeaderSize + len(r.Key) + len(r.Value) + r.Meta.AppMetaApproximateSize() 55 | } 56 | 57 | func (r *Record) Encode(backStore []byte, baseTime uint64) ([]byte, error) { 58 | flag := byte(0) 59 | if len(r.Meta.Etag) == 0 { 60 | flag |= byte(1 << noEtagFieldBit) 61 | } 62 | 63 | if r.Meta.IsTombstone() { 64 | flag |= byte(1 << tombstoneFieldBit) 65 | } 66 | 67 | expireSize := 0 68 | var expireBytes [binary.MaxVarintLen32]byte 69 | switch { 70 | case r.Meta.Expire == MetaNoExpire: 71 | flag |= byte(1 << noExpireFieldBit) 72 | 73 | case r.Meta.Expire < baseTime: 74 | return nil, errors.New("invalid expire") 75 | 76 | default: // expire > base time 77 | expireSize = binary.PutUvarint(expireBytes[:], r.Meta.Expire-baseTime) 78 | } 79 | 80 | var err error 81 | var metaEncoded []byte 82 | if r.Meta.AppMetaSize != 
0 { 83 | if metaEncoded, err = msgpack.Marshal(r.Meta.AppMeta); err != nil { 84 | return nil, err 85 | } 86 | } 87 | 88 | // try to encode the varint32 fields 89 | offset := 0 90 | var tmp [3 * binary.MaxVarintLen32]byte 91 | offset += binary.PutUvarint(tmp[offset:], uint64(len(r.Key))) 92 | offset += binary.PutUvarint(tmp[offset:], uint64(len(r.Value))) 93 | offset += binary.PutUvarint(tmp[offset:], uint64(len(metaEncoded))) 94 | tmpSize := offset 95 | 96 | // plus 2 bytes: flag and header size 97 | headerSize := offset + expireSize + len(r.Ns) + len(r.Meta.Etag) + 2 98 | totalSize := headerSize + len(r.Key) + len(r.Value) + len(metaEncoded) 99 | 100 | // prefer use the backing store 101 | buf := backStore[0:] 102 | if totalSize > cap(backStore) { 103 | buf = make([]byte, totalSize) 104 | } 105 | 106 | offset = 0 107 | 108 | // header size 109 | buf[0] = byte(headerSize) 110 | offset++ 111 | 112 | // namespace 113 | offset += copy(buf[offset:], r.Ns) 114 | 115 | // flag 116 | buf[offset] = flag 117 | offset++ 118 | 119 | // varint 120 | offset += copy(buf[offset:], tmp[:tmpSize]) 121 | 122 | // optional etag 123 | offset += copy(buf[offset:], r.Meta.Etag) 124 | 125 | // optional ttl 126 | offset += copy(buf[offset:], expireBytes[:expireSize]) 127 | 128 | // key 129 | offset += copy(buf[offset:], r.Key) 130 | 131 | // value 132 | offset += copy(buf[offset:], r.Value) 133 | 134 | // meta 135 | offset += copy(buf[offset:], metaEncoded) 136 | 137 | return buf[:offset], nil 138 | } 139 | 140 | func RecordFromBytes(data []byte, baseTime uint64) (*Record, error) { 141 | nsSize := int(GetOptions().NsSize) 142 | 143 | // 1B header size + len(ns) + 1B flags + 1B variant * 3 144 | minRecordHeaderSize := 1 + nsSize + 1 + 1*3 145 | 146 | if len(data) < minRecordHeaderSize { 147 | return nil, errors.New("invalid data") 148 | } 149 | 150 | offset := 0 151 | 152 | // header size 153 | headerSize := int(data[0]) 154 | offset++ 155 | 156 | // namespace 157 | ns := data[offset : offset+nsSize] 158 | offset += nsSize 159 | 160 | // flag 161 | flag := data[offset] 162 | offset++ 163 | 164 | // key size 165 | keyLen, keySize := DecodeUvarint(data[offset:]) 166 | offset += keySize 167 | 168 | // value size 169 | valLen, valSize := DecodeUvarint(data[offset:]) 170 | offset += valSize 171 | 172 | // meta size 173 | metaLen, metaSize := DecodeUvarint(data[offset:]) 174 | offset += metaSize 175 | 176 | // validation 177 | // avoid out of range of data buffer 178 | etagLen := int(GetOptions().EtagSize) 179 | if flag&(1< 0 { 218 | var appMeta map[string]string 219 | if err := msgpack.Unmarshal(meta, &appMeta); err != nil { 220 | return nil, err 221 | } 222 | serverMeta.SetAppMeta(appMeta) 223 | } 224 | 225 | serverMeta.SetEtag( 226 | etag, 227 | ).SetTombstone( 228 | flag&(1<= len(op.fixedValues) { 35 | op.index = 0 36 | } 37 | val := op.fixedValues[op.index] % n 38 | op.index++ 39 | return val 40 | } 41 | 42 | func (op *mockSimpleMapOperator) WallTime() time.Time { 43 | return time.Now() 44 | } 45 | 46 | func TestMap_SimpleMapBasicOperations(t *testing.T) { 47 | evictionOrder := []uint64{1, 2, 3} // Define a fixed eviction order 48 | optr := &mockSimpleMapOperator{fixedValues: evictionOrder} 49 | opts := &MapOptions{ 50 | Capacity: 100, 51 | Limited: 80, 52 | EvictionPoolCapacity: 16, 53 | SampleKeys: 3, 54 | } 55 | 56 | m, err := NewMap[uint64, uint64](optr, opts) 57 | assert.Nil(t, err) 58 | 59 | key1, val1 := uint64(1), uint64(1) 60 | key2, val2 := uint64(2), uint64(2) 61 | key3, val3 := uint64(3), uint64(3) 62 | 
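// The mock operator above hands back the fixed sequence {1, 2, 3} from Rand,
// so eviction sampling is deterministic in this test: once more than
// Limited (80) entries are inserted, key 1 should be the first one evicted,
// which the lookup further down verifies.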
63 | // Test Set and Get 64 | _, _ = m.Set(&key1, &val1) 65 | _, _ = m.Set(&key2, &val2) 66 | _, _ = m.Set(&key3, &val3) 67 | 68 | res, err := m.Get(&key1) 69 | assert.Nil(t, err) 70 | assert.Equal(t, val1, *res) 71 | 72 | res, err = m.Get(&key2) 73 | assert.Nil(t, err) 74 | assert.Equal(t, val2, *res) 75 | 76 | // Update existing element 77 | val1Updated := uint64(11) 78 | old, err := m.Set(&key1, &val1Updated) 79 | assert.Nil(t, err) 80 | assert.Equal(t, *old, val1) 81 | 82 | res, err = m.Get(&key1) 83 | assert.Nil(t, err) 84 | assert.Equal(t, val1Updated, *res) 85 | 86 | // Exceed Limited and trigger eviction 87 | for i := 4; i <= 81; i++ { 88 | key, val := uint64(i), uint64(i) 89 | _, err = m.Set(&key, &val) 90 | assert.Nil(t, err) 91 | } 92 | 93 | _, err = m.Get(&evictionOrder[0]) 94 | assert.NotNil(t, err) // Since eviction occurs, the first evicted key should be removed 95 | 96 | // Test Delete 97 | old, err = m.Delete(&key2) 98 | assert.Nil(t, err) 99 | assert.Equal(t, *old, val2) 100 | 101 | _, err = m.Get(&key2) 102 | assert.NotNil(t, err) 103 | assert.True(t, errors.Is(err, ErrKeyNotFound)) 104 | } 105 | 106 | func TestMap_SimpleMapEvictionOrder(t *testing.T) { 107 | evictionOrder := []uint64{1, 2, 3, 4, 5, 6} // Fixed eviction order 108 | optr := &mockSimpleMapOperator{fixedValues: evictionOrder} 109 | opts := &MapOptions{ 110 | Capacity: 100, 111 | Limited: 80, 112 | EvictionPoolCapacity: 16, 113 | SampleKeys: 3, 114 | } 115 | 116 | m, err := NewMap[uint64, uint64](optr, opts) 117 | assert.Nil(t, err) 118 | 119 | // reach the limit 120 | for i := 1; i <= 80; i++ { 121 | key, val := uint64(i), uint64(i) 122 | 123 | old, err := m.Set(&key, &val) 124 | assert.Nil(t, err) 125 | assert.Nil(t, old) 126 | } 127 | 128 | // insert a new key, triggering eviction 129 | key81, val81 := uint64(81), uint64(81) 130 | old, err := m.Set(&key81, &val81) 131 | assert.Nil(t, err) 132 | assert.Equal(t, *old, uint64(1)) 133 | 134 | // the first evicted key should be 1 135 | key1 := uint64(1) 136 | _, err = m.Get(&key1) 137 | assert.NotNil(t, err) 138 | assert.True(t, errors.Is(err, ErrKeyNotFound)) 139 | 140 | // the remaining keys should still exist 141 | for i := 2; i <= 81; i++ { 142 | key, val := uint64(i), uint64(i) 143 | res, err := m.Get(&key) 144 | assert.Nil(t, err) 145 | assert.Equal(t, *res, val) 146 | } 147 | 148 | // insert more keys to trigger further eviction 149 | key82, val82 := uint64(82), uint64(82) 150 | old, err = m.Set(&key82, &val82) 151 | assert.Nil(t, err) 152 | assert.Equal(t, *old, uint64(2)) 153 | 154 | // the second evicted key should be 2 155 | key2 := uint64(2) 156 | _, err = m.Get(&key2) 157 | assert.NotNil(t, err) 158 | assert.True(t, errors.Is(err, ErrKeyNotFound)) 159 | 160 | // ensure remaining keys are still available 161 | for i := 3; i <= 82; i++ { 162 | key, val := uint64(i), uint64(i) 163 | res, err := m.Get(&key) 164 | assert.Nil(t, err) 165 | assert.Equal(t, *res, val) 166 | } 167 | } 168 | 169 | type mockShardMapOperator struct{} 170 | 171 | func (op *mockShardMapOperator) Hash(key *[]byte) uint64 { 172 | hasher := murmur3.New64() 173 | hasher.Write(*key) 174 | return hasher.Sum64() 175 | } 176 | 177 | func (op *mockShardMapOperator) Equals(lhs, rhs *[]byte) bool { 178 | return bytes.Equal(*lhs, *rhs) 179 | } 180 | 181 | func (op *mockShardMapOperator) Rand(n uint64) uint64 { 182 | return uint64(rand.Int63n(int64(n))) 183 | } 184 | 185 | func (op *mockShardMapOperator) WallTime() time.Time { 186 | return time.Now() 187 | } 188 | 189 | func 
TestMap_ShardMapBasic(t *testing.T) { 190 | opts := &MapOptions{ 191 | Capacity: 1000, 192 | Limited: 800, 193 | EvictionPoolCapacity: 16, 194 | SampleKeys: 3, 195 | } 196 | 197 | m, err := NewShardMap[[]byte, []byte](&mockShardMapOperator{}, opts) 198 | assert.Nil(t, err) 199 | 200 | key1, val1 := []byte("123"), []byte("123") 201 | key2, val2 := []byte("456"), []byte("456") 202 | key3, val3 := []byte("789"), []byte("789") 203 | 204 | // Test Set and Get 205 | _, _ = m.Set(&key1, &val1) 206 | _, _ = m.Set(&key2, &val2) 207 | _, _ = m.Set(&key3, &val3) 208 | 209 | res, err := m.Get(&key1) 210 | assert.Nil(t, err) 211 | assert.Equal(t, val1, *res) 212 | 213 | res, err = m.Get(&key2) 214 | assert.Nil(t, err) 215 | assert.Equal(t, val2, *res) 216 | 217 | // Update existing element 218 | val1Updated := []byte("111") 219 | old, err := m.Set(&key1, &val1Updated) 220 | assert.Nil(t, err) 221 | assert.Equal(t, *old, val1) 222 | 223 | res, err = m.Get(&key1) 224 | assert.Nil(t, err) 225 | assert.Equal(t, val1Updated, *res) 226 | 227 | // Test Delete 228 | old, err = m.Delete(&key2) 229 | assert.Nil(t, err) 230 | assert.Equal(t, *old, val2) 231 | 232 | _, err = m.Get(&key2) 233 | assert.NotNil(t, err) 234 | assert.True(t, errors.Is(err, ErrKeyNotFound)) 235 | } 236 | 237 | func TestMap_ShardMapLRUEviction(t *testing.T) { 238 | opts := &MapOptions{ 239 | Capacity: 1000000, 240 | Limited: 800000, 241 | EvictionPoolCapacity: 32, 242 | SampleKeys: 5, 243 | } 244 | 245 | m, err := NewShardMap[[]byte, []byte](&mockShardMapOperator{}, opts) 246 | assert.Nil(t, err) 247 | 248 | for i := 1; i < 1000000; i++ { 249 | numStr := strconv.Itoa(i) 250 | key, val := []byte(numStr), []byte(numStr) 251 | 252 | _, err := m.Set(&key, &val) 253 | assert.Nil(t, err) 254 | } 255 | 256 | // the first half of the elements are evicted more 257 | num := 0 258 | for i := 1; i <= 500000; i++ { 259 | numStr := strconv.Itoa(i) 260 | key, val := []byte(numStr), []byte(numStr) 261 | res, err := m.Get(&key) 262 | if err != nil && errors.Is(err, ErrKeyNotFound) { 263 | num++ 264 | } 265 | 266 | if err == nil { 267 | assert.Equal(t, *res, val) 268 | } 269 | } 270 | 271 | // total eviction elements should be 200000 (capacity - limited) 272 | assert.True(t, num > 100000) 273 | } 274 | 275 | func TestMap_ShardMapConcurrentReadAndWrite(t *testing.T) { 276 | opts := &MapOptions{ 277 | Capacity: 1000000, 278 | Limited: 800000, 279 | EvictionPoolCapacity: 32, 280 | SampleKeys: 5, 281 | } 282 | 283 | genBytes := func(i, j int) []byte { 284 | return []byte("i" + strconv.Itoa(i) + "j" + strconv.Itoa(j)) 285 | } 286 | 287 | m, err := NewShardMap[[]byte, []byte](&mockShardMapOperator{}, opts) 288 | assert.Nil(t, err) 289 | 290 | var wg sync.WaitGroup 291 | 292 | for i := 0; i < 20; i++ { 293 | wg.Add(1) 294 | go func(i int) { 295 | defer wg.Done() 296 | for j := 0; j < 1000; j++ { 297 | key := genBytes(i, j) 298 | val := genBytes(i, j) 299 | 300 | _, err := m.Set(&key, &val) 301 | assert.Nil(t, err) 302 | } 303 | 304 | for j := 0; j < 1000; j++ { 305 | key := genBytes(i, j) 306 | val, err := m.Get(&key) 307 | assert.Nil(t, err) 308 | assert.Equal(t, key, *val) 309 | } 310 | }(i) 311 | } 312 | 313 | wg.Wait() 314 | } 315 | -------------------------------------------------------------------------------- /README-CN.md: -------------------------------------------------------------------------------- 1 | # bitcaskDB 是什么? 
2 | 3 | bitcaskDB是一个基于bitcask存储模型的轻量级、快速、固定容量的键值对存储引擎。 4 | 5 | 它最大的特点是在内存中缓存键值对的索引,每次查询只需要单次 disk seek。按照 100 字节 key,4KB value 的小对象计算,缓存 10 million 个对象,大约需要 1GB 内存,40GB 磁盘空间。相反,如果采用类似 redis,memcached 全内存的缓存方案,相比之下,内存的开销很大。 6 | 7 | # 动机 8 | 9 | - 硬件资源受限,如 4C8G 100G 磁盘 10 | - 缓存数以千万的小对象 11 | 12 | 13 | # 特性 14 | 15 | - 追加写 16 | - 固定长度的 namespace 17 | - 固定磁盘容量和内存用量 18 | - 细粒度的合并 19 | - 近似 LRU 淘汰策略 20 | - 自定义记录的元数据 21 | - 自定义合并策略 22 | - 自定义挑选策略 23 | - 批量写 24 | - 允许过期时间和数据指纹 Etag 25 | - 基于 hint 的快速恢复 26 | - 软删除 27 | 28 | # 对比分析 29 | 30 | ## LSM 31 | - 追加写 32 | - 读操作可能需要多次随机寻址 33 | - 写放大 34 | - 链式合并 35 | - 范围查询 36 | - 有序性 37 | - 回收磁盘空间较慢 38 | - 多个数据版本 39 | 40 | 41 | ## B+Tree 42 | - 原地更新 43 | - 有序性 44 | - 范围查询 45 | - 很难回收磁盘空间 46 | 47 | 48 | ## Bitcask 49 | - 追加写 50 | - 明确的查询和插入性能 51 | - 查询仅需要单次寻址 52 | - 快速的回收磁盘空间 53 | - 内存仅保留最新的数据版本 54 | - 内存可使用多种数据模型,如 btree,hashtable 55 | - hashtable 更加紧凑,但无序,不支持范围查询 56 | - btree 支持范围查询,顺序迭代,但内存开销更大 57 | 58 | 59 | # 快速开始 60 | 61 | 62 | ```golang 63 | import "github.com/wenzhang-dev/bitcaskDB" 64 | 65 | const data = ` 66 | 67 | 68 | 69 | Hello Page 70 | 71 | 72 |

Hello, BitcaskDB!

73 | 74 | 75 | ` 76 | 77 | func main() { 78 | opts := &bitcask.Options{ 79 | Dir: "./bitcaskDB", 80 | WalMaxSize: 1024 * 1024 * 1024, // 1GB 81 | ManifestMaxSize: 1024 * 1024, // 1MB 82 | IndexCapacity: 10000000, // 10 million 83 | IndexLimited: 8000000, 84 | IndexEvictionPoolCapacity: 32, 85 | IndexSampleKeys: 5, 86 | DiskUsageLimited: 1024 * 1024 * 1024 * 100, // 100GB 87 | NsSize: DefaultNsSize, 88 | EtagSize: DefaultEtagSize, 89 | } 90 | 91 | db, err := bitcask.NewDB(opts) 92 | if err != nil { 93 | panic(err) 94 | } 95 | defer func() { 96 | _ = db.Close() 97 | }() 98 | 99 | ns := GenSha1NS("ns") // fixed-size ns 100 | key := []byte("testKey") 101 | value := []byte(data) 102 | now := uint64(db.WallTime().Unix()) 103 | 104 | // customized metadata 105 | appMeta := make(map[string]string) 106 | appMeta["type"] = "html" 107 | meta := NewMeta(appMeta).SetExpire(now+60).SetEtag(GenSha1Etag(value)) 108 | 109 | // set a key 110 | err = db.Put(ns, key, value, meta, &WriteOptions{}) 111 | if err != nil { 112 | panic(err) 113 | } 114 | 115 | // get a key 116 | readVal, readMeta, err := db.Get(ns, key, &ReadOptions{}) 117 | if err != nil { 118 | panic(err) 119 | } 120 | 121 | println(readVal) 122 | println(readMeta) 123 | 124 | // delete a key 125 | err = db.Delete(ns, key, &WriteOptions{}) 126 | if err != nil { 127 | panic(err) 128 | } 129 | } 130 | ``` 131 | 132 | 如果你想简单使用一个 database CRUD http server,可以考虑这个[仓库](https://github.com/wenzhang-dev/bitcaskDB-server)。 133 | 134 | http server 以 docker 容器运行。顺便说,读写 bitcaskDB 的开销,相比网络通信的开销而言,可以忽略不计。 135 | 136 | 137 | # 性能测试 138 | 139 | 读写 4KB 的压测报告如下: 140 | 141 | ``` 142 | go test -bench=PutGet -benchtime=60s -count=3 -timeout=50m 143 | goos: linux 144 | goarch: amd64 145 | pkg: github.com/wenzhang-dev/bitcaskDB/bench 146 | cpu: Intel(R) Xeon(R) Gold 5318N CPU @ 2.10GHz 147 | BenchmarkPutGet/put4K-8 5331782 25259 ns/op 11795 B/op 21 allocs/op 148 | BenchmarkPutGet/put4K-8 5130870 25417 ns/op 11767 B/op 21 allocs/op 149 | BenchmarkPutGet/put4K-8 4898403 26676 ns/op 11742 B/op 21 allocs/op 150 | BenchmarkPutGet/batchPut4K-8 10548615 15340 ns/op 1695 B/op 11 allocs/op 151 | BenchmarkPutGet/batchPut4K-8 9220388 14278 ns/op 1694 B/op 11 allocs/op 152 | BenchmarkPutGet/batchPut4K-8 10363459 15019 ns/op 1686 B/op 11 allocs/op 153 | BenchmarkPutGet/get4K-8 8812342 8076 ns/op 10119 B/op 10 allocs/op 154 | BenchmarkPutGet/get4K-8 7963098 7952 ns/op 10119 B/op 10 allocs/op 155 | BenchmarkPutGet/get4K-8 8480240 7997 ns/op 10119 B/op 10 allocs/op 156 | BenchmarkPutGet/concurrentGet4K-8 17233309 4427 ns/op 10044 B/op 7 allocs/op 157 | BenchmarkPutGet/concurrentGet4K-8 26745726 3681 ns/op 10044 B/op 7 allocs/op 158 | BenchmarkPutGet/concurrentGet4K-8 29305041 3654 ns/op 10044 B/op 7 allocs/op 159 | BenchmarkPutGet/concurrentPut4K-8 4558645 19829 ns/op 8340 B/op 18 allocs/op 160 | BenchmarkPutGet/concurrentPut4K-8 4433334 18664 ns/op 10031 B/op 18 allocs/op 161 | BenchmarkPutGet/concurrentPut4K-8 4366149 17031 ns/op 8175 B/op 17 allocs/op 162 | BenchmarkPutGet/concurrentBatchPut4K-8 9443377 12520 ns/op 1527 B/op 9 allocs/op 163 | BenchmarkPutGet/concurrentBatchPut4K-8 11338162 12429 ns/op 1517 B/op 9 allocs/op 164 | BenchmarkPutGet/concurrentBatchPut4K-8 11394081 12101 ns/op 1510 B/op 9 allocs/op 165 | PASS 166 | ok github.com/wenzhang-dev/bitcaskDB/bench 2310.401s 167 | ``` 168 | 169 | 同时,也测试了几个主流的 KV 存储引擎在读写 4KB 的性能,并记录了它们在测试过程中的 RSS 占用。 170 | 性能测试仓库为:[codebase](https://github.com/wenzhang-dev/bitcaskDB-benchmark) 171 | 172 | ```shell 173 | go test -bench=Read 
-benchtime=60s -timeout=30m -count=3 174 | goos: linux 175 | goarch: amd64 176 | pkg: github.com/wenzhang-dev/bitcaskDB-benchmark 177 | cpu: Intel(R) Xeon(R) Gold 5318N CPU @ 2.10GHz 178 | BenchmarkReadWithBitcaskDB/read4K-8 11459024 6313 ns/op 1.217 AvgRSS(GB) 1.275 PeakRSS(GB) 10120 B/op 10 allocs/op 179 | BenchmarkReadWithBitcaskDB/read4K-8 12512324 6522 ns/op 1.220 AvgRSS(GB) 1.234 PeakRSS(GB) 10120 B/op 10 allocs/op 180 | BenchmarkReadWithBitcaskDB/read4K-8 12414660 6468 ns/op 1.206 AvgRSS(GB) 1.231 PeakRSS(GB) 10120 B/op 10 allocs/op 181 | BenchmarkReadWithBadger/read4K-8 4575487 13526 ns/op 2.716 AvgRSS(GB) 4.350 PeakRSS(GB) 19416 B/op 43 allocs/op 182 | BenchmarkReadWithBadger/read4K-8 4960239 13741 ns/op 1.629 AvgRSS(GB) 1.681 PeakRSS(GB) 19406 B/op 43 allocs/op 183 | BenchmarkReadWithBadger/read4K-8 4851144 14429 ns/op 1.591 AvgRSS(GB) 1.650 PeakRSS(GB) 19422 B/op 44 allocs/op 184 | BenchmarkReadWithLevelDB/read4K-8 1569663 50710 ns/op 0.111 AvgRSS(GB) 0.134 PeakRSS(GB) 55021 B/op 35 allocs/op 185 | BenchmarkReadWithLevelDB/read4K-8 1000000 63066 ns/op 0.113 AvgRSS(GB) 0.129 PeakRSS(GB) 54264 B/op 35 allocs/op 186 | BenchmarkReadWithLevelDB/read4K-8 1236408 57268 ns/op 0.114 AvgRSS(GB) 0.138 PeakRSS(GB) 54624 B/op 35 allocs/op 187 | BenchmarkReadWithBoltDB/read4K-8 12587562 5269 ns/op 5.832 AvgRSS(GB) 5.838 PeakRSS(GB) 832 B/op 13 allocs/op 188 | BenchmarkReadWithBoltDB/read4K-8 16920481 4482 ns/op 5.832 AvgRSS(GB) 5.833 PeakRSS(GB) 832 B/op 13 allocs/op 189 | BenchmarkReadWithBoltDB/read4K-8 19141418 5276 ns/op 5.832 AvgRSS(GB) 5.835 PeakRSS(GB) 832 B/op 13 allocs/op 190 | PASS 191 | ok github.com/wenzhang-dev/bitcaskDB-benchmark 1475.172s 192 | ``` 193 | 194 | 195 | ```shell 196 | go test -bench=Write -benchtime=60s -timeout=30m -count=3 197 | goos: linux 198 | goarch: amd64 199 | pkg: github.com/wenzhang-dev/bitcaskDB-benchmark 200 | cpu: Intel(R) Xeon(R) Gold 5318N CPU @ 2.10GHz 201 | BenchmarkWriteWithBitcaskDB/write4K-8 8334304 13217 ns/op 0.7905 AvgRSS(GB) 0.934 PeakRSS(GB) 1666 B/op 11 allocs/op 202 | BenchmarkWriteWithBitcaskDB/write4K-8 5323338 14976 ns/op 0.9732 AvgRSS(GB) 1.058 PeakRSS(GB) 1727 B/op 12 allocs/op 203 | BenchmarkWriteWithBitcaskDB/write4K-8 5435398 13929 ns/op 0.9639 AvgRSS(GB) 1.122 PeakRSS(GB) 1756 B/op 12 allocs/op 204 | BenchmarkWriteWithLevelDB/write4K-8 1047753 68691 ns/op 0.0615 AvgRSS(GB) 0.0636 PeakRSS(GB) 2946 B/op 16 allocs/op 205 | BenchmarkWriteWithLevelDB/write4K-8 1179555 71497 ns/op 0.0617 AvgRSS(GB) 0.0634 PeakRSS(GB) 3250 B/op 18 allocs/op 206 | BenchmarkWriteWithLevelDB/write4K-8 992488 74130 ns/op 0.0613 AvgRSS(GB) 0.0625 PeakRSS(GB) 3444 B/op 19 allocs/op 207 | BenchmarkWriteWithBadger/write4K-8 3776720 20036 ns/op 6.409 AvgRSS(GB) 7.534 PeakRSS(GB) 30062 B/op 68 allocs/op 208 | BenchmarkWriteWithBadger/write4K-8 4106070 50959 ns/op 10.77 AvgRSS(GB) 13.63 PeakRSS(GB) 115442 B/op 152 allocs/op 209 | BenchmarkWriteWithBadger/write4K-8 1491906 49955 ns/op 11.45 AvgRSS(GB) 13.72 PeakRSS(GB) 88941 B/op 130 allocs/op 210 | BenchmarkWriteWithBoltDB/write4K-8 2808206 23131 ns/op 0.626 AvgRSS(GB) 0.999 PeakRSS(GB) 7579 B/op 11 allocs/op 211 | BenchmarkWriteWithBoltDB/write4K-8 4303538 22836 ns/op 1.713 AvgRSS(GB) 2.971 PeakRSS(GB) 7765 B/op 11 allocs/op 212 | BenchmarkWriteWithBoltDB/write4K-8 3755002 19385 ns/op 2.481 AvgRSS(GB) 2.872 PeakRSS(GB) 7896 B/op 12 allocs/op 213 | PASS 214 | ok github.com/wenzhang-dev/bitcaskDB-benchmark 1541.068s 215 | ``` 216 | 217 | 指定磁盘容量的压测报告: 
[benchmark2](https://github.com/wenzhang-dev/bitcaskDB/blob/main/bench/benchmark2) 218 | -------------------------------------------------------------------------------- /db_impl_test.go: -------------------------------------------------------------------------------- 1 | package bitcask 2 | 3 | import ( 4 | "crypto/sha1" 5 | "os" 6 | "strconv" 7 | "sync" 8 | "testing" 9 | 10 | "github.com/stretchr/testify/assert" 11 | ) 12 | 13 | func sha1Bytes(input string) [20]byte { 14 | return sha1.Sum([]byte(input)) 15 | } 16 | 17 | func setupDB(t *testing.T) *DBImpl { 18 | dir := "./test_bitcask_db" 19 | _ = os.RemoveAll(dir) 20 | 21 | assert.Nil(t, os.MkdirAll(dir, os.ModePerm)) 22 | 23 | opts := &Options{ 24 | Dir: dir, 25 | WalMaxSize: 1024 * 1024, // 1MB 26 | ManifestMaxSize: 1024 * 1024, // 1MB 27 | IndexCapacity: 1000000, 28 | IndexLimited: 800000, 29 | IndexEvictionPoolCapacity: 32, 30 | IndexSampleKeys: 5, 31 | BlockCacheCapacity: 8192, // 256MB 32 | BlockCacheLimited: 8192, 33 | BlockCacheSampleKeys: 5, 34 | BlockCacheEvictionPoolCapacity: 32, 35 | BlockReaderConcurrent: 32, 36 | NsSize: DefaultNsSize, 37 | EtagSize: DefaultEtagSize, 38 | } 39 | 40 | db, err := NewDB(opts) 41 | assert.Nil(t, err) 42 | assert.NotNil(t, db) 43 | return db 44 | } 45 | 46 | func loadDB(t *testing.T) *DBImpl { 47 | dir := "./test_bitcask_db" 48 | 49 | opts := &Options{ 50 | Dir: dir, 51 | WalMaxSize: 1024 * 1024, // 1MB 52 | ManifestMaxSize: 1024 * 1024, // 1MB 53 | IndexCapacity: 1000000, 54 | IndexLimited: 800000, 55 | IndexEvictionPoolCapacity: 32, 56 | IndexSampleKeys: 5, 57 | BlockCacheCapacity: 8192, // 256MB 58 | BlockCacheLimited: 8192, 59 | BlockCacheSampleKeys: 5, 60 | BlockCacheEvictionPoolCapacity: 32, 61 | BlockReaderConcurrent: 32, 62 | NsSize: DefaultNsSize, 63 | EtagSize: DefaultEtagSize, 64 | } 65 | 66 | db, err := NewDB(opts) 67 | assert.Nil(t, err) 68 | assert.NotNil(t, db) 69 | return db 70 | } 71 | 72 | func teardownDB(db *DBImpl) { 73 | db.Close() 74 | _ = os.RemoveAll(db.opts.Dir) 75 | } 76 | 77 | func TestDBImplBasicWriteRead(t *testing.T) { 78 | db := setupDB(t) 79 | defer teardownDB(db) 80 | 81 | now := uint64(db.WallTime().Unix()) 82 | ns := sha1Bytes("namespace") 83 | key := []byte("testKey") 84 | value := []byte("testValue") 85 | etag := sha1Bytes(string(value)) 86 | meta := NewMeta(nil).SetExpire(now + 60).SetEtag(etag[:]) 87 | 88 | err := db.Put(ns[:], key, value, meta, &WriteOptions{}) 89 | assert.NoError(t, err) 90 | 91 | // write without options 92 | err = db.Put(ns[:], key, value, meta, nil) 93 | assert.Nil(t, err) 94 | 95 | readVal, readMeta, err := db.Get(ns[:], key, &ReadOptions{}) 96 | assert.NoError(t, err) 97 | assert.Equal(t, value, readVal) 98 | assert.Equal(t, meta, readMeta) 99 | 100 | // read without options 101 | readVal, readMeta, err = db.Get(ns[:], key, nil) 102 | assert.NoError(t, err) 103 | assert.Equal(t, value, readVal) 104 | assert.Equal(t, meta, readMeta) 105 | } 106 | 107 | func TestDBImplBasicWriteReadV2(t *testing.T) { 108 | db := setupDB(t) 109 | defer teardownDB(db) 110 | 111 | now := uint64(db.WallTime().Unix()) 112 | ns := sha1Bytes("namespace") 113 | key := []byte("testKey") 114 | value := []byte("testValue") 115 | etag := sha1Bytes(string(value)) 116 | meta := NewMeta(nil).SetExpire(now + 60).SetEtag(etag[:]) 117 | 118 | err := db.Put(ns[:], key, value, meta, &WriteOptions{}) 119 | assert.NoError(t, err) 120 | 121 | // write without options 122 | err = db.Put(ns[:], key, value, meta, nil) 123 | assert.Nil(t, err) 124 | 125 | readVal, readMeta, err 
:= db.GetV2(ns[:], key, &ReadOptions{}) 126 | assert.NoError(t, err) 127 | assert.Equal(t, value, readVal) 128 | assert.Equal(t, meta, readMeta) 129 | 130 | // read without options 131 | readVal, readMeta, err = db.GetV2(ns[:], key, nil) 132 | assert.NoError(t, err) 133 | assert.Equal(t, value, readVal) 134 | assert.Equal(t, meta, readMeta) 135 | } 136 | 137 | func TestDBImplWriteEmptyValue(t *testing.T) { 138 | db := setupDB(t) 139 | defer teardownDB(db) 140 | 141 | ns := sha1Bytes("namespace") 142 | key := []byte("testKey") 143 | meta := NewMeta(nil) 144 | 145 | // no etag, no expire and no value 146 | err := db.Put(ns[:], key, nil, meta, &WriteOptions{}) 147 | assert.NoError(t, err) 148 | 149 | readVal, _, err := db.Get(ns[:], key, &ReadOptions{}) 150 | assert.NoError(t, err) 151 | assert.Equal(t, len(readVal), 0) 152 | } 153 | 154 | func TestDBImplWriteDeleteRead(t *testing.T) { 155 | db := setupDB(t) 156 | defer teardownDB(db) 157 | 158 | ns := sha1Bytes("namespace") 159 | key := []byte("testKey") 160 | value := []byte("testValue") 161 | meta := NewMeta(nil) 162 | 163 | _ = db.Put(ns[:], key, value, meta, &WriteOptions{}) 164 | _ = db.Delete(ns[:], key, &WriteOptions{}) 165 | 166 | readVal, _, err := db.Get(ns[:], key, &ReadOptions{}) 167 | assert.ErrorIs(t, err, ErrKeyNotFound) 168 | assert.Nil(t, readVal) 169 | } 170 | 171 | func TestDBImplWALRotate(t *testing.T) { 172 | db := setupDB(t) 173 | defer teardownDB(db) 174 | 175 | ns := sha1Bytes("wal-rotation") 176 | meta := NewMeta(nil) 177 | opts := &WriteOptions{} 178 | 179 | initFid := db.manifest.active.Fid() 180 | 181 | // 50000 > (1MB Wal / min 50B per record = 20000) 182 | for i := 0; i < 50000; i++ { 183 | key := sha1Bytes("key" + strconv.Itoa(i)) 184 | value := sha1Bytes("val" + strconv.Itoa(i)) 185 | err := db.Put(ns[:], key[:], value[:], meta, opts) 186 | assert.Nil(t, err) 187 | } 188 | 189 | assert.NotEqual(t, initFid, db.manifest.active.Fid()) 190 | } 191 | 192 | func TestDBImplPersistence(t *testing.T) { 193 | db := setupDB(t) 194 | 195 | bin4K := GenNKBytes(4) 196 | appMeta := make(map[string]string) 197 | appMeta["test"] = string(bin4K) 198 | 199 | ns := sha1Bytes("persistence") 200 | meta := NewMeta(appMeta) 201 | opts := &WriteOptions{} 202 | 203 | // write 10000 keys 204 | for i := 0; i < 10000; i++ { 205 | key := sha1Bytes("key" + strconv.Itoa(i)) 206 | value := sha1Bytes("val" + strconv.Itoa(i)) 207 | err := db.Put(ns[:], key[:], value[:], meta, opts) 208 | 209 | assert.Nil(t, err) 210 | } 211 | 212 | // check 213 | for i := 0; i < 10000; i++ { 214 | key := sha1Bytes("key" + strconv.Itoa(i)) 215 | value := sha1Bytes("val" + strconv.Itoa(i)) 216 | readVal, readMeta, err := db.Get(ns[:], key[:], &ReadOptions{}) 217 | 218 | assert.Nil(t, err) 219 | assert.Equal(t, readVal, value[:]) 220 | assert.Equal(t, readMeta.AppMeta, meta.AppMeta) 221 | } 222 | 223 | // re-open db 224 | db.Close() 225 | 226 | db = loadDB(t) 227 | defer teardownDB(db) 228 | 229 | // check again 230 | for i := 0; i < 10000; i++ { 231 | key := sha1Bytes("key" + strconv.Itoa(i)) 232 | value := sha1Bytes("val" + strconv.Itoa(i)) 233 | readVal, readMeta, err := db.Get(ns[:], key[:], &ReadOptions{}) 234 | 235 | assert.Nil(t, err) 236 | assert.Equal(t, readVal, value[:]) 237 | assert.Equal(t, readMeta.AppMeta, meta.AppMeta) 238 | } 239 | } 240 | 241 | /* 242 | func TestDBImplBatchWrite(t *testing.T) { 243 | } 244 | */ 245 | 246 | func TestDBImplConcurrentWriteAndRead(t *testing.T) { 247 | db := setupDB(t) 248 | defer teardownDB(db) 249 | 250 | var wg 
sync.WaitGroup 251 | ns := sha1Bytes("concurrent") 252 | meta := NewMeta(nil) 253 | 254 | genBytes := func(i, j int) []byte { 255 | return []byte("i" + strconv.Itoa(i) + "j" + strconv.Itoa(j)) 256 | } 257 | 258 | // total: 25000 keys 259 | for i := 0; i < 50; i++ { 260 | wg.Add(1) 261 | // each goroutine writes 500 keys 262 | go func(i int) { 263 | defer wg.Done() 264 | for j := 0; j < 500; j++ { 265 | // key equals it's value 266 | key := genBytes(i, j) 267 | value := genBytes(i, j) 268 | err := db.Put(ns[:], key, value, meta, &WriteOptions{}) 269 | assert.Nil(t, err) 270 | } 271 | 272 | // check 273 | for j := 0; j < 500; j++ { 274 | key := genBytes(i, j) 275 | val, _, err := db.Get(ns[:], key, &ReadOptions{}) 276 | if err != nil { 277 | print(err.Error()) 278 | } 279 | assert.Nil(t, err) 280 | assert.Equal(t, val, key) 281 | } 282 | }(i) 283 | } 284 | 285 | wg.Wait() 286 | } 287 | 288 | func TestDBImplConcurrentWriteAndReadV2(t *testing.T) { 289 | db := setupDB(t) 290 | defer teardownDB(db) 291 | 292 | var wg sync.WaitGroup 293 | ns := sha1Bytes("concurrent") 294 | meta := NewMeta(nil) 295 | 296 | genBytes := func(i, j int) []byte { 297 | return []byte("i" + strconv.Itoa(i) + "j" + strconv.Itoa(j)) 298 | } 299 | 300 | // total: 25000 keys 301 | for i := 0; i < 50; i++ { 302 | wg.Add(1) 303 | // each goroutine writes 500 keys 304 | go func(i int) { 305 | defer wg.Done() 306 | for j := 0; j < 500; j++ { 307 | // key equals it's value 308 | key := genBytes(i, j) 309 | value := genBytes(i, j) 310 | err := db.Put(ns[:], key, value, meta, &WriteOptions{}) 311 | assert.Nil(t, err) 312 | } 313 | 314 | // check 315 | for j := 0; j < 500; j++ { 316 | key := genBytes(i, j) 317 | val, _, err := db.GetV2(ns[:], key, &ReadOptions{}) 318 | if err != nil { 319 | print(err.Error()) 320 | } 321 | assert.Nil(t, err) 322 | assert.Equal(t, val, key) 323 | } 324 | }(i) 325 | } 326 | 327 | wg.Wait() 328 | } 329 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 | 3 | 4 | 5 | [English](https://github.com/wenzhang-dev/bitcaskDB/blob/main/README.md) · [简体中文](https://github.com/wenzhang-dev/bitcaskDB/blob/main/README-CN.md) 6 | 7 | 8 | 9 |
10 | 11 | # What is bitcaskDB 12 | 13 | bitcaskDB is a lightweight, fast, fixed-capacity key/value storage engine based on the bitcask storage model. 14 | 15 | Its biggest feature is that it caches the index of key-value pairs in memory, and each query requires only a single disk seek. For small objects with a 100-byte key and a 4KB value, caching 10 million objects takes about 1GB of memory and 40GB of disk space. By contrast, a fully in-memory cache such as redis or memcached would have a much higher memory overhead. 16 | 17 | 18 | # Motivation 19 | 20 | - limited hardware resources, e.g. 4 cores, 8GB RAM, and a 100GB disk 21 | - cache tens of millions of small objects 22 | 23 | 24 | # Features 25 | - append-only 26 | - fixed-size namespace 27 | - fixed memory and disk usage 28 | - fine-grained compaction 29 | - LRU-like eviction policy in memory 30 | - customized record metadata 31 | - customized compaction filter 32 | - customized compaction picker 33 | - bulk writes 34 | - allows expire time and value fingerprint (etag) 35 | - fast recovery based on hint wal 36 | - soft deletion 37 | 38 | 39 | # Comparison 40 | 41 | ## LSM 42 | - append-only 43 | - multiple disk seeks in the worst case 44 | - write amplification 45 | - chained compaction 46 | - range search 47 | - ordered 48 | - slow to reclaim disk space 49 | - multiple data versions 50 | 51 | 52 | ## B+Tree 53 | - update in place 54 | - ordered 55 | - range search 56 | - hard to reclaim disk space 57 | 58 | 59 | ## Bitcask 60 | - append-only 61 | - predictable lookup and insert performance 62 | - single seek to retrieve any value 63 | - fast to reclaim disk space 64 | - only one data version in memory 65 | - multiple data models in memory, such as btree and hashtable 66 | - hashtable is more compact, but unordered and does not support range search 67 | 68 | 69 | # Getting started 70 | 71 | 72 | ```golang 73 | import "github.com/wenzhang-dev/bitcaskDB" 74 | 75 | const data = ` 76 | 77 | 78 | 79 | Hello Page 80 | 81 | 82 |

Hello, BitcaskDB!

83 | 84 | 85 | ` 86 | 87 | func main() { 88 | opts := &bitcask.Options{ 89 | Dir: "./bitcaskDB", 90 | WalMaxSize: 1024 * 1024 * 1024, // 1GB 91 | ManifestMaxSize: 1024 * 1024, // 1MB 92 | IndexCapacity: 10000000, // 10 million 93 | IndexLimited: 8000000, 94 | IndexEvictionPoolCapacity: 32, 95 | IndexSampleKeys: 5, 96 | DiskUsageLimited: 1024 * 1024 * 1024 * 100, // 100GB 97 | NsSize: DefaultNsSize, 98 | EtagSize: DefaultEtagSize, 99 | } 100 | 101 | db, err := bitcask.NewDB(opts) 102 | if err != nil { 103 | panic(err) 104 | } 105 | defer func() { 106 | _ = db.Close() 107 | }() 108 | 109 | ns := GenSha1NS("ns") // fixed-size ns 110 | key := []byte("testKey") 111 | value := []byte(data) 112 | now := uint64(db.WallTime().Unix()) 113 | 114 | // customized metadata 115 | appMeta := make(map[string]string) 116 | appMeta["type"] = "html" 117 | meta := NewMeta(appMeta).SetExpire(now+60).SetEtag(GenSha1Etag(value)) 118 | 119 | // set a key 120 | err = db.Put(ns, key, value, meta, &WriteOptions{}) 121 | if err != nil { 122 | panic(err) 123 | } 124 | 125 | // get a key 126 | readVal, readMeta, err := db.Get(ns, key, &ReadOptions{}) 127 | if err != nil { 128 | panic(err) 129 | } 130 | 131 | println(readVal) 132 | println(readMeta) 133 | 134 | // delete a key 135 | err = db.Delete(ns, key, &WriteOptions{}) 136 | if err != nil { 137 | panic(err) 138 | } 139 | } 140 | ``` 141 | 142 | If you want to simply use a database CRUD http server, consider this [repository](https://github.com/wenzhang-dev/bitcaskDB-server). 143 | 144 | The http server runs as a docker container. By the way, the overhead of reading and writing bitcaskDB is negligible compared to the overhead of network communication. 145 | 146 | 147 | # Benchmark 148 | 149 | Here are the benchmarks for reading and writing 4KB: 150 | 151 | ``` 152 | go test -bench=PutGet -benchtime=60s -count=3 -timeout=50m 153 | goos: linux 154 | goarch: amd64 155 | pkg: github.com/wenzhang-dev/bitcaskDB/bench 156 | cpu: Intel(R) Xeon(R) Gold 5318N CPU @ 2.10GHz 157 | BenchmarkPutGet/put4K-8 5331782 25259 ns/op 11795 B/op 21 allocs/op 158 | BenchmarkPutGet/put4K-8 5130870 25417 ns/op 11767 B/op 21 allocs/op 159 | BenchmarkPutGet/put4K-8 4898403 26676 ns/op 11742 B/op 21 allocs/op 160 | BenchmarkPutGet/batchPut4K-8 10548615 15340 ns/op 1695 B/op 11 allocs/op 161 | BenchmarkPutGet/batchPut4K-8 9220388 14278 ns/op 1694 B/op 11 allocs/op 162 | BenchmarkPutGet/batchPut4K-8 10363459 15019 ns/op 1686 B/op 11 allocs/op 163 | BenchmarkPutGet/get4K-8 8812342 8076 ns/op 10119 B/op 10 allocs/op 164 | BenchmarkPutGet/get4K-8 7963098 7952 ns/op 10119 B/op 10 allocs/op 165 | BenchmarkPutGet/get4K-8 8480240 7997 ns/op 10119 B/op 10 allocs/op 166 | BenchmarkPutGet/concurrentGet4K-8 17233309 4427 ns/op 10044 B/op 7 allocs/op 167 | BenchmarkPutGet/concurrentGet4K-8 26745726 3681 ns/op 10044 B/op 7 allocs/op 168 | BenchmarkPutGet/concurrentGet4K-8 29305041 3654 ns/op 10044 B/op 7 allocs/op 169 | BenchmarkPutGet/concurrentPut4K-8 4558645 19829 ns/op 8340 B/op 18 allocs/op 170 | BenchmarkPutGet/concurrentPut4K-8 4433334 18664 ns/op 10031 B/op 18 allocs/op 171 | BenchmarkPutGet/concurrentPut4K-8 4366149 17031 ns/op 8175 B/op 17 allocs/op 172 | BenchmarkPutGet/concurrentBatchPut4K-8 9443377 12520 ns/op 1527 B/op 9 allocs/op 173 | BenchmarkPutGet/concurrentBatchPut4K-8 11338162 12429 ns/op 1517 B/op 9 allocs/op 174 | BenchmarkPutGet/concurrentBatchPut4K-8 11394081 12101 ns/op 1510 B/op 9 allocs/op 175 | PASS 176 | ok github.com/wenzhang-dev/bitcaskDB/bench 2310.401s 177 | ``` 178 | 179 | 
Here, several popular KV storage engines are tested for reading and writing 4KB, and their RSS usages are recorded. 180 | The repository for this benchmark is: [codebase](https://github.com/wenzhang-dev/bitcaskDB-benchmark) 181 | 182 | ```shell 183 | go test -bench=Read -benchtime=60s -timeout=30m -count=3 184 | goos: linux 185 | goarch: amd64 186 | pkg: github.com/wenzhang-dev/bitcaskDB-benchmark 187 | cpu: Intel(R) Xeon(R) Gold 5318N CPU @ 2.10GHz 188 | BenchmarkReadWithBitcaskDB/read4K-8 11459024 6313 ns/op 1.217 AvgRSS(GB) 1.275 PeakRSS(GB) 10120 B/op 10 allocs/op 189 | BenchmarkReadWithBitcaskDB/read4K-8 12512324 6522 ns/op 1.220 AvgRSS(GB) 1.234 PeakRSS(GB) 10120 B/op 10 allocs/op 190 | BenchmarkReadWithBitcaskDB/read4K-8 12414660 6468 ns/op 1.206 AvgRSS(GB) 1.231 PeakRSS(GB) 10120 B/op 10 allocs/op 191 | BenchmarkReadWithBadger/read4K-8 4575487 13526 ns/op 2.716 AvgRSS(GB) 4.350 PeakRSS(GB) 19416 B/op 43 allocs/op 192 | BenchmarkReadWithBadger/read4K-8 4960239 13741 ns/op 1.629 AvgRSS(GB) 1.681 PeakRSS(GB) 19406 B/op 43 allocs/op 193 | BenchmarkReadWithBadger/read4K-8 4851144 14429 ns/op 1.591 AvgRSS(GB) 1.650 PeakRSS(GB) 19422 B/op 44 allocs/op 194 | BenchmarkReadWithLevelDB/read4K-8 1569663 50710 ns/op 0.111 AvgRSS(GB) 0.134 PeakRSS(GB) 55021 B/op 35 allocs/op 195 | BenchmarkReadWithLevelDB/read4K-8 1000000 63066 ns/op 0.113 AvgRSS(GB) 0.129 PeakRSS(GB) 54264 B/op 35 allocs/op 196 | BenchmarkReadWithLevelDB/read4K-8 1236408 57268 ns/op 0.114 AvgRSS(GB) 0.138 PeakRSS(GB) 54624 B/op 35 allocs/op 197 | BenchmarkReadWithBoltDB/read4K-8 12587562 5269 ns/op 5.832 AvgRSS(GB) 5.838 PeakRSS(GB) 832 B/op 13 allocs/op 198 | BenchmarkReadWithBoltDB/read4K-8 16920481 4482 ns/op 5.832 AvgRSS(GB) 5.833 PeakRSS(GB) 832 B/op 13 allocs/op 199 | BenchmarkReadWithBoltDB/read4K-8 19141418 5276 ns/op 5.832 AvgRSS(GB) 5.835 PeakRSS(GB) 832 B/op 13 allocs/op 200 | PASS 201 | ok github.com/wenzhang-dev/bitcaskDB-benchmark 1475.172s 202 | ``` 203 | 204 | 205 | ```shell 206 | go test -bench=Write -benchtime=60s -timeout=30m -count=3 207 | goos: linux 208 | goarch: amd64 209 | pkg: github.com/wenzhang-dev/bitcaskDB-benchmark 210 | cpu: Intel(R) Xeon(R) Gold 5318N CPU @ 2.10GHz 211 | BenchmarkWriteWithBitcaskDB/write4K-8 8334304 13217 ns/op 0.7905 AvgRSS(GB) 0.934 PeakRSS(GB) 1666 B/op 11 allocs/op 212 | BenchmarkWriteWithBitcaskDB/write4K-8 5323338 14976 ns/op 0.9732 AvgRSS(GB) 1.058 PeakRSS(GB) 1727 B/op 12 allocs/op 213 | BenchmarkWriteWithBitcaskDB/write4K-8 5435398 13929 ns/op 0.9639 AvgRSS(GB) 1.122 PeakRSS(GB) 1756 B/op 12 allocs/op 214 | BenchmarkWriteWithLevelDB/write4K-8 1047753 68691 ns/op 0.0615 AvgRSS(GB) 0.0636 PeakRSS(GB) 2946 B/op 16 allocs/op 215 | BenchmarkWriteWithLevelDB/write4K-8 1179555 71497 ns/op 0.0617 AvgRSS(GB) 0.0634 PeakRSS(GB) 3250 B/op 18 allocs/op 216 | BenchmarkWriteWithLevelDB/write4K-8 992488 74130 ns/op 0.0613 AvgRSS(GB) 0.0625 PeakRSS(GB) 3444 B/op 19 allocs/op 217 | BenchmarkWriteWithBadger/write4K-8 3776720 20036 ns/op 6.409 AvgRSS(GB) 7.534 PeakRSS(GB) 30062 B/op 68 allocs/op 218 | BenchmarkWriteWithBadger/write4K-8 4106070 50959 ns/op 10.77 AvgRSS(GB) 13.63 PeakRSS(GB) 115442 B/op 152 allocs/op 219 | BenchmarkWriteWithBadger/write4K-8 1491906 49955 ns/op 11.45 AvgRSS(GB) 13.72 PeakRSS(GB) 88941 B/op 130 allocs/op 220 | BenchmarkWriteWithBoltDB/write4K-8 2808206 23131 ns/op 0.626 AvgRSS(GB) 0.999 PeakRSS(GB) 7579 B/op 11 allocs/op 221 | BenchmarkWriteWithBoltDB/write4K-8 4303538 22836 ns/op 1.713 AvgRSS(GB) 2.971 PeakRSS(GB) 7765 B/op 11 allocs/op 222 | 
BenchmarkWriteWithBoltDB/write4K-8 3755002 19385 ns/op 2.481 AvgRSS(GB) 2.872 PeakRSS(GB) 7896 B/op 12 allocs/op 223 | PASS 224 | ok github.com/wenzhang-dev/bitcaskDB-benchmark 1541.068s 225 | ``` 226 | 227 | The benchmarks with specified disk capacity: [benchmark2](https://github.com/wenzhang-dev/bitcaskDB/blob/main/bench/benchmark2) 228 | -------------------------------------------------------------------------------- /compaction_test.go: -------------------------------------------------------------------------------- 1 | package bitcask 2 | 3 | import ( 4 | "strconv" 5 | "testing" 6 | "time" 7 | 8 | "github.com/stretchr/testify/assert" 9 | ) 10 | 11 | func TestCompaction_OneFullRewriteWal(t *testing.T) { 12 | db := setupDB(t) 13 | defer teardownDB(db) 14 | 15 | // the wal capacity is 1MB 16 | // one wal can store up to 256 elements 17 | meta := NewMeta(nil) 18 | bin4K := GenNKBytes(4) 19 | ns := sha1Bytes("compaction") 20 | opts := &WriteOptions{} 21 | 22 | for i := 0; i < 100; i++ { 23 | key := sha1Bytes(strconv.Itoa(i)) 24 | err := db.Put(ns[:], key[:], bin4K, meta, opts) 25 | assert.Nil(t, err) 26 | } 27 | 28 | // only one wal 29 | assert.Equal(t, len(db.manifest.wals), 1) 30 | 31 | // manual rotate wal 32 | _, err := db.manifest.RotateWal() 33 | assert.Nil(t, err) 34 | assert.Equal(t, len(db.manifest.wals), 2) 35 | 36 | // re-write the data in active 37 | // the data in the previous wal should be evicted 38 | for i := 0; i < 100; i++ { 39 | key := sha1Bytes(strconv.Itoa(i)) 40 | err := db.Put(ns[:], key[:], bin4K, meta, opts) 41 | assert.Nil(t, err) 42 | } 43 | 44 | // reach here: only two wals 45 | // manual trigger compaction 46 | db.maybeScheduleCompaction() 47 | assert.True(t, db.compacting.Load()) 48 | 49 | // wait up to 10 seconds 50 | waitTimes := 0 51 | for { 52 | if !db.compacting.Load() { 53 | break 54 | } 55 | 56 | waitTimes++ 57 | if waitTimes > 10 { 58 | break 59 | } 60 | time.Sleep(time.Second) 61 | } 62 | 63 | // the compaction should be finished 64 | assert.True(t, waitTimes <= 10) 65 | assert.False(t, db.compacting.Load()) 66 | 67 | // after compaction, only one wal lefts 68 | assert.Equal(t, len(db.manifest.wals), 1) 69 | 70 | // after compaction, we can get the data 71 | for i := 0; i < 100; i++ { 72 | key := sha1Bytes(strconv.Itoa(i)) 73 | readVal, _, err := db.Get(ns[:], key[:], &ReadOptions{}) 74 | 75 | assert.Nil(t, err) 76 | assert.Equal(t, bin4K, readVal) 77 | } 78 | } 79 | 80 | func TestCompaction_OneNonFullRewriteWal(t *testing.T) { 81 | db := setupDB(t) 82 | defer teardownDB(db) 83 | 84 | // the wal capacity is 1MB 85 | // one wal can store up to 256 elements 86 | meta := NewMeta(nil) 87 | bin4K := GenNKBytes(4) 88 | ns := sha1Bytes("compaction") 89 | opts := &WriteOptions{} 90 | 91 | for i := 0; i < 100; i++ { 92 | key := sha1Bytes(strconv.Itoa(i)) 93 | err := db.Put(ns[:], key[:], bin4K, meta, opts) 94 | assert.Nil(t, err) 95 | } 96 | 97 | // only one wal 98 | assert.Equal(t, len(db.manifest.wals), 1) 99 | 100 | // manual rotate wal 101 | _, err := db.manifest.RotateWal() 102 | assert.Nil(t, err) 103 | assert.Equal(t, len(db.manifest.wals), 2) 104 | 105 | // re-write the data in active 106 | for i := 0; i < 90; i++ { 107 | key := sha1Bytes(strconv.Itoa(i)) 108 | err := db.Put(ns[:], key[:], bin4K, meta, opts) 109 | assert.Nil(t, err) 110 | } 111 | 112 | // reach here: only two wals 113 | // manual trigger compaction 114 | db.maybeScheduleCompaction() 115 | assert.True(t, db.compacting.Load()) 116 | 117 | // wait up to 10 seconds 118 | waitTimes := 0 119 
| for { 120 | if !db.compacting.Load() { 121 | break 122 | } 123 | 124 | waitTimes++ 125 | if waitTimes > 10 { 126 | break 127 | } 128 | time.Sleep(time.Second) 129 | } 130 | 131 | // the compaction should be finished 132 | assert.True(t, waitTimes <= 10) 133 | assert.False(t, db.compacting.Load()) 134 | 135 | // after compaction, two wals left 136 | assert.Equal(t, len(db.manifest.wals), 2) 137 | 138 | // after compaction, we can get the data 139 | for i := 0; i < 100; i++ { 140 | key := sha1Bytes(strconv.Itoa(i)) 141 | readVal, _, err := db.Get(ns[:], key[:], &ReadOptions{}) 142 | 143 | assert.Nil(t, err) 144 | assert.Equal(t, bin4K, readVal) 145 | } 146 | } 147 | 148 | func TestCompaction_TwoFullRewriteWals(t *testing.T) { 149 | db := setupDB(t) 150 | defer teardownDB(db) 151 | 152 | // the wal capacity is 1MB 153 | // one wal can store up to 256 elements 154 | meta := NewMeta(nil) 155 | bin4K := GenNKBytes(4) 156 | ns := sha1Bytes("compaction") 157 | opts := &WriteOptions{} 158 | 159 | for i := 0; i < 100; i++ { 160 | key := sha1Bytes(strconv.Itoa(i)) 161 | err := db.Put(ns[:], key[:], bin4K, meta, opts) 162 | assert.Nil(t, err) 163 | } 164 | 165 | // only one wal 166 | assert.Equal(t, len(db.manifest.wals), 1) 167 | 168 | // manual rotate wal 169 | _, err := db.manifest.RotateWal() 170 | assert.Nil(t, err) 171 | assert.Equal(t, len(db.manifest.wals), 2) 172 | 173 | // re-write the data in active 174 | for i := 0; i < 100; i++ { 175 | key := sha1Bytes(strconv.Itoa(i)) 176 | err := db.Put(ns[:], key[:], bin4K, meta, opts) 177 | assert.Nil(t, err) 178 | } 179 | 180 | // manual rorate wal again 181 | _, err = db.manifest.RotateWal() 182 | assert.Nil(t, err) 183 | assert.Equal(t, len(db.manifest.wals), 3) 184 | 185 | // re-write the data in active 186 | for i := 0; i < 100; i++ { 187 | key := sha1Bytes(strconv.Itoa(i)) 188 | err := db.Put(ns[:], key[:], bin4K, meta, opts) 189 | assert.Nil(t, err) 190 | } 191 | 192 | // reach here: only three wals 193 | // manual trigger compaction 194 | db.maybeScheduleCompaction() 195 | assert.True(t, db.compacting.Load()) 196 | 197 | // wait up to 10 seconds 198 | waitTimes := 0 199 | for { 200 | if !db.compacting.Load() { 201 | break 202 | } 203 | 204 | waitTimes++ 205 | if waitTimes > 10 { 206 | break 207 | } 208 | time.Sleep(time.Second) 209 | } 210 | 211 | // the compaction should be finished 212 | assert.True(t, waitTimes <= 10) 213 | assert.False(t, db.compacting.Load()) 214 | 215 | // after compaction, only one wal lefts 216 | assert.Equal(t, len(db.manifest.wals), 1) 217 | 218 | // after compaction, we can get the data 219 | for i := 0; i < 100; i++ { 220 | key := sha1Bytes(strconv.Itoa(i)) 221 | readVal, _, err := db.Get(ns[:], key[:], &ReadOptions{}) 222 | 223 | assert.Nil(t, err) 224 | assert.Equal(t, bin4K, readVal) 225 | } 226 | } 227 | 228 | func TestCompaction_TwoNonFullRewriteWals(t *testing.T) { 229 | db := setupDB(t) 230 | defer teardownDB(db) 231 | 232 | // the wal capacity is 1MB 233 | // one wal can store up to 256 elements 234 | meta := NewMeta(nil) 235 | bin4K := GenNKBytes(4) 236 | ns := sha1Bytes("compaction") 237 | opts := &WriteOptions{} 238 | 239 | for i := 0; i < 100; i++ { 240 | key := sha1Bytes(strconv.Itoa(i)) 241 | err := db.Put(ns[:], key[:], bin4K, meta, opts) 242 | assert.Nil(t, err) 243 | } 244 | 245 | // only one wal 246 | assert.Equal(t, len(db.manifest.wals), 1) 247 | 248 | // manual rotate wal 249 | _, err := db.manifest.RotateWal() 250 | assert.Nil(t, err) 251 | assert.Equal(t, len(db.manifest.wals), 2) 252 | 253 
| // re-write the data in active 254 | for i := 0; i < 90; i++ { 255 | key := sha1Bytes(strconv.Itoa(i)) 256 | err := db.Put(ns[:], key[:], bin4K, meta, opts) 257 | assert.Nil(t, err) 258 | } 259 | 260 | // manual rorate wal again 261 | _, err = db.manifest.RotateWal() 262 | assert.Nil(t, err) 263 | assert.Equal(t, len(db.manifest.wals), 3) 264 | 265 | // re-write the data in active 266 | for i := 0; i < 90; i++ { 267 | key := sha1Bytes(strconv.Itoa(i)) 268 | err := db.Put(ns[:], key[:], bin4K, meta, opts) 269 | assert.Nil(t, err) 270 | } 271 | 272 | // reach here: only three wals 273 | // manual trigger compaction 274 | db.maybeScheduleCompaction() 275 | assert.True(t, db.compacting.Load()) 276 | 277 | // wait up to 10 seconds 278 | waitTimes := 0 279 | for { 280 | if !db.compacting.Load() { 281 | break 282 | } 283 | 284 | waitTimes++ 285 | if waitTimes > 10 { 286 | break 287 | } 288 | time.Sleep(time.Second) 289 | } 290 | 291 | // the compaction should be finished 292 | assert.True(t, waitTimes <= 10) 293 | assert.False(t, db.compacting.Load()) 294 | 295 | // after compaction, two wals left 296 | assert.Equal(t, len(db.manifest.wals), 2) 297 | 298 | // after compaction, we can get the data 299 | for i := 0; i < 100; i++ { 300 | key := sha1Bytes(strconv.Itoa(i)) 301 | readVal, _, err := db.Get(ns[:], key[:], &ReadOptions{}) 302 | 303 | assert.Nil(t, err) 304 | assert.Equal(t, bin4K, readVal) 305 | } 306 | } 307 | 308 | func TestCompaction_TwoNonFullRewriteWals2(t *testing.T) { 309 | db := setupDB(t) 310 | defer teardownDB(db) 311 | 312 | // the wal capacity is 1MB 313 | // one wal can store up to 256 elements 314 | meta := NewMeta(nil) 315 | bin4K := GenNKBytes(4) 316 | ns := sha1Bytes("compaction") 317 | opts := &WriteOptions{} 318 | 319 | for i := 0; i < 100; i++ { 320 | key := sha1Bytes(strconv.Itoa(i)) 321 | err := db.Put(ns[:], key[:], bin4K, meta, opts) 322 | assert.Nil(t, err) 323 | } 324 | 325 | // only one wal 326 | assert.Equal(t, len(db.manifest.wals), 1) 327 | 328 | // manual rotate wal 329 | _, err := db.manifest.RotateWal() 330 | assert.Nil(t, err) 331 | assert.Equal(t, len(db.manifest.wals), 2) 332 | 333 | // re-write the data in active 334 | bin2K := GenNKBytes(2) 335 | for i := 0; i < 100; i++ { 336 | // skip half elements 337 | if i%2 == 0 { 338 | continue 339 | } 340 | 341 | key := sha1Bytes(strconv.Itoa(i)) 342 | err := db.Put(ns[:], key[:], bin2K, meta, opts) 343 | assert.Nil(t, err) 344 | } 345 | 346 | // manual rorate wal again 347 | _, err = db.manifest.RotateWal() 348 | assert.Nil(t, err) 349 | assert.Equal(t, len(db.manifest.wals), 3) 350 | 351 | // reach here: only three wals 352 | // manual trigger compaction 353 | db.maybeScheduleCompaction() 354 | assert.True(t, db.compacting.Load()) 355 | 356 | // wait up to 10 seconds 357 | waitTimes := 0 358 | for { 359 | if !db.compacting.Load() { 360 | break 361 | } 362 | 363 | waitTimes++ 364 | if waitTimes > 10 { 365 | break 366 | } 367 | time.Sleep(time.Second) 368 | } 369 | 370 | // the compaction should be finished 371 | assert.True(t, waitTimes <= 10) 372 | assert.False(t, db.compacting.Load()) 373 | 374 | // after compaction, three wals left 375 | assert.Equal(t, len(db.manifest.wals), 3) 376 | 377 | // check the data 378 | for i := 0; i < 100; i++ { 379 | key := sha1Bytes(strconv.Itoa(i)) 380 | readVal, _, err := db.Get(ns[:], key[:], &ReadOptions{}) 381 | assert.Nil(t, err) 382 | 383 | if i%2 == 0 { 384 | assert.Equal(t, bin4K, readVal) 385 | } else { 386 | assert.Equal(t, bin2K, readVal) 387 | } 388 | } 389 | } 
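// The tests above drive compaction through maybeScheduleCompaction, which asks
// Options.CompactionPicker (see compaction.go) which non-active wals to merge.
// A minimal custom picker could look like the sketch below. It assumes only what
// compaction.go shows: the picker receives []PickerWalInfo (Fid, WalSize,
// CreateTime, FreeBytes) and returns the chosen fids as []uint64; the exact field
// types and the Options wiring are assumptions, not taken from this repository.
func pickByGarbageRatio(wals []PickerWalInfo) []uint64 {
	picked := make([]uint64, 0, len(wals))
	for _, w := range wals {
		// pick a wal once its reclaimable bytes reach the default ratio of its size
		if float64(w.FreeBytes) >= DefaultCompactionPickerRatio*float64(w.WalSize) {
			picked = append(picked, w.Fid)
		}
	}
	return picked
}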
390 | 391 | func TestCompaction_ReclaimDiskUsage(t *testing.T) { 392 | db := setupDB(t) 393 | defer teardownDB(db) 394 | 395 | // the wal capacity is 1MB 396 | // one wal can store up to 256 elements 397 | meta := NewMeta(nil) 398 | bin4K := GenNKBytes(4) 399 | ns := sha1Bytes("compaction") 400 | opts := &WriteOptions{} 401 | 402 | for i := 0; i < 100; i++ { 403 | key := sha1Bytes(strconv.Itoa(i)) 404 | err := db.Put(ns[:], key[:], bin4K, meta, opts) 405 | assert.Nil(t, err) 406 | } 407 | 408 | // only one wal 409 | assert.Equal(t, len(db.manifest.wals), 1) 410 | 411 | // manual rotate wal 412 | _, err := db.manifest.RotateWal() 413 | assert.Nil(t, err) 414 | assert.Equal(t, len(db.manifest.wals), 2) 415 | 416 | // trigger reclaim disk usage 417 | expect := int64(db.manifest.ActiveWal().Size() - 1) 418 | db.reclaimDiskUsage(expect) 419 | 420 | // one wal has been removed 421 | assert.Equal(t, len(db.manifest.wals), 1) 422 | } 423 | -------------------------------------------------------------------------------- /map.go: -------------------------------------------------------------------------------- 1 | package bitcask 2 | 3 | import ( 4 | "errors" 5 | "sort" 6 | "sync" 7 | "time" 8 | ) 9 | 10 | var ErrMapOptions = errors.New("invalid map options") 11 | 12 | // the reason why we use the specific hashtable implementation: 13 | // - it's hard to control golang map capacity and disable autoscale 14 | // - support any type as hashable key 15 | // - support the customized hash function 16 | // 17 | // comparation: 18 | // - for open addressing hashtable, delete marker makes query performance decrease. 19 | // so it's not suitable for frequent deletion 20 | // - for linked-base hashtable, memory overhead is larger because of more pointers, 21 | // which is less cpu cache-friendly 22 | 23 | type MapOperatorBase interface { 24 | // used to generate random slot 25 | Rand(uint64) uint64 26 | 27 | // second level clock 28 | WallTime() time.Time 29 | } 30 | 31 | type MapOperator[K any] interface { 32 | MapOperatorBase 33 | 34 | // used to map key to slot 35 | Hash(key *K) uint64 36 | 37 | // used to compare key equalization 38 | Equals(lhs, rhs *K) bool 39 | } 40 | 41 | type Bucket[K any, V any] struct { 42 | key *K 43 | val *V 44 | next *Bucket[K, V] 45 | expire uint32 46 | } 47 | 48 | // the eviction poll is fixed size, which store some keys of random buckets 49 | // when the number of entries in map reaches the limit, it will trigger eviction 50 | // the sample random keys will be added to this eviction pool, and evict the 51 | // entry of minimum expire 52 | // 53 | // notes: the eviction is based on the expire value. 
but the cached key of 54 | // pool maybe be updated, the expire maybe not accurate 55 | // we allow this scenario, and the map only is an approximate LRU 56 | type EvictionPoolEntry[K any] struct { 57 | slot uint64 58 | key *K 59 | expire uint32 60 | } 61 | 62 | // it's thread-safe 63 | type SimpleMap[K any, V any] struct { 64 | optr MapOperator[K] 65 | capacity uint64 66 | 67 | // the length and capacity of this slice are fixed 68 | // it reduce the overhead of rehash 69 | buckets []Bucket[K, V] 70 | 71 | // the actual used number of buckets 72 | used uint64 73 | 74 | // the maximum number of used buckets 75 | // the prefer limited is less than capacity * 0.75, and the reservation 76 | // enhance hashtable performance when hash collision 77 | limited uint64 78 | 79 | // the eviction pool for fixed size 80 | evictPoolSize uint64 81 | evictPoolCapacity uint64 82 | sampleKeys uint64 83 | evictPool []EvictionPoolEntry[K] 84 | 85 | // all expire is relative time of initial time 86 | initTime time.Time 87 | 88 | mu sync.Mutex 89 | } 90 | 91 | type MapOptions struct { 92 | // the number of bucket 93 | Capacity uint64 94 | 95 | // the number of elements 96 | Limited uint64 97 | 98 | EvictionPoolCapacity uint64 99 | SampleKeys uint64 100 | } 101 | 102 | func (opt *MapOptions) validate() error { 103 | if opt.Limited > opt.Capacity { 104 | return ErrMapOptions 105 | } 106 | 107 | if opt.EvictionPoolCapacity > opt.Limited { 108 | return ErrMapOptions 109 | } 110 | 111 | if opt.SampleKeys < 1 { 112 | return ErrMapOptions 113 | } 114 | 115 | if opt.EvictionPoolCapacity < 16 { 116 | return ErrMapOptions 117 | } 118 | 119 | return nil 120 | } 121 | 122 | func NewMap[K any, V any](optr MapOperator[K], opts *MapOptions) (*SimpleMap[K, V], error) { 123 | if err := opts.validate(); err != nil { 124 | return nil, err 125 | } 126 | 127 | return &SimpleMap[K, V]{ 128 | optr: optr, 129 | capacity: opts.Capacity, 130 | buckets: make([]Bucket[K, V], opts.Capacity), 131 | used: 0, 132 | limited: opts.Limited, 133 | evictPoolCapacity: opts.EvictionPoolCapacity, 134 | evictPool: make([]EvictionPoolEntry[K], opts.EvictionPoolCapacity), 135 | evictPoolSize: 0, 136 | sampleKeys: opts.SampleKeys, 137 | initTime: optr.WallTime(), 138 | }, nil 139 | } 140 | 141 | func (m *SimpleMap[K, V]) Size() uint64 { 142 | return m.used 143 | } 144 | 145 | func (m *SimpleMap[K, V]) Capacity() uint64 { 146 | return m.capacity 147 | } 148 | 149 | func (m *SimpleMap[K, V]) genExpire() uint32 { 150 | now := m.optr.WallTime() 151 | if now.Before(m.initTime) { 152 | return 0 153 | } 154 | 155 | return uint32(now.Sub(m.initTime).Seconds()) 156 | } 157 | 158 | // the Set method should always work and return nil 159 | // and it will return the previous value 160 | func (m *SimpleMap[K, V]) Set(key *K, value *V) (*V, error) { 161 | slot := m.optr.Hash(key) 162 | return m.setWithSlot(key, value, slot) 163 | } 164 | 165 | func (m *SimpleMap[K, V]) setWithSlot(key *K, value *V, slot uint64) (*V, error) { 166 | var old *V 167 | var err error 168 | var entry *Bucket[K, V] 169 | 170 | slot %= m.capacity 171 | 172 | m.mu.Lock() 173 | defer m.mu.Unlock() 174 | 175 | entry, err = m.getEntryWithSlot(key, slot) 176 | if err == nil { 177 | // found the key 178 | old = entry.val 179 | 180 | entry.val = value 181 | entry.expire = m.genExpire() 182 | return old, nil 183 | } 184 | 185 | if m.used+1 > m.limited { 186 | old = m.evict() 187 | } 188 | 189 | // insert always wokrs 190 | m.used++ 191 | 192 | // empty slot 193 | if m.buckets[slot].key == nil { 194 | 
m.buckets[slot].key = key 195 | m.buckets[slot].val = value 196 | m.buckets[slot].expire = m.genExpire() 197 | return old, nil 198 | } 199 | 200 | // insert slot at head 201 | entry = &Bucket[K, V]{ 202 | key: key, 203 | val: value, 204 | next: m.buckets[slot].next, 205 | expire: m.genExpire(), 206 | } 207 | 208 | m.buckets[slot].next = entry 209 | return old, nil 210 | } 211 | 212 | func (m *SimpleMap[K, V]) Get(key *K) (*V, error) { 213 | slot := m.optr.Hash(key) 214 | return m.getWithSlot(key, slot) 215 | } 216 | 217 | func (m *SimpleMap[K, V]) getWithSlot(key *K, slot uint64) (*V, error) { 218 | m.mu.Lock() 219 | defer m.mu.Unlock() 220 | 221 | entry, err := m.getEntryWithSlot(key, slot) 222 | if err != nil { 223 | return nil, err 224 | } 225 | 226 | return entry.val, nil 227 | } 228 | 229 | func (m *SimpleMap[K, V]) getEntryWithSlot(key *K, slot uint64) (*Bucket[K, V], error) { 230 | slot %= m.capacity 231 | if m.buckets[slot].key == nil { 232 | return nil, ErrKeyNotFound 233 | } 234 | 235 | entry := &m.buckets[slot] 236 | for entry != nil { 237 | if m.optr.Equals(key, entry.key) { 238 | return entry, nil 239 | } 240 | 241 | entry = entry.next 242 | } 243 | 244 | return nil, ErrKeyNotFound 245 | } 246 | 247 | // delete the key and return the previous value 248 | func (m *SimpleMap[K, V]) Delete(key *K) (*V, error) { 249 | slot := m.optr.Hash(key) 250 | return m.deleteWithSlot(key, slot) 251 | } 252 | 253 | func (m *SimpleMap[K, V]) deleteWithSlot(key *K, slot uint64) (*V, error) { 254 | m.mu.Lock() 255 | defer m.mu.Unlock() 256 | 257 | return m.deleteWithSlotInternal(key, slot) 258 | } 259 | 260 | func (m *SimpleMap[K, V]) deleteWithSlotInternal(key *K, slot uint64) (old *V, err error) { 261 | slot %= m.capacity 262 | if m.buckets[slot].key == nil { 263 | return nil, ErrKeyNotFound 264 | } 265 | 266 | if m.optr.Equals(m.buckets[slot].key, key) { 267 | old = m.buckets[slot].val 268 | 269 | if m.buckets[slot].next != nil { 270 | m.buckets[slot] = *m.buckets[slot].next 271 | } else { 272 | m.buckets[slot].key = nil 273 | m.buckets[slot].val = nil 274 | } 275 | m.used-- 276 | return 277 | } 278 | 279 | entry := &m.buckets[slot] 280 | for entry.next != nil { 281 | if m.optr.Equals(entry.next.key, key) { 282 | old = entry.next.val 283 | 284 | entry.next = entry.next.next 285 | m.used-- 286 | return 287 | } 288 | entry = entry.next 289 | } 290 | 291 | return nil, ErrKeyNotFound 292 | } 293 | 294 | func (m *SimpleMap[K, V]) insertEvictionEntry(entry EvictionPoolEntry[K]) { 295 | // ascending order by expire 296 | // find the upper bound position 297 | idx := sort.Search(int(m.evictPoolSize), func(i int) bool { 298 | return entry.expire < m.evictPool[i].expire 299 | }) 300 | 301 | // not found 302 | if idx == int(m.evictPoolSize) { 303 | idx = int(m.evictPoolSize - 1) 304 | if m.evictPoolSize != m.evictPoolCapacity { 305 | idx = int(m.evictPoolSize) 306 | } 307 | } 308 | 309 | if m.evictPoolSize != m.evictPoolCapacity { 310 | m.evictPoolSize++ 311 | } 312 | 313 | // move [idx, size-1) to [idx+1, size) 314 | copy(m.evictPool[idx+1:], m.evictPool[idx:m.evictPoolSize-1]) 315 | m.evictPool[idx] = entry 316 | } 317 | 318 | // return the eviction entry 319 | func (m *SimpleMap[K, V]) evictMinExpireEntry() *V { 320 | var err error 321 | var old *V 322 | var pos uint64 323 | 324 | for pos < m.evictPoolSize { 325 | key := m.evictPool[pos].key 326 | slot := m.evictPool[pos].slot 327 | 328 | // ignore the deletion error 329 | if old, err = m.deleteWithSlotInternal(key, slot); err == nil { 330 | break 331 
| } 332 | 333 | pos++ 334 | } 335 | 336 | // remove the unused entries 337 | // the range [0, pos] will be removed 338 | copy(m.evictPool, m.evictPool[pos+1:m.evictPoolSize]) 339 | m.evictPoolSize -= pos 340 | 341 | return old 342 | } 343 | 344 | // the eviction should always work and return the eviction value 345 | // 346 | // since each eviction adds up to the sample keys, at least one of the sample keys 347 | // is guaranteed to be evicted, even if all the keys in previous eviction pool are 348 | // removed 349 | func (m *SimpleMap[K, V]) evict() *V { 350 | sampleKeys := m.sampleKeys 351 | for sampleKeys > 0 { 352 | slot := m.optr.Rand(m.capacity) 353 | entry := &m.buckets[slot] 354 | if entry.key == nil { 355 | continue 356 | } 357 | 358 | for sampleKeys > 0 && entry != nil { 359 | m.insertEvictionEntry(EvictionPoolEntry[K]{ 360 | expire: entry.expire, 361 | key: entry.key, 362 | slot: slot, 363 | }) 364 | 365 | entry = entry.next 366 | sampleKeys-- 367 | } 368 | } 369 | 370 | return m.evictMinExpireEntry() 371 | } 372 | 373 | const ( 374 | MapShardNum = 16 375 | ) 376 | 377 | // it's thread-safe 378 | type ShardMap[K any, V any] struct { 379 | opts *MapOptions 380 | optr MapOperator[K] 381 | shards [MapShardNum]*SimpleMap[K, V] 382 | } 383 | 384 | func NewShardMap[K any, V any](optr MapOperator[K], opts *MapOptions) (*ShardMap[K, V], error) { 385 | shardOpts := &MapOptions{ 386 | Capacity: opts.Capacity / MapShardNum, 387 | Limited: opts.Limited / MapShardNum, 388 | EvictionPoolCapacity: opts.EvictionPoolCapacity, 389 | SampleKeys: opts.SampleKeys, 390 | } 391 | 392 | shardMap := &ShardMap[K, V]{ 393 | opts: opts, 394 | optr: optr, 395 | } 396 | 397 | var err error 398 | for idx := range shardMap.shards { 399 | shardMap.shards[idx], err = NewMap[K, V](optr, shardOpts) 400 | if err != nil { 401 | return nil, err 402 | } 403 | } 404 | 405 | return shardMap, nil 406 | } 407 | 408 | func (s *ShardMap[K, V]) Get(key *K) (*V, error) { 409 | slot := s.optr.Hash(key) 410 | shard := s.shards[slot%MapShardNum] 411 | return shard.getWithSlot(key, slot) 412 | } 413 | 414 | func (s *ShardMap[K, V]) Set(key *K, value *V) (*V, error) { 415 | slot := s.optr.Hash(key) 416 | shard := s.shards[slot%MapShardNum] 417 | return shard.setWithSlot(key, value, slot) 418 | } 419 | 420 | func (s *ShardMap[K, V]) Delete(key *K) (*V, error) { 421 | slot := s.optr.Hash(key) 422 | shard := s.shards[slot%MapShardNum] 423 | return shard.deleteWithSlot(key, slot) 424 | } 425 | 426 | func (s *ShardMap[K, V]) Capacity() uint64 { 427 | return s.opts.Capacity 428 | } 429 | -------------------------------------------------------------------------------- /compaction.go: -------------------------------------------------------------------------------- 1 | package bitcask 2 | 3 | import ( 4 | "errors" 5 | "os" 6 | "slices" 7 | "sort" 8 | "sync" 9 | ) 10 | 11 | type Compaction struct { 12 | inputs []*Wal 13 | 14 | output *Wal 15 | writer *WalRewriter 16 | hintWriter *HintWriter 17 | 18 | edit *ManifestEdit 19 | 20 | mu sync.RWMutex 21 | } 22 | 23 | func NewCompaction(inputs []*Wal, outputFid uint64) (*Compaction, error) { 24 | dir := inputs[0].Dir() 25 | baseTime := inputs[0].BaseTime() 26 | deleteFiles := make([]LogFile, len(inputs)) 27 | for idx := range inputs { 28 | inputs[idx].Ref() 29 | baseTime = min(baseTime, inputs[idx].BaseTime()) 30 | deleteFiles[idx] = LogFile{ 31 | wal: inputs[idx], 32 | fid: inputs[idx].Fid(), 33 | } 34 | } 35 | 36 | outputWal, err := NewWal(MergePath(dir, outputFid), outputFid, int64(baseTime)) 37 
| if err != nil { 38 | return nil, err 39 | } 40 | 41 | writer, err := NewHintWriter(TmpPath(dir, outputFid), outputFid, int64(baseTime)) 42 | if err != nil { 43 | return nil, err 44 | } 45 | 46 | // the compaction may generate an empty wal, which don't have to keep it 47 | edit := &ManifestEdit{ 48 | addFiles: nil, 49 | deleteFiles: deleteFiles, 50 | hasNextFid: true, 51 | nextFid: outputFid + 1, 52 | } 53 | 54 | return &Compaction{ 55 | inputs: inputs, 56 | output: outputWal, 57 | hintWriter: writer, 58 | writer: NewWalRewriter(outputWal, 1024*1024), // 1MB 59 | edit: edit, 60 | }, nil 61 | } 62 | 63 | func (c *Compaction) Finalize() error { 64 | c.mu.Lock() 65 | defer c.mu.Unlock() 66 | 67 | var err error 68 | if err = c.writer.Flush(); err != nil { 69 | return err 70 | } 71 | 72 | if err = c.hintWriter.Flush(); err != nil { 73 | return err 74 | } 75 | 76 | // corner case: empty output wal 77 | // otherwise, we should add the output wal to manifest 78 | if c.output.Empty() { 79 | return nil 80 | } 81 | 82 | c.edit.addFiles = append(c.edit.addFiles, LogFile{ 83 | wal: c.output, 84 | fid: c.output.Fid(), 85 | }) 86 | 87 | walName := WalFilename(c.output.Fid()) 88 | hintName := HintFilename(c.output.Fid()) 89 | 90 | if err = c.output.Rename(walName); err != nil { 91 | return err 92 | } 93 | 94 | return c.hintWriter.Wal().Rename(hintName) 95 | } 96 | 97 | func (c *Compaction) Destroy() { 98 | c.mu.Lock() 99 | defer c.mu.Unlock() 100 | 101 | // corner case: empty hint file 102 | if c.hintWriter.Wal().Empty() { 103 | c.hintWriter.Wal().Unref() 104 | } 105 | 106 | _ = c.hintWriter.Close() 107 | _ = c.writer.Close() 108 | 109 | c.output.Unref() 110 | 111 | for idx := range c.inputs { 112 | c.inputs[idx].Unref() 113 | } 114 | } 115 | 116 | func (db *DBImpl) maybeScheduleCompaction() { 117 | if db.reclaiming.Load() { 118 | return 119 | } 120 | 121 | if !db.compacting.CompareAndSwap(false, true) { 122 | return 123 | } 124 | 125 | // only one reach here 126 | 127 | db.mu.Lock() 128 | defer db.mu.Unlock() 129 | 130 | if db.bgErr != nil { 131 | db.compacting.Store(false) 132 | return 133 | } 134 | 135 | candidateWals := make([]PickerWalInfo, 0, len(db.manifest.wals)) 136 | for fid := range db.manifest.wals { 137 | // skip the active wal 138 | if fid == db.manifest.active.fid { 139 | continue 140 | } 141 | 142 | candidateWals = append(candidateWals, PickerWalInfo{ 143 | Fid: fid, 144 | WalSize: db.manifest.wals[fid].wal.Size(), 145 | CreateTime: db.manifest.wals[fid].wal.CreateTime(), 146 | FreeBytes: db.manifest.wals[fid].freeBytes + db.manifest.wals[fid].deltaFreeBytes, 147 | }) 148 | } 149 | 150 | filterdWals := db.opts.CompactionPicker(candidateWals) 151 | if len(filterdWals) == 0 { 152 | db.compacting.Store(false) 153 | return 154 | } 155 | 156 | db.backgroundCompactionLocked(filterdWals) 157 | } 158 | 159 | func (db *DBImpl) backgroundCompactionLocked(wals []uint64) { 160 | inputs := make([]*Wal, len(wals)) 161 | for idx := range wals { 162 | inputs[idx] = db.manifest.wals[wals[idx]].wal 163 | } 164 | 165 | fid := db.manifest.GenFid() 166 | compaction, err := NewCompaction(inputs, fid) 167 | if err != nil { 168 | db.bgErr = err 169 | db.compacting.Store(false) 170 | return 171 | } 172 | 173 | db.compaction = compaction 174 | 175 | db.logger.Info().Uints64("input wals", wals).Uint64("output wal", fid).Msg("new compaction") 176 | 177 | // run compaction without any lock 178 | go db.doCompaction(compaction) 179 | } 180 | 181 | func (db *DBImpl) doCompaction(compaction *Compaction) { 182 | var err 
error 183 | 184 | defer func() { 185 | db.compacting.Store(false) 186 | compaction.Destroy() 187 | }() 188 | 189 | if err = db.doCompactionWork(compaction); err == nil { 190 | db.logger.Info().Msg("compaction finished") 191 | return 192 | } 193 | 194 | db.mu.Lock() 195 | defer db.mu.Unlock() 196 | db.bgErr = err 197 | 198 | db.logger.Err(err).Msg("failed compaction") 199 | } 200 | 201 | func (db *DBImpl) doCompactionWork(compaction *Compaction) error { 202 | var err error 203 | for idx := range compaction.inputs { 204 | if err = db.compactOneWal( 205 | compaction.writer, compaction.hintWriter, compaction.inputs[idx], 206 | ); err != nil { 207 | return err 208 | } 209 | 210 | db.logger.Info().Uint64("wal", compaction.inputs[idx].Fid()).Msg("part compaction") 211 | } 212 | 213 | if err = compaction.Finalize(); err != nil { 214 | return err 215 | } 216 | 217 | db.logger.Info().Msg("prepare to submit the compaction") 218 | 219 | // here, we should update the manifest and index synchronously and atomically 220 | // otherwise, whichever of the index or manifest is updated first, a query in between would 221 | // miss the related wal and return key not found 222 | // 223 | // at the same time, we don't want to hold the mutex for a long time, especially while updating 224 | // the index via the hint wal, which may be time-consuming 225 | var txn *ManifestTxn 226 | onePhase := func() error { 227 | var err error 228 | 229 | db.mu.Lock() 230 | txn, err = db.manifest.NewTxn() 231 | db.mu.Unlock() 232 | 233 | if err != nil { 234 | return err 235 | } 236 | 237 | // make the edit visible 238 | edit := &ManifestEdit{ 239 | addFiles: compaction.edit.addFiles, 240 | hasNextFid: compaction.edit.hasNextFid, 241 | nextFid: compaction.edit.nextFid, 242 | } 243 | txn.Apply(edit) 244 | db.logger.Info().Msg("one phase: apply the edit") 245 | 246 | // update the index without any lock and ignore any error 247 | // FIXME: put operations may evict some keys, we should put it into an edit 248 | _ = IterateHint(compaction.hintWriter.Wal(), func(record *HintRecord) error { 249 | _ = db.index.Put(record.ns, record.key, record.fid, record.off, record.size, nil) 250 | return nil 251 | }) 252 | db.logger.Info().Msg("one phase: apply hint wal to index") 253 | return nil 254 | } 255 | 256 | twoPhase := func() error { 257 | db.mu.Lock() 258 | defer db.mu.Unlock() 259 | 260 | // commit the txn 261 | edit := &ManifestEdit{ 262 | deleteFiles: compaction.edit.deleteFiles, 263 | } 264 | if err := txn.Commit(edit); err != nil { 265 | return err 266 | } 267 | 268 | db.logger.Info().Msg("two phase: commit the txn") 269 | 270 | // clean unused files 271 | _ = db.manifest.CleanFiles(false) 272 | 273 | db.compaction = nil 274 | 275 | // cache the hint file size 276 | hintWal := compaction.hintWriter.Wal() 277 | db.hintSizeCache[hintWal.Fid()] = int64(hintWal.Size()) 278 | 279 | for idx := range compaction.edit.deleteFiles { 280 | logFile := compaction.edit.deleteFiles[idx] 281 | delete(db.hintSizeCache, logFile.fid) 282 | } 283 | 284 | return nil 285 | } 286 | 287 | if err = onePhase(); err != nil { 288 | return err 289 | } 290 | 291 | return twoPhase() 292 | } 293 | 294 | func (db *DBImpl) compactOneWal(dst *WalRewriter, hintWriter *HintWriter, src *Wal) error { 295 | bufPtr, _ := db.recordPool.Get().(*[]byte) 296 | defer db.recordPool.Put(bufPtr) 297 | 298 | var hintRecord HintRecord 299 | return IterateRecord(src, func(record *Record, foff, _ uint64) error { 300 | // the foff points to the start offset of the record data in the wal 301 | // however, ReadRecord of the
wal expects the start offset of the record header 302 | foff -= RecordHeaderSize 303 | 304 | if db.doFilter(record, src.fid, foff) { 305 | return nil 306 | } 307 | 308 | recordBytes, err := record.Encode(*bufPtr, dst.Wal().BaseTime()) 309 | if err != nil { 310 | return err 311 | } 312 | 313 | // write dst wal 314 | if foff, err = dst.AppendRecord(recordBytes); err != nil { 315 | return err 316 | } 317 | 318 | // write dst hint wal 319 | hintRecord.ns = record.Ns 320 | hintRecord.key = record.Key 321 | hintRecord.fid = dst.Wal().Fid() 322 | hintRecord.off = foff 323 | hintRecord.size = uint64(len(recordBytes)) 324 | 325 | return hintWriter.AppendRecord(&hintRecord) 326 | }) 327 | } 328 | 329 | func (db *DBImpl) doFilter(srcRecord *Record, srcFid, srcOff uint64) bool { 330 | fid, off, _, err := db.index.Get(srcRecord.Ns, srcRecord.Key) 331 | if err != nil { // the key has been deleted or evicted 332 | return true 333 | } 334 | 335 | if fid != srcFid || off != srcOff { // the key has been updated 336 | return true 337 | } 338 | 339 | if db.opts.CompactionFilter != nil { 340 | if db.opts.CompactionFilter(srcRecord.Ns, srcRecord.Key, srcRecord.Value, srcRecord.Meta) { 341 | // the compaction filter rejected the key, so it should be dropped 342 | return true 343 | } 344 | } 345 | 346 | // the key should be retained 347 | return false 348 | } 349 | 350 | func (db *DBImpl) getCompactionWalsLocked() []uint64 { 351 | if !db.compacting.Load() || db.compaction == nil { 352 | return nil 353 | } 354 | 355 | c := db.compaction 356 | c.mu.RLock() 357 | defer c.mu.RUnlock() 358 | 359 | wals := make([]uint64, 0, len(c.inputs)+1) 360 | wals = append(wals, c.output.Fid()) 361 | 362 | for idx := range c.inputs { 363 | wals = append(wals, c.inputs[idx].Fid()) 364 | } 365 | 366 | return wals 367 | } 368 | 369 | func (db *DBImpl) reclaimDiskUsage(expect int64) { 370 | if !db.reclaiming.CompareAndSwap(false, true) { 371 | return 372 | } 373 | 374 | // only one goroutine reaches here 375 | 376 | defer func() { 377 | db.reclaiming.Store(false) 378 | }() 379 | 380 | db.mu.Lock() 381 | defer db.mu.Unlock() 382 | 383 | if db.bgErr != nil { 384 | return 385 | } 386 | 387 | usage, err := db.approximateDiskUsageLocked() 388 | if err != nil { 389 | db.bgErr = errors.Join(err, ErrDiskOutOfLimit) 390 | return 391 | } 392 | 393 | db.logger.Info().Int64("expect", expect).Int64("usage", usage).Msg("reclaim disk usage") 394 | if usage <= expect { 395 | return 396 | } 397 | 398 | compactionWals := db.getCompactionWalsLocked() 399 | files := make([]LogFile, 0, len(db.manifest.wals)) 400 | for fid := range db.manifest.wals { 401 | // exclude the compacting wals 402 | if slices.Contains(compactionWals, fid) { 403 | continue 404 | } 405 | 406 | // skip the active wal 407 | if fid == db.manifest.ActiveWal().Fid() { 408 | continue 409 | } 410 | 411 | files = append(files, LogFile{ 412 | fid: fid, 413 | wal: db.manifest.wals[fid].wal, 414 | }) 415 | } 416 | 417 | db.mu.Unlock() 418 | 419 | // sort by create time in ascending order 420 | sort.Slice(files, func(i, j int) bool { 421 | return files[i].wal.CreateTime() < files[j].wal.CreateTime() 422 | }) 423 | 424 | idx := 0 425 | deleteFiles := make([]LogFile, 0, 3) 426 | 427 | // reclaim the old wals 428 | for usage > expect && idx < len(files) { 429 | usage -= int64(files[idx].wal.Size()) 430 | deleteFiles = append(deleteFiles, files[idx]) 431 | 432 | idx++ 433 | } 434 | 435 | db.logger.Info().Uints64("wals", Map(deleteFiles, func(f LogFile) uint64 { 436 | return f.fid 437 | })).Msg("prepare to reclaim wals") 438 |
439 | db.mu.Lock() 440 | 441 | if len(deleteFiles) == 0 { 442 | db.bgErr = ErrDiskOutOfLimit 443 | db.logger.Err(db.bgErr).Msg("failed to reclaim disk usage") 444 | return 445 | } 446 | 447 | // apply the edit 448 | edit := &ManifestEdit{ 449 | deleteFiles: deleteFiles, 450 | } 451 | 452 | if err = db.manifest.LogAndApply(edit); err != nil { 453 | db.bgErr = errors.Join(err, ErrDiskOutOfLimit) 454 | db.logger.Err(db.bgErr).Msg("failed to apply") 455 | return 456 | } 457 | db.logger.Info().Msg("reclaimed successfully") 458 | 459 | // delete the related hint wals 460 | for idx := range deleteFiles { 461 | // ignore errors 462 | delete(db.hintSizeCache, deleteFiles[idx].fid) 463 | _ = os.Remove(HintPath(db.opts.Dir, deleteFiles[idx].fid)) 464 | } 465 | } 466 | 467 | // the method estimates the total size of the database 468 | // warning: the returned size includes the total size of all files referenced by the database 469 | func (db *DBImpl) approximateDiskUsageLocked() (int64, error) { 470 | var usage int64 471 | 472 | // manifest file size 473 | usage += int64(db.manifest.FileSize()) 474 | 475 | // hint and wal file size 476 | for fid, info := range db.manifest.wals { 477 | usage += int64(info.wal.Size()) 478 | usage += db.hintSizeCache[fid] 479 | } 480 | 481 | // remove the unused hint cache items 482 | for fid := range db.hintSizeCache { 483 | if _, exists := db.manifest.wals[fid]; !exists { 484 | delete(db.hintSizeCache, fid) 485 | } 486 | } 487 | 488 | return usage, nil 489 | } 490 | -------------------------------------------------------------------------------- /manifest.go: -------------------------------------------------------------------------------- 1 | package bitcask 2 | 3 | import ( 4 | "errors" 5 | "fmt" 6 | "io" 7 | "os" 8 | "path/filepath" 9 | "slices" 10 | "strings" 11 | "time" 12 | ) 13 | 14 | var ( 15 | ErrUnknownManifestFile = errors.New("unknown manifest file") 16 | ErrConcurrentManifestTxn = errors.New("concurrent manifest txn") 17 | ) 18 | 19 | // it's not thread-safe 20 | // the MANIFEST is an append-only file 21 | // it includes multiple edits, which record how the database changes 22 | type Manifest struct { 23 | // the base directory 24 | dir string 25 | 26 | // manifest file 27 | fp *os.File 28 | 29 | // the manifest fid 30 | fid uint64 31 | 32 | // manifest file size 33 | size uint64 34 | 35 | // the active wal log 36 | // active and hint log file use the same log number 37 | // usually, the log number is the largest 38 | active *Wal 39 | 40 | // the next allocatable file number 41 | nextFid uint64 42 | 43 | // all wal files 44 | // mapping from fid to walInfo 45 | wals map[uint64]*WalInfo 46 | 47 | // TODO: support multiple transactions 48 | txn *ManifestTxn 49 | } 50 | 51 | type WalInfo struct { 52 | wal *Wal 53 | 54 | // the total unused size of the wal file 55 | // if the original data is updated or deleted, the related wal 56 | // can free some disk space 57 | freeBytes uint64 58 | 59 | // delta free bytes of the wal file 60 | // it indicates the unused size of the wal file, which is not yet persisted to the manifest 61 | deltaFreeBytes uint64 62 | } 63 | 64 | func NewManifest(dir string) (*Manifest, error) { 65 | runners := NewRunner() 66 | defer runners.Do() 67 | 68 | manifestDir := ManifestPath(dir, 1) 69 | fp, err := os.OpenFile(manifestDir, os.O_RDWR|os.O_CREATE, 0o644) 70 | if err != nil { 71 | return nil, err 72 | } 73 | 74 | runners.Post(func() { 75 | fp.Close() 76 | os.Remove(manifestDir) 77 | }) 78 | 79 | active, err := NewWal(WalPath(dir, 2), 2, time.Now().Unix()) 80 | if err != nil { 81 | return nil,
err 82 | } 83 | 84 | defer active.Unref() 85 | 86 | manifest := &Manifest{ 87 | dir: dir, 88 | fp: fp, 89 | fid: 1, 90 | nextFid: 3, 91 | wals: make(map[uint64]*WalInfo), 92 | size: 0, 93 | active: active, 94 | } 95 | 96 | edit := &ManifestEdit{ 97 | addFiles: []LogFile{{active, 2}}, 98 | hasNextFid: true, 99 | nextFid: 3, 100 | } 101 | 102 | // write the MANIFEST file 103 | if err = manifest.LogAndApply(edit); err != nil { 104 | return nil, err 105 | } 106 | 107 | // write the CURRENT file 108 | if err = os.WriteFile(CurrentPath(dir), []byte(ManifestFilename(1)), 0o644); err != nil { 109 | return nil, err 110 | } 111 | 112 | // abort all functors 113 | runners.Rollback() 114 | 115 | return manifest, nil 116 | } 117 | 118 | func NewManifestIfNotExists(dir string) (*Manifest, error) { 119 | if PathExists(CurrentPath(dir)) { 120 | return LoadManifest(dir) 121 | } 122 | 123 | return NewManifest(dir) 124 | } 125 | 126 | // load the MANIFEST file according to CURRENT file 127 | func LoadManifest(dir string) (*Manifest, error) { 128 | runners := NewRunner() 129 | defer runners.Do() 130 | 131 | data, err := os.ReadFile(CurrentPath(dir)) 132 | if err != nil { 133 | return nil, fmt.Errorf("failed to read CURRENT file: %w", err) 134 | } 135 | 136 | ft, fid, err := ParseFilename(string(data)) 137 | if err != nil || ft != ManifestFileType { 138 | return nil, ErrUnknownManifestFile 139 | } 140 | 141 | manifestPath := filepath.Join(dir, strings.TrimSpace(string(data))) 142 | fp, err := os.OpenFile(manifestPath, os.O_RDWR|os.O_APPEND, 0o644) 143 | if err != nil { 144 | return nil, fmt.Errorf("failed to open manifest file: %w", err) 145 | } 146 | 147 | runners.Post(func() { 148 | fp.Close() 149 | }) 150 | 151 | fileInfo, err := fp.Stat() 152 | if err != nil { 153 | return nil, err 154 | } 155 | 156 | manifest := &Manifest{ 157 | dir: dir, 158 | fid: fid, 159 | fp: fp, 160 | wals: make(map[uint64]*WalInfo), 161 | nextFid: 0, 162 | size: uint64(fileInfo.Size()), 163 | } 164 | 165 | if err = manifest.recoverFromManifest(); err != nil { 166 | return nil, err 167 | } 168 | 169 | if len(manifest.wals) > 0 { 170 | // find the max fid 171 | maxFid := uint64(0) 172 | for fid := range manifest.wals { 173 | maxFid = max(maxFid, fid) 174 | } 175 | 176 | // freeze older wals 177 | for fid := range manifest.wals { 178 | if fid != maxFid { 179 | manifest.wals[fid].wal.Freeze() 180 | } 181 | } 182 | 183 | // the max fid as the active wal 184 | // in some failure cases, perhaps this is not true. 
for example, when a compaction has just finished 185 | // its wal fid is highest at the time, and then a restart happens 186 | manifest.active = manifest.wals[maxFid].wal 187 | } 188 | 189 | // abort all functors 190 | runners.Rollback() 191 | 192 | return manifest, nil 193 | } 194 | 195 | func (m *Manifest) recoverFromManifest() error { 196 | buf, err := io.ReadAll(m.fp) 197 | if err != nil { 198 | return err 199 | } 200 | 201 | edit := NewManifestEdit() 202 | if err = edit.DecodeFrom(buf); err != nil { 203 | return err 204 | } 205 | 206 | // positive order 207 | deleteFids := make([]uint64, 0, len(edit.deleteFiles)) 208 | for idx := range edit.deleteFiles { 209 | deleteFids = append(deleteFids, edit.deleteFiles[idx].fid) 210 | } 211 | slices.Sort(deleteFids) 212 | 213 | var addFiles []LogFile 214 | for idx := range edit.addFiles { 215 | if !slices.Contains(deleteFids, edit.addFiles[idx].fid) { 216 | addFiles = append(addFiles, edit.addFiles[idx]) 217 | } 218 | } 219 | 220 | // in recover, all delete files should be included in edit.addFiles 221 | if len(deleteFids)+len(addFiles) != len(edit.addFiles) { 222 | return ErrCorruptedManifest 223 | } 224 | 225 | // load related wals 226 | for idx := range addFiles { 227 | wal, err := LoadWal(WalPath(m.dir, addFiles[idx].fid), addFiles[idx].fid) 228 | if err != nil { 229 | return err 230 | } 231 | defer wal.Unref() 232 | addFiles[idx].wal = wal 233 | } 234 | 235 | // optimize the edit 236 | // deleteFiles should be empty in recover 237 | edit.addFiles = addFiles 238 | edit.deleteFiles = nil 239 | 240 | return m.Apply(edit) 241 | } 242 | 243 | // return the active wal file 244 | func (m *Manifest) ActiveWal() *Wal { 245 | return m.active 246 | } 247 | 248 | // rotate the active wal 249 | func (m *Manifest) RotateWal() (old *Wal, err error) { 250 | fid := m.GenFid() 251 | walPath := WalPath(m.dir, fid) 252 | 253 | // FIXME: use wall time of database 254 | wal, err := NewWal(walPath, fid, time.Now().Unix()) 255 | if err != nil { 256 | return nil, err 257 | } 258 | 259 | defer wal.Unref() 260 | 261 | edit := &ManifestEdit{ 262 | addFiles: []LogFile{{wal, fid}}, 263 | hasNextFid: true, 264 | nextFid: fid + 1, 265 | } 266 | 267 | if err = m.LogAndApply(edit); err != nil { 268 | return 269 | } 270 | 271 | old = m.active 272 | old.Freeze() 273 | m.active = wal 274 | 275 | return 276 | } 277 | 278 | // rotate the manifest 279 | func (m *Manifest) RotateManifest() error { 280 | runners := NewRunner() 281 | defer runners.Do() 282 | 283 | fid := m.GenFid() 284 | manifestPath := ManifestPath(m.dir, fid) 285 | fp, err := os.Create(manifestPath) 286 | if err != nil { 287 | return err 288 | } 289 | 290 | runners.Post(func() { 291 | fp.Close() 292 | os.Remove(manifestPath) 293 | }) 294 | 295 | edit := &ManifestEdit{ 296 | hasNextFid: true, 297 | nextFid: fid + 1, 298 | } 299 | 300 | // all wals will be written the new manifest file 301 | for tfid := range m.wals { 302 | edit.addFiles = append(edit.addFiles, LogFile{fid: tfid}) 303 | } 304 | 305 | nbytes, err := m.persistManifestEdit(fp, edit) 306 | if err != nil { 307 | return err 308 | } 309 | 310 | newManifest := ManifestFilename(fid) 311 | if err = os.WriteFile(CurrentPath(m.dir), []byte(newManifest), 0o644); err != nil { 312 | return err 313 | } 314 | 315 | // delete old manifest file 316 | _ = m.fp.Close() 317 | oldMainfestPath := ManifestPath(m.dir, m.fid) 318 | _ = os.Remove(oldMainfestPath) 319 | 320 | m.fp = fp 321 | m.fid = fid 322 | m.size = nbytes 323 | 324 | // abort all functors 325 | 
runners.Rollback() 326 | 327 | return nil 328 | } 329 | 330 | // return the size of MANIFEST file 331 | func (m *Manifest) FileSize() uint64 { 332 | return m.size 333 | } 334 | 335 | // clean the un-used files 336 | // if force is true, all un-reference files will be removed 337 | // 338 | // usually, when the database bootstrap, force can be true 339 | // for other situations, the force should be false 340 | func (m *Manifest) CleanFiles(force bool) error { 341 | files, err := os.ReadDir(m.dir) 342 | if err != nil { 343 | return err 344 | } 345 | 346 | for _, file := range files { 347 | name := file.Name() 348 | filetype, fid, err := ParseFilename(name) 349 | if err != nil { 350 | continue 351 | } 352 | 353 | needDelete := false 354 | 355 | switch filetype { 356 | case LockFileType: 357 | // skip 358 | case CurrentFileType: 359 | // skip 360 | case WalFileType: 361 | // wal not found, maybe others are in use 362 | if _, exists := m.wals[fid]; !exists { 363 | needDelete = force 364 | } 365 | case HintFileType: 366 | // wal not found, hint should be removed 367 | if _, exists := m.wals[fid]; !exists { 368 | needDelete = true 369 | } 370 | case TmpFileType: 371 | fallthrough 372 | case MergeFileType: 373 | // tmp and merge file maybe in use 374 | needDelete = force 375 | case ManifestFileType: 376 | // old manifest should be deleted 377 | needDelete = (fid != m.fid) && force 378 | default: 379 | // skip unknown file type 380 | } 381 | 382 | if needDelete { 383 | _ = os.Remove(filepath.Join(m.dir, name)) 384 | } 385 | } 386 | 387 | return nil 388 | } 389 | 390 | func (m *Manifest) NewTxn() (*ManifestTxn, error) { 391 | if m.txn != nil && !m.txn.IsDone() { 392 | return nil, ErrConcurrentManifestTxn 393 | } 394 | 395 | m.txn = NewManifestTxn(m) 396 | return m.txn, nil 397 | } 398 | 399 | func (m *Manifest) ToWal(fid uint64) *Wal { 400 | if info, exists := m.wals[fid]; exists { 401 | return info.wal 402 | } 403 | 404 | if m.txn != nil && !m.txn.IsDone() { 405 | return m.txn.ToWal(fid) 406 | } 407 | 408 | m.txn = nil 409 | return nil 410 | } 411 | 412 | func (m *Manifest) ToWalWithRef(fid uint64) *Wal { 413 | if info, exists := m.wals[fid]; exists { 414 | info.wal.Ref() 415 | return info.wal 416 | } 417 | 418 | if m.txn != nil && !m.txn.IsDone() { 419 | return m.txn.ToWalWithRef(fid) 420 | } 421 | 422 | m.txn = nil 423 | return nil 424 | } 425 | 426 | func (m *Manifest) GenFid() uint64 { 427 | nextFid := m.NextFid() 428 | m.nextFid = nextFid + 1 429 | 430 | return nextFid 431 | } 432 | 433 | func (m *Manifest) NextFid() uint64 { 434 | nextFid := m.nextFid 435 | 436 | if m.txn != nil && !m.txn.IsDone() { 437 | nextFid = max(nextFid, m.txn.NextFid()) 438 | } else { 439 | m.txn = nil 440 | } 441 | 442 | return nextFid 443 | } 444 | 445 | func (m *Manifest) prepareApply(edit *ManifestEdit) error { 446 | wals := make(map[uint64]struct{}, len(m.wals)) 447 | for k := range m.wals { 448 | wals[k] = struct{}{} 449 | } 450 | 451 | // validate the add files 452 | for idx := range edit.addFiles { 453 | if _, exists := wals[edit.addFiles[idx].fid]; exists { 454 | return errors.New("add the existed file") 455 | } 456 | wals[edit.addFiles[idx].fid] = struct{}{} 457 | } 458 | 459 | // validate the delete files 460 | for idx := range edit.deleteFiles { 461 | if _, exists := wals[edit.deleteFiles[idx].fid]; !exists { 462 | return errors.New("unknown delete file") 463 | } 464 | } 465 | 466 | return nil 467 | } 468 | 469 | // apply one edit, but don't persist 470 | func (m *Manifest) Apply(edit *ManifestEdit) error 
{ 471 | if err := m.prepareApply(edit); err != nil { 472 | return err 473 | } 474 | 475 | // reach here: this edit should apply without any error 476 | 477 | m.apply(edit) 478 | 479 | return nil 480 | } 481 | 482 | // apply the manifest without any error 483 | func (m *Manifest) apply(edit *ManifestEdit) { 484 | // add wals 485 | for _, add := range edit.addFiles { 486 | add.wal.Ref() 487 | m.wals[add.fid] = &WalInfo{ 488 | wal: add.wal, 489 | freeBytes: 0, 490 | deltaFreeBytes: 0, 491 | } 492 | } 493 | 494 | // delete wals 495 | for _, del := range edit.deleteFiles { 496 | m.wals[del.fid].wal.Unref() 497 | delete(m.wals, del.fid) 498 | } 499 | 500 | // update next file number 501 | if edit.hasNextFid { 502 | m.nextFid = max(m.nextFid, edit.nextFid) 503 | } 504 | 505 | // update delta free bytes of wal 506 | for fid := range edit.freeBytes { 507 | if _, exists := m.wals[fid]; !exists { 508 | continue 509 | } 510 | 511 | m.wals[fid].deltaFreeBytes += edit.freeBytes[fid] 512 | } 513 | } 514 | 515 | func (m *Manifest) applyFreeBytes(delta map[uint64]uint64) { 516 | for fid := range delta { 517 | if _, exists := m.wals[fid]; !exists { 518 | continue 519 | } 520 | 521 | m.wals[fid].freeBytes += delta[fid] 522 | m.wals[fid].deltaFreeBytes = 0 523 | } 524 | } 525 | 526 | // apply one edit and persist it 527 | func (m *Manifest) LogAndApply(edit *ManifestEdit) error { 528 | var err error 529 | if err = m.prepareApply(edit); err != nil { 530 | return err 531 | } 532 | 533 | // try to append delta free bytes of other wals 534 | // TODO: only append delta free bytes large enough 535 | deltaBytes := make(map[uint64]uint64) 536 | for fid := range edit.freeBytes { 537 | deltaBytes[fid] = edit.freeBytes[fid] 538 | } 539 | for fid := range m.wals { 540 | deltaBytes[fid] += m.wals[fid].deltaFreeBytes 541 | } 542 | 543 | // persist the edit 544 | edit.freeBytes = deltaBytes 545 | nbytes, err := m.persistManifestEdit(m.fp, edit) 546 | if err != nil { 547 | return err 548 | } 549 | 550 | m.size += nbytes 551 | 552 | // the delta free bytes have persisted, so don't apply them 553 | edit.freeBytes = nil 554 | m.apply(edit) 555 | 556 | // update the free bytes 557 | m.applyFreeBytes(deltaBytes) 558 | 559 | return nil 560 | } 561 | 562 | func (m *Manifest) persistManifestEdit(fp *os.File, edit *ManifestEdit) (uint64, error) { 563 | var err error 564 | var nbytes int 565 | 566 | content := edit.Encode() 567 | currentBytes := 0 568 | expectBytes := len(content) 569 | 570 | for currentBytes < expectBytes && err == nil { 571 | nbytes, err = fp.Write(content[currentBytes:]) 572 | currentBytes += nbytes 573 | } 574 | 575 | if err == nil { 576 | err = fp.Sync() 577 | } 578 | 579 | return uint64(currentBytes), err 580 | } 581 | 582 | func (m *Manifest) Close() error { 583 | for _, info := range m.wals { 584 | if info != nil && info.wal != nil { 585 | info.wal.Close() 586 | } 587 | } 588 | 589 | return m.fp.Close() 590 | } 591 | -------------------------------------------------------------------------------- /wal.go: -------------------------------------------------------------------------------- 1 | package bitcask 2 | 3 | import ( 4 | "bytes" 5 | "encoding/binary" 6 | "errors" 7 | "os" 8 | "path" 9 | "path/filepath" 10 | "sync" 11 | "sync/atomic" 12 | "time" 13 | ) 14 | 15 | var ( 16 | ErrWalIteratorEOF = errors.New("eof") 17 | ErrWalMismatchCRC = errors.New("CRC mismatch, corrupted data") 18 | ErrWalMismatchSize = errors.New("size mismatch, corrupted data") 19 | ErrWalCorruptedData = errors.New("corrupted data") 20 | 
ErrWalIncompleteRecord = errors.New("incomplete record") 21 | ErrWalMismatchMagic = errors.New("magic number mismatch") 22 | ErrWalMismatchBlockSize = errors.New("block size mismatch") 23 | ErrWalUnknownRecordType = errors.New("invalid record type") 24 | ErrWalUnavailable = errors.New("wal unavailable") 25 | ErrWalFrozen = errors.New("wal frozen") 26 | ) 27 | 28 | // wal super block at the head of the file 29 | type superBlock struct { 30 | magic uint64 31 | blockSize uint64 32 | 33 | // start position of actual data 34 | // it points to the next byte after the checksum 35 | startOff uint32 36 | 37 | // more fields may be added here 38 | createTime uint64 39 | baseTime uint64 40 | 41 | // checksum as footer 42 | crc32 uint32 43 | } 44 | 45 | const ( 46 | BlockSize = 32 * 1024 // 32KB per block 47 | 48 | RecordFull = 1 // Record fits entirely in a single block 49 | RecordFirst = 2 // First chunk of a record 50 | RecordMiddle = 3 // Middle chunk of a record 51 | RecordLast = 4 // Last chunk of a record 52 | 53 | RecordHeaderSize = 7 // 4 bytes CRC + 2 bytes Length + 1 byte Type 54 | 55 | MagicNumber = 0x77616C64 // magic number 56 | SuperBlockSize = 40 // 8 + 8 + 4 + 8 + 8 + 4 bytes 57 | SuperBlockCRC32Offset = SuperBlockSize - 4 58 | ) 59 | 60 | // calculate the physical footprint of the record based on its actual size and offset 61 | func WalRecordSize(offset, size uint64) uint64 { 62 | left := size 63 | phySize := uint64(0) 64 | 65 | // skip the super block 66 | offset -= SuperBlockSize 67 | 68 | for left > 0 { 69 | leftover := BlockSize - (offset % BlockSize) 70 | if leftover < RecordHeaderSize { 71 | phySize += leftover 72 | offset += leftover 73 | leftover = BlockSize 74 | } 75 | 76 | avail := leftover - RecordHeaderSize 77 | fragmentLength := min(left, avail) 78 | 79 | phySize += (RecordHeaderSize + fragmentLength) 80 | offset += (RecordHeaderSize + fragmentLength) 81 | 82 | left -= fragmentLength 83 | } 84 | 85 | return phySize 86 | } 87 | 88 | func WalBlockIndexRange(offset, size uint64) (firstBlkIdx, firstBlkOffset, blkNum uint64) { 89 | recordSize := WalRecordSize(offset, size) 90 | 91 | firstBlkIdx = (offset - SuperBlockSize) / BlockSize 92 | firstBlkOffset = firstBlkIdx*BlockSize + SuperBlockSize 93 | 94 | lastBlkIdx := (offset - SuperBlockSize + recordSize) / BlockSize 95 | blkNum = lastBlkIdx - firstBlkIdx + 1 96 | return 97 | } 98 | 99 | func WalBlockOffset(blkIdx uint64) (off uint64) { 100 | off = blkIdx*BlockSize + SuperBlockSize 101 | return 102 | } 103 | 104 | func WalBlockIdx(offset uint64) (blkIdx uint64) { 105 | blkIdx = (offset - SuperBlockSize) / BlockSize 106 | return 107 | } 108 | 109 | // parse the record from the given buffers 110 | // 111 | // there are two sources of `blks`. 112 | // - one is created from the block cache. This part of the cache may be reused after cache eviction. 113 | // so it's necessary to allocate a new buffer for the returned binary record, and the `blks` has one 114 | // or more blocks 115 | // - the second is from pread-at-once. in this case, the value is usually large and contains 116 | // multiple data blocks. 
therefore, the return binary record cannot be directly a slice of 117 | // buffer and the `blks` has only one block 118 | // 119 | // the `size` means the number of bytes of record 120 | // the `blkOff` means the start offset of actual data in blks 121 | func WalParseRecord(size uint64, blkOff uint64, blks [][]byte, verifyChecksum bool) ([]byte, error) { 122 | record := make([]byte, 0, size) 123 | 124 | blkSize := uint64(len(blks[0])) 125 | 126 | // iterate all input blocks 127 | for i := 0; i < len(blks); i++ { 128 | // iterate record 129 | for { 130 | header := blks[i][blkOff : blkOff+RecordHeaderSize] 131 | blkOff += RecordHeaderSize 132 | 133 | crc := binary.LittleEndian.Uint32(header[0:]) 134 | length := uint64(binary.LittleEndian.Uint16(header[4:])) 135 | recordType := header[6] 136 | 137 | // avoid the corrupted data triggering out of range 138 | if length > blkSize-blkOff { 139 | return nil, ErrWalCorruptedData 140 | } 141 | 142 | data := blks[i][blkOff : blkOff+length] 143 | blkOff += length 144 | 145 | if verifyChecksum && ComputeCRC32(data) != crc { 146 | return nil, ErrWalMismatchCRC 147 | } 148 | 149 | record = append(record, data...) 150 | 151 | switch recordType { 152 | case RecordFull, RecordLast: 153 | if len(record) != int(size) { 154 | return nil, ErrWalMismatchSize 155 | } 156 | return record, nil 157 | case RecordFirst, RecordMiddle: 158 | // Continue reading next chunk 159 | default: 160 | return nil, ErrWalUnknownRecordType 161 | } 162 | 163 | // blks[i][blkOff:] has no space to store more records 164 | leftover := blkSize - blkOff 165 | if leftover <= RecordHeaderSize { 166 | blkOff = 0 167 | break 168 | } 169 | } 170 | } 171 | 172 | return nil, ErrWalIncompleteRecord 173 | } 174 | 175 | // it's not thread safe 176 | // usually, only one writer can operate the Wal. 
no race condition 177 | type Wal struct { 178 | fp *os.File 179 | super *superBlock 180 | 181 | // reference count 182 | refs *atomic.Int64 183 | deleterOnce sync.Once 184 | 185 | // internal buffer 186 | buf bytes.Buffer 187 | 188 | // file name 189 | path string 190 | 191 | // file id 192 | fid uint64 193 | 194 | // file size 195 | size uint64 196 | 197 | // the data start position 198 | offset uint32 199 | 200 | // when the wal is marked immutable, which is not writable 201 | immutable bool 202 | 203 | // indicate whether the wal can operate 204 | invalid bool 205 | } 206 | 207 | func (wal *Wal) Rename(newName string) error { 208 | newPath := filepath.Join(wal.Dir(), newName) 209 | if err := os.Rename(wal.path, newPath); err != nil { 210 | return err 211 | } 212 | 213 | wal.path = newPath 214 | return nil 215 | } 216 | 217 | // load the existed wal file 218 | func LoadWal(path string, fid uint64) (*Wal, error) { 219 | runners := NewRunner() 220 | defer runners.Do() 221 | 222 | file, err := os.OpenFile(path, os.O_RDWR|os.O_APPEND, 0o644) 223 | if err != nil { 224 | return nil, err 225 | } 226 | 227 | runners.Post(func() { 228 | file.Close() 229 | }) 230 | 231 | stat, err := file.Stat() 232 | if err != nil { 233 | return nil, err 234 | } 235 | 236 | wal := &Wal{ 237 | fp: file, 238 | path: path, 239 | fid: fid, 240 | size: uint64(stat.Size()), 241 | immutable: false, 242 | invalid: false, 243 | refs: new(atomic.Int64), 244 | } 245 | 246 | wal.refs.Store(1) 247 | 248 | wal.super, err = wal.loadSuperBlock() 249 | if err != nil { 250 | return nil, err 251 | } 252 | 253 | wal.offset = wal.super.startOff 254 | 255 | // abort all functors 256 | runners.Rollback() 257 | 258 | return wal, nil 259 | } 260 | 261 | // create the wal with specific path 262 | func NewWal(path string, fid uint64, baseTime int64) (*Wal, error) { 263 | runners := NewRunner() 264 | defer runners.Do() 265 | 266 | file, err := os.OpenFile(path, os.O_RDWR|os.O_CREATE, 0o644) 267 | if err != nil { 268 | return nil, err 269 | } 270 | 271 | runners.Post(func() { 272 | file.Close() 273 | }) 274 | 275 | if baseTime < 0 { 276 | baseTime = time.Now().Unix() 277 | } 278 | 279 | wal := &Wal{ 280 | fp: file, 281 | path: path, 282 | fid: fid, 283 | size: 0, 284 | immutable: false, 285 | invalid: false, 286 | refs: new(atomic.Int64), 287 | } 288 | 289 | wal.refs.Store(1) 290 | 291 | wal.super, err = wal.writeSuperBlock(uint64(baseTime)) 292 | if err != nil { 293 | return nil, err 294 | } 295 | 296 | wal.size += SuperBlockSize 297 | wal.offset = wal.super.startOff 298 | 299 | // abort all functors 300 | runners.Rollback() 301 | 302 | return wal, nil 303 | } 304 | 305 | func (wal *Wal) deleteSelf() { 306 | wal.invalid = true 307 | _ = wal.fp.Close() 308 | _ = os.Remove(wal.path) 309 | } 310 | 311 | // thread-safe 312 | // decrease the reference count 313 | // delete self when reference count equals zero 314 | func (wal *Wal) Unref() { 315 | if wal.refs.Add(-1) == 0 { 316 | wal.deleterOnce.Do(wal.deleteSelf) 317 | } 318 | } 319 | 320 | // thread-safe 321 | // increase the reference count 322 | func (wal *Wal) Ref() { 323 | wal.refs.Add(1) 324 | } 325 | 326 | // before read and write wal, usually incr the reference count 327 | // it makes sure the wal is always valid 328 | func (wal *Wal) Valid() bool { 329 | return !wal.invalid 330 | } 331 | 332 | func (wal *Wal) writeSuperBlock(baseTime uint64) (*superBlock, error) { 333 | super := &superBlock{ 334 | magic: MagicNumber, 335 | blockSize: BlockSize, 336 | startOff: SuperBlockSize, 337 | 
createTime: uint64(time.Now().Unix()), 338 | baseTime: baseTime, 339 | } 340 | 341 | buf := make([]byte, SuperBlockSize) 342 | binary.LittleEndian.PutUint64(buf[0:], super.magic) 343 | binary.LittleEndian.PutUint64(buf[8:], super.blockSize) 344 | binary.LittleEndian.PutUint32(buf[16:], super.startOff) 345 | binary.LittleEndian.PutUint64(buf[20:], super.createTime) 346 | binary.LittleEndian.PutUint64(buf[28:], super.baseTime) 347 | 348 | crc := ComputeCRC32(buf[:SuperBlockCRC32Offset]) 349 | binary.LittleEndian.PutUint32(buf[SuperBlockCRC32Offset:], crc) 350 | 351 | if _, err := wal.fp.Write(buf); err != nil { 352 | return nil, err 353 | } 354 | 355 | if err := wal.fp.Sync(); err != nil { 356 | return nil, err 357 | } 358 | 359 | return super, nil 360 | } 361 | 362 | func (wal *Wal) loadSuperBlock() (*superBlock, error) { 363 | buf := make([]byte, SuperBlockSize) 364 | _, err := wal.fp.ReadAt(buf, 0) 365 | if err != nil { 366 | return nil, err 367 | } 368 | 369 | crc := ComputeCRC32(buf[:SuperBlockCRC32Offset]) 370 | crcExpect := binary.LittleEndian.Uint32(buf[SuperBlockCRC32Offset:]) 371 | if crc != crcExpect { 372 | return nil, ErrWalMismatchCRC 373 | } 374 | 375 | magic := binary.LittleEndian.Uint64(buf[0:]) 376 | if magic != MagicNumber { 377 | return nil, ErrWalMismatchMagic 378 | } 379 | 380 | blockSize := binary.LittleEndian.Uint64(buf[8:]) 381 | startOff := binary.LittleEndian.Uint32(buf[16:]) 382 | 383 | if blockSize != BlockSize { 384 | return nil, ErrWalMismatchBlockSize 385 | } 386 | 387 | createTime := binary.LittleEndian.Uint64(buf[20:]) 388 | baseTime := binary.LittleEndian.Uint64(buf[28:]) 389 | 390 | return &superBlock{ 391 | magic: magic, 392 | blockSize: blockSize, 393 | startOff: startOff, 394 | createTime: createTime, 395 | baseTime: baseTime, 396 | crc32: crc, 397 | }, nil 398 | } 399 | 400 | func (wal *Wal) CreateTime() uint64 { 401 | return wal.super.createTime 402 | } 403 | 404 | func (wal *Wal) BaseTime() uint64 { 405 | return wal.super.baseTime 406 | } 407 | 408 | func (wal *Wal) Sync() error { 409 | return wal.fp.Sync() 410 | } 411 | 412 | func (wal *Wal) Freeze() { 413 | wal.immutable = true 414 | } 415 | 416 | func (wal *Wal) Immutable() bool { 417 | return !wal.immutable 418 | } 419 | 420 | func (wal *Wal) Close() error { 421 | wal.Flush() 422 | return wal.fp.Close() 423 | } 424 | 425 | func (wal *Wal) Path() string { 426 | return wal.path 427 | } 428 | 429 | func (wal *Wal) Dir() string { 430 | return path.Dir(wal.path) 431 | } 432 | 433 | func (wal *Wal) Fid() uint64 { 434 | return wal.fid 435 | } 436 | 437 | func (wal *Wal) Fd() int { 438 | return int(wal.fp.Fd()) 439 | } 440 | 441 | // return the current file size 442 | func (wal *Wal) Size() uint64 { 443 | return wal.size 444 | } 445 | 446 | func (wal *Wal) Empty() bool { 447 | return wal.size == SuperBlockSize && wal.buf.Len() == 0 448 | } 449 | 450 | // flush the internal buffer 451 | func (wal *Wal) Flush() error { 452 | if wal.buf.Len() == 0 { 453 | return nil 454 | } 455 | 456 | data := wal.buf.Bytes() 457 | defer wal.buf.Reset() 458 | 459 | n, err := wal.fp.Write(data) 460 | 461 | // FIXME: maybe parital write 462 | wal.size += uint64(n) 463 | 464 | return err 465 | } 466 | 467 | // clean the internal buffer 468 | func (wal *Wal) ResetBuffer() { 469 | wal.buf.Reset() 470 | } 471 | 472 | // append data to internal buffer 473 | func (wal *Wal) appendFile(data []byte) error { 474 | _, err := wal.buf.Write(data) 475 | return err 476 | } 477 | 478 | // the record offset should include super block, which 
is the physical offset in the wal file 479 | // but the split blocks should exclude the super block. in other words, the file layout: 480 | // | super block | block | block | ... | 481 | // |<- 40B ->|<- 32K ->|<- 32K ->| ... | 482 | func (wal *Wal) writeOffset(skipSuperBlock bool) uint64 { 483 | if !skipSuperBlock { 484 | return wal.size + uint64(wal.buf.Len()) 485 | } 486 | return wal.size + uint64(wal.buf.Len()) - SuperBlockSize 487 | } 488 | 489 | // write one record, and return the start offset of the record in the wal file 490 | func (wal *Wal) WriteRecord(record []byte) (uint64, error) { 491 | if !wal.Valid() { 492 | return 0, ErrWalUnavailable 493 | } 494 | 495 | if !wal.Immutable() { 496 | return 0, ErrWalFrozen 497 | } 498 | 499 | var err error 500 | var offset uint64 501 | begin := true 502 | left := uint64(len(record)) 503 | padding := [...]byte{0, 0, 0, 0, 0, 0} 504 | 505 | for left > 0 { 506 | leftover := BlockSize - (wal.writeOffset(true) % BlockSize) 507 | if leftover < RecordHeaderSize { 508 | if err = wal.appendFile(padding[:leftover]); err != nil { 509 | return 0, err 510 | } 511 | leftover = BlockSize 512 | } 513 | 514 | if begin { 515 | offset = wal.writeOffset(false) 516 | } 517 | 518 | avail := leftover - RecordHeaderSize 519 | fragmentLength := min(left, avail) 520 | 521 | var recordType byte 522 | end := (left == fragmentLength) 523 | switch { 524 | case begin && end: 525 | recordType = RecordFull 526 | case begin: 527 | recordType = RecordFirst 528 | case end: 529 | recordType = RecordLast 530 | default: 531 | recordType = RecordMiddle 532 | } 533 | 534 | var header [RecordHeaderSize]byte 535 | binary.LittleEndian.PutUint32(header[0:], ComputeCRC32(record[:fragmentLength])) 536 | binary.LittleEndian.PutUint16(header[4:], uint16(fragmentLength)) 537 | header[6] = recordType 538 | 539 | if err = wal.appendFile(header[:]); err != nil { 540 | return 0, err 541 | } 542 | 543 | if err = wal.appendFile(record[:fragmentLength]); err != nil { 544 | return 0, err 545 | } 546 | 547 | record = record[fragmentLength:] 548 | left -= fragmentLength 549 | begin = false 550 | } 551 | 552 | return offset, nil 553 | } 554 | 555 | // read one record from the specified offset and size 556 | func (wal *Wal) ReadRecord(offset, size uint64, verifyChecksum bool) (record []byte, err error) { 557 | if !wal.Valid() { 558 | return nil, ErrWalUnavailable 559 | } 560 | 561 | recordSize := WalRecordSize(offset, size) 562 | if offset+recordSize > wal.size { 563 | return nil, errors.New("read beyond file size") 564 | } 565 | 566 | buffer := make([]byte, recordSize) 567 | // read all related data using only one disk read operation 568 | if err = PreadFull(wal.Fd(), buffer, int64(offset)); err != nil { 569 | return 570 | } 571 | 572 | return WalParseRecord(size, 0, [][]byte{buffer}, verifyChecksum) 573 | } 574 | --------------------------------------------------------------------------------
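
The record footprint arithmetic in wal.go (BlockSize, RecordHeaderSize, SuperBlockSize) can be checked with a small standalone program. The sketch below is not part of the repository; it only re-derives the same math as WalRecordSize under the constants defined above, and the helper name `footprint` and the sample numbers are illustrative.

// A minimal standalone sketch (not from the repository) of the wal record
// footprint math: every block is 32KB, every fragment pays a 7-byte header,
// and the 40-byte super block sits before block 0.
package main

import "fmt"

const (
	blockSize        = 32 * 1024
	recordHeaderSize = 7
	superBlockSize   = 40
)

// footprint mirrors WalRecordSize: it walks the blocks a record starting at the
// given physical offset would occupy and sums headers, padding and payload.
func footprint(offset, size uint64) uint64 {
	left := size
	phy := uint64(0)
	offset -= superBlockSize // make the offset block-relative

	for left > 0 {
		leftover := blockSize - (offset % blockSize)
		if leftover < recordHeaderSize {
			// the block tail is too small for a header, so it is padded with zeroes
			phy += leftover
			offset += leftover
			leftover = blockSize
		}

		frag := min(left, leftover-recordHeaderSize)
		phy += recordHeaderSize + frag
		offset += recordHeaderSize + frag
		left -= frag
	}
	return phy
}

func main() {
	// a 100-byte record right after the super block: one fragment, 7+100 bytes
	fmt.Println(footprint(superBlockSize, 100)) // 107

	// a 40KB record from the same position spans two blocks, so it pays two headers
	fmt.Println(footprint(superBlockSize, 40*1024)) // 40974
}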
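For the write/read path itself, a minimal round-trip sketch follows, assuming it is compiled inside the bitcask package (Wal is unexported, package-internal API); the file path, fid, and helper name are hypothetical. It only uses calls whose signatures appear in wal.go: NewWal, WriteRecord, Flush, ReadRecord, and Unref.

// A minimal round-trip sketch, assuming it lives in the bitcask package,
// e.g. inside a test; the path and fid below are arbitrary.
package bitcask

import "fmt"

func walRoundTrip() error {
	wal, err := NewWal("/tmp/000001.wal", 1, -1) // a negative baseTime falls back to time.Now()
	if err != nil {
		return err
	}
	defer wal.Unref() // dropping the last reference also removes the file

	record := []byte("hello bitcask")

	// WriteRecord only fills the internal buffer and returns the physical offset;
	// the data reaches the file after Flush
	off, err := wal.WriteRecord(record)
	if err != nil {
		return err
	}
	if err = wal.Flush(); err != nil {
		return err
	}

	// ReadRecord needs the physical offset and the logical record length
	got, err := wal.ReadRecord(off, uint64(len(record)), true)
	if err != nil {
		return err
	}

	fmt.Printf("read back %q at offset %d\n", got, off)
	return nil
}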