├── .gitignore
├── go.mod
├── fs
│   ├── os_mmap_windows_amd64.go
│   ├── os_mmap_windows_386.go
│   ├── os_mmap_test.go
│   ├── mem_test.go
│   ├── os_test.go
│   ├── os_plan9.go
│   ├── os_unix.go
│   ├── os_mmap_unix.go
│   ├── os_mmap_windows.go
│   ├── os.go
│   ├── os_windows.go
│   ├── sub.go
│   ├── fs.go
│   ├── os_mmap.go
│   ├── mem.go
│   └── fs_test.go
├── doc.go
├── db_rpc_test.go
├── db_mmap_test.go
├── internal
│   ├── hash
│   │   ├── seed_test.go
│   │   ├── seed.go
│   │   ├── murmurhash32.go
│   │   └── murmurhash32_test.go
│   ├── errors
│   │   ├── errors.go
│   │   └── errors_test.go
│   └── assert
│       ├── assert.go
│       └── assert_test.go
├── metrics.go
├── logger.go
├── lock.go
├── .github
│   └── workflows
│       ├── golangci-lint.yaml
│       └── test.yaml
├── errors.go
├── gobfile.go
├── example_test.go
├── header.go
├── iterator_test.go
├── datalog_test.go
├── options.go
├── CHANGELOG.md
├── backup_test.go
├── iterator.go
├── backup.go
├── file.go
├── file_test.go
├── README.md
├── bucket.go
├── recovery.go
├── segment.go
├── compaction.go
├── datalog.go
├── recovery_test.go
├── docs
│   └── design.md
├── index.go
├── compaction_test.go
├── db.go
├── LICENSE
└── db_test.go
/.gitignore:
--------------------------------------------------------------------------------
1 | /fs/test
2 |
--------------------------------------------------------------------------------
/go.mod:
--------------------------------------------------------------------------------
1 | module github.com/akrylysov/pogreb
2 |
3 | go 1.18
4 |
--------------------------------------------------------------------------------
/fs/os_mmap_windows_amd64.go:
--------------------------------------------------------------------------------
1 | package fs
2 |
3 | const maxMmapSize = 1 << 48 // 2^48 bytes (256 TiB) bounds the mapped slice on 64-bit Windows.
4 |
--------------------------------------------------------------------------------
/doc.go:
--------------------------------------------------------------------------------
1 | /*
2 | Package pogreb implements an embedded key-value store for read-heavy workloads.
3 | */
4 | package pogreb
5 |
--------------------------------------------------------------------------------
/fs/os_mmap_windows_386.go:
--------------------------------------------------------------------------------
1 | package fs
2 |
3 | import (
4 | "math"
5 | )
6 |
7 | const maxMmapSize = math.MaxInt32
8 |
--------------------------------------------------------------------------------
/fs/os_mmap_test.go:
--------------------------------------------------------------------------------
1 | package fs
2 |
3 | import (
4 | "testing"
5 | )
6 |
7 | func TestOSMMapFS(t *testing.T) {
8 | testFS(t, Sub(OSMMap, t.TempDir()))
9 | }
10 |
--------------------------------------------------------------------------------
/db_rpc_test.go:
--------------------------------------------------------------------------------
1 | //go:build plan9
2 | // +build plan9
3 |
4 | package pogreb
5 |
6 | import (
7 | "github.com/akrylysov/pogreb/fs"
8 | )
9 |
10 | var testFileSystems = []fs.FileSystem{fs.Mem, fs.OS}
11 |
--------------------------------------------------------------------------------
/db_mmap_test.go:
--------------------------------------------------------------------------------
1 | //go:build !plan9
2 | // +build !plan9
3 |
4 | package pogreb
5 |
6 | import (
7 | "github.com/akrylysov/pogreb/fs"
8 | )
9 |
10 | var testFileSystems = []fs.FileSystem{fs.Mem, fs.OSMMap, fs.OS}
11 |
--------------------------------------------------------------------------------
/internal/hash/seed_test.go:
--------------------------------------------------------------------------------
1 | package hash
2 |
3 | import (
4 | "testing"
5 |
6 | "github.com/akrylysov/pogreb/internal/assert"
7 | )
8 |
9 | func TestRandSeed(t *testing.T) {
10 | _, err := RandSeed()
11 | assert.Nil(t, err)
12 | }
13 |
--------------------------------------------------------------------------------
/metrics.go:
--------------------------------------------------------------------------------
1 | package pogreb
2 |
3 | import "expvar"
4 |
5 | // Metrics holds the DB metrics.
6 | type Metrics struct {
7 | Puts expvar.Int
8 | Dels expvar.Int
9 | Gets expvar.Int
10 | HashCollisions expvar.Int
11 | }
12 |
--------------------------------------------------------------------------------
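
A minimal sketch of reading these counters, assuming the DB exposes them through a `Metrics()` accessor (the accessor itself is not shown in this section):

```go
// Hypothetical usage; Puts, Gets, Dels and HashCollisions are expvar.Int values.
m := db.Metrics()
log.Printf("puts=%d gets=%d dels=%d collisions=%d",
	m.Puts.Value(), m.Gets.Value(), m.Dels.Value(), m.HashCollisions.Value())
```
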
/logger.go:
--------------------------------------------------------------------------------
1 | package pogreb
2 |
3 | import (
4 | "log"
5 | "os"
6 | )
7 |
8 | var logger = log.New(os.Stderr, "pogreb: ", 0)
9 |
10 | // SetLogger sets the global logger.
11 | func SetLogger(l *log.Logger) {
12 | if l != nil {
13 | logger = l
14 | }
15 | }
16 |
--------------------------------------------------------------------------------
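
For example, a caller can redirect pogreb's internal messages before opening a database; passing nil is ignored, so the stderr default stays in place:

```go
// Send pogreb's log output to stdout with timestamps instead of bare stderr.
pogreb.SetLogger(log.New(os.Stdout, "pogreb: ", log.LstdFlags))
```
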
/lock.go:
--------------------------------------------------------------------------------
1 | package pogreb
2 |
3 | import (
4 | "os"
5 |
6 | "github.com/akrylysov/pogreb/fs"
7 | )
8 |
9 | const (
10 | lockName = "lock"
11 | )
12 |
13 | func createLockFile(opts *Options) (fs.LockFile, bool, error) {
14 | return opts.FileSystem.CreateLockFile(lockName, os.FileMode(0644))
15 | }
16 |
--------------------------------------------------------------------------------
/fs/mem_test.go:
--------------------------------------------------------------------------------
1 | package fs
2 |
3 | import (
4 | "testing"
5 | )
6 |
7 | func TestMemFS(t *testing.T) {
8 | testFS(t, Mem)
9 | }
10 |
11 | func TestMemLockFile(t *testing.T) {
12 | testLockFile(t, Mem)
13 | }
14 |
15 | func TestMemLockAcquireExisting(t *testing.T) {
16 | testLockFileAcquireExisting(t, Mem)
17 | }
18 |
--------------------------------------------------------------------------------
/internal/hash/seed.go:
--------------------------------------------------------------------------------
1 | package hash
2 |
3 | import (
4 | "crypto/rand"
5 | "encoding/binary"
6 | )
7 |
8 | // RandSeed generates a random hash seed.
9 | func RandSeed() (uint32, error) {
10 | b := make([]byte, 4)
11 | if _, err := rand.Read(b); err != nil {
12 | return 0, err
13 | }
14 | return binary.LittleEndian.Uint32(b), nil
15 | }
16 |
--------------------------------------------------------------------------------
/fs/os_test.go:
--------------------------------------------------------------------------------
1 | package fs
2 |
3 | import (
4 | "testing"
5 | )
6 |
7 | func TestOSFS(t *testing.T) {
8 | testFS(t, Sub(OS, t.TempDir()))
9 | }
10 |
11 | func TestOSLockFile(t *testing.T) {
12 | testLockFile(t, Sub(OS, t.TempDir()))
13 | }
14 |
15 | func TestOSLockAcquireExisting(t *testing.T) {
16 | testLockFileAcquireExisting(t, Sub(OS, t.TempDir()))
17 | }
18 |
--------------------------------------------------------------------------------
/.github/workflows/golangci-lint.yaml:
--------------------------------------------------------------------------------
1 | name: golangci-lint
2 | on:
3 | push:
4 | tags:
5 | - v*
6 | branches:
7 | - master
8 | - main
9 | pull_request:
10 | jobs:
11 | golangci:
12 | name: lint
13 | runs-on: ubuntu-latest
14 | steps:
15 | - uses: actions/checkout@v4
16 | - name: golangci-lint
17 | uses: golangci/golangci-lint-action@v6
18 |
--------------------------------------------------------------------------------
/errors.go:
--------------------------------------------------------------------------------
1 | package pogreb
2 |
3 | import (
4 | "github.com/akrylysov/pogreb/internal/errors"
5 | )
6 |
7 | var (
8 | errKeyTooLarge = errors.New("key is too large")
9 | errValueTooLarge = errors.New("value is too large")
10 | errFull = errors.New("database is full")
11 | errCorrupted = errors.New("database is corrupted")
12 | errLocked = errors.New("database is locked")
13 | errBusy = errors.New("database is busy")
14 | )
15 |
--------------------------------------------------------------------------------
/fs/os_plan9.go:
--------------------------------------------------------------------------------
1 | //go:build plan9
2 | // +build plan9
3 |
4 | package fs
5 |
6 | import (
7 | "os"
8 | "syscall"
9 | )
10 |
11 | func createLockFile(name string, perm os.FileMode) (LockFile, bool, error) {
12 | acquiredExisting := false
13 | if _, err := os.Stat(name); err == nil {
14 | acquiredExisting = true
15 | }
16 | f, err := os.OpenFile(name, os.O_RDWR|os.O_CREATE, syscall.DMEXCL|perm)
17 | if err != nil {
18 | return nil, false, err
19 | }
20 | return &osLockFile{f, name}, acquiredExisting, nil
21 | }
22 |
23 | // DefaultFileSystem returns the default FileSystem for this platform.
24 | func DefaultFileSystem() FileSystem {
25 | return OS
26 | }
27 |
--------------------------------------------------------------------------------
/gobfile.go:
--------------------------------------------------------------------------------
1 | package pogreb
2 |
3 | import (
4 | "encoding/gob"
5 |
6 | "github.com/akrylysov/pogreb/fs"
7 | )
8 |
9 | func readGobFile(fsys fs.FileSystem, name string, v interface{}) error {
10 | f, err := openFile(fsys, name, openFileFlags{readOnly: true})
11 | if err != nil {
12 | return err
13 | }
14 | defer f.Close()
15 | dec := gob.NewDecoder(f)
16 | return dec.Decode(v)
17 | }
18 |
19 | func writeGobFile(fsys fs.FileSystem, name string, v interface{}) error {
20 | f, err := openFile(fsys, name, openFileFlags{truncate: true})
21 | if err != nil {
22 | return err
23 | }
24 | defer f.Close()
25 | enc := gob.NewEncoder(f)
26 | return enc.Encode(v)
27 | }
28 |
--------------------------------------------------------------------------------
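
A minimal round trip through these helpers, assuming the in-memory `fs.Mem` file system (listed in the tree above, not shown in this section) behaves like the OS-backed ones; `openFile` writes and skips the 512-byte file header transparently:

```go
// Hypothetical example: persist a small struct and read it back.
type segmentIndex struct{ NumKeys int } // illustrative type, not part of the package

if err := writeGobFile(fs.Mem, "test.gob", segmentIndex{NumKeys: 42}); err != nil {
	log.Fatal(err)
}
var idx segmentIndex
if err := readGobFile(fs.Mem, "test.gob", &idx); err != nil {
	log.Fatal(err)
}
// idx.NumKeys == 42
```
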
/fs/os_unix.go:
--------------------------------------------------------------------------------
1 | //go:build !(plan9 || windows)
2 | // +build !plan9,!windows
3 |
4 | package fs
5 |
6 | import (
7 | "os"
8 | "syscall"
9 | )
10 |
11 | func createLockFile(name string, perm os.FileMode) (LockFile, bool, error) {
12 | acquiredExisting := false
13 | if _, err := os.Stat(name); err == nil {
14 | acquiredExisting = true
15 | }
16 | f, err := os.OpenFile(name, os.O_RDWR|os.O_CREATE, perm)
17 | if err != nil {
18 | return nil, false, err
19 | }
20 | if err := syscall.Flock(int(f.Fd()), syscall.LOCK_EX|syscall.LOCK_NB); err != nil {
21 | if err == syscall.EWOULDBLOCK {
22 | err = os.ErrExist
23 | }
24 | return nil, false, err
25 | }
26 | return &osLockFile{f, name}, acquiredExisting, nil
27 | }
28 |
--------------------------------------------------------------------------------
/.github/workflows/test.yaml:
--------------------------------------------------------------------------------
1 | name: Test
2 | on: [push, pull_request]
3 | jobs:
4 | test:
5 | strategy:
6 | matrix:
7 | go-version: [1.18.x, 1.x]
8 | os: [ubuntu-latest, macos-latest, windows-latest]
9 | runs-on: ${{ matrix.os }}
10 | steps:
11 | - name: Install Go
12 | uses: actions/setup-go@v5
13 | with:
14 | go-version: ${{ matrix.go-version }}
15 | - name: Checkout code
16 | uses: actions/checkout@v4
17 | - name: Build GOARCH=386
18 | if: ${{ matrix.os != 'macos-latest' }}
19 | env:
20 | GOARCH: "386"
21 | run: go build
22 | - name: Test
23 | run: go test ./... -race -coverprofile=coverage.txt -covermode=atomic
24 | - name: Upload coverage to Codecov
25 | if: ${{ matrix.os == 'ubuntu-latest' }}
26 | uses: codecov/codecov-action@v5
27 |
--------------------------------------------------------------------------------
/example_test.go:
--------------------------------------------------------------------------------
1 | package pogreb_test
2 |
3 | import (
4 | "log"
5 |
6 | "github.com/akrylysov/pogreb"
7 | )
8 |
9 | func Example() {
10 | db, err := pogreb.Open("pogreb.test", nil)
11 | if err != nil {
12 | log.Fatal(err)
13 | return
14 | }
15 | defer db.Close()
16 |
17 | // Insert a new key-value pair.
18 | if err := db.Put([]byte("testKey"), []byte("testValue")); err != nil {
19 | log.Fatal(err)
20 | }
21 |
22 | // Retrieve the inserted value.
23 | val, err := db.Get([]byte("testKey"))
24 | if err != nil {
25 | log.Fatal(err)
26 | }
27 | log.Printf("%s", val)
28 |
29 | // Iterate over items.
30 | it := db.Items()
31 | for {
32 | key, val, err := it.Next()
33 | if err == pogreb.ErrIterationDone {
34 | break
35 | }
36 | if err != nil {
37 | log.Fatal(err)
38 | }
39 | log.Printf("%s %s", key, val)
40 | }
41 | }
42 |
--------------------------------------------------------------------------------
/fs/os_mmap_unix.go:
--------------------------------------------------------------------------------
1 | //go:build !(plan9 || windows)
2 | // +build !plan9,!windows
3 |
4 | package fs
5 |
6 | import (
7 | "os"
8 | "syscall"
9 | "unsafe"
10 | )
11 |
12 | func mmap(f *os.File, fileSize int64, mappingSize int64) ([]byte, error) {
13 | p, err := syscall.Mmap(int(f.Fd()), 0, int(mappingSize), syscall.PROT_READ, syscall.MAP_SHARED)
14 | return p, err
15 | }
16 |
17 | func munmap(data []byte) error {
18 | return syscall.Munmap(data)
19 | }
20 |
21 | func madviceRandom(data []byte) error {
22 | _, _, errno := syscall.Syscall(syscall.SYS_MADVISE, uintptr(unsafe.Pointer(&data[0])), uintptr(len(data)), uintptr(syscall.MADV_RANDOM))
23 | if errno != 0 {
24 | return errno
25 | }
26 | return nil
27 | }
28 |
29 | func (f *osMMapFile) Truncate(size int64) error {
30 | if err := f.File.Truncate(size); err != nil {
31 | return err
32 | }
33 | f.size = size
34 | return f.mremap()
35 | }
36 |
--------------------------------------------------------------------------------
/header.go:
--------------------------------------------------------------------------------
1 | package pogreb
2 |
3 | import (
4 | "bytes"
5 | "encoding/binary"
6 | )
7 |
8 | const (
9 | formatVersion = 2 // File format version.
10 | headerSize = 512
11 | )
12 |
13 | var (
14 | signature = [8]byte{'p', 'o', 'g', 'r', 'e', 'b', '\x0e', '\xfd'}
15 | )
16 |
17 | type header struct {
18 | signature [8]byte
19 | formatVersion uint32
20 | }
21 |
22 | func newHeader() *header {
23 | return &header{
24 | signature: signature,
25 | formatVersion: formatVersion,
26 | }
27 | }
28 |
29 | func (h header) MarshalBinary() ([]byte, error) {
30 | buf := make([]byte, headerSize)
31 | copy(buf[:8], h.signature[:])
32 | binary.LittleEndian.PutUint32(buf[8:12], h.formatVersion)
33 | return buf, nil
34 | }
35 |
36 | func (h *header) UnmarshalBinary(data []byte) error {
37 | if !bytes.Equal(data[:8], signature[:]) {
38 | return errCorrupted
39 | }
40 | copy(h.signature[:], data[:8])
41 | h.formatVersion = binary.LittleEndian.Uint32(data[8:12])
42 | return nil
43 | }
44 |
--------------------------------------------------------------------------------
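
The serialized header is therefore 512 bytes with only the first 12 carrying data: an 8-byte signature followed by a 4-byte little-endian format version, the rest zero padding. A quick check against the code above:

```go
data, _ := newHeader().MarshalBinary()
// data[0:8]  -> signature bytes 'p','o','g','r','e','b',0x0e,0xfd
// data[8:12] -> formatVersion (2) in little-endian byte order
// len(data)  -> headerSize (512); bytes 12..511 are zero padding
```
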
/internal/errors/errors.go:
--------------------------------------------------------------------------------
1 | package errors
2 |
3 | import (
4 | "errors"
5 | "fmt"
6 | )
7 |
8 | type wrappedError struct {
9 | cause error
10 | msg string
11 | }
12 |
13 | func (we wrappedError) Error() string {
14 | return we.msg + ": " + we.cause.Error()
15 | }
16 |
17 | func (we wrappedError) Unwrap() error {
18 | return we.cause
19 | }
20 |
21 | // New returns an error that formats as the given text.
22 | func New(text string) error {
23 | return errors.New(text)
24 | }
25 |
26 | // Wrap returns an error annotating err with an additional message.
27 | // Compatible with Go 1.13 error chains.
28 | func Wrap(cause error, message string) error {
29 | return wrappedError{
30 | cause: cause,
31 | msg: message,
32 | }
33 | }
34 |
35 | // Wrapf returns an error annotating err with an additional formatted message.
36 | // Compatible with Go 1.13 error chains.
37 | func Wrapf(cause error, format string, a ...interface{}) error {
38 | return wrappedError{
39 | cause: cause,
40 | msg: fmt.Sprintf(format, a...),
41 | }
42 | }
43 |
--------------------------------------------------------------------------------
/internal/hash/murmurhash32.go:
--------------------------------------------------------------------------------
1 | package hash
2 |
3 | import (
4 | "math/bits"
5 | )
6 |
7 | const (
8 | c1 uint32 = 0xcc9e2d51
9 | c2 uint32 = 0x1b873593
10 | )
11 |
12 | // Sum32WithSeed is a port of MurmurHash3_x86_32 function.
13 | func Sum32WithSeed(data []byte, seed uint32) uint32 {
14 | h1 := seed
15 | dlen := len(data)
16 |
17 | for len(data) >= 4 {
18 | k1 := uint32(data[0]) | uint32(data[1])<<8 | uint32(data[2])<<16 | uint32(data[3])<<24
19 | data = data[4:]
20 |
21 | k1 *= c1
22 | k1 = bits.RotateLeft32(k1, 15)
23 | k1 *= c2
24 |
25 | h1 ^= k1
26 | h1 = bits.RotateLeft32(h1, 13)
27 | h1 = h1*5 + 0xe6546b64
28 | }
29 |
30 | var k1 uint32
31 | switch len(data) {
32 | case 3:
33 | k1 ^= uint32(data[2]) << 16
34 | fallthrough
35 | case 2:
36 | k1 ^= uint32(data[1]) << 8
37 | fallthrough
38 | case 1:
39 | k1 ^= uint32(data[0])
40 | k1 *= c1
41 | k1 = bits.RotateLeft32(k1, 15)
42 | k1 *= c2
43 | h1 ^= k1
44 | }
45 |
46 | h1 ^= uint32(dlen)
47 |
48 | h1 ^= h1 >> 16
49 | h1 *= 0x85ebca6b
50 | h1 ^= h1 >> 13
51 | h1 *= 0xc2b2ae35
52 | h1 ^= h1 >> 16
53 |
54 | return h1
55 | }
56 |
--------------------------------------------------------------------------------
/internal/hash/murmurhash32_test.go:
--------------------------------------------------------------------------------
1 | package hash
2 |
3 | import (
4 | "fmt"
5 | "testing"
6 |
7 | "github.com/akrylysov/pogreb/internal/assert"
8 | )
9 |
10 | func TestSum32WithSeed(t *testing.T) {
11 | testCases := []struct {
12 | in []byte
13 | seed uint32
14 | out uint32
15 | }{
16 | {
17 | in: nil,
18 | out: 0,
19 | },
20 | {
21 | in: nil,
22 | seed: 1,
23 | out: 1364076727,
24 | },
25 | {
26 | in: []byte{1},
27 | out: 3831157163,
28 | },
29 | {
30 | in: []byte{1, 2},
31 | out: 1690789502,
32 | },
33 | {
34 | in: []byte{1, 2, 3},
35 | out: 2161234436,
36 | },
37 | {
38 | in: []byte{1, 2, 3, 4},
39 | out: 1043635621,
40 | },
41 | {
42 | in: []byte{1, 2, 3, 4, 5},
43 | out: 2727459272,
44 | },
45 | }
46 | for i, tc := range testCases {
47 | t.Run(fmt.Sprintf("%d", i), func(t *testing.T) {
48 | assert.Equal(t, tc.out, Sum32WithSeed(tc.in, tc.seed))
49 | })
50 | }
51 | }
52 |
53 | func BenchmarkSum32WithSeed(b *testing.B) {
54 | data := []byte("pogreb_Sum32WithSeed_bench")
55 | b.SetBytes(int64(len(data)))
56 | for n := 0; n < b.N; n++ {
57 | Sum32WithSeed(data, 0)
58 | }
59 | }
60 |
--------------------------------------------------------------------------------
/internal/errors/errors_test.go:
--------------------------------------------------------------------------------
1 | package errors
2 |
3 | import (
4 | "errors"
5 | "testing"
6 |
7 | "github.com/akrylysov/pogreb/internal/assert"
8 | )
9 |
10 | func TestWrap(t *testing.T) {
11 | err1 := New("err1")
12 | w11 := Wrap(err1, "wrapped 11")
13 | w12 := Wrapf(w11, "wrapped %d%s", 1, "2")
14 |
15 | assert.Equal(t, err1, w11.(wrappedError).Unwrap())
16 | assert.Equal(t, w11, w12.(wrappedError).Unwrap())
17 |
18 | assert.Equal(t, "wrapped 11: err1", w11.Error())
19 | assert.Equal(t, "wrapped 12: wrapped 11: err1", w12.Error())
20 | }
21 |
22 | func TestIs(t *testing.T) {
23 | err1 := New("err1")
24 | w11 := Wrap(err1, "wrapped 11")
25 | w12 := Wrap(w11, "wrapped 12")
26 |
27 | err2 := New("err2")
28 | w21 := Wrap(err2, "wrapped 21")
29 |
30 | assert.Equal(t, true, errors.Is(err1, err1))
31 | assert.Equal(t, true, errors.Is(w11, err1))
32 | assert.Equal(t, true, errors.Is(w12, err1))
33 | assert.Equal(t, true, errors.Is(w12, w11))
34 |
35 | assert.Equal(t, false, errors.Is(err1, err2))
36 | assert.Equal(t, false, errors.Is(w11, err2))
37 | assert.Equal(t, false, errors.Is(w12, err2))
38 | assert.Equal(t, false, errors.Is(w21, err1))
39 | assert.Equal(t, false, errors.Is(w21, w11))
40 | }
41 |
--------------------------------------------------------------------------------
/fs/os_mmap_windows.go:
--------------------------------------------------------------------------------
1 | //go:build windows
2 | // +build windows
3 |
4 | package fs
5 |
6 | import (
7 | "os"
8 | "syscall"
9 | "unsafe"
10 | )
11 |
12 | func mmap(f *os.File, fileSize int64, mappingSize int64) ([]byte, error) {
13 | size := fileSize
14 | low, high := uint32(size), uint32(size>>32)
15 | fmap, err := syscall.CreateFileMapping(syscall.Handle(f.Fd()), nil, syscall.PAGE_READONLY, high, low, nil)
16 | if err != nil {
17 | return nil, err
18 | }
19 | defer syscall.CloseHandle(fmap)
20 | ptr, err := syscall.MapViewOfFile(fmap, syscall.FILE_MAP_READ, 0, 0, uintptr(size))
21 | if err != nil {
22 | return nil, err
23 | }
24 | data := (*[maxMmapSize]byte)(unsafe.Pointer(ptr))[:size]
25 | return data, nil
26 | }
27 |
28 | func munmap(data []byte) error {
29 | return syscall.UnmapViewOfFile(uintptr(unsafe.Pointer(&data[0])))
30 | }
31 |
32 | func madviceRandom(data []byte) error {
33 | return nil // madvise is unavailable on Windows; the random-access hint is a no-op.
34 | }
35 |
36 | func (f *osMMapFile) Truncate(size int64) error {
37 | // Truncating a memory-mapped file fails on Windows. Unmap it first.
38 | if err := f.munmap(); err != nil {
39 | return err
40 | }
41 | if err := f.File.Truncate(size); err != nil {
42 | return err
43 | }
44 | f.size = size
45 | return f.mremap()
46 | }
47 |
--------------------------------------------------------------------------------
/iterator_test.go:
--------------------------------------------------------------------------------
1 | package pogreb
2 |
3 | import (
4 | "bytes"
5 | "testing"
6 |
7 | "github.com/akrylysov/pogreb/internal/assert"
8 | )
9 |
10 | func TestIteratorEmpty(t *testing.T) {
11 | db, err := createTestDB(nil)
12 | assert.Nil(t, err)
13 | it := db.Items()
14 | for i := 0; i < 8; i++ {
15 | _, _, err := it.Next()
16 | if err != ErrIterationDone {
17 | t.Fatalf("expected %v; got %v", ErrIterationDone, err)
18 | }
19 | }
20 | assert.Nil(t, db.Close())
21 | }
22 |
23 | func TestIterator(t *testing.T) {
24 | db, err := createTestDB(nil)
25 | assert.Nil(t, err)
26 |
27 | items := map[byte]bool{}
28 | var i byte
29 | for i = 0; i < 255; i++ {
30 | items[i] = false
31 | err := db.Put([]byte{i}, []byte{i})
32 | assert.Nil(t, err)
33 | }
34 |
35 | it := db.Items()
36 | for {
37 | key, value, err := it.Next()
38 | if err == ErrIterationDone {
39 | break
40 | }
41 | assert.Nil(t, err)
42 | if _, ok := items[key[0]]; !ok {
43 | t.Fatalf("unknown key %d", key[0])
44 | }
45 | if !bytes.Equal(key, value) {
46 | t.Fatalf("expected %v; got %v", key, value)
47 | }
48 | items[key[0]] = true
49 | }
50 |
51 | for k, v := range items {
52 | if !v {
53 | t.Fatalf("expected to iterate over key %v", k)
54 | }
55 | }
56 |
57 | for i := 0; i < 8; i++ {
58 | _, _, err := it.Next()
59 | if err != ErrIterationDone {
60 | t.Fatalf("expected %v; got %v", ErrIterationDone, err)
61 | }
62 | }
63 |
64 | assert.Nil(t, db.Close())
65 | }
66 |
--------------------------------------------------------------------------------
/datalog_test.go:
--------------------------------------------------------------------------------
1 | package pogreb
2 |
3 | import (
4 | "testing"
5 |
6 | "github.com/akrylysov/pogreb/internal/assert"
7 | )
8 |
9 | func (dl *datalog) segmentMetas() []segmentMeta {
10 | var metas []segmentMeta
11 | for _, seg := range dl.segmentsBySequenceID() {
12 | metas = append(metas, *seg.meta)
13 | }
14 | return metas
15 | }
16 |
17 | func TestDatalog(t *testing.T) {
18 | db, err := createTestDB(nil)
19 | assert.Nil(t, err)
20 |
21 | _, _, err = db.datalog.put([]byte{'1'}, []byte{'1'})
22 | assert.Nil(t, err)
23 | assert.Equal(t, &segmentMeta{PutRecords: 1}, db.datalog.segments[0].meta)
24 | assert.Nil(t, db.datalog.segments[1])
25 |
26 | sm := db.datalog.segmentsBySequenceID()
27 | assert.Equal(t, []*segment{db.datalog.segments[0]}, sm)
28 |
29 | // Writing to a full segment makes the datalog switch to a new one.
30 | db.datalog.segments[0].meta.Full = true
31 | _, _, err = db.datalog.put([]byte{'1'}, []byte{'1'})
32 | assert.Nil(t, err)
33 | assert.Equal(t, &segmentMeta{PutRecords: 1, Full: true}, db.datalog.segments[0].meta)
34 | assert.Equal(t, &segmentMeta{PutRecords: 1}, db.datalog.segments[1].meta)
35 |
36 | sm = db.datalog.segmentsBySequenceID()
37 | assert.Equal(t, []*segment{db.datalog.segments[0], db.datalog.segments[1]}, sm)
38 |
39 | _, _, err = db.datalog.put([]byte{'1'}, []byte{'1'})
40 | assert.Nil(t, err)
41 | assert.Equal(t, &segmentMeta{PutRecords: 1, Full: true}, db.datalog.segments[0].meta)
42 | assert.Equal(t, &segmentMeta{PutRecords: 2}, db.datalog.segments[1].meta)
43 |
44 | assert.Nil(t, db.Close())
45 | }
46 |
--------------------------------------------------------------------------------
/fs/os.go:
--------------------------------------------------------------------------------
1 | package fs
2 |
3 | import (
4 | "os"
5 | )
6 |
7 | type osFS struct{}
8 |
9 | // OS is a file system backed by the os package.
10 | var OS FileSystem = &osFS{}
11 |
12 | func (fs *osFS) OpenFile(name string, flag int, perm os.FileMode) (File, error) {
13 | f, err := os.OpenFile(name, flag, perm)
14 | if err != nil {
15 | return nil, err
16 | }
17 | return &osFile{File: f}, nil
18 | }
19 |
20 | func (fs *osFS) CreateLockFile(name string, perm os.FileMode) (LockFile, bool, error) {
21 | return createLockFile(name, perm)
22 | }
23 |
24 | func (fs *osFS) Stat(name string) (os.FileInfo, error) {
25 | return os.Stat(name)
26 | }
27 |
28 | func (fs *osFS) Remove(name string) error {
29 | return os.Remove(name)
30 | }
31 |
32 | func (fs *osFS) Rename(oldpath, newpath string) error {
33 | return os.Rename(oldpath, newpath)
34 | }
35 |
36 | func (fs *osFS) ReadDir(name string) ([]os.DirEntry, error) {
37 | return os.ReadDir(name)
38 | }
39 |
40 | func (fs *osFS) MkdirAll(path string, perm os.FileMode) error {
41 | return os.MkdirAll(path, perm)
42 | }
43 |
44 | type osFile struct {
45 | *os.File
46 | }
47 |
48 | func (f *osFile) Slice(start int64, end int64) ([]byte, error) {
49 | buf := make([]byte, end-start)
50 | _, err := f.ReadAt(buf, start)
51 | if err != nil {
52 | return nil, err
53 | }
54 | return buf, nil
55 | }
56 |
57 | type osLockFile struct {
58 | *os.File
59 | path string
60 | }
61 |
62 | func (f *osLockFile) Unlock() error {
63 | if err := os.Remove(f.path); err != nil {
64 | return err
65 | }
66 | return f.Close()
67 | }
68 |
--------------------------------------------------------------------------------
/fs/os_windows.go:
--------------------------------------------------------------------------------
1 | //go:build windows
2 | // +build windows
3 |
4 | package fs
5 |
6 | import (
7 | "os"
8 | "syscall"
9 | "unsafe"
10 | )
11 |
12 | var (
13 | modkernel32 = syscall.NewLazyDLL("kernel32.dll")
14 | procLockFileEx = modkernel32.NewProc("LockFileEx")
15 | )
16 |
17 | const (
18 | errorLockViolation = 0x21
19 | )
20 |
21 | func lockfile(f *os.File) error {
22 | var ol syscall.Overlapped
23 |
24 | r1, _, err := syscall.Syscall6(
25 | procLockFileEx.Addr(),
26 | 6,
27 | uintptr(f.Fd()), // handle
28 | uintptr(0x0003), // LOCKFILE_FAIL_IMMEDIATELY | LOCKFILE_EXCLUSIVE_LOCK
29 | uintptr(0), // reserved
30 | uintptr(1), // locklow
31 | uintptr(0), // lockhigh
32 | uintptr(unsafe.Pointer(&ol)),
33 | )
34 | if r1 == 0 && (err == syscall.ERROR_FILE_EXISTS || err == errorLockViolation) {
35 | return os.ErrExist
36 | }
37 | return nil
38 | }
39 |
40 | func createLockFile(name string, perm os.FileMode) (LockFile, bool, error) {
41 | acquiredExisting := false
42 | if _, err := os.Stat(name); err == nil {
43 | acquiredExisting = true
44 | }
45 | fd, err := syscall.CreateFile(&(syscall.StringToUTF16(name)[0]),
46 | syscall.GENERIC_READ|syscall.GENERIC_WRITE,
47 | syscall.FILE_SHARE_READ|syscall.FILE_SHARE_WRITE|syscall.FILE_SHARE_DELETE,
48 | nil,
49 | syscall.CREATE_ALWAYS,
50 | syscall.FILE_ATTRIBUTE_NORMAL,
51 | 0)
52 | if err != nil {
53 | return nil, false, os.ErrExist
54 | }
55 | f := os.NewFile(uintptr(fd), name)
56 | if err := lockfile(f); err != nil {
57 | f.Close()
58 | return nil, false, err
59 | }
60 | return &osLockFile{f, name}, acquiredExisting, nil
61 | }
62 |
--------------------------------------------------------------------------------
/fs/sub.go:
--------------------------------------------------------------------------------
1 | package fs
2 |
3 | import (
4 | "os"
5 | "path/filepath"
6 | )
7 |
8 | // Sub returns a new file system rooted at dir.
9 | func Sub(fsys FileSystem, dir string) FileSystem {
10 | return &subFS{
11 | fsys: fsys,
12 | root: dir,
13 | }
14 | }
15 |
16 | type subFS struct {
17 | fsys FileSystem
18 | root string
19 | }
20 |
21 | func (fs *subFS) OpenFile(name string, flag int, perm os.FileMode) (File, error) {
22 | subName := filepath.Join(fs.root, name)
23 | return fs.fsys.OpenFile(subName, flag, perm)
24 | }
25 |
26 | func (fs *subFS) Stat(name string) (os.FileInfo, error) {
27 | subName := filepath.Join(fs.root, name)
28 | return fs.fsys.Stat(subName)
29 | }
30 |
31 | func (fs *subFS) Remove(name string) error {
32 | subName := filepath.Join(fs.root, name)
33 | return fs.fsys.Remove(subName)
34 | }
35 |
36 | func (fs *subFS) Rename(oldpath, newpath string) error {
37 | subOldpath := filepath.Join(fs.root, oldpath)
38 | subNewpath := filepath.Join(fs.root, newpath)
39 | return fs.fsys.Rename(subOldpath, subNewpath)
40 | }
41 |
42 | func (fs *subFS) ReadDir(name string) ([]os.DirEntry, error) {
43 | subName := filepath.Join(fs.root, name)
44 | return fs.fsys.ReadDir(subName)
45 | }
46 |
47 | func (fs *subFS) CreateLockFile(name string, perm os.FileMode) (LockFile, bool, error) {
48 | subName := filepath.Join(fs.root, name)
49 | return fs.fsys.CreateLockFile(subName, perm)
50 | }
51 |
52 | func (fs *subFS) MkdirAll(path string, perm os.FileMode) error {
53 | subPath := filepath.Join(fs.root, path)
54 | return fs.fsys.MkdirAll(subPath, perm)
55 | }
56 |
57 | var _ FileSystem = &subFS{}
58 |
--------------------------------------------------------------------------------
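
Every operation simply joins the name with the root before delegating, which is how the tests in this package sandbox each run. For example:

```go
// Files created through fsys land under /tmp/db.
fsys := fs.Sub(fs.OS, "/tmp/db")
f, err := fsys.OpenFile("lock", os.O_CREATE|os.O_RDWR, os.FileMode(0640))
```
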
/options.go:
--------------------------------------------------------------------------------
1 | package pogreb
2 |
3 | import (
4 | "math"
5 | "time"
6 |
7 | "github.com/akrylysov/pogreb/fs"
8 | )
9 |
10 | // Options holds the optional DB parameters.
11 | type Options struct {
12 | // BackgroundSyncInterval sets the amount of time between background Sync() calls.
13 | //
14 | // Setting the value to 0 disables the automatic background synchronization.
15 | // Setting the value to -1 makes the DB call Sync() after every write operation.
16 | // Default: 0
17 | BackgroundSyncInterval time.Duration
18 |
19 | // BackgroundCompactionInterval sets the amount of time between background Compact() calls.
20 | //
21 | // Setting the value to 0 disables the automatic background compaction.
22 | // Default: 0
23 | BackgroundCompactionInterval time.Duration
24 |
25 | // FileSystem sets the file system implementation.
26 | //
27 | // Default: fs.OSMMap.
28 | FileSystem fs.FileSystem
29 | rootFS fs.FileSystem
30 |
31 | maxSegmentSize uint32
32 | compactionMinSegmentSize uint32
33 | compactionMinFragmentation float32
34 | }
35 |
36 | func (src *Options) copyWithDefaults(path string) *Options {
37 | opts := Options{}
38 | if src != nil {
39 | opts = *src
40 | }
41 | if opts.FileSystem == nil {
42 | opts.FileSystem = fs.DefaultFileSystem()
43 | }
44 | opts.rootFS = opts.FileSystem
45 | opts.FileSystem = fs.Sub(opts.FileSystem, path)
46 | if opts.maxSegmentSize == 0 {
47 | opts.maxSegmentSize = math.MaxUint32
48 | }
49 | if opts.compactionMinSegmentSize == 0 {
50 | opts.compactionMinSegmentSize = 32 << 20
51 | }
52 | if opts.compactionMinFragmentation == 0 {
53 | opts.compactionMinFragmentation = 0.5
54 | }
55 | return &opts
56 | }
57 |
--------------------------------------------------------------------------------
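
A sketch of the exported knobs in use; the unexported compaction fields are settable only from within the package and fall back to the defaults above:

```go
db, err := pogreb.Open("pogreb.test", &pogreb.Options{
	// Sync once a second in the background instead of relying on the OS.
	BackgroundSyncInterval: time.Second,
	// Compact every ten minutes in the background.
	BackgroundCompactionInterval: 10 * time.Minute,
	// Opt out of memory-mapped file access.
	FileSystem: fs.OS,
})
if err != nil {
	log.Fatal(err)
}
defer db.Close()
```
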
/CHANGELOG.md:
--------------------------------------------------------------------------------
1 | # Changelog
2 |
3 | ## [0.10.2] - 2023-12-10
4 | ### Fixed
5 | - Fix an edge case causing recovery to fail.
6 |
7 | ## [0.10.1] - 2021-05-01
8 | ### Changed
9 | - Improve error reporting.
10 | ### Fixed
11 | - Fix compilation for 32-bit OS.
12 |
13 | ## [0.10.0] - 2021-02-09
14 | ### Added
15 | - Memory-mapped file access can now be disabled by setting `Options.FileSystem` to `fs.OS`.
16 | ### Changed
17 | - The default file system implementation is changed to `fs.OSMMap`.
18 |
19 | ## [0.9.2] - 2021-01-01
20 | ### Changed
21 | - The write-ahead log no longer relies on wall-clock time, which prevents potential race conditions during compaction and recovery.
22 | ### Fixed
23 | - Fix recovery writing extra delete records.
24 |
25 | ## [0.9.1] - 2020-04-03
26 | ### Changed
27 | - Improve Go 1.14 compatibility (remove "unsafe" usage).
28 |
29 | ## [0.9.0] - 2020-03-08
30 | ### Changed
31 | - Replace the unstructured data file for storing key-value pairs with a write-ahead log.
32 | ### Added
33 | - In the event of a crash or a power loss the database is automatically recovered.
34 | - Optional background compaction allows reclaiming disk space occupied by overwritten or deleted keys.
35 | ### Fixed
36 | - Fix disk space overhead when storing small keys and values.
37 |
38 | ## [0.8.3] - 2019-11-03
39 | ### Fixed
40 | - Fix slice bounds out of range error mapping files on Windows.
41 |
42 | ## [0.8.2] - 2019-09-04
43 | ### Fixed
44 | - Fix a race condition that could lead to data corruption.
45 |
46 | ## [0.8.1] - 2019-06-30
47 | ### Fixed
48 | - Fix panic when accessing closed database.
49 | - Return error opening invalid database.
50 |
51 | ## [0.8] - 2019-03-30
52 | ### Changed
53 | - ~2x write performance improvement on non-Windows.
54 |
55 | ## [0.7] - 2019-03-23
56 | ### Added
57 | - Windows support (@mattn).
58 | ### Changed
59 | - Improve freelist performance.
60 |
--------------------------------------------------------------------------------
/backup_test.go:
--------------------------------------------------------------------------------
1 | package pogreb
2 |
3 | import (
4 | "testing"
5 |
6 | "github.com/akrylysov/pogreb/internal/assert"
7 | )
8 |
9 | const testDBBackupName = testDBName + ".backup"
10 |
11 | func TestBackup(t *testing.T) {
12 | opts := &Options{
13 | maxSegmentSize: 1024,
14 | compactionMinSegmentSize: 520,
15 | compactionMinFragmentation: 0.02,
16 | }
17 |
18 | run := func(name string, f func(t *testing.T, db *DB)) bool {
19 | return t.Run(name, func(t *testing.T) {
20 | db, err := createTestDB(opts)
21 | assert.Nil(t, err)
22 | f(t, db)
23 | assert.Nil(t, db.Close())
24 | _ = cleanDir(testDBBackupName)
25 | })
26 | }
27 |
28 | run("empty", func(t *testing.T, db *DB) {
29 | assert.Nil(t, db.Backup(testDBBackupName))
30 | db2, err := Open(testDBBackupName, opts)
31 | assert.Nil(t, err)
32 | assert.Nil(t, db2.Close())
33 | })
34 |
35 | run("single segment", func(t *testing.T, db *DB) {
36 | assert.Nil(t, db.Put([]byte{0}, []byte{0}))
37 | assert.Equal(t, 1, countSegments(t, db))
38 | assert.Nil(t, db.Backup(testDBBackupName))
39 | db2, err := Open(testDBBackupName, opts)
40 | assert.Nil(t, err)
41 | v, err := db2.Get([]byte{0})
42 | assert.Equal(t, []byte{0}, v)
43 | assert.Nil(t, err)
44 | assert.Nil(t, db2.Close())
45 | })
46 |
47 | run("multiple segments", func(t *testing.T, db *DB) {
48 | for i := byte(0); i < 100; i++ {
49 | assert.Nil(t, db.Put([]byte{i}, []byte{i}))
50 | }
51 | assert.Equal(t, 3, countSegments(t, db))
52 | assert.Nil(t, db.Backup(testDBBackupName))
53 | db2, err := Open(testDBBackupName, opts)
54 | assert.Equal(t, 3, countSegments(t, db2))
55 | assert.Nil(t, err)
56 | for i := byte(0); i < 100; i++ {
57 | v, err := db2.Get([]byte{i})
58 | assert.Nil(t, err)
59 | assert.Equal(t, []byte{i}, v)
60 | }
61 | assert.Nil(t, db2.Close())
62 | })
63 | }
64 |
--------------------------------------------------------------------------------
/fs/fs.go:
--------------------------------------------------------------------------------
1 | /*
2 | Package fs provides a file system interface.
3 | */
4 | package fs
5 |
6 | import (
7 | "errors"
8 | "io"
9 | "os"
10 | )
11 |
12 | var (
13 | errAppendModeNotSupported = errors.New("append mode is not supported")
14 | )
15 |
16 | // File is the interface compatible with os.File.
17 | // Methods are not safe for concurrent use, except for ReadAt, Slice and Stat.
18 | type File interface {
19 | io.Closer
20 | io.Reader
21 | io.ReaderAt
22 | io.Seeker
23 | io.Writer
24 | io.WriterAt
25 |
26 | // Stat returns os.FileInfo describing the file.
27 | Stat() (os.FileInfo, error)
28 |
29 | // Sync commits the current contents of the file.
30 | Sync() error
31 |
32 | // Truncate changes the size of the file.
33 | Truncate(size int64) error
34 |
35 | // Slice reads and returns the contents of file from offset start to offset end.
36 | Slice(start int64, end int64) ([]byte, error)
37 | }
38 |
39 | // LockFile represents a lock file.
40 | type LockFile interface {
41 | // Unlock unlocks and removes the lock file.
42 | Unlock() error
43 | }
44 |
45 | // FileSystem represents a file system.
46 | type FileSystem interface {
47 | // OpenFile opens the file with specified flag.
48 | OpenFile(name string, flag int, perm os.FileMode) (File, error)
49 |
50 | // Stat returns os.FileInfo describing the file.
51 | Stat(name string) (os.FileInfo, error)
52 |
53 | // Remove removes the file.
54 | Remove(name string) error
55 |
56 | // Rename renames oldpath to newpath.
57 | Rename(oldpath, newpath string) error
58 |
59 | // ReadDir reads the directory and returns a list of directory entries.
60 | ReadDir(name string) ([]os.DirEntry, error)
61 |
62 | // CreateLockFile creates a lock file.
63 | CreateLockFile(name string, perm os.FileMode) (LockFile, bool, error)
64 |
65 | // MkdirAll creates a directory named path.
66 | MkdirAll(path string, perm os.FileMode) error
67 | }
68 |
--------------------------------------------------------------------------------
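
Any implementation of this interface can back a database; the package ships three (`OS`, `OSMMap` and `Mem`), and `file_test.go` below wires in an always-failing one to exercise error paths. Reading through the interface looks the same regardless of the backend:

```go
// "data.psg" is a hypothetical file name. With fs.OS, Slice issues a ReadAt;
// with fs.OSMMap, it returns a sub-slice of the memory mapping.
f, err := fs.OS.OpenFile("data.psg", os.O_RDONLY, 0)
if err != nil {
	log.Fatal(err)
}
defer f.Close()
header, err := f.Slice(0, 512)
```
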
/iterator.go:
--------------------------------------------------------------------------------
1 | package pogreb
2 |
3 | import (
4 | "errors"
5 | "sync"
6 | )
7 |
8 | // ErrIterationDone is returned by ItemIterator.Next calls when there are no more items to return.
9 | var ErrIterationDone = errors.New("no more items in iterator")
10 |
11 | type item struct {
12 | key []byte
13 | value []byte
14 | }
15 |
16 | // ItemIterator is an iterator over DB key-value pairs. It iterates the items in an unspecified order.
17 | type ItemIterator struct {
18 | db *DB
19 | nextBucketIdx uint32
20 | queue []item
21 | mu sync.Mutex
22 | }
23 |
24 | // fetchItems adds items to the iterator queue from a bucket located at nextBucketIdx.
25 | func (it *ItemIterator) fetchItems(nextBucketIdx uint32) error {
26 | bit := it.db.index.newBucketIterator(nextBucketIdx)
27 | for {
28 | b, err := bit.next()
29 | if err == ErrIterationDone {
30 | return nil
31 | }
32 | if err != nil {
33 | return err
34 | }
35 | for i := 0; i < slotsPerBucket; i++ {
36 | sl := b.slots[i]
37 | if sl.offset == 0 {
38 | // No more items in the bucket.
39 | break
40 | }
41 | key, value, err := it.db.datalog.readKeyValue(sl)
42 | if err != nil {
43 | return err
44 | }
45 | key = cloneBytes(key)
46 | value = cloneBytes(value)
47 | it.queue = append(it.queue, item{key: key, value: value})
48 | }
49 | }
50 | }
51 |
52 | // Next returns the next key-value pair if available, otherwise it returns ErrIterationDone error.
53 | func (it *ItemIterator) Next() ([]byte, []byte, error) {
54 | it.mu.Lock()
55 | defer it.mu.Unlock()
56 |
57 | it.db.mu.RLock()
58 | defer it.db.mu.RUnlock()
59 |
60 | // While the queue is empty and buckets remain, fetch items from the next bucket.
61 | for len(it.queue) == 0 && it.nextBucketIdx < it.db.index.numBuckets {
62 | if err := it.fetchItems(it.nextBucketIdx); err != nil {
63 | return nil, nil, err
64 | }
65 | it.nextBucketIdx++
66 | }
67 |
68 | if len(it.queue) > 0 {
69 | item := it.queue[0]
70 | it.queue = it.queue[1:]
71 | return item.key, item.value, nil
72 | }
73 |
74 | return nil, nil, ErrIterationDone
75 | }
76 |
--------------------------------------------------------------------------------
/backup.go:
--------------------------------------------------------------------------------
1 | package pogreb
2 |
3 | import (
4 | "io"
5 | "os"
6 |
7 | "github.com/akrylysov/pogreb/fs"
8 | )
9 |
10 | func touchFile(fsys fs.FileSystem, path string) error {
11 | f, err := fsys.OpenFile(path, os.O_CREATE|os.O_TRUNC, os.FileMode(0640))
12 | if err != nil {
13 | return err
14 | }
15 | return f.Close()
16 | }
17 |
18 | // Backup creates a database backup at the specified path.
19 | func (db *DB) Backup(path string) error {
20 | // Make sure the compaction is not running during backup.
21 | db.maintenanceMu.Lock()
22 | defer db.maintenanceMu.Unlock()
23 |
24 | if err := db.opts.rootFS.MkdirAll(path, 0755); err != nil {
25 | return err
26 | }
27 |
28 | db.mu.RLock()
29 | var segments []*segment
30 | activeSegmentSizes := make(map[uint16]int64)
31 | for _, seg := range db.datalog.segmentsBySequenceID() {
32 | segments = append(segments, seg)
33 | if !seg.meta.Full {
34 | // Save the size of each active segment so that only the data persisted
35 | // up to the point the backup started is copied.
36 | activeSegmentSizes[seg.id] = seg.size
37 | }
38 | }
39 | db.mu.RUnlock()
40 |
41 | srcFS := db.opts.FileSystem
42 | dstFS := fs.Sub(db.opts.rootFS, path)
43 |
44 | for _, seg := range segments {
45 | name := segmentName(seg.id, seg.sequenceID)
46 | mode := os.FileMode(0640)
47 | srcFile, err := srcFS.OpenFile(name, os.O_RDONLY, mode)
48 | if err != nil {
49 | return err
50 | }
51 |
52 | dstFile, err := dstFS.OpenFile(name, os.O_CREATE|os.O_RDWR|os.O_TRUNC, mode)
53 | if err != nil {
54 | return err
55 | }
56 |
57 | if srcSize, ok := activeSegmentSizes[seg.id]; ok {
58 | if _, err := io.CopyN(dstFile, srcFile, srcSize); err != nil {
59 | return err
60 | }
61 | } else {
62 | if _, err := io.Copy(dstFile, srcFile); err != nil {
63 | return err
64 | }
65 | }
66 |
67 | if err := srcFile.Close(); err != nil {
68 | return err
69 | }
70 | if err := dstFile.Close(); err != nil {
71 | return err
72 | }
73 | }
74 |
75 | if err := touchFile(dstFS, lockName); err != nil {
76 | return err
77 | }
78 |
79 | return nil
80 | }
81 |
--------------------------------------------------------------------------------
/internal/assert/assert.go:
--------------------------------------------------------------------------------
1 | package assert
2 |
3 | import (
4 | "reflect"
5 | "testing"
6 | "time"
7 | )
8 |
9 | // Equal fails the test when expected is not equal to actual.
10 | func Equal(t testing.TB, expected interface{}, actual interface{}) {
11 | if !reflect.DeepEqual(expected, actual) {
12 | t.Helper()
13 | t.Fatalf("expected %+v; got %+v", expected, actual)
14 | }
15 | }
16 |
17 | // https://github.com/golang/go/blob/go1.15/src/reflect/value.go#L1071
18 | var nillableKinds = map[reflect.Kind]bool{
19 | reflect.Chan: true,
20 | reflect.Func: true,
21 | reflect.Map: true,
22 | reflect.Ptr: true,
23 | reflect.UnsafePointer: true,
24 | reflect.Interface: true,
25 | reflect.Slice: true,
26 | }
27 |
28 | // Nil fails the test when obj is not nil.
29 | func Nil(t testing.TB, obj interface{}) {
30 | if obj == nil {
31 | return
32 | }
33 | val := reflect.ValueOf(obj)
34 | if !nillableKinds[val.Kind()] || !val.IsNil() {
35 | t.Helper()
36 | t.Fatalf("expected nil; got %+v", obj)
37 | }
38 | }
39 |
40 | // NotNil fails the test when obj is nil.
41 | func NotNil(t testing.TB, obj interface{}) {
42 | val := reflect.ValueOf(obj)
43 | if obj == nil || (nillableKinds[val.Kind()] && val.IsNil()) {
44 | t.Helper()
45 | t.Fatalf("expected not nil; got %+v", obj)
46 | }
47 | }
48 |
49 | const pollingInterval = time.Millisecond * 10 // How often CompleteWithin polls the cond function.
50 |
51 | // CompleteWithin fails the test when cond doesn't succeed within waitDur.
52 | func CompleteWithin(t testing.TB, waitDur time.Duration, cond func() bool) {
53 | start := time.Now()
54 | for time.Since(start) < waitDur {
55 | if cond() {
56 | return
57 | }
58 | time.Sleep(pollingInterval)
59 | }
60 | t.Helper()
61 | t.Fatalf("expected to complete within %v", waitDur)
62 | }
63 |
64 | // Panic fails the test when the test doesn't panic with the expected message.
65 | func Panic(t testing.TB, expectedMessage string, f func()) {
66 | t.Helper()
67 | var message interface{}
68 | func() {
69 | defer func() {
70 | message = recover()
71 | }()
72 | f()
73 | }()
74 | Equal(t, expectedMessage, message)
75 | }
76 |
--------------------------------------------------------------------------------
/file.go:
--------------------------------------------------------------------------------
1 | package pogreb
2 |
3 | import (
4 | "io"
5 | "os"
6 |
7 | "github.com/akrylysov/pogreb/fs"
8 | )
9 |
10 | // file is a database file.
11 | // When stored in a file system, the file starts with a header.
12 | type file struct {
13 | fs.File
14 | size int64
15 | }
16 |
17 | type openFileFlags struct {
18 | truncate bool
19 | readOnly bool
20 | }
21 |
22 | func openFile(fsyst fs.FileSystem, name string, flags openFileFlags) (*file, error) {
23 | var flag int
24 | if flags.readOnly {
25 | flag = os.O_RDONLY
26 | } else {
27 | flag = os.O_CREATE | os.O_RDWR
28 | if flags.truncate {
29 | flag |= os.O_TRUNC
30 | }
31 | }
32 | fi, err := fsyst.OpenFile(name, flag, os.FileMode(0640))
33 | f := &file{}
34 | if err != nil {
35 | return f, err
36 | }
37 | clean := fi.Close
38 | defer func() {
39 | if clean != nil {
40 | _ = clean()
41 | }
42 | }()
43 | f.File = fi
44 | stat, err := fi.Stat()
45 | if err != nil {
46 | return f, err
47 | }
48 | f.size = stat.Size()
49 | if f.size == 0 {
50 | // It's a new file - write header.
51 | if err := f.writeHeader(); err != nil {
52 | return nil, err
53 | }
54 | } else {
55 | if err := f.readHeader(); err != nil {
56 | return nil, err
57 | }
58 | }
59 | if _, err := f.Seek(int64(headerSize), io.SeekStart); err != nil {
60 | return nil, err
61 | }
62 | clean = nil
63 | return f, nil
64 | }
65 |
66 | func (f *file) writeHeader() error {
67 | h := newHeader()
68 | data, err := h.MarshalBinary()
69 | if err != nil {
70 | return err
71 | }
72 | if _, err = f.append(data); err != nil {
73 | return err
74 | }
75 | return nil
76 | }
77 |
78 | func (f *file) readHeader() error {
79 | h := &header{}
80 | buf := make([]byte, headerSize)
81 | if _, err := io.ReadFull(f, buf); err != nil {
82 | return err
83 | }
84 | return h.UnmarshalBinary(buf)
85 | }
86 |
87 | func (f *file) empty() bool {
88 | return f.size == int64(headerSize)
89 | }
90 |
91 | func (f *file) extend(size uint32) (int64, error) {
92 | off := f.size
93 | if err := f.Truncate(off + int64(size)); err != nil {
94 | return 0, err
95 | }
96 | f.size += int64(size)
97 | return off, nil
98 | }
99 |
100 | func (f *file) append(data []byte) (int64, error) {
101 | off := f.size
102 | if _, err := f.WriteAt(data, off); err != nil {
103 | return 0, err
104 | }
105 | f.size += int64(len(data))
106 | return off, nil
107 | }
108 |
--------------------------------------------------------------------------------
/file_test.go:
--------------------------------------------------------------------------------
1 | package pogreb
2 |
3 | import (
4 | "errors"
5 | "os"
6 | "time"
7 |
8 | "github.com/akrylysov/pogreb/fs"
9 | )
10 |
11 | type errfs struct{}
12 |
13 | func (fs *errfs) OpenFile(name string, flag int, perm os.FileMode) (fs.File, error) {
14 | return &errfile{}, nil
15 | }
16 |
17 | func (fs *errfs) CreateLockFile(name string, perm os.FileMode) (fs.LockFile, bool, error) {
18 | return &errfile{}, false, nil
19 | }
20 |
21 | func (fs *errfs) Stat(name string) (os.FileInfo, error) {
22 | return nil, errfileError
23 | }
24 |
25 | func (fs *errfs) Remove(name string) error {
26 | return errfileError
27 | }
28 |
29 | func (fs *errfs) Rename(oldpath, newpath string) error {
30 | return errfileError
31 | }
32 |
33 | func (fs *errfs) ReadDir(name string) ([]os.DirEntry, error) {
34 | return nil, errfileError
35 | }
36 |
37 | func (fs *errfs) MkdirAll(path string, perm os.FileMode) error {
38 | return errfileError
39 | }
40 |
41 | type errfile struct{}
42 |
43 | var errfileError = errors.New("errfile error")
44 |
45 | func (m *errfile) Close() error {
46 | return errfileError
47 | }
48 |
49 | func (m *errfile) Unlock() error {
50 | return errfileError
51 | }
52 |
53 | func (m *errfile) ReadAt(p []byte, off int64) (int, error) {
54 | return 0, errfileError
55 | }
56 |
57 | func (m *errfile) Read(p []byte) (int, error) {
58 | return 0, errfileError
59 | }
60 |
61 | func (m *errfile) WriteAt(p []byte, off int64) (int, error) {
62 | return 0, errfileError
63 | }
64 |
65 | func (m *errfile) Write(p []byte) (int, error) {
66 | return 0, errfileError
67 | }
68 |
69 | func (m *errfile) Seek(offset int64, whence int) (int64, error) {
70 | return 0, errfileError
71 | }
72 |
73 | func (m *errfile) Stat() (os.FileInfo, error) {
74 | return nil, errfileError
75 | }
76 |
77 | func (m *errfile) Sync() error {
78 | return errfileError
79 | }
80 |
81 | func (m *errfile) Truncate(size int64) error {
82 | return errfileError
83 | }
84 |
85 | func (m *errfile) Name() string {
86 | return "errfile"
87 | }
88 |
89 | func (m *errfile) Size() int64 {
90 | return 0
91 | }
92 |
93 | func (m *errfile) Mode() os.FileMode {
94 | return os.FileMode(0)
95 | }
96 |
97 | func (m *errfile) ModTime() time.Time {
98 | return time.Now()
99 | }
100 |
101 | func (m *errfile) IsDir() bool {
102 | return false
103 | }
104 |
105 | func (m *errfile) Sys() interface{} {
106 | return errfileError
107 | }
108 |
109 | func (m *errfile) Slice(start int64, end int64) ([]byte, error) {
110 | return nil, errfileError
111 | }
112 |
113 | func (m *errfile) Mmap(fileSize int64, mappingSize int64) error {
114 | return errfileError
115 | }
116 |
117 | func (m *errfile) Munmap() error {
118 | return errfileError
119 | }
120 |
121 | // Compile-time interface assertion.
122 | var _ fs.File = &errfile{}
123 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | # Pogreb
4 | [](https://pkg.go.dev/github.com/akrylysov/pogreb)
5 | [](https://github.com/akrylysov/pogreb/actions)
6 | [](https://goreportcard.com/report/github.com/akrylysov/pogreb)
7 | [](https://codecov.io/gh/akrylysov/pogreb)
8 |
9 | Pogreb is an embedded key-value store for read-heavy workloads written in Go.
10 |
11 | ## Key characteristics
12 |
13 | - 100% Go.
14 | - Optimized for fast random lookups and infrequent bulk inserts.
15 | - Can store larger-than-memory data sets.
16 | - Low memory usage.
17 | - All DB methods are safe for concurrent use by multiple goroutines.
18 |
19 | ## Installation
20 |
21 | ```sh
22 | $ go get -u github.com/akrylysov/pogreb
23 | ```
24 |
25 | ## Usage
26 |
27 | ### Opening a database
28 |
29 | To open or create a new database, use the `pogreb.Open()` function:
30 |
31 | ```go
32 | package main
33 |
34 | import (
35 | "log"
36 |
37 | "github.com/akrylysov/pogreb"
38 | )
39 |
40 | func main() {
41 | db, err := pogreb.Open("pogreb.test", nil)
42 | if err != nil {
43 | log.Fatal(err)
44 | return
45 | }
46 | defer db.Close()
47 | }
48 | ```
49 |
50 | ### Writing to a database
51 |
52 | Use the `DB.Put()` function to insert a new key-value pair:
53 |
54 | ```go
55 | err := db.Put([]byte("testKey"), []byte("testValue"))
56 | if err != nil {
57 | log.Fatal(err)
58 | }
59 | ```
60 |
61 | ### Reading from a database
62 |
63 | To retrieve the inserted value, use the `DB.Get()` function:
64 |
65 | ```go
66 | val, err := db.Get([]byte("testKey"))
67 | if err != nil {
68 | log.Fatal(err)
69 | }
70 | log.Printf("%s", val)
71 | ```
72 |
73 | ### Deleting from a database
74 |
75 | Use the `DB.Delete()` function to delete a key-value pair:
76 |
77 | ```go
78 | err := db.Delete([]byte("testKey"))
79 | if err != nil {
80 | log.Fatal(err)
81 | }
82 | ```
83 |
84 | ### Iterating over items
85 |
86 | To iterate over items, use `ItemIterator` returned by `DB.Items()`:
87 |
88 | ```go
89 | it := db.Items()
90 | for {
91 | key, val, err := it.Next()
92 | if err == pogreb.ErrIterationDone {
93 | break
94 | }
95 | if err != nil {
96 | log.Fatal(err)
97 | }
98 | log.Printf("%s %s", key, val)
99 | }
100 | ```
101 |
102 | ## Performance
103 |
104 | The benchmarking code can be found in the [pogreb-bench](https://github.com/akrylysov/pogreb-bench) repository.
105 |
106 | Results of a read performance benchmark comparing pogreb, goleveldb, bolt and badgerdb
107 | on a DigitalOcean droplet (8 CPUs / 16 GB RAM / 160 GB SSD, Ubuntu 16.04.3); higher is better:
108 |
109 | 
110 |
111 | ## Internals
112 |
113 | [Design document](/docs/design.md).
114 |
115 | ## Limitations
116 |
117 | The design choices made to optimize for point lookups bring limitations for other potential use-cases. For example, using a hash table for indexing makes range scans impossible. Additionally, having a single hash table shared across all WAL segments makes the recovery process require rebuilding the entire index, which may be impractical for large databases.
--------------------------------------------------------------------------------
/bucket.go:
--------------------------------------------------------------------------------
1 | package pogreb
2 |
3 | import (
4 | "encoding/binary"
5 | )
6 |
7 | const (
8 | bucketSize = 512
9 | slotsPerBucket = 31 // Maximum number of slots that fit in a 512-byte bucket.
10 | )
11 |
12 | // slot corresponds to a single item in the hash table.
13 | type slot struct {
14 | hash uint32
15 | segmentID uint16
16 | keySize uint16
17 | valueSize uint32
18 | offset uint32 // Offset of the record in a segment.
19 | }
20 |
21 | func (sl slot) kvSize() uint32 {
22 | return uint32(sl.keySize) + sl.valueSize
23 | }
24 |
25 | // bucket is an array of slots.
26 | type bucket struct {
27 | slots [slotsPerBucket]slot
28 | next int64 // Offset of overflow bucket.
29 | }
30 |
31 | // bucketHandle is a bucket, plus its offset and the file it's written to.
32 | type bucketHandle struct {
33 | bucket
34 | file *file
35 | offset int64
36 | }
37 |
38 | func (b bucket) MarshalBinary() ([]byte, error) {
39 | buf := make([]byte, bucketSize)
40 | data := buf
41 | for i := 0; i < slotsPerBucket; i++ {
42 | sl := b.slots[i]
43 | binary.LittleEndian.PutUint32(buf[:4], sl.hash)
44 | binary.LittleEndian.PutUint16(buf[4:6], sl.segmentID)
45 | binary.LittleEndian.PutUint16(buf[6:8], sl.keySize)
46 | binary.LittleEndian.PutUint32(buf[8:12], sl.valueSize)
47 | binary.LittleEndian.PutUint32(buf[12:16], sl.offset)
48 | buf = buf[16:]
49 | }
50 | binary.LittleEndian.PutUint64(buf[:8], uint64(b.next))
51 | return data, nil
52 | }
53 |
54 | func (b *bucket) UnmarshalBinary(data []byte) error {
55 | for i := 0; i < slotsPerBucket; i++ {
56 | _ = data[16] // bounds check hint to compiler; see golang.org/issue/14808
57 | b.slots[i].hash = binary.LittleEndian.Uint32(data[:4])
58 | b.slots[i].segmentID = binary.LittleEndian.Uint16(data[4:6])
59 | b.slots[i].keySize = binary.LittleEndian.Uint16(data[6:8])
60 | b.slots[i].valueSize = binary.LittleEndian.Uint32(data[8:12])
61 | b.slots[i].offset = binary.LittleEndian.Uint32(data[12:16])
62 | data = data[16:]
63 | }
64 | b.next = int64(binary.LittleEndian.Uint64(data[:8]))
65 | return nil
66 | }
67 |
68 | func (b *bucket) del(slotIdx int) {
69 | i := slotIdx
70 | // Shift slots.
71 | for ; i < slotsPerBucket-1; i++ {
72 | b.slots[i] = b.slots[i+1]
73 | }
74 | b.slots[i] = slot{}
75 | }
76 |
77 | func (b *bucketHandle) read() error {
78 | buf, err := b.file.Slice(b.offset, b.offset+int64(bucketSize))
79 | if err != nil {
80 | return err
81 | }
82 | return b.UnmarshalBinary(buf)
83 | }
84 |
85 | func (b *bucketHandle) write() error {
86 | buf, err := b.MarshalBinary()
87 | if err != nil {
88 | return err
89 | }
90 | _, err = b.file.WriteAt(buf, b.offset)
91 | return err
92 | }
93 |
94 | // slotWriter inserts and writes slots into a bucket.
95 | type slotWriter struct {
96 | bucket *bucketHandle
97 | slotIdx int
98 | prevBuckets []*bucketHandle
99 | }
100 |
101 | func (sw *slotWriter) insert(sl slot, idx *index) error {
102 | if sw.slotIdx == slotsPerBucket {
103 | // Bucket is full, create a new overflow bucket.
104 | nextBucket, err := idx.createOverflowBucket()
105 | if err != nil {
106 | return err
107 | }
108 | sw.bucket.next = nextBucket.offset
109 | sw.prevBuckets = append(sw.prevBuckets, sw.bucket)
110 | sw.bucket = nextBucket
111 | sw.slotIdx = 0
112 | }
113 | sw.bucket.slots[sw.slotIdx] = sl
114 | sw.slotIdx++
115 | return nil
116 | }
117 |
118 | func (sw *slotWriter) write() error {
119 | // Write previous buckets first.
120 | for i := len(sw.prevBuckets) - 1; i >= 0; i-- {
121 | if err := sw.prevBuckets[i].write(); err != nil {
122 | return err
123 | }
124 | }
125 | return sw.bucket.write()
126 | }
127 |
--------------------------------------------------------------------------------
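
The layout arithmetic behind the two constants above: each slot serializes to a fixed 16 bytes, so 31 slots plus the 8-byte overflow offset fit in a bucket with 8 bytes to spare.

```go
const slotEncodedSize = 16 // 4 (hash) + 2 (segmentID) + 2 (keySize) + 4 (valueSize) + 4 (offset)
const bucketUsedBytes = slotsPerBucket*slotEncodedSize + 8 // 31*16 + 8 = 504 <= bucketSize (512)
```
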
/fs/os_mmap.go:
--------------------------------------------------------------------------------
1 | //go:build !plan9
2 |
3 | package fs
4 |
5 | import (
6 | "io"
7 | "os"
8 | )
9 |
10 | const (
11 | initialMmapSize = 1024 << 20 // 1 GiB
12 | )
13 |
14 | type osMMapFS struct {
15 | osFS
16 | }
17 |
18 | // OSMMap is a file system backed by the os package and memory-mapped files.
19 | var OSMMap FileSystem = &osMMapFS{}
20 |
21 | func (fs *osMMapFS) OpenFile(name string, flag int, perm os.FileMode) (File, error) {
22 | if flag&os.O_APPEND != 0 {
23 | // osMMapFS doesn't support opening files in append-only mode.
24 | // The database doesn't currently use O_APPEND.
25 | return nil, errAppendModeNotSupported
26 | }
27 | f, err := os.OpenFile(name, flag, perm)
28 | if err != nil {
29 | return nil, err
30 | }
31 |
32 | stat, err := f.Stat()
33 | if err != nil {
34 | return nil, err
35 | }
36 |
37 | mf := &osMMapFile{
38 | File: f,
39 | size: stat.Size(),
40 | }
41 | if err := mf.mremap(); err != nil {
42 | return nil, err
43 | }
44 | return mf, nil
45 | }
46 |
47 | type osMMapFile struct {
48 | *os.File
49 | data []byte
50 | offset int64
51 | size int64
52 | mmapSize int64
53 | }
54 |
55 | func (f *osMMapFile) WriteAt(p []byte, off int64) (int, error) {
56 | n, err := f.File.WriteAt(p, off)
57 | if err != nil {
58 | return 0, err
59 | }
60 | writeOff := off + int64(n)
61 | if writeOff > f.size {
62 | f.size = writeOff
63 | }
64 | return n, f.mremap()
65 | }
66 |
67 | func (f *osMMapFile) Write(p []byte) (int, error) {
68 | n, err := f.File.Write(p)
69 | if err != nil {
70 | return 0, err
71 | }
72 | f.offset += int64(n)
73 | if f.offset > f.size {
74 | f.size = f.offset
75 | }
76 | return n, f.mremap()
77 | }
78 |
79 | func (f *osMMapFile) Seek(offset int64, whence int) (int64, error) {
80 | off, err := f.File.Seek(offset, whence)
81 | f.offset = off
82 | return off, err
83 | }
84 |
85 | func (f *osMMapFile) Read(p []byte) (int, error) {
86 | n, err := f.File.Read(p)
87 | f.offset += int64(n)
88 | return n, err
89 | }
90 |
91 | func (f *osMMapFile) Slice(start int64, end int64) ([]byte, error) {
92 | if end > f.size {
93 | return nil, io.EOF
94 | }
95 | if f.data == nil {
96 | return nil, os.ErrClosed
97 | }
98 | return f.data[start:end], nil
99 | }
100 |
101 | func (f *osMMapFile) munmap() error {
102 | if f.data == nil {
103 | return nil
104 | }
105 | if err := munmap(f.data); err != nil {
106 | return err
107 | }
108 | f.data = nil
109 | f.mmapSize = 0
110 | return nil
111 | }
112 |
113 | func (f *osMMapFile) mmap(fileSize int64, mappingSize int64) error {
114 | if f.data != nil {
115 | if err := munmap(f.data); err != nil {
116 | return err
117 | }
118 | }
119 |
120 | data, err := mmap(f.File, fileSize, mappingSize)
121 | if err != nil {
122 | return err
123 | }
124 |
125 | _ = madviceRandom(data)
126 |
127 | f.data = data
128 | return nil
129 | }
130 |
131 | func (f *osMMapFile) mremap() error {
132 | mmapSize := f.mmapSize
133 |
134 | if mmapSize >= f.size {
135 | return nil
136 | }
137 |
138 | if mmapSize == 0 {
139 | mmapSize = initialMmapSize
140 | if mmapSize < f.size {
141 | mmapSize = f.size
142 | }
143 | } else {
144 | if err := f.munmap(); err != nil {
145 | return err
146 | }
147 | 		for mmapSize < f.size { mmapSize *= 2 } // Keep doubling until the mapping covers the file.
148 | }
149 |
150 | if err := f.mmap(f.size, mmapSize); err != nil {
151 | return err
152 | }
153 |
154 | // On Windows mmap may memory-map less than the requested size.
155 | f.mmapSize = int64(len(f.data))
156 |
157 | return nil
158 | }
159 |
160 | func (f *osMMapFile) Close() error {
161 | if err := f.munmap(); err != nil {
162 | return err
163 | }
164 | return f.File.Close()
165 | }
166 |
167 | // DefaultFileSystem returns the default FileSystem for this platform.
168 | func DefaultFileSystem() FileSystem {
169 | return OSMMap
170 | }
171 |
--------------------------------------------------------------------------------
/recovery.go:
--------------------------------------------------------------------------------
1 | package pogreb
2 |
3 | import (
4 | "io"
5 | "path/filepath"
6 |
7 | "github.com/akrylysov/pogreb/fs"
8 | )
9 |
10 | const (
11 | recoveryBackupExt = ".bac"
12 | )
13 |
14 | func backupNonsegmentFiles(fsys fs.FileSystem) error {
15 | logger.Println("moving non-segment files...")
16 |
17 | files, err := fsys.ReadDir(".")
18 | if err != nil {
19 | return err
20 | }
21 |
22 | for _, file := range files {
23 | name := file.Name()
24 | ext := filepath.Ext(name)
25 | if ext == segmentExt || name == lockName {
26 | continue
27 | }
28 | dst := name + recoveryBackupExt
29 | if err := fsys.Rename(name, dst); err != nil {
30 | return err
31 | }
32 | logger.Printf("moved %s to %s", name, dst)
33 | }
34 |
35 | return nil
36 | }
37 |
38 | func removeRecoveryBackupFiles(fsys fs.FileSystem) error {
39 | logger.Println("removing recovery backup files...")
40 |
41 | files, err := fsys.ReadDir(".")
42 | if err != nil {
43 | return err
44 | }
45 |
46 | for _, file := range files {
47 | name := file.Name()
48 | ext := filepath.Ext(name)
49 | if ext != recoveryBackupExt {
50 | continue
51 | }
52 | if err := fsys.Remove(name); err != nil {
53 | return err
54 | }
55 | logger.Printf("removed %s", name)
56 | }
57 |
58 | return nil
59 | }
60 |
61 | // recoveryIterator iterates over records of all datalog segments in insertion order.
62 | // Corrupted segments are truncated to the last valid record.
63 | type recoveryIterator struct {
64 | segments []*segment
65 | segit *segmentIterator
66 | }
67 |
68 | func newRecoveryIterator(segments []*segment) *recoveryIterator {
69 | return &recoveryIterator{
70 | segments: segments,
71 | }
72 | }
73 |
74 | func (it *recoveryIterator) next() (record, error) {
75 | for {
76 | if it.segit == nil {
77 | if len(it.segments) == 0 {
78 | return record{}, ErrIterationDone
79 | }
80 | var err error
81 | it.segit, err = newSegmentIterator(it.segments[0])
82 | if err != nil {
83 | return record{}, err
84 | }
85 | it.segments = it.segments[1:]
86 | }
87 | rec, err := it.segit.next()
88 | if err == io.EOF || err == io.ErrUnexpectedEOF || err == errCorrupted {
89 | // Truncate file to the last valid offset.
90 | if err := it.segit.f.Truncate(int64(it.segit.offset)); err != nil {
91 | return record{}, err
92 | }
93 | fi, fierr := it.segit.f.Stat()
94 | if fierr != nil {
95 | return record{}, fierr
96 | }
97 | logger.Printf("truncated segment %s to offset %d", fi.Name(), it.segit.offset)
98 | err = ErrIterationDone
99 | }
100 | if err == ErrIterationDone {
101 | it.segit = nil
102 | continue
103 | }
104 | if err != nil {
105 | return record{}, err
106 | }
107 | return rec, nil
108 | }
109 | }
110 |
111 | func (db *DB) recover() error {
112 | logger.Println("started recovery")
113 | logger.Println("rebuilding index...")
114 |
115 | segments := db.datalog.segmentsBySequenceID()
116 | it := newRecoveryIterator(segments)
117 | for {
118 | rec, err := it.next()
119 | if err == ErrIterationDone {
120 | break
121 | }
122 | if err != nil {
123 | return err
124 | }
125 |
126 | h := db.hash(rec.key)
127 | meta := db.datalog.segments[rec.segmentID].meta
128 | if rec.rtype == recordTypePut {
129 | sl := slot{
130 | hash: h,
131 | segmentID: rec.segmentID,
132 | keySize: uint16(len(rec.key)),
133 | valueSize: uint32(len(rec.value)),
134 | offset: rec.offset,
135 | }
136 | if err := db.put(sl, rec.key); err != nil {
137 | return err
138 | }
139 | meta.PutRecords++
140 | } else {
141 | if err := db.del(h, rec.key, false); err != nil {
142 | return err
143 | }
144 | meta.DeleteRecords++
145 | meta.DeletedBytes += uint32(len(rec.data))
146 | }
147 | }
148 |
149 | // Mark all segments except the newest as full.
150 | for i := 0; i < len(segments)-1; i++ {
151 | segments[i].meta.Full = true
152 | }
153 |
154 | if err := removeRecoveryBackupFiles(db.opts.FileSystem); err != nil {
155 | 		logger.Printf("error removing recovery backup files: %v", err)
156 | }
157 |
158 | logger.Println("successfully recovered database")
159 |
160 | return nil
161 | }
162 |
--------------------------------------------------------------------------------
/segment.go:
--------------------------------------------------------------------------------
1 | package pogreb
2 |
3 | import (
4 | "bufio"
5 | "encoding/binary"
6 | "fmt"
7 | "hash/crc32"
8 | "io"
9 | )
10 |
11 | type recordType int
12 |
13 | const (
14 | recordTypePut recordType = iota
15 | recordTypeDelete
16 |
17 | segmentExt = ".psg"
18 | )
19 |
20 | // segment is a write-ahead log segment.
21 | // It consists of a sequence of binary-encoded variable length records.
22 | type segment struct {
23 | *file
24 | id uint16 // Physical segment identifier.
25 | sequenceID uint64 // Logical monotonically increasing segment identifier.
26 | name string
27 | meta *segmentMeta
28 | }
29 |
30 | func segmentName(id uint16, sequenceID uint64) string {
31 | return fmt.Sprintf("%05d-%d%s", id, sequenceID, segmentExt)
32 | }
33 |
34 | type segmentMeta struct {
35 | Full bool
36 | PutRecords uint32
37 | DeleteRecords uint32
38 | DeletedKeys uint32
39 | DeletedBytes uint32
40 | }
41 |
42 | func segmentMetaName(id uint16, sequenceID uint64) string {
43 | return segmentName(id, sequenceID) + metaExt
44 | }
45 |
46 | // Binary representation of a segment record:
47 | // +---------------+------------------+------------------+-...-+--...--+----------+
48 | // | Key Size (2B) | Record Type (1b) | Value Size (31b) | Key | Value | CRC (4B) |
49 | // +---------------+------------------+------------------+-...-+--...--+----------+
50 | type record struct {
51 | rtype recordType
52 | segmentID uint16
53 | offset uint32
54 | data []byte
55 | key []byte
56 | value []byte
57 | }
58 |
59 | func encodedRecordSize(kvSize uint32) uint32 {
60 | // key size, value size, key, value, crc32
61 | return 2 + 4 + kvSize + 4
62 | }
63 |
64 | func encodeRecord(key []byte, value []byte, rt recordType) []byte {
65 | size := encodedRecordSize(uint32(len(key) + len(value)))
66 | data := make([]byte, size)
67 | binary.LittleEndian.PutUint16(data[:2], uint16(len(key)))
68 |
69 | valLen := uint32(len(value))
70 | if rt == recordTypeDelete { // Set delete bit.
71 | valLen |= 1 << 31
72 | }
73 | binary.LittleEndian.PutUint32(data[2:], valLen)
74 |
75 | copy(data[6:], key)
76 | copy(data[6+len(key):], value)
77 | checksum := crc32.ChecksumIEEE(data[:6+len(key)+len(value)])
78 | binary.LittleEndian.PutUint32(data[size-4:size], checksum)
79 | return data
80 | }
81 |
82 | func encodePutRecord(key []byte, value []byte) []byte {
83 | return encodeRecord(key, value, recordTypePut)
84 | }
85 |
86 | func encodeDeleteRecord(key []byte) []byte {
87 | return encodeRecord(key, nil, recordTypeDelete)
88 | }
89 |
90 | // segmentIterator iterates over segment records.
91 | type segmentIterator struct {
92 | f *segment
93 | offset uint32
94 | r *bufio.Reader
95 | buf []byte // kv size and crc32 reusable buffer.
96 | }
97 |
98 | func newSegmentIterator(f *segment) (*segmentIterator, error) {
99 | if _, err := f.Seek(int64(headerSize), io.SeekStart); err != nil {
100 | return nil, err
101 | }
102 | return &segmentIterator{
103 | f: f,
104 | offset: headerSize,
105 | r: bufio.NewReader(f),
106 | buf: make([]byte, 6),
107 | }, nil
108 | }
109 |
110 | func (it *segmentIterator) next() (record, error) {
111 | // Read key and value size.
112 | kvSizeBuf := it.buf
113 | if _, err := io.ReadFull(it.r, kvSizeBuf); err != nil {
114 | if err == io.EOF {
115 | return record{}, ErrIterationDone
116 | }
117 | return record{}, err
118 | }
119 |
120 | // Decode key size.
121 | keySize := uint32(binary.LittleEndian.Uint16(kvSizeBuf[:2]))
122 |
123 | // Decode value size and record type.
124 | rt := recordTypePut
125 | valueSize := binary.LittleEndian.Uint32(kvSizeBuf[2:])
126 | if valueSize&(1<<31) != 0 {
127 | rt = recordTypeDelete
128 | valueSize &^= 1 << 31
129 | }
130 |
131 | // Read key, value and checksum.
132 | recordSize := encodedRecordSize(keySize + valueSize)
133 | data := make([]byte, recordSize)
134 | copy(data, kvSizeBuf)
135 | if _, err := io.ReadFull(it.r, data[6:]); err != nil {
136 | return record{}, err
137 | }
138 |
139 | // Verify checksum.
140 | checksum := binary.LittleEndian.Uint32(data[len(data)-4:])
141 | if checksum != crc32.ChecksumIEEE(data[:len(data)-4]) {
142 | return record{}, errCorrupted
143 | }
144 |
145 | offset := it.offset
146 | it.offset += recordSize
147 | rec := record{
148 | rtype: rt,
149 | segmentID: it.f.id,
150 | offset: offset,
151 | data: data,
152 | key: data[6 : 6+keySize],
153 | value: data[6+keySize : 6+keySize+valueSize],
154 | }
155 | return rec, nil
156 | }
157 |
--------------------------------------------------------------------------------
/compaction.go:
--------------------------------------------------------------------------------
1 | package pogreb
2 |
3 | import (
4 | "github.com/akrylysov/pogreb/internal/errors"
5 | )
6 |
7 | // promoteRecord writes the record to the current segment if the index still points to the record.
8 | // Otherwise it discards the record.
9 | func (db *DB) promoteRecord(rec record) (bool, error) {
10 | hash := db.hash(rec.key)
11 | it := db.index.newBucketIterator(db.index.bucketIndex(hash))
12 | for {
13 | b, err := it.next()
14 | if err == ErrIterationDone {
15 | // Exhausted all buckets and the slot wasn't found.
16 | // The key was deleted or overwritten. The record is safe to discard.
17 | return true, nil
18 | }
19 | if err != nil {
20 | return false, err
21 | }
22 | for i := 0; i < slotsPerBucket; i++ {
23 | sl := b.slots[i]
24 |
25 | // No more slots in the bucket.
26 | if sl.offset == 0 {
27 | break
28 | }
29 |
30 | // Slot points to a different record.
31 | if hash != sl.hash || rec.offset != sl.offset || rec.segmentID != sl.segmentID {
32 | continue
33 | }
34 |
35 | // The record is in the index, write it to the current segment.
36 | segmentID, offset, err := db.datalog.writeRecord(rec.data, rec.rtype) // TODO: batch writes
37 | if err != nil {
38 | return false, err
39 | }
40 |
41 | // Update index.
42 | b.slots[i].segmentID = segmentID
43 | b.slots[i].offset = offset
44 | return false, b.write()
45 | }
46 | }
47 | }
48 |
49 | // CompactionResult holds the compaction result.
50 | type CompactionResult struct {
51 | CompactedSegments int
52 | ReclaimedRecords int
53 | ReclaimedBytes int
54 | }
55 |
56 | func (db *DB) compact(sourceSeg *segment) (CompactionResult, error) {
57 | cr := CompactionResult{}
58 |
59 | db.mu.Lock()
60 | sourceSeg.meta.Full = true // Prevent writes to the compacted file.
61 | db.mu.Unlock()
62 |
63 | it, err := newSegmentIterator(sourceSeg)
64 | if err != nil {
65 | return cr, err
66 | }
67 | // Copy records from sourceSeg to the current segment.
68 | for {
69 | err := func() error {
70 | db.mu.Lock()
71 | defer db.mu.Unlock()
72 | rec, err := it.next()
73 | if err != nil {
74 | return err
75 | }
76 | if rec.rtype == recordTypeDelete {
77 | cr.ReclaimedRecords++
78 | cr.ReclaimedBytes += len(rec.data)
79 | return nil
80 | }
81 | reclaimed, err := db.promoteRecord(rec)
82 | if reclaimed {
83 | cr.ReclaimedRecords++
84 | cr.ReclaimedBytes += len(rec.data)
85 | }
86 | return err
87 | }()
88 | if err == ErrIterationDone {
89 | break
90 | }
91 | if err != nil {
92 | return cr, err
93 | }
94 | }
95 |
96 | db.mu.Lock()
97 | defer db.mu.Unlock()
98 | err = db.datalog.removeSegment(sourceSeg)
99 | return cr, err
100 | }
101 |
102 | // pickForCompaction returns segments eligible for compaction.
103 | func (db *DB) pickForCompaction() []*segment {
104 | segments := db.datalog.segmentsBySequenceID()
105 | var picked []*segment
106 | for i := len(segments) - 1; i >= 0; i-- {
107 | seg := segments[i]
108 |
109 | if uint32(seg.size) < db.opts.compactionMinSegmentSize {
110 | continue
111 | }
112 |
113 | fragmentation := float32(seg.meta.DeletedBytes) / float32(seg.size)
114 | if fragmentation < db.opts.compactionMinFragmentation {
115 | continue
116 | }
117 |
118 | if seg.meta.DeleteRecords > 0 {
119 | // Delete records can be discarded only when older segments contain no put records
120 | // for the corresponding keys.
121 | // All segments older than the segment eligible for compaction have to be compacted.
122 | return append(segments[:i+1], picked...)
123 | }
124 |
125 | picked = append([]*segment{seg}, picked...)
126 | }
127 | return picked
128 | }
129 |
130 | // Compact compacts the DB. Deleted and overwritten items are discarded.
131 | // Returns an error if compaction is already in progress.
132 | func (db *DB) Compact() (CompactionResult, error) {
133 | cr := CompactionResult{}
134 |
135 | // Run only a single compaction at a time.
136 | if !db.maintenanceMu.TryLock() {
137 | return cr, errBusy
138 | }
139 | defer func() {
140 | db.maintenanceMu.Unlock()
141 | }()
142 |
143 | db.mu.RLock()
144 | segments := db.pickForCompaction()
145 | db.mu.RUnlock()
146 |
147 | for _, seg := range segments {
148 | segcr, err := db.compact(seg)
149 | if err != nil {
150 | return cr, errors.Wrapf(err, "compacting segment %s", seg.name)
151 | }
152 | cr.CompactedSegments++
153 | cr.ReclaimedRecords += segcr.ReclaimedRecords
154 | cr.ReclaimedBytes += segcr.ReclaimedBytes
155 | }
156 |
157 | return cr, nil
158 | }
159 |
--------------------------------------------------------------------------------
/internal/assert/assert_test.go:
--------------------------------------------------------------------------------
1 | package assert
2 |
3 | import (
4 | "fmt"
5 | "sync"
6 | "testing"
7 | "time"
8 | )
9 |
10 | func TestEqual(t *testing.T) {
11 | testCases := []struct {
12 | first interface{}
13 | second interface{}
14 | expectedFailed bool
15 | }{
16 | {
17 | first: 1,
18 | second: 1,
19 | expectedFailed: false,
20 | },
21 |
22 | {
23 | first: nil,
24 | second: nil,
25 | expectedFailed: false,
26 | },
27 | {
28 | first: "1",
29 | second: "1",
30 | expectedFailed: false,
31 | },
32 | {
33 | first: struct{}{},
34 | second: struct{}{},
35 | expectedFailed: false,
36 | },
37 | {
38 | first: struct{ x int }{x: 1},
39 | second: struct{ x int }{x: 1},
40 | expectedFailed: false,
41 | },
42 | {
43 | first: 1,
44 | second: 2,
45 | expectedFailed: true,
46 | },
47 | {
48 | first: 1,
49 | second: "1",
50 | expectedFailed: true,
51 | },
52 | {
53 | first: 1,
54 | second: 1.0,
55 | expectedFailed: true,
56 | },
57 | {
58 | first: struct{ x int }{x: 1},
59 | second: struct{ x int }{x: 2},
60 | expectedFailed: true,
61 | },
62 | {
63 | first: struct{ x int }{x: 1},
64 | second: struct{ y int }{y: 1},
65 | expectedFailed: true,
66 | },
67 | }
68 |
69 | for i, tc := range testCases {
70 | t.Run(fmt.Sprintf("%d %+v", i, tc), func(t *testing.T) {
71 | mock := &testing.T{}
72 | wg := &sync.WaitGroup{}
73 | wg.Add(1)
74 | 			// Run the assertion in a goroutine. t.Fatal calls runtime.Goexit.
75 | go func() {
76 | defer wg.Done()
77 | Equal(mock, tc.first, tc.second)
78 | }()
79 | wg.Wait()
80 | failed := mock.Failed()
81 | if tc.expectedFailed != failed {
82 | t.Fatalf("expected to fail: %t; failed: %t", tc.expectedFailed, failed)
83 | }
84 | })
85 | }
86 | }
87 |
88 | func TestNil(t *testing.T) {
89 | var nilIntPtr *int
90 | var nilStructPtr *struct{ x int }
91 | var nilSlice []string
92 |
93 | testCases := []struct {
94 | obj interface{}
95 | isNil bool
96 | }{
97 | {
98 | obj: nil,
99 | isNil: true,
100 | },
101 | {
102 | obj: nilIntPtr,
103 | isNil: true,
104 | },
105 | {
106 | obj: nilStructPtr,
107 | isNil: true,
108 | },
109 | {
110 | obj: nilSlice,
111 | isNil: true,
112 | },
113 | {
114 | obj: 1,
115 | isNil: false,
116 | },
117 | {
118 | obj: "1",
119 | isNil: false,
120 | },
121 | {
122 | obj: []string{},
123 | isNil: false,
124 | },
125 | {
126 | obj: [2]int{1, 1},
127 | isNil: false,
128 | },
129 | }
130 |
131 | for i, tc := range testCases {
132 | t.Run(fmt.Sprintf("%d %+v", i, tc.obj), func(t *testing.T) {
133 | mockNil := &testing.T{}
134 | mockNotNil := &testing.T{}
135 | wg := &sync.WaitGroup{}
136 | wg.Add(2)
137 | go func() {
138 | defer wg.Done()
139 | Nil(mockNil, tc.obj)
140 | }()
141 | go func() {
142 | defer wg.Done()
143 | NotNil(mockNotNil, tc.obj)
144 | }()
145 | wg.Wait()
146 | if tc.isNil == mockNil.Failed() {
147 | t.Fatalf("Nil expected to fail: %t; failed: %t", !tc.isNil, mockNil.Failed())
148 | }
149 | if !tc.isNil == mockNotNil.Failed() {
150 | t.Fatalf("NotNil expected to fail: %t; failed: %t", tc.isNil, mockNotNil.Failed())
151 | }
152 | })
153 | }
154 | }
155 |
156 | func TestPanic(t *testing.T) {
157 | testCases := []struct {
158 | name string
159 | f func()
160 | expectedFailed bool
161 | }{
162 | {
163 | name: "panic",
164 | f: func() {
165 | panic("message123")
166 | },
167 | expectedFailed: false,
168 | },
169 | {
170 | name: "panic: wrong message",
171 | f: func() {
172 | panic("message456")
173 | },
174 | expectedFailed: true,
175 | },
176 | {
177 | name: "no panic",
178 | f: func() {},
179 | expectedFailed: true,
180 | },
181 | }
182 | for _, tc := range testCases {
183 | t.Run(tc.name, func(t *testing.T) {
184 | mock := &testing.T{}
185 | wg := &sync.WaitGroup{}
186 | wg.Add(1)
187 | go func() {
188 | defer wg.Done()
189 | Panic(mock, "message123", tc.f)
190 | }()
191 | wg.Wait()
192 | if tc.expectedFailed != mock.Failed() {
193 | t.Fatalf("expected to fail: %t; failed: %t", tc.expectedFailed, mock.Failed())
194 | }
195 | })
196 | }
197 | }
198 |
199 | func TestCompleteWithin(t *testing.T) {
200 | var tc2Tries int
201 | var tc4Tries int
202 | testCases := []struct {
203 | name string
204 | dur time.Duration
205 | cond func() bool
206 | expectedFailed bool
207 | }{
208 | {
209 | name: "completed: first try",
210 | dur: time.Hour,
211 | cond: func() bool {
212 | return true
213 | },
214 | expectedFailed: false,
215 | },
216 | {
217 | name: "completed: second try",
218 | dur: time.Hour,
219 | cond: func() bool {
220 | if tc2Tries == 0 {
221 | tc2Tries++
222 | return false
223 | }
224 | return true
225 | },
226 | expectedFailed: false,
227 | },
228 | {
229 | name: "not completed",
230 | dur: time.Nanosecond,
231 | cond: func() bool {
232 | return false
233 | },
234 | expectedFailed: true,
235 | },
236 | {
237 | name: "not completed: timeout",
238 | dur: time.Nanosecond,
239 | cond: func() bool {
240 | if tc4Tries == 0 {
241 | tc4Tries++
242 | time.Sleep(pollingInterval * 2)
243 | return false
244 | }
245 | return true
246 | },
247 | expectedFailed: true,
248 | },
249 | }
250 | for _, tc := range testCases {
251 | t.Run(tc.name, func(t *testing.T) {
252 | mock := &testing.T{}
253 | wg := &sync.WaitGroup{}
254 | wg.Add(1)
255 | go func() {
256 | defer wg.Done()
257 | CompleteWithin(mock, tc.dur, tc.cond)
258 | }()
259 | wg.Wait()
260 | if tc.expectedFailed != mock.Failed() {
261 | t.Fatalf("expected to fail: %t; failed: %t", tc.expectedFailed, mock.Failed())
262 | }
263 | })
264 | }
265 | }
266 |
--------------------------------------------------------------------------------
/fs/mem.go:
--------------------------------------------------------------------------------
1 | package fs
2 |
3 | import (
4 | "io"
5 | "os"
6 | "path/filepath"
7 | "time"
8 | )
9 |
10 | type memFS struct {
11 | files map[string]*memFile
12 | }
13 |
14 | // Mem is a file system backed by memory.
15 | // It should be used for testing only.
16 | var Mem FileSystem = &memFS{files: map[string]*memFile{}}
17 |
18 | func (fs *memFS) OpenFile(name string, flag int, perm os.FileMode) (File, error) {
19 | if flag&os.O_APPEND != 0 {
20 | // memFS doesn't support opening files in append-only mode.
21 | // The database doesn't currently use O_APPEND.
22 | return nil, errAppendModeNotSupported
23 | }
24 | f := fs.files[name]
25 | if f == nil {
26 | // The file doesn't exist.
27 | if (flag & os.O_CREATE) == 0 {
28 | return nil, os.ErrNotExist
29 | }
30 | f = &memFile{
31 | name: name,
32 | 			perm: perm, // perm is saved only so Mode can return it; it isn't otherwise enforced yet.
33 | refs: 1,
34 | }
35 | fs.files[name] = f
36 | } else {
37 | if (flag & os.O_TRUNC) != 0 {
38 | f.size = 0
39 | f.buf = nil
40 | }
41 | f.refs += 1
42 | }
43 | return &seekableMemFile{memFile: f}, nil
44 | }
45 |
46 | func (fs *memFS) CreateLockFile(name string, perm os.FileMode) (LockFile, bool, error) {
47 | f, exists := fs.files[name]
48 | if f != nil && f.refs > 0 {
49 | return nil, false, os.ErrExist
50 | }
51 | _, err := fs.OpenFile(name, os.O_CREATE, perm)
52 | if err != nil {
53 | return nil, false, err
54 | }
55 | return fs.files[name], exists, nil
56 | }
57 |
58 | func (fs *memFS) Stat(name string) (os.FileInfo, error) {
59 | if f, ok := fs.files[name]; ok {
60 | return f, nil
61 | }
62 | return nil, os.ErrNotExist
63 | }
64 |
65 | func (fs *memFS) Remove(name string) error {
66 | if _, ok := fs.files[name]; ok {
67 | delete(fs.files, name)
68 | return nil
69 | }
70 | return os.ErrNotExist
71 | }
72 |
73 | func (fs *memFS) Rename(oldpath, newpath string) error {
74 | if f, ok := fs.files[oldpath]; ok {
75 | delete(fs.files, oldpath)
76 | fs.files[newpath] = f
77 | f.name = newpath
78 | return nil
79 | }
80 | return os.ErrNotExist
81 | }
82 |
83 | func (fs *memFS) ReadDir(dir string) ([]os.DirEntry, error) {
84 | dir = filepath.Clean(dir)
85 | var entries []os.DirEntry
86 | for name, f := range fs.files {
87 | if filepath.Dir(name) == dir {
88 | entries = append(entries, f)
89 | }
90 | }
91 | return entries, nil
92 | }
93 |
94 | func (fs *memFS) MkdirAll(path string, perm os.FileMode) error {
95 | // FIXME: the implementation is incomplete.
96 | // memFS lets create a file even when the parent directory doesn't exist.
97 | return nil
98 | }
99 |
100 | type memFile struct {
101 | name string
102 | perm os.FileMode
103 | buf []byte
104 | size int64
105 | refs int
106 | }
107 |
108 | func (f *memFile) Close() error {
109 | if f.refs == 0 {
110 | return os.ErrClosed
111 | }
112 | f.refs -= 1
113 | return nil
114 | }
115 |
116 | func (f *memFile) Unlock() error {
117 | if err := f.Close(); err != nil {
118 | return err
119 | }
120 | return Mem.Remove(f.name)
121 | }
122 |
123 | func (f *memFile) ReadAt(p []byte, off int64) (int, error) {
124 | if f.refs == 0 {
125 | return 0, os.ErrClosed
126 | }
127 | if off >= f.size {
128 | return 0, io.EOF
129 | }
130 | n := int64(len(p))
131 | if n > f.size-off {
132 | copy(p, f.buf[off:])
133 | return int(f.size - off), nil
134 | }
135 | copy(p, f.buf[off:off+n])
136 | return int(n), nil
137 | }
138 |
139 | func (f *memFile) WriteAt(p []byte, off int64) (int, error) {
140 | if f.refs == 0 {
141 | return 0, os.ErrClosed
142 | }
143 | n := int64(len(p))
144 | if off+n > f.size {
145 | f.truncate(off + n)
146 | }
147 | copy(f.buf[off:off+n], p)
148 | return int(n), nil
149 | }
150 |
151 | func (f *memFile) Stat() (os.FileInfo, error) {
152 | if f.refs == 0 {
153 | return f, os.ErrClosed
154 | }
155 | return f, nil
156 | }
157 |
158 | func (f *memFile) Sync() error {
159 | if f.refs == 0 {
160 | return os.ErrClosed
161 | }
162 | return nil
163 | }
164 |
165 | func (f *memFile) truncate(size int64) {
166 | if size > f.size {
167 | diff := int(size - f.size)
168 | f.buf = append(f.buf, make([]byte, diff)...)
169 | } else {
170 | f.buf = f.buf[:size]
171 | }
172 | f.size = size
173 | }
174 |
175 | func (f *memFile) Truncate(size int64) error {
176 | if f.refs == 0 {
177 | return os.ErrClosed
178 | }
179 | f.truncate(size)
180 | return nil
181 | }
182 |
183 | func (f *memFile) Name() string {
184 | _, name := filepath.Split(f.name)
185 | return name
186 | }
187 |
188 | func (f *memFile) Size() int64 {
189 | return f.size
190 | }
191 |
192 | func (f *memFile) Mode() os.FileMode {
193 | return f.perm
194 | }
195 |
196 | func (f *memFile) ModTime() time.Time {
197 | return time.Now()
198 | }
199 |
200 | func (f *memFile) IsDir() bool {
201 | return false
202 | }
203 |
204 | func (f *memFile) Sys() interface{} {
205 | return nil
206 | }
207 |
208 | func (f *memFile) Type() os.FileMode {
209 | return f.perm
210 | }
211 |
212 | func (f *memFile) Info() (os.FileInfo, error) {
213 | return f.Stat()
214 | }
215 |
216 | func (f *memFile) Slice(start int64, end int64) ([]byte, error) {
217 | if f.refs == 0 {
218 | return nil, os.ErrClosed
219 | }
220 | if end > f.size {
221 | return nil, io.EOF
222 | }
223 | return f.buf[start:end], nil
224 | }
225 |
226 | type seekableMemFile struct {
227 | *memFile
228 | offset int64
229 | }
230 |
231 | func (f *seekableMemFile) Read(p []byte) (int, error) {
232 | n, err := f.ReadAt(p, f.offset)
233 | if err != nil {
234 | return n, err
235 | }
236 | f.offset += int64(n)
237 | return n, err
238 | }
239 |
240 | func (f *seekableMemFile) Write(p []byte) (int, error) {
241 | n, err := f.WriteAt(p, f.offset)
242 | if err != nil {
243 | return n, err
244 | }
245 | f.offset += int64(n)
246 | return n, err
247 | }
248 |
249 | func (f *seekableMemFile) Seek(offset int64, whence int) (int64, error) {
250 | if f.refs == 0 {
251 | return 0, os.ErrClosed
252 | }
253 | switch whence {
254 | case io.SeekEnd:
255 | f.offset = f.size + offset
256 | case io.SeekStart:
257 | f.offset = offset
258 | case io.SeekCurrent:
259 | f.offset += offset
260 | }
261 | return f.offset, nil
262 | }
263 |
--------------------------------------------------------------------------------
/datalog.go:
--------------------------------------------------------------------------------
1 | package pogreb
2 |
3 | import (
4 | "fmt"
5 | "math"
6 | "os"
7 | "path/filepath"
8 | "sort"
9 | "strconv"
10 | "strings"
11 |
12 | "github.com/akrylysov/pogreb/internal/errors"
13 | )
14 |
15 | const (
16 | maxSegments = math.MaxInt16
17 | )
18 |
19 | // datalog is a write-ahead log.
20 | type datalog struct {
21 | opts *Options
22 | curSeg *segment
23 | segments [maxSegments]*segment
24 | maxSequenceID uint64
25 | }
26 |
27 | func openDatalog(opts *Options) (*datalog, error) {
28 | files, err := opts.FileSystem.ReadDir(".")
29 | if err != nil {
30 | return nil, err
31 | }
32 |
33 | dl := &datalog{
34 | opts: opts,
35 | }
36 |
37 | // Open existing segments.
38 | for _, file := range files {
39 | name := file.Name()
40 | ext := filepath.Ext(name)
41 | if ext != segmentExt {
42 | continue
43 | }
44 | id, seqID, err := parseSegmentName(name)
45 | if err != nil {
46 | return nil, err
47 | }
48 | seg, err := dl.openSegment(name, id, seqID)
49 | if err != nil {
50 | return nil, errors.Wrapf(err, "opening segment %s", name)
51 | }
52 | if seg.sequenceID > dl.maxSequenceID {
53 | dl.maxSequenceID = seg.sequenceID
54 | }
55 | dl.segments[seg.id] = seg
56 | }
57 |
58 | if err := dl.swapSegment(); err != nil {
59 | return nil, err
60 | }
61 |
62 | return dl, nil
63 | }
64 |
65 | func parseSegmentName(name string) (uint16, uint64, error) {
66 | parts := strings.SplitN(strings.TrimSuffix(name, segmentExt), "-", 2)
67 | id, err := strconv.ParseUint(parts[0], 10, 16)
68 | if err != nil {
69 | return 0, 0, err
70 | }
71 | var seqID uint64
72 | if len(parts) == 2 {
73 | seqID, err = strconv.ParseUint(parts[1], 10, 64)
74 | if err != nil {
75 | return 0, 0, err
76 | }
77 | }
78 | return uint16(id), seqID, nil
79 | }
80 |
81 | func (dl *datalog) openSegment(name string, id uint16, seqID uint64) (*segment, error) {
82 | f, err := openFile(dl.opts.FileSystem, name, openFileFlags{})
83 | if err != nil {
84 | return nil, err
85 | }
86 |
87 | meta := &segmentMeta{}
88 | if !f.empty() {
89 | metaName := name + metaExt
90 | if err := readGobFile(dl.opts.FileSystem, metaName, &meta); err != nil {
91 | logger.Printf("error reading segment meta %d: %v", id, err)
92 | // TODO: rebuild meta?
93 | }
94 | }
95 |
96 | seg := &segment{
97 | file: f,
98 | id: id,
99 | sequenceID: seqID,
100 | name: name,
101 | meta: meta,
102 | }
103 |
104 | return seg, nil
105 | }
106 |
107 | func (dl *datalog) nextWritableSegmentID() (uint16, uint64, error) {
108 | for id, seg := range dl.segments {
109 | // Pick empty segment.
110 | if seg == nil {
111 | dl.maxSequenceID++
112 | return uint16(id), dl.maxSequenceID, nil
113 | }
114 | }
115 | return 0, 0, fmt.Errorf("number of segments exceeds %d", maxSegments)
116 | }
117 |
118 | func (dl *datalog) swapSegment() error {
119 | // Pick unfilled segment.
120 | for _, seg := range dl.segments {
121 | if seg != nil && !seg.meta.Full {
122 | dl.curSeg = seg
123 | return nil
124 | }
125 | }
126 |
127 | // Create new segment.
128 | id, seqID, err := dl.nextWritableSegmentID()
129 | if err != nil {
130 | return err
131 | }
132 |
133 | name := segmentName(id, seqID)
134 | seg, err := dl.openSegment(name, id, seqID)
135 | if err != nil {
136 | return err
137 | }
138 |
139 | dl.segments[id] = seg
140 | dl.curSeg = seg
141 |
142 | return nil
143 | }
144 |
145 | func (dl *datalog) removeSegment(seg *segment) error {
146 | dl.segments[seg.id] = nil
147 |
148 | if err := seg.Close(); err != nil {
149 | return err
150 | }
151 |
152 | // Remove segment meta from FS.
153 | 	metaName := seg.name + metaExt
154 | if err := dl.opts.FileSystem.Remove(metaName); err != nil && !os.IsNotExist(err) {
155 | return err
156 | }
157 |
158 | // Remove segment from FS.
159 | if err := dl.opts.FileSystem.Remove(seg.name); err != nil {
160 | return err
161 | }
162 |
163 | return nil
164 | }
165 |
166 | func (dl *datalog) readKeyValue(sl slot) ([]byte, []byte, error) {
167 | off := int64(sl.offset) + 6 // Skip key size and value size.
168 | seg := dl.segments[sl.segmentID]
169 | keyValue, err := seg.Slice(off, off+int64(sl.kvSize()))
170 | if err != nil {
171 | return nil, nil, err
172 | }
173 | return keyValue[:sl.keySize], keyValue[sl.keySize:], nil
174 | }
175 |
176 | func (dl *datalog) readKey(sl slot) ([]byte, error) {
177 | off := int64(sl.offset) + 6
178 | seg := dl.segments[sl.segmentID]
179 | return seg.Slice(off, off+int64(sl.keySize))
180 | }
181 |
182 | // trackDel updates segment's metadata for deleted or overwritten items.
183 | func (dl *datalog) trackDel(sl slot) {
184 | meta := dl.segments[sl.segmentID].meta
185 | meta.DeletedKeys++
186 | meta.DeletedBytes += encodedRecordSize(sl.kvSize())
187 | }
188 |
189 | func (dl *datalog) del(key []byte) error {
190 | rec := encodeDeleteRecord(key)
191 | _, _, err := dl.writeRecord(rec, recordTypeDelete)
192 | if err != nil {
193 | return err
194 | }
195 | 	// Compaction removes delete records, so count the record's size toward DeletedBytes.
196 | dl.curSeg.meta.DeletedBytes += uint32(len(rec))
197 | return nil
198 | }
199 |
200 | func (dl *datalog) writeRecord(data []byte, rt recordType) (uint16, uint32, error) {
201 | if dl.curSeg.meta.Full || dl.curSeg.size+int64(len(data)) > int64(dl.opts.maxSegmentSize) {
202 | // Current segment is full, create a new one.
203 | dl.curSeg.meta.Full = true
204 | if err := dl.swapSegment(); err != nil {
205 | return 0, 0, err
206 | }
207 | }
208 | off, err := dl.curSeg.append(data)
209 | if err != nil {
210 | return 0, 0, err
211 | }
212 | switch rt {
213 | case recordTypePut:
214 | dl.curSeg.meta.PutRecords++
215 | case recordTypeDelete:
216 | dl.curSeg.meta.DeleteRecords++
217 | }
218 | return dl.curSeg.id, uint32(off), nil
219 | }
220 |
221 | func (dl *datalog) put(key []byte, value []byte) (uint16, uint32, error) {
222 | return dl.writeRecord(encodePutRecord(key, value), recordTypePut)
223 | }
224 |
225 | func (dl *datalog) sync() error {
226 | return dl.curSeg.Sync()
227 | }
228 |
229 | func (dl *datalog) close() error {
230 | for _, seg := range dl.segments {
231 | if seg == nil {
232 | continue
233 | }
234 | if err := seg.Close(); err != nil {
235 | return err
236 | }
237 | metaName := seg.name + metaExt
238 | if err := writeGobFile(dl.opts.FileSystem, metaName, seg.meta); err != nil {
239 | return err
240 | }
241 | }
242 | return nil
243 | }
244 |
245 | // segmentsBySequenceID returns segments ordered from oldest to newest.
246 | func (dl *datalog) segmentsBySequenceID() []*segment {
247 | var segments []*segment
248 |
249 | for _, seg := range dl.segments {
250 | if seg == nil {
251 | continue
252 | }
253 | segments = append(segments, seg)
254 | }
255 |
256 | sort.SliceStable(segments, func(i, j int) bool {
257 | return segments[i].sequenceID < segments[j].sequenceID
258 | })
259 |
260 | return segments
261 | }
262 |
--------------------------------------------------------------------------------
/recovery_test.go:
--------------------------------------------------------------------------------
1 | package pogreb
2 |
3 | import (
4 | "path/filepath"
5 | "testing"
6 |
7 | "github.com/akrylysov/pogreb/internal/assert"
8 | )
9 |
10 | func TestRecovery(t *testing.T) {
11 | segPath := filepath.Join(testDBName, segmentName(0, 1))
12 | testCases := []struct {
13 | name string
14 | fn func() error
15 | }{
16 | {
17 | name: "all zeroes",
18 | fn: func() error {
19 | return appendFile(segPath, make([]byte, 128))
20 | },
21 | },
22 | {
23 | name: "partial kv size",
24 | fn: func() error {
25 | return appendFile(segPath, []byte{1})
26 | },
27 | },
28 | {
29 | name: "only kv size",
30 | fn: func() error {
31 | return appendFile(segPath, []byte{1, 0, 1, 0, 0, 0})
32 | },
33 | },
34 | {
35 | name: "kv size and key",
36 | fn: func() error {
37 | return appendFile(segPath, []byte{1, 0, 1, 0, 0, 0, 1})
38 | },
39 | },
40 | {
41 | name: "kv size, key, value",
42 | fn: func() error {
43 | return appendFile(segPath, []byte{1, 0, 1, 0, 0, 0, 1, 1})
44 | },
45 | },
46 | {
47 | name: "kv size, key, value, partial crc32",
48 | fn: func() error {
49 | return appendFile(segPath, []byte{1, 0, 1, 0, 0, 0, 1, 1, 40})
50 | },
51 | },
52 | {
53 | name: "kv size, key, value, invalid crc32",
54 | fn: func() error {
55 | return appendFile(segPath, []byte{1, 0, 1, 0, 0, 0, 1, 1, 40, 19, 197, 0})
56 | },
57 | },
58 | {
59 | name: "corrupted and not corrupted record",
60 | fn: func() error {
61 | if err := appendFile(segPath, []byte{1, 0, 1, 0, 0, 0, 1, 1, 40, 19, 197, 0}); err != nil {
62 | return err
63 | }
64 | return appendFile(segPath, []byte{1, 0, 1, 0, 0, 0, 1, 1, 133, 13, 200, 12})
65 | },
66 | },
67 | }
68 |
69 | for _, testCase := range testCases {
70 | t.Run(testCase.name, func(t *testing.T) {
71 | opts := &Options{FileSystem: testFS}
72 | db, err := createTestDB(opts)
73 | assert.Nil(t, err)
74 | // Fill segment 0.
75 | var i uint8
76 | for i = 0; i < 128; i++ {
77 | assert.Nil(t, db.Put([]byte{i}, []byte{i}))
78 | }
79 | assert.Equal(t, uint32(128), db.Count())
80 | assert.Nil(t, db.Close())
81 |
82 | // Simulate crash.
83 | assert.Nil(t, touchFile(testFS, filepath.Join(testDBName, lockName)))
84 |
85 | assert.Nil(t, testCase.fn())
86 |
87 | db, err = Open(testDBName, opts)
88 | assert.Nil(t, err)
89 | assert.Equal(t, uint32(128), db.Count())
90 | assert.Nil(t, db.Close())
91 |
92 | db, err = Open(testDBName, opts)
93 | assert.Nil(t, err)
94 | assert.Equal(t, uint32(128), db.Count())
95 | for i = 0; i < 128; i++ {
96 | v, err := db.Get([]byte{i})
97 | assert.Nil(t, err)
98 | assert.Equal(t, []byte{i}, v)
99 | }
100 | assert.Nil(t, db.Close())
101 | })
102 | }
103 | }
104 |
105 | func TestRecoveryDelete(t *testing.T) {
106 | opts := &Options{FileSystem: testFS}
107 | db, err := createTestDB(opts)
108 | assert.Nil(t, err)
109 | assert.Nil(t, db.Put([]byte{1}, []byte{1}))
110 | assert.Nil(t, db.Put([]byte{2}, []byte{2}))
111 | assert.Nil(t, db.Delete([]byte{1}))
112 | assert.Equal(t, uint32(1), db.Count())
113 | assert.Nil(t, db.Close())
114 |
115 | // Simulate crash.
116 | assert.Nil(t, touchFile(testFS, filepath.Join(testDBName, lockName)))
117 |
118 | db, err = Open(testDBName, opts)
119 | assert.Nil(t, err)
120 |
121 | assert.Equal(t, uint32(1), db.Count())
122 |
123 | assert.Nil(t, db.Close())
124 | }
125 |
126 | func TestRecoveryCompaction(t *testing.T) {
127 | opts := &Options{
128 | FileSystem: testFS,
129 | maxSegmentSize: 1024,
130 | compactionMinSegmentSize: 512,
131 | compactionMinFragmentation: 0.2,
132 | }
133 |
134 | db, err := createTestDB(opts)
135 | assert.Nil(t, err)
136 |
137 | // Fill file 0.
138 | for i := 0; i < 41; i++ {
139 | assert.Nil(t, db.Put([]byte{0}, []byte{0}))
140 | }
141 | assert.Nil(t, db.Put([]byte{1}, []byte{1}))
142 |
143 | // Write to file 1.
144 | assert.Nil(t, db.Put([]byte{0}, []byte{0}))
145 | assert.Nil(t, db.Put([]byte{0}, []byte{0}))
146 |
147 | assert.Equal(t, &segmentMeta{Full: true, PutRecords: 42, DeletedKeys: 41, DeletedBytes: 492}, db.datalog.segments[0].meta)
148 | assert.Equal(t, &segmentMeta{PutRecords: 2, DeletedKeys: 1, DeletedBytes: 12}, db.datalog.segments[1].meta)
149 |
150 | cm, err := db.Compact()
151 | assert.Nil(t, err)
152 | assert.Equal(t, CompactionResult{CompactedSegments: 1, ReclaimedRecords: 41, ReclaimedBytes: 492}, cm)
153 | assert.Nil(t, db.datalog.segments[0]) // Items were moved from file 0 to file 1.
154 | assert.Equal(t, &segmentMeta{PutRecords: 3, DeletedKeys: 1, DeletedBytes: 12}, db.datalog.segments[1].meta)
155 |
156 | // Fill file 1.
157 | for i := 0; i < 40; i++ {
158 | assert.Nil(t, db.Put([]byte{1}, []byte{2}))
159 | }
160 |
161 | // Fill file 0.
162 | for i := 0; i < 42; i++ {
163 | assert.Nil(t, db.Put([]byte{1}, []byte{2}))
164 | }
165 | // Write to file 2.
166 | assert.Nil(t, db.Put([]byte{0}, []byte{0}))
167 |
168 | assert.Equal(t, &segmentMeta{Full: true, PutRecords: 42, DeletedKeys: 42, DeletedBytes: 504}, db.datalog.segments[0].meta)
169 | assert.Equal(t, &segmentMeta{Full: true, PutRecords: 42, DeletedKeys: 42, DeletedBytes: 504}, db.datalog.segments[1].meta)
170 | assert.Equal(t, &segmentMeta{PutRecords: 2}, db.datalog.segments[2].meta)
171 |
172 | v, err := db.Get([]byte{1})
173 | assert.Nil(t, err)
174 | assert.Equal(t, []byte{2}, v)
175 |
176 | assert.Equal(t, uint32(2), db.Count())
177 |
178 | assert.Nil(t, db.Close())
179 |
180 | // Simulate crash.
181 | assert.Nil(t, touchFile(testFS, filepath.Join(testDBName, lockName)))
182 |
183 | db, err = Open(testDBName, opts)
184 | assert.Nil(t, err)
185 |
186 | assert.Equal(t, uint32(2), db.Count())
187 |
188 | v, err = db.Get([]byte{1})
189 | assert.Nil(t, err)
190 | assert.Equal(t, []byte{2}, v)
191 |
192 | assert.Nil(t, db.Close())
193 | }
194 |
195 | func TestRecoveryIterator(t *testing.T) {
196 | db, err := createTestDB(nil)
197 | assert.Nil(t, err)
198 |
199 | listRecords := func() []record {
200 | var records []record
201 | it := newRecoveryIterator(db.datalog.segmentsBySequenceID())
202 | for {
203 | rec, err := it.next()
204 | if err == ErrIterationDone {
205 | break
206 | }
207 | assert.Nil(t, err)
208 | records = append(records, rec)
209 | }
210 | return records
211 | }
212 |
213 | assert.Equal(t, 0, len(listRecords()))
214 |
215 | if err := db.Put([]byte{1}, []byte{1}); err != nil {
216 | t.Fatal(err)
217 | }
218 | assert.Equal(t,
219 | []record{
220 | {recordTypePut, 0, 512, []byte{1, 0, 1, 0, 0, 0, 1, 1, 133, 13, 200, 12}, []byte{1}, []byte{1}},
221 | },
222 | listRecords(),
223 | )
224 |
225 | if err := db.Put([]byte{1}, []byte{1}); err != nil {
226 | t.Fatal(err)
227 | }
228 | assert.Equal(t,
229 | []record{
230 | {recordTypePut, 0, 512, []byte{1, 0, 1, 0, 0, 0, 1, 1, 133, 13, 200, 12}, []byte{1}, []byte{1}},
231 | {recordTypePut, 0, 524, []byte{1, 0, 1, 0, 0, 0, 1, 1, 133, 13, 200, 12}, []byte{1}, []byte{1}},
232 | },
233 | listRecords(),
234 | )
235 |
236 | if err := db.Put([]byte{2}, []byte{2}); err != nil {
237 | t.Fatal(err)
238 | }
239 | assert.Equal(t,
240 | []record{
241 | {recordTypePut, 0, 512, []byte{1, 0, 1, 0, 0, 0, 1, 1, 133, 13, 200, 12}, []byte{1}, []byte{1}},
242 | {recordTypePut, 0, 524, []byte{1, 0, 1, 0, 0, 0, 1, 1, 133, 13, 200, 12}, []byte{1}, []byte{1}},
243 | {recordTypePut, 0, 536, []byte{1, 0, 1, 0, 0, 0, 2, 2, 252, 15, 236, 190}, []byte{2}, []byte{2}},
244 | },
245 | listRecords(),
246 | )
247 |
248 | assert.Nil(t, db.Close())
249 | }
250 |
--------------------------------------------------------------------------------
/docs/design.md:
--------------------------------------------------------------------------------
1 | - Date: 2020-02-02
2 | - Authors: Artem Krylysov
3 |
4 | # About
5 |
6 | This document is a new version of the initial Pogreb design
7 | [blog post](https://artem.krylysov.com/blog/2018/03/24/pogreb-key-value-store/) published in 2018.
8 |
9 | The new version replaces the unstructured data file for storing key-value pairs with a write-ahead log to achieve
10 | durability.
11 |
12 | # Overview
13 |
14 | Pogreb is an embedded key-value store for read-heavy workloads.
15 | It aims to provide fast point lookups by indexing keys in an on-disk hash table.
16 |
17 | # Design
18 |
19 | Two key components of Pogreb are a write-ahead log (WAL) and a hash table index.
20 | The WAL stores key-value pairs on disk in append-only files.
21 | The on-disk hash table allows constant time lookups from keys to key-value pairs in the WAL.
22 |
23 | ## Write-ahead log
24 |
25 | The WAL consists of multiple append-only segments. Once the current segment file is full (reaches 4 GB), a new segment
26 | is created and the full segment becomes read-only.
27 |
28 | ```
29 | Write-ahead log
30 | +-----------+-----------+-...-+-----------+
31 | | Segment 0 | Segment 1 | ... | Segment N |
32 | +-----------+-----------+-...-+-----------+
33 | ```
34 |
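Restated as code, the rotation rule is a single size check. The sketch below is a minimal illustration of the check performed by `(*datalog).writeRecord`; the helper name and signature are ours, not Pogreb's API.

```go
package main

import "fmt"

// needsRotation reports whether appending n more bytes to the current
// segment should trigger creating a new segment: either the segment was
// already marked full, or the write would exceed the maximum segment size.
func needsRotation(full bool, segSize, n, maxSegmentSize int64) bool {
	return full || segSize+n > maxSegmentSize
}

func main() {
	const maxSegmentSize = 4 << 30 // 4 GiB
	fmt.Println(needsRotation(false, maxSegmentSize-10, 12, maxSegmentSize)) // true
}
```
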
35 | ### Segment
36 |
37 | A segment is a sequence of variable-length binary-encoded records.
38 |
39 | ```
40 | Segment
41 | +----------+----------+-...-+----------+
42 | | Record 0 | Record 1 | ... | Record N |
43 | +----------+----------+-...-+----------+
44 | ```
45 |
46 | The record layout:
47 |
48 | ```
49 | Record
50 | +---------------+------------------+------------------+-...-+--...--+----------+
51 | | Key Size (2B) | Record Type (1b) | Value Size (31b) | Key | Value | CRC (4B) |
52 | +---------------+------------------+------------------+-...-+--...--+----------+
53 | ```
54 |
55 | The Record Type field is either `Put` (0) or `Delete` (1).
56 |
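The type bit and the 31-bit value size share one 4-byte field. The following sketch decodes that packed header word the same way the segment iterator in segment.go does; the function itself is illustrative and not part of Pogreb.

```go
package main

import (
	"encoding/binary"
	"fmt"
)

// decodeHeader unpacks the first 6 bytes of a record: a 2-byte key size
// followed by a 4-byte word whose top bit is the record type (1 = delete)
// and whose low 31 bits are the value size.
func decodeHeader(header []byte) (keySize, valueSize uint32, isDelete bool) {
	keySize = uint32(binary.LittleEndian.Uint16(header[:2]))
	packed := binary.LittleEndian.Uint32(header[2:6])
	isDelete = packed&(1<<31) != 0
	valueSize = packed &^ (1 << 31)
	return keySize, valueSize, isDelete
}

func main() {
	// Delete record with a 3-byte key: value size 0, delete bit set.
	header := []byte{3, 0, 0, 0, 0, 0x80}
	fmt.Println(decodeHeader(header)) // 3 0 true
}
```
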
57 | ## Hash table index
58 |
59 | Pogreb uses two files to store the hash table on disk - "main" and "overflow" index files.
60 |
61 | Each index file holds an array of buckets.
62 |
63 | ```
64 | Index
65 | +----------+----------+-...-+----------+
66 | | Bucket 0 | Bucket 1 | ... | Bucket N |
67 | +----------+----------+-...-+----------+
68 | ```
69 |
70 | ### Bucket
71 |
72 | A bucket is an array of slots followed by an optional file pointer to the overflow bucket (stored in the "overflow"
73 | index).
74 | The number of slots in a bucket is 31 - the maximum number of slots that fits in 512
75 | bytes.
76 |
77 | ```
78 | Bucket
79 | +--------+--------+-...-+--------+-----------------------------+
80 | | Slot 0 | Slot 1 | ... | Slot N | Overflow Bucket Offset (8B) |
81 | +--------+--------+-...-+--------+-----------------------------+
82 | ```
83 |
84 | ### Slot
85 |
86 | A slot contains the key hash, the segment ID, the key size, the value size and a 32-bit offset of the key-value pair in the WAL.
87 |
88 | ```
89 | Slot
90 | +-----------+-----------------+---------------+-----------------+-------------+
91 | | Hash (4B) | Segment ID (2B) | Key Size (2B) | Value Size (4B) | Offset (4B) |
92 | +-----------+-----------------+---------------+-----------------+-------------+
93 | ```
94 |
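For concreteness, the size arithmetic implied by the two layouts above: a slot occupies 4 + 2 + 2 + 4 + 4 = 16 bytes, so 31 slots plus the 8-byte overflow offset take 31 × 16 + 8 = 504 bytes and fit in a 512-byte bucket, while a 32nd slot would push the bucket to 520 bytes.
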
95 | ## Linear hashing
96 |
97 | Pogreb uses the [Linear hashing](https://en.wikipedia.org/wiki/Linear_hashing) algorithm which grows the hash table
98 | one bucket at a time instead of rebuilding it entirely.
99 |
100 | Initially, the hash table contains a single bucket (*N=1*).
101 |
102 | Level *L* (initially *L=0*) represents the maximum number of buckets on a logarithmic scale the hash table can store.
103 | For example, a hash table with *L=0* contains between 0 and 1 buckets; *L=3* contains between 4 and 8 buckets.
104 |
105 | *S* is the index of the "split" bucket (initially *S=0*).
106 |
107 | Collisions are resolved using the bucket chaining technique.
108 | The "overflow" index file stores overflow buckets that form a linked list.
109 |
110 | ### Lookup
111 |
112 | The position of a bucket in the index file is calculated by applying a hash function to the key:
113 |
114 | ```
115 | Index
116 | +----------+
117 | | Bucket 0 | Bucket
118 | +----------+ +--------+--------+-...-+--------+
119 | h(key) -> | Bucket 1 | -> | Slot 0 | Slot 1 | ... | Slot N |
120 | +-........-+ +--------+--------+-...-+--------+
121 | | ........ | |
122 | +-........-+ |
123 | | Bucket N | |
124 | +----------+ |
125 | v
126 | Write-ahead log
127 | +-----------+-----------+-...-+-----------+
128 | | Segment 0 | Segment 1 | ... | Segment N |
129 | +-----------+-----------+-...-+-----------+
130 | ```
131 |
132 | To get the position of the bucket:
133 |
134 | 1. Hash the key (Pogreb uses the 32-bit version of MurmurHash3).
135 | 2. Use the *L* low-order bits of the hash to get the position of the bucket - `hash % math.Pow(2, L)`.
136 | 3. Set the position to `hash % math.Pow(2, L+1)` if the previously calculated position comes before the
137 |    split bucket *S* (see the sketch below).
138 |
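Here is a minimal sketch of that position calculation; the function and parameter names are illustrative, not Pogreb's identifiers.

```go
package main

import "fmt"

// bucketPosition maps a key hash to a bucket position using linear hashing:
// take the low L bits of the hash, and if the result falls before the split
// bucket S, the bucket has already been split, so use one extra bit.
func bucketPosition(hash uint32, level uint8, split uint32) uint32 {
	pos := hash & (1<<level - 1) // hash % 2^L
	if pos < split {
		pos = hash & (1<<(level+1) - 1) // hash % 2^(L+1)
	}
	return pos
}

func main() {
	fmt.Println(bucketPosition(5, 2, 0)) // 1: 5 % 4
	fmt.Println(bucketPosition(5, 2, 2)) // 5: bucket 1 was already split, so 5 % 8
}
```
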
139 | The lookup function reads a bucket at the given position from the index file and performs a linear search to find a slot
140 | with the required hash.
141 | If the bucket doesn't contain a slot with the required hash, but the pointer to the overflow bucket is non-zero, the
142 | overflow bucket is inspected.
143 | The process continues until the required slot is found or until there are no more overflow buckets for the given key.
144 | Once a slot with the required key is found, Pogreb reads the key-value pair from the WAL.
145 |
146 | The average lookup requires two I/O operations - one to find a slot in the index and another to read the key
147 | and value from the WAL.
148 |
149 | ### Insertion
150 |
151 | Insertion is performed by adding a new "put" record to the WAL and updating a bucket in the index.
152 | If the bucket has all of its slots occupied, a new overflow bucket is created.
153 |
154 | ### Split
155 |
156 | When the number of items in the hash table exceeds the load factor threshold (70%), the split operation is performed on
157 | the split bucket *S*:
158 |
159 | 1. Allocate a new bucket at the end of the index file.
160 | 2. Increment the split bucket index *S*.
161 | 3. Increment *L* and reset *S* to 0 if *S* reaches `math.Pow(2, L)` (see the counter sketch below).
162 | 4. Divide items from the old split bucket between the newly allocated bucket and the old split bucket by
163 | recalculating the positions of the keys in the hash table.
164 | 5. Increment the number of buckets *N*.
165 |
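The counter bookkeeping in steps 2, 3 and 5 condenses to a few lines. In this sketch `level`, `split` and `buckets` stand for *L*, *S* and *N*; the identifiers are ours, not Pogreb's.

```go
package main

import "fmt"

// advanceSplit applies steps 2, 3 and 5 above: move the split pointer,
// grow the table, and start a new level once S reaches 2^L.
func advanceSplit(level uint8, split, buckets uint32) (uint8, uint32, uint32) {
	split++   // Step 2: advance the split bucket index S.
	buckets++ // Step 5: one more bucket in the table.
	if split == 1<<level { // Step 3: S reached 2^L.
		level++
		split = 0
	}
	return level, split, buckets
}

func main() {
	level, split, buckets := advanceSplit(0, 0, 1)
	fmt.Println(level, split, buckets) // 1 0 2
}
```
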
166 | ### Removal
167 |
168 | The removal operation looks up a bucket by key, removes a slot from the bucket, overwrites the bucket in the index
169 | and then appends a new "delete" record to the WAL.
170 |
171 | ## Compaction
172 |
173 | Since the WAL is append-only, the disk space occupied by overwritten or deleted keys is not reclaimed immediately.
174 | Pogreb supports optional online compaction.
175 |
176 | Every time a key is overwritten or deleted, Pogreb increments the number of "deleted" bytes and keys for the
177 | corresponding WAL segment.
178 | The background compaction thread periodically loops through the WAL segment metadata and picks segments with 50% or
179 | higher disk space fragmentation for compaction.
180 | The compaction thread finds the segment's live records (those not deleted or overwritten) by looking up keys in the index.
181 | It writes live records to a new segment file and updates the corresponding slots in the index file.
182 | After the compaction is successfully finished, the compacted segment files are removed.
183 |
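Restated as code, the picking predicate comes down to two comparisons. The sketch mirrors the checks in `pickForCompaction` (compaction.go); the helper and its parameters are illustrative, with the thresholds corresponding to the configurable minimum segment size and fragmentation options.

```go
package main

import "fmt"

// eligible reports whether a segment is a compaction candidate:
// it must be large enough and sufficiently fragmented.
func eligible(size, deletedBytes, minSize uint32, minFragmentation float32) bool {
	if size < minSize {
		return false
	}
	return float32(deletedBytes)/float32(size) >= minFragmentation
}

func main() {
	fmt.Println(eligible(1024, 600, 512, 0.5)) // true: 600/1024 is ~59% fragmented
	fmt.Println(eligible(1024, 100, 512, 0.5)) // false: only ~10% fragmented
}
```
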
184 | ## Recovery
185 |
186 | In the event of a crash caused by a power loss or an operating system failure, Pogreb discards the index and replays the
187 | WAL, building a new index from scratch.
188 | Segments are iterated from the oldest to the newest and items are inserted into the index.
189 |
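A condensed view of the replay loop follows (a sketch of the logic in `(*DB).recover`; the record type and the map-based index below are stand-ins, not Pogreb's structures):

```go
package main

import "fmt"

// walRecord is a stand-in for Pogreb's record type.
type walRecord struct {
	key   string
	isPut bool
}

// rebuild replays records from oldest to newest into a fresh index:
// put records insert a key, delete records remove one.
func rebuild(records []walRecord) map[string]bool {
	index := map[string]bool{}
	for _, rec := range records {
		if rec.isPut {
			index[rec.key] = true
		} else {
			delete(index, rec.key)
		}
	}
	return index
}

func main() {
	recs := []walRecord{{"a", true}, {"b", true}, {"a", false}}
	fmt.Println(rebuild(recs)) // map[b:true]
}
```
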
190 | # Limitations
191 |
192 | The design choices made to optimize for point lookups bring limitations for other potential use-cases. For example, using a hash table for indexing makes range scans impossible. Additionally, having a single hash table shared across all WAL segments makes the recovery process require rebuilding the entire index, which may be impractical for large databases.
--------------------------------------------------------------------------------
/fs/fs_test.go:
--------------------------------------------------------------------------------
1 | package fs
2 |
3 | import (
4 | "io"
5 | "os"
6 | "testing"
7 |
8 | "github.com/akrylysov/pogreb/internal/assert"
9 | )
10 |
11 | const (
12 | lockTestPath = "test.lock"
13 | )
14 |
15 | var (
16 | lockTestMode = os.FileMode(0666)
17 | )
18 |
19 | func testLockFile(t *testing.T, fs FileSystem) {
20 | _ = fs.Remove(lockTestPath)
21 | lock, acquiredExisting, err := fs.CreateLockFile(lockTestPath, lockTestMode)
22 | if lock == nil || acquiredExisting || err != nil {
23 | t.Fatal(lock, err, acquiredExisting)
24 | }
25 | lock2, acquiredExisting2, err2 := fs.CreateLockFile(lockTestPath, lockTestMode)
26 | if lock2 != nil || acquiredExisting2 || err2 != os.ErrExist {
27 | t.Fatal(lock2, acquiredExisting2, err2)
28 | }
29 |
30 | err = lock.Unlock()
31 | assert.Nil(t, err)
32 |
33 | _, err = fs.Stat(lockTestPath)
34 | assert.NotNil(t, err)
35 | }
36 |
37 | func touchFile(fs FileSystem, path string) error {
38 | f, err := fs.OpenFile(path, os.O_CREATE|os.O_RDWR|os.O_TRUNC, os.FileMode(0666))
39 | if err != nil {
40 | return err
41 | }
42 | return f.Close()
43 | }
44 |
45 | func testLockFileAcquireExisting(t *testing.T, fs FileSystem) {
46 | err := touchFile(fs, lockTestPath)
47 | assert.Nil(t, err)
48 |
49 | lock, acquiredExisting, err := fs.CreateLockFile(lockTestPath, lockTestMode)
50 | if lock == nil || !acquiredExisting || err != nil {
51 | t.Fatal(lock, err, acquiredExisting)
52 | }
53 |
54 | err = lock.Unlock()
55 | assert.Nil(t, err)
56 |
57 | _, err = fs.Stat(lockTestPath)
58 | assert.NotNil(t, err)
59 | }
60 |
61 | func testFS(t *testing.T, fsys FileSystem) {
62 | f, err := fsys.OpenFile("test", os.O_CREATE|os.O_RDWR|os.O_TRUNC, os.FileMode(0666))
63 | assert.Nil(t, err)
64 |
65 | buf := make([]byte, 10)
66 |
67 | t.Run("Empty file", func(t *testing.T) {
68 | off, err := f.Seek(0, io.SeekCurrent)
69 | assert.Nil(t, err)
70 | assert.Equal(t, int64(0), off)
71 |
72 | n, err := f.Read(buf)
73 | assert.Equal(t, 0, n)
74 | assert.Equal(t, io.EOF, err)
75 |
76 | n, err = f.ReadAt(buf, 0)
77 | assert.Equal(t, 0, n)
78 | assert.Equal(t, io.EOF, err)
79 |
80 | n, err = f.ReadAt(buf, 10)
81 | assert.Equal(t, 0, n)
82 | assert.Equal(t, io.EOF, err)
83 |
84 | b, err := f.Slice(1, 10)
85 | assert.Equal(t, io.EOF, err)
86 | assert.Nil(t, b)
87 | })
88 |
89 | testData := []byte{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}
90 |
91 | t.Run("Write", func(t *testing.T) {
92 | n, err := f.Write(testData[:9])
93 | assert.Nil(t, err)
94 | assert.Equal(t, 9, n)
95 |
96 | off, err := f.Seek(0, io.SeekCurrent)
97 | assert.Nil(t, err)
98 | assert.Equal(t, int64(9), off)
99 | })
100 |
101 | t.Run("Write beyond EOF", func(t *testing.T) {
102 | off, err := f.Seek(2, io.SeekStart)
103 | assert.Nil(t, err)
104 | assert.Equal(t, int64(2), off)
105 |
106 | n, err := f.Write(testData[2:])
107 | assert.Nil(t, err)
108 | assert.Equal(t, 8, n)
109 |
110 | off, err = f.Seek(0, io.SeekCurrent)
111 | assert.Nil(t, err)
112 | assert.Equal(t, int64(10), off)
113 | })
114 |
115 | t.Run("Slice", func(t *testing.T) {
116 | b, err := f.Slice(1, 9)
117 | assert.Nil(t, err)
118 | assert.Equal(t, testData[1:9], b)
119 |
120 | b, err = f.Slice(0, 10)
121 | assert.Nil(t, err)
122 | assert.Equal(t, testData, b)
123 |
124 | // Offset larger than mapping.
125 | b, err = f.Slice(0, 12)
126 | assert.Equal(t, io.EOF, err)
127 | assert.Nil(t, b)
128 | })
129 |
130 | t.Run("WriteAt", func(t *testing.T) {
131 | n, err := f.WriteAt(testData[1:4], 1)
132 | assert.Nil(t, err)
133 | assert.Equal(t, 3, n)
134 |
135 | // WriteAt doesn't move offset.
136 | off, err := f.Seek(0, io.SeekCurrent)
137 | assert.Nil(t, err)
138 | assert.Equal(t, int64(10), off)
139 | })
140 |
141 | t.Run("Sync", func(t *testing.T) {
142 | // Not tested yet, just make sure it doesn't return an error.
143 | assert.Nil(t, f.Sync())
144 | })
145 |
146 | t.Run("Stat", func(t *testing.T) {
147 | fi, err := f.Stat()
148 | assert.Nil(t, err)
149 | assert.Equal(t, "test", fi.Name())
150 | assert.Equal(t, int64(len(testData)), fi.Size())
151 | assert.Equal(t, false, fi.IsDir())
152 | // FIXME: not implemented for all file systems.
153 | // assert.Equal(t, os.FileMode(0666), fi.Mode())
154 | _ = fi.Mode()
155 | _ = fi.ModTime()
156 | _ = fi.Sys()
157 |
158 | // File doesn't exist.
159 | _, err = fsys.Stat("foobar")
160 | assert.NotNil(t, err)
161 | })
162 |
163 | t.Run("ReadAt", func(t *testing.T) {
164 | n, err := f.ReadAt(buf, 0)
165 | assert.Nil(t, err)
166 | assert.Equal(t, len(testData), n)
167 | assert.Equal(t, testData, buf)
168 | })
169 |
170 | t.Run("Read EOF", func(t *testing.T) {
171 | n, err := f.Read(buf)
172 | assert.Equal(t, io.EOF, err)
173 | assert.Equal(t, 0, n)
174 | })
175 |
176 | t.Run("Read", func(t *testing.T) {
177 | // SeekEnd and Read
178 | off, err := f.Seek(0, io.SeekEnd)
179 | assert.Nil(t, err)
180 | assert.Equal(t, int64(len(testData)), off)
181 |
182 | n, err := f.Read(buf)
183 | assert.Equal(t, io.EOF, err)
184 | assert.Equal(t, 0, n)
185 |
186 | // SeekStart and Read
187 | off, err = f.Seek(0, io.SeekStart)
188 | assert.Nil(t, err)
189 | assert.Equal(t, int64(0), off)
190 |
191 | n, err = f.Read(buf)
192 | assert.Nil(t, err)
193 | assert.Equal(t, len(testData), n)
194 | assert.Equal(t, testData, buf)
195 |
196 | off, err = f.Seek(0, io.SeekCurrent)
197 | assert.Equal(t, int64(n), off)
198 | assert.Nil(t, err)
199 |
200 | // SeekStart 2 and Read
201 | testOff := int64(2)
202 | lbuf := make([]byte, 8)
203 | off, err = f.Seek(testOff, io.SeekStart)
204 | assert.Nil(t, err)
205 | assert.Equal(t, testOff, off)
206 |
207 | n, err = f.Read(lbuf)
208 | assert.Nil(t, err)
209 | assert.Equal(t, len(testData)-int(testOff), n)
210 | assert.Equal(t, testData[testOff:], lbuf)
211 | })
212 |
213 | t.Run("Read larger than file", func(t *testing.T) {
214 | off, err := f.Seek(0, io.SeekStart)
215 | assert.Nil(t, err)
216 | assert.Equal(t, int64(0), off)
217 |
218 | lbuf := make([]byte, 4096)
219 | n, err := f.Read(lbuf)
220 | assert.Nil(t, err)
221 | assert.Equal(t, len(testData), n)
222 | assert.Equal(t, testData, lbuf[:n])
223 |
224 | n, err = f.Read(lbuf)
225 | assert.Equal(t, io.EOF, err)
226 | assert.Equal(t, 0, n)
227 | })
228 |
229 | t.Run("Close and Open again", func(t *testing.T) {
230 | assert.Nil(t, f.Close())
231 |
232 | f, err = fsys.OpenFile("test", os.O_RDWR, os.FileMode(0666))
233 | assert.Nil(t, err)
234 |
235 | b, err := f.Slice(1, 10)
236 | assert.Nil(t, err)
237 | assert.Equal(t, testData[1:], b)
238 | })
239 |
240 | t.Run("Truncate extend", func(t *testing.T) {
241 | err := f.Truncate(11)
242 | assert.Nil(t, err)
243 |
244 | lbuf := make([]byte, 11)
245 | n, err := f.ReadAt(lbuf, 0)
246 | assert.Nil(t, err)
247 | assert.Equal(t, 11, n)
248 | assert.Equal(t, testData, lbuf[:10])
249 |
250 | b, err := f.Slice(0, 11)
251 | assert.Nil(t, err)
252 | assert.Equal(t, testData, b[:10])
253 |
254 | fi, err := f.Stat()
255 | assert.Nil(t, err)
256 | assert.Equal(t, int64(11), fi.Size())
257 | })
258 |
259 | t.Run("Truncate shrink", func(t *testing.T) {
260 | err := f.Truncate(1)
261 | assert.Nil(t, err)
262 |
263 | lbuf := make([]byte, 1)
264 | n, err := f.ReadAt(lbuf, 0)
265 | assert.Nil(t, err)
266 | assert.Equal(t, 1, n)
267 | assert.Equal(t, testData[:1], lbuf)
268 |
269 | b, err := f.Slice(0, 1)
270 | assert.Nil(t, err)
271 | assert.Equal(t, testData[:1], b)
272 |
273 | b, err = f.Slice(0, 10)
274 | assert.Equal(t, io.EOF, err)
275 | assert.Nil(t, b)
276 |
277 | fi, err := f.Stat()
278 | assert.Nil(t, err)
279 | assert.Equal(t, int64(1), fi.Size())
280 | })
281 |
282 | t.Run("Truncate shrink to zero", func(t *testing.T) {
283 | err := f.Truncate(0)
284 | assert.Nil(t, err)
285 |
286 | n, err := f.ReadAt(buf, 0)
287 | assert.Equal(t, io.EOF, err)
288 | assert.Equal(t, 0, n)
289 |
290 | b, err := f.Slice(0, 1)
291 | assert.Equal(t, io.EOF, err)
292 | assert.Nil(t, b)
293 |
294 | fi, err := f.Stat()
295 | assert.Nil(t, err)
296 | assert.Equal(t, int64(0), fi.Size())
297 | })
298 |
299 | t.Run("Close", func(t *testing.T) {
300 | assert.Nil(t, f.Close())
301 |
302 | err := f.Close()
303 | assert.NotNil(t, err)
304 |
305 | _, err = f.Seek(1, io.SeekStart)
306 | assert.NotNil(t, err)
307 | })
308 |
309 | t.Run("Rename", func(t *testing.T) {
310 | err := fsys.Rename("foobar", "baz")
311 | assert.NotNil(t, err)
312 |
313 | assert.Nil(t, fsys.Rename("test", "test2"))
314 | fi, err := fsys.Stat("test2")
315 | assert.Nil(t, err)
316 | assert.Equal(t, int64(0), fi.Size())
317 | assert.Equal(t, "test2", fi.Name())
318 | })
319 |
320 | t.Run("ReadDir", func(t *testing.T) {
321 | fis, err := fsys.ReadDir(".")
322 | assert.Nil(t, err)
323 |
324 | var hasTestFile bool
325 | for _, fi := range fis {
326 | if fi.Name() == "test2" {
327 | hasTestFile = true
328 | }
329 | }
330 | assert.Equal(t, true, hasTestFile)
331 | })
332 |
333 | t.Run("Remove", func(t *testing.T) {
334 | err := fsys.Remove("test2")
335 | assert.Nil(t, err)
336 |
337 | _, err = fsys.Stat("test2")
338 | assert.NotNil(t, err)
339 | })
340 | }
341 |
--------------------------------------------------------------------------------
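
The tests above pin down the fs.File contract, including Slice, which returns a byte range of the file without copying (backed by the mapping for the mmap-based file system) and yields io.EOF for ranges past the end of the file. Below is a minimal sketch of using that interface directly against the in-memory file system; the file name "demo" and the sample data are illustrative only.

    package main

    import (
        "fmt"
        "os"

        "github.com/akrylysov/pogreb/fs"
    )

    func main() {
        f, err := fs.Mem.OpenFile("demo", os.O_CREATE|os.O_RDWR|os.O_TRUNC, 0666)
        if err != nil {
            panic(err)
        }
        defer f.Close()

        if _, err := f.Write([]byte("hello world")); err != nil {
            panic(err)
        }

        // Slice returns bytes [start, end) of the file; per the tests above,
        // a range extending past the end of the file returns io.EOF.
        b, err := f.Slice(6, 11)
        if err != nil {
            panic(err)
        }
        fmt.Printf("%s\n", b) // world
    }
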
/index.go:
--------------------------------------------------------------------------------
1 | package pogreb
2 |
3 | import (
4 | "github.com/akrylysov/pogreb/internal/errors"
5 | )
6 |
7 | const (
8 | indexExt = ".pix"
9 | indexMainName = "main" + indexExt
10 | indexOverflowName = "overflow" + indexExt
11 | indexMetaName = "index" + metaExt
12 | loadFactor = 0.7
13 | )
14 |
15 | // index is an on-disk linear hashing hash table.
16 | // It uses two files to store the hash table on disk - "main" and "overflow" index files.
17 | // Each index file holds an array of buckets.
18 | type index struct {
19 | opts *Options
20 | main *file // Main index file.
21 | overflow *file // Overflow index file.
22 | freeBucketOffs []int64 // Offsets of freed buckets.
23 | level uint8 // Linear hashing level; the table holds between 2^level and 2^(level+1) buckets.
24 | numKeys uint32 // Number of keys.
25 | numBuckets uint32 // Number of buckets.
26 | splitBucketIdx uint32 // Index of the bucket to split on next split.
27 | }
28 |
29 | type indexMeta struct {
30 | Level uint8
31 | NumKeys uint32
32 | NumBuckets uint32
33 | SplitBucketIndex uint32
34 | FreeOverflowBuckets []int64
35 | }
36 |
37 | // matchKeyFunc returns whether the slot matches the key sought.
38 | type matchKeyFunc func(slot) (bool, error)
39 |
40 | func openIndex(opts *Options) (*index, error) {
41 | main, err := openFile(opts.FileSystem, indexMainName, openFileFlags{})
42 | if err != nil {
43 | return nil, errors.Wrap(err, "opening main index")
44 | }
45 | overflow, err := openFile(opts.FileSystem, indexOverflowName, openFileFlags{})
46 | if err != nil {
47 | _ = main.Close()
48 | return nil, errors.Wrap(err, "opening overflow index")
49 | }
50 | idx := &index{
51 | opts: opts,
52 | main: main,
53 | overflow: overflow,
54 | numBuckets: 1,
55 | }
56 | if main.empty() {
57 | // Add an empty bucket.
58 | if _, err = idx.main.extend(bucketSize); err != nil {
59 | _ = main.Close()
60 | _ = overflow.Close()
61 | return nil, err
62 | }
63 | } else if err := idx.readMeta(); err != nil {
64 | _ = main.Close()
65 | _ = overflow.Close()
66 | return nil, errors.Wrap(err, "opening index meta")
67 | }
68 | return idx, nil
69 | }
70 |
71 | func (idx *index) writeMeta() error {
72 | m := indexMeta{
73 | Level: idx.level,
74 | NumKeys: idx.numKeys,
75 | NumBuckets: idx.numBuckets,
76 | SplitBucketIndex: idx.splitBucketIdx,
77 | FreeOverflowBuckets: idx.freeBucketOffs,
78 | }
79 | return writeGobFile(idx.opts.FileSystem, indexMetaName, m)
80 | }
81 |
82 | func (idx *index) readMeta() error {
83 | m := indexMeta{}
84 | if err := readGobFile(idx.opts.FileSystem, indexMetaName, &m); err != nil {
85 | return err
86 | }
87 | idx.level = m.Level
88 | idx.numKeys = m.NumKeys
89 | idx.numBuckets = m.NumBuckets
90 | idx.splitBucketIdx = m.SplitBucketIndex
91 | idx.freeBucketOffs = m.FreeOverflowBuckets
92 | return nil
93 | }
94 |
95 | func (idx *index) bucketIndex(hash uint32) uint32 {
96 | bidx := hash & ((1 << idx.level) - 1)
97 | if bidx < idx.splitBucketIdx {
98 | return hash & ((1 << (idx.level + 1)) - 1)
99 | }
100 | return bidx
101 | }
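
bucketIndex is the standard linear hashing address calculation: a hash is first reduced modulo 2^level, and if that bucket has already been split in the current round, the hash is reduced modulo 2^(level+1) instead, sending the key to either the old bucket or its split image. A self-contained sketch of the same calculation; the level and split-pointer values are made up for the example.

    package main

    import "fmt"

    // bucketIndex mirrors the address calculation above, for illustration.
    func bucketIndex(hash uint32, level uint8, splitBucketIdx uint32) uint32 {
        bidx := hash & ((1 << level) - 1)
        if bidx < splitBucketIdx {
            return hash & ((1 << (level + 1)) - 1)
        }
        return bidx
    }

    func main() {
        // level=2: 4 base buckets; buckets 0 and 1 have already been split.
        const level, split = 2, 2
        for _, h := range []uint32{4, 5, 6, 7} {
            fmt.Printf("hash %d -> bucket %d\n", h, bucketIndex(h, level, split))
        }
        // hash 4 -> bucket 4 (bucket 0 was split, so the extra hash bit is used)
        // hash 5 -> bucket 5 (bucket 1 was split)
        // hash 6 -> bucket 2 (not yet split this round)
        // hash 7 -> bucket 3 (not yet split this round)
    }
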
102 |
103 | type bucketIterator struct {
104 | off int64 // Offset of the next bucket.
105 | f *file // Current index file.
106 | overflow *file // Overflow index file.
107 | }
108 |
109 | // bucketOffset returns the on-disk offset of the bucket with the given index.
110 | func bucketOffset(idx uint32) int64 {
111 | return int64(headerSize) + (int64(bucketSize) * int64(idx))
112 | }
113 |
114 | func (idx *index) newBucketIterator(startBucketIdx uint32) *bucketIterator {
115 | return &bucketIterator{
116 | off: bucketOffset(startBucketIdx),
117 | f: idx.main,
118 | overflow: idx.overflow,
119 | }
120 | }
121 |
122 | func (it *bucketIterator) next() (bucketHandle, error) {
123 | if it.off == 0 {
124 | return bucketHandle{}, ErrIterationDone
125 | }
126 | b := bucketHandle{file: it.f, offset: it.off}
127 | if err := b.read(); err != nil {
128 | return bucketHandle{}, err
129 | }
130 | it.f = it.overflow
131 | it.off = b.next
132 | return b, nil
133 | }
134 |
135 | func (idx *index) get(hash uint32, matchKey matchKeyFunc) error {
136 | it := idx.newBucketIterator(idx.bucketIndex(hash))
137 | for {
138 | b, err := it.next()
139 | if err == ErrIterationDone {
140 | return nil
141 | }
142 | if err != nil {
143 | return err
144 | }
145 | for i := 0; i < slotsPerBucket; i++ {
146 | sl := b.slots[i]
147 | // No more slots in the bucket.
148 | if sl.offset == 0 {
149 | break
150 | }
151 | if hash != sl.hash {
152 | continue
153 | }
154 | if match, err := matchKey(sl); match || err != nil {
155 | return err
156 | }
157 | }
158 | }
159 | }
160 |
161 | func (idx *index) findInsertionBucket(newSlot slot, matchKey matchKeyFunc) (*slotWriter, bool, error) {
162 | sw := &slotWriter{}
163 | it := idx.newBucketIterator(idx.bucketIndex(newSlot.hash))
164 | for {
165 | b, err := it.next()
166 | if err == ErrIterationDone {
167 | return nil, false, errors.New("failed to insert a new slot")
168 | }
169 | if err != nil {
170 | return nil, false, err
171 | }
172 | sw.bucket = &b
173 | var i int
174 | for i = 0; i < slotsPerBucket; i++ {
175 | sl := b.slots[i]
176 | if sl.offset == 0 {
177 | // Found an empty slot.
178 | sw.slotIdx = i
179 | return sw, false, nil
180 | }
181 | if newSlot.hash != sl.hash {
182 | continue
183 | }
184 | match, err := matchKey(sl)
185 | if err != nil {
186 | return nil, false, err
187 | }
188 | if match {
189 | // Key already in the index.
190 | // The slot writer will overwrite the existing slot.
191 | sw.slotIdx = i
192 | return sw, true, nil
193 | }
194 | }
195 | if b.next == 0 {
196 | // No more buckets in the chain.
197 | sw.slotIdx = i
198 | return sw, false, nil
199 | }
200 | }
201 | }
202 |
203 | func (idx *index) put(newSlot slot, matchKey matchKeyFunc) error {
204 | if idx.numKeys == MaxKeys {
205 | return errFull
206 | }
207 | sw, overwritingExisting, err := idx.findInsertionBucket(newSlot, matchKey)
208 | if err != nil {
209 | return err
210 | }
211 | if err := sw.insert(newSlot, idx); err != nil {
212 | return err
213 | }
214 | if err := sw.write(); err != nil {
215 | return err
216 | }
217 | if overwritingExisting {
218 | return nil
219 | }
220 | idx.numKeys++
221 | if float64(idx.numKeys)/float64(idx.numBuckets*slotsPerBucket) > loadFactor {
222 | if err := idx.split(); err != nil {
223 | return err
224 | }
225 | }
226 | return nil
227 | }
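
put grows the table incrementally: once the fraction of occupied slots exceeds loadFactor (0.7), one bucket is split per insert, so the cost of doubling the table is amortized over many writes. A tiny sketch of the threshold arithmetic; the slotsPerBucket value here is hypothetical (the real constant is defined in bucket.go and may differ).

    package main

    import "fmt"

    func main() {
        // Hypothetical numbers for illustration only.
        const slotsPerBucket = 28
        numBuckets := uint32(4)
        numKeys := uint32(79)
        overloaded := float64(numKeys)/float64(numBuckets*slotsPerBucket) > 0.7
        fmt.Println(overloaded) // true: 79/112 ≈ 0.705 > 0.7, so a split would trigger
    }
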
228 |
229 | func (idx *index) delete(hash uint32, matchKey matchKeyFunc) error {
230 | it := idx.newBucketIterator(idx.bucketIndex(hash))
231 | for {
232 | b, err := it.next()
233 | if err == ErrIterationDone {
234 | return nil
235 | }
236 | if err != nil {
237 | return err
238 | }
239 | for i := 0; i < slotsPerBucket; i++ {
240 | sl := b.slots[i]
241 | if sl.offset == 0 {
242 | break
243 | }
244 | if hash != sl.hash {
245 | continue
246 | }
247 | match, err := matchKey(sl)
248 | if err != nil {
249 | return err
250 | }
251 | if !match {
252 | continue
253 | }
254 | b.del(i)
255 | if err := b.write(); err != nil {
256 | return err
257 | }
258 | idx.numKeys--
259 | return nil
260 | }
261 | }
262 | }
263 |
264 | func (idx *index) createOverflowBucket() (*bucketHandle, error) {
265 | var off int64
266 | if len(idx.freeBucketOffs) > 0 {
267 | off = idx.freeBucketOffs[0]
268 | idx.freeBucketOffs = idx.freeBucketOffs[1:]
269 | } else {
270 | var err error
271 | off, err = idx.overflow.extend(bucketSize)
272 | if err != nil {
273 | return nil, err
274 | }
275 | }
276 | return &bucketHandle{file: idx.overflow, offset: off}, nil
277 | }
278 |
279 | func (idx *index) freeOverflowBucket(offsets ...int64) {
280 | idx.freeBucketOffs = append(idx.freeBucketOffs, offsets...)
281 | }
282 |
283 | func (idx *index) split() error {
284 | updatedBucketIdx := idx.splitBucketIdx
285 | updatedBucketOff := bucketOffset(updatedBucketIdx)
286 | updatedBucket := slotWriter{
287 | bucket: &bucketHandle{file: idx.main, offset: updatedBucketOff},
288 | }
289 |
290 | newBucketOff, err := idx.main.extend(bucketSize)
291 | if err != nil {
292 | return err
293 | }
294 |
295 | sw := slotWriter{
296 | bucket: &bucketHandle{file: idx.main, offset: newBucketOff},
297 | }
298 |
299 | idx.splitBucketIdx++
300 | if idx.splitBucketIdx == 1<<idx.level {
301 | idx.splitBucketIdx = 0
302 | idx.level++
303 | }
--------------------------------------------------------------------------------
/db.go:
--------------------------------------------------------------------------------
114 | if db.opts.BackgroundSyncInterval > 0 || db.opts.BackgroundCompactionInterval > 0 {
115 | db.startBackgroundWorker()
116 | }
117 |
118 | return db, nil
119 | }
120 |
121 | func cloneBytes(src []byte) []byte {
122 | dst := make([]byte, len(src))
123 | copy(dst, src)
124 | return dst
125 | }
126 |
127 | func (db *DB) writeMeta() error {
128 | m := dbMeta{
129 | HashSeed: db.hashSeed,
130 | }
131 | return writeGobFile(db.opts.FileSystem, dbMetaName, m)
132 | }
133 |
134 | func (db *DB) readMeta() error {
135 | m := dbMeta{}
136 | if err := readGobFile(db.opts.FileSystem, dbMetaName, &m); err != nil {
137 | return err
138 | }
139 | db.hashSeed = m.HashSeed
140 | return nil
141 | }
142 |
143 | func (db *DB) hash(data []byte) uint32 {
144 | return hash.Sum32WithSeed(data, db.hashSeed)
145 | }
146 |
147 | // newNullableTicker is a wrapper around time.NewTicker that returns a nil channel when the duration is not positive.
148 | // A receive from a nil channel blocks forever, so a "nil ticker" never ticks.
149 | func newNullableTicker(d time.Duration) (<-chan time.Time, func()) {
150 | if d > 0 {
151 | t := time.NewTicker(d)
152 | return t.C, t.Stop
153 | }
154 | return nil, func() {}
155 | }
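
The trick newNullableTicker relies on is a core Go semantic: receiving from a nil channel blocks forever, so a select case on a nil ticker channel simply never fires and costs nothing in the background worker's loop. A minimal, self-contained demonstration:

    package main

    import (
        "fmt"
        "time"
    )

    func main() {
        var never <-chan time.Time // nil channel: a receive blocks forever

        select {
        case <-never:
            fmt.Println("unreachable")
        case <-time.After(50 * time.Millisecond):
            fmt.Println("the nil ticker never ticked")
        }
    }
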
156 |
157 | func (db *DB) startBackgroundWorker() {
158 | ctx, cancel := context.WithCancel(context.Background())
159 | db.cancelBgWorker = cancel
160 | db.closeWg.Add(1)
161 |
162 | go func() {
163 | defer db.closeWg.Done()
164 |
165 | syncC, syncStop := newNullableTicker(db.opts.BackgroundSyncInterval)
166 | defer syncStop()
167 |
168 | compactC, compactStop := newNullableTicker(db.opts.BackgroundCompactionInterval)
169 | defer compactStop()
170 |
171 | for {
172 | select {
173 | case <-ctx.Done():
174 | return
175 | case <-syncC:
176 | if err := db.Sync(); err != nil {
177 | logger.Printf("error synchronizing database: %v", err)
178 | }
179 | case <-compactC:
180 | if cr, err := db.Compact(); err != nil {
181 | logger.Printf("error compacting database: %v", err)
182 | } else if cr.CompactedSegments > 0 {
183 | logger.Printf("compacted database: %+v", cr)
184 | }
185 | }
186 | }
187 | }()
188 | }
189 |
190 | // Get returns the value for the given key stored in the DB or nil if the key doesn't exist.
191 | func (db *DB) Get(key []byte) ([]byte, error) {
192 | h := db.hash(key)
193 | db.metrics.Gets.Add(1)
194 | db.mu.RLock()
195 | defer db.mu.RUnlock()
196 | var retValue []byte
197 | err := db.index.get(h, func(sl slot) (bool, error) {
198 | if uint16(len(key)) != sl.keySize {
199 | return false, nil
200 | }
201 | slKey, value, err := db.datalog.readKeyValue(sl)
202 | if err != nil {
203 | return true, err
204 | }
205 | if bytes.Equal(key, slKey) {
206 | retValue = cloneBytes(value)
207 | return true, nil
208 | }
209 | db.metrics.HashCollisions.Add(1)
210 | return false, nil
211 | })
212 | if err != nil {
213 | return nil, err
214 | }
215 | return retValue, nil
216 | }
217 |
218 | // GetAppend appends the value for the given key to buf and returns the resulting slice, or nil if the key doesn't exist.
219 | func (db *DB) GetAppend(key, buf []byte) ([]byte, error) {
220 | h := db.hash(key)
221 | db.metrics.Gets.Add(1)
222 | db.mu.RLock()
223 | defer db.mu.RUnlock()
224 | var retValue []byte
225 | err := db.index.get(h, func(sl slot) (bool, error) {
226 | if uint16(len(key)) != sl.keySize {
227 | return false, nil
228 | }
229 | slKey, value, err := db.datalog.readKeyValue(sl)
230 | if err != nil {
231 | return true, err
232 | }
233 | if bytes.Equal(key, slKey) {
234 | retValue = append(buf, value...)
235 | return true, nil
236 | }
237 | db.metrics.HashCollisions.Add(1)
238 | return false, nil
239 | })
240 | if err != nil {
241 | return nil, err
242 | }
243 | return retValue, nil
244 | }
245 |
246 | // Has returns true if the DB contains the given key.
247 | func (db *DB) Has(key []byte) (bool, error) {
248 | h := db.hash(key)
249 | db.metrics.Gets.Add(1)
250 | found := false
251 | db.mu.RLock()
252 | defer db.mu.RUnlock()
253 | err := db.index.get(h, func(sl slot) (bool, error) {
254 | if uint16(len(key)) != sl.keySize {
255 | return false, nil
256 | }
257 | slKey, err := db.datalog.readKey(sl)
258 | if err != nil {
259 | return true, err
260 | }
261 | if bytes.Equal(key, slKey) {
262 | found = true
263 | return true, nil
264 | }
265 | return false, nil
266 | })
267 | if err != nil {
268 | return false, err
269 | }
270 | return found, nil
271 | }
272 |
273 | func (db *DB) put(sl slot, key []byte) error {
274 | return db.index.put(sl, func(cursl slot) (bool, error) {
275 | if uint16(len(key)) != cursl.keySize {
276 | return false, nil
277 | }
278 | slKey, err := db.datalog.readKey(cursl)
279 | if err != nil {
280 | return true, err
281 | }
282 | if bytes.Equal(key, slKey) {
283 | db.datalog.trackDel(cursl) // Overwriting existing key.
284 | return true, nil
285 | }
286 | return false, nil
287 | })
288 | }
289 |
290 | // Put sets the value for the given key, overwriting the existing value if the key is already present.
291 | func (db *DB) Put(key []byte, value []byte) error {
292 | if len(key) > MaxKeyLength {
293 | return errKeyTooLarge
294 | }
295 | if len(value) > MaxValueLength {
296 | return errValueTooLarge
297 | }
298 | h := db.hash(key)
299 | db.metrics.Puts.Add(1)
300 | db.mu.Lock()
301 | defer db.mu.Unlock()
302 |
303 | segID, offset, err := db.datalog.put(key, value)
304 | if err != nil {
305 | return err
306 | }
307 |
308 | sl := slot{
309 | hash: h,
310 | segmentID: segID,
311 | keySize: uint16(len(key)),
312 | valueSize: uint32(len(value)),
313 | offset: offset,
314 | }
315 |
316 | if err := db.put(sl, key); err != nil {
317 | return err
318 | }
319 |
320 | if db.syncWrites {
321 | return db.sync()
322 | }
323 | return nil
324 | }
325 |
326 | func (db *DB) del(h uint32, key []byte, writeWAL bool) error {
327 | err := db.index.delete(h, func(sl slot) (b bool, e error) {
328 | if uint16(len(key)) != sl.keySize {
329 | return false, nil
330 | }
331 | slKey, err := db.datalog.readKey(sl)
332 | if err != nil {
333 | return true, err
334 | }
335 | if bytes.Equal(key, slKey) {
336 | db.datalog.trackDel(sl)
337 | var err error
338 | if writeWAL {
339 | err = db.datalog.del(key)
340 | }
341 | return true, err
342 | }
343 | return false, nil
344 | })
345 | return err
346 | }
347 |
348 | // Delete deletes the given key from the DB.
349 | func (db *DB) Delete(key []byte) error {
350 | h := db.hash(key)
351 | db.metrics.Dels.Add(1)
352 | db.mu.Lock()
353 | defer db.mu.Unlock()
354 | if err := db.del(h, key, true); err != nil {
355 | return err
356 | }
357 | if db.syncWrites {
358 | return db.sync()
359 | }
360 | return nil
361 | }
362 |
363 | // Close closes the DB.
364 | func (db *DB) Close() error {
365 | if db.cancelBgWorker != nil {
366 | db.cancelBgWorker()
367 | }
368 | db.closeWg.Wait()
369 | db.mu.Lock()
370 | defer db.mu.Unlock()
371 | if err := db.writeMeta(); err != nil {
372 | return err
373 | }
374 | if err := db.datalog.close(); err != nil {
375 | return err
376 | }
377 | if err := db.index.close(); err != nil {
378 | return err
379 | }
380 | if err := db.lock.Unlock(); err != nil {
381 | return err
382 | }
383 | return nil
384 | }
385 |
386 | func (db *DB) sync() error {
387 | return db.datalog.sync()
388 | }
389 |
390 | // Items returns a new ItemIterator.
391 | func (db *DB) Items() *ItemIterator {
392 | return &ItemIterator{db: db}
393 | }
394 |
395 | // Sync commits the contents of the database to the backing FileSystem.
396 | func (db *DB) Sync() error {
397 | db.mu.Lock()
398 | defer db.mu.Unlock()
399 | return db.sync()
400 | }
401 |
402 | // Count returns the number of keys in the DB.
403 | func (db *DB) Count() uint32 {
404 | db.mu.RLock()
405 | defer db.mu.RUnlock()
406 | return db.index.count()
407 | }
408 |
409 | // Metrics returns the DB metrics.
410 | func (db *DB) Metrics() *Metrics {
411 | return db.metrics
412 | }
413 |
414 | // FileSize returns the total size of the disk storage used by the DB.
415 | func (db *DB) FileSize() (int64, error) {
416 | var size int64
417 | files, err := db.opts.FileSystem.ReadDir(".")
418 | if err != nil {
419 | return 0, err
420 | }
421 | for _, file := range files {
422 | info, err := file.Info()
423 | if err != nil {
424 | return 0, err
425 | }
426 | size += info.Size()
427 | }
428 | return size, nil
429 | }
430 |
--------------------------------------------------------------------------------
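
Taken together, Open, Put, Get, GetAppend, Has, Delete, Items, Sync, and Close form the public surface of the package. A short end-to-end sketch of that API, assuming a nil Options selects the defaults; "example.db" is an arbitrary path.

    package main

    import (
        "fmt"
        "log"

        "github.com/akrylysov/pogreb"
    )

    func main() {
        db, err := pogreb.Open("example.db", nil)
        if err != nil {
            log.Fatal(err)
        }
        defer db.Close()

        if err := db.Put([]byte("testKey"), []byte("testValue")); err != nil {
            log.Fatal(err)
        }

        val, err := db.Get([]byte("testKey"))
        if err != nil {
            log.Fatal(err)
        }
        fmt.Printf("%s\n", val)  // testValue
        fmt.Println(db.Count())  // 1
    }
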
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/db_test.go:
--------------------------------------------------------------------------------
1 | package pogreb
2 |
3 | import (
4 | "bufio"
5 | "encoding/binary"
6 | "flag"
7 | "fmt"
8 | "io"
9 | "log"
10 | "os"
11 | "path/filepath"
12 | "strings"
13 | "testing"
14 | "time"
15 |
16 | "github.com/akrylysov/pogreb/fs"
17 | "github.com/akrylysov/pogreb/internal/assert"
18 | )
19 |
20 | const (
21 | testDBName = "test.db"
22 | )
23 |
24 | var (
25 | // File system used for all tests.
26 | testFS fs.FileSystem
27 | )
28 |
29 | func TestMain(m *testing.M) {
30 | flag.Parse()
31 | if !testing.Verbose() {
32 | SetLogger(log.New(io.Discard, "", 0))
33 | }
34 | // Run tests against all file systems.
35 | for _, fsys := range testFileSystems {
36 | var tmpDir string
37 | if fsys == fs.Mem {
38 | testFS = fsys
39 | } else {
40 | var err error
41 | tmpDir, err = os.MkdirTemp("", "pogreb-test")
42 | if err != nil {
43 | fmt.Printf("failed to create temporary directory: %v", err)
44 | os.Exit(1)
45 | }
46 | testFS = fs.Sub(fsys, tmpDir)
47 | }
48 | if testing.Verbose() {
49 | fmt.Printf("=== SET\tFS=%T\ttmpDir=%s\n", fsys, tmpDir)
50 | }
51 | exitCode := m.Run()
52 | if tmpDir != "" {
53 | _ = os.RemoveAll(tmpDir)
54 | }
55 | if exitCode != 0 {
56 | fmt.Printf("DEBUG\tFS=%T\n", fsys)
57 | os.Exit(exitCode)
58 | }
59 | }
60 | os.Exit(0)
61 | }
62 |
63 | func appendFile(path string, data []byte) error {
64 | f, err := testFS.OpenFile(path, os.O_RDWR, os.FileMode(0640))
65 | if err != nil {
66 | return err
67 | }
68 | defer f.Close()
69 | if _, err = f.Seek(0, io.SeekEnd); err != nil {
70 | return err
71 | }
72 | _, err = f.Write(data)
73 | return err
74 | }
75 |
76 | func align512(n uint32) uint32 {
77 | return (n + 511) &^ 511
78 | }
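
align512 rounds its argument up to the next multiple of 512: adding 511 carries any partial block past the boundary, and &^ 511 (AND NOT) clears the nine low bits. A quick self-contained check of the edge cases:

    package main

    import "fmt"

    func align512(n uint32) uint32 { return (n + 511) &^ 511 }

    func main() {
        fmt.Println(align512(0), align512(1), align512(512), align512(513)) // 0 512 512 1024
    }
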
79 |
80 | func TestBucketSize(t *testing.T) {
81 | serializedSize := uint32(binary.Size(bucket{}))
82 | if bucketSize != align512(serializedSize) {
83 | t.Fatal("wrong bucketSize value", bucketSize)
84 | }
85 | if bucketSize-serializedSize > 32 {
86 | t.Fatal("bucket is wasting too much space", bucketSize, serializedSize)
87 | }
88 | }
89 |
90 | func TestHeaderSize(t *testing.T) {
91 | if headerSize != align512(uint32(binary.Size(header{}))) || headerSize != 512 {
92 | t.Fatal("wrong headerSize value", headerSize)
93 | }
94 | }
95 |
96 | func cleanDir(path string) error {
97 | files, err := testFS.ReadDir(path)
98 | if err != nil {
99 | if os.IsNotExist(err) {
100 | return nil
101 | }
102 | return err
103 | }
104 | for _, file := range files {
105 | _ = testFS.Remove(filepath.Join(path, file.Name()))
106 | }
107 | return nil
108 | }
109 |
110 | func createTestDB(opts *Options) (*DB, error) {
111 | if opts == nil {
112 | opts = &Options{FileSystem: testFS}
113 | } else {
114 | if opts.FileSystem == nil {
115 | opts.FileSystem = testFS
116 | }
117 | }
118 | if err := cleanDir(testDBName); err != nil {
119 | return nil, err
120 | }
121 | return Open(testDBName, opts)
122 | }
123 |
124 | func TestEmpty(t *testing.T) {
125 | opts := &Options{FileSystem: testFS}
126 | db, err := createTestDB(opts)
127 | assert.Nil(t, err)
128 | assert.Nil(t, db.Close())
129 | db, err = Open(testDBName, opts)
130 | assert.Nil(t, err)
131 | assert.Nil(t, db.Close())
132 | }
133 |
134 | func TestFull(t *testing.T) {
135 | fullTest(t, func(db *DB, key []byte) ([]byte, error) {
136 | return db.Get(key)
137 | })
138 | var buf []byte
139 | fullTest(t, func(db *DB, key []byte) ([]byte, error) {
140 | var err error
141 | buf, err = db.GetAppend(key, buf[:0])
142 | return buf, err
143 | })
144 | }
145 |
146 | func fullTest(t *testing.T, getFunc func(db *DB, key []byte) ([]byte, error)) {
147 | opts := &Options{
148 | BackgroundSyncInterval: -1,
149 | FileSystem: testFS,
150 | maxSegmentSize: 1024,
151 | }
152 | db, err := createTestDB(opts)
153 | assert.Nil(t, err)
154 | var i byte
155 | var n uint8 = 255
156 | assert.Equal(t, uint32(0), db.Count())
157 | for i = 0; i < n; i++ {
158 | if has, err := db.Has([]byte{i}); has || err != nil {
159 | t.Fatal(has, err)
160 | }
161 | }
162 | assert.Nil(t, db.Delete([]byte{128}))
163 | assert.Equal(t, uint32(0), db.Count())
164 | for i = 0; i < n; i++ {
165 | assert.Nil(t, db.Put([]byte{i}, []byte{i}))
166 | }
167 | assert.Equal(t, uint32(255), db.Count())
168 | assert.Equal(t, int64(n), db.Metrics().Puts.Value())
169 | assert.Nil(t, db.Sync())
170 |
171 | sz, err := db.FileSize()
172 | assert.Nil(t, err)
173 | if sz <= 0 {
174 | t.Fatal(sz)
175 | }
176 |
177 | assert.Nil(t, db.Delete([]byte{128}))
178 | assert.Equal(t, uint32(254), db.Count())
179 | if has, err := db.Has([]byte{128}); has || err != nil {
180 | t.Fatal(has, err)
181 | }
182 | assert.Nil(t, db.Put([]byte{128}, []byte{128}))
183 | assert.Equal(t, uint32(255), db.Count())
184 |
185 | verifyKeysAndClose := func(valueOffset uint8) {
186 | t.Helper()
187 | assert.Equal(t, uint32(255), db.Count())
188 | for i = 0; i < n; i++ {
189 | if has, err := db.Has([]byte{i}); !has || err != nil {
190 | t.Fatal(has, err)
191 | }
192 | if has, err := db.Has([]byte{0, i}); has || err != nil {
193 | t.Fatal(has, err)
194 | }
195 | v, err := getFunc(db, []byte{i})
196 | if err != nil {
197 | t.Fatal(err)
198 | }
199 | assert.Equal(t, []byte{i + valueOffset}, v)
200 | }
201 | assert.Nil(t, db.Close())
202 | }
203 |
204 | expectedSegMetas := db.datalog.segmentMetas()
205 | verifyKeysAndClose(0)
206 |
207 | // Open and check again
208 | db, err = Open(testDBName, opts)
209 | assert.Nil(t, err)
210 | verifyKeysAndClose(0)
211 |
212 | // Simulate crash.
213 | assert.Nil(t, touchFile(testFS, filepath.Join(testDBName, lockName)))
214 | assert.Nil(t, testFS.Remove(filepath.Join(testDBName, segmentMetaName(0, 1))))
215 | assert.Nil(t, testFS.Remove(filepath.Join(testDBName, indexMetaName)))
216 |
217 | // Open and check again
218 | db, err = Open(testDBName, opts)
219 | assert.Nil(t, err)
220 | verifyKeysAndClose(0)
221 |
222 | assert.Equal(t, expectedSegMetas, db.datalog.segmentMetas())
223 |
224 | // Update all items
225 | db, err = Open(testDBName, opts)
226 | assert.Nil(t, err)
227 | for i = 0; i < n; i++ {
228 | assert.Nil(t, db.Put([]byte{i}, []byte{i + 6}))
229 | }
230 | verifyKeysAndClose(6)
231 |
232 | // Delete all items
233 | db, err = Open(testDBName, &Options{BackgroundSyncInterval: time.Millisecond, FileSystem: testFS})
234 | assert.Nil(t, err)
235 | for i = 0; i < n; i++ {
236 | assert.Nil(t, db.Delete([]byte{i}))
237 | }
238 | for i = 0; i < n; i++ {
239 | if has, err := db.Has([]byte{i}); has || err != nil {
240 | t.Fatal(has, err)
241 | }
242 | }
243 | assert.Equal(t, uint32(0), db.Count())
244 | assert.Nil(t, db.Close())
245 | }
246 |
247 | func TestLock(t *testing.T) {
248 | opts := &Options{FileSystem: testFS}
249 | db, err := createTestDB(opts)
250 | assert.Nil(t, err)
251 |
252 | // Opening already opened database returns an error.
253 | db2, err2 := Open(testDBName, opts)
254 | assert.Nil(t, db2)
255 | assert.NotNil(t, err2)
256 |
257 | assert.Nil(t, db.Close())
258 | }
259 |
260 | func TestEmptyKey(t *testing.T) {
261 | db, err := createTestDB(nil)
262 | assert.Nil(t, err)
263 | if err := db.Put([]byte{}, []byte{1}); err != nil {
264 | t.Fatal(err)
265 | }
266 | v, err := db.Get([]byte{})
267 | assert.Nil(t, err)
268 | assert.Equal(t, []byte{1}, v)
269 | assert.Nil(t, db.Close())
270 | }
271 |
272 | func TestEmptyValue(t *testing.T) {
273 | db, err := createTestDB(nil)
274 | assert.Nil(t, err)
275 | // Returns a nil value if the key is not found.
276 | if v, err := db.Get([]byte{1}); err != nil || v != nil {
277 | t.Fatal(err)
278 | }
279 | err = db.Put([]byte{1}, []byte{})
280 | assert.Nil(t, err)
281 | // Returns an empty, non-nil slice if the stored value is empty.
282 | if v, err := db.Get([]byte{1}); err != nil || v == nil || len(v) != 0 {
283 | t.Fatal(err)
284 | }
285 | assert.Nil(t, db.Close())
286 | }
287 |
288 | func TestEmptyKeyValue(t *testing.T) {
289 | db, err := createTestDB(nil)
290 | assert.Nil(t, err)
291 | assert.Nil(t, db.Put([]byte{}, []byte{}))
292 | v, err := db.Get([]byte{})
293 | assert.Nil(t, err)
294 | assert.Equal(t, []byte{}, v)
295 | assert.Nil(t, db.Close())
296 | }
297 |
298 | func TestDataRecycle(t *testing.T) {
299 | db, err := createTestDB(nil)
300 | assert.Nil(t, err)
301 | assert.Nil(t, db.Put([]byte{1}, []byte{8}))
302 | v, err := db.Get([]byte{1})
303 | assert.Nil(t, err)
304 | assert.Equal(t, []byte{8}, v)
305 | err = db.Delete([]byte{1})
306 | assert.Nil(t, err)
307 | err = db.Put([]byte{1}, []byte{9})
308 | assert.Nil(t, err)
309 | assert.Equal(t, []byte{8}, v)
310 | assert.Nil(t, db.Close())
311 | }
312 |
313 | func TestClose(t *testing.T) {
314 | db, err := createTestDB(nil)
315 | assert.Nil(t, err)
316 | assert.Nil(t, db.Close())
317 | _, err = db.Get([]byte{1})
318 | assert.NotNil(t, err)
319 | assert.NotNil(t, db.Close())
320 | }
321 |
322 | func TestCorruptedIndex(t *testing.T) {
323 | opts := &Options{FileSystem: testFS}
324 | db, err := createTestDB(opts)
325 | assert.Nil(t, err)
326 | assert.Nil(t, db.Close())
327 |
328 | f, err := testFS.OpenFile(filepath.Join(testDBName, indexMetaName), os.O_RDWR, 0)
329 | assert.Nil(t, err)
330 | _, err = f.Write([]byte("corrupted"))
331 | assert.Nil(t, err)
332 | assert.Nil(t, f.Close())
333 |
334 | db, err = Open(testDBName, opts)
335 | assert.Nil(t, db)
336 | assert.NotNil(t, err)
337 | }
338 |
339 | func TestFileError(t *testing.T) {
340 | db, err := createTestDB(nil)
341 | assert.Nil(t, err)
342 | assert.Nil(t, db.Put(nil, nil))
343 |
344 | errf := &errfile{}
345 |
346 | testDB := func(t *testing.T) {
347 | v, err := db.Get(nil)
348 | assert.Nil(t, v)
349 | assert.Equal(t, errfileError, err)
350 |
351 | assert.Equal(t, errfileError, db.Put(nil, nil))
352 | assert.Equal(t, errfileError, db.Delete(nil))
353 |
354 | has, err := db.Has(nil)
355 | assert.Equal(t, false, has)
356 | assert.Equal(t, errfileError, err)
357 |
358 | it := db.Items()
359 | k, v, err := it.Next()
360 | assert.Nil(t, k)
361 | assert.Nil(t, v)
362 | assert.Equal(t, errfileError, err)
363 | }
364 |
365 | t.Run("segment error", func(t *testing.T) {
366 | oldf := db.datalog.segments[0].File
367 | db.datalog.segments[0].File = errf
368 |
369 | testDB(t)
370 |
371 | assert.Equal(t, errfileError, db.Close())
372 |
373 | db.datalog.segments[0].File = oldf
374 | })
375 |
376 | t.Run("index error", func(t *testing.T) {
377 | oldf := db.index.main.File
378 | db.index.main.File = errf
379 |
380 | testDB(t)
381 | assert.Equal(t, errfileError, db.index.close())
382 |
383 | db.index.main.File = oldf
384 | })
385 |
386 | errfs := &errfs{}
387 | oldfs := db.opts.FileSystem
388 | db.opts.FileSystem = errfs
389 | assert.Equal(t, errfileError, db.Close())
390 | assert.Equal(t, errfileError, db.index.close())
391 | db.opts.FileSystem = oldfs
392 |
393 | assert.Nil(t, db.Close())
394 | }
395 |
396 | func TestFSError(t *testing.T) {
397 | db, err := createTestDB(&Options{FileSystem: &errfs{}})
398 | assert.Nil(t, db)
399 | assert.NotNil(t, err)
400 | }
401 |
402 | func TestWordsDict(t *testing.T) {
403 | if testFS != fs.Mem {
404 | t.Skip()
405 | }
406 | fwords, err := os.Open("/usr/share/dict/words")
407 | if err != nil {
408 | t.Skip("words file not found")
409 | }
410 | defer fwords.Close()
411 | db, err := createTestDB(nil)
412 | assert.Nil(t, err)
413 | scanner := bufio.NewScanner(fwords)
414 | items := make(map[string]string)
415 | for scanner.Scan() {
416 | k := scanner.Text()
417 | v := strings.ToUpper(k)
418 | items[k] = v
419 | assert.Nil(t, db.Put([]byte(k), []byte(v)))
420 | }
421 | assert.Nil(t, scanner.Err())
422 | for k, v := range items {
423 | v2, err := db.Get([]byte(k))
424 | if string(v2) != v {
425 | t.Fatalf("expected %v; got value=%v, err=%v for key %v", v, string(v2), err, k)
426 | }
427 | }
428 | assert.Nil(t, db.Close())
429 | }
430 |
431 | func BenchmarkPut(b *testing.B) {
432 | db, err := createTestDB(nil)
433 | assert.Nil(b, err)
434 | b.ResetTimer()
435 | k := []byte{1}
436 | for i := 0; i < b.N; i++ {
437 | if err := db.Put(k, k); err != nil {
438 | b.Fail()
439 | }
440 | }
441 | assert.Nil(b, db.Close())
442 | }
443 |
444 | func BenchmarkGet(b *testing.B) {
445 | db, err := createTestDB(nil)
446 | assert.Nil(b, err)
447 | k := []byte{1}
448 | if err := db.Put(k, make([]byte, 1024)); err != nil {
449 | b.Fail()
450 | }
451 | b.ResetTimer()
452 | b.ReportAllocs()
453 | for i := 0; i < b.N; i++ {
454 | if _, err := db.Get(k); err != nil {
455 | b.Fatal()
456 | }
457 | }
458 | assert.Nil(b, db.Close())
459 | }
460 |
461 | func BenchmarkGetAppend(b *testing.B) {
462 | db, err := createTestDB(nil)
463 | assert.Nil(b, err)
464 | k := []byte{1}
465 | if err := db.Put(k, make([]byte, 1024)); err != nil {
466 | b.Fail()
467 | }
468 | b.ResetTimer()
469 | b.ReportAllocs()
470 | buf := make([]byte, 0, 1024)
471 | for i := 0; i < b.N; i++ {
472 | value, err := db.GetAppend(k, buf[:0])
473 | if err != nil {
474 | b.Fatal()
475 | }
476 | buf = value
477 | }
478 | assert.Nil(b, db.Close())
479 | }
480 |
481 | func BenchmarkBucket_UnmarshalBinary(b *testing.B) {
482 | testBucket := bucket{
483 | slots: [slotsPerBucket]slot{},
484 | }
485 | for i := 0; i < slotsPerBucket; i++ {
486 | testBucket.slots[i].hash = uint32(i)
487 | testBucket.slots[i].keySize = uint16(i + 1)
488 | testBucket.slots[i].valueSize = uint32(i + 17)
489 | }
490 | data, _ := testBucket.MarshalBinary()
491 | b.ResetTimer()
492 | for i := 0; i < b.N; i++ {
493 | tmp := bucket{}
494 | err := tmp.UnmarshalBinary(data)
495 | if err != nil {
496 | b.Fatal()
497 | }
498 | }
499 | }
500 |
--------------------------------------------------------------------------------