├── .gitignore ├── go.mod ├── fs ├── os_mmap_windows_amd64.go ├── os_mmap_windows_386.go ├── os_mmap_test.go ├── mem_test.go ├── os_test.go ├── os_plan9.go ├── os_unix.go ├── os_mmap_unix.go ├── os_mmap_windows.go ├── os.go ├── os_windows.go ├── sub.go ├── fs.go ├── os_mmap.go ├── mem.go └── fs_test.go ├── doc.go ├── db_rpc_test.go ├── db_mmap_test.go ├── internal ├── hash │ ├── seed_test.go │ ├── seed.go │ ├── murmurhash32.go │ └── murmurhash32_test.go ├── errors │ ├── errors.go │ └── errors_test.go └── assert │ ├── assert.go │ └── assert_test.go ├── metrics.go ├── logger.go ├── lock.go ├── .github └── workflows │ ├── golangci-lint.yaml │ └── test.yaml ├── errors.go ├── gobfile.go ├── example_test.go ├── header.go ├── iterator_test.go ├── datalog_test.go ├── options.go ├── CHANGELOG.md ├── backup_test.go ├── iterator.go ├── backup.go ├── file.go ├── file_test.go ├── README.md ├── bucket.go ├── recovery.go ├── segment.go ├── compaction.go ├── datalog.go ├── recovery_test.go ├── docs └── design.md ├── index.go ├── compaction_test.go ├── db.go ├── LICENSE └── db_test.go /.gitignore: -------------------------------------------------------------------------------- 1 | /fs/test 2 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/akrylysov/pogreb 2 | 3 | go 1.18 4 | -------------------------------------------------------------------------------- /fs/os_mmap_windows_amd64.go: -------------------------------------------------------------------------------- 1 | package fs 2 | 3 | const maxMmapSize = 1 << 48 4 | -------------------------------------------------------------------------------- /doc.go: -------------------------------------------------------------------------------- 1 | /* 2 | Package pogreb implements an embedded key-value store for read-heavy workloads. 
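Data is stored in a write-ahead log and indexed with an on-disk hash table, trading range scans for fast point lookups; see docs/design.md for details.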
3 | */ 4 | package pogreb 5 | -------------------------------------------------------------------------------- /fs/os_mmap_windows_386.go: -------------------------------------------------------------------------------- 1 | package fs 2 | 3 | import ( 4 | "math" 5 | ) 6 | 7 | const maxMmapSize = math.MaxInt32 8 | -------------------------------------------------------------------------------- /fs/os_mmap_test.go: -------------------------------------------------------------------------------- 1 | package fs 2 | 3 | import ( 4 | "testing" 5 | ) 6 | 7 | func TestOSMMapFS(t *testing.T) { 8 | testFS(t, Sub(OSMMap, t.TempDir())) 9 | } 10 | -------------------------------------------------------------------------------- /db_rpc_test.go: -------------------------------------------------------------------------------- 1 | //go:build plan9 2 | // +build plan9 3 | 4 | package pogreb 5 | 6 | import ( 7 | "github.com/akrylysov/pogreb/fs" 8 | ) 9 | 10 | var testFileSystems = []fs.FileSystem{fs.Mem, fs.OS} 11 | -------------------------------------------------------------------------------- /db_mmap_test.go: -------------------------------------------------------------------------------- 1 | //go:build !plan9 2 | // +build !plan9 3 | 4 | package pogreb 5 | 6 | import ( 7 | "github.com/akrylysov/pogreb/fs" 8 | ) 9 | 10 | var testFileSystems = []fs.FileSystem{fs.Mem, fs.OSMMap, fs.OS} 11 | -------------------------------------------------------------------------------- /internal/hash/seed_test.go: -------------------------------------------------------------------------------- 1 | package hash 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/akrylysov/pogreb/internal/assert" 7 | ) 8 | 9 | func TestRandSeed(t *testing.T) { 10 | _, err := RandSeed() 11 | assert.Nil(t, err) 12 | } 13 | -------------------------------------------------------------------------------- /metrics.go: -------------------------------------------------------------------------------- 1 | package pogreb 2 | 3 | import "expvar" 4 | 5 | // Metrics holds the DB metrics. 6 | type Metrics struct { 7 | Puts expvar.Int 8 | Dels expvar.Int 9 | Gets expvar.Int 10 | HashCollisions expvar.Int 11 | } 12 | -------------------------------------------------------------------------------- /logger.go: -------------------------------------------------------------------------------- 1 | package pogreb 2 | 3 | import ( 4 | "log" 5 | "os" 6 | ) 7 | 8 | var logger = log.New(os.Stderr, "pogreb: ", 0) 9 | 10 | // SetLogger sets the global logger. 
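// Passing nil is a no-op and keeps the current logger.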
11 | func SetLogger(l *log.Logger) { 12 | if l != nil { 13 | logger = l 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /lock.go: -------------------------------------------------------------------------------- 1 | package pogreb 2 | 3 | import ( 4 | "os" 5 | 6 | "github.com/akrylysov/pogreb/fs" 7 | ) 8 | 9 | const ( 10 | lockName = "lock" 11 | ) 12 | 13 | func createLockFile(opts *Options) (fs.LockFile, bool, error) { 14 | return opts.FileSystem.CreateLockFile(lockName, os.FileMode(0644)) 15 | } 16 | -------------------------------------------------------------------------------- /fs/mem_test.go: -------------------------------------------------------------------------------- 1 | package fs 2 | 3 | import ( 4 | "testing" 5 | ) 6 | 7 | func TestMemFS(t *testing.T) { 8 | testFS(t, Mem) 9 | } 10 | 11 | func TestMemLockFile(t *testing.T) { 12 | testLockFile(t, Mem) 13 | } 14 | 15 | func TestMemLockAcquireExisting(t *testing.T) { 16 | testLockFileAcquireExisting(t, Mem) 17 | } 18 | -------------------------------------------------------------------------------- /internal/hash/seed.go: -------------------------------------------------------------------------------- 1 | package hash 2 | 3 | import ( 4 | "crypto/rand" 5 | "encoding/binary" 6 | ) 7 | 8 | // RandSeed generates a random hash seed. 9 | func RandSeed() (uint32, error) { 10 | b := make([]byte, 4) 11 | if _, err := rand.Read(b); err != nil { 12 | return 0, err 13 | } 14 | return binary.LittleEndian.Uint32(b), nil 15 | } 16 | -------------------------------------------------------------------------------- /fs/os_test.go: -------------------------------------------------------------------------------- 1 | package fs 2 | 3 | import ( 4 | "testing" 5 | ) 6 | 7 | func TestOSFS(t *testing.T) { 8 | testFS(t, Sub(OS, t.TempDir())) 9 | } 10 | 11 | func TestOSLockFile(t *testing.T) { 12 | testLockFile(t, Sub(OS, t.TempDir())) 13 | } 14 | 15 | func TestOSLockAcquireExisting(t *testing.T) { 16 | testLockFileAcquireExisting(t, Sub(OS, t.TempDir())) 17 | } 18 | -------------------------------------------------------------------------------- /.github/workflows/golangci-lint.yaml: -------------------------------------------------------------------------------- 1 | name: golangci-lint 2 | on: 3 | push: 4 | tags: 5 | - v* 6 | branches: 7 | - master 8 | - main 9 | pull_request: 10 | jobs: 11 | golangci: 12 | name: lint 13 | runs-on: ubuntu-latest 14 | steps: 15 | - uses: actions/checkout@v4 16 | - name: golangci-lint 17 | uses: golangci/golangci-lint-action@v6 18 | -------------------------------------------------------------------------------- /errors.go: -------------------------------------------------------------------------------- 1 | package pogreb 2 | 3 | import ( 4 | "github.com/akrylysov/pogreb/internal/errors" 5 | ) 6 | 7 | var ( 8 | errKeyTooLarge = errors.New("key is too large") 9 | errValueTooLarge = errors.New("value is too large") 10 | errFull = errors.New("database is full") 11 | errCorrupted = errors.New("database is corrupted") 12 | errLocked = errors.New("database is locked") 13 | errBusy = errors.New("database is busy") 14 | ) 15 | -------------------------------------------------------------------------------- /fs/os_plan9.go: -------------------------------------------------------------------------------- 1 | //go:build plan9 2 | // +build plan9 3 | 4 | package fs 5 | 6 | import ( 7 | "os" 8 | "syscall" 9 | ) 10 | 11 | func createLockFile(name string, perm os.FileMode) (LockFile, 
bool, error) { 12 | acquiredExisting := false 13 | if _, err := os.Stat(name); err == nil { 14 | acquiredExisting = true 15 | } 16 | f, err := os.OpenFile(name, os.O_RDWR|os.O_CREATE, syscall.DMEXCL|perm) 17 | if err != nil { 18 | return nil, false, err 19 | } 20 | return &osLockFile{f, name}, acquiredExisting, nil 21 | } 22 | 23 | // Return a default FileSystem for this platform. 24 | func DefaultFileSystem() FileSystem { 25 | return OS 26 | } 27 | -------------------------------------------------------------------------------- /gobfile.go: -------------------------------------------------------------------------------- 1 | package pogreb 2 | 3 | import ( 4 | "encoding/gob" 5 | 6 | "github.com/akrylysov/pogreb/fs" 7 | ) 8 | 9 | func readGobFile(fsys fs.FileSystem, name string, v interface{}) error { 10 | f, err := openFile(fsys, name, openFileFlags{readOnly: true}) 11 | if err != nil { 12 | return err 13 | } 14 | defer f.Close() 15 | dec := gob.NewDecoder(f) 16 | return dec.Decode(v) 17 | } 18 | 19 | func writeGobFile(fsys fs.FileSystem, name string, v interface{}) error { 20 | f, err := openFile(fsys, name, openFileFlags{truncate: true}) 21 | if err != nil { 22 | return err 23 | } 24 | defer f.Close() 25 | enc := gob.NewEncoder(f) 26 | return enc.Encode(v) 27 | } 28 | -------------------------------------------------------------------------------- /fs/os_unix.go: -------------------------------------------------------------------------------- 1 | //go:build !(plan9 || windows) 2 | // +build !plan9,!windows 3 | 4 | package fs 5 | 6 | import ( 7 | "os" 8 | "syscall" 9 | ) 10 | 11 | func createLockFile(name string, perm os.FileMode) (LockFile, bool, error) { 12 | acquiredExisting := false 13 | if _, err := os.Stat(name); err == nil { 14 | acquiredExisting = true 15 | } 16 | f, err := os.OpenFile(name, os.O_RDWR|os.O_CREATE, perm) 17 | if err != nil { 18 | return nil, false, err 19 | } 20 | if err := syscall.Flock(int(f.Fd()), syscall.LOCK_EX|syscall.LOCK_NB); err != nil { 21 | if err == syscall.EWOULDBLOCK { 22 | err = os.ErrExist 23 | } 24 | return nil, false, err 25 | } 26 | return &osLockFile{f, name}, acquiredExisting, nil 27 | } 28 | -------------------------------------------------------------------------------- /.github/workflows/test.yaml: -------------------------------------------------------------------------------- 1 | name: Test 2 | on: [push, pull_request] 3 | jobs: 4 | test: 5 | strategy: 6 | matrix: 7 | go-version: [1.18.x, 1.x] 8 | os: [ubuntu-latest, macos-latest, windows-latest] 9 | runs-on: ${{ matrix.os }} 10 | steps: 11 | - name: Install Go 12 | uses: actions/setup-go@v5 13 | with: 14 | go-version: ${{ matrix.go-version }} 15 | - name: Checkout code 16 | uses: actions/checkout@v4 17 | - name: Build GOARCH=386 18 | if: ${{ matrix.os != 'macos-latest' }} 19 | env: 20 | GOARCH: "386" 21 | run: go build 22 | - name: Test 23 | run: go test ./... 
-race -coverprofile=coverage.txt -covermode=atomic 24 | - name: Upload coverage to Codecov 25 | if: ${{ matrix.os == 'ubuntu-latest' }} 26 | uses: codecov/codecov-action@v5 27 | -------------------------------------------------------------------------------- /example_test.go: -------------------------------------------------------------------------------- 1 | package pogreb_test 2 | 3 | import ( 4 | "log" 5 | 6 | "github.com/akrylysov/pogreb" 7 | ) 8 | 9 | func Example() { 10 | db, err := pogreb.Open("pogreb.test", nil) 11 | if err != nil { 12 | log.Fatal(err) 13 | return 14 | } 15 | defer db.Close() 16 | 17 | // Insert a new key-value pair. 18 | if err := db.Put([]byte("testKey"), []byte("testValue")); err != nil { 19 | log.Fatal(err) 20 | } 21 | 22 | // Retrieve the inserted value. 23 | val, err := db.Get([]byte("testKey")) 24 | if err != nil { 25 | log.Fatal(err) 26 | } 27 | log.Printf("%s", val) 28 | 29 | // Iterate over items. 30 | it := db.Items() 31 | for { 32 | key, val, err := it.Next() 33 | if err == pogreb.ErrIterationDone { 34 | break 35 | } 36 | if err != nil { 37 | log.Fatal(err) 38 | } 39 | log.Printf("%s %s", key, val) 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /fs/os_mmap_unix.go: -------------------------------------------------------------------------------- 1 | //go:build !(plan9 || windows) 2 | // +build !plan9,!windows 3 | 4 | package fs 5 | 6 | import ( 7 | "os" 8 | "syscall" 9 | "unsafe" 10 | ) 11 | 12 | func mmap(f *os.File, fileSize int64, mappingSize int64) ([]byte, error) { 13 | p, err := syscall.Mmap(int(f.Fd()), 0, int(mappingSize), syscall.PROT_READ, syscall.MAP_SHARED) 14 | return p, err 15 | } 16 | 17 | func munmap(data []byte) error { 18 | return syscall.Munmap(data) 19 | } 20 | 21 | func madviceRandom(data []byte) error { 22 | _, _, errno := syscall.Syscall(syscall.SYS_MADVISE, uintptr(unsafe.Pointer(&data[0])), uintptr(len(data)), uintptr(syscall.MADV_RANDOM)) 23 | if errno != 0 { 24 | return errno 25 | } 26 | return nil 27 | } 28 | 29 | func (f *osMMapFile) Truncate(size int64) error { 30 | if err := f.File.Truncate(size); err != nil { 31 | return err 32 | } 33 | f.size = size 34 | return f.mremap() 35 | } 36 | -------------------------------------------------------------------------------- /header.go: -------------------------------------------------------------------------------- 1 | package pogreb 2 | 3 | import ( 4 | "bytes" 5 | "encoding/binary" 6 | ) 7 | 8 | const ( 9 | formatVersion = 2 // File format version. 
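// Number of bytes reserved for the header at the start of each database file.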
10 | headerSize = 512 11 | ) 12 | 13 | var ( 14 | signature = [8]byte{'p', 'o', 'g', 'r', 'e', 'b', '\x0e', '\xfd'} 15 | ) 16 | 17 | type header struct { 18 | signature [8]byte 19 | formatVersion uint32 20 | } 21 | 22 | func newHeader() *header { 23 | return &header{ 24 | signature: signature, 25 | formatVersion: formatVersion, 26 | } 27 | } 28 | 29 | func (h header) MarshalBinary() ([]byte, error) { 30 | buf := make([]byte, headerSize) 31 | copy(buf[:8], h.signature[:]) 32 | binary.LittleEndian.PutUint32(buf[8:12], h.formatVersion) 33 | return buf, nil 34 | } 35 | 36 | func (h *header) UnmarshalBinary(data []byte) error { 37 | if !bytes.Equal(data[:8], signature[:]) { 38 | return errCorrupted 39 | } 40 | copy(h.signature[:], data[:8]) 41 | h.formatVersion = binary.LittleEndian.Uint32(data[8:12]) 42 | return nil 43 | } 44 | -------------------------------------------------------------------------------- /internal/errors/errors.go: -------------------------------------------------------------------------------- 1 | package errors 2 | 3 | import ( 4 | "errors" 5 | "fmt" 6 | ) 7 | 8 | type wrappedError struct { 9 | cause error 10 | msg string 11 | } 12 | 13 | func (we wrappedError) Error() string { 14 | return we.msg + ": " + we.cause.Error() 15 | } 16 | 17 | func (we wrappedError) Unwrap() error { 18 | return we.cause 19 | } 20 | 21 | // New returns an error that formats as the given text. 22 | func New(text string) error { 23 | return errors.New(text) 24 | } 25 | 26 | // Wrap returns an error annotating err with an additional message. 27 | // Compatible with Go 1.13 error chains. 28 | func Wrap(cause error, message string) error { 29 | return wrappedError{ 30 | cause: cause, 31 | msg: message, 32 | } 33 | } 34 | 35 | // Wrapf returns an error annotating err with an additional formatted message. 36 | // Compatible with Go 1.13 error chains. 37 | func Wrapf(cause error, format string, a ...interface{}) error { 38 | return wrappedError{ 39 | cause: cause, 40 | msg: fmt.Sprintf(format, a...), 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /internal/hash/murmurhash32.go: -------------------------------------------------------------------------------- 1 | package hash 2 | 3 | import ( 4 | "math/bits" 5 | ) 6 | 7 | const ( 8 | c1 uint32 = 0xcc9e2d51 9 | c2 uint32 = 0x1b873593 10 | ) 11 | 12 | // Sum32WithSeed is a port of MurmurHash3_x86_32 function. 
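// The input is consumed in 4-byte little-endian blocks; the trailing 1-3 bytes are mixed in by the tail switch before finalization.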
13 | func Sum32WithSeed(data []byte, seed uint32) uint32 { 14 | h1 := seed 15 | dlen := len(data) 16 | 17 | for len(data) >= 4 { 18 | k1 := uint32(data[0]) | uint32(data[1])<<8 | uint32(data[2])<<16 | uint32(data[3])<<24 19 | data = data[4:] 20 | 21 | k1 *= c1 22 | k1 = bits.RotateLeft32(k1, 15) 23 | k1 *= c2 24 | 25 | h1 ^= k1 26 | h1 = bits.RotateLeft32(h1, 13) 27 | h1 = h1*5 + 0xe6546b64 28 | } 29 | 30 | var k1 uint32 31 | switch len(data) { 32 | case 3: 33 | k1 ^= uint32(data[2]) << 16 34 | fallthrough 35 | case 2: 36 | k1 ^= uint32(data[1]) << 8 37 | fallthrough 38 | case 1: 39 | k1 ^= uint32(data[0]) 40 | k1 *= c1 41 | k1 = bits.RotateLeft32(k1, 15) 42 | k1 *= c2 43 | h1 ^= k1 44 | } 45 | 46 | h1 ^= uint32(dlen) 47 | 48 | h1 ^= h1 >> 16 49 | h1 *= 0x85ebca6b 50 | h1 ^= h1 >> 13 51 | h1 *= 0xc2b2ae35 52 | h1 ^= h1 >> 16 53 | 54 | return h1 55 | } 56 | -------------------------------------------------------------------------------- /internal/hash/murmurhash32_test.go: -------------------------------------------------------------------------------- 1 | package hash 2 | 3 | import ( 4 | "fmt" 5 | "testing" 6 | 7 | "github.com/akrylysov/pogreb/internal/assert" 8 | ) 9 | 10 | func TestSum32WithSeed(t *testing.T) { 11 | testCases := []struct { 12 | in []byte 13 | seed uint32 14 | out uint32 15 | }{ 16 | { 17 | in: nil, 18 | out: 0, 19 | }, 20 | { 21 | in: nil, 22 | seed: 1, 23 | out: 1364076727, 24 | }, 25 | { 26 | in: []byte{1}, 27 | out: 3831157163, 28 | }, 29 | { 30 | in: []byte{1, 2}, 31 | out: 1690789502, 32 | }, 33 | { 34 | in: []byte{1, 2, 3}, 35 | out: 2161234436, 36 | }, 37 | { 38 | in: []byte{1, 2, 3, 4}, 39 | out: 1043635621, 40 | }, 41 | { 42 | in: []byte{1, 2, 3, 4, 5}, 43 | out: 2727459272, 44 | }, 45 | } 46 | for i, tc := range testCases { 47 | t.Run(fmt.Sprintf("%d", i), func(t *testing.T) { 48 | assert.Equal(t, tc.out, Sum32WithSeed(tc.in, tc.seed)) 49 | }) 50 | } 51 | } 52 | 53 | func BenchmarkSum32WithSeed(b *testing.B) { 54 | data := []byte("pogreb_Sum32WithSeed_bench") 55 | b.SetBytes(int64(len(data))) 56 | for n := 0; n < b.N; n++ { 57 | Sum32WithSeed(data, 0) 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /internal/errors/errors_test.go: -------------------------------------------------------------------------------- 1 | package errors 2 | 3 | import ( 4 | "errors" 5 | "testing" 6 | 7 | "github.com/akrylysov/pogreb/internal/assert" 8 | ) 9 | 10 | func TestWrap(t *testing.T) { 11 | err1 := New("err1") 12 | w11 := Wrap(err1, "wrapped 11") 13 | w12 := Wrapf(w11, "wrapped %d%s", 1, "2") 14 | 15 | assert.Equal(t, err1, w11.(wrappedError).Unwrap()) 16 | assert.Equal(t, w11, w12.(wrappedError).Unwrap()) 17 | 18 | assert.Equal(t, "wrapped 11: err1", w11.Error()) 19 | assert.Equal(t, "wrapped 12: wrapped 11: err1", w12.Error()) 20 | } 21 | 22 | func TestIs(t *testing.T) { 23 | err1 := New("err1") 24 | w11 := Wrap(err1, "wrapped 11") 25 | w12 := Wrap(w11, "wrapped 12") 26 | 27 | err2 := New("err2") 28 | w21 := Wrap(err2, "wrapped 21") 29 | 30 | assert.Equal(t, true, errors.Is(err1, err1)) 31 | assert.Equal(t, true, errors.Is(w11, err1)) 32 | assert.Equal(t, true, errors.Is(w12, err1)) 33 | assert.Equal(t, true, errors.Is(w12, w11)) 34 | 35 | assert.Equal(t, false, errors.Is(err1, err2)) 36 | assert.Equal(t, false, errors.Is(w11, err2)) 37 | assert.Equal(t, false, errors.Is(w12, err2)) 38 | assert.Equal(t, false, errors.Is(w21, err1)) 39 | assert.Equal(t, false, errors.Is(w21, w11)) 40 | } 41 | 
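// Usage sketch added for illustration (not part of the original suite):
// Wrapf prefixes formatted context while errors.Is still matches the cause.
func TestWrapfUsageSketch(t *testing.T) {
	cause := New("segment missing")
	err := Wrapf(cause, "opening segment %d", 7)
	assert.Equal(t, "opening segment 7: segment missing", err.Error())
	assert.Equal(t, true, errors.Is(err, cause))
}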
-------------------------------------------------------------------------------- /fs/os_mmap_windows.go: -------------------------------------------------------------------------------- 1 | //go:build windows 2 | // +build windows 3 | 4 | package fs 5 | 6 | import ( 7 | "os" 8 | "syscall" 9 | "unsafe" 10 | ) 11 | 12 | func mmap(f *os.File, fileSize int64, mappingSize int64) ([]byte, error) { 13 | size := fileSize 14 | low, high := uint32(size), uint32(size>>32) 15 | fmap, err := syscall.CreateFileMapping(syscall.Handle(f.Fd()), nil, syscall.PAGE_READONLY, high, low, nil) 16 | if err != nil { 17 | return nil, err 18 | } 19 | defer syscall.CloseHandle(fmap) 20 | ptr, err := syscall.MapViewOfFile(fmap, syscall.FILE_MAP_READ, 0, 0, uintptr(size)) 21 | if err != nil { 22 | return nil, err 23 | } 24 | data := (*[maxMmapSize]byte)(unsafe.Pointer(ptr))[:size] 25 | return data, nil 26 | } 27 | 28 | func munmap(data []byte) error { 29 | return syscall.UnmapViewOfFile(uintptr(unsafe.Pointer(&data[0]))) 30 | } 31 | 32 | func madviceRandom(data []byte) error { 33 | return nil 34 | } 35 | 36 | func (f *osMMapFile) Truncate(size int64) error { 37 | // Truncating a memory-mapped file fails on Windows. Unmap it first. 38 | if err := f.munmap(); err != nil { 39 | return err 40 | } 41 | if err := f.File.Truncate(size); err != nil { 42 | return err 43 | } 44 | f.size = size 45 | return f.mremap() 46 | } 47 | -------------------------------------------------------------------------------- /iterator_test.go: -------------------------------------------------------------------------------- 1 | package pogreb 2 | 3 | import ( 4 | "bytes" 5 | "testing" 6 | 7 | "github.com/akrylysov/pogreb/internal/assert" 8 | ) 9 | 10 | func TestIteratorEmpty(t *testing.T) { 11 | db, err := createTestDB(nil) 12 | assert.Nil(t, err) 13 | it := db.Items() 14 | for i := 0; i < 8; i++ { 15 | _, _, err := it.Next() 16 | if err != ErrIterationDone { 17 | t.Fatalf("expected %v; got %v", ErrIterationDone, err) 18 | } 19 | } 20 | assert.Nil(t, db.Close()) 21 | } 22 | 23 | func TestIterator(t *testing.T) { 24 | db, err := createTestDB(nil) 25 | assert.Nil(t, err) 26 | 27 | items := map[byte]bool{} 28 | var i byte 29 | for i = 0; i < 255; i++ { 30 | items[i] = false 31 | err := db.Put([]byte{i}, []byte{i}) 32 | assert.Nil(t, err) 33 | } 34 | 35 | it := db.Items() 36 | for { 37 | key, value, err := it.Next() 38 | if err == ErrIterationDone { 39 | break 40 | } 41 | assert.Nil(t, err) 42 | if k, ok := items[key[0]]; !ok { 43 | t.Fatalf("unknown key %v", k) 44 | } 45 | if !bytes.Equal(key, value) { 46 | t.Fatalf("expected %v; got %v", key, value) 47 | } 48 | items[key[0]] = true 49 | } 50 | 51 | for k, v := range items { 52 | if !v { 53 | t.Fatalf("expected to iterate over key %v", k) 54 | } 55 | } 56 | 57 | for i := 0; i < 8; i++ { 58 | _, _, err := it.Next() 59 | if err != ErrIterationDone { 60 | t.Fatalf("expected %v; got %v", ErrIterationDone, err) 61 | } 62 | } 63 | 64 | assert.Nil(t, db.Close()) 65 | } 66 | -------------------------------------------------------------------------------- /datalog_test.go: -------------------------------------------------------------------------------- 1 | package pogreb 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/akrylysov/pogreb/internal/assert" 7 | ) 8 | 9 | func (dl *datalog) segmentMetas() []segmentMeta { 10 | var metas []segmentMeta 11 | for _, seg := range dl.segmentsBySequenceID() { 12 | metas = append(metas, *seg.meta) 13 | } 14 | return metas 15 | } 16 | 17 | func TestDatalog(t *testing.T) { 
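// Drives the datalog directly: records land in segment 0 until it is marked full, after which puts go to a newly allocated segment.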
18 | db, err := createTestDB(nil) 19 | assert.Nil(t, err) 20 | 21 | _, _, err = db.datalog.put([]byte{'1'}, []byte{'1'}) 22 | assert.Nil(t, err) 23 | assert.Equal(t, &segmentMeta{PutRecords: 1}, db.datalog.segments[0].meta) 24 | assert.Nil(t, db.datalog.segments[1]) 25 | 26 | sm := db.datalog.segmentsBySequenceID() 27 | assert.Equal(t, []*segment{db.datalog.segments[0]}, sm) 28 | 29 | // Writing to a full file swaps it. 30 | db.datalog.segments[0].meta.Full = true 31 | _, _, err = db.datalog.put([]byte{'1'}, []byte{'1'}) 32 | assert.Nil(t, err) 33 | assert.Equal(t, &segmentMeta{PutRecords: 1, Full: true}, db.datalog.segments[0].meta) 34 | assert.Equal(t, &segmentMeta{PutRecords: 1}, db.datalog.segments[1].meta) 35 | 36 | sm = db.datalog.segmentsBySequenceID() 37 | assert.Equal(t, []*segment{db.datalog.segments[0], db.datalog.segments[1]}, sm) 38 | 39 | _, _, err = db.datalog.put([]byte{'1'}, []byte{'1'}) 40 | assert.Nil(t, err) 41 | assert.Equal(t, &segmentMeta{PutRecords: 1, Full: true}, db.datalog.segments[0].meta) 42 | assert.Equal(t, &segmentMeta{PutRecords: 2}, db.datalog.segments[1].meta) 43 | 44 | assert.Nil(t, db.Close()) 45 | } 46 | -------------------------------------------------------------------------------- /fs/os.go: -------------------------------------------------------------------------------- 1 | package fs 2 | 3 | import ( 4 | "os" 5 | ) 6 | 7 | type osFS struct{} 8 | 9 | // OS is a file system backed by the os package. 10 | var OS FileSystem = &osFS{} 11 | 12 | func (fs *osFS) OpenFile(name string, flag int, perm os.FileMode) (File, error) { 13 | f, err := os.OpenFile(name, flag, perm) 14 | if err != nil { 15 | return nil, err 16 | } 17 | return &osFile{File: f}, nil 18 | } 19 | 20 | func (fs *osFS) CreateLockFile(name string, perm os.FileMode) (LockFile, bool, error) { 21 | return createLockFile(name, perm) 22 | } 23 | 24 | func (fs *osFS) Stat(name string) (os.FileInfo, error) { 25 | return os.Stat(name) 26 | } 27 | 28 | func (fs *osFS) Remove(name string) error { 29 | return os.Remove(name) 30 | } 31 | 32 | func (fs *osFS) Rename(oldpath, newpath string) error { 33 | return os.Rename(oldpath, newpath) 34 | } 35 | 36 | func (fs *osFS) ReadDir(name string) ([]os.DirEntry, error) { 37 | return os.ReadDir(name) 38 | } 39 | 40 | func (fs *osFS) MkdirAll(path string, perm os.FileMode) error { 41 | return os.MkdirAll(path, perm) 42 | } 43 | 44 | type osFile struct { 45 | *os.File 46 | } 47 | 48 | func (f *osFile) Slice(start int64, end int64) ([]byte, error) { 49 | buf := make([]byte, end-start) 50 | _, err := f.ReadAt(buf, start) 51 | if err != nil { 52 | return nil, err 53 | } 54 | return buf, nil 55 | } 56 | 57 | type osLockFile struct { 58 | *os.File 59 | path string 60 | } 61 | 62 | func (f *osLockFile) Unlock() error { 63 | if err := os.Remove(f.path); err != nil { 64 | return err 65 | } 66 | return f.Close() 67 | } 68 | -------------------------------------------------------------------------------- /fs/os_windows.go: -------------------------------------------------------------------------------- 1 | //go:build windows 2 | // +build windows 3 | 4 | package fs 5 | 6 | import ( 7 | "os" 8 | "syscall" 9 | "unsafe" 10 | ) 11 | 12 | var ( 13 | modkernel32 = syscall.NewLazyDLL("kernel32.dll") 14 | procLockFileEx = modkernel32.NewProc("LockFileEx") 15 | ) 16 | 17 | const ( 18 | errorLockViolation = 0x21 19 | ) 20 | 21 | func lockfile(f *os.File) error { 22 | var ol syscall.Overlapped 23 | 24 | r1, _, err := syscall.Syscall6( 25 | procLockFileEx.Addr(), 26 | 6, 27 | 
uintptr(f.Fd()), // handle 28 | uintptr(0x0003), 29 | uintptr(0), // reserved 30 | uintptr(1), // locklow 31 | uintptr(0), // lockhigh 32 | uintptr(unsafe.Pointer(&ol)), 33 | ) 34 | if r1 == 0 && (err == syscall.ERROR_FILE_EXISTS || err == errorLockViolation) { 35 | return os.ErrExist 36 | } 37 | return nil 38 | } 39 | 40 | func createLockFile(name string, perm os.FileMode) (LockFile, bool, error) { 41 | acquiredExisting := false 42 | if _, err := os.Stat(name); err == nil { 43 | acquiredExisting = true 44 | } 45 | fd, err := syscall.CreateFile(&(syscall.StringToUTF16(name)[0]), 46 | syscall.GENERIC_READ|syscall.GENERIC_WRITE, 47 | syscall.FILE_SHARE_READ|syscall.FILE_SHARE_WRITE|syscall.FILE_SHARE_DELETE, 48 | nil, 49 | syscall.CREATE_ALWAYS, 50 | syscall.FILE_ATTRIBUTE_NORMAL, 51 | 0) 52 | if err != nil { 53 | return nil, false, os.ErrExist 54 | } 55 | f := os.NewFile(uintptr(fd), name) 56 | if err := lockfile(f); err != nil { 57 | f.Close() 58 | return nil, false, err 59 | } 60 | return &osLockFile{f, name}, acquiredExisting, nil 61 | } 62 | -------------------------------------------------------------------------------- /fs/sub.go: -------------------------------------------------------------------------------- 1 | package fs 2 | 3 | import ( 4 | "os" 5 | "path/filepath" 6 | ) 7 | 8 | // Sub returns a new file system rooted at dir. 9 | func Sub(fsys FileSystem, dir string) FileSystem { 10 | return &subFS{ 11 | fsys: fsys, 12 | root: dir, 13 | } 14 | } 15 | 16 | type subFS struct { 17 | fsys FileSystem 18 | root string 19 | } 20 | 21 | func (fs *subFS) OpenFile(name string, flag int, perm os.FileMode) (File, error) { 22 | subName := filepath.Join(fs.root, name) 23 | return fs.fsys.OpenFile(subName, flag, perm) 24 | } 25 | 26 | func (fs *subFS) Stat(name string) (os.FileInfo, error) { 27 | subName := filepath.Join(fs.root, name) 28 | return fs.fsys.Stat(subName) 29 | } 30 | 31 | func (fs *subFS) Remove(name string) error { 32 | subName := filepath.Join(fs.root, name) 33 | return fs.fsys.Remove(subName) 34 | } 35 | 36 | func (fs *subFS) Rename(oldpath, newpath string) error { 37 | subOldpath := filepath.Join(fs.root, oldpath) 38 | subNewpath := filepath.Join(fs.root, newpath) 39 | return fs.fsys.Rename(subOldpath, subNewpath) 40 | } 41 | 42 | func (fs *subFS) ReadDir(name string) ([]os.DirEntry, error) { 43 | subName := filepath.Join(fs.root, name) 44 | return fs.fsys.ReadDir(subName) 45 | } 46 | 47 | func (fs *subFS) CreateLockFile(name string, perm os.FileMode) (LockFile, bool, error) { 48 | subName := filepath.Join(fs.root, name) 49 | return fs.fsys.CreateLockFile(subName, perm) 50 | } 51 | 52 | func (fs *subFS) MkdirAll(path string, perm os.FileMode) error { 53 | subPath := filepath.Join(fs.root, path) 54 | return fs.fsys.MkdirAll(subPath, perm) 55 | } 56 | 57 | var _ FileSystem = &subFS{} 58 | -------------------------------------------------------------------------------- /options.go: -------------------------------------------------------------------------------- 1 | package pogreb 2 | 3 | import ( 4 | "math" 5 | "time" 6 | 7 | "github.com/akrylysov/pogreb/fs" 8 | ) 9 | 10 | // Options holds the optional DB parameters. 11 | type Options struct { 12 | // BackgroundSyncInterval sets the amount of time between background Sync() calls. 13 | // 14 | // Setting the value to 0 disables the automatic background synchronization. 15 | // Setting the value to -1 makes the DB call Sync() after every write operation. 
16 | // Default: 0 17 | BackgroundSyncInterval time.Duration 18 | 19 | // BackgroundCompactionInterval sets the amount of time between background Compact() calls. 20 | // 21 | // Setting the value to 0 disables the automatic background compaction. 22 | // Default: 0 23 | BackgroundCompactionInterval time.Duration 24 | 25 | // FileSystem sets the file system implementation. 26 | // 27 | // Default: fs.OSMMap. 28 | FileSystem fs.FileSystem 29 | rootFS fs.FileSystem 30 | 31 | maxSegmentSize uint32 32 | compactionMinSegmentSize uint32 33 | compactionMinFragmentation float32 34 | } 35 | 36 | func (src *Options) copyWithDefaults(path string) *Options { 37 | opts := Options{} 38 | if src != nil { 39 | opts = *src 40 | } 41 | if opts.FileSystem == nil { 42 | opts.FileSystem = fs.DefaultFileSystem() 43 | } 44 | opts.rootFS = opts.FileSystem 45 | opts.FileSystem = fs.Sub(opts.FileSystem, path) 46 | if opts.maxSegmentSize == 0 { 47 | opts.maxSegmentSize = math.MaxUint32 48 | } 49 | if opts.compactionMinSegmentSize == 0 { 50 | opts.compactionMinSegmentSize = 32 << 20 51 | } 52 | if opts.compactionMinFragmentation == 0 { 53 | opts.compactionMinFragmentation = 0.5 54 | } 55 | return &opts 56 | } 57 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | ## [0.10.2] - 2023-12-10 4 | ### Fixed 5 | - Fix an edge case causing recovery to fail. 6 | 7 | ## [0.10.1] - 2021-05-01 8 | ### Changed 9 | - Improve error reporting. 10 | ### Fixed 11 | - Fix compilation for 32-bit OS. 12 | 13 | ## [0.10.0] - 2021-02-09 14 | ### Added 15 | - Memory-mapped file access can now be disabled by setting `Options.FileSystem` to `fs.OS`. 16 | ### Changed 17 | - The default file system implementation is changed to `fs.OSMMap`. 18 | 19 | ## [0.9.2] - 2021-01-01 20 | ### Changed 21 | - Write-ahead log doesn't rely on wall-clock time anymore. It prevents potential race conditions during compaction and recovery. 22 | ### Fixed 23 | - Fix recovery writing extra delete records. 24 | 25 | ## [0.9.1] - 2020-04-03 26 | ### Changed 27 | - Improve Go 1.14 compatibility (remove "unsafe" usage). 28 | 29 | ## [0.9.0] - 2020-03-08 30 | ### Changed 31 | - Replace the unstructured data file for storing key-value pairs with a write-ahead log. 32 | ### Added 33 | - In the event of a crash or a power loss the database is automatically recovered. 34 | - Optional background compaction allows reclaiming disk space occupied by overwritten or deleted keys. 35 | ### Fixed 36 | - Fix disk space overhead when storing small keys and values. 37 | 38 | ## [0.8.3] - 2019-11-03 39 | ### Fixed 40 | - Fix slice bounds out of range error mapping files on Windows. 41 | 42 | ## [0.8.2] - 2019-09-04 43 | ### Fixed 44 | - Race condition could lead to data corruption. 45 | 46 | ## [0.8.1] - 2019-06-30 47 | ### Fixed 48 | - Fix panic when accessing closed database. 49 | - Return error opening invalid database. 50 | 51 | ## [0.8] - 2019-03-30 52 | ### Changed 53 | - ~2x write performance improvement on non-Windows. 54 | 55 | ## [0.7] - 2019-03-23 56 | ### Added 57 | - Windows support (@mattn). 58 | ### Changed 59 | - Improve freelist performance. 
60 | -------------------------------------------------------------------------------- /backup_test.go: -------------------------------------------------------------------------------- 1 | package pogreb 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/akrylysov/pogreb/internal/assert" 7 | ) 8 | 9 | const testDBBackupName = testDBName + ".backup" 10 | 11 | func TestBackup(t *testing.T) { 12 | opts := &Options{ 13 | maxSegmentSize: 1024, 14 | compactionMinSegmentSize: 520, 15 | compactionMinFragmentation: 0.02, 16 | } 17 | 18 | run := func(name string, f func(t *testing.T, db *DB)) bool { 19 | return t.Run(name, func(t *testing.T) { 20 | db, err := createTestDB(opts) 21 | assert.Nil(t, err) 22 | f(t, db) 23 | assert.Nil(t, db.Close()) 24 | _ = cleanDir(testDBBackupName) 25 | }) 26 | } 27 | 28 | run("empty", func(t *testing.T, db *DB) { 29 | assert.Nil(t, db.Backup(testDBBackupName)) 30 | db2, err := Open(testDBBackupName, opts) 31 | assert.Nil(t, err) 32 | assert.Nil(t, db2.Close()) 33 | }) 34 | 35 | run("single segment", func(t *testing.T, db *DB) { 36 | assert.Nil(t, db.Put([]byte{0}, []byte{0})) 37 | assert.Equal(t, 1, countSegments(t, db)) 38 | assert.Nil(t, db.Backup(testDBBackupName)) 39 | db2, err := Open(testDBBackupName, opts) 40 | assert.Nil(t, err) 41 | v, err := db2.Get([]byte{0}) 42 | assert.Equal(t, []byte{0}, v) 43 | assert.Nil(t, err) 44 | assert.Nil(t, db2.Close()) 45 | }) 46 | 47 | run("multiple segments", func(t *testing.T, db *DB) { 48 | for i := byte(0); i < 100; i++ { 49 | assert.Nil(t, db.Put([]byte{i}, []byte{i})) 50 | } 51 | assert.Equal(t, 3, countSegments(t, db)) 52 | assert.Nil(t, db.Backup(testDBBackupName)) 53 | db2, err := Open(testDBBackupName, opts) 54 | assert.Equal(t, 3, countSegments(t, db2)) 55 | assert.Nil(t, err) 56 | for i := byte(0); i < 100; i++ { 57 | v, err := db2.Get([]byte{i}) 58 | assert.Nil(t, err) 59 | assert.Equal(t, []byte{i}, v) 60 | } 61 | assert.Nil(t, db2.Close()) 62 | }) 63 | } 64 | -------------------------------------------------------------------------------- /fs/fs.go: -------------------------------------------------------------------------------- 1 | /* 2 | Package fs provides a file system interface. 3 | */ 4 | package fs 5 | 6 | import ( 7 | "errors" 8 | "io" 9 | "os" 10 | ) 11 | 12 | var ( 13 | errAppendModeNotSupported = errors.New("append mode is not supported") 14 | ) 15 | 16 | // File is the interface compatible with os.File. 17 | // Methods are not thread-safe, except for ReadAt, Slice and Stat. 18 | type File interface { 19 | io.Closer 20 | io.Reader 21 | io.ReaderAt 22 | io.Seeker 23 | io.Writer 24 | io.WriterAt 25 | 26 | // Stat returns os.FileInfo describing the file. 27 | Stat() (os.FileInfo, error) 28 | 29 | // Sync commits the current contents of the file. 30 | Sync() error 31 | 32 | // Truncate changes the size of the file. 33 | Truncate(size int64) error 34 | 35 | // Slice reads and returns the contents of the file from offset start to offset end. 36 | Slice(start int64, end int64) ([]byte, error) 37 | } 38 | 39 | // LockFile represents a lock file. 40 | type LockFile interface { 41 | // Unlock releases and removes the lock file. 42 | Unlock() error 43 | } 44 | 45 | // FileSystem represents a file system. 46 | type FileSystem interface { 47 | // OpenFile opens the file with the specified flag. 48 | OpenFile(name string, flag int, perm os.FileMode) (File, error) 49 | 50 | // Stat returns os.FileInfo describing the file. 51 | Stat(name string) (os.FileInfo, error) 52 | 53 | // Remove removes the file.
54 | Remove(name string) error 55 | 56 | // Rename renames oldpath to newpath. 57 | Rename(oldpath, newpath string) error 58 | 59 | // ReadDir reads the directory and returns a list of directory entries. 60 | ReadDir(name string) ([]os.DirEntry, error) 61 | 62 | // CreateLockFile creates a lock file. 63 | CreateLockFile(name string, perm os.FileMode) (LockFile, bool, error) 64 | 65 | // MkdirAll creates a directory named path. 66 | MkdirAll(path string, perm os.FileMode) error 67 | } 68 | -------------------------------------------------------------------------------- /iterator.go: -------------------------------------------------------------------------------- 1 | package pogreb 2 | 3 | import ( 4 | "errors" 5 | "sync" 6 | ) 7 | 8 | // ErrIterationDone is returned by ItemIterator.Next calls when there are no more items to return. 9 | var ErrIterationDone = errors.New("no more items in iterator") 10 | 11 | type item struct { 12 | key []byte 13 | value []byte 14 | } 15 | 16 | // ItemIterator is an iterator over DB key-value pairs. It iterates the items in an unspecified order. 17 | type ItemIterator struct { 18 | db *DB 19 | nextBucketIdx uint32 20 | queue []item 21 | mu sync.Mutex 22 | } 23 | 24 | // fetchItems adds items to the iterator queue from a bucket located at nextBucketIdx. 25 | func (it *ItemIterator) fetchItems(nextBucketIdx uint32) error { 26 | bit := it.db.index.newBucketIterator(nextBucketIdx) 27 | for { 28 | b, err := bit.next() 29 | if err == ErrIterationDone { 30 | return nil 31 | } 32 | if err != nil { 33 | return err 34 | } 35 | for i := 0; i < slotsPerBucket; i++ { 36 | sl := b.slots[i] 37 | if sl.offset == 0 { 38 | // No more items in the bucket. 39 | break 40 | } 41 | key, value, err := it.db.datalog.readKeyValue(sl) 42 | if err != nil { 43 | return err 44 | } 45 | key = cloneBytes(key) 46 | value = cloneBytes(value) 47 | it.queue = append(it.queue, item{key: key, value: value}) 48 | } 49 | } 50 | } 51 | 52 | // Next returns the next key-value pair if available, otherwise it returns ErrIterationDone error. 53 | func (it *ItemIterator) Next() ([]byte, []byte, error) { 54 | it.mu.Lock() 55 | defer it.mu.Unlock() 56 | 57 | it.db.mu.RLock() 58 | defer it.db.mu.RUnlock() 59 | 60 | // The iterator queue is empty and we have more buckets to check. 61 | for len(it.queue) == 0 && it.nextBucketIdx < it.db.index.numBuckets { 62 | if err := it.fetchItems(it.nextBucketIdx); err != nil { 63 | return nil, nil, err 64 | } 65 | it.nextBucketIdx++ 66 | } 67 | 68 | if len(it.queue) > 0 { 69 | item := it.queue[0] 70 | it.queue = it.queue[1:] 71 | return item.key, item.value, nil 72 | } 73 | 74 | return nil, nil, ErrIterationDone 75 | } 76 | -------------------------------------------------------------------------------- /backup.go: -------------------------------------------------------------------------------- 1 | package pogreb 2 | 3 | import ( 4 | "io" 5 | "os" 6 | 7 | "github.com/akrylysov/pogreb/fs" 8 | ) 9 | 10 | func touchFile(fsys fs.FileSystem, path string) error { 11 | f, err := fsys.OpenFile(path, os.O_CREATE|os.O_TRUNC, os.FileMode(0640)) 12 | if err != nil { 13 | return err 14 | } 15 | return f.Close() 16 | } 17 | 18 | // Backup creates a database backup at the specified path. 19 | func (db *DB) Backup(path string) error { 20 | // Make sure the compaction is not running during backup. 
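// maintenanceMu is shared with the compaction path, so holding it blocks Compact until the backup completes.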
21 | db.maintenanceMu.Lock() 22 | defer db.maintenanceMu.Unlock() 23 | 24 | if err := db.opts.rootFS.MkdirAll(path, 0755); err != nil { 25 | return err 26 | } 27 | 28 | db.mu.RLock() 29 | var segments []*segment 30 | activeSegmentSizes := make(map[uint16]int64) 31 | for _, seg := range db.datalog.segmentsBySequenceID() { 32 | segments = append(segments, seg) 33 | if !seg.meta.Full { 34 | // Save the size of the active segments to copy only the data persisted up to the point 35 | // of when the backup started. 36 | activeSegmentSizes[seg.id] = seg.size 37 | } 38 | } 39 | db.mu.RUnlock() 40 | 41 | srcFS := db.opts.FileSystem 42 | dstFS := fs.Sub(db.opts.rootFS, path) 43 | 44 | for _, seg := range segments { 45 | name := segmentName(seg.id, seg.sequenceID) 46 | mode := os.FileMode(0640) 47 | srcFile, err := srcFS.OpenFile(name, os.O_RDONLY, mode) 48 | if err != nil { 49 | return err 50 | } 51 | 52 | dstFile, err := dstFS.OpenFile(name, os.O_CREATE|os.O_RDWR|os.O_TRUNC, mode) 53 | if err != nil { 54 | return err 55 | } 56 | 57 | if srcSize, ok := activeSegmentSizes[seg.id]; ok { 58 | if _, err := io.CopyN(dstFile, srcFile, srcSize); err != nil { 59 | return err 60 | } 61 | } else { 62 | if _, err := io.Copy(dstFile, srcFile); err != nil { 63 | return err 64 | } 65 | } 66 | 67 | if err := srcFile.Close(); err != nil { 68 | return err 69 | } 70 | if err := dstFile.Close(); err != nil { 71 | return err 72 | } 73 | } 74 | 75 | if err := touchFile(dstFS, lockName); err != nil { 76 | return err 77 | } 78 | 79 | return nil 80 | } 81 | -------------------------------------------------------------------------------- /internal/assert/assert.go: -------------------------------------------------------------------------------- 1 | package assert 2 | 3 | import ( 4 | "reflect" 5 | "testing" 6 | "time" 7 | ) 8 | 9 | // Equal fails the test when expected is not equal to actual. 10 | func Equal(t testing.TB, expected interface{}, actual interface{}) { 11 | if !reflect.DeepEqual(expected, actual) { 12 | t.Helper() 13 | t.Fatalf("expected %+v; got %+v", expected, actual) 14 | } 15 | } 16 | 17 | // https://github.com/golang/go/blob/go1.15/src/reflect/value.go#L1071 18 | var nillableKinds = map[reflect.Kind]bool{ 19 | reflect.Chan: true, 20 | reflect.Func: true, 21 | reflect.Map: true, 22 | reflect.Ptr: true, 23 | reflect.UnsafePointer: true, 24 | reflect.Interface: true, 25 | reflect.Slice: true, 26 | } 27 | 28 | // Nil fails the test when obj is not nil. 29 | func Nil(t testing.TB, obj interface{}) { 30 | if obj == nil { 31 | return 32 | } 33 | val := reflect.ValueOf(obj) 34 | if !nillableKinds[val.Kind()] || !val.IsNil() { 35 | t.Helper() 36 | t.Fatalf("expected nil; got %+v", obj) 37 | } 38 | } 39 | 40 | // NotNil fails the test when obj is nil. 41 | func NotNil(t testing.TB, obj interface{}) { 42 | val := reflect.ValueOf(obj) 43 | if obj == nil || (nillableKinds[val.Kind()] && val.IsNil()) { 44 | t.Helper() 45 | t.Fatalf("expected not nil; got %+v", obj) 46 | } 47 | } 48 | 49 | const pollingInterval = time.Millisecond * 10 // How often CompleteWithin polls the cond function. 50 | 51 | // CompleteWithin fails the test when cond doesn't succeed within waitDur. 
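// cond is polled every pollingInterval until it returns true or waitDur elapses.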
52 | func CompleteWithin(t testing.TB, waitDur time.Duration, cond func() bool) { 53 | start := time.Now() 54 | for time.Since(start) < waitDur { 55 | if cond() { 56 | return 57 | } 58 | time.Sleep(pollingInterval) 59 | } 60 | t.Helper() 61 | t.Fatalf("expected to complete within %v", waitDur) 62 | } 63 | 64 | // Panic fails the test when the test doesn't panic with the expected message. 65 | func Panic(t testing.TB, expectedMessage string, f func()) { 66 | t.Helper() 67 | var message interface{} 68 | func() { 69 | defer func() { 70 | message = recover() 71 | }() 72 | f() 73 | }() 74 | Equal(t, expectedMessage, message) 75 | } 76 | -------------------------------------------------------------------------------- /file.go: -------------------------------------------------------------------------------- 1 | package pogreb 2 | 3 | import ( 4 | "io" 5 | "os" 6 | 7 | "github.com/akrylysov/pogreb/fs" 8 | ) 9 | 10 | // file is a database file. 11 | // When stored in a file system, the file starts with a header. 12 | type file struct { 13 | fs.File 14 | size int64 15 | } 16 | 17 | type openFileFlags struct { 18 | truncate bool 19 | readOnly bool 20 | } 21 | 22 | func openFile(fsyst fs.FileSystem, name string, flags openFileFlags) (*file, error) { 23 | var flag int 24 | if flags.readOnly { 25 | flag = os.O_RDONLY 26 | } else { 27 | flag = os.O_CREATE | os.O_RDWR 28 | if flags.truncate { 29 | flag |= os.O_TRUNC 30 | } 31 | } 32 | fi, err := fsyst.OpenFile(name, flag, os.FileMode(0640)) 33 | f := &file{} 34 | if err != nil { 35 | return f, err 36 | } 37 | clean := fi.Close 38 | defer func() { 39 | if clean != nil { 40 | _ = clean() 41 | } 42 | }() 43 | f.File = fi 44 | stat, err := fi.Stat() 45 | if err != nil { 46 | return f, err 47 | } 48 | f.size = stat.Size() 49 | if f.size == 0 { 50 | // It's a new file - write header. 
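// A zero-size file was just created - stamp it with the signature and format version.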
51 | if err := f.writeHeader(); err != nil { 52 | return nil, err 53 | } 54 | } else { 55 | if err := f.readHeader(); err != nil { 56 | return nil, err 57 | } 58 | } 59 | if _, err := f.Seek(int64(headerSize), io.SeekStart); err != nil { 60 | return nil, err 61 | } 62 | clean = nil 63 | return f, nil 64 | } 65 | 66 | func (f *file) writeHeader() error { 67 | h := newHeader() 68 | data, err := h.MarshalBinary() 69 | if err != nil { 70 | return err 71 | } 72 | if _, err = f.append(data); err != nil { 73 | return err 74 | } 75 | return nil 76 | } 77 | 78 | func (f *file) readHeader() error { 79 | h := &header{} 80 | buf := make([]byte, headerSize) 81 | if _, err := io.ReadFull(f, buf); err != nil { 82 | return err 83 | } 84 | return h.UnmarshalBinary(buf) 85 | } 86 | 87 | func (f *file) empty() bool { 88 | return f.size == int64(headerSize) 89 | } 90 | 91 | func (f *file) extend(size uint32) (int64, error) { 92 | off := f.size 93 | if err := f.Truncate(off + int64(size)); err != nil { 94 | return 0, err 95 | } 96 | f.size += int64(size) 97 | return off, nil 98 | } 99 | 100 | func (f *file) append(data []byte) (int64, error) { 101 | off := f.size 102 | if _, err := f.WriteAt(data, off); err != nil { 103 | return 0, err 104 | } 105 | f.size += int64(len(data)) 106 | return off, nil 107 | } 108 | -------------------------------------------------------------------------------- /file_test.go: -------------------------------------------------------------------------------- 1 | package pogreb 2 | 3 | import ( 4 | "errors" 5 | "os" 6 | "time" 7 | 8 | "github.com/akrylysov/pogreb/fs" 9 | ) 10 | 11 | type errfs struct{} 12 | 13 | func (fs *errfs) OpenFile(name string, flag int, perm os.FileMode) (fs.File, error) { 14 | return &errfile{}, nil 15 | } 16 | 17 | func (fs *errfs) CreateLockFile(name string, perm os.FileMode) (fs.LockFile, bool, error) { 18 | return &errfile{}, false, nil 19 | } 20 | 21 | func (fs *errfs) Stat(name string) (os.FileInfo, error) { 22 | return nil, errfileError 23 | } 24 | 25 | func (fs *errfs) Remove(name string) error { 26 | return errfileError 27 | } 28 | 29 | func (fs *errfs) Rename(oldpath, newpath string) error { 30 | return errfileError 31 | } 32 | 33 | func (fs *errfs) ReadDir(name string) ([]os.DirEntry, error) { 34 | return nil, errfileError 35 | } 36 | 37 | func (fs *errfs) MkdirAll(path string, perm os.FileMode) error { 38 | return errfileError 39 | } 40 | 41 | type errfile struct{} 42 | 43 | var errfileError = errors.New("errfile error") 44 | 45 | func (m *errfile) Close() error { 46 | return errfileError 47 | } 48 | 49 | func (m *errfile) Unlock() error { 50 | return errfileError 51 | } 52 | 53 | func (m *errfile) ReadAt(p []byte, off int64) (int, error) { 54 | return 0, errfileError 55 | } 56 | 57 | func (m *errfile) Read(p []byte) (int, error) { 58 | return 0, errfileError 59 | } 60 | 61 | func (m *errfile) WriteAt(p []byte, off int64) (int, error) { 62 | return 0, errfileError 63 | } 64 | 65 | func (m *errfile) Write(p []byte) (int, error) { 66 | return 0, errfileError 67 | } 68 | 69 | func (m *errfile) Seek(offset int64, whence int) (int64, error) { 70 | return 0, errfileError 71 | } 72 | 73 | func (m *errfile) Stat() (os.FileInfo, error) { 74 | return nil, errfileError 75 | } 76 | 77 | func (m *errfile) Sync() error { 78 | return errfileError 79 | } 80 | 81 | func (m *errfile) Truncate(size int64) error { 82 | return errfileError 83 | } 84 | 85 | func (m *errfile) Name() string { 86 | return "errfile" 87 | } 88 | 89 | func (m *errfile) Size() int64 { 90 | 
return 0 91 | } 92 | 93 | func (m *errfile) Mode() os.FileMode { 94 | return os.FileMode(0) 95 | } 96 | 97 | func (m *errfile) ModTime() time.Time { 98 | return time.Now() 99 | } 100 | 101 | func (m *errfile) IsDir() bool { 102 | return false 103 | } 104 | 105 | func (m *errfile) Sys() interface{} { 106 | return errfileError 107 | } 108 | 109 | func (m *errfile) Slice(start int64, end int64) ([]byte, error) { 110 | return nil, errfileError 111 | } 112 | 113 | func (m *errfile) Mmap(fileSize int64, mappingSize int64) error { 114 | return errfileError 115 | } 116 | 117 | func (m *errfile) Munmap() error { 118 | return errfileError 119 | } 120 | 121 | // Compile time interface assertion. 122 | var _ fs.File = &errfile{} 123 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | 3 | # Pogreb 4 | [![Docs](https://godoc.org/github.com/akrylysov/pogreb?status.svg)](https://pkg.go.dev/github.com/akrylysov/pogreb) 5 | [![Build Status](https://github.com/akrylysov/pogreb/actions/workflows/test.yaml/badge.svg?branch=master)](https://github.com/akrylysov/pogreb/actions) 6 | [![Go Report Card](https://goreportcard.com/badge/github.com/akrylysov/pogreb)](https://goreportcard.com/report/github.com/akrylysov/pogreb) 7 | [![Codecov](https://codecov.io/gh/akrylysov/pogreb/branch/master/graph/badge.svg)](https://codecov.io/gh/akrylysov/pogreb) 8 | 9 | Pogreb is an embedded key-value store for read-heavy workloads written in Go. 10 | 11 | ## Key characteristics 12 | 13 | - 100% Go. 14 | - Optimized for fast random lookups and infrequent bulk inserts. 15 | - Can store larger-than-memory data sets. 16 | - Low memory usage. 17 | - All DB methods are safe for concurrent use by multiple goroutines. 18 | 19 | ## Installation 20 | 21 | ```sh 22 | $ go get -u github.com/akrylysov/pogreb 23 | ``` 24 | 25 | ## Usage 26 | 27 | ### Opening a database 28 | 29 | To open or create a new database, use the `pogreb.Open()` function: 30 | 31 | ```go 32 | package main 33 | 34 | import ( 35 | "log" 36 | 37 | "github.com/akrylysov/pogreb" 38 | ) 39 | 40 | func main() { 41 | db, err := pogreb.Open("pogreb.test", nil) 42 | if err != nil { 43 | log.Fatal(err) 44 | return 45 | } 46 | defer db.Close() 47 | } 48 | ``` 49 | 50 | ### Writing to a database 51 | 52 | Use the `DB.Put()` function to insert a new key-value pair: 53 | 54 | ```go 55 | err := db.Put([]byte("testKey"), []byte("testValue")) 56 | if err != nil { 57 | log.Fatal(err) 58 | } 59 | ``` 60 | 61 | ### Reading from a database 62 | 63 | To retrieve the inserted value, use the `DB.Get()` function: 64 | 65 | ```go 66 | val, err := db.Get([]byte("testKey")) 67 | if err != nil { 68 | log.Fatal(err) 69 | } 70 | log.Printf("%s", val) 71 | ``` 72 | 73 | ### Deleting from a database 74 | 75 | Use the `DB.Delete()` function to delete a key-value pair: 76 | 77 | ```go 78 | err := db.Delete([]byte("testKey")) 79 | if err != nil { 80 | log.Fatal(err) 81 | } 82 | ``` 83 | 84 | ### Iterating over items 85 | 86 | To iterate over items, use `ItemIterator` returned by `DB.Items()`: 87 | 88 | ```go 89 | it := db.Items() 90 | for { 91 | key, val, err := it.Next() 92 | if err == pogreb.ErrIterationDone { 93 | break 94 | } 95 | if err != nil { 96 | log.Fatal(err) 97 | } 98 | log.Printf("%s %s", key, val) 99 | } 100 | ``` 101 | 102 | ## Performance 103 | 104 | The benchmarking code can be found in the [pogreb-bench](https://github.com/akrylysov/pogreb-bench) repository. 105 | 106 | Results of read performance benchmark of pogreb, goleveldb, bolt and badgerdb 107 | on DigitalOcean 8 CPUs / 16 GB RAM / 160 GB SSD + Ubuntu 16.04.3 (higher is better): 108 | 109 |

110 | 111 | ## Internals 112 | 113 | [Design document](/docs/design.md). 114 | 115 | ## Limitations 116 | 117 | The design choices made to optimize for point lookups bring limitations for other potential use-cases. For example, using a hash table for indexing makes range scans impossible. Additionally, having a single hash table shared across all WAL segments makes the recovery process require rebuilding the entire index, which may be impractical for large databases. -------------------------------------------------------------------------------- /bucket.go: -------------------------------------------------------------------------------- 1 | package pogreb 2 | 3 | import ( 4 | "encoding/binary" 5 | ) 6 | 7 | const ( 8 | bucketSize = 512 9 | slotsPerBucket = 31 // Maximum number of slots possible to fit in a 512-byte bucket. 10 | ) 11 | 12 | // slot corresponds to a single item in the hash table. 13 | type slot struct { 14 | hash uint32 15 | segmentID uint16 16 | keySize uint16 17 | valueSize uint32 18 | offset uint32 // Offset of the record in a segment. 19 | } 20 | 21 | func (sl slot) kvSize() uint32 { 22 | return uint32(sl.keySize) + sl.valueSize 23 | } 24 | 25 | // bucket is an array of slots. 26 | type bucket struct { 27 | slots [slotsPerBucket]slot 28 | next int64 // Offset of overflow bucket. 29 | } 30 | 31 | // bucketHandle is a bucket, plus its offset and the file it's written to. 32 | type bucketHandle struct { 33 | bucket 34 | file *file 35 | offset int64 36 | } 37 | 38 | func (b bucket) MarshalBinary() ([]byte, error) { 39 | buf := make([]byte, bucketSize) 40 | data := buf 41 | for i := 0; i < slotsPerBucket; i++ { 42 | sl := b.slots[i] 43 | binary.LittleEndian.PutUint32(buf[:4], sl.hash) 44 | binary.LittleEndian.PutUint16(buf[4:6], sl.segmentID) 45 | binary.LittleEndian.PutUint16(buf[6:8], sl.keySize) 46 | binary.LittleEndian.PutUint32(buf[8:12], sl.valueSize) 47 | binary.LittleEndian.PutUint32(buf[12:16], sl.offset) 48 | buf = buf[16:] 49 | } 50 | binary.LittleEndian.PutUint64(buf[:8], uint64(b.next)) 51 | return data, nil 52 | } 53 | 54 | func (b *bucket) UnmarshalBinary(data []byte) error { 55 | for i := 0; i < slotsPerBucket; i++ { 56 | _ = data[16] // bounds check hint to compiler; see golang.org/issue/14808 57 | b.slots[i].hash = binary.LittleEndian.Uint32(data[:4]) 58 | b.slots[i].segmentID = binary.LittleEndian.Uint16(data[4:6]) 59 | b.slots[i].keySize = binary.LittleEndian.Uint16(data[6:8]) 60 | b.slots[i].valueSize = binary.LittleEndian.Uint32(data[8:12]) 61 | b.slots[i].offset = binary.LittleEndian.Uint32(data[12:16]) 62 | data = data[16:] 63 | } 64 | b.next = int64(binary.LittleEndian.Uint64(data[:8])) 65 | return nil 66 | } 67 | 68 | func (b *bucket) del(slotIdx int) { 69 | i := slotIdx 70 | // Shift slots. 71 | for ; i < slotsPerBucket-1; i++ { 72 | b.slots[i] = b.slots[i+1] 73 | } 74 | b.slots[i] = slot{} 75 | } 76 | 77 | func (b *bucketHandle) read() error { 78 | buf, err := b.file.Slice(b.offset, b.offset+int64(bucketSize)) 79 | if err != nil { 80 | return err 81 | } 82 | return b.UnmarshalBinary(buf) 83 | } 84 | 85 | func (b *bucketHandle) write() error { 86 | buf, err := b.MarshalBinary() 87 | if err != nil { 88 | return err 89 | } 90 | _, err = b.file.WriteAt(buf, b.offset) 91 | return err 92 | } 93 | 94 | // slotWriter inserts and writes slots into a bucket. 
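// When the current bucket fills up, insert chains in a freshly allocated overflow bucket; write persists the previously filled buckets before the current one.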
95 | type slotWriter struct { 96 | bucket *bucketHandle 97 | slotIdx int 98 | prevBuckets []*bucketHandle 99 | } 100 | 101 | func (sw *slotWriter) insert(sl slot, idx *index) error { 102 | if sw.slotIdx == slotsPerBucket { 103 | // Bucket is full, create a new overflow bucket. 104 | nextBucket, err := idx.createOverflowBucket() 105 | if err != nil { 106 | return err 107 | } 108 | sw.bucket.next = nextBucket.offset 109 | sw.prevBuckets = append(sw.prevBuckets, sw.bucket) 110 | sw.bucket = nextBucket 111 | sw.slotIdx = 0 112 | } 113 | sw.bucket.slots[sw.slotIdx] = sl 114 | sw.slotIdx++ 115 | return nil 116 | } 117 | 118 | func (sw *slotWriter) write() error { 119 | // Write previous buckets first. 120 | for i := len(sw.prevBuckets) - 1; i >= 0; i-- { 121 | if err := sw.prevBuckets[i].write(); err != nil { 122 | return err 123 | } 124 | } 125 | return sw.bucket.write() 126 | } 127 | -------------------------------------------------------------------------------- /fs/os_mmap.go: -------------------------------------------------------------------------------- 1 | //go:build !plan9 2 | 3 | package fs 4 | 5 | import ( 6 | "io" 7 | "os" 8 | ) 9 | 10 | const ( 11 | initialMmapSize = 1024 << 20 // 1 GiB 12 | ) 13 | 14 | type osMMapFS struct { 15 | osFS 16 | } 17 | 18 | // OSMMap is a file system backed by the os package and memory-mapped files. 19 | var OSMMap FileSystem = &osMMapFS{} 20 | 21 | func (fs *osMMapFS) OpenFile(name string, flag int, perm os.FileMode) (File, error) { 22 | if flag&os.O_APPEND != 0 { 23 | // osMMapFS doesn't support opening files in append-only mode. 24 | // The database doesn't currently use O_APPEND. 25 | return nil, errAppendModeNotSupported 26 | } 27 | f, err := os.OpenFile(name, flag, perm) 28 | if err != nil { 29 | return nil, err 30 | } 31 | 32 | stat, err := f.Stat() 33 | if err != nil { 34 | return nil, err 35 | } 36 | 37 | mf := &osMMapFile{ 38 | File: f, 39 | size: stat.Size(), 40 | } 41 | if err := mf.mremap(); err != nil { 42 | return nil, err 43 | } 44 | return mf, nil 45 | } 46 | 47 | type osMMapFile struct { 48 | *os.File 49 | data []byte 50 | offset int64 51 | size int64 52 | mmapSize int64 53 | } 54 | 55 | func (f *osMMapFile) WriteAt(p []byte, off int64) (int, error) { 56 | n, err := f.File.WriteAt(p, off) 57 | if err != nil { 58 | return 0, err 59 | } 60 | writeOff := off + int64(n) 61 | if writeOff > f.size { 62 | f.size = writeOff 63 | } 64 | return n, f.mremap() 65 | } 66 | 67 | func (f *osMMapFile) Write(p []byte) (int, error) { 68 | n, err := f.File.Write(p) 69 | if err != nil { 70 | return 0, err 71 | } 72 | f.offset += int64(n) 73 | if f.offset > f.size { 74 | f.size = f.offset 75 | } 76 | return n, f.mremap() 77 | } 78 | 79 | func (f *osMMapFile) Seek(offset int64, whence int) (int64, error) { 80 | off, err := f.File.Seek(offset, whence) 81 | f.offset = off 82 | return off, err 83 | } 84 | 85 | func (f *osMMapFile) Read(p []byte) (int, error) { 86 | n, err := f.File.Read(p) 87 | f.offset += int64(n) 88 | return n, err 89 | } 90 | 91 | func (f *osMMapFile) Slice(start int64, end int64) ([]byte, error) { 92 | if end > f.size { 93 | return nil, io.EOF 94 | } 95 | if f.data == nil { 96 | return nil, os.ErrClosed 97 | } 98 | return f.data[start:end], nil 99 | } 100 | 101 | func (f *osMMapFile) munmap() error { 102 | if f.data == nil { 103 | return nil 104 | } 105 | if err := munmap(f.data); err != nil { 106 | return err 107 | } 108 | f.data = nil 109 | f.mmapSize = 0 110 | return nil 111 | } 112 | 113 | func (f *osMMapFile) mmap(fileSize int64, 
mappingSize int64) error { 114 | if f.data != nil { 115 | if err := munmap(f.data); err != nil { 116 | return err 117 | } 118 | } 119 | 120 | data, err := mmap(f.File, fileSize, mappingSize) 121 | if err != nil { 122 | return err 123 | } 124 | 125 | _ = madviceRandom(data) 126 | 127 | f.data = data 128 | return nil 129 | } 130 | 131 | func (f *osMMapFile) mremap() error { 132 | mmapSize := f.mmapSize 133 | 134 | if mmapSize >= f.size { 135 | return nil 136 | } 137 | 138 | if mmapSize == 0 { 139 | mmapSize = initialMmapSize 140 | if mmapSize < f.size { 141 | mmapSize = f.size 142 | } 143 | } else { 144 | if err := f.munmap(); err != nil { 145 | return err 146 | } 147 | mmapSize *= 2 148 | } 149 | 150 | if err := f.mmap(f.size, mmapSize); err != nil { 151 | return err 152 | } 153 | 154 | // On Windows mmap may memory-map less than the requested size. 155 | f.mmapSize = int64(len(f.data)) 156 | 157 | return nil 158 | } 159 | 160 | func (f *osMMapFile) Close() error { 161 | if err := f.munmap(); err != nil { 162 | return err 163 | } 164 | return f.File.Close() 165 | } 166 | 167 | // Return a default FileSystem for this platform. 168 | func DefaultFileSystem() FileSystem { 169 | return OSMMap 170 | } 171 | -------------------------------------------------------------------------------- /recovery.go: -------------------------------------------------------------------------------- 1 | package pogreb 2 | 3 | import ( 4 | "io" 5 | "path/filepath" 6 | 7 | "github.com/akrylysov/pogreb/fs" 8 | ) 9 | 10 | const ( 11 | recoveryBackupExt = ".bac" 12 | ) 13 | 14 | func backupNonsegmentFiles(fsys fs.FileSystem) error { 15 | logger.Println("moving non-segment files...") 16 | 17 | files, err := fsys.ReadDir(".") 18 | if err != nil { 19 | return err 20 | } 21 | 22 | for _, file := range files { 23 | name := file.Name() 24 | ext := filepath.Ext(name) 25 | if ext == segmentExt || name == lockName { 26 | continue 27 | } 28 | dst := name + recoveryBackupExt 29 | if err := fsys.Rename(name, dst); err != nil { 30 | return err 31 | } 32 | logger.Printf("moved %s to %s", name, dst) 33 | } 34 | 35 | return nil 36 | } 37 | 38 | func removeRecoveryBackupFiles(fsys fs.FileSystem) error { 39 | logger.Println("removing recovery backup files...") 40 | 41 | files, err := fsys.ReadDir(".") 42 | if err != nil { 43 | return err 44 | } 45 | 46 | for _, file := range files { 47 | name := file.Name() 48 | ext := filepath.Ext(name) 49 | if ext != recoveryBackupExt { 50 | continue 51 | } 52 | if err := fsys.Remove(name); err != nil { 53 | return err 54 | } 55 | logger.Printf("removed %s", name) 56 | } 57 | 58 | return nil 59 | } 60 | 61 | // recoveryIterator iterates over records of all datalog segments in insertion order. 62 | // Corrupted segments are truncated to the last valid record. 63 | type recoveryIterator struct { 64 | segments []*segment 65 | segit *segmentIterator 66 | } 67 | 68 | func newRecoveryIterator(segments []*segment) *recoveryIterator { 69 | return &recoveryIterator{ 70 | segments: segments, 71 | } 72 | } 73 | 74 | func (it *recoveryIterator) next() (record, error) { 75 | for { 76 | if it.segit == nil { 77 | if len(it.segments) == 0 { 78 | return record{}, ErrIterationDone 79 | } 80 | var err error 81 | it.segit, err = newSegmentIterator(it.segments[0]) 82 | if err != nil { 83 | return record{}, err 84 | } 85 | it.segments = it.segments[1:] 86 | } 87 | rec, err := it.segit.next() 88 | if err == io.EOF || err == io.ErrUnexpectedEOF || err == errCorrupted { 89 | // Truncate file to the last valid offset. 
90 | if err := it.segit.f.Truncate(int64(it.segit.offset)); err != nil { 91 | return record{}, err 92 | } 93 | fi, fierr := it.segit.f.Stat() 94 | if fierr != nil { 95 | return record{}, fierr 96 | } 97 | logger.Printf("truncated segment %s to offset %d", fi.Name(), it.segit.offset) 98 | err = ErrIterationDone 99 | } 100 | if err == ErrIterationDone { 101 | it.segit = nil 102 | continue 103 | } 104 | if err != nil { 105 | return record{}, err 106 | } 107 | return rec, nil 108 | } 109 | } 110 | 111 | func (db *DB) recover() error { 112 | logger.Println("started recovery") 113 | logger.Println("rebuilding index...") 114 | 115 | segments := db.datalog.segmentsBySequenceID() 116 | it := newRecoveryIterator(segments) 117 | for { 118 | rec, err := it.next() 119 | if err == ErrIterationDone { 120 | break 121 | } 122 | if err != nil { 123 | return err 124 | } 125 | 126 | h := db.hash(rec.key) 127 | meta := db.datalog.segments[rec.segmentID].meta 128 | if rec.rtype == recordTypePut { 129 | sl := slot{ 130 | hash: h, 131 | segmentID: rec.segmentID, 132 | keySize: uint16(len(rec.key)), 133 | valueSize: uint32(len(rec.value)), 134 | offset: rec.offset, 135 | } 136 | if err := db.put(sl, rec.key); err != nil { 137 | return err 138 | } 139 | meta.PutRecords++ 140 | } else { 141 | if err := db.del(h, rec.key, false); err != nil { 142 | return err 143 | } 144 | meta.DeleteRecords++ 145 | meta.DeletedBytes += uint32(len(rec.data)) 146 | } 147 | } 148 | 149 | // Mark all segments except the newest as full. 150 | for i := 0; i < len(segments)-1; i++ { 151 | segments[i].meta.Full = true 152 | } 153 | 154 | if err := removeRecoveryBackupFiles(db.opts.FileSystem); err != nil { 155 | logger.Printf("error removing recovery backup files: %v", err) 156 | } 157 | 158 | logger.Println("successfully recovered database") 159 | 160 | return nil 161 | } 162 | -------------------------------------------------------------------------------- /segment.go: -------------------------------------------------------------------------------- 1 | package pogreb 2 | 3 | import ( 4 | "bufio" 5 | "encoding/binary" 6 | "fmt" 7 | "hash/crc32" 8 | "io" 9 | ) 10 | 11 | type recordType int 12 | 13 | const ( 14 | recordTypePut recordType = iota 15 | recordTypeDelete 16 | 17 | segmentExt = ".psg" 18 | ) 19 | 20 | // segment is a write-ahead log segment. 21 | // It consists of a sequence of binary-encoded variable-length records. 22 | type segment struct { 23 | *file 24 | id uint16 // Physical segment identifier. 25 | sequenceID uint64 // Logical monotonically increasing segment identifier.
26 | name string 27 | meta *segmentMeta 28 | } 29 | 30 | func segmentName(id uint16, sequenceID uint64) string { 31 | return fmt.Sprintf("%05d-%d%s", id, sequenceID, segmentExt) 32 | } 33 | 34 | type segmentMeta struct { 35 | Full bool 36 | PutRecords uint32 37 | DeleteRecords uint32 38 | DeletedKeys uint32 39 | DeletedBytes uint32 40 | } 41 | 42 | func segmentMetaName(id uint16, sequenceID uint64) string { 43 | return segmentName(id, sequenceID) + metaExt 44 | } 45 | 46 | // Binary representation of a segment record: 47 | // +---------------+------------------+------------------+-...-+--...--+----------+ 48 | // | Key Size (2B) | Record Type (1b) | Value Size (31b) | Key | Value | CRC (4B) | 49 | // +---------------+------------------+------------------+-...-+--...--+----------+ 50 | type record struct { 51 | rtype recordType 52 | segmentID uint16 53 | offset uint32 54 | data []byte 55 | key []byte 56 | value []byte 57 | } 58 | 59 | func encodedRecordSize(kvSize uint32) uint32 { 60 | // key size, value size, key, value, crc32 61 | return 2 + 4 + kvSize + 4 62 | } 63 | 64 | func encodeRecord(key []byte, value []byte, rt recordType) []byte { 65 | size := encodedRecordSize(uint32(len(key) + len(value))) 66 | data := make([]byte, size) 67 | binary.LittleEndian.PutUint16(data[:2], uint16(len(key))) 68 | 69 | valLen := uint32(len(value)) 70 | if rt == recordTypeDelete { // Set delete bit. 71 | valLen |= 1 << 31 72 | } 73 | binary.LittleEndian.PutUint32(data[2:], valLen) 74 | 75 | copy(data[6:], key) 76 | copy(data[6+len(key):], value) 77 | checksum := crc32.ChecksumIEEE(data[:6+len(key)+len(value)]) 78 | binary.LittleEndian.PutUint32(data[size-4:size], checksum) 79 | return data 80 | } 81 | 82 | func encodePutRecord(key []byte, value []byte) []byte { 83 | return encodeRecord(key, value, recordTypePut) 84 | } 85 | 86 | func encodeDeleteRecord(key []byte) []byte { 87 | return encodeRecord(key, nil, recordTypeDelete) 88 | } 89 | 90 | // segmentIterator iterates over segment records. 91 | type segmentIterator struct { 92 | f *segment 93 | offset uint32 94 | r *bufio.Reader 95 | buf []byte // kv size and crc32 reusable buffer. 96 | } 97 | 98 | func newSegmentIterator(f *segment) (*segmentIterator, error) { 99 | if _, err := f.Seek(int64(headerSize), io.SeekStart); err != nil { 100 | return nil, err 101 | } 102 | return &segmentIterator{ 103 | f: f, 104 | offset: headerSize, 105 | r: bufio.NewReader(f), 106 | buf: make([]byte, 6), 107 | }, nil 108 | } 109 | 110 | func (it *segmentIterator) next() (record, error) { 111 | // Read key and value size. 112 | kvSizeBuf := it.buf 113 | if _, err := io.ReadFull(it.r, kvSizeBuf); err != nil { 114 | if err == io.EOF { 115 | return record{}, ErrIterationDone 116 | } 117 | return record{}, err 118 | } 119 | 120 | // Decode key size. 121 | keySize := uint32(binary.LittleEndian.Uint16(kvSizeBuf[:2])) 122 | 123 | // Decode value size and record type. 124 | rt := recordTypePut 125 | valueSize := binary.LittleEndian.Uint32(kvSizeBuf[2:]) 126 | if valueSize&(1<<31) != 0 { 127 | rt = recordTypeDelete 128 | valueSize &^= 1 << 31 129 | } 130 | 131 | // Read key, value and checksum. 132 | recordSize := encodedRecordSize(keySize + valueSize) 133 | data := make([]byte, recordSize) 134 | copy(data, kvSizeBuf) 135 | if _, err := io.ReadFull(it.r, data[6:]); err != nil { 136 | return record{}, err 137 | } 138 | 139 | // Verify checksum. 
140 | checksum := binary.LittleEndian.Uint32(data[len(data)-4:]) 141 | if checksum != crc32.ChecksumIEEE(data[:len(data)-4]) { 142 | return record{}, errCorrupted 143 | } 144 | 145 | offset := it.offset 146 | it.offset += recordSize 147 | rec := record{ 148 | rtype: rt, 149 | segmentID: it.f.id, 150 | offset: offset, 151 | data: data, 152 | key: data[6 : 6+keySize], 153 | value: data[6+keySize : 6+keySize+valueSize], 154 | } 155 | return rec, nil 156 | } 157 | -------------------------------------------------------------------------------- /compaction.go: -------------------------------------------------------------------------------- 1 | package pogreb 2 | 3 | import ( 4 | "github.com/akrylysov/pogreb/internal/errors" 5 | ) 6 | 7 | // promoteRecord writes the record to the current segment if the index still points to the record. 8 | // Otherwise it discards the record. 9 | func (db *DB) promoteRecord(rec record) (bool, error) { 10 | hash := db.hash(rec.key) 11 | it := db.index.newBucketIterator(db.index.bucketIndex(hash)) 12 | for { 13 | b, err := it.next() 14 | if err == ErrIterationDone { 15 | // Exhausted all buckets and the slot wasn't found. 16 | // The key was deleted or overwritten. The record is safe to discard. 17 | return true, nil 18 | } 19 | if err != nil { 20 | return false, err 21 | } 22 | for i := 0; i < slotsPerBucket; i++ { 23 | sl := b.slots[i] 24 | 25 | // No more slots in the bucket. 26 | if sl.offset == 0 { 27 | break 28 | } 29 | 30 | // Slot points to a different record. 31 | if hash != sl.hash || rec.offset != sl.offset || rec.segmentID != sl.segmentID { 32 | continue 33 | } 34 | 35 | // The record is in the index, write it to the current segment. 36 | segmentID, offset, err := db.datalog.writeRecord(rec.data, rec.rtype) // TODO: batch writes 37 | if err != nil { 38 | return false, err 39 | } 40 | 41 | // Update index. 42 | b.slots[i].segmentID = segmentID 43 | b.slots[i].offset = offset 44 | return false, b.write() 45 | } 46 | } 47 | } 48 | 49 | // CompactionResult holds the compaction result. 50 | type CompactionResult struct { 51 | CompactedSegments int 52 | ReclaimedRecords int 53 | ReclaimedBytes int 54 | } 55 | 56 | func (db *DB) compact(sourceSeg *segment) (CompactionResult, error) { 57 | cr := CompactionResult{} 58 | 59 | db.mu.Lock() 60 | sourceSeg.meta.Full = true // Prevent writes to the compacted file. 61 | db.mu.Unlock() 62 | 63 | it, err := newSegmentIterator(sourceSeg) 64 | if err != nil { 65 | return cr, err 66 | } 67 | // Copy records from sourceSeg to the current segment. 68 | for { 69 | err := func() error { 70 | db.mu.Lock() 71 | defer db.mu.Unlock() 72 | rec, err := it.next() 73 | if err != nil { 74 | return err 75 | } 76 | if rec.rtype == recordTypeDelete { 77 | cr.ReclaimedRecords++ 78 | cr.ReclaimedBytes += len(rec.data) 79 | return nil 80 | } 81 | reclaimed, err := db.promoteRecord(rec) 82 | if reclaimed { 83 | cr.ReclaimedRecords++ 84 | cr.ReclaimedBytes += len(rec.data) 85 | } 86 | return err 87 | }() 88 | if err == ErrIterationDone { 89 | break 90 | } 91 | if err != nil { 92 | return cr, err 93 | } 94 | } 95 | 96 | db.mu.Lock() 97 | defer db.mu.Unlock() 98 | err = db.datalog.removeSegment(sourceSeg) 99 | return cr, err 100 | } 101 | 102 | // pickForCompaction returns segments eligible for compaction. 
103 | func (db *DB) pickForCompaction() []*segment { 104 | segments := db.datalog.segmentsBySequenceID() 105 | var picked []*segment 106 | for i := len(segments) - 1; i >= 0; i-- { 107 | seg := segments[i] 108 | 109 | if uint32(seg.size) < db.opts.compactionMinSegmentSize { 110 | continue 111 | } 112 | 113 | fragmentation := float32(seg.meta.DeletedBytes) / float32(seg.size) 114 | if fragmentation < db.opts.compactionMinFragmentation { 115 | continue 116 | } 117 | 118 | if seg.meta.DeleteRecords > 0 { 119 | // Delete records can be discarded only when older segments contain no put records 120 | // for the corresponding keys. 121 | // All segments older than the segment eligible for compaction have to be compacted. 122 | return append(segments[:i+1], picked...) 123 | } 124 | 125 | picked = append([]*segment{seg}, picked...) 126 | } 127 | return picked 128 | } 129 | 130 | // Compact compacts the DB. Deleted and overwritten items are discarded. 131 | // Returns an error if compaction is already in progress. 132 | func (db *DB) Compact() (CompactionResult, error) { 133 | cr := CompactionResult{} 134 | 135 | // Run only a single compaction at a time. 136 | if !db.maintenanceMu.TryLock() { 137 | return cr, errBusy 138 | } 139 | defer func() { 140 | db.maintenanceMu.Unlock() 141 | }() 142 | 143 | db.mu.RLock() 144 | segments := db.pickForCompaction() 145 | db.mu.RUnlock() 146 | 147 | for _, seg := range segments { 148 | segcr, err := db.compact(seg) 149 | if err != nil { 150 | return cr, errors.Wrapf(err, "compacting segment %s", seg.name) 151 | } 152 | cr.CompactedSegments++ 153 | cr.ReclaimedRecords += segcr.ReclaimedRecords 154 | cr.ReclaimedBytes += segcr.ReclaimedBytes 155 | } 156 | 157 | return cr, nil 158 | } 159 | -------------------------------------------------------------------------------- /internal/assert/assert_test.go: -------------------------------------------------------------------------------- 1 | package assert 2 | 3 | import ( 4 | "fmt" 5 | "sync" 6 | "testing" 7 | "time" 8 | ) 9 | 10 | func TestEqual(t *testing.T) { 11 | testCases := []struct { 12 | first interface{} 13 | second interface{} 14 | expectedFailed bool 15 | }{ 16 | { 17 | first: 1, 18 | second: 1, 19 | expectedFailed: false, 20 | }, 21 | 22 | { 23 | first: nil, 24 | second: nil, 25 | expectedFailed: false, 26 | }, 27 | { 28 | first: "1", 29 | second: "1", 30 | expectedFailed: false, 31 | }, 32 | { 33 | first: struct{}{}, 34 | second: struct{}{}, 35 | expectedFailed: false, 36 | }, 37 | { 38 | first: struct{ x int }{x: 1}, 39 | second: struct{ x int }{x: 1}, 40 | expectedFailed: false, 41 | }, 42 | { 43 | first: 1, 44 | second: 2, 45 | expectedFailed: true, 46 | }, 47 | { 48 | first: 1, 49 | second: "1", 50 | expectedFailed: true, 51 | }, 52 | { 53 | first: 1, 54 | second: 1.0, 55 | expectedFailed: true, 56 | }, 57 | { 58 | first: struct{ x int }{x: 1}, 59 | second: struct{ x int }{x: 2}, 60 | expectedFailed: true, 61 | }, 62 | { 63 | first: struct{ x int }{x: 1}, 64 | second: struct{ y int }{y: 1}, 65 | expectedFailed: true, 66 | }, 67 | } 68 | 69 | for i, tc := range testCases { 70 | t.Run(fmt.Sprintf("%d %+v", i, tc), func(t *testing.T) { 71 | mock := &testing.T{} 72 | wg := &sync.WaitGroup{} 73 | wg.Add(1) 74 | // Run the asserting in a goroutine. t.Fatal calls runtime.Goexit. 
75 | go func() { 76 | defer wg.Done() 77 | Equal(mock, tc.first, tc.second) 78 | }() 79 | wg.Wait() 80 | failed := mock.Failed() 81 | if tc.expectedFailed != failed { 82 | t.Fatalf("expected to fail: %t; failed: %t", tc.expectedFailed, failed) 83 | } 84 | }) 85 | } 86 | } 87 | 88 | func TestNil(t *testing.T) { 89 | var nilIntPtr *int 90 | var nilStructPtr *struct{ x int } 91 | var nilSlice []string 92 | 93 | testCases := []struct { 94 | obj interface{} 95 | isNil bool 96 | }{ 97 | { 98 | obj: nil, 99 | isNil: true, 100 | }, 101 | { 102 | obj: nilIntPtr, 103 | isNil: true, 104 | }, 105 | { 106 | obj: nilStructPtr, 107 | isNil: true, 108 | }, 109 | { 110 | obj: nilSlice, 111 | isNil: true, 112 | }, 113 | { 114 | obj: 1, 115 | isNil: false, 116 | }, 117 | { 118 | obj: "1", 119 | isNil: false, 120 | }, 121 | { 122 | obj: []string{}, 123 | isNil: false, 124 | }, 125 | { 126 | obj: [2]int{1, 1}, 127 | isNil: false, 128 | }, 129 | } 130 | 131 | for i, tc := range testCases { 132 | t.Run(fmt.Sprintf("%d %+v", i, tc.obj), func(t *testing.T) { 133 | mockNil := &testing.T{} 134 | mockNotNil := &testing.T{} 135 | wg := &sync.WaitGroup{} 136 | wg.Add(2) 137 | go func() { 138 | defer wg.Done() 139 | Nil(mockNil, tc.obj) 140 | }() 141 | go func() { 142 | defer wg.Done() 143 | NotNil(mockNotNil, tc.obj) 144 | }() 145 | wg.Wait() 146 | if tc.isNil == mockNil.Failed() { 147 | t.Fatalf("Nil expected to fail: %t; failed: %t", !tc.isNil, mockNil.Failed()) 148 | } 149 | if !tc.isNil == mockNotNil.Failed() { 150 | t.Fatalf("NotNil expected to fail: %t; failed: %t", tc.isNil, mockNotNil.Failed()) 151 | } 152 | }) 153 | } 154 | } 155 | 156 | func TestPanic(t *testing.T) { 157 | testCases := []struct { 158 | name string 159 | f func() 160 | expectedFailed bool 161 | }{ 162 | { 163 | name: "panic", 164 | f: func() { 165 | panic("message123") 166 | }, 167 | expectedFailed: false, 168 | }, 169 | { 170 | name: "panic: wrong message", 171 | f: func() { 172 | panic("message456") 173 | }, 174 | expectedFailed: true, 175 | }, 176 | { 177 | name: "no panic", 178 | f: func() {}, 179 | expectedFailed: true, 180 | }, 181 | } 182 | for _, tc := range testCases { 183 | t.Run(tc.name, func(t *testing.T) { 184 | mock := &testing.T{} 185 | wg := &sync.WaitGroup{} 186 | wg.Add(1) 187 | go func() { 188 | defer wg.Done() 189 | Panic(mock, "message123", tc.f) 190 | }() 191 | wg.Wait() 192 | if tc.expectedFailed != mock.Failed() { 193 | t.Fatalf("expected to fail: %t; failed: %t", tc.expectedFailed, mock.Failed()) 194 | } 195 | }) 196 | } 197 | } 198 | 199 | func TestCompleteWithin(t *testing.T) { 200 | var tc2Tries int 201 | var tc4Tries int 202 | testCases := []struct { 203 | name string 204 | dur time.Duration 205 | cond func() bool 206 | expectedFailed bool 207 | }{ 208 | { 209 | name: "completed: first try", 210 | dur: time.Hour, 211 | cond: func() bool { 212 | return true 213 | }, 214 | expectedFailed: false, 215 | }, 216 | { 217 | name: "completed: second try", 218 | dur: time.Hour, 219 | cond: func() bool { 220 | if tc2Tries == 0 { 221 | tc2Tries++ 222 | return false 223 | } 224 | return true 225 | }, 226 | expectedFailed: false, 227 | }, 228 | { 229 | name: "not completed", 230 | dur: time.Nanosecond, 231 | cond: func() bool { 232 | return false 233 | }, 234 | expectedFailed: true, 235 | }, 236 | { 237 | name: "not completed: timeout", 238 | dur: time.Nanosecond, 239 | cond: func() bool { 240 | if tc4Tries == 0 { 241 | tc4Tries++ 242 | time.Sleep(pollingInterval * 2) 243 | return false 244 | } 245 | return true 246 | }, 247 | 
expectedFailed: true, 248 | }, 249 | } 250 | for _, tc := range testCases { 251 | t.Run(tc.name, func(t *testing.T) { 252 | mock := &testing.T{} 253 | wg := &sync.WaitGroup{} 254 | wg.Add(1) 255 | go func() { 256 | defer wg.Done() 257 | CompleteWithin(mock, tc.dur, tc.cond) 258 | }() 259 | wg.Wait() 260 | if tc.expectedFailed != mock.Failed() { 261 | t.Fatalf("expected to fail: %t; failed: %t", tc.expectedFailed, mock.Failed()) 262 | } 263 | }) 264 | } 265 | } 266 | -------------------------------------------------------------------------------- /fs/mem.go: -------------------------------------------------------------------------------- 1 | package fs 2 | 3 | import ( 4 | "io" 5 | "os" 6 | "path/filepath" 7 | "time" 8 | ) 9 | 10 | type memFS struct { 11 | files map[string]*memFile 12 | } 13 | 14 | // Mem is a file system backed by memory. 15 | // It should be used for testing only. 16 | var Mem FileSystem = &memFS{files: map[string]*memFile{}} 17 | 18 | func (fs *memFS) OpenFile(name string, flag int, perm os.FileMode) (File, error) { 19 | if flag&os.O_APPEND != 0 { 20 | // memFS doesn't support opening files in append-only mode. 21 | // The database doesn't currently use O_APPEND. 22 | return nil, errAppendModeNotSupported 23 | } 24 | f := fs.files[name] 25 | if f == nil { 26 | // The file doesn't exist. 27 | if (flag & os.O_CREATE) == 0 { 28 | return nil, os.ErrNotExist 29 | } 30 | f = &memFile{ 31 | name: name, 32 | perm: perm, // Perm is saved to return it in Mode, but don't do anything else with it yet. 33 | refs: 1, 34 | } 35 | fs.files[name] = f 36 | } else { 37 | if (flag & os.O_TRUNC) != 0 { 38 | f.size = 0 39 | f.buf = nil 40 | } 41 | f.refs += 1 42 | } 43 | return &seekableMemFile{memFile: f}, nil 44 | } 45 | 46 | func (fs *memFS) CreateLockFile(name string, perm os.FileMode) (LockFile, bool, error) { 47 | f, exists := fs.files[name] 48 | if f != nil && f.refs > 0 { 49 | return nil, false, os.ErrExist 50 | } 51 | _, err := fs.OpenFile(name, os.O_CREATE, perm) 52 | if err != nil { 53 | return nil, false, err 54 | } 55 | return fs.files[name], exists, nil 56 | } 57 | 58 | func (fs *memFS) Stat(name string) (os.FileInfo, error) { 59 | if f, ok := fs.files[name]; ok { 60 | return f, nil 61 | } 62 | return nil, os.ErrNotExist 63 | } 64 | 65 | func (fs *memFS) Remove(name string) error { 66 | if _, ok := fs.files[name]; ok { 67 | delete(fs.files, name) 68 | return nil 69 | } 70 | return os.ErrNotExist 71 | } 72 | 73 | func (fs *memFS) Rename(oldpath, newpath string) error { 74 | if f, ok := fs.files[oldpath]; ok { 75 | delete(fs.files, oldpath) 76 | fs.files[newpath] = f 77 | f.name = newpath 78 | return nil 79 | } 80 | return os.ErrNotExist 81 | } 82 | 83 | func (fs *memFS) ReadDir(dir string) ([]os.DirEntry, error) { 84 | dir = filepath.Clean(dir) 85 | var entries []os.DirEntry 86 | for name, f := range fs.files { 87 | if filepath.Dir(name) == dir { 88 | entries = append(entries, f) 89 | } 90 | } 91 | return entries, nil 92 | } 93 | 94 | func (fs *memFS) MkdirAll(path string, perm os.FileMode) error { 95 | // FIXME: the implementation is incomplete. 96 | // memFS lets create a file even when the parent directory doesn't exist. 
97 | return nil 98 | } 99 | 100 | type memFile struct { 101 | name string 102 | perm os.FileMode 103 | buf []byte 104 | size int64 105 | refs int 106 | } 107 | 108 | func (f *memFile) Close() error { 109 | if f.refs == 0 { 110 | return os.ErrClosed 111 | } 112 | f.refs -= 1 113 | return nil 114 | } 115 | 116 | func (f *memFile) Unlock() error { 117 | if err := f.Close(); err != nil { 118 | return err 119 | } 120 | return Mem.Remove(f.name) 121 | } 122 | 123 | func (f *memFile) ReadAt(p []byte, off int64) (int, error) { 124 | if f.refs == 0 { 125 | return 0, os.ErrClosed 126 | } 127 | if off >= f.size { 128 | return 0, io.EOF 129 | } 130 | n := int64(len(p)) 131 | if n > f.size-off { 132 | copy(p, f.buf[off:]) 133 | return int(f.size - off), nil 134 | } 135 | copy(p, f.buf[off:off+n]) 136 | return int(n), nil 137 | } 138 | 139 | func (f *memFile) WriteAt(p []byte, off int64) (int, error) { 140 | if f.refs == 0 { 141 | return 0, os.ErrClosed 142 | } 143 | n := int64(len(p)) 144 | if off+n > f.size { 145 | f.truncate(off + n) 146 | } 147 | copy(f.buf[off:off+n], p) 148 | return int(n), nil 149 | } 150 | 151 | func (f *memFile) Stat() (os.FileInfo, error) { 152 | if f.refs == 0 { 153 | return f, os.ErrClosed 154 | } 155 | return f, nil 156 | } 157 | 158 | func (f *memFile) Sync() error { 159 | if f.refs == 0 { 160 | return os.ErrClosed 161 | } 162 | return nil 163 | } 164 | 165 | func (f *memFile) truncate(size int64) { 166 | if size > f.size { 167 | diff := int(size - f.size) 168 | f.buf = append(f.buf, make([]byte, diff)...) 169 | } else { 170 | f.buf = f.buf[:size] 171 | } 172 | f.size = size 173 | } 174 | 175 | func (f *memFile) Truncate(size int64) error { 176 | if f.refs == 0 { 177 | return os.ErrClosed 178 | } 179 | f.truncate(size) 180 | return nil 181 | } 182 | 183 | func (f *memFile) Name() string { 184 | _, name := filepath.Split(f.name) 185 | return name 186 | } 187 | 188 | func (f *memFile) Size() int64 { 189 | return f.size 190 | } 191 | 192 | func (f *memFile) Mode() os.FileMode { 193 | return f.perm 194 | } 195 | 196 | func (f *memFile) ModTime() time.Time { 197 | return time.Now() 198 | } 199 | 200 | func (f *memFile) IsDir() bool { 201 | return false 202 | } 203 | 204 | func (f *memFile) Sys() interface{} { 205 | return nil 206 | } 207 | 208 | func (f *memFile) Type() os.FileMode { 209 | return f.perm 210 | } 211 | 212 | func (f *memFile) Info() (os.FileInfo, error) { 213 | return f.Stat() 214 | } 215 | 216 | func (f *memFile) Slice(start int64, end int64) ([]byte, error) { 217 | if f.refs == 0 { 218 | return nil, os.ErrClosed 219 | } 220 | if end > f.size { 221 | return nil, io.EOF 222 | } 223 | return f.buf[start:end], nil 224 | } 225 | 226 | type seekableMemFile struct { 227 | *memFile 228 | offset int64 229 | } 230 | 231 | func (f *seekableMemFile) Read(p []byte) (int, error) { 232 | n, err := f.ReadAt(p, f.offset) 233 | if err != nil { 234 | return n, err 235 | } 236 | f.offset += int64(n) 237 | return n, err 238 | } 239 | 240 | func (f *seekableMemFile) Write(p []byte) (int, error) { 241 | n, err := f.WriteAt(p, f.offset) 242 | if err != nil { 243 | return n, err 244 | } 245 | f.offset += int64(n) 246 | return n, err 247 | } 248 | 249 | func (f *seekableMemFile) Seek(offset int64, whence int) (int64, error) { 250 | if f.refs == 0 { 251 | return 0, os.ErrClosed 252 | } 253 | switch whence { 254 | case io.SeekEnd: 255 | f.offset = f.size + offset 256 | case io.SeekStart: 257 | f.offset = offset 258 | case io.SeekCurrent: 259 | f.offset += offset 260 | } 261 | return 
f.offset, nil 262 | } 263 | -------------------------------------------------------------------------------- /datalog.go: -------------------------------------------------------------------------------- 1 | package pogreb 2 | 3 | import ( 4 | "fmt" 5 | "math" 6 | "os" 7 | "path/filepath" 8 | "sort" 9 | "strconv" 10 | "strings" 11 | 12 | "github.com/akrylysov/pogreb/internal/errors" 13 | ) 14 | 15 | const ( 16 | maxSegments = math.MaxInt16 17 | ) 18 | 19 | // datalog is a write-ahead log. 20 | type datalog struct { 21 | opts *Options 22 | curSeg *segment 23 | segments [maxSegments]*segment 24 | maxSequenceID uint64 25 | } 26 | 27 | func openDatalog(opts *Options) (*datalog, error) { 28 | files, err := opts.FileSystem.ReadDir(".") 29 | if err != nil { 30 | return nil, err 31 | } 32 | 33 | dl := &datalog{ 34 | opts: opts, 35 | } 36 | 37 | // Open existing segments. 38 | for _, file := range files { 39 | name := file.Name() 40 | ext := filepath.Ext(name) 41 | if ext != segmentExt { 42 | continue 43 | } 44 | id, seqID, err := parseSegmentName(name) 45 | if err != nil { 46 | return nil, err 47 | } 48 | seg, err := dl.openSegment(name, id, seqID) 49 | if err != nil { 50 | return nil, errors.Wrapf(err, "opening segment %s", name) 51 | } 52 | if seg.sequenceID > dl.maxSequenceID { 53 | dl.maxSequenceID = seg.sequenceID 54 | } 55 | dl.segments[seg.id] = seg 56 | } 57 | 58 | if err := dl.swapSegment(); err != nil { 59 | return nil, err 60 | } 61 | 62 | return dl, nil 63 | } 64 | 65 | func parseSegmentName(name string) (uint16, uint64, error) { 66 | parts := strings.SplitN(strings.TrimSuffix(name, segmentExt), "-", 2) 67 | id, err := strconv.ParseUint(parts[0], 10, 16) 68 | if err != nil { 69 | return 0, 0, err 70 | } 71 | var seqID uint64 72 | if len(parts) == 2 { 73 | seqID, err = strconv.ParseUint(parts[1], 10, 64) 74 | if err != nil { 75 | return 0, 0, err 76 | } 77 | } 78 | return uint16(id), seqID, nil 79 | } 80 | 81 | func (dl *datalog) openSegment(name string, id uint16, seqID uint64) (*segment, error) { 82 | f, err := openFile(dl.opts.FileSystem, name, openFileFlags{}) 83 | if err != nil { 84 | return nil, err 85 | } 86 | 87 | meta := &segmentMeta{} 88 | if !f.empty() { 89 | metaName := name + metaExt 90 | if err := readGobFile(dl.opts.FileSystem, metaName, &meta); err != nil { 91 | logger.Printf("error reading segment meta %d: %v", id, err) 92 | // TODO: rebuild meta? 93 | } 94 | } 95 | 96 | seg := &segment{ 97 | file: f, 98 | id: id, 99 | sequenceID: seqID, 100 | name: name, 101 | meta: meta, 102 | } 103 | 104 | return seg, nil 105 | } 106 | 107 | func (dl *datalog) nextWritableSegmentID() (uint16, uint64, error) { 108 | for id, seg := range dl.segments { 109 | // Pick empty segment. 110 | if seg == nil { 111 | dl.maxSequenceID++ 112 | return uint16(id), dl.maxSequenceID, nil 113 | } 114 | } 115 | return 0, 0, fmt.Errorf("number of segments exceeds %d", maxSegments) 116 | } 117 | 118 | func (dl *datalog) swapSegment() error { 119 | // Pick unfilled segment. 120 | for _, seg := range dl.segments { 121 | if seg != nil && !seg.meta.Full { 122 | dl.curSeg = seg 123 | return nil 124 | } 125 | } 126 | 127 | // Create new segment. 
128 | id, seqID, err := dl.nextWritableSegmentID() 129 | if err != nil { 130 | return err 131 | } 132 | 133 | name := segmentName(id, seqID) 134 | seg, err := dl.openSegment(name, id, seqID) 135 | if err != nil { 136 | return err 137 | } 138 | 139 | dl.segments[id] = seg 140 | dl.curSeg = seg 141 | 142 | return nil 143 | } 144 | 145 | func (dl *datalog) removeSegment(seg *segment) error { 146 | dl.segments[seg.id] = nil 147 | 148 | if err := seg.Close(); err != nil { 149 | return err 150 | } 151 | 152 | // Remove segment meta from FS. 153 | metaName := seg.name + metaExt 154 | if err := dl.opts.FileSystem.Remove(metaName); err != nil && !os.IsNotExist(err) { 155 | return err 156 | } 157 | 158 | // Remove segment from FS. 159 | if err := dl.opts.FileSystem.Remove(seg.name); err != nil { 160 | return err 161 | } 162 | 163 | return nil 164 | } 165 | 166 | func (dl *datalog) readKeyValue(sl slot) ([]byte, []byte, error) { 167 | off := int64(sl.offset) + 6 // Skip key size and value size. 168 | seg := dl.segments[sl.segmentID] 169 | keyValue, err := seg.Slice(off, off+int64(sl.kvSize())) 170 | if err != nil { 171 | return nil, nil, err 172 | } 173 | return keyValue[:sl.keySize], keyValue[sl.keySize:], nil 174 | } 175 | 176 | func (dl *datalog) readKey(sl slot) ([]byte, error) { 177 | off := int64(sl.offset) + 6 178 | seg := dl.segments[sl.segmentID] 179 | return seg.Slice(off, off+int64(sl.keySize)) 180 | } 181 | 182 | // trackDel updates the segment's metadata for deleted or overwritten items. 183 | func (dl *datalog) trackDel(sl slot) { 184 | meta := dl.segments[sl.segmentID].meta 185 | meta.DeletedKeys++ 186 | meta.DeletedBytes += encodedRecordSize(sl.kvSize()) 187 | } 188 | 189 | func (dl *datalog) del(key []byte) error { 190 | rec := encodeDeleteRecord(key) 191 | _, _, err := dl.writeRecord(rec, recordTypeDelete) 192 | if err != nil { 193 | return err 194 | } 195 | // Compaction removes delete records, so increment DeletedBytes. 196 | dl.curSeg.meta.DeletedBytes += uint32(len(rec)) 197 | return nil 198 | } 199 | 200 | func (dl *datalog) writeRecord(data []byte, rt recordType) (uint16, uint32, error) { 201 | if dl.curSeg.meta.Full || dl.curSeg.size+int64(len(data)) > int64(dl.opts.maxSegmentSize) { 202 | // Current segment is full, create a new one. 203 | dl.curSeg.meta.Full = true 204 | if err := dl.swapSegment(); err != nil { 205 | return 0, 0, err 206 | } 207 | } 208 | off, err := dl.curSeg.append(data) 209 | if err != nil { 210 | return 0, 0, err 211 | } 212 | switch rt { 213 | case recordTypePut: 214 | dl.curSeg.meta.PutRecords++ 215 | case recordTypeDelete: 216 | dl.curSeg.meta.DeleteRecords++ 217 | } 218 | return dl.curSeg.id, uint32(off), nil 219 | } 220 | 221 | func (dl *datalog) put(key []byte, value []byte) (uint16, uint32, error) { 222 | return dl.writeRecord(encodePutRecord(key, value), recordTypePut) 223 | } 224 | 225 | func (dl *datalog) sync() error { 226 | return dl.curSeg.Sync() 227 | } 228 | 229 | func (dl *datalog) close() error { 230 | for _, seg := range dl.segments { 231 | if seg == nil { 232 | continue 233 | } 234 | if err := seg.Close(); err != nil { 235 | return err 236 | } 237 | metaName := seg.name + metaExt 238 | if err := writeGobFile(dl.opts.FileSystem, metaName, seg.meta); err != nil { 239 | return err 240 | } 241 | } 242 | return nil 243 | } 244 | 245 | // segmentsBySequenceID returns segments ordered from oldest to newest.
246 | func (dl *datalog) segmentsBySequenceID() []*segment { 247 | var segments []*segment 248 | 249 | for _, seg := range dl.segments { 250 | if seg == nil { 251 | continue 252 | } 253 | segments = append(segments, seg) 254 | } 255 | 256 | sort.SliceStable(segments, func(i, j int) bool { 257 | return segments[i].sequenceID < segments[j].sequenceID 258 | }) 259 | 260 | return segments 261 | } 262 | -------------------------------------------------------------------------------- /recovery_test.go: -------------------------------------------------------------------------------- 1 | package pogreb 2 | 3 | import ( 4 | "path/filepath" 5 | "testing" 6 | 7 | "github.com/akrylysov/pogreb/internal/assert" 8 | ) 9 | 10 | func TestRecovery(t *testing.T) { 11 | segPath := filepath.Join(testDBName, segmentName(0, 1)) 12 | testCases := []struct { 13 | name string 14 | fn func() error 15 | }{ 16 | { 17 | name: "all zeroes", 18 | fn: func() error { 19 | return appendFile(segPath, make([]byte, 128)) 20 | }, 21 | }, 22 | { 23 | name: "partial kv size", 24 | fn: func() error { 25 | return appendFile(segPath, []byte{1}) 26 | }, 27 | }, 28 | { 29 | name: "only kv size", 30 | fn: func() error { 31 | return appendFile(segPath, []byte{1, 0, 1, 0, 0, 0}) 32 | }, 33 | }, 34 | { 35 | name: "kv size and key", 36 | fn: func() error { 37 | return appendFile(segPath, []byte{1, 0, 1, 0, 0, 0, 1}) 38 | }, 39 | }, 40 | { 41 | name: "kv size, key, value", 42 | fn: func() error { 43 | return appendFile(segPath, []byte{1, 0, 1, 0, 0, 0, 1, 1}) 44 | }, 45 | }, 46 | { 47 | name: "kv size, key, value, partial crc32", 48 | fn: func() error { 49 | return appendFile(segPath, []byte{1, 0, 1, 0, 0, 0, 1, 1, 40}) 50 | }, 51 | }, 52 | { 53 | name: "kv size, key, value, invalid crc32", 54 | fn: func() error { 55 | return appendFile(segPath, []byte{1, 0, 1, 0, 0, 0, 1, 1, 40, 19, 197, 0}) 56 | }, 57 | }, 58 | { 59 | name: "corrupted and not corrupted record", 60 | fn: func() error { 61 | if err := appendFile(segPath, []byte{1, 0, 1, 0, 0, 0, 1, 1, 40, 19, 197, 0}); err != nil { 62 | return err 63 | } 64 | return appendFile(segPath, []byte{1, 0, 1, 0, 0, 0, 1, 1, 133, 13, 200, 12}) 65 | }, 66 | }, 67 | } 68 | 69 | for _, testCase := range testCases { 70 | t.Run(testCase.name, func(t *testing.T) { 71 | opts := &Options{FileSystem: testFS} 72 | db, err := createTestDB(opts) 73 | assert.Nil(t, err) 74 | // Fill segment 0. 75 | var i uint8 76 | for i = 0; i < 128; i++ { 77 | assert.Nil(t, db.Put([]byte{i}, []byte{i})) 78 | } 79 | assert.Equal(t, uint32(128), db.Count()) 80 | assert.Nil(t, db.Close()) 81 | 82 | // Simulate crash. 
83 | assert.Nil(t, touchFile(testFS, filepath.Join(testDBName, lockName))) 84 | 85 | assert.Nil(t, testCase.fn()) 86 | 87 | db, err = Open(testDBName, opts) 88 | assert.Nil(t, err) 89 | assert.Equal(t, uint32(128), db.Count()) 90 | assert.Nil(t, db.Close()) 91 | 92 | db, err = Open(testDBName, opts) 93 | assert.Nil(t, err) 94 | assert.Equal(t, uint32(128), db.Count()) 95 | for i = 0; i < 128; i++ { 96 | v, err := db.Get([]byte{i}) 97 | assert.Nil(t, err) 98 | assert.Equal(t, []byte{i}, v) 99 | } 100 | assert.Nil(t, db.Close()) 101 | }) 102 | } 103 | } 104 | 105 | func TestRecoveryDelete(t *testing.T) { 106 | opts := &Options{FileSystem: testFS} 107 | db, err := createTestDB(opts) 108 | assert.Nil(t, err) 109 | assert.Nil(t, db.Put([]byte{1}, []byte{1})) 110 | assert.Nil(t, db.Put([]byte{2}, []byte{2})) 111 | assert.Nil(t, db.Delete([]byte{1})) 112 | assert.Equal(t, uint32(1), db.Count()) 113 | assert.Nil(t, db.Close()) 114 | 115 | // Simulate crash. 116 | assert.Nil(t, touchFile(testFS, filepath.Join(testDBName, lockName))) 117 | 118 | db, err = Open(testDBName, opts) 119 | assert.Nil(t, err) 120 | 121 | assert.Equal(t, uint32(1), db.Count()) 122 | 123 | assert.Nil(t, db.Close()) 124 | } 125 | 126 | func TestRecoveryCompaction(t *testing.T) { 127 | opts := &Options{ 128 | FileSystem: testFS, 129 | maxSegmentSize: 1024, 130 | compactionMinSegmentSize: 512, 131 | compactionMinFragmentation: 0.2, 132 | } 133 | 134 | db, err := createTestDB(opts) 135 | assert.Nil(t, err) 136 | 137 | // Fill file 0. 138 | for i := 0; i < 41; i++ { 139 | assert.Nil(t, db.Put([]byte{0}, []byte{0})) 140 | } 141 | assert.Nil(t, db.Put([]byte{1}, []byte{1})) 142 | 143 | // Write to file 1. 144 | assert.Nil(t, db.Put([]byte{0}, []byte{0})) 145 | assert.Nil(t, db.Put([]byte{0}, []byte{0})) 146 | 147 | assert.Equal(t, &segmentMeta{Full: true, PutRecords: 42, DeletedKeys: 41, DeletedBytes: 492}, db.datalog.segments[0].meta) 148 | assert.Equal(t, &segmentMeta{PutRecords: 2, DeletedKeys: 1, DeletedBytes: 12}, db.datalog.segments[1].meta) 149 | 150 | cm, err := db.Compact() 151 | assert.Nil(t, err) 152 | assert.Equal(t, CompactionResult{CompactedSegments: 1, ReclaimedRecords: 41, ReclaimedBytes: 492}, cm) 153 | assert.Nil(t, db.datalog.segments[0]) // Items were moved from file 0 to file 1. 154 | assert.Equal(t, &segmentMeta{PutRecords: 3, DeletedKeys: 1, DeletedBytes: 12}, db.datalog.segments[1].meta) 155 | 156 | // Fill file 1. 157 | for i := 0; i < 40; i++ { 158 | assert.Nil(t, db.Put([]byte{1}, []byte{2})) 159 | } 160 | 161 | // Fill file 0. 162 | for i := 0; i < 42; i++ { 163 | assert.Nil(t, db.Put([]byte{1}, []byte{2})) 164 | } 165 | // Write to file 2. 166 | assert.Nil(t, db.Put([]byte{0}, []byte{0})) 167 | 168 | assert.Equal(t, &segmentMeta{Full: true, PutRecords: 42, DeletedKeys: 42, DeletedBytes: 504}, db.datalog.segments[0].meta) 169 | assert.Equal(t, &segmentMeta{Full: true, PutRecords: 42, DeletedKeys: 42, DeletedBytes: 504}, db.datalog.segments[1].meta) 170 | assert.Equal(t, &segmentMeta{PutRecords: 2}, db.datalog.segments[2].meta) 171 | 172 | v, err := db.Get([]byte{1}) 173 | assert.Nil(t, err) 174 | assert.Equal(t, []byte{2}, v) 175 | 176 | assert.Equal(t, uint32(2), db.Count()) 177 | 178 | assert.Nil(t, db.Close()) 179 | 180 | // Simulate crash. 
181 | assert.Nil(t, touchFile(testFS, filepath.Join(testDBName, lockName))) 182 | 183 | db, err = Open(testDBName, opts) 184 | assert.Nil(t, err) 185 | 186 | assert.Equal(t, uint32(2), db.Count()) 187 | 188 | v, err = db.Get([]byte{1}) 189 | assert.Nil(t, err) 190 | assert.Equal(t, []byte{2}, v) 191 | 192 | assert.Nil(t, db.Close()) 193 | } 194 | 195 | func TestRecoveryIterator(t *testing.T) { 196 | db, err := createTestDB(nil) 197 | assert.Nil(t, err) 198 | 199 | listRecords := func() []record { 200 | var records []record 201 | it := newRecoveryIterator(db.datalog.segmentsBySequenceID()) 202 | for { 203 | rec, err := it.next() 204 | if err == ErrIterationDone { 205 | break 206 | } 207 | assert.Nil(t, err) 208 | records = append(records, rec) 209 | } 210 | return records 211 | } 212 | 213 | assert.Equal(t, 0, len(listRecords())) 214 | 215 | if err := db.Put([]byte{1}, []byte{1}); err != nil { 216 | t.Fatal(err) 217 | } 218 | assert.Equal(t, 219 | []record{ 220 | {recordTypePut, 0, 512, []byte{1, 0, 1, 0, 0, 0, 1, 1, 133, 13, 200, 12}, []byte{1}, []byte{1}}, 221 | }, 222 | listRecords(), 223 | ) 224 | 225 | if err := db.Put([]byte{1}, []byte{1}); err != nil { 226 | t.Fatal(err) 227 | } 228 | assert.Equal(t, 229 | []record{ 230 | {recordTypePut, 0, 512, []byte{1, 0, 1, 0, 0, 0, 1, 1, 133, 13, 200, 12}, []byte{1}, []byte{1}}, 231 | {recordTypePut, 0, 524, []byte{1, 0, 1, 0, 0, 0, 1, 1, 133, 13, 200, 12}, []byte{1}, []byte{1}}, 232 | }, 233 | listRecords(), 234 | ) 235 | 236 | if err := db.Put([]byte{2}, []byte{2}); err != nil { 237 | t.Fatal(err) 238 | } 239 | assert.Equal(t, 240 | []record{ 241 | {recordTypePut, 0, 512, []byte{1, 0, 1, 0, 0, 0, 1, 1, 133, 13, 200, 12}, []byte{1}, []byte{1}}, 242 | {recordTypePut, 0, 524, []byte{1, 0, 1, 0, 0, 0, 1, 1, 133, 13, 200, 12}, []byte{1}, []byte{1}}, 243 | {recordTypePut, 0, 536, []byte{1, 0, 1, 0, 0, 0, 2, 2, 252, 15, 236, 190}, []byte{2}, []byte{2}}, 244 | }, 245 | listRecords(), 246 | ) 247 | 248 | assert.Nil(t, db.Close()) 249 | } 250 | -------------------------------------------------------------------------------- /docs/design.md: -------------------------------------------------------------------------------- 1 | - Date: 2020-02-02 2 | - Authors: Artem Krylysov 3 | 4 | # About 5 | 6 | This document is a new version of the initial Pogreb design 7 | [blog post](https://artem.krylysov.com/blog/2018/03/24/pogreb-key-value-store/) published in 2018. 8 | 9 | The new version replaces the unstructured data file for storing key-value pairs with a write-ahead log to achieve 10 | durability. 11 | 12 | # Overview 13 | 14 | Pogreb is an embedded key-value store for read-heavy workloads. 15 | It aims to provide fast point lookups by indexing keys in an on-disk hash table. 16 | 17 | # Design 18 | 19 | Two key components of Pogreb are a write-ahead log (WAL) and a hash table index. 20 | The WAL stores key-value pairs on disk in append-only files. 21 | The on-disk hash table allows constant-time lookups from keys to key-value pairs in the WAL. 22 | 23 | ## Write-ahead log 24 | 25 | The WAL consists of multiple append-only segments. Once the current segment file is full (reaches 4 GB), a new segment 26 | is created and the full segment becomes read-only. 27 | 28 | ``` 29 | Write-ahead log 30 | +-----------+-----------+-...-+-----------+ 31 | | Segment 0 | Segment 1 | ... | Segment N | 32 | +-----------+-----------+-...-+-----------+ 33 | ``` 34 | 35 | ### Segment 36 | 37 | A segment is a sequence of variable-length binary-encoded records.
38 | 39 | ``` 40 | Segment 41 | +----------+----------+-...-+----------+ 42 | | Record 0 | Record 1 | ... | Record N | 43 | +----------+----------+-...-+----------+ 44 | ``` 45 | 46 | The record layout: 47 | 48 | ``` 49 | Record 50 | +---------------+------------------+------------------+-...-+--...--+----------+ 51 | | Key Size (2B) | Record Type (1b) | Value Size (31b) | Key | Value | CRC (4B) | 52 | +---------------+------------------+------------------+-...-+--...--+----------+ 53 | ``` 54 | 55 | The Record Type field is either `Put` (0) or `Delete` (1). 56 | 57 | ## Hash table index 58 | 59 | Pogreb uses two files to store the hash table on disk - "main" and "overflow" index files. 60 | 61 | Each index file holds an array of buckets. 62 | 63 | ``` 64 | Index 65 | +----------+----------+-...-+----------+ 66 | | Bucket 0 | Bucket 1 | ... | Bucket N | 67 | +----------+----------+-...-+----------+ 68 | ``` 69 | 70 | ### Bucket 71 | 72 | A bucket is an array of slots followed by an optional file pointer to the overflow bucket (stored in the "overflow" 73 | index). 74 | The number of slots in a bucket is 31 - the maximum number of slots that fits in 512 75 | bytes. 76 | 77 | ``` 78 | Bucket 79 | +--------+--------+-...-+--------+-----------------------------+ 80 | | Slot 0 | Slot 1 | ... | Slot N | Overflow Bucket Offset (8B) | 81 | +--------+--------+-...-+--------+-----------------------------+ 82 | ``` 83 | 84 | ### Slot 85 | 86 | A slot contains the hash, the segment ID, the key size, the value size and a 32-bit offset of the key-value pair in the WAL. 87 | 88 | ``` 89 | Slot 90 | +-----------+-----------------+---------------+-----------------+-------------+ 91 | | Hash (4B) | Segment ID (2B) | Key Size (2B) | Value Size (4B) | Offset (4B) | 92 | +-----------+-----------------+---------------+-----------------+-------------+ 93 | ``` 94 | 95 | ## Linear hashing 96 | 97 | Pogreb uses the [Linear hashing](https://en.wikipedia.org/wiki/Linear_hashing) algorithm, which grows the hash table 98 | one bucket at a time instead of rebuilding it entirely. 99 | 100 | Initially, the hash table contains a single bucket (*N=1*). 101 | 102 | Level *L* (initially *L=0*) represents, on a logarithmic scale, the maximum number of buckets the hash table can store. 103 | For example, a hash table with *L=0* contains between 0 and 1 buckets; *L=3* contains between 4 and 8 buckets. 104 | 105 | *S* is the index of the "split" bucket (initially *S=0*). 106 | 107 | Collisions are resolved using the bucket chaining technique. 108 | The "overflow" index file stores overflow buckets that form a linked list. 109 | 110 | ### Lookup 111 | 112 | The position of a bucket in the index file is calculated by applying a hash function to the key: 113 | 114 | ``` 115 | Index 116 | +----------+ 117 | | Bucket 0 | Bucket 118 | +----------+ +--------+--------+-...-+--------+ 119 | h(key) -> | Bucket 1 | -> | Slot 0 | Slot 1 | ... | Slot N | 120 | +-........-+ +--------+--------+-...-+--------+ 121 | | ........ | | 122 | +-........-+ | 123 | | Bucket N | | 124 | +----------+ | 125 | v 126 | Write-ahead log 127 | +-----------+-----------+-...-+-----------+ 128 | | Segment 0 | Segment 1 | ... | Segment N | 129 | +-----------+-----------+-...-+-----------+ 130 | ``` 131 | 132 | To get the position of the bucket: 133 | 134 | 1. Hash the key (Pogreb uses the 32-bit version of MurmurHash3). 135 | 2. Use the *L* least significant bits of the hash to get the position of the bucket - `hash % math.Pow(2, L)`. 136 | 3.
Set the position to `hash % math.Pow(2, L+1)` if the previously calculated position comes before the 137 | split bucket *S*. 138 | 139 | The lookup function reads a bucket at the given position from the index file and performs a linear search to find a slot 140 | with the required hash. 141 | If the bucket doesn't contain a slot with the required hash, but the pointer to the overflow bucket is non-zero, the 142 | overflow bucket is inspected. 143 | The process continues until the required slot is found or there are no more overflow buckets for the given key. 144 | Once a slot with the required key is found, Pogreb reads the key-value pair from the WAL. 145 | 146 | The average lookup requires two I/O operations - one to find a slot in the index and another to read the key 147 | and value from the WAL. A Go sketch of the bucket position computation appears in the appendix at the end of this document. 148 | 149 | ### Insertion 150 | 151 | Insertion is performed by adding a new "put" record to the WAL and updating a bucket in the index. 152 | If the bucket has all of its slots occupied, a new overflow bucket is created. 153 | 154 | ### Split 155 | 156 | When the number of items in the hash table exceeds the load factor threshold (70%), the split operation is performed on 157 | the split bucket *S*: 158 | 159 | 1. Allocate a new bucket at the end of the index file. 160 | 2. Increment the split bucket index *S*. 161 | 3. Increment *L* and reset *S* to 0 if *S* points to 2^L. 162 | 4. Divide items from the old split bucket between the newly allocated bucket and the old split bucket by 163 | recalculating the positions of the keys in the hash table. 164 | 5. Increment the number of buckets *N*. 165 | 166 | ### Removal 167 | 168 | The removal operation looks up a bucket by key, removes a slot from the bucket, overwrites the bucket in the index 169 | and then appends a new "delete" record to the WAL. 170 | 171 | ## Compaction 172 | 173 | Since the WAL is append-only, the disk space occupied by overwritten or deleted keys is not reclaimed immediately. 174 | Pogreb supports optional online compaction. 175 | 176 | Every time a key is overwritten or deleted, Pogreb increments the number of "deleted" bytes and keys for the 177 | corresponding WAL segment. 178 | The background compaction thread periodically loops through the WAL segment metadata and picks segments with 50% or 179 | higher disk space fragmentation for compaction. 180 | The compaction thread finds the segment's live records (not deleted or overwritten) by looking up keys in the index. 181 | It writes live records to a new segment file and updates the corresponding slots in the index file. 182 | After the compaction is successfully finished, the compacted segment files are removed. 183 | 184 | ## Recovery 185 | 186 | In the event of a crash caused by a power loss or an operating system failure, Pogreb discards the index and replays the 187 | WAL, building a new index from scratch. 188 | Segments are iterated from the oldest to the newest and items are inserted into the index. 189 | 190 | # Limitations 191 | 192 | The design choices made to optimize for point lookups bring limitations for other potential use-cases. For example, using a hash table for indexing makes range scans impossible. Additionally, having a single hash table shared across all WAL segments makes the recovery process require rebuilding the entire index, which may be impractical for large databases.
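193 | 194 | # Appendix: bucket position computation 195 | 196 | The following is a minimal, non-normative Go sketch of the bucket position computation described in the 197 | "Lookup" section. It mirrors the `bucketIndex` function in `index.go`; the function name `bucketPosition` is 198 | chosen here for illustration, with `level` corresponding to *L* and `splitBucketIdx` to *S*. 199 | 200 | ``` 201 | // bucketPosition returns the position of the bucket for the given key hash. 202 | func bucketPosition(hash uint32, level uint8, splitBucketIdx uint32) uint32 { 203 | // hash % 2^L: keep the L least significant bits of the hash. 204 | bidx := hash & ((1 << level) - 1) 205 | if bidx < splitBucketIdx { 206 | // The bucket at this position was already split; address it modulo 2^(L+1) instead. 207 | return hash & ((1 << (level + 1)) - 1) 208 | } 209 | return bidx 210 | } 211 | ``` 212 | 213 | For example, with *L=1* and *S=1*, a hash of 6 (binary `110`) first maps to bucket 0; since 0 comes before the 214 | split bucket, the position is recomputed as `6 % 4`, which gives bucket 2.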
-------------------------------------------------------------------------------- /fs/fs_test.go: -------------------------------------------------------------------------------- 1 | package fs 2 | 3 | import ( 4 | "io" 5 | "os" 6 | "testing" 7 | 8 | "github.com/akrylysov/pogreb/internal/assert" 9 | ) 10 | 11 | const ( 12 | lockTestPath = "test.lock" 13 | ) 14 | 15 | var ( 16 | lockTestMode = os.FileMode(0666) 17 | ) 18 | 19 | func testLockFile(t *testing.T, fs FileSystem) { 20 | _ = fs.Remove(lockTestPath) 21 | lock, acquiredExisting, err := fs.CreateLockFile(lockTestPath, lockTestMode) 22 | if lock == nil || acquiredExisting || err != nil { 23 | t.Fatal(lock, err, acquiredExisting) 24 | } 25 | lock2, acquiredExisting2, err2 := fs.CreateLockFile(lockTestPath, lockTestMode) 26 | if lock2 != nil || acquiredExisting2 || err2 != os.ErrExist { 27 | t.Fatal(lock2, acquiredExisting2, err2) 28 | } 29 | 30 | err = lock.Unlock() 31 | assert.Nil(t, err) 32 | 33 | _, err = fs.Stat(lockTestPath) 34 | assert.NotNil(t, err) 35 | } 36 | 37 | func touchFile(fs FileSystem, path string) error { 38 | f, err := fs.OpenFile(path, os.O_CREATE|os.O_RDWR|os.O_TRUNC, os.FileMode(0666)) 39 | if err != nil { 40 | return err 41 | } 42 | return f.Close() 43 | } 44 | 45 | func testLockFileAcquireExisting(t *testing.T, fs FileSystem) { 46 | err := touchFile(fs, lockTestPath) 47 | assert.Nil(t, err) 48 | 49 | lock, acquiredExisting, err := fs.CreateLockFile(lockTestPath, lockTestMode) 50 | if lock == nil || !acquiredExisting || err != nil { 51 | t.Fatal(lock, err, acquiredExisting) 52 | } 53 | 54 | err = lock.Unlock() 55 | assert.Nil(t, err) 56 | 57 | _, err = fs.Stat(lockTestPath) 58 | assert.NotNil(t, err) 59 | } 60 | 61 | func testFS(t *testing.T, fsys FileSystem) { 62 | f, err := fsys.OpenFile("test", os.O_CREATE|os.O_RDWR|os.O_TRUNC, os.FileMode(0666)) 63 | assert.Nil(t, err) 64 | 65 | buf := make([]byte, 10) 66 | 67 | t.Run("Empty file", func(t *testing.T) { 68 | off, err := f.Seek(0, io.SeekCurrent) 69 | assert.Nil(t, err) 70 | assert.Equal(t, int64(0), off) 71 | 72 | n, err := f.Read(buf) 73 | assert.Equal(t, 0, n) 74 | assert.Equal(t, io.EOF, err) 75 | 76 | n, err = f.ReadAt(buf, 0) 77 | assert.Equal(t, 0, n) 78 | assert.Equal(t, io.EOF, err) 79 | 80 | n, err = f.ReadAt(buf, 10) 81 | assert.Equal(t, 0, n) 82 | assert.Equal(t, io.EOF, err) 83 | 84 | b, err := f.Slice(1, 10) 85 | assert.Equal(t, io.EOF, err) 86 | assert.Nil(t, b) 87 | }) 88 | 89 | testData := []byte{0, 1, 2, 3, 4, 5, 6, 7, 8, 9} 90 | 91 | t.Run("Write", func(t *testing.T) { 92 | n, err := f.Write(testData[:9]) 93 | assert.Nil(t, err) 94 | assert.Equal(t, 9, n) 95 | 96 | off, err := f.Seek(0, io.SeekCurrent) 97 | assert.Nil(t, err) 98 | assert.Equal(t, int64(9), off) 99 | }) 100 | 101 | t.Run("Write beyond EOF", func(t *testing.T) { 102 | off, err := f.Seek(2, io.SeekStart) 103 | assert.Nil(t, err) 104 | assert.Equal(t, int64(2), off) 105 | 106 | n, err := f.Write(testData[2:]) 107 | assert.Nil(t, err) 108 | assert.Equal(t, 8, n) 109 | 110 | off, err = f.Seek(0, io.SeekCurrent) 111 | assert.Nil(t, err) 112 | assert.Equal(t, int64(10), off) 113 | }) 114 | 115 | t.Run("Slice", func(t *testing.T) { 116 | b, err := f.Slice(1, 9) 117 | assert.Nil(t, err) 118 | assert.Equal(t, testData[1:9], b) 119 | 120 | b, err = f.Slice(0, 10) 121 | assert.Nil(t, err) 122 | assert.Equal(t, testData, b) 123 | 124 | // Offset larger than mapping. 
125 | b, err = f.Slice(0, 12) 126 | assert.Equal(t, io.EOF, err) 127 | assert.Nil(t, b) 128 | }) 129 | 130 | t.Run("WriteAt", func(t *testing.T) { 131 | n, err := f.WriteAt(testData[1:4], 1) 132 | assert.Nil(t, err) 133 | assert.Equal(t, 3, n) 134 | 135 | // WriteAt doesn't move offset. 136 | off, err := f.Seek(0, io.SeekCurrent) 137 | assert.Nil(t, err) 138 | assert.Equal(t, int64(10), off) 139 | }) 140 | 141 | t.Run("Sync", func(t *testing.T) { 142 | // Not tested yet, just make sure it doesn't return an error. 143 | assert.Nil(t, f.Sync()) 144 | }) 145 | 146 | t.Run("Stat", func(t *testing.T) { 147 | fi, err := f.Stat() 148 | assert.Nil(t, err) 149 | assert.Equal(t, "test", fi.Name()) 150 | assert.Equal(t, int64(len(testData)), fi.Size()) 151 | assert.Equal(t, false, fi.IsDir()) 152 | // FIXME: not implemented for all file systems. 153 | // assert.Equal(t, os.FileMode(0666), fi.Mode()) 154 | _ = fi.Mode() 155 | _ = fi.ModTime() 156 | _ = fi.Sys() 157 | 158 | // File doesn't exist. 159 | _, err = fsys.Stat("foobar") 160 | assert.NotNil(t, err) 161 | }) 162 | 163 | t.Run("ReadAt", func(t *testing.T) { 164 | n, err := f.ReadAt(buf, 0) 165 | assert.Nil(t, err) 166 | assert.Equal(t, len(testData), n) 167 | assert.Equal(t, testData, buf) 168 | }) 169 | 170 | t.Run("Read EOF", func(t *testing.T) { 171 | n, err := f.Read(buf) 172 | assert.Equal(t, io.EOF, err) 173 | assert.Equal(t, 0, n) 174 | }) 175 | 176 | t.Run("Read", func(t *testing.T) { 177 | // SeekEnd and Read 178 | off, err := f.Seek(0, io.SeekEnd) 179 | assert.Nil(t, err) 180 | assert.Equal(t, int64(len(testData)), off) 181 | 182 | n, err := f.Read(buf) 183 | assert.Equal(t, io.EOF, err) 184 | assert.Equal(t, 0, n) 185 | 186 | // SeekStart and Read 187 | off, err = f.Seek(0, io.SeekStart) 188 | assert.Nil(t, err) 189 | assert.Equal(t, int64(0), off) 190 | 191 | n, err = f.Read(buf) 192 | assert.Nil(t, err) 193 | assert.Equal(t, len(testData), n) 194 | assert.Equal(t, testData, buf) 195 | 196 | off, err = f.Seek(0, io.SeekCurrent) 197 | assert.Equal(t, int64(n), off) 198 | assert.Nil(t, err) 199 | 200 | // SeekStart 2 and Read 201 | testOff := int64(2) 202 | lbuf := make([]byte, 8) 203 | off, err = f.Seek(testOff, io.SeekStart) 204 | assert.Nil(t, err) 205 | assert.Equal(t, testOff, off) 206 | 207 | n, err = f.Read(lbuf) 208 | assert.Nil(t, err) 209 | assert.Equal(t, len(testData)-int(testOff), n) 210 | assert.Equal(t, testData[testOff:], lbuf) 211 | }) 212 | 213 | t.Run("Read larger than file", func(t *testing.T) { 214 | off, err := f.Seek(0, io.SeekStart) 215 | assert.Nil(t, err) 216 | assert.Equal(t, int64(0), off) 217 | 218 | lbuf := make([]byte, 4096) 219 | n, err := f.Read(lbuf) 220 | assert.Nil(t, err) 221 | assert.Equal(t, len(testData), n) 222 | assert.Equal(t, testData, lbuf[:n]) 223 | 224 | n, err = f.Read(lbuf) 225 | assert.Equal(t, io.EOF, err) 226 | assert.Equal(t, 0, n) 227 | }) 228 | 229 | t.Run("Close and Open again", func(t *testing.T) { 230 | assert.Nil(t, f.Close()) 231 | 232 | f, err = fsys.OpenFile("test", os.O_RDWR, os.FileMode(0666)) 233 | assert.Nil(t, err) 234 | 235 | b, err := f.Slice(1, 10) 236 | assert.Nil(t, err) 237 | assert.Equal(t, testData[1:], b) 238 | }) 239 | 240 | t.Run("Truncate extend", func(t *testing.T) { 241 | err := f.Truncate(11) 242 | assert.Nil(t, err) 243 | 244 | lbuf := make([]byte, 11) 245 | n, err := f.ReadAt(lbuf, 0) 246 | assert.Nil(t, err) 247 | assert.Equal(t, 11, n) 248 | assert.Equal(t, testData, lbuf[:10]) 249 | 250 | b, err := f.Slice(0, 11) 251 | assert.Nil(t, err) 252 | 
assert.Equal(t, testData, b[:10]) 253 | 254 | fi, err := f.Stat() 255 | assert.Nil(t, err) 256 | assert.Equal(t, int64(11), fi.Size()) 257 | }) 258 | 259 | t.Run("Truncate shrink", func(t *testing.T) { 260 | err := f.Truncate(1) 261 | assert.Nil(t, err) 262 | 263 | lbuf := make([]byte, 1) 264 | n, err := f.ReadAt(lbuf, 0) 265 | assert.Nil(t, err) 266 | assert.Equal(t, 1, n) 267 | assert.Equal(t, testData[:1], lbuf) 268 | 269 | b, err := f.Slice(0, 1) 270 | assert.Nil(t, err) 271 | assert.Equal(t, testData[:1], b) 272 | 273 | b, err = f.Slice(0, 10) 274 | assert.Equal(t, io.EOF, err) 275 | assert.Nil(t, b) 276 | 277 | fi, err := f.Stat() 278 | assert.Nil(t, err) 279 | assert.Equal(t, int64(1), fi.Size()) 280 | }) 281 | 282 | t.Run("Truncate shrink to zero", func(t *testing.T) { 283 | err := f.Truncate(0) 284 | assert.Nil(t, err) 285 | 286 | n, err := f.ReadAt(buf, 0) 287 | assert.Equal(t, io.EOF, err) 288 | assert.Equal(t, 0, n) 289 | 290 | b, err := f.Slice(0, 1) 291 | assert.Equal(t, io.EOF, err) 292 | assert.Nil(t, b) 293 | 294 | fi, err := f.Stat() 295 | assert.Nil(t, err) 296 | assert.Equal(t, int64(0), fi.Size()) 297 | }) 298 | 299 | t.Run("Close", func(t *testing.T) { 300 | assert.Nil(t, f.Close()) 301 | 302 | err := f.Close() 303 | assert.NotNil(t, err) 304 | 305 | _, err = f.Seek(1, io.SeekStart) 306 | assert.NotNil(t, err) 307 | }) 308 | 309 | t.Run("Rename", func(t *testing.T) { 310 | err := fsys.Rename("foobar", "baz") 311 | assert.NotNil(t, err) 312 | 313 | assert.Nil(t, fsys.Rename("test", "test2")) 314 | fi, err := fsys.Stat("test2") 315 | assert.Nil(t, err) 316 | assert.Equal(t, int64(0), fi.Size()) 317 | assert.Equal(t, "test2", fi.Name()) 318 | }) 319 | 320 | t.Run("ReadDir", func(t *testing.T) { 321 | fis, err := fsys.ReadDir(".") 322 | assert.Nil(t, err) 323 | 324 | var hasTestFile bool 325 | for _, fi := range fis { 326 | if fi.Name() == "test2" { 327 | hasTestFile = true 328 | } 329 | } 330 | assert.Equal(t, true, hasTestFile) 331 | }) 332 | 333 | t.Run("Remove", func(t *testing.T) { 334 | err := fsys.Remove("test2") 335 | assert.Nil(t, err) 336 | 337 | _, err = fsys.Stat("test2") 338 | assert.NotNil(t, err) 339 | }) 340 | } 341 | -------------------------------------------------------------------------------- /index.go: -------------------------------------------------------------------------------- 1 | package pogreb 2 | 3 | import ( 4 | "github.com/akrylysov/pogreb/internal/errors" 5 | ) 6 | 7 | const ( 8 | indexExt = ".pix" 9 | indexMainName = "main" + indexExt 10 | indexOverflowName = "overflow" + indexExt 11 | indexMetaName = "index" + metaExt 12 | loadFactor = 0.7 13 | ) 14 | 15 | // index is an on-disk linear hashing hash table. 16 | // It uses two files to store the hash table on disk - "main" and "overflow" index files. 17 | // Each index file holds an array of buckets. 18 | type index struct { 19 | opts *Options 20 | main *file // Main index file. 21 | overflow *file // Overflow index file. 22 | freeBucketOffs []int64 // Offsets of freed buckets. 23 | level uint8 // Maximum number of buckets on a logarithmic scale. 24 | numKeys uint32 // Number of keys. 25 | numBuckets uint32 // Number of buckets. 26 | splitBucketIdx uint32 // Index of the bucket to split on next split. 27 | } 28 | 29 | type indexMeta struct { 30 | Level uint8 31 | NumKeys uint32 32 | NumBuckets uint32 33 | SplitBucketIndex uint32 34 | FreeOverflowBuckets []int64 35 | } 36 | 37 | // matchKeyFunc returns whether the slot matches the key sought. 
38 | type matchKeyFunc func(slot) (bool, error) 39 | 40 | func openIndex(opts *Options) (*index, error) { 41 | main, err := openFile(opts.FileSystem, indexMainName, openFileFlags{}) 42 | if err != nil { 43 | return nil, errors.Wrap(err, "opening main index") 44 | } 45 | overflow, err := openFile(opts.FileSystem, indexOverflowName, openFileFlags{}) 46 | if err != nil { 47 | _ = main.Close() 48 | return nil, errors.Wrap(err, "opening overflow index") 49 | } 50 | idx := &index{ 51 | opts: opts, 52 | main: main, 53 | overflow: overflow, 54 | numBuckets: 1, 55 | } 56 | if main.empty() { 57 | // Add an empty bucket. 58 | if _, err = idx.main.extend(bucketSize); err != nil { 59 | _ = main.Close() 60 | _ = overflow.Close() 61 | return nil, err 62 | } 63 | } else if err := idx.readMeta(); err != nil { 64 | _ = main.Close() 65 | _ = overflow.Close() 66 | return nil, errors.Wrap(err, "opening index meta") 67 | } 68 | return idx, nil 69 | } 70 | 71 | func (idx *index) writeMeta() error { 72 | m := indexMeta{ 73 | Level: idx.level, 74 | NumKeys: idx.numKeys, 75 | NumBuckets: idx.numBuckets, 76 | SplitBucketIndex: idx.splitBucketIdx, 77 | FreeOverflowBuckets: idx.freeBucketOffs, 78 | } 79 | return writeGobFile(idx.opts.FileSystem, indexMetaName, m) 80 | } 81 | 82 | func (idx *index) readMeta() error { 83 | m := indexMeta{} 84 | if err := readGobFile(idx.opts.FileSystem, indexMetaName, &m); err != nil { 85 | return err 86 | } 87 | idx.level = m.Level 88 | idx.numKeys = m.NumKeys 89 | idx.numBuckets = m.NumBuckets 90 | idx.splitBucketIdx = m.SplitBucketIndex 91 | idx.freeBucketOffs = m.FreeOverflowBuckets 92 | return nil 93 | } 94 | 95 | func (idx *index) bucketIndex(hash uint32) uint32 { 96 | bidx := hash & ((1 << idx.level) - 1) 97 | if bidx < idx.splitBucketIdx { 98 | return hash & ((1 << (idx.level + 1)) - 1) 99 | } 100 | return bidx 101 | } 102 | 103 | type bucketIterator struct { 104 | off int64 // Offset of the next bucket. 105 | f *file // Current index file. 106 | overflow *file // Overflow index file. 107 | } 108 | 109 | // bucketOffset returns on-disk bucket offset by the bucket index. 110 | func bucketOffset(idx uint32) int64 { 111 | return int64(headerSize) + (int64(bucketSize) * int64(idx)) 112 | } 113 | 114 | func (idx *index) newBucketIterator(startBucketIdx uint32) *bucketIterator { 115 | return &bucketIterator{ 116 | off: bucketOffset(startBucketIdx), 117 | f: idx.main, 118 | overflow: idx.overflow, 119 | } 120 | } 121 | 122 | func (it *bucketIterator) next() (bucketHandle, error) { 123 | if it.off == 0 { 124 | return bucketHandle{}, ErrIterationDone 125 | } 126 | b := bucketHandle{file: it.f, offset: it.off} 127 | if err := b.read(); err != nil { 128 | return bucketHandle{}, err 129 | } 130 | it.f = it.overflow 131 | it.off = b.next 132 | return b, nil 133 | } 134 | 135 | func (idx *index) get(hash uint32, matchKey matchKeyFunc) error { 136 | it := idx.newBucketIterator(idx.bucketIndex(hash)) 137 | for { 138 | b, err := it.next() 139 | if err == ErrIterationDone { 140 | return nil 141 | } 142 | if err != nil { 143 | return err 144 | } 145 | for i := 0; i < slotsPerBucket; i++ { 146 | sl := b.slots[i] 147 | // No more slots in the bucket. 
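// (Buckets keep their occupied slots packed at the front, so the first slot with a zero offset ends the scan.)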
148 | if sl.offset == 0 { 149 | break 150 | } 151 | if hash != sl.hash { 152 | continue 153 | } 154 | if match, err := matchKey(sl); match || err != nil { 155 | return err 156 | } 157 | } 158 | } 159 | } 160 | 161 | func (idx *index) findInsertionBucket(newSlot slot, matchKey matchKeyFunc) (*slotWriter, bool, error) { 162 | sw := &slotWriter{} 163 | it := idx.newBucketIterator(idx.bucketIndex(newSlot.hash)) 164 | for { 165 | b, err := it.next() 166 | if err == ErrIterationDone { 167 | return nil, false, errors.New("failed to insert a new slot") 168 | } 169 | if err != nil { 170 | return nil, false, err 171 | } 172 | sw.bucket = &b 173 | var i int 174 | for i = 0; i < slotsPerBucket; i++ { 175 | sl := b.slots[i] 176 | if sl.offset == 0 { 177 | // Found an empty slot. 178 | sw.slotIdx = i 179 | return sw, false, nil 180 | } 181 | if newSlot.hash != sl.hash { 182 | continue 183 | } 184 | match, err := matchKey(sl) 185 | if err != nil { 186 | return nil, false, err 187 | } 188 | if match { 189 | // Key already in the index. 190 | // The slot writer will overwrite the existing slot. 191 | sw.slotIdx = i 192 | return sw, true, nil 193 | } 194 | } 195 | if b.next == 0 { 196 | // No more buckets in the chain. 197 | sw.slotIdx = i 198 | return sw, false, nil 199 | } 200 | } 201 | } 202 | 203 | func (idx *index) put(newSlot slot, matchKey matchKeyFunc) error { 204 | if idx.numKeys == MaxKeys { 205 | return errFull 206 | } 207 | sw, overwritingExisting, err := idx.findInsertionBucket(newSlot, matchKey) 208 | if err != nil { 209 | return err 210 | } 211 | if err := sw.insert(newSlot, idx); err != nil { 212 | return err 213 | } 214 | if err := sw.write(); err != nil { 215 | return err 216 | } 217 | if overwritingExisting { 218 | return nil 219 | } 220 | idx.numKeys++ 221 | if float64(idx.numKeys)/float64(idx.numBuckets*slotsPerBucket) > loadFactor { 222 | if err := idx.split(); err != nil { 223 | return err 224 | } 225 | } 226 | return nil 227 | } 228 | 229 | func (idx *index) delete(hash uint32, matchKey matchKeyFunc) error { 230 | it := idx.newBucketIterator(idx.bucketIndex(hash)) 231 | for { 232 | b, err := it.next() 233 | if err == ErrIterationDone { 234 | return nil 235 | } 236 | if err != nil { 237 | return err 238 | } 239 | for i := 0; i < slotsPerBucket; i++ { 240 | sl := b.slots[i] 241 | if sl.offset == 0 { 242 | break 243 | } 244 | if hash != sl.hash { 245 | continue 246 | } 247 | match, err := matchKey(sl) 248 | if err != nil { 249 | return err 250 | } 251 | if !match { 252 | continue 253 | } 254 | b.del(i) 255 | if err := b.write(); err != nil { 256 | return err 257 | } 258 | idx.numKeys-- 259 | return nil 260 | } 261 | } 262 | } 263 | 264 | func (idx *index) createOverflowBucket() (*bucketHandle, error) { 265 | var off int64 266 | if len(idx.freeBucketOffs) > 0 { 267 | off = idx.freeBucketOffs[0] 268 | idx.freeBucketOffs = idx.freeBucketOffs[1:] 269 | } else { 270 | var err error 271 | off, err = idx.overflow.extend(bucketSize) 272 | if err != nil { 273 | return nil, err 274 | } 275 | } 276 | return &bucketHandle{file: idx.overflow, offset: off}, nil 277 | } 278 | 279 | func (idx *index) freeOverflowBucket(offsets ...int64) { 280 | idx.freeBucketOffs = append(idx.freeBucketOffs, offsets...) 
281 | } 282 | 283 | func (idx *index) split() error { 284 | updatedBucketIdx := idx.splitBucketIdx 285 | updatedBucketOff := bucketOffset(updatedBucketIdx) 286 | updatedBucket := slotWriter{ 287 | bucket: &bucketHandle{file: idx.main, offset: updatedBucketOff}, 288 | } 289 | 290 | newBucketOff, err := idx.main.extend(bucketSize) 291 | if err != nil { 292 | return err 293 | } 294 | 295 | sw := slotWriter{ 296 | bucket: &bucketHandle{file: idx.main, offset: newBucketOff}, 297 | } 298 | 299 | idx.splitBucketIdx++ 300 | if idx.splitBucketIdx == 1<<idx.level { 301 | idx.splitBucketIdx = 0 302 | idx.level++ 303 | } -------------------------------------------------------------------------------- /db.go: -------------------------------------------------------------------------------- 114 | if db.opts.BackgroundSyncInterval > 0 || db.opts.BackgroundCompactionInterval > 0 { 115 | db.startBackgroundWorker() 116 | } 117 | 118 | return db, nil 119 | } 120 | 121 | func cloneBytes(src []byte) []byte { 122 | dst := make([]byte, len(src)) 123 | copy(dst, src) 124 | return dst 125 | } 126 | 127 | func (db *DB) writeMeta() error { 128 | m := dbMeta{ 129 | HashSeed: db.hashSeed, 130 | } 131 | return writeGobFile(db.opts.FileSystem, dbMetaName, m) 132 | } 133 | 134 | func (db *DB) readMeta() error { 135 | m := dbMeta{} 136 | if err := readGobFile(db.opts.FileSystem, dbMetaName, &m); err != nil { 137 | return err 138 | } 139 | db.hashSeed = m.HashSeed 140 | return nil 141 | } 142 | 143 | func (db *DB) hash(data []byte) uint32 { 144 | return hash.Sum32WithSeed(data, db.hashSeed) 145 | } 146 | 147 | // newNullableTicker is a wrapper around time.NewTicker that allows creating a nil ticker. 148 | // A nil ticker never ticks. 149 | func newNullableTicker(d time.Duration) (<-chan time.Time, func()) { 150 | if d > 0 { 151 | t := time.NewTicker(d) 152 | return t.C, t.Stop 153 | } 154 | return nil, func() {} 155 | } 156 | 157 | func (db *DB) startBackgroundWorker() { 158 | ctx, cancel := context.WithCancel(context.Background()) 159 | db.cancelBgWorker = cancel 160 | db.closeWg.Add(1) 161 | 162 | go func() { 163 | defer db.closeWg.Done() 164 | 165 | syncC, syncStop := newNullableTicker(db.opts.BackgroundSyncInterval) 166 | defer syncStop() 167 | 168 | compactC, compactStop := newNullableTicker(db.opts.BackgroundCompactionInterval) 169 | defer compactStop() 170 | 171 | for { 172 | select { 173 | case <-ctx.Done(): 174 | return 175 | case <-syncC: 176 | if err := db.Sync(); err != nil { 177 | logger.Printf("error synchronizing database: %v", err) 178 | } 179 | case <-compactC: 180 | if cr, err := db.Compact(); err != nil { 181 | logger.Printf("error compacting database: %v", err) 182 | } else if cr.CompactedSegments > 0 { 183 | logger.Printf("compacted database: %+v", cr) 184 | } 185 | } 186 | } 187 | }() 188 | } 189 | 190 | // Get returns the value for the given key stored in the DB or nil if the key doesn't exist.
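//
// A minimal usage sketch (the key bytes are illustrative):
//
//	value, err := db.Get([]byte("key1"))
//	if err != nil {
//		// read or index error
//	} else if value == nil {
//		// key not found
//	}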
191 | func (db *DB) Get(key []byte) ([]byte, error) { 192 | h := db.hash(key) 193 | db.metrics.Gets.Add(1) 194 | db.mu.RLock() 195 | defer db.mu.RUnlock() 196 | var retValue []byte 197 | err := db.index.get(h, func(sl slot) (bool, error) { 198 | if uint16(len(key)) != sl.keySize { 199 | return false, nil 200 | } 201 | slKey, value, err := db.datalog.readKeyValue(sl) 202 | if err != nil { 203 | return true, err 204 | } 205 | if bytes.Equal(key, slKey) { 206 | retValue = cloneBytes(value) 207 | return true, nil 208 | } 209 | db.metrics.HashCollisions.Add(1) 210 | return false, nil 211 | }) 212 | if err != nil { 213 | return nil, err 214 | } 215 | return retValue, nil 216 | } 217 | 218 | // GetAppend appends the value for the given key to buf and returns the resulting slice, or nil if the key doesn't exist. 219 | func (db *DB) GetAppend(key, buf []byte) ([]byte, error) { 220 | h := db.hash(key) 221 | db.metrics.Gets.Add(1) 222 | db.mu.RLock() 223 | defer db.mu.RUnlock() 224 | var retValue []byte 225 | err := db.index.get(h, func(sl slot) (bool, error) { 226 | if uint16(len(key)) != sl.keySize { 227 | return false, nil 228 | } 229 | slKey, value, err := db.datalog.readKeyValue(sl) 230 | if err != nil { 231 | return true, err 232 | } 233 | if bytes.Equal(key, slKey) { 234 | retValue = append(buf, value...) 235 | return true, nil 236 | } 237 | db.metrics.HashCollisions.Add(1) 238 | return false, nil 239 | }) 240 | if err != nil { 241 | return nil, err 242 | } 243 | return retValue, nil 244 | } 245 | 246 | // Has returns true if the DB contains the given key. 247 | func (db *DB) Has(key []byte) (bool, error) { 248 | h := db.hash(key) 249 | db.metrics.Gets.Add(1) 250 | found := false 251 | db.mu.RLock() 252 | defer db.mu.RUnlock() 253 | err := db.index.get(h, func(sl slot) (bool, error) { 254 | if uint16(len(key)) != sl.keySize { 255 | return false, nil 256 | } 257 | slKey, err := db.datalog.readKey(sl) 258 | if err != nil { 259 | return true, err 260 | } 261 | if bytes.Equal(key, slKey) { 262 | found = true 263 | return true, nil 264 | } 265 | return false, nil 266 | }) 267 | if err != nil { 268 | return false, err 269 | } 270 | return found, nil 271 | } 272 | 273 | func (db *DB) put(sl slot, key []byte) error { 274 | return db.index.put(sl, func(cursl slot) (bool, error) { 275 | if uint16(len(key)) != cursl.keySize { 276 | return false, nil 277 | } 278 | slKey, err := db.datalog.readKey(cursl) 279 | if err != nil { 280 | return true, err 281 | } 282 | if bytes.Equal(key, slKey) { 283 | db.datalog.trackDel(cursl) // Overwriting existing key. 284 | return true, nil 285 | } 286 | return false, nil 287 | }) 288 | } 289 | 290 | // Put sets the value for the given key. If the key already exists, its value is updated.
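//
// Keys longer than MaxKeyLength or values longer than MaxValueLength are
// rejected. A minimal usage sketch (key and value bytes are illustrative):
//
//	if err := db.Put([]byte("key1"), []byte("value1")); err != nil {
//		// handle the write error
//	}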
291 | func (db *DB) Put(key []byte, value []byte) error { 292 | if len(key) > MaxKeyLength { 293 | return errKeyTooLarge 294 | } 295 | if len(value) > MaxValueLength { 296 | return errValueTooLarge 297 | } 298 | h := db.hash(key) 299 | db.metrics.Puts.Add(1) 300 | db.mu.Lock() 301 | defer db.mu.Unlock() 302 | 303 | segID, offset, err := db.datalog.put(key, value) 304 | if err != nil { 305 | return err 306 | } 307 | 308 | sl := slot{ 309 | hash: h, 310 | segmentID: segID, 311 | keySize: uint16(len(key)), 312 | valueSize: uint32(len(value)), 313 | offset: offset, 314 | } 315 | 316 | if err := db.put(sl, key); err != nil { 317 | return err 318 | } 319 | 320 | if db.syncWrites { 321 | return db.sync() 322 | } 323 | return nil 324 | } 325 | 326 | func (db *DB) del(h uint32, key []byte, writeWAL bool) error { 327 | err := db.index.delete(h, func(sl slot) (b bool, e error) { 328 | if uint16(len(key)) != sl.keySize { 329 | return false, nil 330 | } 331 | slKey, err := db.datalog.readKey(sl) 332 | if err != nil { 333 | return true, err 334 | } 335 | if bytes.Equal(key, slKey) { 336 | db.datalog.trackDel(sl) 337 | var err error 338 | if writeWAL { 339 | err = db.datalog.del(key) 340 | } 341 | return true, err 342 | } 343 | return false, nil 344 | }) 345 | return err 346 | } 347 | 348 | // Delete deletes the given key from the DB. 349 | func (db *DB) Delete(key []byte) error { 350 | h := db.hash(key) 351 | db.metrics.Dels.Add(1) 352 | db.mu.Lock() 353 | defer db.mu.Unlock() 354 | if err := db.del(h, key, true); err != nil { 355 | return err 356 | } 357 | if db.syncWrites { 358 | return db.sync() 359 | } 360 | return nil 361 | } 362 | 363 | // Close closes the DB. 364 | func (db *DB) Close() error { 365 | if db.cancelBgWorker != nil { 366 | db.cancelBgWorker() 367 | } 368 | db.closeWg.Wait() 369 | db.mu.Lock() 370 | defer db.mu.Unlock() 371 | if err := db.writeMeta(); err != nil { 372 | return err 373 | } 374 | if err := db.datalog.close(); err != nil { 375 | return err 376 | } 377 | if err := db.index.close(); err != nil { 378 | return err 379 | } 380 | if err := db.lock.Unlock(); err != nil { 381 | return err 382 | } 383 | return nil 384 | } 385 | 386 | func (db *DB) sync() error { 387 | return db.datalog.sync() 388 | } 389 | 390 | // Items returns a new ItemIterator. 391 | func (db *DB) Items() *ItemIterator { 392 | return &ItemIterator{db: db} 393 | } 394 | 395 | // Sync commits the contents of the database to the backing FileSystem. 396 | func (db *DB) Sync() error { 397 | db.mu.Lock() 398 | defer db.mu.Unlock() 399 | return db.sync() 400 | } 401 | 402 | // Count returns the number of keys in the DB. 403 | func (db *DB) Count() uint32 { 404 | db.mu.RLock() 405 | defer db.mu.RUnlock() 406 | return db.index.count() 407 | } 408 | 409 | // Metrics returns the DB metrics. 410 | func (db *DB) Metrics() *Metrics { 411 | return db.metrics 412 | } 413 | 414 | // FileSize returns the total size of the disk storage used by the DB. 
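//
// A minimal usage sketch:
//
//	if size, err := db.FileSize(); err == nil {
//		log.Printf("pogreb: %d bytes on disk", size)
//	}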
415 | func (db *DB) FileSize() (int64, error) { 416 | var size int64 417 | files, err := db.opts.FileSystem.ReadDir(".") 418 | if err != nil { 419 | return 0, err 420 | } 421 | for _, file := range files { 422 | info, err := file.Info() 423 | if err != nil { 424 | return 0, err 425 | } 426 | size += info.Size() 427 | } 428 | return size, nil 429 | } 430 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /db_test.go: -------------------------------------------------------------------------------- 1 | package pogreb 2 | 3 | import ( 4 | "bufio" 5 | "encoding/binary" 6 | "flag" 7 | "fmt" 8 | "io" 9 | "log" 10 | "os" 11 | "path/filepath" 12 | "strings" 13 | "testing" 14 | "time" 15 | 16 | "github.com/akrylysov/pogreb/fs" 17 | "github.com/akrylysov/pogreb/internal/assert" 18 | ) 19 | 20 | const ( 21 | testDBName = "test.db" 22 | ) 23 | 24 | var ( 25 | // File system used for all tests. 
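// TestMain points it at each implementation listed in testFileSystems; on-disk
// implementations are rooted in a fresh temporary directory, roughly:
//
//	testFS = fs.Sub(fsys, tmpDir)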
26 | testFS fs.FileSystem 27 | ) 28 | 29 | func TestMain(m *testing.M) { 30 | flag.Parse() 31 | if !testing.Verbose() { 32 | SetLogger(log.New(io.Discard, "", 0)) 33 | } 34 | // Run tests against all file systems. 35 | for _, fsys := range testFileSystems { 36 | var tmpDir string 37 | if fsys == fs.Mem { 38 | testFS = fsys 39 | } else { 40 | var err error 41 | tmpDir, err = os.MkdirTemp("", "pogreb-test") 42 | if err != nil { 43 | fmt.Printf("failed to create temporary directory: %v", err) 44 | os.Exit(1) 45 | } 46 | testFS = fs.Sub(fsys, tmpDir) 47 | } 48 | if testing.Verbose() { 49 | fmt.Printf("=== SET\tFS=%T\ttmpDir=%s\n", fsys, tmpDir) 50 | } 51 | exitCode := m.Run() 52 | if tmpDir != "" { 53 | _ = os.RemoveAll(tmpDir) 54 | } 55 | if exitCode != 0 { 56 | fmt.Printf("DEBUG\tFS=%T\n", fsys) 57 | os.Exit(exitCode) 58 | } 59 | } 60 | os.Exit(0) 61 | } 62 | 63 | func appendFile(path string, data []byte) error { 64 | f, err := testFS.OpenFile(path, os.O_RDWR, os.FileMode(0640)) 65 | if err != nil { 66 | return err 67 | } 68 | defer f.Close() 69 | if _, err = f.Seek(0, io.SeekEnd); err != nil { 70 | return err 71 | } 72 | _, err = f.Write(data) 73 | return err 74 | } 75 | 76 | func align512(n uint32) uint32 { 77 | return (n + 511) &^ 511 78 | } 79 | 80 | func TestBucketSize(t *testing.T) { 81 | serializedSize := uint32(binary.Size(bucket{})) 82 | if bucketSize != align512(serializedSize) { 83 | t.Fatal("wrong bucketSize value", bucketSize) 84 | } 85 | if bucketSize-serializedSize > 32 { 86 | t.Fatal("bucket is wasting too much space", bucketSize, serializedSize) 87 | } 88 | } 89 | 90 | func TestHeaderSize(t *testing.T) { 91 | if headerSize != align512(uint32(binary.Size(header{}))) || headerSize != 512 { 92 | t.Fatal("wrong headerSize value", headerSize) 93 | } 94 | } 95 | 96 | func cleanDir(path string) error { 97 | files, err := testFS.ReadDir(path) 98 | if err != nil { 99 | if os.IsNotExist(err) { 100 | return nil 101 | } 102 | return err 103 | } 104 | for _, file := range files { 105 | _ = testFS.Remove(filepath.Join(path, file.Name())) 106 | } 107 | return nil 108 | } 109 | 110 | func createTestDB(opts *Options) (*DB, error) { 111 | if opts == nil { 112 | opts = &Options{FileSystem: testFS} 113 | } else { 114 | if opts.FileSystem == nil { 115 | opts.FileSystem = testFS 116 | } 117 | } 118 | if err := cleanDir(testDBName); err != nil { 119 | return nil, err 120 | } 121 | return Open(testDBName, opts) 122 | } 123 | 124 | func TestEmpty(t *testing.T) { 125 | opts := &Options{FileSystem: testFS} 126 | db, err := createTestDB(opts) 127 | assert.Nil(t, err) 128 | assert.Nil(t, db.Close()) 129 | db, err = Open(testDBName, opts) 130 | assert.Nil(t, err) 131 | assert.Nil(t, db.Close()) 132 | } 133 | 134 | func TestFull(t *testing.T) { 135 | fullTest(t, func(db *DB, key []byte) ([]byte, error) { 136 | return db.Get(key) 137 | }) 138 | var buf []byte 139 | fullTest(t, func(db *DB, key []byte) ([]byte, error) { 140 | var err error 141 | buf, err = db.GetAppend(key, buf[:0]) 142 | return buf, err 143 | }) 144 | } 145 | 146 | func fullTest(t *testing.T, getFunc func(db *DB, key []byte) ([]byte, error)) { 147 | opts := &Options{ 148 | BackgroundSyncInterval: -1, 149 | FileSystem: testFS, 150 | maxSegmentSize: 1024, 151 | } 152 | db, err := createTestDB(opts) 153 | assert.Nil(t, err) 154 | var i byte 155 | var n uint8 = 255 156 | assert.Equal(t, uint32(0), db.Count()) 157 | for i = 0; i < n; i++ { 158 | if has, err := db.Has([]byte{i}); has || err != nil { 159 | t.Fatal(has, err) 160 | } 161 | } 162 | 
assert.Nil(t, db.Delete([]byte{128})) 163 | assert.Equal(t, uint32(0), db.Count()) 164 | for i = 0; i < n; i++ { 165 | assert.Nil(t, db.Put([]byte{i}, []byte{i})) 166 | } 167 | assert.Equal(t, uint32(255), db.Count()) 168 | assert.Equal(t, int64(n), db.Metrics().Puts.Value()) 169 | assert.Nil(t, db.Sync()) 170 | 171 | sz, err := db.FileSize() 172 | assert.Nil(t, err) 173 | if sz <= 0 { 174 | t.Fatal(sz) 175 | } 176 | 177 | assert.Nil(t, db.Delete([]byte{128})) 178 | assert.Equal(t, uint32(254), db.Count()) 179 | if has, err := db.Has([]byte{128}); has || err != nil { 180 | t.Fatal(has, err) 181 | } 182 | assert.Nil(t, db.Put([]byte{128}, []byte{128})) 183 | assert.Equal(t, uint32(255), db.Count()) 184 | 185 | verifyKeysAndClose := func(valueOffset uint8) { 186 | t.Helper() 187 | assert.Equal(t, uint32(255), db.Count()) 188 | for i = 0; i < n; i++ { 189 | if has, err := db.Has([]byte{i}); !has || err != nil { 190 | t.Fatal(has, err) 191 | } 192 | if has, err := db.Has([]byte{0, i}); has || err != nil { 193 | t.Fatal(has, err) 194 | } 195 | v, err := getFunc(db, []byte{i}) 196 | if err != nil { 197 | t.Fatal(err) 198 | } 199 | assert.Equal(t, []byte{i + valueOffset}, v) 200 | } 201 | assert.Nil(t, db.Close()) 202 | } 203 | 204 | expectedSegMetas := db.datalog.segmentMetas() 205 | verifyKeysAndClose(0) 206 | 207 | // Open and check again 208 | db, err = Open(testDBName, opts) 209 | assert.Nil(t, err) 210 | verifyKeysAndClose(0) 211 | 212 | // Simulate crash. 213 | assert.Nil(t, touchFile(testFS, filepath.Join(testDBName, lockName))) 214 | assert.Nil(t, testFS.Remove(filepath.Join(testDBName, segmentMetaName(0, 1)))) 215 | assert.Nil(t, testFS.Remove(filepath.Join(testDBName, indexMetaName))) 216 | 217 | // Open and check again 218 | db, err = Open(testDBName, opts) 219 | assert.Nil(t, err) 220 | verifyKeysAndClose(0) 221 | 222 | assert.Equal(t, expectedSegMetas, db.datalog.segmentMetas()) 223 | 224 | // Update all items 225 | db, err = Open(testDBName, opts) 226 | assert.Nil(t, err) 227 | for i = 0; i < n; i++ { 228 | assert.Nil(t, db.Put([]byte{i}, []byte{i + 6})) 229 | } 230 | verifyKeysAndClose(6) 231 | 232 | // Delete all items 233 | db, err = Open(testDBName, &Options{BackgroundSyncInterval: time.Millisecond, FileSystem: testFS}) 234 | assert.Nil(t, err) 235 | for i = 0; i < n; i++ { 236 | assert.Nil(t, db.Delete([]byte{i})) 237 | } 238 | for i = 0; i < n; i++ { 239 | if has, err := db.Has([]byte{i}); has || err != nil { 240 | t.Fatal(has, err) 241 | } 242 | } 243 | assert.Equal(t, uint32(0), db.Count()) 244 | assert.Nil(t, db.Close()) 245 | } 246 | 247 | func TestLock(t *testing.T) { 248 | opts := &Options{FileSystem: testFS} 249 | db, err := createTestDB(opts) 250 | assert.Nil(t, err) 251 | 252 | // Opening already opened database returns an error. 253 | db2, err2 := Open(testDBName, opts) 254 | assert.Nil(t, db2) 255 | assert.NotNil(t, err2) 256 | 257 | assert.Nil(t, db.Close()) 258 | } 259 | 260 | func TestEmptyKey(t *testing.T) { 261 | db, err := createTestDB(nil) 262 | assert.Nil(t, err) 263 | if err := db.Put([]byte{}, []byte{1}); err != nil { 264 | t.Fatal(err) 265 | } 266 | v, err := db.Get([]byte{}) 267 | assert.Nil(t, err) 268 | assert.Equal(t, []byte{1}, v) 269 | assert.Nil(t, db.Close()) 270 | } 271 | 272 | func TestEmptyValue(t *testing.T) { 273 | db, err := createTestDB(nil) 274 | assert.Nil(t, err) 275 | // Returns a nil value if key not found. 
276 | if v, err := db.Get([]byte{1}); err != nil || v != nil { 277 | t.Fatal(err) 278 | } 279 | err = db.Put([]byte{1}, []byte{}) 280 | assert.Nil(t, err) 281 | // Returns an empty slice if value is empty. 282 | if v, err := db.Get([]byte{1}); err != nil || v == nil || len(v) != 0 { 283 | t.Fatal(err) 284 | } 285 | assert.Nil(t, db.Close()) 286 | } 287 | 288 | func TestEmptyKeyValue(t *testing.T) { 289 | db, err := createTestDB(nil) 290 | assert.Nil(t, err) 291 | assert.Nil(t, db.Put([]byte{}, []byte{})) 292 | v, err := db.Get([]byte{}) 293 | assert.Nil(t, err) 294 | assert.Equal(t, []byte{}, v) 295 | assert.Nil(t, db.Close()) 296 | } 297 | 298 | func TestDataRecycle(t *testing.T) { 299 | db, err := createTestDB(nil) 300 | assert.Nil(t, err) 301 | assert.Nil(t, db.Put([]byte{1}, []byte{8})) 302 | v, err := db.Get([]byte{1}) 303 | assert.Nil(t, err) 304 | assert.Equal(t, []byte{8}, v) 305 | err = db.Delete([]byte{1}) 306 | assert.Nil(t, err) 307 | err = db.Put([]byte{1}, []byte{9}) 308 | assert.Nil(t, err) 309 | assert.Equal(t, []byte{8}, v) 310 | assert.Nil(t, db.Close()) 311 | } 312 | 313 | func TestClose(t *testing.T) { 314 | db, err := createTestDB(nil) 315 | assert.Nil(t, err) 316 | assert.Nil(t, db.Close()) 317 | _, err = db.Get([]byte{1}) 318 | assert.NotNil(t, err) 319 | assert.NotNil(t, db.Close()) 320 | } 321 | 322 | func TestCorruptedIndex(t *testing.T) { 323 | opts := &Options{FileSystem: testFS} 324 | db, err := createTestDB(opts) 325 | assert.Nil(t, err) 326 | assert.Nil(t, db.Close()) 327 | 328 | f, err := testFS.OpenFile(filepath.Join(testDBName, indexMetaName), os.O_RDWR, 0) 329 | assert.Nil(t, err) 330 | _, err = f.Write([]byte("corrupted")) 331 | assert.Nil(t, err) 332 | assert.Nil(t, f.Close()) 333 | 334 | db, err = Open(testDBName, opts) 335 | assert.Nil(t, db) 336 | assert.NotNil(t, err) 337 | } 338 | 339 | func TestFileError(t *testing.T) { 340 | db, err := createTestDB(nil) 341 | assert.Nil(t, err) 342 | assert.Nil(t, db.Put(nil, nil)) 343 | 344 | errf := &errfile{} 345 | 346 | testDB := func(t *testing.T) { 347 | v, err := db.Get(nil) 348 | assert.Nil(t, v) 349 | assert.Equal(t, errfileError, err) 350 | 351 | assert.Equal(t, errfileError, db.Put(nil, nil)) 352 | assert.Equal(t, errfileError, db.Delete(nil)) 353 | 354 | has, err := db.Has(nil) 355 | assert.Equal(t, false, has) 356 | assert.Equal(t, errfileError, err) 357 | 358 | it := db.Items() 359 | k, v, err := it.Next() 360 | assert.Nil(t, k) 361 | assert.Nil(t, v) 362 | assert.Equal(t, errfileError, err) 363 | } 364 | 365 | t.Run("segment error", func(t *testing.T) { 366 | oldf := db.datalog.segments[0].File 367 | db.datalog.segments[0].File = errf 368 | 369 | testDB(t) 370 | 371 | assert.Equal(t, errfileError, db.Close()) 372 | 373 | db.datalog.segments[0].File = oldf 374 | }) 375 | 376 | t.Run("index error", func(t *testing.T) { 377 | oldf := db.index.main.File 378 | db.index.main.File = errf 379 | 380 | testDB(t) 381 | assert.Equal(t, errfileError, db.index.close()) 382 | 383 | db.index.main.File = oldf 384 | }) 385 | 386 | errfs := &errfs{} 387 | oldfs := db.opts.FileSystem 388 | db.opts.FileSystem = errfs 389 | assert.Equal(t, errfileError, db.Close()) 390 | assert.Equal(t, errfileError, db.index.close()) 391 | db.opts.FileSystem = oldfs 392 | 393 | assert.Nil(t, db.Close()) 394 | } 395 | 396 | func TestFSError(t *testing.T) { 397 | db, err := createTestDB(&Options{FileSystem: &errfs{}}) 398 | assert.Nil(t, db) 399 | assert.NotNil(t, err) 400 | } 401 | 402 | func TestWordsDict(t *testing.T) { 403 | if 
testFS != fs.Mem { 404 | t.Skip() 405 | } 406 | fwords, err := os.Open("/usr/share/dict/words") 407 | if err != nil { 408 | t.Skip("words file not found") 409 | } 410 | defer fwords.Close() 411 | db, err := createTestDB(nil) 412 | assert.Nil(t, err) 413 | scanner := bufio.NewScanner(fwords) 414 | items := make(map[string]string) 415 | for scanner.Scan() { 416 | k := scanner.Text() 417 | v := strings.ToUpper(k) 418 | items[k] = v 419 | assert.Nil(t, db.Put([]byte(k), []byte(v))) 420 | } 421 | assert.Nil(t, scanner.Err()) 422 | for k, v := range items { 423 | v2, err := db.Get([]byte(k)) 424 | if string(v2) != v { 425 | t.Fatalf("expected %v; got value=%v, err=%v for key %v", v, string(v2), err, k) 426 | } 427 | } 428 | assert.Nil(t, db.Close()) 429 | } 430 | 431 | func BenchmarkPut(b *testing.B) { 432 | db, err := createTestDB(nil) 433 | assert.Nil(b, err) 434 | b.ResetTimer() 435 | k := []byte{1} 436 | for i := 0; i < b.N; i++ { 437 | if err := db.Put(k, k); err != nil { 438 | b.Fail() 439 | } 440 | } 441 | assert.Nil(b, db.Close()) 442 | } 443 | 444 | func BenchmarkGet(b *testing.B) { 445 | db, err := createTestDB(nil) 446 | assert.Nil(b, err) 447 | k := []byte{1} 448 | if err := db.Put(k, make([]byte, 1024)); err != nil { 449 | b.Fail() 450 | } 451 | b.ResetTimer() 452 | b.ReportAllocs() 453 | for i := 0; i < b.N; i++ { 454 | if _, err := db.Get(k); err != nil { 455 | b.Fatal() 456 | } 457 | } 458 | assert.Nil(b, db.Close()) 459 | } 460 | 461 | func BenchmarkGetAppend(b *testing.B) { 462 | db, err := createTestDB(nil) 463 | assert.Nil(b, err) 464 | k := []byte{1} 465 | if err := db.Put(k, make([]byte, 1024)); err != nil { 466 | b.Fail() 467 | } 468 | b.ResetTimer() 469 | b.ReportAllocs() 470 | buf := make([]byte, 0, 1024) 471 | for i := 0; i < b.N; i++ { 472 | value, err := db.GetAppend(k, buf[:0]) 473 | if err != nil { 474 | b.Fatal() 475 | } 476 | buf = value 477 | } 478 | assert.Nil(b, db.Close()) 479 | } 480 | 481 | func BenchmarkBucket_UnmarshalBinary(b *testing.B) { 482 | testBucket := bucket{ 483 | slots: [slotsPerBucket]slot{}, 484 | } 485 | for i := 0; i < slotsPerBucket; i++ { 486 | testBucket.slots[i].hash = uint32(i) 487 | testBucket.slots[i].keySize = uint16(i + 1) 488 | testBucket.slots[i].valueSize = uint32(i + 17) 489 | } 490 | data, _ := testBucket.MarshalBinary() 491 | b.ResetTimer() 492 | for i := 0; i < b.N; i++ { 493 | tmp := bucket{} 494 | err := tmp.UnmarshalBinary(data) 495 | if err != nil { 496 | b.Fatal() 497 | } 498 | } 499 | } 500 | --------------------------------------------------------------------------------