├── .gitignore ├── LICENSE ├── Makefile ├── README.md ├── appveyor.yml ├── bolt_386.go ├── bolt_amd64.go ├── bolt_arm.go ├── bolt_arm64.go ├── bolt_linux.go ├── bolt_openbsd.go ├── bolt_ppc.go ├── bolt_ppc64.go ├── bolt_ppc64le.go ├── bolt_s390x.go ├── bolt_unix.go ├── bolt_unix_solaris.go ├── bolt_windows.go ├── boltsync_unix.go ├── bucket.go ├── bucket_test.go ├── cmd └── bolt │ ├── main.go │ └── main_test.go ├── cursor.go ├── cursor_test.go ├── db.go ├── db_test.go ├── doc.go ├── errors.go ├── freelist.go ├── freelist_test.go ├── node.go ├── node_test.go ├── page.go ├── page_test.go ├── quick_test.go ├── simulation_test.go ├── tx.go └── tx_test.go /.gitignore: -------------------------------------------------------------------------------- 1 | *.prof 2 | *.test 3 | *.swp 4 | /bin/ 5 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2013 Ben Johnson 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of 6 | this software and associated documentation files (the "Software"), to deal in 7 | the Software without restriction, including without limitation the rights to 8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 9 | the Software, and to permit persons to whom the Software is furnished to do so, 10 | subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 17 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 18 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 19 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 20 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 21 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | BRANCH=`git rev-parse --abbrev-ref HEAD` 2 | COMMIT=`git rev-parse --short HEAD` 3 | GOLDFLAGS="-X main.branch $(BRANCH) -X main.commit $(COMMIT)" 4 | 5 | default: build 6 | 7 | race: 8 | @go test -v -race -test.run="TestSimulate_(100op|1000op)" 9 | 10 | # go get github.com/kisielk/errcheck 11 | errcheck: 12 | @errcheck -ignorepkg=bytes -ignore=os:Remove github.com/boltdb/bolt 13 | 14 | test: 15 | @go test -v -cover . 16 | @go test -v ./cmd/bolt 17 | 18 | .PHONY: fmt test 19 | -------------------------------------------------------------------------------- /appveyor.yml: -------------------------------------------------------------------------------- 1 | version: "{build}" 2 | 3 | os: Windows Server 2012 R2 4 | 5 | clone_folder: c:\gopath\src\github.com\boltdb\bolt 6 | 7 | environment: 8 | GOPATH: c:\gopath 9 | 10 | install: 11 | - echo %PATH% 12 | - echo %GOPATH% 13 | - go version 14 | - go env 15 | - go get -v -t ./... 16 | 17 | build_script: 18 | - go test -v ./... 
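The Go sources below are the library itself. As orientation before the per-platform files, here is a minimal sketch of typical usage of the public API exercised throughout this repository (the my.db path and the widgets bucket/key names are illustrative, not part of the source):

package main

import (
	"fmt"
	"log"

	"github.com/boltdb/bolt"
)

func main() {
	// Open creates the file if it doesn't exist; 0600 and nil options are
	// illustrative defaults.
	db, err := bolt.Open("my.db", 0600, nil)
	if err != nil {
		log.Fatal(err)
	}
	defer db.Close()

	// All writes go through a read-write transaction (Update).
	if err := db.Update(func(tx *bolt.Tx) error {
		b, err := tx.CreateBucketIfNotExists([]byte("widgets"))
		if err != nil {
			return err
		}
		return b.Put([]byte("foo"), []byte("bar"))
	}); err != nil {
		log.Fatal(err)
	}

	// All reads go through a read-only transaction (View).
	if err := db.View(func(tx *bolt.Tx) error {
		fmt.Printf("foo=%s\n", tx.Bucket([]byte("widgets")).Get([]byte("foo")))
		return nil
	}); err != nil {
		log.Fatal(err)
	}
}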
19 | -------------------------------------------------------------------------------- /bolt_386.go: -------------------------------------------------------------------------------- 1 | package bolt 2 | 3 | // maxMapSize represents the largest mmap size supported by Bolt. 4 | const maxMapSize = 0x7FFFFFFF // 2GB 5 | 6 | // maxAllocSize is the size used when creating array pointers. 7 | const maxAllocSize = 0xFFFFFFF 8 | 9 | // Are unaligned load/stores broken on this arch? 10 | var brokenUnaligned = false 11 | -------------------------------------------------------------------------------- /bolt_amd64.go: -------------------------------------------------------------------------------- 1 | package bolt 2 | 3 | // maxMapSize represents the largest mmap size supported by Bolt. 4 | const maxMapSize = 0xFFFFFFFFFFFF // 256TB 5 | 6 | // maxAllocSize is the size used when creating array pointers. 7 | const maxAllocSize = 0x7FFFFFFF 8 | 9 | // Are unaligned load/stores broken on this arch? 10 | var brokenUnaligned = false 11 | -------------------------------------------------------------------------------- /bolt_arm.go: -------------------------------------------------------------------------------- 1 | package bolt 2 | 3 | import "unsafe" 4 | 5 | // maxMapSize represents the largest mmap size supported by Bolt. 6 | const maxMapSize = 0x7FFFFFFF // 2GB 7 | 8 | // maxAllocSize is the size used when creating array pointers. 9 | const maxAllocSize = 0xFFFFFFF 10 | 11 | // Are unaligned load/stores broken on this arch? 12 | var brokenUnaligned bool 13 | 14 | func init() { 15 | // Simple check to see whether this arch handles unaligned load/stores 16 | // correctly. 17 | 18 | // ARM9 and older devices require load/stores to be from/to aligned 19 | // addresses. If not, the lower 2 bits are cleared and that address is 20 | // read in a jumbled up order. 21 | 22 | // See http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.faqs/ka15414.html 23 | 24 | raw := [6]byte{0xfe, 0xef, 0x11, 0x22, 0x22, 0x11} 25 | val := *(*uint32)(unsafe.Pointer(uintptr(unsafe.Pointer(&raw)) + 2)) 26 | 27 | brokenUnaligned = val != 0x11222211 28 | } 29 | -------------------------------------------------------------------------------- /bolt_arm64.go: -------------------------------------------------------------------------------- 1 | // +build arm64 2 | 3 | package bolt 4 | 5 | // maxMapSize represents the largest mmap size supported by Bolt. 6 | const maxMapSize = 0xFFFFFFFFFFFF // 256TB 7 | 8 | // maxAllocSize is the size used when creating array pointers. 9 | const maxAllocSize = 0x7FFFFFFF 10 | 11 | // Are unaligned load/stores broken on this arch? 12 | var brokenUnaligned = false 13 | -------------------------------------------------------------------------------- /bolt_linux.go: -------------------------------------------------------------------------------- 1 | package bolt 2 | 3 | import ( 4 | "syscall" 5 | ) 6 | 7 | // fdatasync flushes written data to a file descriptor. 
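// On Linux this uses fdatasync(2), which, unlike fsync(2), may skip flushing
// metadata (such as mtime) that isn't needed to read the data back, making it
// slightly cheaper on Bolt's write path.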
8 | func fdatasync(db *DB) error { 9 | return syscall.Fdatasync(int(db.file.Fd())) 10 | } 11 | -------------------------------------------------------------------------------- /bolt_openbsd.go: -------------------------------------------------------------------------------- 1 | package bolt 2 | 3 | import ( 4 | "syscall" 5 | "unsafe" 6 | ) 7 | 8 | const ( 9 | msAsync = 1 << iota // perform asynchronous writes 10 | msSync // perform synchronous writes 11 | msInvalidate // invalidate cached data 12 | ) 13 | 14 | func msync(db *DB) error { 15 | _, _, errno := syscall.Syscall(syscall.SYS_MSYNC, uintptr(unsafe.Pointer(db.data)), uintptr(db.datasz), msInvalidate) 16 | if errno != 0 { 17 | return errno 18 | } 19 | return nil 20 | } 21 | 22 | func fdatasync(db *DB) error { 23 | if db.data != nil { 24 | return msync(db) 25 | } 26 | return db.file.Sync() 27 | } 28 | -------------------------------------------------------------------------------- /bolt_ppc.go: -------------------------------------------------------------------------------- 1 | // +build ppc 2 | 3 | package bolt 4 | 5 | // maxMapSize represents the largest mmap size supported by Bolt. 6 | const maxMapSize = 0x7FFFFFFF // 2GB 7 | 8 | // maxAllocSize is the size used when creating array pointers. 9 | const maxAllocSize = 0xFFFFFFF 10 | -------------------------------------------------------------------------------- /bolt_ppc64.go: -------------------------------------------------------------------------------- 1 | // +build ppc64 2 | 3 | package bolt 4 | 5 | // maxMapSize represents the largest mmap size supported by Bolt. 6 | const maxMapSize = 0xFFFFFFFFFFFF // 256TB 7 | 8 | // maxAllocSize is the size used when creating array pointers. 9 | const maxAllocSize = 0x7FFFFFFF 10 | 11 | // Are unaligned load/stores broken on this arch? 12 | var brokenUnaligned = false 13 | -------------------------------------------------------------------------------- /bolt_ppc64le.go: -------------------------------------------------------------------------------- 1 | // +build ppc64le 2 | 3 | package bolt 4 | 5 | // maxMapSize represents the largest mmap size supported by Bolt. 6 | const maxMapSize = 0xFFFFFFFFFFFF // 256TB 7 | 8 | // maxAllocSize is the size used when creating array pointers. 9 | const maxAllocSize = 0x7FFFFFFF 10 | 11 | // Are unaligned load/stores broken on this arch? 12 | var brokenUnaligned = false 13 | -------------------------------------------------------------------------------- /bolt_s390x.go: -------------------------------------------------------------------------------- 1 | // +build s390x 2 | 3 | package bolt 4 | 5 | // maxMapSize represents the largest mmap size supported by Bolt. 6 | const maxMapSize = 0xFFFFFFFFFFFF // 256TB 7 | 8 | // maxAllocSize is the size used when creating array pointers. 9 | const maxAllocSize = 0x7FFFFFFF 10 | 11 | // Are unaligned load/stores broken on this arch? 12 | var brokenUnaligned = false 13 | -------------------------------------------------------------------------------- /bolt_unix.go: -------------------------------------------------------------------------------- 1 | // +build !windows,!plan9,!solaris 2 | 3 | package bolt 4 | 5 | import ( 6 | "fmt" 7 | "os" 8 | "syscall" 9 | "time" 10 | "unsafe" 11 | ) 12 | 13 | // flock acquires an advisory lock on a file descriptor. 14 | func flock(db *DB, mode os.FileMode, exclusive bool, timeout time.Duration) error { 15 | var t time.Time 16 | for { 17 | // If we're beyond our timeout then return an error. 
18 | // This can only occur after we've attempted a flock once. 19 | if t.IsZero() { 20 | t = time.Now() 21 | } else if timeout > 0 && time.Since(t) > timeout { 22 | return ErrTimeout 23 | } 24 | flag := syscall.LOCK_SH 25 | if exclusive { 26 | flag = syscall.LOCK_EX 27 | } 28 | 29 | // Otherwise attempt to obtain an exclusive lock. 30 | err := syscall.Flock(int(db.file.Fd()), flag|syscall.LOCK_NB) 31 | if err == nil { 32 | return nil 33 | } else if err != syscall.EWOULDBLOCK { 34 | return err 35 | } 36 | 37 | // Wait for a bit and try again. 38 | time.Sleep(50 * time.Millisecond) 39 | } 40 | } 41 | 42 | // funlock releases an advisory lock on a file descriptor. 43 | func funlock(db *DB) error { 44 | return syscall.Flock(int(db.file.Fd()), syscall.LOCK_UN) 45 | } 46 | 47 | // mmap memory maps a DB's data file. 48 | func mmap(db *DB, sz int) error { 49 | // Map the data file to memory. 50 | b, err := syscall.Mmap(int(db.file.Fd()), 0, sz, syscall.PROT_READ, syscall.MAP_SHARED|db.MmapFlags) 51 | if err != nil { 52 | return err 53 | } 54 | 55 | // Advise the kernel that the mmap is accessed randomly. 56 | if err := madvise(b, syscall.MADV_RANDOM); err != nil { 57 | return fmt.Errorf("madvise: %s", err) 58 | } 59 | 60 | // Save the original byte slice and convert to a byte array pointer. 61 | db.dataref = b 62 | db.data = (*[maxMapSize]byte)(unsafe.Pointer(&b[0])) 63 | db.datasz = sz 64 | return nil 65 | } 66 | 67 | // munmap unmaps a DB's data file from memory. 68 | func munmap(db *DB) error { 69 | // Ignore the unmap if we have no mapped data. 70 | if db.dataref == nil { 71 | return nil 72 | } 73 | 74 | // Unmap using the original byte slice. 75 | err := syscall.Munmap(db.dataref) 76 | db.dataref = nil 77 | db.data = nil 78 | db.datasz = 0 79 | return err 80 | } 81 | 82 | // NOTE: This function is copied from stdlib because it is not available on darwin. 83 | func madvise(b []byte, advice int) (err error) { 84 | _, _, e1 := syscall.Syscall(syscall.SYS_MADVISE, uintptr(unsafe.Pointer(&b[0])), uintptr(len(b)), uintptr(advice)) 85 | if e1 != 0 { 86 | err = e1 87 | } 88 | return 89 | } 90 | -------------------------------------------------------------------------------- /bolt_unix_solaris.go: -------------------------------------------------------------------------------- 1 | package bolt 2 | 3 | import ( 4 | "fmt" 5 | "os" 6 | "syscall" 7 | "time" 8 | "unsafe" 9 | 10 | "golang.org/x/sys/unix" 11 | ) 12 | 13 | // flock acquires an advisory lock on a file descriptor. 14 | func flock(db *DB, mode os.FileMode, exclusive bool, timeout time.Duration) error { 15 | var t time.Time 16 | for { 17 | // If we're beyond our timeout then return an error. 18 | // This can only occur after we've attempted a flock once. 19 | if t.IsZero() { 20 | t = time.Now() 21 | } else if timeout > 0 && time.Since(t) > timeout { 22 | return ErrTimeout 23 | } 24 | var lock syscall.Flock_t 25 | lock.Start = 0 26 | lock.Len = 0 27 | lock.Pid = 0 28 | lock.Whence = 0 29 | lock.Pid = 0 30 | if exclusive { 31 | lock.Type = syscall.F_WRLCK 32 | } else { 33 | lock.Type = syscall.F_RDLCK 34 | } 35 | err := syscall.FcntlFlock(db.file.Fd(), syscall.F_SETLK, &lock) 36 | if err == nil { 37 | return nil 38 | } else if err != syscall.EAGAIN { 39 | return err 40 | } 41 | 42 | // Wait for a bit and try again. 43 | time.Sleep(50 * time.Millisecond) 44 | } 45 | } 46 | 47 | // funlock releases an advisory lock on a file descriptor. 
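// It does so by installing an F_UNLCK lock over the same whole-file range
// (Start=0, Len=0, Whence=0) that flock locked.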
48 | func funlock(db *DB) error { 49 | var lock syscall.Flock_t 50 | lock.Start = 0 51 | lock.Len = 0 52 | lock.Type = syscall.F_UNLCK 53 | lock.Whence = 0 54 | return syscall.FcntlFlock(uintptr(db.file.Fd()), syscall.F_SETLK, &lock) 55 | } 56 | 57 | // mmap memory maps a DB's data file. 58 | func mmap(db *DB, sz int) error { 59 | // Map the data file to memory. 60 | b, err := unix.Mmap(int(db.file.Fd()), 0, sz, syscall.PROT_READ, syscall.MAP_SHARED|db.MmapFlags) 61 | if err != nil { 62 | return err 63 | } 64 | 65 | // Advise the kernel that the mmap is accessed randomly. 66 | if err := unix.Madvise(b, syscall.MADV_RANDOM); err != nil { 67 | return fmt.Errorf("madvise: %s", err) 68 | } 69 | 70 | // Save the original byte slice and convert to a byte array pointer. 71 | db.dataref = b 72 | db.data = (*[maxMapSize]byte)(unsafe.Pointer(&b[0])) 73 | db.datasz = sz 74 | return nil 75 | } 76 | 77 | // munmap unmaps a DB's data file from memory. 78 | func munmap(db *DB) error { 79 | // Ignore the unmap if we have no mapped data. 80 | if db.dataref == nil { 81 | return nil 82 | } 83 | 84 | // Unmap using the original byte slice. 85 | err := unix.Munmap(db.dataref) 86 | db.dataref = nil 87 | db.data = nil 88 | db.datasz = 0 89 | return err 90 | } 91 | -------------------------------------------------------------------------------- /bolt_windows.go: -------------------------------------------------------------------------------- 1 | package bolt 2 | 3 | import ( 4 | "fmt" 5 | "os" 6 | "syscall" 7 | "time" 8 | "unsafe" 9 | ) 10 | 11 | // LockFileEx code derived from golang build filemutex_windows.go @ v1.5.1 12 | var ( 13 | modkernel32 = syscall.NewLazyDLL("kernel32.dll") 14 | procLockFileEx = modkernel32.NewProc("LockFileEx") 15 | procUnlockFileEx = modkernel32.NewProc("UnlockFileEx") 16 | ) 17 | 18 | const ( 19 | lockExt = ".lock" 20 | 21 | // see https://msdn.microsoft.com/en-us/library/windows/desktop/aa365203(v=vs.85).aspx 22 | flagLockExclusive = 2 23 | flagLockFailImmediately = 1 24 | 25 | // see https://msdn.microsoft.com/en-us/library/windows/desktop/ms681382(v=vs.85).aspx 26 | errLockViolation syscall.Errno = 0x21 27 | ) 28 | 29 | func lockFileEx(h syscall.Handle, flags, reserved, locklow, lockhigh uint32, ol *syscall.Overlapped) (err error) { 30 | r, _, err := procLockFileEx.Call(uintptr(h), uintptr(flags), uintptr(reserved), uintptr(locklow), uintptr(lockhigh), uintptr(unsafe.Pointer(ol))) 31 | if r == 0 { 32 | return err 33 | } 34 | return nil 35 | } 36 | 37 | func unlockFileEx(h syscall.Handle, reserved, locklow, lockhigh uint32, ol *syscall.Overlapped) (err error) { 38 | r, _, err := procUnlockFileEx.Call(uintptr(h), uintptr(reserved), uintptr(locklow), uintptr(lockhigh), uintptr(unsafe.Pointer(ol)), 0) 39 | if r == 0 { 40 | return err 41 | } 42 | return nil 43 | } 44 | 45 | // fdatasync flushes written data to a file descriptor. 46 | func fdatasync(db *DB) error { 47 | return db.file.Sync() 48 | } 49 | 50 | // flock acquires an advisory lock on a file descriptor. 51 | func flock(db *DB, mode os.FileMode, exclusive bool, timeout time.Duration) error { 52 | // Create a separate lock file on windows because a process 53 | // cannot share an exclusive lock on the same file. This is 54 | // needed during Tx.WriteTo(). 55 | f, err := os.OpenFile(db.path+lockExt, os.O_CREATE, mode) 56 | if err != nil { 57 | return err 58 | } 59 | db.lockfile = f 60 | 61 | var t time.Time 62 | for { 63 | // If we're beyond our timeout then return an error. 
64 | // This can only occur after we've attempted a flock once. 65 | if t.IsZero() { 66 | t = time.Now() 67 | } else if timeout > 0 && time.Since(t) > timeout { 68 | return ErrTimeout 69 | } 70 | 71 | var flag uint32 = flagLockFailImmediately 72 | if exclusive { 73 | flag |= flagLockExclusive 74 | } 75 | 76 | err := lockFileEx(syscall.Handle(db.lockfile.Fd()), flag, 0, 1, 0, &syscall.Overlapped{}) 77 | if err == nil { 78 | return nil 79 | } else if err != errLockViolation { 80 | return err 81 | } 82 | 83 | // Wait for a bit and try again. 84 | time.Sleep(50 * time.Millisecond) 85 | } 86 | } 87 | 88 | // funlock releases an advisory lock on a file descriptor. 89 | func funlock(db *DB) error { 90 | err := unlockFileEx(syscall.Handle(db.lockfile.Fd()), 0, 1, 0, &syscall.Overlapped{}) 91 | db.lockfile.Close() 92 | os.Remove(db.path + lockExt) 93 | return err 94 | } 95 | 96 | // mmap memory maps a DB's data file. 97 | // Based on: https://github.com/edsrzf/mmap-go 98 | func mmap(db *DB, sz int) error { 99 | if !db.readOnly { 100 | // Truncate the database to the size of the mmap. 101 | if err := db.file.Truncate(int64(sz)); err != nil { 102 | return fmt.Errorf("truncate: %s", err) 103 | } 104 | } 105 | 106 | // Open a file mapping handle. 107 | sizehi := uint32(sz >> 32) 108 | sizelo := uint32(sz) & 0xffffffff 109 | h, errno := syscall.CreateFileMapping(syscall.Handle(db.file.Fd()), nil, syscall.PAGE_READONLY, sizehi, sizelo, nil) 110 | if h == 0 { 111 | return os.NewSyscallError("CreateFileMapping", errno) 112 | } 113 | 114 | // Create the memory map. 115 | addr, errno := syscall.MapViewOfFile(h, syscall.FILE_MAP_READ, 0, 0, uintptr(sz)) 116 | if addr == 0 { 117 | return os.NewSyscallError("MapViewOfFile", errno) 118 | } 119 | 120 | // Close mapping handle. 121 | if err := syscall.CloseHandle(syscall.Handle(h)); err != nil { 122 | return os.NewSyscallError("CloseHandle", err) 123 | } 124 | 125 | // Convert to a byte array. 126 | db.data = ((*[maxMapSize]byte)(unsafe.Pointer(addr))) 127 | db.datasz = sz 128 | 129 | return nil 130 | } 131 | 132 | // munmap unmaps a pointer from a file. 133 | // Based on: https://github.com/edsrzf/mmap-go 134 | func munmap(db *DB) error { 135 | if db.data == nil { 136 | return nil 137 | } 138 | 139 | addr := (uintptr)(unsafe.Pointer(&db.data[0])) 140 | if err := syscall.UnmapViewOfFile(addr); err != nil { 141 | return os.NewSyscallError("UnmapViewOfFile", err) 142 | } 143 | return nil 144 | } 145 | -------------------------------------------------------------------------------- /boltsync_unix.go: -------------------------------------------------------------------------------- 1 | // +build !windows,!plan9,!linux,!openbsd 2 | 3 | package bolt 4 | 5 | // fdatasync flushes written data to a file descriptor. 6 | func fdatasync(db *DB) error { 7 | return db.file.Sync() 8 | } 9 | -------------------------------------------------------------------------------- /bucket.go: -------------------------------------------------------------------------------- 1 | package bolt 2 | 3 | import ( 4 | "bytes" 5 | "fmt" 6 | "unsafe" 7 | ) 8 | 9 | const ( 10 | // MaxKeySize is the maximum length of a key, in bytes. 11 | MaxKeySize = 32768 12 | 13 | // MaxValueSize is the maximum length of a value, in bytes.
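// ((1 << 31) - 2 bytes, i.e. just under 2GB. Put compares len(value) against
// this limit as an int64, so the check is safe even on 32-bit platforms.)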
14 | MaxValueSize = (1 << 31) - 2 15 | ) 16 | 17 | const ( 18 | maxUint = ^uint(0) 19 | minUint = 0 20 | maxInt = int(^uint(0) >> 1) 21 | minInt = -maxInt - 1 22 | ) 23 | 24 | const bucketHeaderSize = int(unsafe.Sizeof(bucket{})) 25 | 26 | const ( 27 | minFillPercent = 0.1 28 | maxFillPercent = 1.0 29 | ) 30 | 31 | // DefaultFillPercent is the percentage that split pages are filled. 32 | // This value can be changed by setting Bucket.FillPercent. 33 | const DefaultFillPercent = 0.5 34 | 35 | // Bucket represents a collection of key/value pairs inside the database. 36 | type Bucket struct { 37 | *bucket 38 | tx *Tx // the associated transaction 39 | buckets map[string]*Bucket // subbucket cache 40 | page *page // inline page reference 41 | rootNode *node // materialized node for the root page. 42 | nodes map[pgid]*node // node cache 43 | 44 | // Sets the threshold for filling nodes when they split. By default, 45 | // the bucket will fill to 50% but it can be useful to increase this 46 | // amount if you know that your write workloads are mostly append-only. 47 | // 48 | // This is non-persisted across transactions so it must be set in every Tx. 49 | FillPercent float64 50 | } 51 | 52 | // bucket represents the on-file representation of a bucket. 53 | // This is stored as the "value" of a bucket key. If the bucket is small enough, 54 | // then its root page can be stored inline in the "value", after the bucket 55 | // header. In the case of inline buckets, the "root" will be 0. 56 | type bucket struct { 57 | root pgid // page id of the bucket's root-level page 58 | sequence uint64 // monotonically incrementing, used by NextSequence() 59 | } 60 | 61 | // newBucket returns a new bucket associated with a transaction. 62 | func newBucket(tx *Tx) Bucket { 63 | var b = Bucket{tx: tx, FillPercent: DefaultFillPercent} 64 | if tx.writable { 65 | b.buckets = make(map[string]*Bucket) 66 | b.nodes = make(map[pgid]*node) 67 | } 68 | return b 69 | } 70 | 71 | // Tx returns the tx of the bucket. 72 | func (b *Bucket) Tx() *Tx { 73 | return b.tx 74 | } 75 | 76 | // Root returns the root of the bucket. 77 | func (b *Bucket) Root() pgid { 78 | return b.root 79 | } 80 | 81 | // Writable returns whether the bucket is writable. 82 | func (b *Bucket) Writable() bool { 83 | return b.tx.writable 84 | } 85 | 86 | // Cursor creates a cursor associated with the bucket. 87 | // The cursor is only valid as long as the transaction is open. 88 | // Do not use a cursor after the transaction is closed. 89 | func (b *Bucket) Cursor() *Cursor { 90 | // Update transaction statistics. 91 | b.tx.stats.CursorCount++ 92 | 93 | // Allocate and return a cursor. 94 | return &Cursor{ 95 | bucket: b, 96 | stack: make([]elemRef, 0), 97 | } 98 | } 99 | 100 | // Bucket retrieves a nested bucket by name. 101 | // Returns nil if the bucket does not exist. 102 | // The bucket instance is only valid for the lifetime of the transaction. 103 | func (b *Bucket) Bucket(name []byte) *Bucket { 104 | if b.buckets != nil { 105 | if child := b.buckets[string(name)]; child != nil { 106 | return child 107 | } 108 | } 109 | 110 | // Move cursor to key. 111 | c := b.Cursor() 112 | k, v, flags := c.seek(name) 113 | 114 | // Return nil if the key doesn't exist or it is not a bucket. 115 | if !bytes.Equal(name, k) || (flags&bucketLeafFlag) == 0 { 116 | return nil 117 | } 118 | 119 | // Otherwise create a bucket and cache it. 
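// The cache is only populated for writable transactions; read-only
// transactions have a nil b.buckets map (see newBucket).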
120 | var child = b.openBucket(v) 121 | if b.buckets != nil { 122 | b.buckets[string(name)] = child 123 | } 124 | 125 | return child 126 | } 127 | 128 | // Helper method that re-interprets a sub-bucket value 129 | // from a parent into a Bucket. 130 | func (b *Bucket) openBucket(value []byte) *Bucket { 131 | var child = newBucket(b.tx) 132 | 133 | // If unaligned load/stores are broken on this arch and value is 134 | // unaligned simply clone to an aligned byte array. 135 | unaligned := brokenUnaligned && uintptr(unsafe.Pointer(&value[0]))&3 != 0 136 | 137 | if unaligned { 138 | value = cloneBytes(value) 139 | } 140 | 141 | // If this is a writable transaction then we need to copy the bucket entry. 142 | // Read-only transactions can point directly at the mmap entry. 143 | if b.tx.writable && !unaligned { 144 | child.bucket = &bucket{} 145 | *child.bucket = *(*bucket)(unsafe.Pointer(&value[0])) 146 | } else { 147 | child.bucket = (*bucket)(unsafe.Pointer(&value[0])) 148 | } 149 | 150 | // Save a reference to the inline page if the bucket is inline. 151 | if child.root == 0 { 152 | child.page = (*page)(unsafe.Pointer(&value[bucketHeaderSize])) 153 | } 154 | 155 | return &child 156 | } 157 | 158 | // CreateBucket creates a new bucket at the given key and returns the new bucket. 159 | // Returns an error if the key already exists, if the bucket name is blank, or if the bucket name is too long. 160 | // The bucket instance is only valid for the lifetime of the transaction. 161 | func (b *Bucket) CreateBucket(key []byte) (*Bucket, error) { 162 | if b.tx.db == nil { 163 | return nil, ErrTxClosed 164 | } else if !b.tx.writable { 165 | return nil, ErrTxNotWritable 166 | } else if len(key) == 0 { 167 | return nil, ErrBucketNameRequired 168 | } 169 | 170 | // Move cursor to correct position. 171 | c := b.Cursor() 172 | k, _, flags := c.seek(key) 173 | 174 | // Return an error if there is an existing key. 175 | if bytes.Equal(key, k) { 176 | if (flags & bucketLeafFlag) != 0 { 177 | return nil, ErrBucketExists 178 | } 179 | return nil, ErrIncompatibleValue 180 | } 181 | 182 | // Create empty, inline bucket. 183 | var bucket = Bucket{ 184 | bucket: &bucket{}, 185 | rootNode: &node{isLeaf: true}, 186 | FillPercent: DefaultFillPercent, 187 | } 188 | var value = bucket.write() 189 | 190 | // Insert into node. 191 | key = cloneBytes(key) 192 | c.node().put(key, key, value, 0, bucketLeafFlag) 193 | 194 | // Since subbuckets are not allowed on inline buckets, we need to 195 | // dereference the inline page, if it exists. This will cause the bucket 196 | // to be treated as a regular, non-inline bucket for the rest of the tx. 197 | b.page = nil 198 | 199 | return b.Bucket(key), nil 200 | } 201 | 202 | // CreateBucketIfNotExists creates a new bucket if it doesn't already exist and returns a reference to it. 203 | // Returns an error if the bucket name is blank, or if the bucket name is too long. 204 | // The bucket instance is only valid for the lifetime of the transaction. 205 | func (b *Bucket) CreateBucketIfNotExists(key []byte) (*Bucket, error) { 206 | child, err := b.CreateBucket(key) 207 | if err == ErrBucketExists { 208 | return b.Bucket(key), nil 209 | } else if err != nil { 210 | return nil, err 211 | } 212 | return child, nil 213 | } 214 | 215 | // DeleteBucket deletes a bucket at the given key. 216 | // Returns an error if the bucket does not exist, or if the key represents a non-bucket value.
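// All keys and nested buckets contained in the deleted bucket are removed
// and their pages are released back to the freelist.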
217 | func (b *Bucket) DeleteBucket(key []byte) error { 218 | if b.tx.db == nil { 219 | return ErrTxClosed 220 | } else if !b.Writable() { 221 | return ErrTxNotWritable 222 | } 223 | 224 | // Move cursor to correct position. 225 | c := b.Cursor() 226 | k, _, flags := c.seek(key) 227 | 228 | // Return an error if the bucket doesn't exist or the key is not a bucket. 229 | if !bytes.Equal(key, k) { 230 | return ErrBucketNotFound 231 | } else if (flags & bucketLeafFlag) == 0 { 232 | return ErrIncompatibleValue 233 | } 234 | 235 | // Recursively delete all child buckets. 236 | child := b.Bucket(key) 237 | err := child.ForEach(func(k, v []byte) error { 238 | if v == nil { 239 | if err := child.DeleteBucket(k); err != nil { 240 | return fmt.Errorf("delete bucket: %s", err) 241 | } 242 | } 243 | return nil 244 | }) 245 | if err != nil { 246 | return err 247 | } 248 | 249 | // Remove cached copy. 250 | delete(b.buckets, string(key)) 251 | 252 | // Release all bucket pages to freelist. 253 | child.nodes = nil 254 | child.rootNode = nil 255 | child.free() 256 | 257 | // Delete the node if we have a matching key. 258 | c.node().del(key) 259 | 260 | return nil 261 | } 262 | 263 | // Get retrieves the value for a key in the bucket. 264 | // Returns a nil value if the key does not exist or if the key is a nested bucket. 265 | // The returned value is only valid for the life of the transaction. 266 | func (b *Bucket) Get(key []byte) []byte { 267 | k, v, flags := b.Cursor().seek(key) 268 | 269 | // Return nil if this is a bucket. 270 | if (flags & bucketLeafFlag) != 0 { 271 | return nil 272 | } 273 | 274 | // If our target node isn't the same key as what's passed in then return nil. 275 | if !bytes.Equal(key, k) { 276 | return nil 277 | } 278 | return v 279 | } 280 | 281 | // Put sets the value for a key in the bucket. 282 | // If the key exists then its previous value will be overwritten. 283 | // Supplied value must remain valid for the life of the transaction. 284 | // Returns an error if the bucket was created from a read-only transaction, if the key is blank, if the key is too large, or if the value is too large. 285 | func (b *Bucket) Put(key []byte, value []byte) error { 286 | if b.tx.db == nil { 287 | return ErrTxClosed 288 | } else if !b.Writable() { 289 | return ErrTxNotWritable 290 | } else if len(key) == 0 { 291 | return ErrKeyRequired 292 | } else if len(key) > MaxKeySize { 293 | return ErrKeyTooLarge 294 | } else if int64(len(value)) > MaxValueSize { 295 | return ErrValueTooLarge 296 | } 297 | 298 | // Move cursor to correct position. 299 | c := b.Cursor() 300 | k, _, flags := c.seek(key) 301 | 302 | // Return an error if there is an existing key with a bucket value. 303 | if bytes.Equal(key, k) && (flags&bucketLeafFlag) != 0 { 304 | return ErrIncompatibleValue 305 | } 306 | 307 | // Insert into node. 308 | key = cloneBytes(key) 309 | c.node().put(key, key, value, 0, 0) 310 | 311 | return nil 312 | } 313 | 314 | // Delete removes a key from the bucket. 315 | // If the key does not exist then nothing is done and a nil error is returned. 316 | // Returns an error if the bucket was created from a read-only transaction. 317 | func (b *Bucket) Delete(key []byte) error { 318 | if b.tx.db == nil { 319 | return ErrTxClosed 320 | } else if !b.Writable() { 321 | return ErrTxNotWritable 322 | } 323 | 324 | // Move cursor to correct position. 325 | c := b.Cursor() 326 | _, _, flags := c.seek(key) 327 | 328 | // Return an error if there is an existing bucket value.
329 | if (flags & bucketLeafFlag) != 0 { 330 | return ErrIncompatibleValue 331 | } 332 | 333 | // Delete the node if we have a matching key. 334 | c.node().del(key) 335 | 336 | return nil 337 | } 338 | 339 | // Sequence returns the current integer for the bucket without incrementing it. 340 | func (b *Bucket) Sequence() uint64 { return b.bucket.sequence } 341 | 342 | // SetSequence updates the sequence number for the bucket. 343 | func (b *Bucket) SetSequence(v uint64) error { 344 | if b.tx.db == nil { 345 | return ErrTxClosed 346 | } else if !b.Writable() { 347 | return ErrTxNotWritable 348 | } 349 | 350 | // Materialize the root node if it hasn't been already so that the 351 | // bucket will be saved during commit. 352 | if b.rootNode == nil { 353 | _ = b.node(b.root, nil) 354 | } 355 | 356 | // Set the sequence. 357 | b.bucket.sequence = v 358 | return nil 359 | } 360 | 361 | // NextSequence returns an autoincrementing integer for the bucket. 362 | func (b *Bucket) NextSequence() (uint64, error) { 363 | if b.tx.db == nil { 364 | return 0, ErrTxClosed 365 | } else if !b.Writable() { 366 | return 0, ErrTxNotWritable 367 | } 368 | 369 | // Materialize the root node if it hasn't been already so that the 370 | // bucket will be saved during commit. 371 | if b.rootNode == nil { 372 | _ = b.node(b.root, nil) 373 | } 374 | 375 | // Increment and return the sequence. 376 | b.bucket.sequence++ 377 | return b.bucket.sequence, nil 378 | } 379 | 380 | // ForEach executes a function for each key/value pair in a bucket. 381 | // If the provided function returns an error then the iteration is stopped and 382 | // the error is returned to the caller. The provided function must not modify 383 | // the bucket; this will result in undefined behavior. 384 | func (b *Bucket) ForEach(fn func(k, v []byte) error) error { 385 | if b.tx.db == nil { 386 | return ErrTxClosed 387 | } 388 | c := b.Cursor() 389 | for k, v := c.First(); k != nil; k, v = c.Next() { 390 | if err := fn(k, v); err != nil { 391 | return err 392 | } 393 | } 394 | return nil 395 | } 396 | 397 | // Stats returns stats on a bucket. 398 | func (b *Bucket) Stats() BucketStats { 399 | var s, subStats BucketStats 400 | pageSize := b.tx.db.pageSize 401 | s.BucketN += 1 402 | if b.root == 0 { 403 | s.InlineBucketN += 1 404 | } 405 | b.forEachPage(func(p *page, depth int) { 406 | if (p.flags & leafPageFlag) != 0 { 407 | s.KeyN += int(p.count) 408 | 409 | // used totals the used bytes for the page 410 | used := pageHeaderSize 411 | 412 | if p.count != 0 { 413 | // If page has any elements, add all element headers. 414 | used += leafPageElementSize * int(p.count-1) 415 | 416 | // Add all element key, value sizes. 417 | // The computation takes advantage of the fact that the position 418 | // of the last element's key/value equals the total of the sizes 419 | // of all previous elements' keys and values. 420 | // It also includes the last element's header. 421 | lastElement := p.leafPageElement(p.count - 1) 422 | used += int(lastElement.pos + lastElement.ksize + lastElement.vsize) 423 | } 424 | 425 | if b.root == 0 { 426 | // For inlined bucket just update the inline stats 427 | s.InlineBucketInuse += used 428 | } else { 429 | // For non-inlined bucket update all the leaf stats 430 | s.LeafPageN++ 431 | s.LeafInuse += used 432 | s.LeafOverflowN += int(p.overflow) 433 | 434 | // Collect stats from sub-buckets. 435 | // Do that by iterating over all element headers 436 | // looking for the ones with the bucketLeafFlag.
437 | for i := uint16(0); i < p.count; i++ { 438 | e := p.leafPageElement(i) 439 | if (e.flags & bucketLeafFlag) != 0 { 440 | // For any bucket element, open the element value 441 | // and recursively call Stats on the contained bucket. 442 | subStats.Add(b.openBucket(e.value()).Stats()) 443 | } 444 | } 445 | } 446 | } else if (p.flags & branchPageFlag) != 0 { 447 | s.BranchPageN++ 448 | lastElement := p.branchPageElement(p.count - 1) 449 | 450 | // used totals the used bytes for the page 451 | // Add header and all element headers. 452 | used := pageHeaderSize + (branchPageElementSize * int(p.count-1)) 453 | 454 | // Add size of all keys and values. 455 | // Again, use the fact that last element's position equals to 456 | // the total of key, value sizes of all previous elements. 457 | used += int(lastElement.pos + lastElement.ksize) 458 | s.BranchInuse += used 459 | s.BranchOverflowN += int(p.overflow) 460 | } 461 | 462 | // Keep track of maximum page depth. 463 | if depth+1 > s.Depth { 464 | s.Depth = (depth + 1) 465 | } 466 | }) 467 | 468 | // Alloc stats can be computed from page counts and pageSize. 469 | s.BranchAlloc = (s.BranchPageN + s.BranchOverflowN) * pageSize 470 | s.LeafAlloc = (s.LeafPageN + s.LeafOverflowN) * pageSize 471 | 472 | // Add the max depth of sub-buckets to get total nested depth. 473 | s.Depth += subStats.Depth 474 | // Add the stats for all sub-buckets 475 | s.Add(subStats) 476 | return s 477 | } 478 | 479 | // forEachPage iterates over every page in a bucket, including inline pages. 480 | func (b *Bucket) forEachPage(fn func(*page, int)) { 481 | // If we have an inline page then just use that. 482 | if b.page != nil { 483 | fn(b.page, 0) 484 | return 485 | } 486 | 487 | // Otherwise traverse the page hierarchy. 488 | b.tx.forEachPage(b.root, 0, fn) 489 | } 490 | 491 | // forEachPageNode iterates over every page (or node) in a bucket. 492 | // This also includes inline pages. 493 | func (b *Bucket) forEachPageNode(fn func(*page, *node, int)) { 494 | // If we have an inline page or root node then just use that. 495 | if b.page != nil { 496 | fn(b.page, nil, 0) 497 | return 498 | } 499 | b._forEachPageNode(b.root, 0, fn) 500 | } 501 | 502 | func (b *Bucket) _forEachPageNode(pgid pgid, depth int, fn func(*page, *node, int)) { 503 | var p, n = b.pageNode(pgid) 504 | 505 | // Execute function. 506 | fn(p, n, depth) 507 | 508 | // Recursively loop over children. 509 | if p != nil { 510 | if (p.flags & branchPageFlag) != 0 { 511 | for i := 0; i < int(p.count); i++ { 512 | elem := p.branchPageElement(uint16(i)) 513 | b._forEachPageNode(elem.pgid, depth+1, fn) 514 | } 515 | } 516 | } else { 517 | if !n.isLeaf { 518 | for _, inode := range n.inodes { 519 | b._forEachPageNode(inode.pgid, depth+1, fn) 520 | } 521 | } 522 | } 523 | } 524 | 525 | // spill writes all the nodes for this bucket to dirty pages. 526 | func (b *Bucket) spill() error { 527 | // Spill all child buckets first. 528 | for name, child := range b.buckets { 529 | // If the child bucket is small enough and it has no child buckets then 530 | // write it inline into the parent bucket's page. Otherwise spill it 531 | // like a normal bucket and make the parent value a pointer to the page. 532 | var value []byte 533 | if child.inlineable() { 534 | child.free() 535 | value = child.write() 536 | } else { 537 | if err := child.spill(); err != nil { 538 | return err 539 | } 540 | 541 | // Update the child bucket header in this bucket. 
542 | value = make([]byte, unsafe.Sizeof(bucket{})) 543 | var bucket = (*bucket)(unsafe.Pointer(&value[0])) 544 | *bucket = *child.bucket 545 | } 546 | 547 | // Skip writing the bucket if there are no materialized nodes. 548 | if child.rootNode == nil { 549 | continue 550 | } 551 | 552 | // Update parent node. 553 | var c = b.Cursor() 554 | k, _, flags := c.seek([]byte(name)) 555 | if !bytes.Equal([]byte(name), k) { 556 | panic(fmt.Sprintf("misplaced bucket header: %x -> %x", []byte(name), k)) 557 | } 558 | if flags&bucketLeafFlag == 0 { 559 | panic(fmt.Sprintf("unexpected bucket header flag: %x", flags)) 560 | } 561 | c.node().put([]byte(name), []byte(name), value, 0, bucketLeafFlag) 562 | } 563 | 564 | // Ignore if there's not a materialized root node. 565 | if b.rootNode == nil { 566 | return nil 567 | } 568 | 569 | // Spill nodes. 570 | if err := b.rootNode.spill(); err != nil { 571 | return err 572 | } 573 | b.rootNode = b.rootNode.root() 574 | 575 | // Update the root node for this bucket. 576 | if b.rootNode.pgid >= b.tx.meta.pgid { 577 | panic(fmt.Sprintf("pgid (%d) above high water mark (%d)", b.rootNode.pgid, b.tx.meta.pgid)) 578 | } 579 | b.root = b.rootNode.pgid 580 | 581 | return nil 582 | } 583 | 584 | // inlineable returns true if a bucket is small enough to be written inline 585 | // and if it contains no subbuckets. Otherwise returns false. 586 | func (b *Bucket) inlineable() bool { 587 | var n = b.rootNode 588 | 589 | // Bucket must only contain a single leaf node. 590 | if n == nil || !n.isLeaf { 591 | return false 592 | } 593 | 594 | // Bucket is not inlineable if it contains subbuckets or if it goes beyond 595 | // our threshold for inline bucket size. 596 | var size = pageHeaderSize 597 | for _, inode := range n.inodes { 598 | size += leafPageElementSize + len(inode.key) + len(inode.value) 599 | 600 | if inode.flags&bucketLeafFlag != 0 { 601 | return false 602 | } else if size > b.maxInlineBucketSize() { 603 | return false 604 | } 605 | } 606 | 607 | return true 608 | } 609 | 610 | // Returns the maximum total size of a bucket to make it a candidate for inlining. 611 | func (b *Bucket) maxInlineBucketSize() int { 612 | return b.tx.db.pageSize / 4 613 | } 614 | 615 | // write allocates and writes a bucket to a byte slice. 616 | func (b *Bucket) write() []byte { 617 | // Allocate the appropriate size. 618 | var n = b.rootNode 619 | var value = make([]byte, bucketHeaderSize+n.size()) 620 | 621 | // Write a bucket header. 622 | var bucket = (*bucket)(unsafe.Pointer(&value[0])) 623 | *bucket = *b.bucket 624 | 625 | // Convert byte slice to a fake page and write the root node. 626 | var p = (*page)(unsafe.Pointer(&value[bucketHeaderSize])) 627 | n.write(p) 628 | 629 | return value 630 | } 631 | 632 | // rebalance attempts to balance all nodes. 633 | func (b *Bucket) rebalance() { 634 | for _, n := range b.nodes { 635 | n.rebalance() 636 | } 637 | for _, child := range b.buckets { 638 | child.rebalance() 639 | } 640 | } 641 | 642 | // node creates a node from a page and associates it with a given parent. 643 | func (b *Bucket) node(pgid pgid, parent *node) *node { 644 | _assert(b.nodes != nil, "nodes map expected") 645 | 646 | // Retrieve node if it's already been created. 647 | if n := b.nodes[pgid]; n != nil { 648 | return n 649 | } 650 | 651 | // Otherwise create a node and cache it. 
652 | n := &node{bucket: b, parent: parent} 653 | if parent == nil { 654 | b.rootNode = n 655 | } else { 656 | parent.children = append(parent.children, n) 657 | } 658 | 659 | // Use the inline page if this is an inline bucket. 660 | var p = b.page 661 | if p == nil { 662 | p = b.tx.page(pgid) 663 | } 664 | 665 | // Read the page into the node and cache it. 666 | n.read(p) 667 | b.nodes[pgid] = n 668 | 669 | // Update statistics. 670 | b.tx.stats.NodeCount++ 671 | 672 | return n 673 | } 674 | 675 | // free recursively frees all pages in the bucket. 676 | func (b *Bucket) free() { 677 | if b.root == 0 { 678 | return 679 | } 680 | 681 | var tx = b.tx 682 | b.forEachPageNode(func(p *page, n *node, _ int) { 683 | if p != nil { 684 | tx.db.freelist.free(tx.meta.txid, p) 685 | } else { 686 | n.free() 687 | } 688 | }) 689 | b.root = 0 690 | } 691 | 692 | // dereference removes all references to the old mmap. 693 | func (b *Bucket) dereference() { 694 | if b.rootNode != nil { 695 | b.rootNode.root().dereference() 696 | } 697 | 698 | for _, child := range b.buckets { 699 | child.dereference() 700 | } 701 | } 702 | 703 | // pageNode returns the in-memory node, if it exists. 704 | // Otherwise returns the underlying page. 705 | func (b *Bucket) pageNode(id pgid) (*page, *node) { 706 | // Inline buckets have a fake page embedded in their value so treat them 707 | // differently. We'll return the rootNode (if available) or the fake page. 708 | if b.root == 0 { 709 | if id != 0 { 710 | panic(fmt.Sprintf("inline bucket non-zero page access(2): %d != 0", id)) 711 | } 712 | if b.rootNode != nil { 713 | return nil, b.rootNode 714 | } 715 | return b.page, nil 716 | } 717 | 718 | // Check the node cache for non-inline buckets. 719 | if b.nodes != nil { 720 | if n := b.nodes[id]; n != nil { 721 | return nil, n 722 | } 723 | } 724 | 725 | // Finally lookup the page from the transaction if no node is materialized. 726 | return b.tx.page(id), nil 727 | } 728 | 729 | // BucketStats records statistics about resources used by a bucket. 730 | type BucketStats struct { 731 | // Page count statistics. 732 | BranchPageN int // number of logical branch pages 733 | BranchOverflowN int // number of physical branch overflow pages 734 | LeafPageN int // number of logical leaf pages 735 | LeafOverflowN int // number of physical leaf overflow pages 736 | 737 | // Tree statistics. 738 | KeyN int // number of keys/value pairs 739 | Depth int // number of levels in B+tree 740 | 741 | // Page size utilization. 
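// The Alloc figures count whole pages, including overflow pages, while the
// Inuse figures count only the bytes actually occupied on those pages.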
742 | BranchAlloc int // bytes allocated for physical branch pages 743 | BranchInuse int // bytes actually used for branch data 744 | LeafAlloc int // bytes allocated for physical leaf pages 745 | LeafInuse int // bytes actually used for leaf data 746 | 747 | // Bucket statistics 748 | BucketN int // total number of buckets including the top bucket 749 | InlineBucketN int // total number of inlined buckets 750 | InlineBucketInuse int // bytes used for inlined buckets (also accounted for in LeafInuse) 751 | } 752 | 753 | func (s *BucketStats) Add(other BucketStats) { 754 | s.BranchPageN += other.BranchPageN 755 | s.BranchOverflowN += other.BranchOverflowN 756 | s.LeafPageN += other.LeafPageN 757 | s.LeafOverflowN += other.LeafOverflowN 758 | s.KeyN += other.KeyN 759 | if s.Depth < other.Depth { 760 | s.Depth = other.Depth 761 | } 762 | s.BranchAlloc += other.BranchAlloc 763 | s.BranchInuse += other.BranchInuse 764 | s.LeafAlloc += other.LeafAlloc 765 | s.LeafInuse += other.LeafInuse 766 | 767 | s.BucketN += other.BucketN 768 | s.InlineBucketN += other.InlineBucketN 769 | s.InlineBucketInuse += other.InlineBucketInuse 770 | } 771 | 772 | // cloneBytes returns a copy of a given slice. 773 | func cloneBytes(v []byte) []byte { 774 | var clone = make([]byte, len(v)) 775 | copy(clone, v) 776 | return clone 777 | } 778 | -------------------------------------------------------------------------------- /cmd/bolt/main_test.go: -------------------------------------------------------------------------------- 1 | package main_test 2 | 3 | import ( 4 | "bytes" 5 | crypto "crypto/rand" 6 | "encoding/binary" 7 | "fmt" 8 | "io" 9 | "io/ioutil" 10 | "math/rand" 11 | "os" 12 | "strconv" 13 | "testing" 14 | 15 | "github.com/boltdb/bolt" 16 | "github.com/boltdb/bolt/cmd/bolt" 17 | ) 18 | 19 | // Ensure the "info" command can print information about a database. 20 | func TestInfoCommand_Run(t *testing.T) { 21 | db := MustOpen(0666, nil) 22 | db.DB.Close() 23 | defer db.Close() 24 | 25 | // Run the info command. 26 | m := NewMain() 27 | if err := m.Run("info", db.Path); err != nil { 28 | t.Fatal(err) 29 | } 30 | } 31 | 32 | // Ensure the "stats" command executes correctly with an empty database. 33 | func TestStatsCommand_Run_EmptyDatabase(t *testing.T) { 34 | // Ignore 35 | if os.Getpagesize() != 4096 { 36 | t.Skip("system does not use 4KB page size") 37 | } 38 | 39 | db := MustOpen(0666, nil) 40 | defer db.Close() 41 | db.DB.Close() 42 | 43 | // Generate expected result. 44 | exp := "Aggregate statistics for 0 buckets\n\n" + 45 | "Page count statistics\n" + 46 | "\tNumber of logical branch pages: 0\n" + 47 | "\tNumber of physical branch overflow pages: 0\n" + 48 | "\tNumber of logical leaf pages: 0\n" + 49 | "\tNumber of physical leaf overflow pages: 0\n" + 50 | "Tree statistics\n" + 51 | "\tNumber of keys/value pairs: 0\n" + 52 | "\tNumber of levels in B+tree: 0\n" + 53 | "Page size utilization\n" + 54 | "\tBytes allocated for physical branch pages: 0\n" + 55 | "\tBytes actually used for branch data: 0 (0%)\n" + 56 | "\tBytes allocated for physical leaf pages: 0\n" + 57 | "\tBytes actually used for leaf data: 0 (0%)\n" + 58 | "Bucket statistics\n" + 59 | "\tTotal number of buckets: 0\n" + 60 | "\tTotal number on inlined buckets: 0 (0%)\n" + 61 | "\tBytes used for inlined buckets: 0 (0%)\n" 62 | 63 | // Run the command.
64 | m := NewMain() 65 | if err := m.Run("stats", db.Path); err != nil { 66 | t.Fatal(err) 67 | } else if m.Stdout.String() != exp { 68 | t.Fatalf("unexpected stdout:\n\n%s", m.Stdout.String()) 69 | } 70 | } 71 | 72 | // Ensure the "stats" command can execute correctly. 73 | func TestStatsCommand_Run(t *testing.T) { 74 | // Ignore 75 | if os.Getpagesize() != 4096 { 76 | t.Skip("system does not use 4KB page size") 77 | } 78 | 79 | db := MustOpen(0666, nil) 80 | defer db.Close() 81 | 82 | if err := db.Update(func(tx *bolt.Tx) error { 83 | // Create "foo" bucket. 84 | b, err := tx.CreateBucket([]byte("foo")) 85 | if err != nil { 86 | return err 87 | } 88 | for i := 0; i < 10; i++ { 89 | if err := b.Put([]byte(strconv.Itoa(i)), []byte(strconv.Itoa(i))); err != nil { 90 | return err 91 | } 92 | } 93 | 94 | // Create "bar" bucket. 95 | b, err = tx.CreateBucket([]byte("bar")) 96 | if err != nil { 97 | return err 98 | } 99 | for i := 0; i < 100; i++ { 100 | if err := b.Put([]byte(strconv.Itoa(i)), []byte(strconv.Itoa(i))); err != nil { 101 | return err 102 | } 103 | } 104 | 105 | // Create "baz" bucket. 106 | b, err = tx.CreateBucket([]byte("baz")) 107 | if err != nil { 108 | return err 109 | } 110 | if err := b.Put([]byte("key"), []byte("value")); err != nil { 111 | return err 112 | } 113 | 114 | return nil 115 | }); err != nil { 116 | t.Fatal(err) 117 | } 118 | db.DB.Close() 119 | 120 | // Generate expected result. 121 | exp := "Aggregate statistics for 3 buckets\n\n" + 122 | "Page count statistics\n" + 123 | "\tNumber of logical branch pages: 0\n" + 124 | "\tNumber of physical branch overflow pages: 0\n" + 125 | "\tNumber of logical leaf pages: 1\n" + 126 | "\tNumber of physical leaf overflow pages: 0\n" + 127 | "Tree statistics\n" + 128 | "\tNumber of keys/value pairs: 111\n" + 129 | "\tNumber of levels in B+tree: 1\n" + 130 | "Page size utilization\n" + 131 | "\tBytes allocated for physical branch pages: 0\n" + 132 | "\tBytes actually used for branch data: 0 (0%)\n" + 133 | "\tBytes allocated for physical leaf pages: 4096\n" + 134 | "\tBytes actually used for leaf data: 1996 (48%)\n" + 135 | "Bucket statistics\n" + 136 | "\tTotal number of buckets: 3\n" + 137 | "\tTotal number on inlined buckets: 2 (66%)\n" + 138 | "\tBytes used for inlined buckets: 236 (11%)\n" 139 | 140 | // Run the command. 141 | m := NewMain() 142 | if err := m.Run("stats", db.Path); err != nil { 143 | t.Fatal(err) 144 | } else if m.Stdout.String() != exp { 145 | t.Fatalf("unexpected stdout:\n\n%s", m.Stdout.String()) 146 | } 147 | } 148 | 149 | // Main represents a test wrapper for main.Main that records output. 150 | type Main struct { 151 | *main.Main 152 | Stdin bytes.Buffer 153 | Stdout bytes.Buffer 154 | Stderr bytes.Buffer 155 | } 156 | 157 | // NewMain returns a new instance of Main. 158 | func NewMain() *Main { 159 | m := &Main{Main: main.NewMain()} 160 | m.Main.Stdin = &m.Stdin 161 | m.Main.Stdout = &m.Stdout 162 | m.Main.Stderr = &m.Stderr 163 | return m 164 | } 165 | 166 | // MustOpen creates a Bolt database in a temporary location. 167 | func MustOpen(mode os.FileMode, options *bolt.Options) *DB { 168 | // Create temporary path. 169 | f, _ := ioutil.TempFile("", "bolt-") 170 | f.Close() 171 | os.Remove(f.Name()) 172 | 173 | db, err := bolt.Open(f.Name(), mode, options) 174 | if err != nil { 175 | panic(err.Error()) 176 | } 177 | return &DB{DB: db, Path: f.Name()} 178 | } 179 | 180 | // DB is a test wrapper for bolt.DB. 
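// It keeps the database's path so Close can remove the temporary file.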
181 | type DB struct { 182 | *bolt.DB 183 | Path string 184 | } 185 | 186 | // Close closes and removes the database. 187 | func (db *DB) Close() error { 188 | defer os.Remove(db.Path) 189 | return db.DB.Close() 190 | } 191 | 192 | func TestCompactCommand_Run(t *testing.T) { 193 | var s int64 194 | if err := binary.Read(crypto.Reader, binary.BigEndian, &s); err != nil { 195 | t.Fatal(err) 196 | } 197 | rand.Seed(s) 198 | 199 | dstdb := MustOpen(0666, nil) 200 | dstdb.Close() 201 | 202 | // fill the db 203 | db := MustOpen(0666, nil) 204 | if err := db.Update(func(tx *bolt.Tx) error { 205 | n := 2 + rand.Intn(5) 206 | for i := 0; i < n; i++ { 207 | k := []byte(fmt.Sprintf("b%d", i)) 208 | b, err := tx.CreateBucketIfNotExists(k) 209 | if err != nil { 210 | return err 211 | } 212 | if err := b.SetSequence(uint64(i)); err != nil { 213 | return err 214 | } 215 | if err := fillBucket(b, append(k, '.')); err != nil { 216 | return err 217 | } 218 | } 219 | return nil 220 | }); err != nil { 221 | db.Close() 222 | t.Fatal(err) 223 | } 224 | 225 | // make the db grow by adding large values, then delete them. 226 | if err := db.Update(func(tx *bolt.Tx) error { 227 | b, err := tx.CreateBucketIfNotExists([]byte("large_vals")) 228 | if err != nil { 229 | return err 230 | } 231 | n := 5 + rand.Intn(5) 232 | for i := 0; i < n; i++ { 233 | v := make([]byte, 1000*1000*(1+rand.Intn(5))) 234 | _, err := crypto.Read(v) 235 | if err != nil { 236 | return err 237 | } 238 | if err := b.Put([]byte(fmt.Sprintf("l%d", i)), v); err != nil { 239 | return err 240 | } 241 | } 242 | return nil 243 | }); err != nil { 244 | db.Close() 245 | t.Fatal(err) 246 | } 247 | if err := db.Update(func(tx *bolt.Tx) error { 248 | c := tx.Bucket([]byte("large_vals")).Cursor() 249 | for k, _ := c.First(); k != nil; k, _ = c.Next() { 250 | if err := c.Delete(); err != nil { 251 | return err 252 | } 253 | } 254 | return tx.DeleteBucket([]byte("large_vals")) 255 | }); err != nil { 256 | db.Close() 257 | t.Fatal(err) 258 | } 259 | db.DB.Close() 260 | defer db.Close() 261 | defer dstdb.Close() 262 | 263 | dbChk, err := chkdb(db.Path) 264 | if err != nil { 265 | t.Fatal(err) 266 | } 267 | 268 | m := NewMain() 269 | if err := m.Run("compact", "-o", dstdb.Path, db.Path); err != nil { 270 | t.Fatal(err) 271 | } 272 | 273 | dbChkAfterCompact, err := chkdb(db.Path) 274 | if err != nil { 275 | t.Fatal(err) 276 | } 277 | 278 | dstdbChk, err := chkdb(dstdb.Path) 279 | if err != nil { 280 | t.Fatal(err) 281 | } 282 | 283 | if !bytes.Equal(dbChk, dbChkAfterCompact) { 284 | t.Error("the original db has been touched") 285 | } 286 | if !bytes.Equal(dbChk, dstdbChk) { 287 | t.Error("the compacted db data isn't the same as the original db") 288 | } 289 | } 290 | 291 | func fillBucket(b *bolt.Bucket, prefix []byte) error { 292 | n := 10 + rand.Intn(50) 293 | for i := 0; i < n; i++ { 294 | v := make([]byte, 10*(1+rand.Intn(4))) 295 | _, err := crypto.Read(v) 296 | if err != nil { 297 | return err 298 | } 299 | k := append(prefix, []byte(fmt.Sprintf("k%d", i))...) 300 | if err := b.Put(k, v); err != nil { 301 | return err 302 | } 303 | } 304 | // limit depth of subbuckets 305 | s := 2 + rand.Intn(4) 306 | if len(prefix) > (2*s + 1) { 307 | return nil 308 | } 309 | n = 1 + rand.Intn(3) 310 | for i := 0; i < n; i++ { 311 | k := append(prefix, []byte(fmt.Sprintf("b%d", i))...)
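// Create a subbucket under the derived key and fill it recursively.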
312 | sb, err := b.CreateBucket(k) 313 | if err != nil { 314 | return err 315 | } 316 | if err := fillBucket(sb, append(k, '.')); err != nil { 317 | return err 318 | } 319 | } 320 | return nil 321 | } 322 | 323 | func chkdb(path string) ([]byte, error) { 324 | db, err := bolt.Open(path, 0666, nil) 325 | if err != nil { 326 | return nil, err 327 | } 328 | defer db.Close() 329 | var buf bytes.Buffer 330 | err = db.View(func(tx *bolt.Tx) error { 331 | return tx.ForEach(func(name []byte, b *bolt.Bucket) error { 332 | return walkBucket(b, name, nil, &buf) 333 | }) 334 | }) 335 | if err != nil { 336 | return nil, err 337 | } 338 | return buf.Bytes(), nil 339 | } 340 | 341 | func walkBucket(parent *bolt.Bucket, k []byte, v []byte, w io.Writer) error { 342 | if _, err := fmt.Fprintf(w, "%d:%x=%x\n", parent.Sequence(), k, v); err != nil { 343 | return err 344 | } 345 | 346 | // not a bucket, exit. 347 | if v != nil { 348 | return nil 349 | } 350 | return parent.ForEach(func(k, v []byte) error { 351 | if v == nil { 352 | return walkBucket(parent.Bucket(k), k, nil, w) 353 | } 354 | return walkBucket(parent, k, v, w) 355 | }) 356 | } 357 | -------------------------------------------------------------------------------- /cursor.go: -------------------------------------------------------------------------------- 1 | package bolt 2 | 3 | import ( 4 | "bytes" 5 | "fmt" 6 | "sort" 7 | ) 8 | 9 | // Cursor represents an iterator that can traverse over all key/value pairs in a bucket in sorted order. 10 | // Cursors see nested buckets with value == nil. 11 | // Cursors can be obtained from a transaction and are valid as long as the transaction is open. 12 | // 13 | // Keys and values returned from the cursor are only valid for the life of the transaction. 14 | // 15 | // Changing data while traversing with a cursor may cause it to be invalidated 16 | // and return unexpected keys and/or values. You must reposition your cursor 17 | // after mutating data. 18 | type Cursor struct { 19 | bucket *Bucket 20 | stack []elemRef 21 | } 22 | 23 | // Bucket returns the bucket that this cursor was created from. 24 | func (c *Cursor) Bucket() *Bucket { 25 | return c.bucket 26 | } 27 | 28 | // First moves the cursor to the first item in the bucket and returns its key and value. 29 | // If the bucket is empty then a nil key and value are returned. 30 | // The returned key and value are only valid for the life of the transaction. 31 | func (c *Cursor) First() (key []byte, value []byte) { 32 | _assert(c.bucket.tx.db != nil, "tx closed") 33 | c.stack = c.stack[:0] 34 | p, n := c.bucket.pageNode(c.bucket.root) 35 | c.stack = append(c.stack, elemRef{page: p, node: n, index: 0}) 36 | c.first() 37 | 38 | // If we land on an empty page then move to the next value. 39 | // https://github.com/boltdb/bolt/issues/450 40 | if c.stack[len(c.stack)-1].count() == 0 { 41 | c.next() 42 | } 43 | 44 | k, v, flags := c.keyValue() 45 | if (flags & uint32(bucketLeafFlag)) != 0 { 46 | return k, nil 47 | } 48 | return k, v 49 | 50 | } 51 | 52 | // Last moves the cursor to the last item in the bucket and returns its key and value. 53 | // If the bucket is empty then a nil key and value are returned. 54 | // The returned key and value are only valid for the life of the transaction. 
55 | func (c *Cursor) Last() (key []byte, value []byte) { 56 | _assert(c.bucket.tx.db != nil, "tx closed") 57 | c.stack = c.stack[:0] 58 | p, n := c.bucket.pageNode(c.bucket.root) 59 | ref := elemRef{page: p, node: n} 60 | ref.index = ref.count() - 1 61 | c.stack = append(c.stack, ref) 62 | c.last() 63 | k, v, flags := c.keyValue() 64 | if (flags & uint32(bucketLeafFlag)) != 0 { 65 | return k, nil 66 | } 67 | return k, v 68 | } 69 | 70 | // Next moves the cursor to the next item in the bucket and returns its key and value. 71 | // If the cursor is at the end of the bucket then a nil key and value are returned. 72 | // The returned key and value are only valid for the life of the transaction. 73 | func (c *Cursor) Next() (key []byte, value []byte) { 74 | _assert(c.bucket.tx.db != nil, "tx closed") 75 | k, v, flags := c.next() 76 | if (flags & uint32(bucketLeafFlag)) != 0 { 77 | return k, nil 78 | } 79 | return k, v 80 | } 81 | 82 | // Prev moves the cursor to the previous item in the bucket and returns its key and value. 83 | // If the cursor is at the beginning of the bucket then a nil key and value are returned. 84 | // The returned key and value are only valid for the life of the transaction. 85 | func (c *Cursor) Prev() (key []byte, value []byte) { 86 | _assert(c.bucket.tx.db != nil, "tx closed") 87 | 88 | // Attempt to move back one element until we're successful. 89 | // Move up the stack as we hit the beginning of each page in our stack. 90 | for i := len(c.stack) - 1; i >= 0; i-- { 91 | elem := &c.stack[i] 92 | if elem.index > 0 { 93 | elem.index-- 94 | break 95 | } 96 | c.stack = c.stack[:i] 97 | } 98 | 99 | // If we've hit the end then return nil. 100 | if len(c.stack) == 0 { 101 | return nil, nil 102 | } 103 | 104 | // Move down the stack to find the last element of the last leaf under this branch. 105 | c.last() 106 | k, v, flags := c.keyValue() 107 | if (flags & uint32(bucketLeafFlag)) != 0 { 108 | return k, nil 109 | } 110 | return k, v 111 | } 112 | 113 | // Seek moves the cursor to a given key and returns it. 114 | // If the key does not exist then the next key is used. If no keys 115 | // follow, a nil key is returned. 116 | // The returned key and value are only valid for the life of the transaction. 117 | func (c *Cursor) Seek(seek []byte) (key []byte, value []byte) { 118 | k, v, flags := c.seek(seek) 119 | 120 | // If we ended up after the last element of a page then move to the next one. 121 | if ref := &c.stack[len(c.stack)-1]; ref.index >= ref.count() { 122 | k, v, flags = c.next() 123 | } 124 | 125 | if k == nil { 126 | return nil, nil 127 | } else if (flags & uint32(bucketLeafFlag)) != 0 { 128 | return k, nil 129 | } 130 | return k, v 131 | } 132 | 133 | // Delete removes the current key/value under the cursor from the bucket. 134 | // Delete fails if current key/value is a bucket or if the transaction is not writable. 135 | func (c *Cursor) Delete() error { 136 | if c.bucket.tx.db == nil { 137 | return ErrTxClosed 138 | } else if !c.bucket.Writable() { 139 | return ErrTxNotWritable 140 | } 141 | 142 | key, _, flags := c.keyValue() 143 | // Return an error if current value is a bucket. 144 | if (flags & bucketLeafFlag) != 0 { 145 | return ErrIncompatibleValue 146 | } 147 | c.node().del(key) 148 | 149 | return nil 150 | } 151 | 152 | // seek moves the cursor to a given key and returns it. 153 | // If the key does not exist then the next key is used. 
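// Callers must handle the case where the cursor lands past the last element
// of a page; the exported Seek does this by advancing with next().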
154 | func (c *Cursor) seek(seek []byte) (key []byte, value []byte, flags uint32) { 155 | _assert(c.bucket.tx.db != nil, "tx closed") 156 | 157 | // Start from root page/node and traverse to correct page. 158 | c.stack = c.stack[:0] 159 | c.search(seek, c.bucket.root) 160 | ref := &c.stack[len(c.stack)-1] 161 | 162 | // If the cursor is pointing to the end of page/node then return nil. 163 | if ref.index >= ref.count() { 164 | return nil, nil, 0 165 | } 166 | 167 | // If this is a bucket then return a nil value. 168 | return c.keyValue() 169 | } 170 | 171 | // first moves the cursor to the first leaf element under the last page in the stack. 172 | func (c *Cursor) first() { 173 | for { 174 | // Exit when we hit a leaf page. 175 | var ref = &c.stack[len(c.stack)-1] 176 | if ref.isLeaf() { 177 | break 178 | } 179 | 180 | // Keep adding pages pointing to the first element to the stack. 181 | var pgid pgid 182 | if ref.node != nil { 183 | pgid = ref.node.inodes[ref.index].pgid 184 | } else { 185 | pgid = ref.page.branchPageElement(uint16(ref.index)).pgid 186 | } 187 | p, n := c.bucket.pageNode(pgid) 188 | c.stack = append(c.stack, elemRef{page: p, node: n, index: 0}) 189 | } 190 | } 191 | 192 | // last moves the cursor to the last leaf element under the last page in the stack. 193 | func (c *Cursor) last() { 194 | for { 195 | // Exit when we hit a leaf page. 196 | ref := &c.stack[len(c.stack)-1] 197 | if ref.isLeaf() { 198 | break 199 | } 200 | 201 | // Keep adding pages pointing to the last element in the stack. 202 | var pgid pgid 203 | if ref.node != nil { 204 | pgid = ref.node.inodes[ref.index].pgid 205 | } else { 206 | pgid = ref.page.branchPageElement(uint16(ref.index)).pgid 207 | } 208 | p, n := c.bucket.pageNode(pgid) 209 | 210 | var nextRef = elemRef{page: p, node: n} 211 | nextRef.index = nextRef.count() - 1 212 | c.stack = append(c.stack, nextRef) 213 | } 214 | } 215 | 216 | // next moves to the next leaf element and returns the key and value. 217 | // If the cursor is at the last leaf element then it stays there and returns nil. 218 | func (c *Cursor) next() (key []byte, value []byte, flags uint32) { 219 | for { 220 | // Attempt to move over one element until we're successful. 221 | // Move up the stack as we hit the end of each page in our stack. 222 | var i int 223 | for i = len(c.stack) - 1; i >= 0; i-- { 224 | elem := &c.stack[i] 225 | if elem.index < elem.count()-1 { 226 | elem.index++ 227 | break 228 | } 229 | } 230 | 231 | // If we've hit the root page then stop and return. This will leave the 232 | // cursor on the last element of the last page. 233 | if i == -1 { 234 | return nil, nil, 0 235 | } 236 | 237 | // Otherwise start from where we left off in the stack and find the 238 | // first element of the first leaf page. 239 | c.stack = c.stack[:i+1] 240 | c.first() 241 | 242 | // If this is an empty page then restart and move back up the stack. 243 | // https://github.com/boltdb/bolt/issues/450 244 | if c.stack[len(c.stack)-1].count() == 0 { 245 | continue 246 | } 247 | 248 | return c.keyValue() 249 | } 250 | } 251 | 252 | // search recursively performs a binary search against a given page/node until it finds a given key. 
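// When the search finishes, c.stack holds one elemRef per level of the tree.
// For a two-level tree the invariant looks roughly like this (an illustrative
// sketch, not code from this package):
//
//	c.stack[0] // root branch page/node, index = child subtree covering key
//	c.stack[1] // leaf page/node, index = first element >= key (set by nsearch)
//
// First, Last, Next and Prev all navigate by adjusting these indexes.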
253 | func (c *Cursor) search(key []byte, pgid pgid) { 254 | p, n := c.bucket.pageNode(pgid) 255 | if p != nil && (p.flags&(branchPageFlag|leafPageFlag)) == 0 { 256 | panic(fmt.Sprintf("invalid page type: %d: %x", p.id, p.flags)) 257 | } 258 | e := elemRef{page: p, node: n} 259 | c.stack = append(c.stack, e) 260 | 261 | // If we're on a leaf page/node then find the specific node. 262 | if e.isLeaf() { 263 | c.nsearch(key) 264 | return 265 | } 266 | 267 | if n != nil { 268 | c.searchNode(key, n) 269 | return 270 | } 271 | c.searchPage(key, p) 272 | } 273 | 274 | func (c *Cursor) searchNode(key []byte, n *node) { 275 | var exact bool 276 | index := sort.Search(len(n.inodes), func(i int) bool { 277 | // TODO(benbjohnson): Optimize this range search. It's a bit hacky right now. 278 | // sort.Search() finds the lowest index where f() != -1 but we need the highest index. 279 | ret := bytes.Compare(n.inodes[i].key, key) 280 | if ret == 0 { 281 | exact = true 282 | } 283 | return ret != -1 284 | }) 285 | if !exact && index > 0 { 286 | index-- 287 | } 288 | c.stack[len(c.stack)-1].index = index 289 | 290 | // Recursively search to the next page. 291 | c.search(key, n.inodes[index].pgid) 292 | } 293 | 294 | func (c *Cursor) searchPage(key []byte, p *page) { 295 | // Binary search for the correct range. 296 | inodes := p.branchPageElements() 297 | 298 | var exact bool 299 | index := sort.Search(int(p.count), func(i int) bool { 300 | // TODO(benbjohnson): Optimize this range search. It's a bit hacky right now. 301 | // sort.Search() finds the lowest index where f() != -1 but we need the highest index. 302 | ret := bytes.Compare(inodes[i].key(), key) 303 | if ret == 0 { 304 | exact = true 305 | } 306 | return ret != -1 307 | }) 308 | if !exact && index > 0 { 309 | index-- 310 | } 311 | c.stack[len(c.stack)-1].index = index 312 | 313 | // Recursively search to the next page. 314 | c.search(key, inodes[index].pgid) 315 | } 316 | 317 | // nsearch searches the leaf node on the top of the stack for a key. 318 | func (c *Cursor) nsearch(key []byte) { 319 | e := &c.stack[len(c.stack)-1] 320 | p, n := e.page, e.node 321 | 322 | // If we have a node then search its inodes. 323 | if n != nil { 324 | index := sort.Search(len(n.inodes), func(i int) bool { 325 | return bytes.Compare(n.inodes[i].key, key) != -1 326 | }) 327 | e.index = index 328 | return 329 | } 330 | 331 | // If we have a page then search its leaf elements. 332 | inodes := p.leafPageElements() 333 | index := sort.Search(int(p.count), func(i int) bool { 334 | return bytes.Compare(inodes[i].key(), key) != -1 335 | }) 336 | e.index = index 337 | } 338 | 339 | // keyValue returns the key and value of the current leaf element. 340 | func (c *Cursor) keyValue() ([]byte, []byte, uint32) { 341 | ref := &c.stack[len(c.stack)-1] 342 | if ref.count() == 0 || ref.index >= ref.count() { 343 | return nil, nil, 0 344 | } 345 | 346 | // Retrieve value from node. 347 | if ref.node != nil { 348 | inode := &ref.node.inodes[ref.index] 349 | return inode.key, inode.value, inode.flags 350 | } 351 | 352 | // Or retrieve value from page. 353 | elem := ref.page.leafPageElement(uint16(ref.index)) 354 | return elem.key(), elem.value(), elem.flags 355 | } 356 | 357 | // node returns the node that the cursor is currently positioned on. 358 | func (c *Cursor) node() *node { 359 | _assert(len(c.stack) > 0, "accessing a node with a zero-length cursor stack") 360 | 361 | // If the top of the stack is a leaf node then just return it. 
362 | if ref := &c.stack[len(c.stack)-1]; ref.node != nil && ref.isLeaf() { 363 | return ref.node 364 | } 365 | 366 | // Start from root and traverse down the hierarchy. 367 | var n = c.stack[0].node 368 | if n == nil { 369 | n = c.bucket.node(c.stack[0].page.id, nil) 370 | } 371 | for _, ref := range c.stack[:len(c.stack)-1] { 372 | _assert(!n.isLeaf, "expected branch node") 373 | n = n.childAt(int(ref.index)) 374 | } 375 | _assert(n.isLeaf, "expected leaf node") 376 | return n 377 | } 378 | 379 | // elemRef represents a reference to an element on a given page/node. 380 | type elemRef struct { 381 | page *page 382 | node *node 383 | index int 384 | } 385 | 386 | // isLeaf returns whether the ref is pointing at a leaf page/node. 387 | func (r *elemRef) isLeaf() bool { 388 | if r.node != nil { 389 | return r.node.isLeaf 390 | } 391 | return (r.page.flags & leafPageFlag) != 0 392 | } 393 | 394 | // count returns the number of inodes or page elements. 395 | func (r *elemRef) count() int { 396 | if r.node != nil { 397 | return len(r.node.inodes) 398 | } 399 | return int(r.page.count) 400 | } 401 | -------------------------------------------------------------------------------- /cursor_test.go: -------------------------------------------------------------------------------- 1 | package bolt_test 2 | 3 | import ( 4 | "bytes" 5 | "encoding/binary" 6 | "fmt" 7 | "log" 8 | "os" 9 | "reflect" 10 | "sort" 11 | "testing" 12 | "testing/quick" 13 | 14 | "github.com/boltdb/bolt" 15 | ) 16 | 17 | // Ensure that a cursor can return a reference to the bucket that created it. 18 | func TestCursor_Bucket(t *testing.T) { 19 | db := MustOpenDB() 20 | defer db.MustClose() 21 | if err := db.Update(func(tx *bolt.Tx) error { 22 | b, err := tx.CreateBucket([]byte("widgets")) 23 | if err != nil { 24 | t.Fatal(err) 25 | } 26 | if cb := b.Cursor().Bucket(); !reflect.DeepEqual(cb, b) { 27 | t.Fatal("cursor bucket mismatch") 28 | } 29 | return nil 30 | }); err != nil { 31 | t.Fatal(err) 32 | } 33 | } 34 | 35 | // Ensure that a Tx cursor can seek to the appropriate keys. 36 | func TestCursor_Seek(t *testing.T) { 37 | db := MustOpenDB() 38 | defer db.MustClose() 39 | if err := db.Update(func(tx *bolt.Tx) error { 40 | b, err := tx.CreateBucket([]byte("widgets")) 41 | if err != nil { 42 | t.Fatal(err) 43 | } 44 | if err := b.Put([]byte("foo"), []byte("0001")); err != nil { 45 | t.Fatal(err) 46 | } 47 | if err := b.Put([]byte("bar"), []byte("0002")); err != nil { 48 | t.Fatal(err) 49 | } 50 | if err := b.Put([]byte("baz"), []byte("0003")); err != nil { 51 | t.Fatal(err) 52 | } 53 | 54 | if _, err := b.CreateBucket([]byte("bkt")); err != nil { 55 | t.Fatal(err) 56 | } 57 | return nil 58 | }); err != nil { 59 | t.Fatal(err) 60 | } 61 | 62 | if err := db.View(func(tx *bolt.Tx) error { 63 | c := tx.Bucket([]byte("widgets")).Cursor() 64 | 65 | // Exact match should go to the key. 66 | if k, v := c.Seek([]byte("bar")); !bytes.Equal(k, []byte("bar")) { 67 | t.Fatalf("unexpected key: %v", k) 68 | } else if !bytes.Equal(v, []byte("0002")) { 69 | t.Fatalf("unexpected value: %v", v) 70 | } 71 | 72 | // Inexact match should go to the next key. 73 | if k, v := c.Seek([]byte("bas")); !bytes.Equal(k, []byte("baz")) { 74 | t.Fatalf("unexpected key: %v", k) 75 | } else if !bytes.Equal(v, []byte("0003")) { 76 | t.Fatalf("unexpected value: %v", v) 77 | } 78 | 79 | // Low key should go to the first key. 
80 | if k, v := c.Seek([]byte("")); !bytes.Equal(k, []byte("bar")) {
81 | t.Fatalf("unexpected key: %v", k)
82 | } else if !bytes.Equal(v, []byte("0002")) {
83 | t.Fatalf("unexpected value: %v", v)
84 | }
85 |
86 | // High key should return no key.
87 | if k, v := c.Seek([]byte("zzz")); k != nil {
88 | t.Fatalf("expected nil key: %v", k)
89 | } else if v != nil {
90 | t.Fatalf("expected nil value: %v", v)
91 | }
92 |
93 | // Buckets should return their key but no value.
94 | if k, v := c.Seek([]byte("bkt")); !bytes.Equal(k, []byte("bkt")) {
95 | t.Fatalf("unexpected key: %v", k)
96 | } else if v != nil {
97 | t.Fatalf("expected nil value: %v", v)
98 | }
99 |
100 | return nil
101 | }); err != nil {
102 | t.Fatal(err)
103 | }
104 | }
105 |
106 | func TestCursor_Delete(t *testing.T) {
107 | db := MustOpenDB()
108 | defer db.MustClose()
109 |
110 | const count = 1000
111 |
112 | // Insert keys 0 through $count-1, plus one subbucket.
113 | if err := db.Update(func(tx *bolt.Tx) error {
114 | b, err := tx.CreateBucket([]byte("widgets"))
115 | if err != nil {
116 | t.Fatal(err)
117 | }
118 | for i := 0; i < count; i++ {
119 | k := make([]byte, 8)
120 | binary.BigEndian.PutUint64(k, uint64(i))
121 | if err := b.Put(k, make([]byte, 100)); err != nil {
122 | t.Fatal(err)
123 | }
124 | }
125 | if _, err := b.CreateBucket([]byte("sub")); err != nil {
126 | t.Fatal(err)
127 | }
128 | return nil
129 | }); err != nil {
130 | t.Fatal(err)
131 | }
132 |
133 | if err := db.Update(func(tx *bolt.Tx) error {
134 | c := tx.Bucket([]byte("widgets")).Cursor()
135 | bound := make([]byte, 8)
136 | binary.BigEndian.PutUint64(bound, uint64(count/2))
137 | for key, _ := c.First(); bytes.Compare(key, bound) < 0; key, _ = c.Next() {
138 | if err := c.Delete(); err != nil {
139 | t.Fatal(err)
140 | }
141 | }
142 |
143 | c.Seek([]byte("sub"))
144 | if err := c.Delete(); err != bolt.ErrIncompatibleValue {
145 | t.Fatalf("unexpected error: %s", err)
146 | }
147 |
148 | return nil
149 | }); err != nil {
150 | t.Fatal(err)
151 | }
152 |
153 | if err := db.View(func(tx *bolt.Tx) error {
154 | stats := tx.Bucket([]byte("widgets")).Stats()
155 | if stats.KeyN != count/2+1 {
156 | t.Fatalf("unexpected KeyN: %d", stats.KeyN)
157 | }
158 | return nil
159 | }); err != nil {
160 | t.Fatal(err)
161 | }
162 | }
163 |
164 | // Ensure that a Tx cursor can seek to the appropriate keys when there are a
165 | // large number of keys. This test also checks that seek will always move
166 | // forward to the next key.
167 | //
168 | // Related: https://github.com/boltdb/bolt/pull/187
169 | func TestCursor_Seek_Large(t *testing.T) {
170 | db := MustOpenDB()
171 | defer db.MustClose()
172 |
173 | var count = 10000
174 |
175 | // Insert every other key between 0 and $count.
176 | if err := db.Update(func(tx *bolt.Tx) error {
177 | b, err := tx.CreateBucket([]byte("widgets"))
178 | if err != nil {
179 | t.Fatal(err)
180 | }
181 |
182 | for i := 0; i < count; i += 100 {
183 | for j := i; j < i+100; j += 2 {
184 | k := make([]byte, 8)
185 | binary.BigEndian.PutUint64(k, uint64(j))
186 | if err := b.Put(k, make([]byte, 100)); err != nil {
187 | t.Fatal(err)
188 | }
189 | }
190 | }
191 | return nil
192 | }); err != nil {
193 | t.Fatal(err)
194 | }
195 |
196 | if err := db.View(func(tx *bolt.Tx) error {
197 | c := tx.Bucket([]byte("widgets")).Cursor()
198 | for i := 0; i < count; i++ {
199 | seek := make([]byte, 8)
200 | binary.BigEndian.PutUint64(seek, uint64(i))
201 |
202 | k, _ := c.Seek(seek)
203 |
204 | // The last seek is beyond the end of the range so
205 | // it should return nil.
206 | if i == count-1 {
207 | if k != nil {
208 | t.Fatal("expected nil key")
209 | }
210 | continue
211 | }
212 |
213 | // Otherwise we should seek to the exact key or the next key.
214 | num := binary.BigEndian.Uint64(k)
215 | if i%2 == 0 {
216 | if num != uint64(i) {
217 | t.Fatalf("unexpected num: %d", num)
218 | }
219 | } else {
220 | if num != uint64(i+1) {
221 | t.Fatalf("unexpected num: %d", num)
222 | }
223 | }
224 | }
225 |
226 | return nil
227 | }); err != nil {
228 | t.Fatal(err)
229 | }
230 | }
231 |
232 | // Ensure that a cursor can iterate over an empty bucket without error.
233 | func TestCursor_EmptyBucket(t *testing.T) {
234 | db := MustOpenDB()
235 | defer db.MustClose()
236 | if err := db.Update(func(tx *bolt.Tx) error {
237 | _, err := tx.CreateBucket([]byte("widgets"))
238 | return err
239 | }); err != nil {
240 | t.Fatal(err)
241 | }
242 |
243 | if err := db.View(func(tx *bolt.Tx) error {
244 | c := tx.Bucket([]byte("widgets")).Cursor()
245 | k, v := c.First()
246 | if k != nil {
247 | t.Fatalf("unexpected key: %v", k)
248 | } else if v != nil {
249 | t.Fatalf("unexpected value: %v", v)
250 | }
251 | return nil
252 | }); err != nil {
253 | t.Fatal(err)
254 | }
255 | }
256 |
257 | // Ensure that a Tx cursor can reverse iterate over an empty bucket without error.
258 | func TestCursor_EmptyBucketReverse(t *testing.T) {
259 | db := MustOpenDB()
260 | defer db.MustClose()
261 |
262 | if err := db.Update(func(tx *bolt.Tx) error {
263 | _, err := tx.CreateBucket([]byte("widgets"))
264 | return err
265 | }); err != nil {
266 | t.Fatal(err)
267 | }
268 | if err := db.View(func(tx *bolt.Tx) error {
269 | c := tx.Bucket([]byte("widgets")).Cursor()
270 | k, v := c.Last()
271 | if k != nil {
272 | t.Fatalf("unexpected key: %v", k)
273 | } else if v != nil {
274 | t.Fatalf("unexpected value: %v", v)
275 | }
276 | return nil
277 | }); err != nil {
278 | t.Fatal(err)
279 | }
280 | }
281 |
282 | // Ensure that a Tx cursor can iterate over a single root with a couple elements.
283 | func TestCursor_Iterate_Leaf(t *testing.T) { 284 | db := MustOpenDB() 285 | defer db.MustClose() 286 | 287 | if err := db.Update(func(tx *bolt.Tx) error { 288 | b, err := tx.CreateBucket([]byte("widgets")) 289 | if err != nil { 290 | t.Fatal(err) 291 | } 292 | if err := b.Put([]byte("baz"), []byte{}); err != nil { 293 | t.Fatal(err) 294 | } 295 | if err := b.Put([]byte("foo"), []byte{0}); err != nil { 296 | t.Fatal(err) 297 | } 298 | if err := b.Put([]byte("bar"), []byte{1}); err != nil { 299 | t.Fatal(err) 300 | } 301 | return nil 302 | }); err != nil { 303 | t.Fatal(err) 304 | } 305 | tx, err := db.Begin(false) 306 | if err != nil { 307 | t.Fatal(err) 308 | } 309 | defer func() { _ = tx.Rollback() }() 310 | 311 | c := tx.Bucket([]byte("widgets")).Cursor() 312 | 313 | k, v := c.First() 314 | if !bytes.Equal(k, []byte("bar")) { 315 | t.Fatalf("unexpected key: %v", k) 316 | } else if !bytes.Equal(v, []byte{1}) { 317 | t.Fatalf("unexpected value: %v", v) 318 | } 319 | 320 | k, v = c.Next() 321 | if !bytes.Equal(k, []byte("baz")) { 322 | t.Fatalf("unexpected key: %v", k) 323 | } else if !bytes.Equal(v, []byte{}) { 324 | t.Fatalf("unexpected value: %v", v) 325 | } 326 | 327 | k, v = c.Next() 328 | if !bytes.Equal(k, []byte("foo")) { 329 | t.Fatalf("unexpected key: %v", k) 330 | } else if !bytes.Equal(v, []byte{0}) { 331 | t.Fatalf("unexpected value: %v", v) 332 | } 333 | 334 | k, v = c.Next() 335 | if k != nil { 336 | t.Fatalf("expected nil key: %v", k) 337 | } else if v != nil { 338 | t.Fatalf("expected nil value: %v", v) 339 | } 340 | 341 | k, v = c.Next() 342 | if k != nil { 343 | t.Fatalf("expected nil key: %v", k) 344 | } else if v != nil { 345 | t.Fatalf("expected nil value: %v", v) 346 | } 347 | 348 | if err := tx.Rollback(); err != nil { 349 | t.Fatal(err) 350 | } 351 | } 352 | 353 | // Ensure that a Tx cursor can iterate in reverse over a single root with a couple elements. 
354 | func TestCursor_LeafRootReverse(t *testing.T) { 355 | db := MustOpenDB() 356 | defer db.MustClose() 357 | 358 | if err := db.Update(func(tx *bolt.Tx) error { 359 | b, err := tx.CreateBucket([]byte("widgets")) 360 | if err != nil { 361 | t.Fatal(err) 362 | } 363 | if err := b.Put([]byte("baz"), []byte{}); err != nil { 364 | t.Fatal(err) 365 | } 366 | if err := b.Put([]byte("foo"), []byte{0}); err != nil { 367 | t.Fatal(err) 368 | } 369 | if err := b.Put([]byte("bar"), []byte{1}); err != nil { 370 | t.Fatal(err) 371 | } 372 | return nil 373 | }); err != nil { 374 | t.Fatal(err) 375 | } 376 | tx, err := db.Begin(false) 377 | if err != nil { 378 | t.Fatal(err) 379 | } 380 | c := tx.Bucket([]byte("widgets")).Cursor() 381 | 382 | if k, v := c.Last(); !bytes.Equal(k, []byte("foo")) { 383 | t.Fatalf("unexpected key: %v", k) 384 | } else if !bytes.Equal(v, []byte{0}) { 385 | t.Fatalf("unexpected value: %v", v) 386 | } 387 | 388 | if k, v := c.Prev(); !bytes.Equal(k, []byte("baz")) { 389 | t.Fatalf("unexpected key: %v", k) 390 | } else if !bytes.Equal(v, []byte{}) { 391 | t.Fatalf("unexpected value: %v", v) 392 | } 393 | 394 | if k, v := c.Prev(); !bytes.Equal(k, []byte("bar")) { 395 | t.Fatalf("unexpected key: %v", k) 396 | } else if !bytes.Equal(v, []byte{1}) { 397 | t.Fatalf("unexpected value: %v", v) 398 | } 399 | 400 | if k, v := c.Prev(); k != nil { 401 | t.Fatalf("expected nil key: %v", k) 402 | } else if v != nil { 403 | t.Fatalf("expected nil value: %v", v) 404 | } 405 | 406 | if k, v := c.Prev(); k != nil { 407 | t.Fatalf("expected nil key: %v", k) 408 | } else if v != nil { 409 | t.Fatalf("expected nil value: %v", v) 410 | } 411 | 412 | if err := tx.Rollback(); err != nil { 413 | t.Fatal(err) 414 | } 415 | } 416 | 417 | // Ensure that a Tx cursor can restart from the beginning. 418 | func TestCursor_Restart(t *testing.T) { 419 | db := MustOpenDB() 420 | defer db.MustClose() 421 | 422 | if err := db.Update(func(tx *bolt.Tx) error { 423 | b, err := tx.CreateBucket([]byte("widgets")) 424 | if err != nil { 425 | t.Fatal(err) 426 | } 427 | if err := b.Put([]byte("bar"), []byte{}); err != nil { 428 | t.Fatal(err) 429 | } 430 | if err := b.Put([]byte("foo"), []byte{}); err != nil { 431 | t.Fatal(err) 432 | } 433 | return nil 434 | }); err != nil { 435 | t.Fatal(err) 436 | } 437 | 438 | tx, err := db.Begin(false) 439 | if err != nil { 440 | t.Fatal(err) 441 | } 442 | c := tx.Bucket([]byte("widgets")).Cursor() 443 | 444 | if k, _ := c.First(); !bytes.Equal(k, []byte("bar")) { 445 | t.Fatalf("unexpected key: %v", k) 446 | } 447 | if k, _ := c.Next(); !bytes.Equal(k, []byte("foo")) { 448 | t.Fatalf("unexpected key: %v", k) 449 | } 450 | 451 | if k, _ := c.First(); !bytes.Equal(k, []byte("bar")) { 452 | t.Fatalf("unexpected key: %v", k) 453 | } 454 | if k, _ := c.Next(); !bytes.Equal(k, []byte("foo")) { 455 | t.Fatalf("unexpected key: %v", k) 456 | } 457 | 458 | if err := tx.Rollback(); err != nil { 459 | t.Fatal(err) 460 | } 461 | } 462 | 463 | // Ensure that a cursor can skip over empty pages that have been deleted. 464 | func TestCursor_First_EmptyPages(t *testing.T) { 465 | db := MustOpenDB() 466 | defer db.MustClose() 467 | 468 | // Create 1000 keys in the "widgets" bucket. 
469 | if err := db.Update(func(tx *bolt.Tx) error { 470 | b, err := tx.CreateBucket([]byte("widgets")) 471 | if err != nil { 472 | t.Fatal(err) 473 | } 474 | 475 | for i := 0; i < 1000; i++ { 476 | if err := b.Put(u64tob(uint64(i)), []byte{}); err != nil { 477 | t.Fatal(err) 478 | } 479 | } 480 | 481 | return nil 482 | }); err != nil { 483 | t.Fatal(err) 484 | } 485 | 486 | // Delete half the keys and then try to iterate. 487 | if err := db.Update(func(tx *bolt.Tx) error { 488 | b := tx.Bucket([]byte("widgets")) 489 | for i := 0; i < 600; i++ { 490 | if err := b.Delete(u64tob(uint64(i))); err != nil { 491 | t.Fatal(err) 492 | } 493 | } 494 | 495 | c := b.Cursor() 496 | var n int 497 | for k, _ := c.First(); k != nil; k, _ = c.Next() { 498 | n++ 499 | } 500 | if n != 400 { 501 | t.Fatalf("unexpected key count: %d", n) 502 | } 503 | 504 | return nil 505 | }); err != nil { 506 | t.Fatal(err) 507 | } 508 | } 509 | 510 | // Ensure that a Tx can iterate over all elements in a bucket. 511 | func TestCursor_QuickCheck(t *testing.T) { 512 | f := func(items testdata) bool { 513 | db := MustOpenDB() 514 | defer db.MustClose() 515 | 516 | // Bulk insert all values. 517 | tx, err := db.Begin(true) 518 | if err != nil { 519 | t.Fatal(err) 520 | } 521 | b, err := tx.CreateBucket([]byte("widgets")) 522 | if err != nil { 523 | t.Fatal(err) 524 | } 525 | for _, item := range items { 526 | if err := b.Put(item.Key, item.Value); err != nil { 527 | t.Fatal(err) 528 | } 529 | } 530 | if err := tx.Commit(); err != nil { 531 | t.Fatal(err) 532 | } 533 | 534 | // Sort test data. 535 | sort.Sort(items) 536 | 537 | // Iterate over all items and check consistency. 538 | var index = 0 539 | tx, err = db.Begin(false) 540 | if err != nil { 541 | t.Fatal(err) 542 | } 543 | 544 | c := tx.Bucket([]byte("widgets")).Cursor() 545 | for k, v := c.First(); k != nil && index < len(items); k, v = c.Next() { 546 | if !bytes.Equal(k, items[index].Key) { 547 | t.Fatalf("unexpected key: %v", k) 548 | } else if !bytes.Equal(v, items[index].Value) { 549 | t.Fatalf("unexpected value: %v", v) 550 | } 551 | index++ 552 | } 553 | if len(items) != index { 554 | t.Fatalf("unexpected item count: %v, expected %v", len(items), index) 555 | } 556 | 557 | if err := tx.Rollback(); err != nil { 558 | t.Fatal(err) 559 | } 560 | 561 | return true 562 | } 563 | if err := quick.Check(f, qconfig()); err != nil { 564 | t.Error(err) 565 | } 566 | } 567 | 568 | // Ensure that a transaction can iterate over all elements in a bucket in reverse. 569 | func TestCursor_QuickCheck_Reverse(t *testing.T) { 570 | f := func(items testdata) bool { 571 | db := MustOpenDB() 572 | defer db.MustClose() 573 | 574 | // Bulk insert all values. 575 | tx, err := db.Begin(true) 576 | if err != nil { 577 | t.Fatal(err) 578 | } 579 | b, err := tx.CreateBucket([]byte("widgets")) 580 | if err != nil { 581 | t.Fatal(err) 582 | } 583 | for _, item := range items { 584 | if err := b.Put(item.Key, item.Value); err != nil { 585 | t.Fatal(err) 586 | } 587 | } 588 | if err := tx.Commit(); err != nil { 589 | t.Fatal(err) 590 | } 591 | 592 | // Sort test data. 593 | sort.Sort(revtestdata(items)) 594 | 595 | // Iterate over all items and check consistency. 
596 | var index = 0 597 | tx, err = db.Begin(false) 598 | if err != nil { 599 | t.Fatal(err) 600 | } 601 | c := tx.Bucket([]byte("widgets")).Cursor() 602 | for k, v := c.Last(); k != nil && index < len(items); k, v = c.Prev() { 603 | if !bytes.Equal(k, items[index].Key) { 604 | t.Fatalf("unexpected key: %v", k) 605 | } else if !bytes.Equal(v, items[index].Value) { 606 | t.Fatalf("unexpected value: %v", v) 607 | } 608 | index++ 609 | } 610 | if len(items) != index { 611 | t.Fatalf("unexpected item count: %v, expected %v", len(items), index) 612 | } 613 | 614 | if err := tx.Rollback(); err != nil { 615 | t.Fatal(err) 616 | } 617 | 618 | return true 619 | } 620 | if err := quick.Check(f, qconfig()); err != nil { 621 | t.Error(err) 622 | } 623 | } 624 | 625 | // Ensure that a Tx cursor can iterate over subbuckets. 626 | func TestCursor_QuickCheck_BucketsOnly(t *testing.T) { 627 | db := MustOpenDB() 628 | defer db.MustClose() 629 | 630 | if err := db.Update(func(tx *bolt.Tx) error { 631 | b, err := tx.CreateBucket([]byte("widgets")) 632 | if err != nil { 633 | t.Fatal(err) 634 | } 635 | if _, err := b.CreateBucket([]byte("foo")); err != nil { 636 | t.Fatal(err) 637 | } 638 | if _, err := b.CreateBucket([]byte("bar")); err != nil { 639 | t.Fatal(err) 640 | } 641 | if _, err := b.CreateBucket([]byte("baz")); err != nil { 642 | t.Fatal(err) 643 | } 644 | return nil 645 | }); err != nil { 646 | t.Fatal(err) 647 | } 648 | 649 | if err := db.View(func(tx *bolt.Tx) error { 650 | var names []string 651 | c := tx.Bucket([]byte("widgets")).Cursor() 652 | for k, v := c.First(); k != nil; k, v = c.Next() { 653 | names = append(names, string(k)) 654 | if v != nil { 655 | t.Fatalf("unexpected value: %v", v) 656 | } 657 | } 658 | if !reflect.DeepEqual(names, []string{"bar", "baz", "foo"}) { 659 | t.Fatalf("unexpected names: %+v", names) 660 | } 661 | return nil 662 | }); err != nil { 663 | t.Fatal(err) 664 | } 665 | } 666 | 667 | // Ensure that a Tx cursor can reverse iterate over subbuckets. 668 | func TestCursor_QuickCheck_BucketsOnly_Reverse(t *testing.T) { 669 | db := MustOpenDB() 670 | defer db.MustClose() 671 | 672 | if err := db.Update(func(tx *bolt.Tx) error { 673 | b, err := tx.CreateBucket([]byte("widgets")) 674 | if err != nil { 675 | t.Fatal(err) 676 | } 677 | if _, err := b.CreateBucket([]byte("foo")); err != nil { 678 | t.Fatal(err) 679 | } 680 | if _, err := b.CreateBucket([]byte("bar")); err != nil { 681 | t.Fatal(err) 682 | } 683 | if _, err := b.CreateBucket([]byte("baz")); err != nil { 684 | t.Fatal(err) 685 | } 686 | return nil 687 | }); err != nil { 688 | t.Fatal(err) 689 | } 690 | 691 | if err := db.View(func(tx *bolt.Tx) error { 692 | var names []string 693 | c := tx.Bucket([]byte("widgets")).Cursor() 694 | for k, v := c.Last(); k != nil; k, v = c.Prev() { 695 | names = append(names, string(k)) 696 | if v != nil { 697 | t.Fatalf("unexpected value: %v", v) 698 | } 699 | } 700 | if !reflect.DeepEqual(names, []string{"foo", "baz", "bar"}) { 701 | t.Fatalf("unexpected names: %+v", names) 702 | } 703 | return nil 704 | }); err != nil { 705 | t.Fatal(err) 706 | } 707 | } 708 | 709 | func ExampleCursor() { 710 | // Open the database. 711 | db, err := bolt.Open(tempfile(), 0666, nil) 712 | if err != nil { 713 | log.Fatal(err) 714 | } 715 | defer os.Remove(db.Path()) 716 | 717 | // Start a read-write transaction. 718 | if err := db.Update(func(tx *bolt.Tx) error { 719 | // Create a new bucket. 
720 | b, err := tx.CreateBucket([]byte("animals")) 721 | if err != nil { 722 | return err 723 | } 724 | 725 | // Insert data into a bucket. 726 | if err := b.Put([]byte("dog"), []byte("fun")); err != nil { 727 | log.Fatal(err) 728 | } 729 | if err := b.Put([]byte("cat"), []byte("lame")); err != nil { 730 | log.Fatal(err) 731 | } 732 | if err := b.Put([]byte("liger"), []byte("awesome")); err != nil { 733 | log.Fatal(err) 734 | } 735 | 736 | // Create a cursor for iteration. 737 | c := b.Cursor() 738 | 739 | // Iterate over items in sorted key order. This starts from the 740 | // first key/value pair and updates the k/v variables to the 741 | // next key/value on each iteration. 742 | // 743 | // The loop finishes at the end of the cursor when a nil key is returned. 744 | for k, v := c.First(); k != nil; k, v = c.Next() { 745 | fmt.Printf("A %s is %s.\n", k, v) 746 | } 747 | 748 | return nil 749 | }); err != nil { 750 | log.Fatal(err) 751 | } 752 | 753 | if err := db.Close(); err != nil { 754 | log.Fatal(err) 755 | } 756 | 757 | // Output: 758 | // A cat is lame. 759 | // A dog is fun. 760 | // A liger is awesome. 761 | } 762 | 763 | func ExampleCursor_reverse() { 764 | // Open the database. 765 | db, err := bolt.Open(tempfile(), 0666, nil) 766 | if err != nil { 767 | log.Fatal(err) 768 | } 769 | defer os.Remove(db.Path()) 770 | 771 | // Start a read-write transaction. 772 | if err := db.Update(func(tx *bolt.Tx) error { 773 | // Create a new bucket. 774 | b, err := tx.CreateBucket([]byte("animals")) 775 | if err != nil { 776 | return err 777 | } 778 | 779 | // Insert data into a bucket. 780 | if err := b.Put([]byte("dog"), []byte("fun")); err != nil { 781 | log.Fatal(err) 782 | } 783 | if err := b.Put([]byte("cat"), []byte("lame")); err != nil { 784 | log.Fatal(err) 785 | } 786 | if err := b.Put([]byte("liger"), []byte("awesome")); err != nil { 787 | log.Fatal(err) 788 | } 789 | 790 | // Create a cursor for iteration. 791 | c := b.Cursor() 792 | 793 | // Iterate over items in reverse sorted key order. This starts 794 | // from the last key/value pair and updates the k/v variables to 795 | // the previous key/value on each iteration. 796 | // 797 | // The loop finishes at the beginning of the cursor when a nil key 798 | // is returned. 799 | for k, v := c.Last(); k != nil; k, v = c.Prev() { 800 | fmt.Printf("A %s is %s.\n", k, v) 801 | } 802 | 803 | return nil 804 | }); err != nil { 805 | log.Fatal(err) 806 | } 807 | 808 | // Close the database to release the file lock. 809 | if err := db.Close(); err != nil { 810 | log.Fatal(err) 811 | } 812 | 813 | // Output: 814 | // A liger is awesome. 815 | // A dog is fun. 816 | // A cat is lame. 817 | } 818 | -------------------------------------------------------------------------------- /db.go: -------------------------------------------------------------------------------- 1 | package bolt 2 | 3 | import ( 4 | "errors" 5 | "fmt" 6 | "hash/fnv" 7 | "log" 8 | "os" 9 | "runtime" 10 | "runtime/debug" 11 | "strings" 12 | "sync" 13 | "time" 14 | "unsafe" 15 | ) 16 | 17 | // The largest step that can be taken when remapping the mmap. 18 | const maxMmapStep = 1 << 30 // 1GB 19 | 20 | // The data file format version. 21 | const version = 2 22 | 23 | // Represents a marker value to indicate that a file is a Bolt DB. 24 | const magic uint32 = 0xED0CDAED 25 | 26 | // IgnoreNoSync specifies whether the NoSync field of a DB is ignored when 27 | // syncing changes to a file. 
This is required as some operating systems,
28 | // such as OpenBSD, do not have a unified buffer cache (UBC) and writes
29 | // must be synchronized using the msync(2) syscall.
30 | const IgnoreNoSync = runtime.GOOS == "openbsd"
31 |
32 | // Default values if not set in a DB instance.
33 | const (
34 | DefaultMaxBatchSize int = 1000
35 | DefaultMaxBatchDelay = 10 * time.Millisecond
36 | DefaultAllocSize = 16 * 1024 * 1024
37 | )
38 |
39 | // default page size for db is set to the OS page size.
40 | var defaultPageSize = os.Getpagesize()
41 |
42 | // DB represents a collection of buckets persisted to a file on disk.
43 | // All data access is performed through transactions which can be obtained through the DB.
44 | // All the functions on DB will return an ErrDatabaseNotOpen if accessed before Open() is called.
45 | type DB struct {
46 | // When enabled, the database will perform a Check() after every commit.
47 | // A panic is issued if the database is in an inconsistent state. This
48 | // flag has a large performance impact so it should only be used for
49 | // debugging purposes.
50 | StrictMode bool
51 |
52 | // Setting the NoSync flag will cause the database to skip fsync()
53 | // calls after each commit. This can be useful when bulk loading data
54 | // into a database and you can restart the bulk load in the event of
55 | // a system failure or database corruption. Do not set this flag for
56 | // normal use.
57 | //
58 | // If the package global IgnoreNoSync constant is true, this value is
59 | // ignored. See the comment on that constant for more details.
60 | //
61 | // THIS IS UNSAFE. PLEASE USE WITH CAUTION.
62 | NoSync bool
63 |
64 | // When true, skips the truncate call when growing the database.
65 | // Setting this to true is only safe on non-ext3/ext4 systems.
66 | // Skipping truncation avoids preallocation of hard drive space and
67 | // bypasses a truncate() and fsync() syscall on remapping.
68 | //
69 | // https://github.com/boltdb/bolt/issues/284
70 | NoGrowSync bool
71 |
72 | // If you want to read the entire database fast, you can set MmapFlags to
73 | // syscall.MAP_POPULATE on Linux 2.6.23+ for sequential read-ahead.
74 | MmapFlags int
75 |
76 | // MaxBatchSize is the maximum size of a batch. Default value is
77 | // copied from DefaultMaxBatchSize in Open.
78 | //
79 | // If <=0, disables batching.
80 | //
81 | // Do not change concurrently with calls to Batch.
82 | MaxBatchSize int
83 |
84 | // MaxBatchDelay is the maximum delay before a batch starts.
85 | // Default value is copied from DefaultMaxBatchDelay in Open.
86 | //
87 | // If <=0, effectively disables batching.
88 | //
89 | // Do not change concurrently with calls to Batch.
90 | MaxBatchDelay time.Duration
91 |
92 | // AllocSize is the amount of space allocated when the database
93 | // needs to create new pages. This is done to amortize the cost
94 | // of truncate() and fsync() when growing the data file.
95 | AllocSize int
96 |
97 | path string
98 | file *os.File
99 | lockfile *os.File // windows only
100 | dataref []byte // mmap'ed readonly, write throws SEGV
101 | data *[maxMapSize]byte
102 | datasz int
103 | filesz int // current on disk file size
104 | meta0 *meta
105 | meta1 *meta
106 | pageSize int
107 | opened bool
108 | rwtx *Tx
109 | txs []*Tx
110 | freelist *freelist
111 | stats Stats
112 |
113 | pagePool sync.Pool
114 |
115 | batchMu sync.Mutex
116 | batch *batch
117 |
118 | rwlock sync.Mutex // Allows only one writer at a time.
119 | metalock sync.Mutex // Protects meta page access.
120 | mmaplock sync.RWMutex // Protects mmap access during remapping. 121 | statlock sync.RWMutex // Protects stats access. 122 | 123 | ops struct { 124 | writeAt func(b []byte, off int64) (n int, err error) 125 | } 126 | 127 | // Read only mode. 128 | // When true, Update() and Begin(true) return ErrDatabaseReadOnly immediately. 129 | readOnly bool 130 | } 131 | 132 | // Path returns the path to currently open database file. 133 | func (db *DB) Path() string { 134 | return db.path 135 | } 136 | 137 | // GoString returns the Go string representation of the database. 138 | func (db *DB) GoString() string { 139 | return fmt.Sprintf("bolt.DB{path:%q}", db.path) 140 | } 141 | 142 | // String returns the string representation of the database. 143 | func (db *DB) String() string { 144 | return fmt.Sprintf("DB<%q>", db.path) 145 | } 146 | 147 | // Open creates and opens a database at the given path. 148 | // If the file does not exist then it will be created automatically. 149 | // Passing in nil options will cause Bolt to open the database with the default options. 150 | func Open(path string, mode os.FileMode, options *Options) (*DB, error) { 151 | var db = &DB{opened: true} 152 | 153 | // Set default options if no options are provided. 154 | if options == nil { 155 | options = DefaultOptions 156 | } 157 | db.NoGrowSync = options.NoGrowSync 158 | db.MmapFlags = options.MmapFlags 159 | 160 | // Set default values for later DB operations. 161 | db.MaxBatchSize = DefaultMaxBatchSize 162 | db.MaxBatchDelay = DefaultMaxBatchDelay 163 | db.AllocSize = DefaultAllocSize 164 | 165 | flag := os.O_RDWR 166 | if options.ReadOnly { 167 | flag = os.O_RDONLY 168 | db.readOnly = true 169 | } 170 | 171 | // Open data file and separate sync handler for metadata writes. 172 | db.path = path 173 | var err error 174 | if db.file, err = os.OpenFile(db.path, flag|os.O_CREATE, mode); err != nil { 175 | _ = db.close() 176 | return nil, err 177 | } 178 | 179 | // Lock file so that other processes using Bolt in read-write mode cannot 180 | // use the database at the same time. This would cause corruption since 181 | // the two processes would write meta pages and free pages separately. 182 | // The database file is locked exclusively (only one process can grab the lock) 183 | // if !options.ReadOnly. 184 | // The database file is locked using the shared lock (more than one process may 185 | // hold a lock at the same time) otherwise (options.ReadOnly is set). 186 | if err := flock(db, mode, !db.readOnly, options.Timeout); err != nil { 187 | _ = db.close() 188 | return nil, err 189 | } 190 | 191 | // Default values for test hooks 192 | db.ops.writeAt = db.file.WriteAt 193 | 194 | // Initialize the database if it doesn't exist. 195 | if info, err := db.file.Stat(); err != nil { 196 | return nil, err 197 | } else if info.Size() == 0 { 198 | // Initialize new files with meta pages. 199 | if err := db.init(); err != nil { 200 | return nil, err 201 | } 202 | } else { 203 | // Read the first meta page to determine the page size. 204 | var buf [0x1000]byte 205 | if _, err := db.file.ReadAt(buf[:], 0); err == nil { 206 | m := db.pageInBuffer(buf[:], 0).meta() 207 | if err := m.validate(); err != nil { 208 | // If we can't read the page size, we can assume it's the same 209 | // as the OS -- since that's how the page size was chosen in the 210 | // first place. 
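// (Concretely: a database created on a typical 4KB-page system and
// reopened on the same machine falls back harmlessly here, because
// os.Getpagesize() returns the original 4096 again.)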
211 | //
212 | // If the first page is invalid and this OS uses a different
213 | // page size than what the database was created with then we
214 | // are out of luck and cannot access the database.
215 | db.pageSize = os.Getpagesize()
216 | } else {
217 | db.pageSize = int(m.pageSize)
218 | }
219 | }
220 | }
221 |
222 | // Initialize page pool.
223 | db.pagePool = sync.Pool{
224 | New: func() interface{} {
225 | return make([]byte, db.pageSize)
226 | },
227 | }
228 |
229 | // Memory map the data file.
230 | if err := db.mmap(options.InitialMmapSize); err != nil {
231 | _ = db.close()
232 | return nil, err
233 | }
234 |
235 | // Read in the freelist.
236 | db.freelist = newFreelist()
237 | db.freelist.read(db.page(db.meta().freelist))
238 |
239 | // Mark the database as opened and return.
240 | return db, nil
241 | }
242 |
243 | // mmap opens the underlying memory-mapped file and initializes the meta references.
244 | // minsz is the minimum size that the new mmap can be.
245 | func (db *DB) mmap(minsz int) error {
246 | db.mmaplock.Lock()
247 | defer db.mmaplock.Unlock()
248 |
249 | info, err := db.file.Stat()
250 | if err != nil {
251 | return fmt.Errorf("mmap stat error: %s", err)
252 | } else if int(info.Size()) < db.pageSize*2 {
253 | return fmt.Errorf("file size too small")
254 | }
255 |
256 | // Ensure the size is at least the minimum size.
257 | var size = int(info.Size())
258 | if size < minsz {
259 | size = minsz
260 | }
261 | size, err = db.mmapSize(size)
262 | if err != nil {
263 | return err
264 | }
265 |
266 | // Dereference all mmap references before unmapping.
267 | if db.rwtx != nil {
268 | db.rwtx.root.dereference()
269 | }
270 |
271 | // Unmap existing data before continuing.
272 | if err := db.munmap(); err != nil {
273 | return err
274 | }
275 |
276 | // Memory-map the data file as a byte slice.
277 | if err := mmap(db, size); err != nil {
278 | return err
279 | }
280 |
281 | // Save references to the meta pages.
282 | db.meta0 = db.page(0).meta()
283 | db.meta1 = db.page(1).meta()
284 |
285 | // Validate the meta pages. We only return an error if both meta pages fail
286 | // validation, since meta0 failing validation means that it wasn't saved
287 | // properly -- but we can recover using meta1. And vice-versa.
288 | err0 := db.meta0.validate()
289 | err1 := db.meta1.validate()
290 | if err0 != nil && err1 != nil {
291 | return err0
292 | }
293 |
294 | return nil
295 | }
296 |
297 | // munmap unmaps the data file from memory.
298 | func (db *DB) munmap() error {
299 | if err := munmap(db); err != nil {
300 | return fmt.Errorf("unmap error: " + err.Error())
301 | }
302 | return nil
303 | }
304 |
305 | // mmapSize determines the appropriate size for the mmap given the current size
306 | // of the database. The minimum size is 32KB and doubles until it reaches 1GB.
307 | // Returns an error if the new mmap size is greater than the max allowed.
308 | func (db *DB) mmapSize(size int) (int, error) {
309 | // Double the size from 32KB until 1GB.
310 | for i := uint(15); i <= 30; i++ {
311 | if size <= 1<<i {
312 | return 1 << i, nil
313 | }
314 | }
315 |
316 | // Verify the requested size is not above the maximum allowed.
317 | if size > maxMapSize {
318 | return 0, fmt.Errorf("mmap too large")
319 | }
320 |
321 | // If larger than 1GB then grow by 1GB at a time.
322 | sz := int64(size)
323 | if remainder := sz % int64(maxMmapStep); remainder > 0 {
324 | sz += int64(maxMmapStep) - remainder
325 | }
326 |
327 | // Ensure that the mmap size is a multiple of the page size.
328 | // This should always be true since we're incrementing in MBs.
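// A worked example of the sizing rules above (assuming a 4096-byte page):
// a request of 80KB falls through the doubling loop and is rounded up to
// 128KB (1<<17), while a request of 1GB+1 is stepped up to 2GB, which is
// already page-aligned, so the fix-up below leaves it unchanged.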
329 | pageSize := int64(db.pageSize)
330 | if (sz % pageSize) != 0 {
331 | sz = ((sz / pageSize) + 1) * pageSize
332 | }
333 |
334 | // If we've exceeded the max size then only grow up to the max size.
335 | if sz > maxMapSize {
336 | sz = maxMapSize
337 | }
338 |
339 | return int(sz), nil
340 | }
341 |
342 | // init creates a new database file and initializes its meta pages.
343 | func (db *DB) init() error {
344 | // Set the page size to the OS page size.
345 | db.pageSize = os.Getpagesize()
346 |
347 | // Create two meta pages on a buffer.
348 | buf := make([]byte, db.pageSize*4)
349 | for i := 0; i < 2; i++ {
350 | p := db.pageInBuffer(buf[:], pgid(i))
351 | p.id = pgid(i)
352 | p.flags = metaPageFlag
353 |
354 | // Initialize the meta page.
355 | m := p.meta()
356 | m.magic = magic
357 | m.version = version
358 | m.pageSize = uint32(db.pageSize)
359 | m.freelist = 2
360 | m.root = bucket{root: 3}
361 | m.pgid = 4
362 | m.txid = txid(i)
363 | m.checksum = m.sum64()
364 | }
365 |
366 | // Write an empty freelist at page 2.
367 | p := db.pageInBuffer(buf[:], pgid(2))
368 | p.id = pgid(2)
369 | p.flags = freelistPageFlag
370 | p.count = 0
371 |
372 | // Write an empty leaf page at page 3.
373 | p = db.pageInBuffer(buf[:], pgid(3))
374 | p.id = pgid(3)
375 | p.flags = leafPageFlag
376 | p.count = 0
377 |
378 | // Write the buffer to our data file.
379 | if _, err := db.ops.writeAt(buf, 0); err != nil {
380 | return err
381 | }
382 | if err := fdatasync(db); err != nil {
383 | return err
384 | }
385 |
386 | return nil
387 | }
388 |
389 | // Close releases all database resources.
390 | // All transactions must be closed before closing the database.
391 | func (db *DB) Close() error {
392 | db.rwlock.Lock()
393 | defer db.rwlock.Unlock()
394 |
395 | db.metalock.Lock()
396 | defer db.metalock.Unlock()
397 |
398 | db.mmaplock.RLock()
399 | defer db.mmaplock.RUnlock()
400 |
401 | return db.close()
402 | }
403 |
404 | func (db *DB) close() error {
405 | if !db.opened {
406 | return nil
407 | }
408 |
409 | db.opened = false
410 |
411 | db.freelist = nil
412 |
413 | // Clear ops.
414 | db.ops.writeAt = nil
415 |
416 | // Close the mmap.
417 | if err := db.munmap(); err != nil {
418 | return err
419 | }
420 |
421 | // Close file handles.
422 | if db.file != nil {
423 | // No need to unlock read-only file.
424 | if !db.readOnly {
425 | // Unlock the file.
426 | if err := funlock(db); err != nil {
427 | log.Printf("bolt.Close(): funlock error: %s", err)
428 | }
429 | }
430 |
431 | // Close the file descriptor.
432 | if err := db.file.Close(); err != nil {
433 | return fmt.Errorf("db file close: %s", err)
434 | }
435 | db.file = nil
436 | }
437 |
438 | db.path = ""
439 | return nil
440 | }
441 |
442 | // Begin starts a new transaction.
443 | // Multiple read-only transactions can be used concurrently but only one
444 | // write transaction can be used at a time. Starting multiple write transactions
445 | // will cause the calls to block and be serialized until the current write
446 | // transaction finishes.
447 | //
448 | // Transactions should not be dependent on one another. Opening a read
449 | // transaction and a write transaction in the same goroutine can cause the
450 | // writer to deadlock because the database periodically needs to re-mmap itself
451 | // as it grows and it cannot do that while a read transaction is open.
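// The deadlock-prone shape looks like this (an illustrative sketch of what
// NOT to do, all in one goroutine; bucket and key names are hypothetical):
//
//	tx, _ := db.Begin(false) // long-lived read transaction...
//	err := db.Update(func(tx2 *bolt.Tx) error { // ...this write may block forever on a remap
//		return tx2.Bucket([]byte("b")).Put([]byte("k"), []byte("v"))
//	})
//	_ = tx.Rollback() // never reached if the writer blocked above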
452 | // 453 | // If a long running read transaction (for example, a snapshot transaction) is 454 | // needed, you might want to set DB.InitialMmapSize to a large enough value 455 | // to avoid potential blocking of write transaction. 456 | // 457 | // IMPORTANT: You must close read-only transactions after you are finished or 458 | // else the database will not reclaim old pages. 459 | func (db *DB) Begin(writable bool) (*Tx, error) { 460 | if writable { 461 | return db.beginRWTx() 462 | } 463 | return db.beginTx() 464 | } 465 | 466 | func (db *DB) beginTx() (*Tx, error) { 467 | // Lock the meta pages while we initialize the transaction. We obtain 468 | // the meta lock before the mmap lock because that's the order that the 469 | // write transaction will obtain them. 470 | db.metalock.Lock() 471 | 472 | // Obtain a read-only lock on the mmap. When the mmap is remapped it will 473 | // obtain a write lock so all transactions must finish before it can be 474 | // remapped. 475 | db.mmaplock.RLock() 476 | 477 | // Exit if the database is not open yet. 478 | if !db.opened { 479 | db.mmaplock.RUnlock() 480 | db.metalock.Unlock() 481 | return nil, ErrDatabaseNotOpen 482 | } 483 | 484 | // Create a transaction associated with the database. 485 | t := &Tx{} 486 | t.init(db) 487 | 488 | // Keep track of transaction until it closes. 489 | db.txs = append(db.txs, t) 490 | n := len(db.txs) 491 | 492 | // Unlock the meta pages. 493 | db.metalock.Unlock() 494 | 495 | // Update the transaction stats. 496 | db.statlock.Lock() 497 | db.stats.TxN++ 498 | db.stats.OpenTxN = n 499 | db.statlock.Unlock() 500 | 501 | return t, nil 502 | } 503 | 504 | func (db *DB) beginRWTx() (*Tx, error) { 505 | // If the database was opened with Options.ReadOnly, return an error. 506 | if db.readOnly { 507 | return nil, ErrDatabaseReadOnly 508 | } 509 | 510 | // Obtain writer lock. This is released by the transaction when it closes. 511 | // This enforces only one writer transaction at a time. 512 | db.rwlock.Lock() 513 | 514 | // Once we have the writer lock then we can lock the meta pages so that 515 | // we can set up the transaction. 516 | db.metalock.Lock() 517 | defer db.metalock.Unlock() 518 | 519 | // Exit if the database is not open yet. 520 | if !db.opened { 521 | db.rwlock.Unlock() 522 | return nil, ErrDatabaseNotOpen 523 | } 524 | 525 | // Create a transaction associated with the database. 526 | t := &Tx{writable: true} 527 | t.init(db) 528 | db.rwtx = t 529 | 530 | // Free any pages associated with closed read-only transactions. 531 | var minid txid = 0xFFFFFFFFFFFFFFFF 532 | for _, t := range db.txs { 533 | if t.meta.txid < minid { 534 | minid = t.meta.txid 535 | } 536 | } 537 | if minid > 0 { 538 | db.freelist.release(minid - 1) 539 | } 540 | 541 | return t, nil 542 | } 543 | 544 | // removeTx removes a transaction from the database. 545 | func (db *DB) removeTx(tx *Tx) { 546 | // Release the read lock on the mmap. 547 | db.mmaplock.RUnlock() 548 | 549 | // Use the meta lock to restrict access to the DB object. 550 | db.metalock.Lock() 551 | 552 | // Remove the transaction. 553 | for i, t := range db.txs { 554 | if t == tx { 555 | last := len(db.txs) - 1 556 | db.txs[i] = db.txs[last] 557 | db.txs[last] = nil 558 | db.txs = db.txs[:last] 559 | break 560 | } 561 | } 562 | n := len(db.txs) 563 | 564 | // Unlock the meta pages. 565 | db.metalock.Unlock() 566 | 567 | // Merge statistics. 
568 | db.statlock.Lock()
569 | db.stats.OpenTxN = n
570 | db.stats.TxStats.add(&tx.stats)
571 | db.statlock.Unlock()
572 | }
573 |
574 | // Update executes a function within the context of a read-write managed transaction.
575 | // If no error is returned from the function then the transaction is committed.
576 | // If an error is returned then the entire transaction is rolled back.
577 | // Any error that is returned from the function or returned from the commit is
578 | // returned from the Update() method.
579 | //
580 | // Attempting to manually commit or rollback within the function will cause a panic.
581 | func (db *DB) Update(fn func(*Tx) error) error {
582 | t, err := db.Begin(true)
583 | if err != nil {
584 | return err
585 | }
586 |
587 | // Make sure the transaction rolls back in the event of a panic.
588 | defer func() {
589 | if t.db != nil {
590 | t.rollback()
591 | }
592 | }()
593 |
594 | // Mark as a managed tx so that the inner function cannot manually commit.
595 | t.managed = true
596 |
597 | // If an error is returned from the function then roll back and return the error.
598 | err = fn(t)
599 | t.managed = false
600 | if err != nil {
601 | _ = t.Rollback()
602 | return err
603 | }
604 |
605 | return t.Commit()
606 | }
607 |
608 | // View executes a function within the context of a managed read-only transaction.
609 | // Any error that is returned from the function is returned from the View() method.
610 | //
611 | // Attempting to manually rollback within the function will cause a panic.
612 | func (db *DB) View(fn func(*Tx) error) error {
613 | t, err := db.Begin(false)
614 | if err != nil {
615 | return err
616 | }
617 |
618 | // Make sure the transaction rolls back in the event of a panic.
619 | defer func() {
620 | if t.db != nil {
621 | t.rollback()
622 | }
623 | }()
624 |
625 | // Mark as a managed tx so that the inner function cannot manually rollback.
626 | t.managed = true
627 |
628 | // If an error is returned from the function then pass it through.
629 | err = fn(t)
630 | t.managed = false
631 | if err != nil {
632 | _ = t.Rollback()
633 | return err
634 | }
635 |
636 | if err := t.Rollback(); err != nil {
637 | return err
638 | }
639 |
640 | return nil
641 | }
642 |
643 | // Batch calls fn as part of a batch. It behaves similarly to Update,
644 | // except:
645 | //
646 | // 1. concurrent Batch calls can be combined into a single Bolt
647 | // transaction.
648 | //
649 | // 2. the function passed to Batch may be called multiple times,
650 | // regardless of whether it returns an error or not.
651 | //
652 | // This means that Batch function side effects must be idempotent and
653 | // take permanent effect only after a successful return is seen by
654 | // the caller.
655 | //
656 | // The maximum batch size and delay can be adjusted with DB.MaxBatchSize
657 | // and DB.MaxBatchDelay, respectively.
658 | //
659 | // Batch is only useful when there are multiple goroutines calling it.
660 | func (db *DB) Batch(fn func(*Tx) error) error {
661 | errCh := make(chan error, 1)
662 |
663 | db.batchMu.Lock()
664 | if db.batch == nil || len(db.batch.calls) >= db.MaxBatchSize {
665 | // There is no existing batch, or the existing batch is full; start a new one.
666 | db.batch = &batch{ 667 | db: db, 668 | } 669 | db.batch.timer = time.AfterFunc(db.MaxBatchDelay, db.batch.trigger) 670 | } 671 | db.batch.calls = append(db.batch.calls, call{fn: fn, err: errCh}) 672 | if len(db.batch.calls) >= db.MaxBatchSize { 673 | // wake up batch, it's ready to run 674 | go db.batch.trigger() 675 | } 676 | db.batchMu.Unlock() 677 | 678 | err := <-errCh 679 | if err == trySolo { 680 | err = db.Update(fn) 681 | } 682 | return err 683 | } 684 | 685 | type call struct { 686 | fn func(*Tx) error 687 | err chan<- error 688 | } 689 | 690 | type batch struct { 691 | db *DB 692 | timer *time.Timer 693 | start sync.Once 694 | calls []call 695 | } 696 | 697 | // trigger runs the batch if it hasn't already been run. 698 | func (b *batch) trigger() { 699 | b.start.Do(b.run) 700 | } 701 | 702 | // run performs the transactions in the batch and communicates results 703 | // back to DB.Batch. 704 | func (b *batch) run() { 705 | b.db.batchMu.Lock() 706 | b.timer.Stop() 707 | // Make sure no new work is added to this batch, but don't break 708 | // other batches. 709 | if b.db.batch == b { 710 | b.db.batch = nil 711 | } 712 | b.db.batchMu.Unlock() 713 | 714 | retry: 715 | for len(b.calls) > 0 { 716 | var failIdx = -1 717 | err := b.db.Update(func(tx *Tx) error { 718 | for i, c := range b.calls { 719 | if err := safelyCall(c.fn, tx); err != nil { 720 | failIdx = i 721 | return err 722 | } 723 | } 724 | return nil 725 | }) 726 | 727 | if failIdx >= 0 { 728 | // take the failing transaction out of the batch. it's 729 | // safe to shorten b.calls here because db.batch no longer 730 | // points to us, and we hold the mutex anyway. 731 | c := b.calls[failIdx] 732 | b.calls[failIdx], b.calls = b.calls[len(b.calls)-1], b.calls[:len(b.calls)-1] 733 | // tell the submitter re-run it solo, continue with the rest of the batch 734 | c.err <- trySolo 735 | continue retry 736 | } 737 | 738 | // pass success, or bolt internal errors, to all callers 739 | for _, c := range b.calls { 740 | c.err <- err 741 | } 742 | break retry 743 | } 744 | } 745 | 746 | // trySolo is a special sentinel error value used for signaling that a 747 | // transaction function should be re-run. It should never be seen by 748 | // callers. 749 | var trySolo = errors.New("batch function returned an error and should be re-run solo") 750 | 751 | type panicked struct { 752 | reason interface{} 753 | } 754 | 755 | func (p panicked) Error() string { 756 | if err, ok := p.reason.(error); ok { 757 | return err.Error() 758 | } 759 | return fmt.Sprintf("panic: %v", p.reason) 760 | } 761 | 762 | func safelyCall(fn func(*Tx) error, tx *Tx) (err error) { 763 | defer func() { 764 | if p := recover(); p != nil { 765 | err = panicked{p} 766 | } 767 | }() 768 | return fn(tx) 769 | } 770 | 771 | // Sync executes fdatasync() against the database file handle. 772 | // 773 | // This is not necessary under normal operation, however, if you use NoSync 774 | // then it allows you to force the database file to sync against the disk. 775 | func (db *DB) Sync() error { return fdatasync(db) } 776 | 777 | // Stats retrieves ongoing performance stats for the database. 778 | // This is only updated when a transaction closes. 779 | func (db *DB) Stats() Stats { 780 | db.statlock.RLock() 781 | defer db.statlock.RUnlock() 782 | return db.stats 783 | } 784 | 785 | // This is for internal access to the raw data bytes from the C cursor, use 786 | // carefully, or not at all. 
787 | func (db *DB) Info() *Info { 788 | return &Info{uintptr(unsafe.Pointer(&db.data[0])), db.pageSize} 789 | } 790 | 791 | // page retrieves a page reference from the mmap based on the current page size. 792 | func (db *DB) page(id pgid) *page { 793 | pos := id * pgid(db.pageSize) 794 | return (*page)(unsafe.Pointer(&db.data[pos])) 795 | } 796 | 797 | // pageInBuffer retrieves a page reference from a given byte array based on the current page size. 798 | func (db *DB) pageInBuffer(b []byte, id pgid) *page { 799 | return (*page)(unsafe.Pointer(&b[id*pgid(db.pageSize)])) 800 | } 801 | 802 | // meta retrieves the current meta page reference. 803 | func (db *DB) meta() *meta { 804 | // We have to return the meta with the highest txid which doesn't fail 805 | // validation. Otherwise, we can cause errors when in fact the database is 806 | // in a consistent state. metaA is the one with the higher txid. 807 | metaA := db.meta0 808 | metaB := db.meta1 809 | if db.meta1.txid > db.meta0.txid { 810 | metaA = db.meta1 811 | metaB = db.meta0 812 | } 813 | 814 | // Use higher meta page if valid. Otherwise fallback to previous, if valid. 815 | if err := metaA.validate(); err == nil { 816 | return metaA 817 | } else if err := metaB.validate(); err == nil { 818 | return metaB 819 | } 820 | 821 | // This should never be reached, because both meta1 and meta0 were validated 822 | // on mmap() and we do fsync() on every write. 823 | panic("bolt.DB.meta(): invalid meta pages") 824 | } 825 | 826 | // allocate returns a contiguous block of memory starting at a given page. 827 | func (db *DB) allocate(count int) (*page, error) { 828 | // Allocate a temporary buffer for the page. 829 | var buf []byte 830 | if count == 1 { 831 | buf = db.pagePool.Get().([]byte) 832 | } else { 833 | buf = make([]byte, count*db.pageSize) 834 | } 835 | p := (*page)(unsafe.Pointer(&buf[0])) 836 | p.overflow = uint32(count - 1) 837 | 838 | // Use pages from the freelist if they are available. 839 | if p.id = db.freelist.allocate(count); p.id != 0 { 840 | return p, nil 841 | } 842 | 843 | // Resize mmap() if we're at the end. 844 | p.id = db.rwtx.meta.pgid 845 | var minsz = int((p.id+pgid(count))+1) * db.pageSize 846 | if minsz >= db.datasz { 847 | if err := db.mmap(minsz); err != nil { 848 | return nil, fmt.Errorf("mmap allocate error: %s", err) 849 | } 850 | } 851 | 852 | // Move the page id high water mark. 853 | db.rwtx.meta.pgid += pgid(count) 854 | 855 | return p, nil 856 | } 857 | 858 | // grow grows the size of the database to the given sz. 859 | func (db *DB) grow(sz int) error { 860 | // Ignore if the new size is less than available file size. 861 | if sz <= db.filesz { 862 | return nil 863 | } 864 | 865 | // If the data is smaller than the alloc size then only allocate what's needed. 866 | // Once it goes over the allocation size then allocate in chunks. 867 | if db.datasz < db.AllocSize { 868 | sz = db.datasz 869 | } else { 870 | sz += db.AllocSize 871 | } 872 | 873 | // Truncate and fsync to ensure file size metadata is flushed. 
874 | // https://github.com/boltdb/bolt/issues/284 875 | if !db.NoGrowSync && !db.readOnly { 876 | if runtime.GOOS != "windows" { 877 | if err := db.file.Truncate(int64(sz)); err != nil { 878 | return fmt.Errorf("file resize error: %s", err) 879 | } 880 | } 881 | if err := db.file.Sync(); err != nil { 882 | return fmt.Errorf("file sync error: %s", err) 883 | } 884 | } 885 | 886 | db.filesz = sz 887 | return nil 888 | } 889 | // IsReadOnly reports whether the database was opened in read-only mode. 890 | func (db *DB) IsReadOnly() bool { 891 | return db.readOnly 892 | } 893 | 894 | // Options represents the options that can be set when opening a database. 895 | type Options struct { 896 | // Timeout is the amount of time to wait to obtain a file lock. 897 | // When set to zero it will wait indefinitely. This option is only 898 | // available on Darwin and Linux. 899 | Timeout time.Duration 900 | 901 | // Sets the DB.NoGrowSync flag before memory mapping the file. 902 | NoGrowSync bool 903 | 904 | // Open database in read-only mode. Uses flock(..., LOCK_SH | LOCK_NB) to 905 | // grab a shared lock (UNIX). 906 | ReadOnly bool 907 | 908 | // Sets the DB.MmapFlags flag before memory mapping the file. 909 | MmapFlags int 910 | 911 | // InitialMmapSize is the initial mmap size of the database 912 | // in bytes. Read transactions won't block write transactions 913 | // if the InitialMmapSize is large enough to hold the database mmap 914 | // size. (See DB.Begin for more information.) 915 | // 916 | // If <=0, the initial map size is 0. 917 | // If InitialMmapSize is smaller than the previous database size, 918 | // it has no effect. 919 | InitialMmapSize int 920 | } 921 | 922 | // DefaultOptions represents the options used if nil options are passed into Open(). 923 | // No timeout is used, which will cause Bolt to wait indefinitely for a lock. 924 | var DefaultOptions = &Options{ 925 | Timeout: 0, 926 | NoGrowSync: false, 927 | } 928 | 929 | // Stats represents statistics about the database. 930 | type Stats struct { 931 | // Freelist stats 932 | FreePageN int // total number of free pages on the freelist 933 | PendingPageN int // total number of pending pages on the freelist 934 | FreeAlloc int // total bytes allocated in free pages 935 | FreelistInuse int // total bytes used by the freelist 936 | 937 | // Transaction stats 938 | TxN int // total number of started read transactions 939 | OpenTxN int // number of currently open read transactions 940 | 941 | TxStats TxStats // global, ongoing stats. 942 | } 943 | 944 | // Sub calculates and returns the difference between two sets of database stats. 945 | // This is useful when obtaining stats at two different points in time and 946 | // you need the performance counters that occurred within that time span.
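// A sketch of the intended pattern (the interval is illustrative):
//
//	prev := db.Stats()
//	time.Sleep(10 * time.Second)
//	cur := db.Stats()
//	diff := cur.Sub(&prev)
//	// diff.TxN is the number of read transactions started in the interval.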
947 | func (s *Stats) Sub(other *Stats) Stats { 948 | if other == nil { 949 | return *s 950 | } 951 | var diff Stats 952 | diff.FreePageN = s.FreePageN 953 | diff.PendingPageN = s.PendingPageN 954 | diff.FreeAlloc = s.FreeAlloc 955 | diff.FreelistInuse = s.FreelistInuse 956 | diff.TxN = s.TxN - other.TxN 957 | diff.TxStats = s.TxStats.Sub(&other.TxStats) 958 | return diff 959 | } 960 | 961 | func (s *Stats) add(other *Stats) { 962 | s.TxStats.add(&other.TxStats) 963 | } 964 | 965 | type Info struct { 966 | Data uintptr 967 | PageSize int 968 | } 969 | 970 | type meta struct { 971 | magic uint32 972 | version uint32 973 | pageSize uint32 974 | flags uint32 975 | root bucket 976 | freelist pgid 977 | pgid pgid 978 | txid txid 979 | checksum uint64 980 | } 981 | 982 | // validate checks the marker bytes and version of the meta page to ensure it matches this binary. 983 | func (m *meta) validate() error { 984 | if m.magic != magic { 985 | return ErrInvalid 986 | } else if m.version != version { 987 | return ErrVersionMismatch 988 | } else if m.checksum != 0 && m.checksum != m.sum64() { 989 | return ErrChecksum 990 | } 991 | return nil 992 | } 993 | 994 | // copy copies one meta object to another. 995 | func (m *meta) copy(dest *meta) { 996 | *dest = *m 997 | } 998 | 999 | // write writes the meta onto a page. 1000 | func (m *meta) write(p *page) { 1001 | if m.root.root >= m.pgid { 1002 | panic(fmt.Sprintf("root bucket pgid (%d) above high water mark (%d)", m.root.root, m.pgid)) 1003 | } else if m.freelist >= m.pgid { 1004 | panic(fmt.Sprintf("freelist pgid (%d) above high water mark (%d)", m.freelist, m.pgid)) 1005 | } 1006 | 1007 | // Page id is either going to be 0 or 1 which we can determine by the transaction ID. 1008 | p.id = pgid(m.txid % 2) 1009 | p.flags |= metaPageFlag 1010 | 1011 | // Calculate the checksum. 1012 | m.checksum = m.sum64() 1013 | 1014 | m.copy(p.meta()) 1015 | } 1016 | 1017 | // generates the checksum for the meta. 1018 | func (m *meta) sum64() uint64 { 1019 | var h = fnv.New64a() 1020 | _, _ = h.Write((*[unsafe.Offsetof(meta{}.checksum)]byte)(unsafe.Pointer(m))[:]) 1021 | return h.Sum64() 1022 | } 1023 | 1024 | // _assert will panic with a given formatted message if the given condition is false. 1025 | func _assert(condition bool, msg string, v ...interface{}) { 1026 | if !condition { 1027 | panic(fmt.Sprintf("assertion failed: "+msg, v...)) 1028 | } 1029 | } 1030 | 1031 | func warn(v ...interface{}) { fmt.Fprintln(os.Stderr, v...) } 1032 | func warnf(msg string, v ...interface{}) { fmt.Fprintf(os.Stderr, msg+"\n", v...) } 1033 | 1034 | func printstack() { 1035 | stack := strings.Join(strings.Split(string(debug.Stack()), "\n")[2:], "\n") 1036 | fmt.Fprintln(os.Stderr, stack) 1037 | } 1038 | -------------------------------------------------------------------------------- /doc.go: -------------------------------------------------------------------------------- 1 | /* 2 | Package bolt implements a low-level key/value store in pure Go. It supports 3 | fully serializable transactions, ACID semantics, and lock-free MVCC with 4 | multiple readers and a single writer. Bolt can be used for projects that 5 | want a simple data store without the need to add large dependencies such as 6 | Postgres or MySQL. 7 | 8 | Bolt is a single-level, zero-copy, B+tree data store. This means that Bolt is 9 | optimized for fast read access and does not require recovery in the event of a 10 | system crash. 
Transactions which have not finished committing will simply be 11 | rolled back in the event of a crash. 12 | 13 | The design of Bolt is based on Howard Chu's LMDB database project. 14 | 15 | Bolt currently works on Windows, Mac OS X, and Linux. 16 | 17 | 18 | Basics 19 | 20 | There are only a few types in Bolt: DB, Bucket, Tx, and Cursor. The DB is 21 | a collection of buckets and is represented by a single file on disk. A bucket is 22 | a collection of unique keys that are associated with values. 23 | 24 | Transactions provide either read-only or read-write access to the database. 25 | Read-only transactions can retrieve key/value pairs and can use Cursors to 26 | iterate over the dataset sequentially. Read-write transactions can create and 27 | delete buckets and can insert and remove keys. Only one read-write transaction 28 | is allowed at a time. 29 | 30 | 31 | Caveats 32 | 33 | The database uses a read-only, memory-mapped data file to ensure that 34 | applications cannot corrupt the database, however, this means that keys and 35 | values returned from Bolt cannot be changed. Writing to a read-only byte slice 36 | will cause Go to panic. 37 | 38 | Keys and values retrieved from the database are only valid for the life of 39 | the transaction. When used outside the transaction, these byte slices can 40 | point to different data or can point to invalid memory which will cause a panic. 41 | 42 | 43 | */ 44 | package bolt 45 | -------------------------------------------------------------------------------- /errors.go: -------------------------------------------------------------------------------- 1 | package bolt 2 | 3 | import "errors" 4 | 5 | // These errors can be returned when opening or calling methods on a DB. 6 | var ( 7 | // ErrDatabaseNotOpen is returned when a DB instance is accessed before it 8 | // is opened or after it is closed. 9 | ErrDatabaseNotOpen = errors.New("database not open") 10 | 11 | // ErrDatabaseOpen is returned when opening a database that is 12 | // already open. 13 | ErrDatabaseOpen = errors.New("database already open") 14 | 15 | // ErrInvalid is returned when both meta pages on a database are invalid. 16 | // This typically occurs when a file is not a bolt database. 17 | ErrInvalid = errors.New("invalid database") 18 | 19 | // ErrVersionMismatch is returned when the data file was created with a 20 | // different version of Bolt. 21 | ErrVersionMismatch = errors.New("version mismatch") 22 | 23 | // ErrChecksum is returned when either meta page checksum does not match. 24 | ErrChecksum = errors.New("checksum error") 25 | 26 | // ErrTimeout is returned when a database cannot obtain an exclusive lock 27 | // on the data file after the timeout passed to Open(). 28 | ErrTimeout = errors.New("timeout") 29 | ) 30 | 31 | // These errors can occur when beginning or committing a Tx. 32 | var ( 33 | // ErrTxNotWritable is returned when performing a write operation on a 34 | // read-only transaction. 35 | ErrTxNotWritable = errors.New("tx not writable") 36 | 37 | // ErrTxClosed is returned when committing or rolling back a transaction 38 | // that has already been committed or rolled back. 39 | ErrTxClosed = errors.New("tx closed") 40 | 41 | // ErrDatabaseReadOnly is returned when a mutating transaction is started on a 42 | // read-only database. 43 | ErrDatabaseReadOnly = errors.New("database is in read-only mode") 44 | ) 45 | 46 | // These errors can occur when putting or deleting a value or a bucket. 
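// For example (a sketch from a client's point of view; the bucket name is
// illustrative):
//
//	err := db.Update(func(tx *bolt.Tx) error {
//		b := tx.Bucket([]byte("widgets"))
//		if b == nil {
//			return bolt.ErrBucketNotFound
//		}
//		return b.Put([]byte(""), nil) // fails with ErrKeyRequired
//	})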
47 | var ( 48 | // ErrBucketNotFound is returned when trying to access a bucket that has 49 | // not been created yet. 50 | ErrBucketNotFound = errors.New("bucket not found") 51 | 52 | // ErrBucketExists is returned when creating a bucket that already exists. 53 | ErrBucketExists = errors.New("bucket already exists") 54 | 55 | // ErrBucketNameRequired is returned when creating a bucket with a blank name. 56 | ErrBucketNameRequired = errors.New("bucket name required") 57 | 58 | // ErrKeyRequired is returned when inserting a zero-length key. 59 | ErrKeyRequired = errors.New("key required") 60 | 61 | // ErrKeyTooLarge is returned when inserting a key that is larger than MaxKeySize. 62 | ErrKeyTooLarge = errors.New("key too large") 63 | 64 | // ErrValueTooLarge is returned when inserting a value that is larger than MaxValueSize. 65 | ErrValueTooLarge = errors.New("value too large") 66 | 67 | // ErrIncompatibleValue is returned when trying to create or delete a bucket 68 | // on an existing non-bucket key or when trying to create or delete a 69 | // non-bucket key on an existing bucket key. 70 | ErrIncompatibleValue = errors.New("incompatible value") 71 | ) 72 | -------------------------------------------------------------------------------- /freelist.go: -------------------------------------------------------------------------------- 1 | package bolt 2 | 3 | import ( 4 | "fmt" 5 | "sort" 6 | "unsafe" 7 | ) 8 | 9 | // freelist represents a list of all pages that are available for allocation. 10 | // It also tracks pages that have been freed but are still in use by open transactions. 11 | type freelist struct { 12 | ids []pgid // all free and available page ids. 13 | pending map[txid][]pgid // mapping of soon-to-be free page ids by tx. 14 | cache map[pgid]bool // fast lookup of all free and pending page ids. 15 | } 16 | 17 | // newFreelist returns an empty, initialized freelist. 18 | func newFreelist() *freelist { 19 | return &freelist{ 20 | pending: make(map[txid][]pgid), 21 | cache: make(map[pgid]bool), 22 | } 23 | } 24 | 25 | // size returns the size of the page after serialization. 26 | func (f *freelist) size() int { 27 | n := f.count() 28 | if n >= 0xFFFF { 29 | // The first element will be used to store the count. See freelist.write. 30 | n++ 31 | } 32 | return pageHeaderSize + (int(unsafe.Sizeof(pgid(0))) * n) 33 | } 34 | 35 | // count returns count of pages on the freelist 36 | func (f *freelist) count() int { 37 | return f.free_count() + f.pending_count() 38 | } 39 | 40 | // free_count returns count of free pages 41 | func (f *freelist) free_count() int { 42 | return len(f.ids) 43 | } 44 | 45 | // pending_count returns count of pending pages 46 | func (f *freelist) pending_count() int { 47 | var count int 48 | for _, list := range f.pending { 49 | count += len(list) 50 | } 51 | return count 52 | } 53 | 54 | // copyall copies into dst a list of all free ids and all pending ids in one sorted list. 55 | // f.count returns the minimum length required for dst. 56 | func (f *freelist) copyall(dst []pgid) { 57 | m := make(pgids, 0, f.pending_count()) 58 | for _, list := range f.pending { 59 | m = append(m, list...) 60 | } 61 | sort.Sort(m) 62 | mergepgids(dst, f.ids, m) 63 | } 64 | 65 | // allocate returns the starting page id of a contiguous list of pages of a given size. 66 | // If a contiguous block cannot be found then 0 is returned.
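// For example, with ids = [3 4 5 6 7 9 12 13 18], allocate(3) returns 3
// (consuming the run 3-5) and leaves [6 7 9 12 13 18]; another allocate(3)
// then returns 0, since no remaining run is three pages long. (These values
// mirror TestFreelist_allocate in freelist_test.go.)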
67 | func (f *freelist) allocate(n int) pgid { 68 | if len(f.ids) == 0 { 69 | return 0 70 | } 71 | 72 | var initial, previd pgid 73 | for i, id := range f.ids { 74 | if id <= 1 { 75 | panic(fmt.Sprintf("invalid page allocation: %d", id)) 76 | } 77 | 78 | // Reset initial page if this is not contiguous. 79 | if previd == 0 || id-previd != 1 { 80 | initial = id 81 | } 82 | 83 | // If we found a contiguous block then remove it and return it. 84 | if (id-initial)+1 == pgid(n) { 85 | // If we're allocating off the beginning then take the fast path 86 | // and just adjust the existing slice. This will use extra memory 87 | // temporarily but the append() in free() will realloc the slice 88 | // as necessary. 89 | if (i + 1) == n { 90 | f.ids = f.ids[i+1:] 91 | } else { 92 | copy(f.ids[i-n+1:], f.ids[i+1:]) 93 | f.ids = f.ids[:len(f.ids)-n] 94 | } 95 | 96 | // Remove from the free cache. 97 | for i := pgid(0); i < pgid(n); i++ { 98 | delete(f.cache, initial+i) 99 | } 100 | 101 | return initial 102 | } 103 | 104 | previd = id 105 | } 106 | return 0 107 | } 108 | 109 | // free releases a page and its overflow for a given transaction id. 110 | // If the page is already free then a panic will occur. 111 | func (f *freelist) free(txid txid, p *page) { 112 | if p.id <= 1 { 113 | panic(fmt.Sprintf("cannot free page 0 or 1: %d", p.id)) 114 | } 115 | 116 | // Free page and all its overflow pages. 117 | var ids = f.pending[txid] 118 | for id := p.id; id <= p.id+pgid(p.overflow); id++ { 119 | // Verify that page is not already free. 120 | if f.cache[id] { 121 | panic(fmt.Sprintf("page %d already freed", id)) 122 | } 123 | 124 | // Add to the freelist and cache. 125 | ids = append(ids, id) 126 | f.cache[id] = true 127 | } 128 | f.pending[txid] = ids 129 | } 130 | 131 | // release moves all page ids for a transaction id (or older) to the freelist. 132 | func (f *freelist) release(txid txid) { 133 | m := make(pgids, 0) 134 | for tid, ids := range f.pending { 135 | if tid <= txid { 136 | // Move transaction's pending pages to the available freelist. 137 | // Don't remove from the cache since the page is still free. 138 | m = append(m, ids...) 139 | delete(f.pending, tid) 140 | } 141 | } 142 | sort.Sort(m) 143 | f.ids = pgids(f.ids).merge(m) 144 | } 145 | 146 | // rollback removes the pages from a given pending tx. 147 | func (f *freelist) rollback(txid txid) { 148 | // Remove page ids from cache. 149 | for _, id := range f.pending[txid] { 150 | delete(f.cache, id) 151 | } 152 | 153 | // Remove pages from pending list. 154 | delete(f.pending, txid) 155 | } 156 | 157 | // freed returns whether a given page is in the free list. 158 | func (f *freelist) freed(pgid pgid) bool { 159 | return f.cache[pgid] 160 | } 161 | 162 | // read initializes the freelist from a freelist page. 163 | func (f *freelist) read(p *page) { 164 | // If the page.count is at the max uint16 value (64k) then it's considered 165 | // an overflow and the size of the freelist is stored as the first element. 166 | idx, count := 0, int(p.count) 167 | if count == 0xFFFF { 168 | idx = 1 169 | count = int(((*[maxAllocSize]pgid)(unsafe.Pointer(&p.ptr)))[0]) 170 | } 171 | 172 | // Copy the list of page ids from the freelist. 173 | if count == 0 { 174 | f.ids = nil 175 | } else { 176 | ids := ((*[maxAllocSize]pgid)(unsafe.Pointer(&p.ptr)))[idx : idx+count] // idx+count, not count: the overflow case must keep its final id 177 | f.ids = make([]pgid, len(ids)) 178 | copy(f.ids, ids) 179 | 180 | // Make sure they're sorted. 181 | sort.Sort(pgids(f.ids)) 182 | } 183 | 184 | // Rebuild the page cache.
185 | f.reindex() 186 | } 187 | 188 | // write writes the page ids onto a freelist page. All free and pending ids are 189 | // saved to disk since in the event of a program crash, all pending ids will 190 | // become free. 191 | func (f *freelist) write(p *page) error { 192 | // Combine the old free pgids and pgids waiting on an open transaction. 193 | 194 | // Update the header flag. 195 | p.flags |= freelistPageFlag 196 | 197 | // The page.count can only hold up to 64k elements so if we overflow that 198 | // number then we handle it by putting the size in the first element. 199 | lenids := f.count() 200 | if lenids == 0 { 201 | p.count = uint16(lenids) 202 | } else if lenids < 0xFFFF { 203 | p.count = uint16(lenids) 204 | f.copyall(((*[maxAllocSize]pgid)(unsafe.Pointer(&p.ptr)))[:]) 205 | } else { 206 | p.count = 0xFFFF 207 | ((*[maxAllocSize]pgid)(unsafe.Pointer(&p.ptr)))[0] = pgid(lenids) 208 | f.copyall(((*[maxAllocSize]pgid)(unsafe.Pointer(&p.ptr)))[1:]) 209 | } 210 | 211 | return nil 212 | } 213 | 214 | // reload reads the freelist from a page and filters out pending items. 215 | func (f *freelist) reload(p *page) { 216 | f.read(p) 217 | 218 | // Build a cache of only pending pages. 219 | pcache := make(map[pgid]bool) 220 | for _, pendingIDs := range f.pending { 221 | for _, pendingID := range pendingIDs { 222 | pcache[pendingID] = true 223 | } 224 | } 225 | 226 | // Check each page in the freelist and build a new available freelist 227 | // with any pages not in the pending lists. 228 | var a []pgid 229 | for _, id := range f.ids { 230 | if !pcache[id] { 231 | a = append(a, id) 232 | } 233 | } 234 | f.ids = a 235 | 236 | // Once the available list is rebuilt then rebuild the free cache so that 237 | // it includes the available and pending free pages. 238 | f.reindex() 239 | } 240 | 241 | // reindex rebuilds the free cache based on available and pending free lists. 242 | func (f *freelist) reindex() { 243 | f.cache = make(map[pgid]bool, len(f.ids)) 244 | for _, id := range f.ids { 245 | f.cache[id] = true 246 | } 247 | for _, pendingIDs := range f.pending { 248 | for _, pendingID := range pendingIDs { 249 | f.cache[pendingID] = true 250 | } 251 | } 252 | } 253 | -------------------------------------------------------------------------------- /freelist_test.go: -------------------------------------------------------------------------------- 1 | package bolt 2 | 3 | import ( 4 | "math/rand" 5 | "reflect" 6 | "sort" 7 | "testing" 8 | "unsafe" 9 | ) 10 | 11 | // Ensure that a page is added to a transaction's freelist. 12 | func TestFreelist_free(t *testing.T) { 13 | f := newFreelist() 14 | f.free(100, &page{id: 12}) 15 | if !reflect.DeepEqual([]pgid{12}, f.pending[100]) { 16 | t.Fatalf("exp=%v; got=%v", []pgid{12}, f.pending[100]) 17 | } 18 | } 19 | 20 | // Ensure that a page and its overflow is added to a transaction's freelist. 21 | func TestFreelist_free_overflow(t *testing.T) { 22 | f := newFreelist() 23 | f.free(100, &page{id: 12, overflow: 3}) 24 | if exp := []pgid{12, 13, 14, 15}; !reflect.DeepEqual(exp, f.pending[100]) { 25 | t.Fatalf("exp=%v; got=%v", exp, f.pending[100]) 26 | } 27 | } 28 | 29 | // Ensure that a transaction's free pages can be released. 
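// Concretely: pages 9, 12, and 13 are freed under txid 100 and become
// available once release(100) runs, while page 39, freed under txid 102,
// stays pending until release(102).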
30 | func TestFreelist_release(t *testing.T) { 31 | f := newFreelist() 32 | f.free(100, &page{id: 12, overflow: 1}) 33 | f.free(100, &page{id: 9}) 34 | f.free(102, &page{id: 39}) 35 | f.release(100) 36 | f.release(101) 37 | if exp := []pgid{9, 12, 13}; !reflect.DeepEqual(exp, f.ids) { 38 | t.Fatalf("exp=%v; got=%v", exp, f.ids) 39 | } 40 | 41 | f.release(102) 42 | if exp := []pgid{9, 12, 13, 39}; !reflect.DeepEqual(exp, f.ids) { 43 | t.Fatalf("exp=%v; got=%v", exp, f.ids) 44 | } 45 | } 46 | 47 | // Ensure that a freelist can find contiguous blocks of pages. 48 | func TestFreelist_allocate(t *testing.T) { 49 | f := &freelist{ids: []pgid{3, 4, 5, 6, 7, 9, 12, 13, 18}} 50 | if id := int(f.allocate(3)); id != 3 { 51 | t.Fatalf("exp=3; got=%v", id) 52 | } 53 | if id := int(f.allocate(1)); id != 6 { 54 | t.Fatalf("exp=6; got=%v", id) 55 | } 56 | if id := int(f.allocate(3)); id != 0 { 57 | t.Fatalf("exp=0; got=%v", id) 58 | } 59 | if id := int(f.allocate(2)); id != 12 { 60 | t.Fatalf("exp=12; got=%v", id) 61 | } 62 | if id := int(f.allocate(1)); id != 7 { 63 | t.Fatalf("exp=7; got=%v", id) 64 | } 65 | if id := int(f.allocate(0)); id != 0 { 66 | t.Fatalf("exp=0; got=%v", id) 67 | } 68 | if id := int(f.allocate(0)); id != 0 { 69 | t.Fatalf("exp=0; got=%v", id) 70 | } 71 | if exp := []pgid{9, 18}; !reflect.DeepEqual(exp, f.ids) { 72 | t.Fatalf("exp=%v; got=%v", exp, f.ids) 73 | } 74 | 75 | if id := int(f.allocate(1)); id != 9 { 76 | t.Fatalf("exp=9; got=%v", id) 77 | } 78 | if id := int(f.allocate(1)); id != 18 { 79 | t.Fatalf("exp=18; got=%v", id) 80 | } 81 | if id := int(f.allocate(1)); id != 0 { 82 | t.Fatalf("exp=0; got=%v", id) 83 | } 84 | if exp := []pgid{}; !reflect.DeepEqual(exp, f.ids) { 85 | t.Fatalf("exp=%v; got=%v", exp, f.ids) 86 | } 87 | } 88 | 89 | // Ensure that a freelist can deserialize from a freelist page. 90 | func TestFreelist_read(t *testing.T) { 91 | // Create a page. 92 | var buf [4096]byte 93 | page := (*page)(unsafe.Pointer(&buf[0])) 94 | page.flags = freelistPageFlag 95 | page.count = 2 96 | 97 | // Insert 2 page ids. 98 | ids := (*[3]pgid)(unsafe.Pointer(&page.ptr)) 99 | ids[0] = 23 100 | ids[1] = 50 101 | 102 | // Deserialize page into a freelist. 103 | f := newFreelist() 104 | f.read(page) 105 | 106 | // Ensure that there are two page ids in the freelist. 107 | if exp := []pgid{23, 50}; !reflect.DeepEqual(exp, f.ids) { 108 | t.Fatalf("exp=%v; got=%v", exp, f.ids) 109 | } 110 | } 111 | 112 | // Ensure that a freelist can serialize into a freelist page. 113 | func TestFreelist_write(t *testing.T) { 114 | // Create a freelist and write it to a page. 115 | var buf [4096]byte 116 | f := &freelist{ids: []pgid{12, 39}, pending: make(map[txid][]pgid)} 117 | f.pending[100] = []pgid{28, 11} 118 | f.pending[101] = []pgid{3} 119 | p := (*page)(unsafe.Pointer(&buf[0])) 120 | if err := f.write(p); err != nil { 121 | t.Fatal(err) 122 | } 123 | 124 | // Read the page back out. 125 | f2 := newFreelist() 126 | f2.read(p) 127 | 128 | // Ensure that the freelist is correct. 129 | // All pages should be present and in sorted order.
130 | if exp := []pgid{3, 11, 12, 28, 39}; !reflect.DeepEqual(exp, f2.ids) { 131 | t.Fatalf("exp=%v; got=%v", exp, f2.ids) 132 | } 133 | } 134 | 135 | func Benchmark_FreelistRelease10K(b *testing.B) { benchmark_FreelistRelease(b, 10000) } 136 | func Benchmark_FreelistRelease100K(b *testing.B) { benchmark_FreelistRelease(b, 100000) } 137 | func Benchmark_FreelistRelease1000K(b *testing.B) { benchmark_FreelistRelease(b, 1000000) } 138 | func Benchmark_FreelistRelease10000K(b *testing.B) { benchmark_FreelistRelease(b, 10000000) } 139 | 140 | func benchmark_FreelistRelease(b *testing.B, size int) { 141 | ids := randomPgids(size) 142 | pending := randomPgids(len(ids) / 400) 143 | b.ResetTimer() 144 | for i := 0; i < b.N; i++ { 145 | f := &freelist{ids: ids, pending: map[txid][]pgid{1: pending}} 146 | f.release(1) 147 | } 148 | } 149 | 150 | func randomPgids(n int) []pgid { 151 | rand.Seed(42) 152 | pgids := make(pgids, n) 153 | for i := range pgids { 154 | pgids[i] = pgid(rand.Int63()) 155 | } 156 | sort.Sort(pgids) 157 | return pgids 158 | } 159 | -------------------------------------------------------------------------------- /node.go: -------------------------------------------------------------------------------- 1 | package bolt 2 | 3 | import ( 4 | "bytes" 5 | "fmt" 6 | "sort" 7 | "unsafe" 8 | ) 9 | 10 | // node represents an in-memory, deserialized page. 11 | type node struct { 12 | bucket *Bucket 13 | isLeaf bool 14 | unbalanced bool 15 | spilled bool 16 | key []byte 17 | pgid pgid 18 | parent *node 19 | children nodes 20 | inodes inodes 21 | } 22 | 23 | // root returns the top-level node this node is attached to. 24 | func (n *node) root() *node { 25 | if n.parent == nil { 26 | return n 27 | } 28 | return n.parent.root() 29 | } 30 | 31 | // minKeys returns the minimum number of inodes this node should have. 32 | func (n *node) minKeys() int { 33 | if n.isLeaf { 34 | return 1 35 | } 36 | return 2 37 | } 38 | 39 | // size returns the size of the node after serialization. 40 | func (n *node) size() int { 41 | sz, elsz := pageHeaderSize, n.pageElementSize() 42 | for i := 0; i < len(n.inodes); i++ { 43 | item := &n.inodes[i] 44 | sz += elsz + len(item.key) + len(item.value) 45 | } 46 | return sz 47 | } 48 | 49 | // sizeLessThan returns true if the node is less than a given size. 50 | // This is an optimization to avoid calculating a large node when we only need 51 | // to know if it fits inside a certain page size. 52 | func (n *node) sizeLessThan(v int) bool { 53 | sz, elsz := pageHeaderSize, n.pageElementSize() 54 | for i := 0; i < len(n.inodes); i++ { 55 | item := &n.inodes[i] 56 | sz += elsz + len(item.key) + len(item.value) 57 | if sz >= v { 58 | return false 59 | } 60 | } 61 | return true 62 | } 63 | 64 | // pageElementSize returns the size of each page element based on the type of node. 65 | func (n *node) pageElementSize() int { 66 | if n.isLeaf { 67 | return leafPageElementSize 68 | } 69 | return branchPageElementSize 70 | } 71 | 72 | // childAt returns the child node at a given index. 73 | func (n *node) childAt(index int) *node { 74 | if n.isLeaf { 75 | panic(fmt.Sprintf("invalid childAt(%d) on a leaf node", index)) 76 | } 77 | return n.bucket.node(n.inodes[index].pgid, n) 78 | } 79 | 80 | // childIndex returns the index of a given child node. 
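// Because inodes are kept sorted by key, the lookup below is a binary search
// (sort.Search) rather than a linear scan. For example (sketch), with child
// keys ["a" "c" "e"], a child whose key is "c" yields index 1.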
81 | func (n *node) childIndex(child *node) int { 82 | index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].key, child.key) != -1 }) 83 | return index 84 | } 85 | 86 | // numChildren returns the number of children. 87 | func (n *node) numChildren() int { 88 | return len(n.inodes) 89 | } 90 | 91 | // nextSibling returns the next node with the same parent. 92 | func (n *node) nextSibling() *node { 93 | if n.parent == nil { 94 | return nil 95 | } 96 | index := n.parent.childIndex(n) 97 | if index >= n.parent.numChildren()-1 { 98 | return nil 99 | } 100 | return n.parent.childAt(index + 1) 101 | } 102 | 103 | // prevSibling returns the previous node with the same parent. 104 | func (n *node) prevSibling() *node { 105 | if n.parent == nil { 106 | return nil 107 | } 108 | index := n.parent.childIndex(n) 109 | if index == 0 { 110 | return nil 111 | } 112 | return n.parent.childAt(index - 1) 113 | } 114 | 115 | // put inserts a key/value. 116 | func (n *node) put(oldKey, newKey, value []byte, pgid pgid, flags uint32) { 117 | if pgid >= n.bucket.tx.meta.pgid { 118 | panic(fmt.Sprintf("pgid (%d) above high water mark (%d)", pgid, n.bucket.tx.meta.pgid)) 119 | } else if len(oldKey) <= 0 { 120 | panic("put: zero-length old key") 121 | } else if len(newKey) <= 0 { 122 | panic("put: zero-length new key") 123 | } 124 | 125 | // Find insertion index. 126 | index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].key, oldKey) != -1 }) 127 | 128 | // Add capacity and shift nodes if we don't have an exact match and need to insert. 129 | exact := (len(n.inodes) > 0 && index < len(n.inodes) && bytes.Equal(n.inodes[index].key, oldKey)) 130 | if !exact { 131 | n.inodes = append(n.inodes, inode{}) 132 | copy(n.inodes[index+1:], n.inodes[index:]) 133 | } 134 | 135 | inode := &n.inodes[index] 136 | inode.flags = flags 137 | inode.key = newKey 138 | inode.value = value 139 | inode.pgid = pgid 140 | _assert(len(inode.key) > 0, "put: zero-length inode key") 141 | } 142 | 143 | // del removes a key from the node. 144 | func (n *node) del(key []byte) { 145 | // Find index of key. 146 | index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].key, key) != -1 }) 147 | 148 | // Exit if the key isn't found. 149 | if index >= len(n.inodes) || !bytes.Equal(n.inodes[index].key, key) { 150 | return 151 | } 152 | 153 | // Delete inode from the node. 154 | n.inodes = append(n.inodes[:index], n.inodes[index+1:]...) 155 | 156 | // Mark the node as needing rebalancing. 157 | n.unbalanced = true 158 | } 159 | 160 | // read initializes the node from a page. 161 | func (n *node) read(p *page) { 162 | n.pgid = p.id 163 | n.isLeaf = ((p.flags & leafPageFlag) != 0) 164 | n.inodes = make(inodes, int(p.count)) 165 | 166 | for i := 0; i < int(p.count); i++ { 167 | inode := &n.inodes[i] 168 | if n.isLeaf { 169 | elem := p.leafPageElement(uint16(i)) 170 | inode.flags = elem.flags 171 | inode.key = elem.key() 172 | inode.value = elem.value() 173 | } else { 174 | elem := p.branchPageElement(uint16(i)) 175 | inode.pgid = elem.pgid 176 | inode.key = elem.key() 177 | } 178 | _assert(len(inode.key) > 0, "read: zero-length inode key") 179 | } 180 | 181 | // Save first key so we can find the node in the parent when we spill. 182 | if len(n.inodes) > 0 { 183 | n.key = n.inodes[0].key 184 | _assert(len(n.key) > 0, "read: zero-length node key") 185 | } else { 186 | n.key = nil 187 | } 188 | } 189 | 190 | // write writes the items onto one or more pages. 
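// The layout written below is (sketch):
//
//	[ page header | element 0 | element 1 | ... | key0 value0 key1 value1 ... ]
//
// Each fixed-size element records pos, the offset from that element to its
// key, so keys and values can be located without a separate per-page index.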
191 | func (n *node) write(p *page) { 192 | // Initialize page. 193 | if n.isLeaf { 194 | p.flags |= leafPageFlag 195 | } else { 196 | p.flags |= branchPageFlag 197 | } 198 | 199 | if len(n.inodes) >= 0xFFFF { 200 | panic(fmt.Sprintf("inode overflow: %d (pgid=%d)", len(n.inodes), p.id)) 201 | } 202 | p.count = uint16(len(n.inodes)) 203 | 204 | // Stop here if there are no items to write. 205 | if p.count == 0 { 206 | return 207 | } 208 | 209 | // Loop over each item and write it to the page. 210 | b := (*[maxAllocSize]byte)(unsafe.Pointer(&p.ptr))[n.pageElementSize()*len(n.inodes):] 211 | for i, item := range n.inodes { 212 | _assert(len(item.key) > 0, "write: zero-length inode key") 213 | 214 | // Write the page element. 215 | if n.isLeaf { 216 | elem := p.leafPageElement(uint16(i)) 217 | elem.pos = uint32(uintptr(unsafe.Pointer(&b[0])) - uintptr(unsafe.Pointer(elem))) 218 | elem.flags = item.flags 219 | elem.ksize = uint32(len(item.key)) 220 | elem.vsize = uint32(len(item.value)) 221 | } else { 222 | elem := p.branchPageElement(uint16(i)) 223 | elem.pos = uint32(uintptr(unsafe.Pointer(&b[0])) - uintptr(unsafe.Pointer(elem))) 224 | elem.ksize = uint32(len(item.key)) 225 | elem.pgid = item.pgid 226 | _assert(elem.pgid != p.id, "write: circular dependency occurred") 227 | } 228 | 229 | // If the length of key+value is larger than the max allocation size 230 | // then we need to reallocate the byte array pointer. 231 | // 232 | // See: https://github.com/boltdb/bolt/pull/335 233 | klen, vlen := len(item.key), len(item.value) 234 | if len(b) < klen+vlen { 235 | b = (*[maxAllocSize]byte)(unsafe.Pointer(&b[0]))[:] 236 | } 237 | 238 | // Write data for the element to the end of the page. 239 | copy(b[0:], item.key) 240 | b = b[klen:] 241 | copy(b[0:], item.value) 242 | b = b[vlen:] 243 | } 244 | 245 | // DEBUG ONLY: n.dump() 246 | } 247 | 248 | // split breaks up a node into multiple smaller nodes, if appropriate. 249 | // This should only be called from the spill() function. 250 | func (n *node) split(pageSize int) []*node { 251 | var nodes []*node 252 | 253 | node := n 254 | for { 255 | // Split node into two. 256 | a, b := node.splitTwo(pageSize) 257 | nodes = append(nodes, a) 258 | 259 | // If we can't split then exit the loop. 260 | if b == nil { 261 | break 262 | } 263 | 264 | // Set node to b so it gets split on the next iteration. 265 | node = b 266 | } 267 | 268 | return nodes 269 | } 270 | 271 | // splitTwo breaks up a node into two smaller nodes, if appropriate. 272 | // This should only be called from the split() function. 273 | func (n *node) splitTwo(pageSize int) (*node, *node) { 274 | // Ignore the split if the page doesn't have at least enough nodes for 275 | // two pages or if the nodes can fit in a single page. 276 | if len(n.inodes) <= (minKeysPerPage*2) || n.sizeLessThan(pageSize) { 277 | return n, nil 278 | } 279 | 280 | // Determine the threshold before starting a new node. 281 | var fillPercent = n.bucket.FillPercent 282 | if fillPercent < minFillPercent { 283 | fillPercent = minFillPercent 284 | } else if fillPercent > maxFillPercent { 285 | fillPercent = maxFillPercent 286 | } 287 | threshold := int(float64(pageSize) * fillPercent) 288 | 289 | // Determine split position and sizes of the two pages. 290 | splitIndex, _ := n.splitIndex(threshold) 291 | 292 | // Split node into two separate nodes. 293 | // If there's no parent then we'll need to create one. 
294 | if n.parent == nil { 295 | n.parent = &node{bucket: n.bucket, children: []*node{n}} 296 | } 297 | 298 | // Create a new node and add it to the parent. 299 | next := &node{bucket: n.bucket, isLeaf: n.isLeaf, parent: n.parent} 300 | n.parent.children = append(n.parent.children, next) 301 | 302 | // Split inodes across two nodes. 303 | next.inodes = n.inodes[splitIndex:] 304 | n.inodes = n.inodes[:splitIndex] 305 | 306 | // Update the statistics. 307 | n.bucket.tx.stats.Split++ 308 | 309 | return n, next 310 | } 311 | 312 | // splitIndex finds the position where a page will fill a given threshold. 313 | // It returns the index as well as the size of the first page. 314 | // This should only be called from split(). 315 | func (n *node) splitIndex(threshold int) (index, sz int) { 316 | sz = pageHeaderSize 317 | 318 | // Loop until we only have the minimum number of keys required for the second page. 319 | for i := 0; i < len(n.inodes)-minKeysPerPage; i++ { 320 | index = i 321 | inode := n.inodes[i] 322 | elsize := n.pageElementSize() + len(inode.key) + len(inode.value) 323 | 324 | // If we have at least the minimum number of keys and adding another 325 | // node would put us over the threshold then exit and return. 326 | if i >= minKeysPerPage && sz+elsize > threshold { 327 | break 328 | } 329 | 330 | // Add the element size to the total size. 331 | sz += elsize 332 | } 333 | 334 | return 335 | } 336 | 337 | // spill writes the nodes to dirty pages and splits nodes as it goes. 338 | // Returns an error if dirty pages cannot be allocated. 339 | func (n *node) spill() error { 340 | var tx = n.bucket.tx 341 | if n.spilled { 342 | return nil 343 | } 344 | 345 | // Spill child nodes first. Child nodes can materialize sibling nodes in 346 | // the case of split-merge so we cannot use a range loop. We have to check 347 | // the children size on every loop iteration. 348 | sort.Sort(n.children) 349 | for i := 0; i < len(n.children); i++ { 350 | if err := n.children[i].spill(); err != nil { 351 | return err 352 | } 353 | } 354 | 355 | // We no longer need the child list because it's only used for spill tracking. 356 | n.children = nil 357 | 358 | // Split nodes into appropriate sizes. The first node will always be n. 359 | var nodes = n.split(tx.db.pageSize) 360 | for _, node := range nodes { 361 | // Add node's page to the freelist if it's not new. 362 | if node.pgid > 0 { 363 | tx.db.freelist.free(tx.meta.txid, tx.page(node.pgid)) 364 | node.pgid = 0 365 | } 366 | 367 | // Allocate contiguous space for the node. 368 | p, err := tx.allocate((node.size() / tx.db.pageSize) + 1) 369 | if err != nil { 370 | return err 371 | } 372 | 373 | // Write the node. 374 | if p.id >= tx.meta.pgid { 375 | panic(fmt.Sprintf("pgid (%d) above high water mark (%d)", p.id, tx.meta.pgid)) 376 | } 377 | node.pgid = p.id 378 | node.write(p) 379 | node.spilled = true 380 | 381 | // Insert into parent inodes. 382 | if node.parent != nil { 383 | var key = node.key 384 | if key == nil { 385 | key = node.inodes[0].key 386 | } 387 | 388 | node.parent.put(key, node.inodes[0].key, nil, node.pgid, 0) 389 | node.key = node.inodes[0].key 390 | _assert(len(node.key) > 0, "spill: zero-length node key") 391 | } 392 | 393 | // Update the statistics. 394 | tx.stats.Spill++ 395 | } 396 | 397 | // If the root node split and created a new root then we need to spill that 398 | // as well. We'll clear out the children to make sure it doesn't try to respill.
399 | if n.parent != nil && n.parent.pgid == 0 { 400 | n.children = nil 401 | return n.parent.spill() 402 | } 403 | 404 | return nil 405 | } 406 | 407 | // rebalance attempts to combine the node with sibling nodes if the node fill 408 | // size is below a threshold or if there are not enough keys. 409 | func (n *node) rebalance() { 410 | if !n.unbalanced { 411 | return 412 | } 413 | n.unbalanced = false 414 | 415 | // Update statistics. 416 | n.bucket.tx.stats.Rebalance++ 417 | 418 | // Ignore if node is above threshold (25%) and has enough keys. 419 | var threshold = n.bucket.tx.db.pageSize / 4 420 | if n.size() > threshold && len(n.inodes) > n.minKeys() { 421 | return 422 | } 423 | 424 | // Root node has special handling. 425 | if n.parent == nil { 426 | // If root node is a branch and only has one node then collapse it. 427 | if !n.isLeaf && len(n.inodes) == 1 { 428 | // Move root's child up. 429 | child := n.bucket.node(n.inodes[0].pgid, n) 430 | n.isLeaf = child.isLeaf 431 | n.inodes = child.inodes[:] 432 | n.children = child.children 433 | 434 | // Reparent all child nodes being moved. 435 | for _, inode := range n.inodes { 436 | if child, ok := n.bucket.nodes[inode.pgid]; ok { 437 | child.parent = n 438 | } 439 | } 440 | 441 | // Remove old child. 442 | child.parent = nil 443 | delete(n.bucket.nodes, child.pgid) 444 | child.free() 445 | } 446 | 447 | return 448 | } 449 | 450 | // If node has no keys then just remove it. 451 | if n.numChildren() == 0 { 452 | n.parent.del(n.key) 453 | n.parent.removeChild(n) 454 | delete(n.bucket.nodes, n.pgid) 455 | n.free() 456 | n.parent.rebalance() 457 | return 458 | } 459 | 460 | _assert(n.parent.numChildren() > 1, "parent must have at least 2 children") 461 | 462 | // Destination node is right sibling if idx == 0, otherwise left sibling. 463 | var target *node 464 | var useNextSibling = (n.parent.childIndex(n) == 0) 465 | if useNextSibling { 466 | target = n.nextSibling() 467 | } else { 468 | target = n.prevSibling() 469 | } 470 | 471 | // If both this node and the target node are too small then merge them. 472 | if useNextSibling { 473 | // Reparent all child nodes being moved. 474 | for _, inode := range target.inodes { 475 | if child, ok := n.bucket.nodes[inode.pgid]; ok { 476 | child.parent.removeChild(child) 477 | child.parent = n 478 | child.parent.children = append(child.parent.children, child) 479 | } 480 | } 481 | 482 | // Copy over inodes from target and remove target. 483 | n.inodes = append(n.inodes, target.inodes...) 484 | n.parent.del(target.key) 485 | n.parent.removeChild(target) 486 | delete(n.bucket.nodes, target.pgid) 487 | target.free() 488 | } else { 489 | // Reparent all child nodes being moved. 490 | for _, inode := range n.inodes { 491 | if child, ok := n.bucket.nodes[inode.pgid]; ok { 492 | child.parent.removeChild(child) 493 | child.parent = target 494 | child.parent.children = append(child.parent.children, child) 495 | } 496 | } 497 | 498 | // Copy over inodes to target and remove node. 499 | target.inodes = append(target.inodes, n.inodes...) 500 | n.parent.del(n.key) 501 | n.parent.removeChild(n) 502 | delete(n.bucket.nodes, n.pgid) 503 | n.free() 504 | } 505 | 506 | // Either this node or the target node was deleted from the parent so rebalance it. 507 | n.parent.rebalance() 508 | } 509 | 510 | // removes a node from the list of in-memory children. 511 | // This does not affect the inodes. 
512 | func (n *node) removeChild(target *node) { 513 | for i, child := range n.children { 514 | if child == target { 515 | n.children = append(n.children[:i], n.children[i+1:]...) 516 | return 517 | } 518 | } 519 | } 520 | 521 | // dereference causes the node to copy all its inode key/value references to heap memory. 522 | // This is required when the mmap is reallocated so inodes are not pointing to stale data. 523 | func (n *node) dereference() { 524 | if n.key != nil { 525 | key := make([]byte, len(n.key)) 526 | copy(key, n.key) 527 | n.key = key 528 | _assert(n.pgid == 0 || len(n.key) > 0, "dereference: zero-length node key on existing node") 529 | } 530 | 531 | for i := range n.inodes { 532 | inode := &n.inodes[i] 533 | 534 | key := make([]byte, len(inode.key)) 535 | copy(key, inode.key) 536 | inode.key = key 537 | _assert(len(inode.key) > 0, "dereference: zero-length inode key") 538 | 539 | value := make([]byte, len(inode.value)) 540 | copy(value, inode.value) 541 | inode.value = value 542 | } 543 | 544 | // Recursively dereference children. 545 | for _, child := range n.children { 546 | child.dereference() 547 | } 548 | 549 | // Update statistics. 550 | n.bucket.tx.stats.NodeDeref++ 551 | } 552 | 553 | // free adds the node's underlying page to the freelist. 554 | func (n *node) free() { 555 | if n.pgid != 0 { 556 | n.bucket.tx.db.freelist.free(n.bucket.tx.meta.txid, n.bucket.tx.page(n.pgid)) 557 | n.pgid = 0 558 | } 559 | } 560 | 561 | // dump writes the contents of the node to STDERR for debugging purposes. 562 | /* 563 | func (n *node) dump() { 564 | // Write node header. 565 | var typ = "branch" 566 | if n.isLeaf { 567 | typ = "leaf" 568 | } 569 | warnf("[NODE %d {type=%s count=%d}]", n.pgid, typ, len(n.inodes)) 570 | 571 | // Write out abbreviated version of each item. 572 | for _, item := range n.inodes { 573 | if n.isLeaf { 574 | if item.flags&bucketLeafFlag != 0 { 575 | bucket := (*bucket)(unsafe.Pointer(&item.value[0])) 576 | warnf("+L %08x -> (bucket root=%d)", trunc(item.key, 4), bucket.root) 577 | } else { 578 | warnf("+L %08x -> %08x", trunc(item.key, 4), trunc(item.value, 4)) 579 | } 580 | } else { 581 | warnf("+B %08x -> pgid=%d", trunc(item.key, 4), item.pgid) 582 | } 583 | } 584 | warn("") 585 | } 586 | */ 587 | 588 | type nodes []*node 589 | 590 | func (s nodes) Len() int { return len(s) } 591 | func (s nodes) Swap(i, j int) { s[i], s[j] = s[j], s[i] } 592 | func (s nodes) Less(i, j int) bool { return bytes.Compare(s[i].inodes[0].key, s[j].inodes[0].key) == -1 } 593 | 594 | // inode represents an internal node inside of a node. 595 | // It can be used to point to elements in a page or point 596 | // to an element which hasn't been added to a page yet. 597 | type inode struct { 598 | flags uint32 599 | pgid pgid 600 | key []byte 601 | value []byte 602 | } 603 | 604 | type inodes []inode 605 | -------------------------------------------------------------------------------- /node_test.go: -------------------------------------------------------------------------------- 1 | package bolt 2 | 3 | import ( 4 | "testing" 5 | "unsafe" 6 | ) 7 | 8 | // Ensure that a node can insert a key/value. 
9 | func TestNode_put(t *testing.T) { 10 | n := &node{inodes: make(inodes, 0), bucket: &Bucket{tx: &Tx{meta: &meta{pgid: 1}}}} 11 | n.put([]byte("baz"), []byte("baz"), []byte("2"), 0, 0) 12 | n.put([]byte("foo"), []byte("foo"), []byte("0"), 0, 0) 13 | n.put([]byte("bar"), []byte("bar"), []byte("1"), 0, 0) 14 | n.put([]byte("foo"), []byte("foo"), []byte("3"), 0, leafPageFlag) 15 | 16 | if len(n.inodes) != 3 { 17 | t.Fatalf("exp=3; got=%d", len(n.inodes)) 18 | } 19 | if k, v := n.inodes[0].key, n.inodes[0].value; string(k) != "bar" || string(v) != "1" { 20 | t.Fatalf("exp=; got=<%s,%s>", k, v) 21 | } 22 | if k, v := n.inodes[1].key, n.inodes[1].value; string(k) != "baz" || string(v) != "2" { 23 | t.Fatalf("exp=; got=<%s,%s>", k, v) 24 | } 25 | if k, v := n.inodes[2].key, n.inodes[2].value; string(k) != "foo" || string(v) != "3" { 26 | t.Fatalf("exp=; got=<%s,%s>", k, v) 27 | } 28 | if n.inodes[2].flags != uint32(leafPageFlag) { 29 | t.Fatalf("not a leaf: %d", n.inodes[2].flags) 30 | } 31 | } 32 | 33 | // Ensure that a node can deserialize from a leaf page. 34 | func TestNode_read_LeafPage(t *testing.T) { 35 | // Create a page. 36 | var buf [4096]byte 37 | page := (*page)(unsafe.Pointer(&buf[0])) 38 | page.flags = leafPageFlag 39 | page.count = 2 40 | 41 | // Insert 2 elements at the beginning. sizeof(leafPageElement) == 16 42 | nodes := (*[3]leafPageElement)(unsafe.Pointer(&page.ptr)) 43 | nodes[0] = leafPageElement{flags: 0, pos: 32, ksize: 3, vsize: 4} // pos = sizeof(leafPageElement) * 2 44 | nodes[1] = leafPageElement{flags: 0, pos: 23, ksize: 10, vsize: 3} // pos = sizeof(leafPageElement) + 3 + 4 45 | 46 | // Write data for the nodes at the end. 47 | data := (*[4096]byte)(unsafe.Pointer(&nodes[2])) 48 | copy(data[:], []byte("barfooz")) 49 | copy(data[7:], []byte("helloworldbye")) 50 | 51 | // Deserialize page into a leaf. 52 | n := &node{} 53 | n.read(page) 54 | 55 | // Check that there are two inodes with correct data. 56 | if !n.isLeaf { 57 | t.Fatal("expected leaf") 58 | } 59 | if len(n.inodes) != 2 { 60 | t.Fatalf("exp=2; got=%d", len(n.inodes)) 61 | } 62 | if k, v := n.inodes[0].key, n.inodes[0].value; string(k) != "bar" || string(v) != "fooz" { 63 | t.Fatalf("exp=; got=<%s,%s>", k, v) 64 | } 65 | if k, v := n.inodes[1].key, n.inodes[1].value; string(k) != "helloworld" || string(v) != "bye" { 66 | t.Fatalf("exp=; got=<%s,%s>", k, v) 67 | } 68 | } 69 | 70 | // Ensure that a node can serialize into a leaf page. 71 | func TestNode_write_LeafPage(t *testing.T) { 72 | // Create a node. 73 | n := &node{isLeaf: true, inodes: make(inodes, 0), bucket: &Bucket{tx: &Tx{db: &DB{}, meta: &meta{pgid: 1}}}} 74 | n.put([]byte("susy"), []byte("susy"), []byte("que"), 0, 0) 75 | n.put([]byte("ricki"), []byte("ricki"), []byte("lake"), 0, 0) 76 | n.put([]byte("john"), []byte("john"), []byte("johnson"), 0, 0) 77 | 78 | // Write it to a page. 79 | var buf [4096]byte 80 | p := (*page)(unsafe.Pointer(&buf[0])) 81 | n.write(p) 82 | 83 | // Read the page back in. 84 | n2 := &node{} 85 | n2.read(p) 86 | 87 | // Check that the two pages are the same. 
88 | if len(n2.inodes) != 3 { 89 | t.Fatalf("exp=3; got=%d", len(n2.inodes)) 90 | } 91 | if k, v := n2.inodes[0].key, n2.inodes[0].value; string(k) != "john" || string(v) != "johnson" { 92 | t.Fatalf("exp=; got=<%s,%s>", k, v) 93 | } 94 | if k, v := n2.inodes[1].key, n2.inodes[1].value; string(k) != "ricki" || string(v) != "lake" { 95 | t.Fatalf("exp=; got=<%s,%s>", k, v) 96 | } 97 | if k, v := n2.inodes[2].key, n2.inodes[2].value; string(k) != "susy" || string(v) != "que" { 98 | t.Fatalf("exp=; got=<%s,%s>", k, v) 99 | } 100 | } 101 | 102 | // Ensure that a node can split into appropriate subgroups. 103 | func TestNode_split(t *testing.T) { 104 | // Create a node. 105 | n := &node{inodes: make(inodes, 0), bucket: &Bucket{tx: &Tx{db: &DB{}, meta: &meta{pgid: 1}}}} 106 | n.put([]byte("00000001"), []byte("00000001"), []byte("0123456701234567"), 0, 0) 107 | n.put([]byte("00000002"), []byte("00000002"), []byte("0123456701234567"), 0, 0) 108 | n.put([]byte("00000003"), []byte("00000003"), []byte("0123456701234567"), 0, 0) 109 | n.put([]byte("00000004"), []byte("00000004"), []byte("0123456701234567"), 0, 0) 110 | n.put([]byte("00000005"), []byte("00000005"), []byte("0123456701234567"), 0, 0) 111 | 112 | // Split between 2 & 3. 113 | n.split(100) 114 | 115 | var parent = n.parent 116 | if len(parent.children) != 2 { 117 | t.Fatalf("exp=2; got=%d", len(parent.children)) 118 | } 119 | if len(parent.children[0].inodes) != 2 { 120 | t.Fatalf("exp=2; got=%d", len(parent.children[0].inodes)) 121 | } 122 | if len(parent.children[1].inodes) != 3 { 123 | t.Fatalf("exp=3; got=%d", len(parent.children[1].inodes)) 124 | } 125 | } 126 | 127 | // Ensure that a page with the minimum number of inodes just returns a single node. 128 | func TestNode_split_MinKeys(t *testing.T) { 129 | // Create a node. 130 | n := &node{inodes: make(inodes, 0), bucket: &Bucket{tx: &Tx{db: &DB{}, meta: &meta{pgid: 1}}}} 131 | n.put([]byte("00000001"), []byte("00000001"), []byte("0123456701234567"), 0, 0) 132 | n.put([]byte("00000002"), []byte("00000002"), []byte("0123456701234567"), 0, 0) 133 | 134 | // Split. 135 | n.split(20) 136 | if n.parent != nil { 137 | t.Fatalf("expected nil parent") 138 | } 139 | } 140 | 141 | // Ensure that a node that has keys that all fit on a page just returns one leaf. 142 | func TestNode_split_SinglePage(t *testing.T) { 143 | // Create a node. 144 | n := &node{inodes: make(inodes, 0), bucket: &Bucket{tx: &Tx{db: &DB{}, meta: &meta{pgid: 1}}}} 145 | n.put([]byte("00000001"), []byte("00000001"), []byte("0123456701234567"), 0, 0) 146 | n.put([]byte("00000002"), []byte("00000002"), []byte("0123456701234567"), 0, 0) 147 | n.put([]byte("00000003"), []byte("00000003"), []byte("0123456701234567"), 0, 0) 148 | n.put([]byte("00000004"), []byte("00000004"), []byte("0123456701234567"), 0, 0) 149 | n.put([]byte("00000005"), []byte("00000005"), []byte("0123456701234567"), 0, 0) 150 | 151 | // Split. 
152 | n.split(4096) 153 | if n.parent != nil { 154 | t.Fatalf("expected nil parent") 155 | } 156 | } 157 | -------------------------------------------------------------------------------- /page.go: -------------------------------------------------------------------------------- 1 | package bolt 2 | 3 | import ( 4 | "fmt" 5 | "os" 6 | "sort" 7 | "unsafe" 8 | ) 9 | 10 | const pageHeaderSize = int(unsafe.Offsetof(((*page)(nil)).ptr)) 11 | 12 | const minKeysPerPage = 2 13 | 14 | const branchPageElementSize = int(unsafe.Sizeof(branchPageElement{})) 15 | const leafPageElementSize = int(unsafe.Sizeof(leafPageElement{})) 16 | 17 | const ( 18 | branchPageFlag = 0x01 19 | leafPageFlag = 0x02 20 | metaPageFlag = 0x04 21 | freelistPageFlag = 0x10 22 | ) 23 | 24 | const ( 25 | bucketLeafFlag = 0x01 26 | ) 27 | 28 | type pgid uint64 29 | 30 | type page struct { 31 | id pgid 32 | flags uint16 33 | count uint16 34 | overflow uint32 35 | ptr uintptr 36 | } 37 | 38 | // typ returns a human readable page type string used for debugging. 39 | func (p *page) typ() string { 40 | if (p.flags & branchPageFlag) != 0 { 41 | return "branch" 42 | } else if (p.flags & leafPageFlag) != 0 { 43 | return "leaf" 44 | } else if (p.flags & metaPageFlag) != 0 { 45 | return "meta" 46 | } else if (p.flags & freelistPageFlag) != 0 { 47 | return "freelist" 48 | } 49 | return fmt.Sprintf("unknown<%02x>", p.flags) 50 | } 51 | 52 | // meta returns a pointer to the metadata section of the page. 53 | func (p *page) meta() *meta { 54 | return (*meta)(unsafe.Pointer(&p.ptr)) 55 | } 56 | 57 | // leafPageElement retrieves the leaf node by index 58 | func (p *page) leafPageElement(index uint16) *leafPageElement { 59 | n := &((*[0x7FFFFFF]leafPageElement)(unsafe.Pointer(&p.ptr)))[index] 60 | return n 61 | } 62 | 63 | // leafPageElements retrieves a list of leaf nodes. 64 | func (p *page) leafPageElements() []leafPageElement { 65 | if p.count == 0 { 66 | return nil 67 | } 68 | return ((*[0x7FFFFFF]leafPageElement)(unsafe.Pointer(&p.ptr)))[:] 69 | } 70 | 71 | // branchPageElement retrieves the branch node by index 72 | func (p *page) branchPageElement(index uint16) *branchPageElement { 73 | return &((*[0x7FFFFFF]branchPageElement)(unsafe.Pointer(&p.ptr)))[index] 74 | } 75 | 76 | // branchPageElements retrieves a list of branch nodes. 77 | func (p *page) branchPageElements() []branchPageElement { 78 | if p.count == 0 { 79 | return nil 80 | } 81 | return ((*[0x7FFFFFF]branchPageElement)(unsafe.Pointer(&p.ptr)))[:] 82 | } 83 | 84 | // dump writes n bytes of the page to STDERR as hex output. 85 | func (p *page) hexdump(n int) { 86 | buf := (*[maxAllocSize]byte)(unsafe.Pointer(p))[:n] 87 | fmt.Fprintf(os.Stderr, "%x\n", buf) 88 | } 89 | 90 | type pages []*page 91 | 92 | func (s pages) Len() int { return len(s) } 93 | func (s pages) Swap(i, j int) { s[i], s[j] = s[j], s[i] } 94 | func (s pages) Less(i, j int) bool { return s[i].id < s[j].id } 95 | 96 | // branchPageElement represents a node on a branch page. 97 | type branchPageElement struct { 98 | pos uint32 99 | ksize uint32 100 | pgid pgid 101 | } 102 | 103 | // key returns a byte slice of the node key. 104 | func (n *branchPageElement) key() []byte { 105 | buf := (*[maxAllocSize]byte)(unsafe.Pointer(n)) 106 | return (*[maxAllocSize]byte)(unsafe.Pointer(&buf[n.pos]))[:n.ksize] 107 | } 108 | 109 | // leafPageElement represents a node on a leaf page. 
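// Its key and value are stored elsewhere in the same page: pos is the offset
// from this element to the key, and the value follows the key immediately,
// i.e. (sketch):
//
//	key   = page[elem+pos : elem+pos+ksize]
//	value = page[elem+pos+ksize : elem+pos+ksize+vsize]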
110 | type leafPageElement struct { 111 | flags uint32 112 | pos uint32 113 | ksize uint32 114 | vsize uint32 115 | } 116 | 117 | // key returns a byte slice of the node key. 118 | func (n *leafPageElement) key() []byte { 119 | buf := (*[maxAllocSize]byte)(unsafe.Pointer(n)) 120 | return (*[maxAllocSize]byte)(unsafe.Pointer(&buf[n.pos]))[:n.ksize:n.ksize] 121 | } 122 | 123 | // value returns a byte slice of the node value. 124 | func (n *leafPageElement) value() []byte { 125 | buf := (*[maxAllocSize]byte)(unsafe.Pointer(n)) 126 | return (*[maxAllocSize]byte)(unsafe.Pointer(&buf[n.pos+n.ksize]))[:n.vsize:n.vsize] 127 | } 128 | 129 | // PageInfo represents human readable information about a page. 130 | type PageInfo struct { 131 | ID int 132 | Type string 133 | Count int 134 | OverflowCount int 135 | } 136 | 137 | type pgids []pgid 138 | 139 | func (s pgids) Len() int { return len(s) } 140 | func (s pgids) Swap(i, j int) { s[i], s[j] = s[j], s[i] } 141 | func (s pgids) Less(i, j int) bool { return s[i] < s[j] } 142 | 143 | // merge returns the sorted union of a and b. 144 | func (a pgids) merge(b pgids) pgids { 145 | // Return the opposite slice if one is nil. 146 | if len(a) == 0 { 147 | return b 148 | } 149 | if len(b) == 0 { 150 | return a 151 | } 152 | merged := make(pgids, len(a)+len(b)) 153 | mergepgids(merged, a, b) 154 | return merged 155 | } 156 | 157 | // mergepgids copies the sorted union of a and b into dst. 158 | // If dst is too small, it panics. 159 | func mergepgids(dst, a, b pgids) { 160 | if len(dst) < len(a)+len(b) { 161 | panic(fmt.Errorf("mergepgids bad len %d < %d + %d", len(dst), len(a), len(b))) 162 | } 163 | // Copy in the opposite slice if one is nil. 164 | if len(a) == 0 { 165 | copy(dst, b) 166 | return 167 | } 168 | if len(b) == 0 { 169 | copy(dst, a) 170 | return 171 | } 172 | 173 | // Merged will hold all elements from both lists. 174 | merged := dst[:0] 175 | 176 | // Assign lead to the slice with a lower starting value, follow to the higher value. 177 | lead, follow := a, b 178 | if b[0] < a[0] { 179 | lead, follow = b, a 180 | } 181 | 182 | // Continue while there are elements in the lead. 183 | for len(lead) > 0 { 184 | // Merge largest prefix of lead that is ahead of follow[0]. 185 | n := sort.Search(len(lead), func(i int) bool { return lead[i] > follow[0] }) 186 | merged = append(merged, lead[:n]...) 187 | if n >= len(lead) { 188 | break 189 | } 190 | 191 | // Swap lead and follow. 192 | lead, follow = follow, lead[n:] 193 | } 194 | 195 | // Append what's left in follow. 196 | _ = append(merged, follow...) 197 | } 198 | -------------------------------------------------------------------------------- /page_test.go: -------------------------------------------------------------------------------- 1 | package bolt 2 | 3 | import ( 4 | "reflect" 5 | "sort" 6 | "testing" 7 | "testing/quick" 8 | ) 9 | 10 | // Ensure that the page type can be returned in human readable format. 
11 | func TestPage_typ(t *testing.T) { 12 | if typ := (&page{flags: branchPageFlag}).typ(); typ != "branch" { 13 | t.Fatalf("exp=branch; got=%v", typ) 14 | } 15 | if typ := (&page{flags: leafPageFlag}).typ(); typ != "leaf" { 16 | t.Fatalf("exp=leaf; got=%v", typ) 17 | } 18 | if typ := (&page{flags: metaPageFlag}).typ(); typ != "meta" { 19 | t.Fatalf("exp=meta; got=%v", typ) 20 | } 21 | if typ := (&page{flags: freelistPageFlag}).typ(); typ != "freelist" { 22 | t.Fatalf("exp=freelist; got=%v", typ) 23 | } 24 | if typ := (&page{flags: 20000}).typ(); typ != "unknown<4e20>" { 25 | t.Fatalf("exp=unknown<4e20>; got=%v", typ) 26 | } 27 | } 28 | 29 | // Ensure that the hexdump debugging function doesn't blow up. 30 | func TestPage_dump(t *testing.T) { 31 | (&page{id: 256}).hexdump(16) 32 | } 33 | 34 | func TestPgids_merge(t *testing.T) { 35 | a := pgids{4, 5, 6, 10, 11, 12, 13, 27} 36 | b := pgids{1, 3, 8, 9, 25, 30} 37 | c := a.merge(b) 38 | if !reflect.DeepEqual(c, pgids{1, 3, 4, 5, 6, 8, 9, 10, 11, 12, 13, 25, 27, 30}) { 39 | t.Errorf("mismatch: %v", c) 40 | } 41 | 42 | a = pgids{4, 5, 6, 10, 11, 12, 13, 27, 35, 36} 43 | b = pgids{8, 9, 25, 30} 44 | c = a.merge(b) 45 | if !reflect.DeepEqual(c, pgids{4, 5, 6, 8, 9, 10, 11, 12, 13, 25, 27, 30, 35, 36}) { 46 | t.Errorf("mismatch: %v", c) 47 | } 48 | } 49 | 50 | func TestPgids_merge_quick(t *testing.T) { 51 | if err := quick.Check(func(a, b pgids) bool { 52 | // Sort incoming lists. 53 | sort.Sort(a) 54 | sort.Sort(b) 55 | 56 | // Merge the two lists together. 57 | got := a.merge(b) 58 | 59 | // The expected value should be the two lists combined and sorted. 60 | exp := append(a, b...) 61 | sort.Sort(exp) 62 | 63 | if !reflect.DeepEqual(exp, got) { 64 | t.Errorf("\nexp=%+v\ngot=%+v\n", exp, got) 65 | return false 66 | } 67 | 68 | return true 69 | }, nil); err != nil { 70 | t.Fatal(err) 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /quick_test.go: -------------------------------------------------------------------------------- 1 | package bolt_test 2 | 3 | import ( 4 | "bytes" 5 | "flag" 6 | "fmt" 7 | "math/rand" 8 | "os" 9 | "reflect" 10 | "testing/quick" 11 | "time" 12 | ) 13 | 14 | // testing/quick defaults to 5 iterations and a random seed. 15 | // You can override these settings from the command line: 16 | // 17 | // -quick.count The number of iterations to perform. 18 | // -quick.seed The seed to use for randomizing. 19 | // -quick.maxitems The maximum number of items to insert into a DB. 20 | // -quick.maxksize The maximum size of a key. 21 | // -quick.maxvsize The maximum size of a value. 
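//
// For example (an illustrative invocation):
//
//	go test ./... -quick.count=100 -quick.seed=42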
22 | // 23 | 24 | var qcount, qseed, qmaxitems, qmaxksize, qmaxvsize int 25 | 26 | func init() { 27 | flag.IntVar(&qcount, "quick.count", 5, "") 28 | flag.IntVar(&qseed, "quick.seed", int(time.Now().UnixNano())%100000, "") 29 | flag.IntVar(&qmaxitems, "quick.maxitems", 1000, "") 30 | flag.IntVar(&qmaxksize, "quick.maxksize", 1024, "") 31 | flag.IntVar(&qmaxvsize, "quick.maxvsize", 1024, "") 32 | flag.Parse() 33 | fmt.Fprintln(os.Stderr, "seed:", qseed) 34 | fmt.Fprintf(os.Stderr, "quick settings: count=%v, items=%v, ksize=%v, vsize=%v\n", qcount, qmaxitems, qmaxksize, qmaxvsize) 35 | } 36 | 37 | func qconfig() *quick.Config { 38 | return &quick.Config{ 39 | MaxCount: qcount, 40 | Rand: rand.New(rand.NewSource(int64(qseed))), 41 | } 42 | } 43 | 44 | type testdata []testdataitem 45 | 46 | func (t testdata) Len() int { return len(t) } 47 | func (t testdata) Swap(i, j int) { t[i], t[j] = t[j], t[i] } 48 | func (t testdata) Less(i, j int) bool { return bytes.Compare(t[i].Key, t[j].Key) == -1 } 49 | 50 | func (t testdata) Generate(rand *rand.Rand, size int) reflect.Value { 51 | n := rand.Intn(qmaxitems-1) + 1 52 | items := make(testdata, n) 53 | used := make(map[string]bool) 54 | for i := 0; i < n; i++ { 55 | item := &items[i] 56 | // Ensure that keys are unique by looping until we find one that we have not already used. 57 | for { 58 | item.Key = randByteSlice(rand, 1, qmaxksize) 59 | if !used[string(item.Key)] { 60 | used[string(item.Key)] = true 61 | break 62 | } 63 | } 64 | item.Value = randByteSlice(rand, 0, qmaxvsize) 65 | } 66 | return reflect.ValueOf(items) 67 | } 68 | 69 | type revtestdata []testdataitem 70 | 71 | func (t revtestdata) Len() int { return len(t) } 72 | func (t revtestdata) Swap(i, j int) { t[i], t[j] = t[j], t[i] } 73 | func (t revtestdata) Less(i, j int) bool { return bytes.Compare(t[i].Key, t[j].Key) == 1 } 74 | 75 | type testdataitem struct { 76 | Key []byte 77 | Value []byte 78 | } 79 | 80 | func randByteSlice(rand *rand.Rand, minSize, maxSize int) []byte { 81 | n := rand.Intn(maxSize-minSize) + minSize 82 | b := make([]byte, n) 83 | for i := 0; i < n; i++ { 84 | b[i] = byte(rand.Intn(255)) 85 | } 86 | return b 87 | } 88 | -------------------------------------------------------------------------------- /simulation_test.go: -------------------------------------------------------------------------------- 1 | package bolt_test 2 | 3 | import ( 4 | "bytes" 5 | "fmt" 6 | "math/rand" 7 | "sync" 8 | "testing" 9 | 10 | "github.com/boltdb/bolt" 11 | ) 12 | 13 | func TestSimulate_1op_1p(t *testing.T) { testSimulate(t, 1, 1) } 14 | func TestSimulate_10op_1p(t *testing.T) { testSimulate(t, 10, 1) } 15 | func TestSimulate_100op_1p(t *testing.T) { testSimulate(t, 100, 1) } 16 | func TestSimulate_1000op_1p(t *testing.T) { testSimulate(t, 1000, 1) } 17 | func TestSimulate_10000op_1p(t *testing.T) { testSimulate(t, 10000, 1) } 18 | 19 | func TestSimulate_10op_10p(t *testing.T) { testSimulate(t, 10, 10) } 20 | func TestSimulate_100op_10p(t *testing.T) { testSimulate(t, 100, 10) } 21 | func TestSimulate_1000op_10p(t *testing.T) { testSimulate(t, 1000, 10) } 22 | func TestSimulate_10000op_10p(t *testing.T) { testSimulate(t, 10000, 10) } 23 | 24 | func TestSimulate_100op_100p(t *testing.T) { testSimulate(t, 100, 100) } 25 | func TestSimulate_1000op_100p(t *testing.T) { testSimulate(t, 1000, 100) } 26 | func TestSimulate_10000op_100p(t *testing.T) { testSimulate(t, 10000, 100) } 27 | 28 | func TestSimulate_10000op_1000p(t *testing.T) { testSimulate(t, 10000, 1000) } 29 | 30 | // Randomly 
generate operations on a given database with multiple clients to ensure consistency and thread safety.
31 | func testSimulate(t *testing.T, threadCount, parallelism int) {
32 | if testing.Short() {
33 | t.Skip("skipping test in short mode.")
34 | }
35 | 
36 | rand.Seed(int64(qseed))
37 | 
38 | // A list of operations that readers and writers can perform.
39 | var readerHandlers = []simulateHandler{simulateGetHandler}
40 | var writerHandlers = []simulateHandler{simulateGetHandler, simulatePutHandler}
41 | 
42 | var versions = make(map[int]*QuickDB)
43 | versions[1] = NewQuickDB()
44 | 
45 | db := MustOpenDB()
46 | defer db.MustClose()
47 | 
48 | var mutex sync.Mutex
49 | 
50 | // Run n threads in parallel, each with its own operation.
51 | var wg sync.WaitGroup
52 | var threads = make(chan bool, parallelism)
53 | var i int
54 | for {
55 | threads <- true
56 | wg.Add(1)
57 | writable := ((rand.Int() % 100) < 20) // 20% writers
58 | 
59 | // Choose an operation to execute.
60 | var handler simulateHandler
61 | if writable {
62 | handler = writerHandlers[rand.Intn(len(writerHandlers))]
63 | } else {
64 | handler = readerHandlers[rand.Intn(len(readerHandlers))]
65 | }
66 | 
67 | // Execute a thread for the given operation.
68 | go func(writable bool, handler simulateHandler) {
69 | defer wg.Done()
70 | 
71 | // Start transaction.
72 | tx, err := db.Begin(writable)
73 | if err != nil {
74 | t.Fatal("tx begin: ", err)
75 | }
76 | 
77 | // Obtain current state of the dataset.
78 | mutex.Lock()
79 | var qdb = versions[tx.ID()]
80 | if writable {
81 | qdb = versions[tx.ID()-1].Copy()
82 | }
83 | mutex.Unlock()
84 | 
85 | // Make sure we commit/rollback the tx at the end and update the state.
86 | if writable {
87 | defer func() {
88 | mutex.Lock()
89 | versions[tx.ID()] = qdb
90 | mutex.Unlock()
91 | 
92 | if err := tx.Commit(); err != nil {
93 | t.Fatal(err)
94 | }
95 | }()
96 | } else {
97 | defer func() { _ = tx.Rollback() }()
98 | }
99 | 
100 | // Ignore operation if we don't have data yet.
101 | if qdb == nil {
102 | return
103 | }
104 | 
105 | // Execute handler.
106 | handler(tx, qdb)
107 | 
108 | // Release a thread back to the scheduling loop.
109 | <-threads
110 | }(writable, handler)
111 | 
112 | i++
113 | if i > threadCount {
114 | break
115 | }
116 | }
117 | 
118 | // Wait until all threads are done.
119 | wg.Wait()
120 | }
121 | 
122 | type simulateHandler func(tx *bolt.Tx, qdb *QuickDB)
123 | 
124 | // Retrieves a key from the database and verifies that it is what is expected.
125 | func simulateGetHandler(tx *bolt.Tx, qdb *QuickDB) {
126 | // Randomly retrieve an existing key path.
127 | keys := qdb.Rand()
128 | if len(keys) == 0 {
129 | return
130 | }
131 | 
132 | // Retrieve root bucket.
133 | b := tx.Bucket(keys[0])
134 | if b == nil {
135 | panic(fmt.Sprintf("bucket[0] expected: %08x\n", trunc(keys[0], 4)))
136 | }
137 | 
138 | // Drill into nested buckets.
139 | for _, key := range keys[1 : len(keys)-1] {
140 | b = b.Bucket(key)
141 | if b == nil {
142 | panic(fmt.Sprintf("bucket[n] expected: %v -> %v\n", keys, key))
143 | }
144 | }
145 | 
146 | // Verify key/value on the final bucket.
147 | expected := qdb.Get(keys)
148 | actual := b.Get(keys[len(keys)-1])
149 | if !bytes.Equal(actual, expected) {
150 | fmt.Println("=== EXPECTED ===")
151 | fmt.Println(expected)
152 | fmt.Println("=== ACTUAL ===")
153 | fmt.Println(actual)
154 | fmt.Println("=== END ===")
155 | panic("value mismatch")
156 | }
157 | }
158 | 
159 | // Inserts a key into the database.
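// Each successful Put is mirrored into the in-memory QuickDB so that a later
// simulateGetHandler call can verify the value round-trips.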
160 | func simulatePutHandler(tx *bolt.Tx, qdb *QuickDB) { 161 | var err error 162 | keys, value := randKeys(), randValue() 163 | 164 | // Retrieve root bucket. 165 | b := tx.Bucket(keys[0]) 166 | if b == nil { 167 | b, err = tx.CreateBucket(keys[0]) 168 | if err != nil { 169 | panic("create bucket: " + err.Error()) 170 | } 171 | } 172 | 173 | // Create nested buckets, if necessary. 174 | for _, key := range keys[1 : len(keys)-1] { 175 | child := b.Bucket(key) 176 | if child != nil { 177 | b = child 178 | } else { 179 | b, err = b.CreateBucket(key) 180 | if err != nil { 181 | panic("create bucket: " + err.Error()) 182 | } 183 | } 184 | } 185 | 186 | // Insert into database. 187 | if err := b.Put(keys[len(keys)-1], value); err != nil { 188 | panic("put: " + err.Error()) 189 | } 190 | 191 | // Insert into in-memory database. 192 | qdb.Put(keys, value) 193 | } 194 | 195 | // QuickDB is an in-memory database that replicates the functionality of the 196 | // Bolt DB type except that it is entirely in-memory. It is meant for testing 197 | // that the Bolt database is consistent. 198 | type QuickDB struct { 199 | sync.RWMutex 200 | m map[string]interface{} 201 | } 202 | 203 | // NewQuickDB returns an instance of QuickDB. 204 | func NewQuickDB() *QuickDB { 205 | return &QuickDB{m: make(map[string]interface{})} 206 | } 207 | 208 | // Get retrieves the value at a key path. 209 | func (db *QuickDB) Get(keys [][]byte) []byte { 210 | db.RLock() 211 | defer db.RUnlock() 212 | 213 | m := db.m 214 | for _, key := range keys[:len(keys)-1] { 215 | value := m[string(key)] 216 | if value == nil { 217 | return nil 218 | } 219 | switch value := value.(type) { 220 | case map[string]interface{}: 221 | m = value 222 | case []byte: 223 | return nil 224 | } 225 | } 226 | 227 | // Only return if it's a simple value. 228 | if value, ok := m[string(keys[len(keys)-1])].([]byte); ok { 229 | return value 230 | } 231 | return nil 232 | } 233 | 234 | // Put inserts a value into a key path. 235 | func (db *QuickDB) Put(keys [][]byte, value []byte) { 236 | db.Lock() 237 | defer db.Unlock() 238 | 239 | // Build buckets all the way down the key path. 240 | m := db.m 241 | for _, key := range keys[:len(keys)-1] { 242 | if _, ok := m[string(key)].([]byte); ok { 243 | return // Keypath intersects with a simple value. Do nothing. 244 | } 245 | 246 | if m[string(key)] == nil { 247 | m[string(key)] = make(map[string]interface{}) 248 | } 249 | m = m[string(key)].(map[string]interface{}) 250 | } 251 | 252 | // Insert value into the last key. 253 | m[string(keys[len(keys)-1])] = value 254 | } 255 | 256 | // Rand returns a random key path that points to a simple value. 257 | func (db *QuickDB) Rand() [][]byte { 258 | db.RLock() 259 | defer db.RUnlock() 260 | if len(db.m) == 0 { 261 | return nil 262 | } 263 | var keys [][]byte 264 | db.rand(db.m, &keys) 265 | return keys 266 | } 267 | 268 | func (db *QuickDB) rand(m map[string]interface{}, keys *[][]byte) { 269 | i, index := 0, rand.Intn(len(m)) 270 | for k, v := range m { 271 | if i == index { 272 | *keys = append(*keys, []byte(k)) 273 | if v, ok := v.(map[string]interface{}); ok { 274 | db.rand(v, keys) 275 | } 276 | return 277 | } 278 | i++ 279 | } 280 | panic("quickdb rand: out-of-range") 281 | } 282 | 283 | // Copy copies the entire database. 
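// The simulation's writer path uses it to derive a private copy of the
// previous committed state, roughly:
//
//	base := versions[tx.ID()-1] // state as of the prior commit
//	qdb := base.Copy()          // free to mutate without racing readers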
284 | func (db *QuickDB) Copy() *QuickDB { 285 | db.RLock() 286 | defer db.RUnlock() 287 | return &QuickDB{m: db.copy(db.m)} 288 | } 289 | 290 | func (db *QuickDB) copy(m map[string]interface{}) map[string]interface{} { 291 | clone := make(map[string]interface{}, len(m)) 292 | for k, v := range m { 293 | switch v := v.(type) { 294 | case map[string]interface{}: 295 | clone[k] = db.copy(v) 296 | default: 297 | clone[k] = v 298 | } 299 | } 300 | return clone 301 | } 302 | 303 | func randKey() []byte { 304 | var min, max = 1, 1024 305 | n := rand.Intn(max-min) + min 306 | b := make([]byte, n) 307 | for i := 0; i < n; i++ { 308 | b[i] = byte(rand.Intn(255)) 309 | } 310 | return b 311 | } 312 | 313 | func randKeys() [][]byte { 314 | var keys [][]byte 315 | var count = rand.Intn(2) + 2 316 | for i := 0; i < count; i++ { 317 | keys = append(keys, randKey()) 318 | } 319 | return keys 320 | } 321 | 322 | func randValue() []byte { 323 | n := rand.Intn(8192) 324 | b := make([]byte, n) 325 | for i := 0; i < n; i++ { 326 | b[i] = byte(rand.Intn(255)) 327 | } 328 | return b 329 | } 330 | -------------------------------------------------------------------------------- /tx.go: -------------------------------------------------------------------------------- 1 | package bolt 2 | 3 | import ( 4 | "fmt" 5 | "io" 6 | "os" 7 | "sort" 8 | "strings" 9 | "time" 10 | "unsafe" 11 | ) 12 | 13 | // txid represents the internal transaction identifier. 14 | type txid uint64 15 | 16 | // Tx represents a read-only or read/write transaction on the database. 17 | // Read-only transactions can be used for retrieving values for keys and creating cursors. 18 | // Read/write transactions can create and remove buckets and create and remove keys. 19 | // 20 | // IMPORTANT: You must commit or rollback transactions when you are done with 21 | // them. Pages can not be reclaimed by the writer until no more transactions 22 | // are using them. A long running read transaction can cause the database to 23 | // quickly grow. 24 | type Tx struct { 25 | writable bool 26 | managed bool 27 | db *DB 28 | meta *meta 29 | root Bucket 30 | pages map[pgid]*page 31 | stats TxStats 32 | commitHandlers []func() 33 | 34 | // WriteFlag specifies the flag for write-related methods like WriteTo(). 35 | // Tx opens the database file with the specified flag to copy the data. 36 | // 37 | // By default, the flag is unset, which works well for mostly in-memory 38 | // workloads. For databases that are much larger than available RAM, 39 | // set the flag to syscall.O_DIRECT to avoid trashing the page cache. 40 | WriteFlag int 41 | } 42 | 43 | // init initializes the transaction. 44 | func (tx *Tx) init(db *DB) { 45 | tx.db = db 46 | tx.pages = nil 47 | 48 | // Copy the meta page since it can be changed by the writer. 49 | tx.meta = &meta{} 50 | db.meta().copy(tx.meta) 51 | 52 | // Copy over the root bucket. 53 | tx.root = newBucket(tx) 54 | tx.root.bucket = &bucket{} 55 | *tx.root.bucket = tx.meta.root 56 | 57 | // Increment the transaction id and add a page cache for writable transactions. 58 | if tx.writable { 59 | tx.pages = make(map[pgid]*page) 60 | tx.meta.txid += txid(1) 61 | } 62 | } 63 | 64 | // ID returns the transaction id. 65 | func (tx *Tx) ID() int { 66 | return int(tx.meta.txid) 67 | } 68 | 69 | // DB returns a reference to the database that created the transaction. 70 | func (tx *Tx) DB() *DB { 71 | return tx.db 72 | } 73 | 74 | // Size returns current database size in bytes as seen by this transaction. 
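// The value is the transaction's high-water page id multiplied by the page
// size, so it reflects allocated pages rather than live key/value data.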
75 | func (tx *Tx) Size() int64 { 76 | return int64(tx.meta.pgid) * int64(tx.db.pageSize) 77 | } 78 | 79 | // Writable returns whether the transaction can perform write operations. 80 | func (tx *Tx) Writable() bool { 81 | return tx.writable 82 | } 83 | 84 | // Cursor creates a cursor associated with the root bucket. 85 | // All items in the cursor will return a nil value because all root bucket keys point to buckets. 86 | // The cursor is only valid as long as the transaction is open. 87 | // Do not use a cursor after the transaction is closed. 88 | func (tx *Tx) Cursor() *Cursor { 89 | return tx.root.Cursor() 90 | } 91 | 92 | // Stats retrieves a copy of the current transaction statistics. 93 | func (tx *Tx) Stats() TxStats { 94 | return tx.stats 95 | } 96 | 97 | // Bucket retrieves a bucket by name. 98 | // Returns nil if the bucket does not exist. 99 | // The bucket instance is only valid for the lifetime of the transaction. 100 | func (tx *Tx) Bucket(name []byte) *Bucket { 101 | return tx.root.Bucket(name) 102 | } 103 | 104 | // CreateBucket creates a new bucket. 105 | // Returns an error if the bucket already exists, if the bucket name is blank, or if the bucket name is too long. 106 | // The bucket instance is only valid for the lifetime of the transaction. 107 | func (tx *Tx) CreateBucket(name []byte) (*Bucket, error) { 108 | return tx.root.CreateBucket(name) 109 | } 110 | 111 | // CreateBucketIfNotExists creates a new bucket if it doesn't already exist. 112 | // Returns an error if the bucket name is blank, or if the bucket name is too long. 113 | // The bucket instance is only valid for the lifetime of the transaction. 114 | func (tx *Tx) CreateBucketIfNotExists(name []byte) (*Bucket, error) { 115 | return tx.root.CreateBucketIfNotExists(name) 116 | } 117 | 118 | // DeleteBucket deletes a bucket. 119 | // Returns an error if the bucket cannot be found or if the key represents a non-bucket value. 120 | func (tx *Tx) DeleteBucket(name []byte) error { 121 | return tx.root.DeleteBucket(name) 122 | } 123 | 124 | // ForEach executes a function for each bucket in the root. 125 | // If the provided function returns an error then the iteration is stopped and 126 | // the error is returned to the caller. 127 | func (tx *Tx) ForEach(fn func(name []byte, b *Bucket) error) error { 128 | return tx.root.ForEach(func(k, v []byte) error { 129 | if err := fn(k, tx.root.Bucket(k)); err != nil { 130 | return err 131 | } 132 | return nil 133 | }) 134 | } 135 | 136 | // OnCommit adds a handler function to be executed after the transaction successfully commits. 137 | func (tx *Tx) OnCommit(fn func()) { 138 | tx.commitHandlers = append(tx.commitHandlers, fn) 139 | } 140 | 141 | // Commit writes all changes to disk and updates the meta page. 142 | // Returns an error if a disk write error occurs, or if Commit is 143 | // called on a read-only transaction. 144 | func (tx *Tx) Commit() error { 145 | _assert(!tx.managed, "managed tx commit not allowed") 146 | if tx.db == nil { 147 | return ErrTxClosed 148 | } else if !tx.writable { 149 | return ErrTxNotWritable 150 | } 151 | 152 | // TODO(benbjohnson): Use vectorized I/O to write out dirty pages. 153 | 154 | // Rebalance nodes which have had deletions. 155 | var startTime = time.Now() 156 | tx.root.rebalance() 157 | if tx.stats.Rebalance > 0 { 158 | tx.stats.RebalanceTime += time.Since(startTime) 159 | } 160 | 161 | // spill data onto dirty pages. 
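// Spilling materializes modified in-memory nodes as dirty pages in tx.pages
// so the write step below can flush them to disk.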
162 | startTime = time.Now() 163 | if err := tx.root.spill(); err != nil { 164 | tx.rollback() 165 | return err 166 | } 167 | tx.stats.SpillTime += time.Since(startTime) 168 | 169 | // Free the old root bucket. 170 | tx.meta.root.root = tx.root.root 171 | 172 | opgid := tx.meta.pgid 173 | 174 | // Free the freelist and allocate new pages for it. This will overestimate 175 | // the size of the freelist but not underestimate the size (which would be bad). 176 | tx.db.freelist.free(tx.meta.txid, tx.db.page(tx.meta.freelist)) 177 | p, err := tx.allocate((tx.db.freelist.size() / tx.db.pageSize) + 1) 178 | if err != nil { 179 | tx.rollback() 180 | return err 181 | } 182 | if err := tx.db.freelist.write(p); err != nil { 183 | tx.rollback() 184 | return err 185 | } 186 | tx.meta.freelist = p.id 187 | 188 | // If the high water mark has moved up then attempt to grow the database. 189 | if tx.meta.pgid > opgid { 190 | if err := tx.db.grow(int(tx.meta.pgid+1) * tx.db.pageSize); err != nil { 191 | tx.rollback() 192 | return err 193 | } 194 | } 195 | 196 | // Write dirty pages to disk. 197 | startTime = time.Now() 198 | if err := tx.write(); err != nil { 199 | tx.rollback() 200 | return err 201 | } 202 | 203 | // If strict mode is enabled then perform a consistency check. 204 | // Only the first consistency error is reported in the panic. 205 | if tx.db.StrictMode { 206 | ch := tx.Check() 207 | var errs []string 208 | for { 209 | err, ok := <-ch 210 | if !ok { 211 | break 212 | } 213 | errs = append(errs, err.Error()) 214 | } 215 | if len(errs) > 0 { 216 | panic("check fail: " + strings.Join(errs, "\n")) 217 | } 218 | } 219 | 220 | // Write meta to disk. 221 | if err := tx.writeMeta(); err != nil { 222 | tx.rollback() 223 | return err 224 | } 225 | tx.stats.WriteTime += time.Since(startTime) 226 | 227 | // Finalize the transaction. 228 | tx.close() 229 | 230 | // Execute commit handlers now that the locks have been removed. 231 | for _, fn := range tx.commitHandlers { 232 | fn() 233 | } 234 | 235 | return nil 236 | } 237 | 238 | // Rollback closes the transaction and ignores all previous updates. Read-only 239 | // transactions must be rolled back and not committed. 240 | func (tx *Tx) Rollback() error { 241 | _assert(!tx.managed, "managed tx rollback not allowed") 242 | if tx.db == nil { 243 | return ErrTxClosed 244 | } 245 | tx.rollback() 246 | return nil 247 | } 248 | 249 | func (tx *Tx) rollback() { 250 | if tx.db == nil { 251 | return 252 | } 253 | if tx.writable { 254 | tx.db.freelist.rollback(tx.meta.txid) 255 | tx.db.freelist.reload(tx.db.page(tx.db.meta().freelist)) 256 | } 257 | tx.close() 258 | } 259 | 260 | func (tx *Tx) close() { 261 | if tx.db == nil { 262 | return 263 | } 264 | if tx.writable { 265 | // Grab freelist stats. 266 | var freelistFreeN = tx.db.freelist.free_count() 267 | var freelistPendingN = tx.db.freelist.pending_count() 268 | var freelistAlloc = tx.db.freelist.size() 269 | 270 | // Remove transaction ref & writer lock. 271 | tx.db.rwtx = nil 272 | tx.db.rwlock.Unlock() 273 | 274 | // Merge statistics. 275 | tx.db.statlock.Lock() 276 | tx.db.stats.FreePageN = freelistFreeN 277 | tx.db.stats.PendingPageN = freelistPendingN 278 | tx.db.stats.FreeAlloc = (freelistFreeN + freelistPendingN) * tx.db.pageSize 279 | tx.db.stats.FreelistInuse = freelistAlloc 280 | tx.db.stats.TxStats.add(&tx.stats) 281 | tx.db.statlock.Unlock() 282 | } else { 283 | tx.db.removeTx(tx) 284 | } 285 | 286 | // Clear all references. 
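// Once these are nil, Commit, Rollback, and Page on this tx report ErrTxClosed.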
287 | tx.db = nil
288 | tx.meta = nil
289 | tx.root = Bucket{tx: tx}
290 | tx.pages = nil
291 | }
292 | 
293 | // Copy writes the entire database to a writer.
294 | // This function exists for backwards compatibility.
295 | //
296 | // Deprecated: Use WriteTo() instead.
297 | func (tx *Tx) Copy(w io.Writer) error {
298 | _, err := tx.WriteTo(w)
299 | return err
300 | }
301 | 
302 | // WriteTo writes the entire database to a writer.
303 | // If err == nil then exactly tx.Size() bytes will be written into the writer.
304 | func (tx *Tx) WriteTo(w io.Writer) (n int64, err error) {
305 | // Attempt to open a reader with the WriteFlag.
306 | f, err := os.OpenFile(tx.db.path, os.O_RDONLY|tx.WriteFlag, 0)
307 | if err != nil {
308 | return 0, err
309 | }
310 | defer func() { _ = f.Close() }()
311 | 
312 | // Generate a meta page. We use the same page data for both meta pages.
313 | buf := make([]byte, tx.db.pageSize)
314 | page := (*page)(unsafe.Pointer(&buf[0]))
315 | page.flags = metaPageFlag
316 | *page.meta() = *tx.meta
317 | 
318 | // Write meta 0.
319 | page.id = 0
320 | page.meta().checksum = page.meta().sum64()
321 | nn, err := w.Write(buf)
322 | n += int64(nn)
323 | if err != nil {
324 | return n, fmt.Errorf("meta 0 copy: %s", err)
325 | }
326 | 
327 | // Write meta 1 with a lower transaction id.
328 | page.id = 1
329 | page.meta().txid -= 1
330 | page.meta().checksum = page.meta().sum64()
331 | nn, err = w.Write(buf)
332 | n += int64(nn)
333 | if err != nil {
334 | return n, fmt.Errorf("meta 1 copy: %s", err)
335 | }
336 | 
337 | // Move past the meta pages in the file.
338 | if _, err := f.Seek(int64(tx.db.pageSize*2), os.SEEK_SET); err != nil {
339 | return n, fmt.Errorf("seek: %s", err)
340 | }
341 | 
342 | // Copy data pages.
343 | wn, err := io.CopyN(w, f, tx.Size()-int64(tx.db.pageSize*2))
344 | n += wn
345 | if err != nil {
346 | return n, err
347 | }
348 | 
349 | return n, f.Close()
350 | }
351 | 
352 | // CopyFile copies the entire database to file at the given path.
353 | // A reader transaction is maintained during the copy so it is safe to continue
354 | // using the database while a copy is in progress.
355 | func (tx *Tx) CopyFile(path string, mode os.FileMode) error {
356 | f, err := os.OpenFile(path, os.O_RDWR|os.O_CREATE|os.O_TRUNC, mode)
357 | if err != nil {
358 | return err
359 | }
360 | 
361 | err = tx.Copy(f)
362 | if err != nil {
363 | _ = f.Close()
364 | return err
365 | }
366 | return f.Close()
367 | }
368 | 
369 | // Check performs several consistency checks on the database for this transaction.
370 | // An error is returned if any inconsistency is found.
371 | //
372 | // It can be safely run concurrently on a writable transaction. However, this
373 | // incurs a high cost for large databases and databases with a lot of subbuckets
374 | // because of caching. This overhead can be removed if running on a read-only
375 | // transaction; however, it is not safe to execute other write transactions at
376 | // the same time.
377 | func (tx *Tx) Check() <-chan error {
378 | ch := make(chan error)
379 | go tx.check(ch)
380 | return ch
381 | }
382 | 
383 | func (tx *Tx) check(ch chan error) {
384 | // Check if any pages are double freed.
385 | freed := make(map[pgid]bool)
386 | all := make([]pgid, tx.db.freelist.count())
387 | tx.db.freelist.copyall(all)
388 | for _, id := range all {
389 | if freed[id] {
390 | ch <- fmt.Errorf("page %d: already freed", id)
391 | }
392 | freed[id] = true
393 | }
394 | 
395 | // Track every reachable page.
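// The two meta pages are always live, as are the freelist page and its
// overflow pages; every other page below the high-water mark must either be
// reachable from a bucket or present in the freelist.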
396 | reachable := make(map[pgid]*page) 397 | reachable[0] = tx.page(0) // meta0 398 | reachable[1] = tx.page(1) // meta1 399 | for i := uint32(0); i <= tx.page(tx.meta.freelist).overflow; i++ { 400 | reachable[tx.meta.freelist+pgid(i)] = tx.page(tx.meta.freelist) 401 | } 402 | 403 | // Recursively check buckets. 404 | tx.checkBucket(&tx.root, reachable, freed, ch) 405 | 406 | // Ensure all pages below high water mark are either reachable or freed. 407 | for i := pgid(0); i < tx.meta.pgid; i++ { 408 | _, isReachable := reachable[i] 409 | if !isReachable && !freed[i] { 410 | ch <- fmt.Errorf("page %d: unreachable unfreed", int(i)) 411 | } 412 | } 413 | 414 | // Close the channel to signal completion. 415 | close(ch) 416 | } 417 | 418 | func (tx *Tx) checkBucket(b *Bucket, reachable map[pgid]*page, freed map[pgid]bool, ch chan error) { 419 | // Ignore inline buckets. 420 | if b.root == 0 { 421 | return 422 | } 423 | 424 | // Check every page used by this bucket. 425 | b.tx.forEachPage(b.root, 0, func(p *page, _ int) { 426 | if p.id > tx.meta.pgid { 427 | ch <- fmt.Errorf("page %d: out of bounds: %d", int(p.id), int(b.tx.meta.pgid)) 428 | } 429 | 430 | // Ensure each page is only referenced once. 431 | for i := pgid(0); i <= pgid(p.overflow); i++ { 432 | var id = p.id + i 433 | if _, ok := reachable[id]; ok { 434 | ch <- fmt.Errorf("page %d: multiple references", int(id)) 435 | } 436 | reachable[id] = p 437 | } 438 | 439 | // We should only encounter un-freed leaf and branch pages. 440 | if freed[p.id] { 441 | ch <- fmt.Errorf("page %d: reachable freed", int(p.id)) 442 | } else if (p.flags&branchPageFlag) == 0 && (p.flags&leafPageFlag) == 0 { 443 | ch <- fmt.Errorf("page %d: invalid type: %s", int(p.id), p.typ()) 444 | } 445 | }) 446 | 447 | // Check each bucket within this bucket. 448 | _ = b.ForEach(func(k, v []byte) error { 449 | if child := b.Bucket(k); child != nil { 450 | tx.checkBucket(child, reachable, freed, ch) 451 | } 452 | return nil 453 | }) 454 | } 455 | 456 | // allocate returns a contiguous block of memory starting at a given page. 457 | func (tx *Tx) allocate(count int) (*page, error) { 458 | p, err := tx.db.allocate(count) 459 | if err != nil { 460 | return nil, err 461 | } 462 | 463 | // Save to our page cache. 464 | tx.pages[p.id] = p 465 | 466 | // Update statistics. 467 | tx.stats.PageCount++ 468 | tx.stats.PageAlloc += count * tx.db.pageSize 469 | 470 | return p, nil 471 | } 472 | 473 | // write writes any dirty pages to disk. 474 | func (tx *Tx) write() error { 475 | // Sort pages by id. 476 | pages := make(pages, 0, len(tx.pages)) 477 | for _, p := range tx.pages { 478 | pages = append(pages, p) 479 | } 480 | // Clear out page cache early. 481 | tx.pages = make(map[pgid]*page) 482 | sort.Sort(pages) 483 | 484 | // Write pages to disk in order. 485 | for _, p := range pages { 486 | size := (int(p.overflow) + 1) * tx.db.pageSize 487 | offset := int64(p.id) * int64(tx.db.pageSize) 488 | 489 | // Write out page in "max allocation" sized chunks. 490 | ptr := (*[maxAllocSize]byte)(unsafe.Pointer(p)) 491 | for { 492 | // Limit our write to our max allocation size. 493 | sz := size 494 | if sz > maxAllocSize-1 { 495 | sz = maxAllocSize - 1 496 | } 497 | 498 | // Write chunk to disk. 499 | buf := ptr[:sz] 500 | if _, err := tx.db.ops.writeAt(buf, offset); err != nil { 501 | return err 502 | } 503 | 504 | // Update statistics. 505 | tx.stats.Write++ 506 | 507 | // Exit inner for loop if we've written all the chunks. 
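// (Each pass writes at most maxAllocSize-1 bytes, so size shrinks to zero.)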
508 | size -= sz 509 | if size == 0 { 510 | break 511 | } 512 | 513 | // Otherwise move offset forward and move pointer to next chunk. 514 | offset += int64(sz) 515 | ptr = (*[maxAllocSize]byte)(unsafe.Pointer(&ptr[sz])) 516 | } 517 | } 518 | 519 | // Ignore file sync if flag is set on DB. 520 | if !tx.db.NoSync || IgnoreNoSync { 521 | if err := fdatasync(tx.db); err != nil { 522 | return err 523 | } 524 | } 525 | 526 | // Put small pages back to page pool. 527 | for _, p := range pages { 528 | // Ignore page sizes over 1 page. 529 | // These are allocated using make() instead of the page pool. 530 | if int(p.overflow) != 0 { 531 | continue 532 | } 533 | 534 | buf := (*[maxAllocSize]byte)(unsafe.Pointer(p))[:tx.db.pageSize] 535 | 536 | // See https://go.googlesource.com/go/+/f03c9202c43e0abb130669852082117ca50aa9b1 537 | for i := range buf { 538 | buf[i] = 0 539 | } 540 | tx.db.pagePool.Put(buf) 541 | } 542 | 543 | return nil 544 | } 545 | 546 | // writeMeta writes the meta to the disk. 547 | func (tx *Tx) writeMeta() error { 548 | // Create a temporary buffer for the meta page. 549 | buf := make([]byte, tx.db.pageSize) 550 | p := tx.db.pageInBuffer(buf, 0) 551 | tx.meta.write(p) 552 | 553 | // Write the meta page to file. 554 | if _, err := tx.db.ops.writeAt(buf, int64(p.id)*int64(tx.db.pageSize)); err != nil { 555 | return err 556 | } 557 | if !tx.db.NoSync || IgnoreNoSync { 558 | if err := fdatasync(tx.db); err != nil { 559 | return err 560 | } 561 | } 562 | 563 | // Update statistics. 564 | tx.stats.Write++ 565 | 566 | return nil 567 | } 568 | 569 | // page returns a reference to the page with a given id. 570 | // If page has been written to then a temporary buffered page is returned. 571 | func (tx *Tx) page(id pgid) *page { 572 | // Check the dirty pages first. 573 | if tx.pages != nil { 574 | if p, ok := tx.pages[id]; ok { 575 | return p 576 | } 577 | } 578 | 579 | // Otherwise return directly from the mmap. 580 | return tx.db.page(id) 581 | } 582 | 583 | // forEachPage iterates over every page within a given page and executes a function. 584 | func (tx *Tx) forEachPage(pgid pgid, depth int, fn func(*page, int)) { 585 | p := tx.page(pgid) 586 | 587 | // Execute function. 588 | fn(p, depth) 589 | 590 | // Recursively loop over children. 591 | if (p.flags & branchPageFlag) != 0 { 592 | for i := 0; i < int(p.count); i++ { 593 | elem := p.branchPageElement(uint16(i)) 594 | tx.forEachPage(elem.pgid, depth+1, fn) 595 | } 596 | } 597 | } 598 | 599 | // Page returns page information for a given page number. 600 | // This is only safe for concurrent use when used by a writable transaction. 601 | func (tx *Tx) Page(id int) (*PageInfo, error) { 602 | if tx.db == nil { 603 | return nil, ErrTxClosed 604 | } else if pgid(id) >= tx.meta.pgid { 605 | return nil, nil 606 | } 607 | 608 | // Build the page info. 609 | p := tx.db.page(pgid(id)) 610 | info := &PageInfo{ 611 | ID: id, 612 | Count: int(p.count), 613 | OverflowCount: int(p.overflow), 614 | } 615 | 616 | // Determine the type (or if it's free). 617 | if tx.db.freelist.freed(pgid(id)) { 618 | info.Type = "free" 619 | } else { 620 | info.Type = p.typ() 621 | } 622 | 623 | return info, nil 624 | } 625 | 626 | // TxStats represents statistics about the actions performed by the transaction. 627 | type TxStats struct { 628 | // Page statistics. 629 | PageCount int // number of page allocations 630 | PageAlloc int // total bytes allocated 631 | 632 | // Cursor statistics. 
633 | CursorCount int // number of cursors created
634 | 
635 | // Node statistics.
636 | NodeCount int // number of node allocations
637 | NodeDeref int // number of node dereferences
638 | 
639 | // Rebalance statistics.
640 | Rebalance int // number of node rebalances
641 | RebalanceTime time.Duration // total time spent rebalancing
642 | 
643 | // Split/Spill statistics.
644 | Split int // number of nodes split
645 | Spill int // number of nodes spilled
646 | SpillTime time.Duration // total time spent spilling
647 | 
648 | // Write statistics.
649 | Write int // number of writes performed
650 | WriteTime time.Duration // total time spent writing to disk
651 | }
652 | 
653 | func (s *TxStats) add(other *TxStats) {
654 | s.PageCount += other.PageCount
655 | s.PageAlloc += other.PageAlloc
656 | s.CursorCount += other.CursorCount
657 | s.NodeCount += other.NodeCount
658 | s.NodeDeref += other.NodeDeref
659 | s.Rebalance += other.Rebalance
660 | s.RebalanceTime += other.RebalanceTime
661 | s.Split += other.Split
662 | s.Spill += other.Spill
663 | s.SpillTime += other.SpillTime
664 | s.Write += other.Write
665 | s.WriteTime += other.WriteTime
666 | }
667 | 
668 | // Sub calculates and returns the difference between two sets of transaction stats.
669 | // This is useful when obtaining stats at two different points in time and
670 | // you need the performance counters that occurred within that time span.
671 | func (s *TxStats) Sub(other *TxStats) TxStats {
672 | var diff TxStats
673 | diff.PageCount = s.PageCount - other.PageCount
674 | diff.PageAlloc = s.PageAlloc - other.PageAlloc
675 | diff.CursorCount = s.CursorCount - other.CursorCount
676 | diff.NodeCount = s.NodeCount - other.NodeCount
677 | diff.NodeDeref = s.NodeDeref - other.NodeDeref
678 | diff.Rebalance = s.Rebalance - other.Rebalance
679 | diff.RebalanceTime = s.RebalanceTime - other.RebalanceTime
680 | diff.Split = s.Split - other.Split
681 | diff.Spill = s.Spill - other.Spill
682 | diff.SpillTime = s.SpillTime - other.SpillTime
683 | diff.Write = s.Write - other.Write
684 | diff.WriteTime = s.WriteTime - other.WriteTime
685 | return diff
686 | }
687 | 
--------------------------------------------------------------------------------
/tx_test.go:
--------------------------------------------------------------------------------
1 | package bolt_test
2 | 
3 | import (
4 | "bytes"
5 | "errors"
6 | "fmt"
7 | "log"
8 | "os"
9 | "testing"
10 | 
11 | "github.com/boltdb/bolt"
12 | )
13 | 
14 | // Ensure that committing a closed transaction returns an error.
15 | func TestTx_Commit_ErrTxClosed(t *testing.T) {
16 | db := MustOpenDB()
17 | defer db.MustClose()
18 | tx, err := db.Begin(true)
19 | if err != nil {
20 | t.Fatal(err)
21 | }
22 | 
23 | if _, err := tx.CreateBucket([]byte("foo")); err != nil {
24 | t.Fatal(err)
25 | }
26 | 
27 | if err := tx.Commit(); err != nil {
28 | t.Fatal(err)
29 | }
30 | 
31 | if err := tx.Commit(); err != bolt.ErrTxClosed {
32 | t.Fatalf("unexpected error: %s", err)
33 | }
34 | }
35 | 
36 | // Ensure that rolling back a closed transaction returns an error.
37 | func TestTx_Rollback_ErrTxClosed(t *testing.T) {
38 | db := MustOpenDB()
39 | defer db.MustClose()
40 | 
41 | tx, err := db.Begin(true)
42 | if err != nil {
43 | t.Fatal(err)
44 | }
45 | 
46 | if err := tx.Rollback(); err != nil {
47 | t.Fatal(err)
48 | }
49 | if err := tx.Rollback(); err != bolt.ErrTxClosed {
50 | t.Fatalf("unexpected error: %s", err)
51 | }
52 | }
53 | 
54 | // Ensure that committing a read-only transaction returns an error.
55 | func TestTx_Commit_ErrTxNotWritable(t *testing.T) {
56 | db := MustOpenDB()
57 | defer db.MustClose()
58 | tx, err := db.Begin(false)
59 | if err != nil {
60 | t.Fatal(err)
61 | }
62 | if err := tx.Commit(); err != bolt.ErrTxNotWritable {
63 | t.Fatal(err)
64 | }
65 | }
66 | 
67 | // Ensure that a transaction can retrieve a cursor on the root bucket.
68 | func TestTx_Cursor(t *testing.T) {
69 | db := MustOpenDB()
70 | defer db.MustClose()
71 | if err := db.Update(func(tx *bolt.Tx) error {
72 | if _, err := tx.CreateBucket([]byte("widgets")); err != nil {
73 | t.Fatal(err)
74 | }
75 | 
76 | if _, err := tx.CreateBucket([]byte("woojits")); err != nil {
77 | t.Fatal(err)
78 | }
79 | 
80 | c := tx.Cursor()
81 | if k, v := c.First(); !bytes.Equal(k, []byte("widgets")) {
82 | t.Fatalf("unexpected key: %v", k)
83 | } else if v != nil {
84 | t.Fatalf("unexpected value: %v", v)
85 | }
86 | 
87 | if k, v := c.Next(); !bytes.Equal(k, []byte("woojits")) {
88 | t.Fatalf("unexpected key: %v", k)
89 | } else if v != nil {
90 | t.Fatalf("unexpected value: %v", v)
91 | }
92 | 
93 | if k, v := c.Next(); k != nil {
94 | t.Fatalf("unexpected key: %v", k)
95 | } else if v != nil {
96 | t.Fatalf("unexpected value: %v", v)
97 | }
98 | 
99 | return nil
100 | }); err != nil {
101 | t.Fatal(err)
102 | }
103 | }
104 | 
105 | // Ensure that creating a bucket with a read-only transaction returns an error.
106 | func TestTx_CreateBucket_ErrTxNotWritable(t *testing.T) {
107 | db := MustOpenDB()
108 | defer db.MustClose()
109 | if err := db.View(func(tx *bolt.Tx) error {
110 | _, err := tx.CreateBucket([]byte("foo"))
111 | if err != bolt.ErrTxNotWritable {
112 | t.Fatalf("unexpected error: %s", err)
113 | }
114 | return nil
115 | }); err != nil {
116 | t.Fatal(err)
117 | }
118 | }
119 | 
120 | // Ensure that creating a bucket on a closed transaction returns an error.
121 | func TestTx_CreateBucket_ErrTxClosed(t *testing.T) {
122 | db := MustOpenDB()
123 | defer db.MustClose()
124 | tx, err := db.Begin(true)
125 | if err != nil {
126 | t.Fatal(err)
127 | }
128 | if err := tx.Commit(); err != nil {
129 | t.Fatal(err)
130 | }
131 | 
132 | if _, err := tx.CreateBucket([]byte("foo")); err != bolt.ErrTxClosed {
133 | t.Fatalf("unexpected error: %s", err)
134 | }
135 | }
136 | 
137 | // Ensure that a Tx can retrieve a bucket.
138 | func TestTx_Bucket(t *testing.T) {
139 | db := MustOpenDB()
140 | defer db.MustClose()
141 | if err := db.Update(func(tx *bolt.Tx) error {
142 | if _, err := tx.CreateBucket([]byte("widgets")); err != nil {
143 | t.Fatal(err)
144 | }
145 | if tx.Bucket([]byte("widgets")) == nil {
146 | t.Fatal("expected bucket")
147 | }
148 | return nil
149 | }); err != nil {
150 | t.Fatal(err)
151 | }
152 | }
153 | 
154 | // Ensure that a Tx retrieving a non-existent key returns nil.
155 | func TestTx_Get_NotFound(t *testing.T) {
156 | db := MustOpenDB()
157 | defer db.MustClose()
158 | if err := db.Update(func(tx *bolt.Tx) error {
159 | b, err := tx.CreateBucket([]byte("widgets"))
160 | if err != nil {
161 | t.Fatal(err)
162 | }
163 | 
164 | if err := b.Put([]byte("foo"), []byte("bar")); err != nil {
165 | t.Fatal(err)
166 | }
167 | if b.Get([]byte("no_such_key")) != nil {
168 | t.Fatal("expected nil value")
169 | }
170 | return nil
171 | }); err != nil {
172 | t.Fatal(err)
173 | }
174 | }
175 | 
176 | // Ensure that a bucket can be created and retrieved.
177 | func TestTx_CreateBucket(t *testing.T) {
178 | db := MustOpenDB()
179 | defer db.MustClose()
180 | 
181 | // Create a bucket.
182 | if err := db.Update(func(tx *bolt.Tx) error { 183 | b, err := tx.CreateBucket([]byte("widgets")) 184 | if err != nil { 185 | t.Fatal(err) 186 | } else if b == nil { 187 | t.Fatal("expected bucket") 188 | } 189 | return nil 190 | }); err != nil { 191 | t.Fatal(err) 192 | } 193 | 194 | // Read the bucket through a separate transaction. 195 | if err := db.View(func(tx *bolt.Tx) error { 196 | if tx.Bucket([]byte("widgets")) == nil { 197 | t.Fatal("expected bucket") 198 | } 199 | return nil 200 | }); err != nil { 201 | t.Fatal(err) 202 | } 203 | } 204 | 205 | // Ensure that a bucket can be created if it doesn't already exist. 206 | func TestTx_CreateBucketIfNotExists(t *testing.T) { 207 | db := MustOpenDB() 208 | defer db.MustClose() 209 | if err := db.Update(func(tx *bolt.Tx) error { 210 | // Create bucket. 211 | if b, err := tx.CreateBucketIfNotExists([]byte("widgets")); err != nil { 212 | t.Fatal(err) 213 | } else if b == nil { 214 | t.Fatal("expected bucket") 215 | } 216 | 217 | // Create bucket again. 218 | if b, err := tx.CreateBucketIfNotExists([]byte("widgets")); err != nil { 219 | t.Fatal(err) 220 | } else if b == nil { 221 | t.Fatal("expected bucket") 222 | } 223 | 224 | return nil 225 | }); err != nil { 226 | t.Fatal(err) 227 | } 228 | 229 | // Read the bucket through a separate transaction. 230 | if err := db.View(func(tx *bolt.Tx) error { 231 | if tx.Bucket([]byte("widgets")) == nil { 232 | t.Fatal("expected bucket") 233 | } 234 | return nil 235 | }); err != nil { 236 | t.Fatal(err) 237 | } 238 | } 239 | 240 | // Ensure transaction returns an error if creating an unnamed bucket. 241 | func TestTx_CreateBucketIfNotExists_ErrBucketNameRequired(t *testing.T) { 242 | db := MustOpenDB() 243 | defer db.MustClose() 244 | if err := db.Update(func(tx *bolt.Tx) error { 245 | if _, err := tx.CreateBucketIfNotExists([]byte{}); err != bolt.ErrBucketNameRequired { 246 | t.Fatalf("unexpected error: %s", err) 247 | } 248 | 249 | if _, err := tx.CreateBucketIfNotExists(nil); err != bolt.ErrBucketNameRequired { 250 | t.Fatalf("unexpected error: %s", err) 251 | } 252 | 253 | return nil 254 | }); err != nil { 255 | t.Fatal(err) 256 | } 257 | } 258 | 259 | // Ensure that a bucket cannot be created twice. 260 | func TestTx_CreateBucket_ErrBucketExists(t *testing.T) { 261 | db := MustOpenDB() 262 | defer db.MustClose() 263 | 264 | // Create a bucket. 265 | if err := db.Update(func(tx *bolt.Tx) error { 266 | if _, err := tx.CreateBucket([]byte("widgets")); err != nil { 267 | t.Fatal(err) 268 | } 269 | return nil 270 | }); err != nil { 271 | t.Fatal(err) 272 | } 273 | 274 | // Create the same bucket again. 275 | if err := db.Update(func(tx *bolt.Tx) error { 276 | if _, err := tx.CreateBucket([]byte("widgets")); err != bolt.ErrBucketExists { 277 | t.Fatalf("unexpected error: %s", err) 278 | } 279 | return nil 280 | }); err != nil { 281 | t.Fatal(err) 282 | } 283 | } 284 | 285 | // Ensure that a bucket is created with a non-blank name. 286 | func TestTx_CreateBucket_ErrBucketNameRequired(t *testing.T) { 287 | db := MustOpenDB() 288 | defer db.MustClose() 289 | if err := db.Update(func(tx *bolt.Tx) error { 290 | if _, err := tx.CreateBucket(nil); err != bolt.ErrBucketNameRequired { 291 | t.Fatalf("unexpected error: %s", err) 292 | } 293 | return nil 294 | }); err != nil { 295 | t.Fatal(err) 296 | } 297 | } 298 | 299 | // Ensure that a bucket can be deleted. 300 | func TestTx_DeleteBucket(t *testing.T) { 301 | db := MustOpenDB() 302 | defer db.MustClose() 303 | 304 | // Create a bucket and add a value. 
305 | if err := db.Update(func(tx *bolt.Tx) error { 306 | b, err := tx.CreateBucket([]byte("widgets")) 307 | if err != nil { 308 | t.Fatal(err) 309 | } 310 | if err := b.Put([]byte("foo"), []byte("bar")); err != nil { 311 | t.Fatal(err) 312 | } 313 | return nil 314 | }); err != nil { 315 | t.Fatal(err) 316 | } 317 | 318 | // Delete the bucket and make sure we can't get the value. 319 | if err := db.Update(func(tx *bolt.Tx) error { 320 | if err := tx.DeleteBucket([]byte("widgets")); err != nil { 321 | t.Fatal(err) 322 | } 323 | if tx.Bucket([]byte("widgets")) != nil { 324 | t.Fatal("unexpected bucket") 325 | } 326 | return nil 327 | }); err != nil { 328 | t.Fatal(err) 329 | } 330 | 331 | if err := db.Update(func(tx *bolt.Tx) error { 332 | // Create the bucket again and make sure there's not a phantom value. 333 | b, err := tx.CreateBucket([]byte("widgets")) 334 | if err != nil { 335 | t.Fatal(err) 336 | } 337 | if v := b.Get([]byte("foo")); v != nil { 338 | t.Fatalf("unexpected phantom value: %v", v) 339 | } 340 | return nil 341 | }); err != nil { 342 | t.Fatal(err) 343 | } 344 | } 345 | 346 | // Ensure that deleting a bucket on a closed transaction returns an error. 347 | func TestTx_DeleteBucket_ErrTxClosed(t *testing.T) { 348 | db := MustOpenDB() 349 | defer db.MustClose() 350 | tx, err := db.Begin(true) 351 | if err != nil { 352 | t.Fatal(err) 353 | } 354 | if err := tx.Commit(); err != nil { 355 | t.Fatal(err) 356 | } 357 | if err := tx.DeleteBucket([]byte("foo")); err != bolt.ErrTxClosed { 358 | t.Fatalf("unexpected error: %s", err) 359 | } 360 | } 361 | 362 | // Ensure that deleting a bucket with a read-only transaction returns an error. 363 | func TestTx_DeleteBucket_ReadOnly(t *testing.T) { 364 | db := MustOpenDB() 365 | defer db.MustClose() 366 | if err := db.View(func(tx *bolt.Tx) error { 367 | if err := tx.DeleteBucket([]byte("foo")); err != bolt.ErrTxNotWritable { 368 | t.Fatalf("unexpected error: %s", err) 369 | } 370 | return nil 371 | }); err != nil { 372 | t.Fatal(err) 373 | } 374 | } 375 | 376 | // Ensure that nothing happens when deleting a bucket that doesn't exist. 377 | func TestTx_DeleteBucket_NotFound(t *testing.T) { 378 | db := MustOpenDB() 379 | defer db.MustClose() 380 | if err := db.Update(func(tx *bolt.Tx) error { 381 | if err := tx.DeleteBucket([]byte("widgets")); err != bolt.ErrBucketNotFound { 382 | t.Fatalf("unexpected error: %s", err) 383 | } 384 | return nil 385 | }); err != nil { 386 | t.Fatal(err) 387 | } 388 | } 389 | 390 | // Ensure that no error is returned when a tx.ForEach function does not return 391 | // an error. 392 | func TestTx_ForEach_NoError(t *testing.T) { 393 | db := MustOpenDB() 394 | defer db.MustClose() 395 | if err := db.Update(func(tx *bolt.Tx) error { 396 | b, err := tx.CreateBucket([]byte("widgets")) 397 | if err != nil { 398 | t.Fatal(err) 399 | } 400 | if err := b.Put([]byte("foo"), []byte("bar")); err != nil { 401 | t.Fatal(err) 402 | } 403 | 404 | if err := tx.ForEach(func(name []byte, b *bolt.Bucket) error { 405 | return nil 406 | }); err != nil { 407 | t.Fatal(err) 408 | } 409 | return nil 410 | }); err != nil { 411 | t.Fatal(err) 412 | } 413 | } 414 | 415 | // Ensure that an error is returned when a tx.ForEach function returns an error. 
416 | func TestTx_ForEach_WithError(t *testing.T) { 417 | db := MustOpenDB() 418 | defer db.MustClose() 419 | if err := db.Update(func(tx *bolt.Tx) error { 420 | b, err := tx.CreateBucket([]byte("widgets")) 421 | if err != nil { 422 | t.Fatal(err) 423 | } 424 | if err := b.Put([]byte("foo"), []byte("bar")); err != nil { 425 | t.Fatal(err) 426 | } 427 | 428 | marker := errors.New("marker") 429 | if err := tx.ForEach(func(name []byte, b *bolt.Bucket) error { 430 | return marker 431 | }); err != marker { 432 | t.Fatalf("unexpected error: %s", err) 433 | } 434 | return nil 435 | }); err != nil { 436 | t.Fatal(err) 437 | } 438 | } 439 | 440 | // Ensure that Tx commit handlers are called after a transaction successfully commits. 441 | func TestTx_OnCommit(t *testing.T) { 442 | db := MustOpenDB() 443 | defer db.MustClose() 444 | 445 | var x int 446 | if err := db.Update(func(tx *bolt.Tx) error { 447 | tx.OnCommit(func() { x += 1 }) 448 | tx.OnCommit(func() { x += 2 }) 449 | if _, err := tx.CreateBucket([]byte("widgets")); err != nil { 450 | t.Fatal(err) 451 | } 452 | return nil 453 | }); err != nil { 454 | t.Fatal(err) 455 | } else if x != 3 { 456 | t.Fatalf("unexpected x: %d", x) 457 | } 458 | } 459 | 460 | // Ensure that Tx commit handlers are NOT called after a transaction rolls back. 461 | func TestTx_OnCommit_Rollback(t *testing.T) { 462 | db := MustOpenDB() 463 | defer db.MustClose() 464 | 465 | var x int 466 | if err := db.Update(func(tx *bolt.Tx) error { 467 | tx.OnCommit(func() { x += 1 }) 468 | tx.OnCommit(func() { x += 2 }) 469 | if _, err := tx.CreateBucket([]byte("widgets")); err != nil { 470 | t.Fatal(err) 471 | } 472 | return errors.New("rollback this commit") 473 | }); err == nil || err.Error() != "rollback this commit" { 474 | t.Fatalf("unexpected error: %s", err) 475 | } else if x != 0 { 476 | t.Fatalf("unexpected x: %d", x) 477 | } 478 | } 479 | 480 | // Ensure that the database can be copied to a file path. 
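// The copy is taken inside a read-only transaction (db.View), so the database
// remains usable while the copy is in progress.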
481 | func TestTx_CopyFile(t *testing.T) { 482 | db := MustOpenDB() 483 | defer db.MustClose() 484 | 485 | path := tempfile() 486 | if err := db.Update(func(tx *bolt.Tx) error { 487 | b, err := tx.CreateBucket([]byte("widgets")) 488 | if err != nil { 489 | t.Fatal(err) 490 | } 491 | if err := b.Put([]byte("foo"), []byte("bar")); err != nil { 492 | t.Fatal(err) 493 | } 494 | if err := b.Put([]byte("baz"), []byte("bat")); err != nil { 495 | t.Fatal(err) 496 | } 497 | return nil 498 | }); err != nil { 499 | t.Fatal(err) 500 | } 501 | 502 | if err := db.View(func(tx *bolt.Tx) error { 503 | return tx.CopyFile(path, 0600) 504 | }); err != nil { 505 | t.Fatal(err) 506 | } 507 | 508 | db2, err := bolt.Open(path, 0600, nil) 509 | if err != nil { 510 | t.Fatal(err) 511 | } 512 | 513 | if err := db2.View(func(tx *bolt.Tx) error { 514 | if v := tx.Bucket([]byte("widgets")).Get([]byte("foo")); !bytes.Equal(v, []byte("bar")) { 515 | t.Fatalf("unexpected value: %v", v) 516 | } 517 | if v := tx.Bucket([]byte("widgets")).Get([]byte("baz")); !bytes.Equal(v, []byte("bat")) { 518 | t.Fatalf("unexpected value: %v", v) 519 | } 520 | return nil 521 | }); err != nil { 522 | t.Fatal(err) 523 | } 524 | 525 | if err := db2.Close(); err != nil { 526 | t.Fatal(err) 527 | } 528 | } 529 | 530 | type failWriterError struct{} 531 | 532 | func (failWriterError) Error() string { 533 | return "error injected for tests" 534 | } 535 | 536 | type failWriter struct { 537 | // fail after this many bytes 538 | After int 539 | } 540 | 541 | func (f *failWriter) Write(p []byte) (n int, err error) { 542 | n = len(p) 543 | if n > f.After { 544 | n = f.After 545 | err = failWriterError{} 546 | } 547 | f.After -= n 548 | return n, err 549 | } 550 | 551 | // Ensure that Copy handles write errors right. 552 | func TestTx_CopyFile_Error_Meta(t *testing.T) { 553 | db := MustOpenDB() 554 | defer db.MustClose() 555 | if err := db.Update(func(tx *bolt.Tx) error { 556 | b, err := tx.CreateBucket([]byte("widgets")) 557 | if err != nil { 558 | t.Fatal(err) 559 | } 560 | if err := b.Put([]byte("foo"), []byte("bar")); err != nil { 561 | t.Fatal(err) 562 | } 563 | if err := b.Put([]byte("baz"), []byte("bat")); err != nil { 564 | t.Fatal(err) 565 | } 566 | return nil 567 | }); err != nil { 568 | t.Fatal(err) 569 | } 570 | 571 | if err := db.View(func(tx *bolt.Tx) error { 572 | return tx.Copy(&failWriter{}) 573 | }); err == nil || err.Error() != "meta 0 copy: error injected for tests" { 574 | t.Fatalf("unexpected error: %v", err) 575 | } 576 | } 577 | 578 | // Ensure that Copy handles write errors right. 579 | func TestTx_CopyFile_Error_Normal(t *testing.T) { 580 | db := MustOpenDB() 581 | defer db.MustClose() 582 | if err := db.Update(func(tx *bolt.Tx) error { 583 | b, err := tx.CreateBucket([]byte("widgets")) 584 | if err != nil { 585 | t.Fatal(err) 586 | } 587 | if err := b.Put([]byte("foo"), []byte("bar")); err != nil { 588 | t.Fatal(err) 589 | } 590 | if err := b.Put([]byte("baz"), []byte("bat")); err != nil { 591 | t.Fatal(err) 592 | } 593 | return nil 594 | }); err != nil { 595 | t.Fatal(err) 596 | } 597 | 598 | if err := db.View(func(tx *bolt.Tx) error { 599 | return tx.Copy(&failWriter{3 * db.Info().PageSize}) 600 | }); err == nil || err.Error() != "error injected for tests" { 601 | t.Fatalf("unexpected error: %v", err) 602 | } 603 | } 604 | 605 | func ExampleTx_Rollback() { 606 | // Open the database. 
607 | db, err := bolt.Open(tempfile(), 0666, nil) 608 | if err != nil { 609 | log.Fatal(err) 610 | } 611 | defer os.Remove(db.Path()) 612 | 613 | // Create a bucket. 614 | if err := db.Update(func(tx *bolt.Tx) error { 615 | _, err := tx.CreateBucket([]byte("widgets")) 616 | return err 617 | }); err != nil { 618 | log.Fatal(err) 619 | } 620 | 621 | // Set a value for a key. 622 | if err := db.Update(func(tx *bolt.Tx) error { 623 | return tx.Bucket([]byte("widgets")).Put([]byte("foo"), []byte("bar")) 624 | }); err != nil { 625 | log.Fatal(err) 626 | } 627 | 628 | // Update the key but rollback the transaction so it never saves. 629 | tx, err := db.Begin(true) 630 | if err != nil { 631 | log.Fatal(err) 632 | } 633 | b := tx.Bucket([]byte("widgets")) 634 | if err := b.Put([]byte("foo"), []byte("baz")); err != nil { 635 | log.Fatal(err) 636 | } 637 | if err := tx.Rollback(); err != nil { 638 | log.Fatal(err) 639 | } 640 | 641 | // Ensure that our original value is still set. 642 | if err := db.View(func(tx *bolt.Tx) error { 643 | value := tx.Bucket([]byte("widgets")).Get([]byte("foo")) 644 | fmt.Printf("The value for 'foo' is still: %s\n", value) 645 | return nil 646 | }); err != nil { 647 | log.Fatal(err) 648 | } 649 | 650 | // Close database to release file lock. 651 | if err := db.Close(); err != nil { 652 | log.Fatal(err) 653 | } 654 | 655 | // Output: 656 | // The value for 'foo' is still: bar 657 | } 658 | 659 | func ExampleTx_CopyFile() { 660 | // Open the database. 661 | db, err := bolt.Open(tempfile(), 0666, nil) 662 | if err != nil { 663 | log.Fatal(err) 664 | } 665 | defer os.Remove(db.Path()) 666 | 667 | // Create a bucket and a key. 668 | if err := db.Update(func(tx *bolt.Tx) error { 669 | b, err := tx.CreateBucket([]byte("widgets")) 670 | if err != nil { 671 | return err 672 | } 673 | if err := b.Put([]byte("foo"), []byte("bar")); err != nil { 674 | return err 675 | } 676 | return nil 677 | }); err != nil { 678 | log.Fatal(err) 679 | } 680 | 681 | // Copy the database to another file. 682 | toFile := tempfile() 683 | if err := db.View(func(tx *bolt.Tx) error { 684 | return tx.CopyFile(toFile, 0666) 685 | }); err != nil { 686 | log.Fatal(err) 687 | } 688 | defer os.Remove(toFile) 689 | 690 | // Open the cloned database. 691 | db2, err := bolt.Open(toFile, 0666, nil) 692 | if err != nil { 693 | log.Fatal(err) 694 | } 695 | 696 | // Ensure that the key exists in the copy. 697 | if err := db2.View(func(tx *bolt.Tx) error { 698 | value := tx.Bucket([]byte("widgets")).Get([]byte("foo")) 699 | fmt.Printf("The value for 'foo' in the clone is: %s\n", value) 700 | return nil 701 | }); err != nil { 702 | log.Fatal(err) 703 | } 704 | 705 | // Close database to release file lock. 706 | if err := db.Close(); err != nil { 707 | log.Fatal(err) 708 | } 709 | 710 | if err := db2.Close(); err != nil { 711 | log.Fatal(err) 712 | } 713 | 714 | // Output: 715 | // The value for 'foo' in the clone is: bar 716 | } 717 | --------------------------------------------------------------------------------