// Sum returns the 32-bit checksum of data as computed by h.
//
// h is reset first, so any state left over from earlier writes is discarded;
// when Sum returns, h holds the state for data alone.
func Sum(h hash.Hash32, data []byte) uint32 {
	h.Reset()
	// hash.Hash documents that Write never returns an error.
	_, _ = h.Write(data)
	checksum := h.Sum32()
	return checksum
}
// fileName returns the path of the idx-th log file inside dir: dir, the
// platform path separator, the decimal index and the ".LOG" extension.
func fileName(idx int, dir string) string {
	base := fmt.Sprintf("%d.LOG", idx)
	return dir + string(os.PathSeparator) + base
}
/stack/stack.go: -------------------------------------------------------------------------------- 1 | package stack 2 | 3 | import "container/list" 4 | 5 | func New() *stack { 6 | return &stack{new(list.List)} 7 | } 8 | 9 | func (s *stack) IsEmpty() bool { 10 | return s.l.Len() == 0 11 | } 12 | 13 | func (s *stack) Pop() interface{} { 14 | if e := s.l.Front(); e != nil { 15 | s.l.Remove(e) 16 | return e.Value 17 | } 18 | return nil 19 | } 20 | 21 | func (s *stack) Peek() interface{} { 22 | if e := s.l.Front(); e != nil { 23 | return e.Value 24 | } 25 | return nil 26 | } 27 | 28 | func (s *stack) Push(v interface{}) { 29 | s.l.PushFront(v) 30 | } 31 | -------------------------------------------------------------------------------- /data/types.go: -------------------------------------------------------------------------------- 1 | package data 2 | 3 | import ( 4 | "os" 5 | "sync" 6 | ) 7 | 8 | const ( 9 | HeaderSize = 2 10 | ) 11 | 12 | const ( 13 | Magic = "gaeadb" 14 | ) 15 | 16 | type Data interface { 17 | Close() error 18 | Flush() error 19 | Del(uint64) error 20 | Read(uint64) ([]byte, error) 21 | Write(uint64, []byte) error 22 | Alloc([]byte) (uint64, error) 23 | 24 | Load(uint64, int) ([]byte, error) 25 | } 26 | 27 | type file struct { 28 | size uint64 29 | fp *os.File 30 | } 31 | 32 | type data struct { 33 | sync.Mutex 34 | dir string 35 | size uint64 36 | fs []*file 37 | } 38 | -------------------------------------------------------------------------------- /cache/scheduler/scheduler.go: -------------------------------------------------------------------------------- 1 | package scheduler 2 | 3 | import ( 4 | "github.com/infinivision/gaeadb/constant" 5 | "github.com/infinivision/gaeadb/disk" 6 | ) 7 | 8 | func New(d disk.Disk) *scheduler { 9 | return &scheduler{d} 10 | } 11 | 12 | func (s *scheduler) Close() error { 13 | return s.d.Close() 14 | } 15 | 16 | func (s *scheduler) Flush() error { 17 | return s.d.Flush() 18 | } 19 | 20 | func (s *scheduler) 
Write(b disk.Block) error { 21 | return s.d.Write(b) 22 | } 23 | 24 | func (s *scheduler) Read(bn int64) (disk.Block, error) { 25 | return s.d.Read(bn, make([]byte, constant.BlockSize)) 26 | } 27 | -------------------------------------------------------------------------------- /suffix/forward.go: -------------------------------------------------------------------------------- 1 | package suffix 2 | 3 | import ( 4 | "bytes" 5 | ) 6 | 7 | func (itr *forwardIterator) Next() { 8 | for len(itr.es) > 0 { 9 | itr.es = itr.es[1:] 10 | if len(itr.es) == 0 { 11 | return 12 | } 13 | if len(itr.prefix) == 0 { 14 | return 15 | } 16 | if bytes.HasPrefix(itr.es[0].suff, itr.prefix) { 17 | return 18 | } 19 | } 20 | } 21 | 22 | func (itr *forwardIterator) Valid() bool { 23 | return len(itr.es) != 0 24 | } 25 | 26 | func (itr *forwardIterator) Key() []byte { 27 | return itr.es[0].suff 28 | } 29 | 30 | func (itr *forwardIterator) Value() uint64 { 31 | return itr.es[0].off 32 | } 33 | -------------------------------------------------------------------------------- /suffix/backward.go: -------------------------------------------------------------------------------- 1 | package suffix 2 | 3 | import "bytes" 4 | 5 | func (itr *backwardIterator) Next() { 6 | for len(itr.es) > 0 { 7 | itr.es = itr.es[:len(itr.es)-1] 8 | if len(itr.es) == 0 { 9 | return 10 | } 11 | if len(itr.prefix) == 0 { 12 | return 13 | } 14 | if bytes.HasPrefix(itr.es[len(itr.es)-1].suff, itr.prefix) { 15 | return 16 | } 17 | } 18 | } 19 | 20 | func (itr *backwardIterator) Valid() bool { 21 | return len(itr.es) != 0 22 | } 23 | 24 | func (itr *backwardIterator) Key() []byte { 25 | return itr.es[len(itr.es)-1].suff 26 | } 27 | 28 | func (itr *backwardIterator) Value() uint64 { 29 | return itr.es[len(itr.es)-1].off 30 | } 31 | -------------------------------------------------------------------------------- /disk/types.go: -------------------------------------------------------------------------------- 1 | package disk 2 
// Sentinel errors shared across the gaeadb packages.
var (
	// ScanEnd is returned by the mvcc iterators' seek/filter steps when the
	// underlying iterator has no further entry.
	ScanEnd = errors.New("scan end")
	// NotExist is returned by mvcc Get when no visible version of a key is
	// found, and by the data file reader when fewer than a header's worth of
	// bytes remain at the requested offset.
	NotExist = errors.New("not exist")
	// OpenFailed: NOTE(review) no use is visible in this chunk; presumably
	// reported when a file cannot be opened — confirm.
	OpenFailed = errors.New("open failed")
	// ReadFailed is returned by the data file reader when ReadAt delivers a
	// different number of bytes than requested without reporting an error.
	ReadFailed = errors.New("read failed")
	// WriteFailed is returned by the data file writer when WriteAt writes
	// fewer bytes than requested without reporting an error.
	WriteFailed = errors.New("write failed")
	// KeyTooLong: NOTE(review) presumably tied to constant.MaxKeySize; the
	// check is not visible here — confirm.
	KeyTooLong = errors.New("key too long")
	// KeyIsEmpty: NOTE(review) presumably rejects zero-length keys — confirm.
	KeyIsEmpty = errors.New("key is empty")
	// ValTooLong: NOTE(review) presumably tied to constant.MaxValueSize; the
	// check is not visible here — confirm.
	ValTooLong = errors.New("value too long")
	// OutOfSpace is returned by the data file allocator when an allocation
	// would grow a file beyond constant.MaxDataFileSize.
	OutOfSpace = errors.New("out of space")
	// UnknownError: NOTE(review) no use is visible in this chunk.
	UnknownError = errors.New("unknown error")
	// TransactionConflict: NOTE(review) presumably reported at commit when a
	// conflicting write is detected — confirm against the scheduler.
	TransactionConflict = errors.New("transaction conflict")
	// ReadOnlyTransaction: NOTE(review) presumably reported when a read-only
	// transaction attempts a write — confirm against the transaction package.
	ReadOnlyTransaction = errors.New("read-only transaction")
)
var (
	// CheckPointCycle is a package-level default of five seconds.
	// NOTE(review): presumably the default for db.Config.CheckPointCycle —
	// confirm against db.DefaultConfig.
	CheckPointCycle = 5 * time.Second
)

const (
	// RootPage is page number zero.
	RootPage = int64(0)
	// Preallocate is the length of the cache's fixed page array
	// (cache.ps); it equals disk.InitDiskSize (256 + 1).
	Preallocate = int64(257)
)

// Special values stored in place of a data offset. The mvcc layer skips any
// version whose value equals Cancel.
// NOTE(review): the meaning of Delete, Empty and Cache is not visible in
// this chunk.
const (
	Cancel = iota // must be zero
	Delete
	Empty
	Cache
)

const (
	// PreLoad: NOTE(review) use not visible in this chunk.
	PreLoad = 100
)

// Size limits.
//
// NOTE(review): data.HeaderSize is 2 and the data file reader decodes the
// record length as a uint16, which can represent at most 65535 bytes, while
// MaxValueSize is 65536. Confirm the value-size check is strict (<), or the
// largest accepted value cannot be described by its header.
const (
	MaxKeySize         = 4074
	MaxValueSize       = 1 << 16 // 64KB
	MaxTransactionSize = 1 << 26 // 64MB
	MaxDataFileSize    = 1 << 40 // 1TB; upper bound enforced by the data file allocator
	MaxLoadDataSize    = 1 << 10 // 1KB; the data file load path clamps requests to this size
)

const (
	// BlockSize is the size of the buffer the cache scheduler allocates for
	// each block read.
	BlockSize = 4096 // 4k
)

// Node kinds.
const (
	PN = iota // prefix node
	SN        // suffix node
	MS        // mixed suffix node
	ES        // empty suffix node
)

// Bit-field helpers: Mask covers the low 56 bits and TypeOff/TypeMask select
// the remaining top byte of a 64-bit word.
// NOTE(review): presumably the node kind above is stored in that top byte —
// confirm against the prefix package.
const (
	TypeOff  = uint64(56)
	TypeMask = uint64(0xFF)
	Mask     = uint64(0xFFFFFFFFFFFFFF)
)
/*
DB provides the various functions required to interact with gaeadb. DB is thread-safe.

Del, Set and Get operate on a single key; NewTransaction opens a transaction
for multi-key work (the README names its boolean parameter readOnly).
*/
type DB interface {
	// Close shuts the database down.
	Close() error

	// Del removes the given key.
	Del([]byte) error
	// Set stores the value (second argument) under the key (first argument).
	Set([]byte, []byte) error
	// Get returns the value stored under the given key.
	Get([]byte) ([]byte, error)

	// NewTransaction starts a transaction; the flag selects read-only mode.
	NewTransaction(bool) (transaction.Transaction, error)
}

// Config carries the settings used to open a database.
type Config struct {
	CacheSize       int           // cache size
	DirName         string        // directory holding the database files
	LogWriter       io.Writer     // NOTE(review): presumably the sink for the logger — confirm
	CheckPointCycle time.Duration // NOTE(review): presumably the interval between checkpoints — confirm
}

// db bundles the components behind the DB interface.
type db struct {
	d    data.Data           // value storage
	m    mvcc.MVCC           // versioned key index
	w    wal.Writer          // write-ahead log
	c    cache.Cache         // page cache
	log  logger.Log          // logger
	schd scheduler.Scheduler // transaction scheduler
}
-------------------------------------------------------------------------------- 1 | package locker 2 | 3 | import ( 4 | "container/list" 5 | "sync" 6 | "sync/atomic" 7 | ) 8 | 9 | const ( 10 | Cycle = 30 11 | ) 12 | 13 | const ( 14 | I = iota 15 | H 16 | C 17 | F 18 | E 19 | ) 20 | 21 | const ( 22 | FreeMultiples = 10 23 | ColdMultiples = 200 24 | MinCacheSize = 2000 25 | ) 26 | 27 | type Locker interface { 28 | Lock() 29 | Unlock() 30 | RLock() 31 | RUnlock() 32 | } 33 | 34 | type Table interface { 35 | Run() 36 | Stop() 37 | Get(uint64) Locker 38 | } 39 | 40 | type locker struct { 41 | t int // type 42 | n int32 // refer 43 | k uint64 44 | lkr sync.RWMutex 45 | h, c, f *list.Element 46 | } 47 | 48 | type table struct { 49 | n int 50 | mp *sync.Map 51 | hq, cq, fq *list.List 52 | lch chan *locker 53 | ch chan struct{} 54 | } 55 | 56 | func (l *locker) Lock() { 57 | l.lkr.Lock() 58 | } 59 | 60 | func (l *locker) RLock() { 61 | l.lkr.RLock() 62 | } 63 | 64 | func (l *locker) Unlock() { 65 | l.lkr.Unlock() 66 | atomic.AddInt32(&l.n, -1) 67 | } 68 | 69 | func (l *locker) RUnlock() { 70 | l.lkr.RUnlock() 71 | atomic.AddInt32(&l.n, -1) 72 | } 73 | -------------------------------------------------------------------------------- /scheduler/manager/manager.go: -------------------------------------------------------------------------------- 1 | package manager 2 | 3 | import ( 4 | "sort" 5 | ) 6 | 7 | func New() *manager { 8 | return &manager{ 9 | xs: []*element{}, 10 | mp: make(map[uint64]*element), 11 | } 12 | } 13 | 14 | func (m *manager) Run() { 15 | } 16 | 17 | func (m *manager) Stop() { 18 | m.ch <- struct{}{} 19 | <-m.ch 20 | } 21 | 22 | func (m *manager) Add(ts uint64) { 23 | if e, ok := m.mp[ts]; ok { 24 | e.n++ 25 | } else { 26 | e := &element{n: 1, ts: ts} 27 | m.mp[ts] = e 28 | m.xs = push(e, m.xs) 29 | } 30 | } 31 | 32 | func (m *manager) Del(ts uint64) bool { 33 | if e, ok := m.mp[ts]; ok { 34 | if e.n = e.n - 1; e.n == 0 { 35 | r := m.xs[0].ts == ts 36 | 
delete(m.mp, ts) 37 | if r { 38 | m.xs = m.xs[1:] 39 | for len(m.xs) > 0 { 40 | if _, ok := m.mp[m.xs[0].ts]; !ok { 41 | m.xs = m.xs[1:] 42 | } else { 43 | break 44 | } 45 | } 46 | } 47 | return r 48 | } 49 | } 50 | return false 51 | } 52 | 53 | func push(x *element, xs []*element) []*element { 54 | i := sort.Search(len(xs), func(i int) bool { return xs[i].ts >= x.ts }) 55 | xs = append(xs, &element{}) 56 | copy(xs[i+1:], xs[i:]) 57 | xs[i] = x 58 | return xs 59 | } 60 | -------------------------------------------------------------------------------- /cache/types.go: -------------------------------------------------------------------------------- 1 | package cache 2 | 3 | import ( 4 | "container/list" 5 | "sync" 6 | 7 | "github.com/infinivision/gaeadb/cache/scheduler" 8 | "github.com/infinivision/gaeadb/constant" 9 | "github.com/infinivision/gaeadb/disk" 10 | "github.com/nnsgmsone/damrey/logger" 11 | ) 12 | 13 | const ( 14 | Cycle = 30 15 | ) 16 | 17 | const ( 18 | I = iota 19 | H 20 | C 21 | F 22 | E 23 | ) 24 | 25 | const ( 26 | FreeMultiples = 10 27 | ColdMultiples = 100 28 | MinCacheSize = 2000 29 | ) 30 | 31 | type Page interface { 32 | Sync() 33 | Buffer() []byte 34 | PageNumber() int64 35 | } 36 | 37 | type Cache interface { 38 | Run() 39 | Stop() 40 | Flush() 41 | Release(Page) 42 | Get(int64) (Page, error) 43 | } 44 | 45 | type page struct { 46 | t int // type 47 | n int32 // refer 48 | cp *cache 49 | b disk.Block 50 | h, c, f *list.Element 51 | } 52 | 53 | type cache struct { 54 | n int 55 | mp *sync.Map 56 | log logger.Log 57 | hq, cq, fq *list.List 58 | pch chan *page 59 | ch chan struct{} 60 | sched scheduler.Scheduler 61 | ps [constant.Preallocate]*page 62 | } 63 | 64 | func (pg *page) Sync() { 65 | pg.cp.sched.Write(pg.b) 66 | } 67 | 68 | func (pg *page) Buffer() []byte { 69 | return pg.b.Buffer() 70 | } 71 | 72 | func (pg *page) PageNumber() int64 { 73 | return pg.b.BlockNumber() 74 | } 75 | 
// Message kinds.
// NOTE(review): presumably the values carried in message.t; the dispatching
// code is not visible in this chunk — confirm.
const (
	C = iota // commit
	D        // done
	S        // start
)

// Scheduler is the transaction scheduler's interface.
// NOTE(review): the method semantics below are inferred from names and
// signatures only; confirm against scheduler.go.
type Scheduler interface {
	Run()
	Stop()
	// Start returns a uint64, presumably the new transaction's timestamp.
	Start() uint64
	// Done takes a uint64, presumably the timestamp returned by Start.
	Done(uint64) error
	// Commit takes a timestamp and two maps keyed by string — presumably the
	// read set (key -> timestamp) and write set (key -> value), matching the
	// rmp/wmp fields of message — and returns a timestamp.
	Commit(uint64, map[string]uint64, map[string][]byte) (uint64, error)
}

// result is the reply delivered on a message's rch channel.
type result struct {
	err error
	ts  uint64
}

// message is a request passed over the scheduler's mch channel.
type message struct {
	t   int // NOTE(review): presumably one of C, D, S
	ts  uint64
	rch chan *result
	rmp map[string]uint64
	wmp map[string][]byte
}

// element pairs a key with a timestamp.
type element struct {
	k  string
	ts uint64
}

// checkpoint holds the state and the components used while checkpointing.
type checkpoint struct {
	s  bool // start check point
	t  time.Time
	d  data.Data
	w  wal.Writer
	c  cache.Cache
	mp map[uint64]struct{}
	mq map[uint64]struct{} // backup
}

// scheduler is the state behind the Scheduler interface.
type scheduler struct {
	ts  uint64
	mts uint64 // min ts
	xs  []*element
	cp  *checkpoint
	ch  chan struct{}
	mch chan *message
	mgr manager.Manager
	mp  map[string]*element
}
entry of branch 13 | P // prefix node 14 | S // suffix node 15 | C // character 16 | ) 17 | 18 | type Tree interface { 19 | Close() error 20 | 21 | Get([]byte) (uint64, error) 22 | Del([]byte, suffix.Writer) error 23 | Set([]byte, uint64, suffix.Writer) error 24 | 25 | NewForwardIterator([]byte) (Iterator, error) 26 | NewBackwardIterator([]byte) (Iterator, error) 27 | } 28 | 29 | type Iterator interface { 30 | Close() error 31 | 32 | Next() error 33 | Valid() bool 34 | Key() []byte 35 | Value() uint64 36 | } 37 | 38 | type resource struct { 39 | pg cache.Page 40 | le locker.Locker 41 | } 42 | 43 | type forwardElement struct { 44 | typ int 45 | cnt int 46 | val uint64 47 | pref []byte 48 | rsrc *resource 49 | itr suffix.Iterator 50 | } 51 | 52 | type forwardIterator struct { 53 | t *tree 54 | k []byte 55 | v uint64 56 | s stack.Stack 57 | } 58 | 59 | type backwardElement struct { 60 | typ int 61 | cnt int 62 | val uint64 63 | pref []byte 64 | rsrc *resource 65 | itr suffix.Iterator 66 | } 67 | 68 | type backwardIterator struct { 69 | t *tree 70 | k []byte 71 | v uint64 72 | s stack.Stack 73 | } 74 | 75 | type tree struct { 76 | c cache.Cache 77 | t locker.Table 78 | } 79 | -------------------------------------------------------------------------------- /mvcc/backward.go: -------------------------------------------------------------------------------- 1 | package mvcc 2 | 3 | import ( 4 | "bytes" 5 | "encoding/binary" 6 | 7 | "github.com/infinivision/gaeadb/constant" 8 | "github.com/infinivision/gaeadb/errmsg" 9 | ) 10 | 11 | func (itr *backwardIterator) Close() error { 12 | return itr.itr.Close() 13 | } 14 | 15 | func (itr *backwardIterator) Next() error { 16 | if itr.s { 17 | itr.s = false 18 | return itr.seek() 19 | } 20 | itr.itr.Next() 21 | return itr.seek() 22 | } 23 | 24 | func (itr *backwardIterator) Valid() bool { 25 | if itr.s { 26 | return true 27 | } 28 | return itr.itr.Valid() 29 | } 30 | 31 | func (itr *backwardIterator) Key() []byte { 32 | return 
itr.e.k 33 | } 34 | 35 | func (itr *backwardIterator) Value() uint64 { 36 | return itr.e.v 37 | } 38 | 39 | func (itr *backwardIterator) Timestamp() uint64 { 40 | return itr.e.ts 41 | } 42 | 43 | func (itr *backwardIterator) seek() error { 44 | for itr.itr.Valid() { 45 | if ts := binary.BigEndian.Uint64(itr.itr.Key()[len(itr.itr.Key())-8:]); ts <= itr.ts && itr.itr.Value() != constant.Cancel { 46 | itr.s = true 47 | return itr.filter(itr.itr.Key()[:len(itr.itr.Key())-8], ts) 48 | } 49 | if err := itr.itr.Next(); err != nil { 50 | return err 51 | } 52 | } 53 | return errmsg.ScanEnd 54 | } 55 | 56 | func (itr *backwardIterator) filter(k []byte, ts uint64) error { 57 | if !itr.itr.Valid() { 58 | return errmsg.ScanEnd 59 | } 60 | itr.e.k = k 61 | itr.e.ts = ts 62 | itr.e.v = itr.itr.Value() 63 | for { 64 | if err := itr.itr.Next(); err != nil { 65 | return err 66 | } 67 | if !itr.itr.Valid() { 68 | return nil 69 | } 70 | if bytes.Compare(k, itr.itr.Key()[:len(itr.itr.Key())-8]) != 0 { 71 | return nil 72 | } 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /test/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bytes" 5 | "fmt" 6 | "log" 7 | 8 | "github.com/infinivision/gaeadb/db" 9 | ) 10 | 11 | func main() { 12 | cfg := db.DefaultConfig() 13 | cfg.DirName = "test.db" 14 | db, err := db.Open(cfg) 15 | if err != nil { 16 | log.Fatal(err) 17 | } 18 | defer db.Close() 19 | 20 | { 21 | for i := 0; i < 100; i++ { 22 | if err := db.Set([]byte(fmt.Sprintf("/u/b/u_%v", i)), []byte(fmt.Sprintf("%v", i))); err != nil { 23 | log.Fatal(err) 24 | } 25 | } 26 | } 27 | { 28 | for i := 0; i < 100; i++ { 29 | if v, err := db.Get([]byte(fmt.Sprintf("/u/b/u_%v", i))); err != nil { 30 | log.Fatal(err) 31 | } else { 32 | if bytes.Compare(v, []byte(fmt.Sprintf("%v", i))) != 0 { 33 | log.Fatal(fmt.Errorf("%s is not %v - %v\n", fmt.Sprintf("/u/b/u_%v", i), 
fmt.Sprintf("%v", i), v)) 34 | } 35 | } 36 | } 37 | } 38 | { 39 | tx, err := db.NewTransaction(false) 40 | if err != nil { 41 | log.Fatal(err) 42 | } 43 | defer tx.Rollback() 44 | itr, err := tx.NewForwardIterator(nil) 45 | if err != nil { 46 | log.Fatal(err) 47 | } 48 | for itr.Valid() { 49 | k := itr.Key() 50 | v, _ := itr.Value() 51 | fmt.Printf("%s: %s\n", string(k), string(v)) 52 | itr.Next() 53 | } 54 | itr.Close() 55 | } 56 | { 57 | tx, err := db.NewTransaction(false) 58 | if err != nil { 59 | log.Fatal(err) 60 | } 61 | defer tx.Rollback() 62 | itr, err := tx.NewBackwardIterator(nil) 63 | if err != nil { 64 | log.Fatal(err) 65 | } 66 | for itr.Valid() { 67 | k := itr.Key() 68 | v, _ := itr.Value() 69 | fmt.Printf("%s: %s\n", string(k), string(v)) 70 | itr.Next() 71 | } 72 | itr.Close() 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /data/file.go: -------------------------------------------------------------------------------- 1 | package data 2 | 3 | import ( 4 | "encoding/binary" 5 | "os" 6 | 7 | "github.com/infinivision/gaeadb/constant" 8 | "github.com/infinivision/gaeadb/errmsg" 9 | ) 10 | 11 | func (f *file) close() error { 12 | return f.fp.Close() 13 | } 14 | 15 | func (f *file) flush() error { 16 | return f.fp.Sync() 17 | } 18 | 19 | func (f *file) write(o uint64, data []byte) error { 20 | return write(f.fp, int64(o), data) 21 | } 22 | 23 | func (f *file) read(o int64) ([]byte, error) { 24 | if int64(f.size)-o < HeaderSize { 25 | return nil, errmsg.NotExist 26 | } 27 | h, err := read(f.fp, o, HeaderSize) 28 | if err != nil { 29 | return nil, err 30 | } 31 | return read(f.fp, o+HeaderSize, int(binary.LittleEndian.Uint16(h))) 32 | } 33 | 34 | func (f *file) load(o int64, size int) ([]byte, error) { 35 | if size > constant.MaxLoadDataSize { 36 | size = constant.MaxLoadDataSize 37 | } 38 | if size > int(int64(f.size)-o) { 39 | size = int(int64(f.size) - o) 40 | } 41 | return read(f.fp, o, size) 42 | } 43 
| 44 | func (f *file) alloc(size uint64) (uint64, error) { 45 | curr := f.size 46 | if curr+size > constant.MaxDataFileSize { 47 | return 0, errmsg.OutOfSpace 48 | } 49 | f.size += size 50 | return curr, nil 51 | } 52 | 53 | func write(fp *os.File, o int64, buf []byte) error { 54 | n, err := fp.WriteAt(buf, o) 55 | switch { 56 | case err != nil: 57 | return err 58 | case n != len(buf): 59 | return errmsg.WriteFailed 60 | } 61 | return nil 62 | } 63 | 64 | func read(fp *os.File, o int64, n int) ([]byte, error) { 65 | buf := make([]byte, n) 66 | m, err := fp.ReadAt(buf, o) 67 | switch { 68 | case err != nil: 69 | return nil, err 70 | case n != m: 71 | return nil, errmsg.ReadFailed 72 | } 73 | return buf, nil 74 | } 75 | -------------------------------------------------------------------------------- /mvcc/forward.go: -------------------------------------------------------------------------------- 1 | package mvcc 2 | 3 | import ( 4 | "bytes" 5 | "encoding/binary" 6 | 7 | "github.com/infinivision/gaeadb/constant" 8 | "github.com/infinivision/gaeadb/errmsg" 9 | ) 10 | 11 | func (itr *forwardIterator) Close() error { 12 | return itr.itr.Close() 13 | } 14 | 15 | func (itr *forwardIterator) Next() error { 16 | if itr.s { 17 | itr.s = false 18 | return itr.seek() 19 | } 20 | itr.itr.Next() 21 | return itr.seek() 22 | } 23 | 24 | func (itr *forwardIterator) Valid() bool { 25 | if itr.s { 26 | return true 27 | } 28 | return itr.itr.Valid() 29 | } 30 | 31 | func (itr *forwardIterator) Key() []byte { 32 | return itr.e.k 33 | } 34 | 35 | func (itr *forwardIterator) Value() uint64 { 36 | return itr.e.v 37 | } 38 | 39 | func (itr *forwardIterator) Timestamp() uint64 { 40 | return itr.e.ts 41 | } 42 | 43 | func (itr *forwardIterator) seek() error { 44 | for itr.itr.Valid() { 45 | if ts := binary.BigEndian.Uint64(itr.itr.Key()[len(itr.itr.Key())-8:]); ts <= itr.ts && itr.itr.Value() != constant.Cancel { 46 | itr.s = true 47 | return itr.filter(itr.itr.Key()[:len(itr.itr.Key())-8], 
ts) 48 | } 49 | if err := itr.itr.Next(); err != nil { 50 | return err 51 | } 52 | } 53 | return errmsg.ScanEnd 54 | } 55 | 56 | func (itr *forwardIterator) filter(k []byte, ts uint64) error { 57 | if !itr.itr.Valid() { 58 | return errmsg.ScanEnd 59 | } 60 | itr.e.k = k 61 | itr.e.ts = ts 62 | itr.e.v = itr.itr.Value() 63 | for { 64 | if err := itr.itr.Next(); err != nil { 65 | return err 66 | } 67 | if !itr.itr.Valid() { 68 | return nil 69 | } 70 | if bytes.Compare(k, itr.itr.Key()[:len(itr.itr.Key())-8]) != 0 { 71 | return nil 72 | } 73 | if itr.Value() != constant.Cancel { 74 | itr.e.v = itr.itr.Value() 75 | itr.e.ts = binary.BigEndian.Uint64(itr.itr.Key()[len(itr.itr.Key())-8:]) 76 | } 77 | } 78 | } 79 | -------------------------------------------------------------------------------- /mvcc/mvcc.go: -------------------------------------------------------------------------------- 1 | package mvcc 2 | 3 | import ( 4 | "encoding/binary" 5 | 6 | "github.com/infinivision/gaeadb/constant" 7 | "github.com/infinivision/gaeadb/errmsg" 8 | "github.com/infinivision/gaeadb/prefix" 9 | "github.com/infinivision/gaeadb/suffix" 10 | ) 11 | 12 | func New(t prefix.Tree) *mvcc { 13 | return &mvcc{t} 14 | } 15 | 16 | func (m *mvcc) Close() error { 17 | return m.t.Close() 18 | } 19 | 20 | func (m *mvcc) Exist(k []byte, ts uint64) bool { 21 | buf := make([]byte, 8) 22 | binary.BigEndian.PutUint64(buf, ts) 23 | _, err := m.t.Get(append(k, buf...)) 24 | return err == nil 25 | } 26 | 27 | func (m *mvcc) Get(k []byte, ts uint64) (uint64, uint64, error) { 28 | itr, err := m.t.NewBackwardIterator(k) 29 | if err != nil { 30 | return 0, 0, err 31 | } 32 | defer itr.Close() 33 | for itr.Valid() { 34 | if len(itr.Key()) == len(k)+8 { 35 | if v, rts := itr.Value(), binary.BigEndian.Uint64(itr.Key()[len(k):]); rts <= ts && v != constant.Cancel { 36 | return v, rts, nil 37 | } 38 | } 39 | if err := itr.Next(); err != nil { 40 | return 0, 0, err 41 | } 42 | } 43 | return 0, 0, errmsg.NotExist 44 | 
} 45 | 46 | func (m *mvcc) Del(k []byte, ts uint64, w suffix.Writer) error { 47 | buf := make([]byte, 8) 48 | binary.BigEndian.PutUint64(buf, ts) 49 | return m.t.Del(append(k, buf...), w) 50 | } 51 | 52 | func (m *mvcc) Set(k []byte, v uint64, ts uint64, w suffix.Writer) error { 53 | buf := make([]byte, 8) 54 | binary.BigEndian.PutUint64(buf, ts) 55 | return m.t.Set(append(k, buf...), v, w) 56 | } 57 | 58 | func (m *mvcc) NewForwardIterator(pref []byte, ts uint64) (Iterator, error) { 59 | fItr, err := m.t.NewForwardIterator(pref) 60 | if err != nil { 61 | return nil, err 62 | } 63 | itr := &forwardIterator{ts: ts, itr: fItr, e: new(entry)} 64 | if err := itr.seek(); err != nil { 65 | itr.Close() 66 | return nil, err 67 | } 68 | return itr, nil 69 | } 70 | 71 | func (m *mvcc) NewBackwardIterator(pref []byte, ts uint64) (Iterator, error) { 72 | bItr, err := m.t.NewBackwardIterator(pref) 73 | if err != nil { 74 | return nil, err 75 | } 76 | itr := &backwardIterator{ts: ts, itr: bItr, e: new(entry)} 77 | if err := itr.seek(); err != nil { 78 | itr.Close() 79 | return nil, err 80 | } 81 | return itr, nil 82 | } 83 | -------------------------------------------------------------------------------- /wal/types.go: -------------------------------------------------------------------------------- 1 | package wal 2 | 3 | import "sync" 4 | 5 | const ( 6 | EM byte = iota // empty entry 7 | EC // end check point 8 | SC // start check point 9 | AT // abort transaction 10 | CT // commmit transaction 11 | ST // start transaction 12 | WD // write data 13 | NP // new prefix 14 | CP // change prefix 15 | NS // new suffix 16 | ) 17 | 18 | const ( 19 | SumSize = 4 20 | RecordSize = 4 21 | HeaderSize = SumSize + RecordSize 22 | ) 23 | 24 | type Writer interface { 25 | Close() error 26 | EndCKPT() error 27 | StartCKPT() error 28 | Append([]byte) error 29 | } 30 | 31 | type endCKPT struct { 32 | } 33 | 34 | type startCKPT struct { 35 | ts []uint64 36 | } 37 | 38 | type endTransaction struct 
// recoverWriter has the same method set as suffix.Writer, with every method
// a no-op that returns nil.
// NOTE(review): presumably handed to the tree while the log is replayed so
// that replayed changes are not logged again — confirm against recover.go.
type recoverWriter struct {
}

// NewSuffix ignores its arguments and returns nil.
func (r *recoverWriter) NewSuffix(_, _ byte, _, _ uint64) error {
	return nil
}

// ChgPrefix ignores its arguments and returns nil.
func (r *recoverWriter) ChgPrefix(_ uint64, _ []uint16, _ []uint64) error {
	return nil
}

// NewPrefix ignores its arguments and returns nil.
func (r *recoverWriter) NewPrefix(_, _ uint64, _ uint16, _ []uint16, _ []uint64) error {
	return nil
}
4 | 5 | ## Table of Contents 6 | * [Getting Started](#getting-started) 7 | + [Opening a database](#opening-a-database) 8 | + [DB interface](#db-interface) 9 | + [Transaction interface](#transaction-interface) 10 | * [Benchmarks](#benchmarks) 11 | * [Caveats & Limitations](#caveats--limitations) 12 | 13 | 14 | ## Getting Started 15 | 16 | ### Opening a database 17 | The top-level object in gaeadb is a `DB`. It represents multiple files on disk 18 | in a specific directory, which together contain the data for a single database. 19 | 20 | ```go 21 | package main 22 | 23 | import ( 24 | "log" 25 | 26 | "github.com/infinivision/gaeadb/db" 27 | ) 28 | 29 | func main() { 30 | // Open the gaeadb database located in the /tmp/gaea.db directory. 31 | // It will be created if it doesn't exist. 32 | cfg := db.DefaultConfig() 33 | cfg.DirName = "/tmp/gaea.db" 34 | db, err := db.Open(cfg) 35 | if err != nil { 36 | log.Fatal(err) 37 | } 38 | defer db.Close() 39 | // Your code here… 40 | } 41 | ``` 42 | 43 | ### DB interface 44 | 45 | ```go 46 | type DB interface { 47 | Close() error 48 | 49 | Del([]byte) error 50 | Set([]byte, []byte) error 51 | Get([]byte) ([]byte, error) 52 | 53 | NewTransaction(readOnly bool) (Transaction, error) 54 | } 55 | ``` 56 | 57 | ### Transaction interface 58 | ```go 59 | type Transaction interface { 60 | Commit() error 61 | Rollback() error 62 | Del([]byte) error 63 | Set([]byte, []byte) error 64 | Get([]byte) ([]byte, error) 65 | NewForwardIterator([]byte) (Iterator, error) 66 | NewBackwardIterator([]byte) (Iterator, error) 67 | } 68 | 69 | type Iterator interface { 70 | Close() error 71 | Next() error 72 | Valid() bool 73 | Key() []byte // can use outside 74 | Value() ([]byte, error) // can use outside 75 | } 76 | 77 | ``` 78 | 79 | ## Benchmarks 80 | 81 | I have run comprehensive benchmarks against Bolt and Badger. The 82 | benchmarking code and the detailed logs for the benchmarks can be found in the 83 | [gaeadbBench] repo.
84 | 85 | [gaeadbBench]: https://github.com/infinivision/gaeadbBench 86 | 87 | ## Caveats & Limitations 88 | 89 | ### Caveats 90 | sync is always on and not allowed to close 91 | 92 | ### Limitations 93 | The maximum value of key is 4074 and the maximum value of value is 64k. 94 | -------------------------------------------------------------------------------- /wal/writer.go: -------------------------------------------------------------------------------- 1 | package wal 2 | 3 | import ( 4 | "encoding/binary" 5 | "hash/crc32" 6 | "os" 7 | "sync/atomic" 8 | "syscall" 9 | 10 | "github.com/infinivision/gaeadb/sum" 11 | "golang.org/x/sys/unix" 12 | ) 13 | 14 | func (w *walWriter) Close() error { 15 | return w.fp.close() 16 | } 17 | 18 | func (w *walWriter) EndCKPT() error { 19 | for w.n < w.m { 20 | if err := os.Truncate(fileName(w.n, w.dir), 0); err != nil { 21 | return err 22 | } 23 | w.n++ 24 | } 25 | return nil 26 | } 27 | 28 | func (w *walWriter) StartCKPT() error { 29 | w.m = w.idx - 1 30 | return nil 31 | } 32 | 33 | func (w *walWriter) Append(record []byte) error { 34 | f, o, err := w.alloc(record) 35 | if err != nil { 36 | return err 37 | } 38 | binary.LittleEndian.PutUint32(f.buf[o:], sum.Sum(crc32.New(crc32.MakeTable(crc32.Castagnoli)), record)) 39 | binary.LittleEndian.PutUint32(f.buf[o+SumSize:], uint32(len(record))) 40 | copy(f.buf[o+HeaderSize:], record) 41 | defer func() { 42 | w.RLock() 43 | fp := w.fp 44 | w.RUnlock() 45 | cnt := add(&f.cnt, -1) 46 | if f != fp && cnt == 0 { 47 | f.close() 48 | } 49 | }() 50 | return f.flush() 51 | } 52 | 53 | func (w *walWriter) alloc(record []byte) (*file, int32, error) { 54 | m := int32(len(record) + HeaderSize) 55 | w.Lock() 56 | defer w.Unlock() 57 | for { 58 | if o, err := w.fp.alloc(m); err != nil { 59 | fp, err := newFile(fileName(w.idx, w.dir), w.flag) 60 | if err != nil { 61 | return nil, -1, err 62 | } 63 | w.idx++ 64 | if atomic.LoadInt32(&w.fp.cnt) == 0 { 65 | w.fp.close() 66 | } 67 | w.fp = fp 68 | } 
// add atomically adds y to *x and returns the resulting value.
//
// atomic.AddInt32 performs the read-modify-write in a single step, so no
// compare-and-swap retry loop is needed.
func add(x *int32, y int32) int32 {
	return atomic.AddInt32(x, y)
}
atomic.LoadInt64(&d.cnt) 44 | } 45 | 46 | func (d *disk) Read(bn int64, buf []byte) (Block, error) { 47 | switch { 48 | case bn < 0: 49 | if cnt := d.alloc(); cnt == -1 { 50 | return nil, errmsg.OutOfSpace 51 | } else { 52 | return &block{cnt - 1, buf}, nil 53 | } 54 | case bn > atomic.LoadInt64(&d.cnt): 55 | return nil, errmsg.OutOfSpace 56 | default: 57 | n, err := d.fp.ReadAt(buf, bn*constant.BlockSize) 58 | switch { 59 | case err != nil: 60 | return nil, err 61 | case n != constant.BlockSize: 62 | return nil, errmsg.ReadFailed 63 | } 64 | return &block{bn, buf}, nil 65 | } 66 | } 67 | 68 | func (d *disk) Write(b Block) error { 69 | n, err := d.fp.WriteAt(b.Buffer(), b.BlockNumber()*constant.BlockSize) 70 | switch { 71 | case err != nil: 72 | return err 73 | case n != constant.BlockSize: 74 | return errmsg.WriteFailed 75 | } 76 | return nil 77 | } 78 | 79 | func (d *disk) alloc() int64 { 80 | for { 81 | curr := atomic.LoadInt64(&d.cnt) 82 | if curr == math.MaxInt64 { 83 | return -1 84 | } 85 | next := curr + 1 86 | if atomic.CompareAndSwapInt64(&d.cnt, curr, next) { 87 | return next 88 | } 89 | } 90 | } 91 | 92 | func (d *disk) init() error { 93 | var bs []Block 94 | 95 | root, err := d.read(-1) 96 | if err != nil { 97 | return err 98 | } 99 | buf := root.Buffer() 100 | for i := 0; i < 256; i++ { 101 | b, err := d.read(-1) 102 | if err != nil { 103 | return err 104 | } 105 | binary.LittleEndian.PutUint64(buf[i*8:], uint64(b.BlockNumber())|constant.PN< 0 { 31 | return true 32 | } 33 | return false 34 | } 35 | 36 | func (itr *forwardIterator) Key() []byte { 37 | return itr.kv.ks[0] 38 | } 39 | 40 | func (itr *forwardIterator) Value() ([]byte, error) { 41 | k := string(itr.kv.ks[0]) 42 | switch o := itr.kv.omp[k]; o { 43 | case constant.Empty: 44 | return []byte{}, nil 45 | case constant.Delete: 46 | return nil, errmsg.NotExist 47 | case constant.Cache: 48 | if v, ok := itr.tx.wmp[k]; ok { 49 | if v == nil { 50 | return nil, errmsg.NotExist 51 | } 52 | return v, 
// seek buffers the next batch of keys from the underlying iterator into
// itr.kv and then calls fill to load their values.
//
// It stops buffering once more than constant.PreLoad keys are held or the
// underlying iterator reports errmsg.ScanEnd. It returns errmsg.ScanEnd
// itself only when the underlying iterator is already invalid on entry; any
// other error from the underlying iterator is returned unchanged.
func (itr *forwardIterator) seek() error {
	for itr.itr.Valid() {
		key := string(itr.itr.Key())
		if !itr.tx.ro {
			// Merge in this transaction's pending writes that sort before the
			// current stored key, marking them as served from the write map.
			// NOTE(review): this loop runs for every stored key visited and does
			// not skip keys that are already buffered, so the same pending key
			// looks like it can be inserted more than once — confirm.
			for k, _ := range itr.tx.wmp {
				if bytes.Compare([]byte(k), []byte(key)) < 0 {
					itr.kv.omp[k] = constant.Cache
					itr.kv.ks = LtPush([]byte(k), itr.kv.ks)
				}
			}
			// Record the timestamp of every key read that this transaction has
			// not written itself.
			if _, ok := itr.tx.wmp[key]; !ok {
				itr.tx.rmp[key] = itr.itr.Timestamp()
			}
		}
		// NOTE(review): the stored value is recorded even when key is also in
		// the write map, so Value() would not return the pending write for
		// such a key — confirm this is intended.
		itr.kv.omp[key] = itr.itr.Value()
		itr.kv.ks = append(itr.kv.ks, []byte(key))
		if len(itr.kv.ks) > constant.PreLoad {
			itr.fill()
			return nil
		}
		err := itr.itr.Next()
		switch {
		case err == errmsg.ScanEnd:
			// End of the scan: serve what has been buffered so far.
			itr.fill()
			return nil
		case err != nil:
			return err
		}
	}
	return errmsg.ScanEnd
}
// LtPush inserts x into xs, which must already be sorted in ascending byte
// order, and returns the extended slice. x is placed in front of the first
// element that compares greater than or equal to it.
func LtPush(x []byte, xs [][]byte) [][]byte {
	n := len(xs)
	pos := sort.Search(n, func(j int) bool {
		return bytes.Compare(x, xs[j]) <= 0
	})
	// Grow by one slot, then shift the tail right to open a gap at pos.
	grown := append(xs, nil)
	for j := n; j > pos; j-- {
		grown[j] = grown[j-1]
	}
	grown[pos] = x
	return grown
}
(itr *backwardIterator) seek() error { 64 | for itr.itr.Valid() { 65 | key := string(itr.itr.Key()) 66 | if !itr.tx.ro { 67 | for k, _ := range itr.tx.wmp { 68 | if bytes.Compare([]byte(k), []byte(key)) < 0 { 69 | itr.kv.omp[k] = constant.Cache 70 | itr.kv.ks = GtPush([]byte(k), itr.kv.ks) 71 | } 72 | } 73 | if _, ok := itr.tx.wmp[key]; !ok { 74 | itr.tx.rmp[key] = itr.itr.Timestamp() 75 | } 76 | } 77 | itr.kv.omp[key] = itr.itr.Value() 78 | itr.kv.ks = append(itr.kv.ks, []byte(key)) 79 | if len(itr.kv.ks) > constant.PreLoad { 80 | itr.fill() 81 | return nil 82 | } 83 | err := itr.itr.Next() 84 | switch { 85 | case err == errmsg.ScanEnd: 86 | itr.fill() 87 | return nil 88 | case err != nil: 89 | return err 90 | } 91 | } 92 | return errmsg.ScanEnd 93 | } 94 | 95 | func (itr *backwardIterator) fill() { 96 | min, max := itr.kv.omp[string(itr.kv.ks[0])], itr.kv.omp[string(itr.kv.ks[0])] 97 | for _, k := range itr.kv.ks { 98 | if o := itr.kv.omp[string(k)]; o > constant.Cache { 99 | switch { 100 | case o < min: 101 | min = o 102 | case o > max: 103 | max = o 104 | } 105 | } 106 | } 107 | buf, err := itr.tx.d.Load(min, int(max-min)+32) // preload 108 | switch { 109 | case err == nil: 110 | for _, k := range itr.kv.ks { 111 | if o := itr.kv.omp[string(k)]; o > constant.Cache { 112 | if int(o-min)+2 < len(buf) { 113 | n := int(binary.LittleEndian.Uint16(buf[o-min:])) 114 | if len(buf[o-min+2:]) >= n { 115 | itr.kv.mp[string(k)] = buf[int(o-min)+2 : int(o-min)+2+n] 116 | continue 117 | } 118 | } 119 | if v, err := itr.tx.d.Read(o); err == nil { 120 | itr.kv.mp[string(k)] = v 121 | } 122 | } 123 | } 124 | case err != nil: 125 | itr.tx.log.Errorf("forwardIterator - failed to preLoad: %v\n", err) 126 | for _, k := range itr.kv.ks { 127 | if o := itr.kv.omp[string(k)]; o > constant.Cache { 128 | if v, err := itr.tx.d.Read(o); err == nil { 129 | itr.kv.mp[string(k)] = v 130 | } 131 | } 132 | } 133 | } 134 | } 135 | 136 | func GtPush(x []byte, xs [][]byte) [][]byte { 137 | i := 
sort.Search(len(xs), func(i int) bool { return bytes.Compare(xs[i], x) <= 0 }) 138 | xs = append(xs, []byte{}) 139 | copy(xs[i+1:], xs[i:]) 140 | xs[i] = x 141 | return xs 142 | 143 | } 144 | -------------------------------------------------------------------------------- /db/db.go: -------------------------------------------------------------------------------- 1 | package db 2 | 3 | import ( 4 | "errors" 5 | "fmt" 6 | "os" 7 | "runtime" 8 | "syscall" 9 | 10 | "github.com/infinivision/gaeadb/cache" 11 | "github.com/infinivision/gaeadb/constant" 12 | "github.com/infinivision/gaeadb/data" 13 | "github.com/infinivision/gaeadb/disk" 14 | "github.com/infinivision/gaeadb/locker" 15 | "github.com/infinivision/gaeadb/mvcc" 16 | "github.com/infinivision/gaeadb/prefix" 17 | "github.com/infinivision/gaeadb/scheduler" 18 | "github.com/infinivision/gaeadb/transaction" 19 | "github.com/infinivision/gaeadb/wal" 20 | "github.com/nnsgmsone/damrey/logger" 21 | ) 22 | 23 | func DefaultConfig() Config { 24 | return Config{ 25 | CacheSize: 2000, 26 | DirName: "gaea.db", 27 | LogWriter: os.Stderr, 28 | CheckPointCycle: constant.CheckPointCycle, 29 | } 30 | } 31 | 32 | func Open(cfg Config) (*db, error) { 33 | if err := enlargelimit(); err != nil { 34 | return nil, err 35 | } 36 | if err := checkDir(cfg.DirName); err != nil { 37 | return nil, err 38 | } 39 | log := logger.New(cfg.LogWriter, "gaeadb") 40 | d, err := data.New(cfg.DirName) 41 | if err != nil { 42 | return nil, err 43 | } 44 | c, m, err := newMVCC(cfg, log) 45 | if err != nil { 46 | d.Close() 47 | return nil, err 48 | } 49 | ts, err := wal.Recover(cfg.DirName, d, m, c) 50 | if err != nil { 51 | d.Close() 52 | m.Close() 53 | return nil, err 54 | } 55 | w, err := wal.NewWriter(cfg.DirName) 56 | if err != nil { 57 | d.Close() 58 | return nil, err 59 | } 60 | constant.CheckPointCycle = cfg.CheckPointCycle 61 | schd := scheduler.New(ts, d, c, w) 62 | go schd.Run() 63 | return &db{d, m, w, c, log, schd}, nil 64 | } 65 | 66 | func 
// checkDir makes sure dir can be used as the database directory.
//
// A missing directory is created with mode 0775. An existing path must be a
// directory whose owner permission bits include read, write and execute;
// otherwise an error is returned. Any other error from os.Stat is returned
// unchanged.
func checkDir(dir string) error {
	st, err := os.Stat(dir)
	if os.IsNotExist(err) {
		return os.Mkdir(dir, os.FileMode(0775))
	}
	if err != nil {
		return err
	}
	if !st.IsDir() {
		// errors.New does not interpret format verbs, so the original message
		// contained a literal "%s" instead of the path.
		return fmt.Errorf("'%s' is not directory", dir)
	}
	if st.Mode()&0700 != 0700 {
		return errors.New("permission denied")
	}
	return nil
}
139 | rlimit.Cur = rlimit.Max 140 | return syscall.Setrlimit(syscall.RLIMIT_NOFILE, &rlimit) 141 | } 142 | return nil 143 | } 144 | -------------------------------------------------------------------------------- /locker/locker.go: -------------------------------------------------------------------------------- 1 | package locker 2 | 3 | import ( 4 | "container/list" 5 | "sync" 6 | "sync/atomic" 7 | "time" 8 | ) 9 | 10 | func New() *table { 11 | return &table{ 12 | mp: new(sync.Map), 13 | cq: new(list.List), 14 | hq: new(list.List), 15 | fq: new(list.List), 16 | ch: make(chan struct{}), 17 | lch: make(chan *locker, 1024), 18 | n: MinCacheSize / FreeMultiples, 19 | } 20 | } 21 | 22 | func (t *table) Run() { 23 | cnt := 0 24 | freeSize := t.n * FreeMultiples 25 | ticker := time.NewTicker(Cycle * time.Second) 26 | for { 27 | select { 28 | case <-t.ch: 29 | t.ch <- struct{}{} 30 | return 31 | case l := <-t.lch: 32 | if l.t == I { 33 | if cnt = cnt + 1; cnt%freeSize == 0 { 34 | t.gc() 35 | } 36 | } 37 | t.get(l) 38 | case <-ticker.C: 39 | t.gc() 40 | } 41 | } 42 | } 43 | 44 | func (t *table) Stop() { 45 | t.ch <- struct{}{} 46 | <-t.ch 47 | } 48 | 49 | func (t *table) Get(k uint64) Locker { 50 | for { 51 | if v, ok := t.mp.Load(k); ok { 52 | l := v.(*locker) 53 | if add(&l.n) > 0 { 54 | t.lch <- l 55 | return l 56 | } 57 | for _, ok := t.mp.Load(k); ok; _, ok = t.mp.Load(k) { // wait for delete 58 | } 59 | } 60 | l := &locker{n: 1, k: k} 61 | if _, ok := t.mp.LoadOrStore(k, l); !ok { 62 | return l 63 | } 64 | } 65 | } 66 | 67 | func (t *table) get(l *locker) { 68 | switch l.t { 69 | case E: 70 | return 71 | case I: 72 | t.set(l) 73 | return 74 | case H: 75 | isBack := l.h.Next() == nil 76 | t.hq.MoveToFront(l.h) 77 | if isBack { 78 | t.reduce() 79 | } 80 | return 81 | case F: 82 | t.fq.Remove(l.f) 83 | l.f = nil 84 | t.set(l) 85 | return 86 | } 87 | switch { 88 | case l.h == nil: 89 | t.cq.MoveToFront(l.c) 90 | l.h = t.hq.PushFront(l) 91 | default: 92 | l.t = H 93 | 
t.cq.Remove(l.c) 94 | l.c = nil 95 | t.hq.MoveToFront(l.h) 96 | t.exchange() 97 | t.reduce() 98 | } 99 | } 100 | 101 | func (t *table) set(l *locker) { 102 | switch { 103 | case t.hq.Len() < t.n: 104 | l.t = H 105 | l.h = t.hq.PushFront(l) 106 | case t.cq.Len() < t.n/ColdMultiples: 107 | l.t = C 108 | l.c = t.cq.PushFront(l) 109 | default: 110 | t.release() 111 | l.t = C 112 | l.c = t.cq.PushFront(l) 113 | } 114 | } 115 | func (t *table) release() { 116 | if e := t.cq.Back(); e != nil { 117 | l := e.Value.(*locker) 118 | l.c = nil 119 | t.cq.Remove(e) 120 | if l.h != nil { 121 | t.hq.Remove(l.h) 122 | l.h = nil 123 | } 124 | l.t = F 125 | l.f = t.fq.PushFront(l) 126 | } 127 | } 128 | 129 | func (t *table) reduce() { 130 | for e := t.hq.Back(); e != nil; e = t.hq.Back() { 131 | l := e.Value.(*locker) 132 | if l.t == H { 133 | return 134 | } 135 | l.h = nil 136 | t.hq.Remove(e) 137 | } 138 | } 139 | 140 | func (t *table) exchange() { 141 | if e := t.hq.Back(); e != nil { 142 | l := e.Value.(*locker) 143 | if l.t != H { 144 | return 145 | } 146 | t.hq.Remove(e) 147 | l.h = nil 148 | l.t = C 149 | l.c = t.cq.PushFront(l) 150 | } 151 | } 152 | 153 | func (t *table) gc() { 154 | cnt := MinCacheSize 155 | if n := t.fq.Len(); n < cnt { 156 | cnt = n 157 | } 158 | prev := t.fq.Back() 159 | for e := prev; e != nil; e = prev { 160 | if cnt == 0 { 161 | break 162 | } 163 | cnt-- 164 | l := e.Value.(*locker) 165 | if n := del(&l.n); n >= 0 { 166 | continue 167 | } 168 | prev = e.Prev() 169 | l.t = E 170 | t.fq.Remove(e) 171 | t.mp.Delete(l.k) 172 | } 173 | } 174 | 175 | func add(x *int32) int32 { 176 | for { 177 | curr := atomic.LoadInt32(x) 178 | if curr == -1 { 179 | return -1 180 | } 181 | next := curr + 1 182 | if atomic.CompareAndSwapInt32(x, curr, next) { 183 | return next 184 | } 185 | } 186 | } 187 | 188 | func del(x *int32) int32 { 189 | var curr int32 190 | 191 | if curr = atomic.LoadInt32(x); curr != 0 { 192 | return 0 193 | } 194 | if atomic.CompareAndSwapInt32(x, 
curr, -1) { 195 | return -1 196 | } 197 | return 0 198 | } 199 | -------------------------------------------------------------------------------- /wal/file.go: -------------------------------------------------------------------------------- 1 | package wal 2 | 3 | import ( 4 | "encoding/binary" 5 | "hash/crc32" 6 | 7 | "github.com/infinivision/gaeadb/constant" 8 | "github.com/infinivision/gaeadb/errmsg" 9 | "github.com/infinivision/gaeadb/sum" 10 | "golang.org/x/sys/unix" 11 | ) 12 | 13 | func (f *file) close() error { 14 | return unix.Munmap(f.buf) 15 | } 16 | 17 | func (f *file) flush() error { 18 | return unix.Msync(f.buf, unix.MS_SYNC) 19 | } 20 | 21 | func (f *file) alloc(size int32) (int32, error) { 22 | curr := f.size 23 | if curr+size > constant.MaxTransactionSize { 24 | return 0, errmsg.OutOfSpace 25 | } 26 | f.size += size 27 | return curr, nil 28 | } 29 | 30 | func (f *file) getSize() { 31 | buf := append([]byte{}, f.buf...) 32 | o := 0 33 | for len(buf) > HeaderSize { 34 | n := int(binary.LittleEndian.Uint32(buf[o+SumSize:])) 35 | if len(buf[o+HeaderSize:]) < n { 36 | f.size = int32(o) 37 | return 38 | } 39 | if sum.Sum(crc32.New(crc32.MakeTable(crc32.Castagnoli)), buf[o+HeaderSize:o+HeaderSize+n]) != binary.LittleEndian.Uint32(buf[o:]) { 40 | f.size = int32(o) 41 | return 42 | } 43 | switch buf[o+HeaderSize] { 44 | case EM: 45 | f.size = int32(o) 46 | return 47 | case EC: 48 | o += 1 49 | case SC: 50 | if len(buf[o+1:]) < 4 { 51 | f.size = int32(o) 52 | return 53 | } 54 | n := int(binary.LittleEndian.Uint32(buf[o+1:])) 55 | if len(buf[o+5:]) < n*8 { // incomplete record 56 | f.size = int32(o) 57 | return 58 | } 59 | o += 5 + n*8 60 | case AT: 61 | if len(buf[o+1:]) < 8 { 62 | f.size = int32(o) 63 | return 64 | } 65 | o += 9 66 | case CT: 67 | if len(buf[o+1:]) < 8 { 68 | f.size = int32(o) 69 | return 70 | } 71 | o += 9 72 | case ST: 73 | if len(buf[o+1:]) < 8 { 74 | f.size = int32(o) 75 | return 76 | } 77 | n := 
int(binary.LittleEndian.Uint32(buf[o+9:])) 78 | j := 13 79 | for i := 0; i < n; i++ { 80 | if len(buf[o+j:]) < 2 { 81 | f.size = int32(o) 82 | return 83 | } 84 | kn := int(binary.LittleEndian.Uint16(buf[o+j:])) 85 | j += 2 86 | if len(buf[o+j:]) < kn { 87 | f.size = int32(o) 88 | return 89 | } 90 | j += kn 91 | if len(buf[o+j:]) < 2 { 92 | f.size = int32(o) 93 | return 94 | } 95 | vn := int(binary.LittleEndian.Uint16(buf[o:])) 96 | j += 2 97 | if len(buf[o+j:]) < vn { 98 | f.size = int32(o) 99 | return 100 | } 101 | j += vn 102 | } 103 | o += j 104 | case WD: 105 | if len(buf[o+1:]) < 8 { 106 | f.size = int32(o) 107 | return 108 | } 109 | if len(buf[o+9:]) < 8 { 110 | f.size = int32(o) 111 | return 112 | } 113 | n := int(binary.LittleEndian.Uint32(buf[9:])) 114 | if len(buf[o+13:]) < n*8 { 115 | f.size = int32(o) 116 | return 117 | } 118 | o += 13 + n*8 119 | case NP: 120 | if len(buf[o+1:]) < 30 { 121 | f.size = int32(o) 122 | return 123 | } 124 | o += 31 125 | case CP: 126 | if len(buf[o+1:]) < 20 { 127 | f.size = int32(o) 128 | return 129 | } 130 | on := int(binary.LittleEndian.Uint16(buf[o+17:])) 131 | vn := int(binary.LittleEndian.Uint16(buf[o+19:])) 132 | o += 21 + on*2 + vn*8 133 | case NS: 134 | if len(buf[o+1:]) < 26 { 135 | f.size = int32(o) 136 | return 137 | } 138 | o += 27 139 | } 140 | } 141 | f.size = int32(o) 142 | } 143 | 144 | func newFile(path string, flag int) (*file, error) { 145 | fd, err := unix.Open(path, unix.O_CREAT|flag, 0664) 146 | if err != nil { 147 | return nil, err 148 | } 149 | defer unix.Close(fd) 150 | if err := unix.Ftruncate(fd, constant.MaxTransactionSize); err != nil { 151 | return nil, err 152 | } 153 | buf, err := unix.Mmap(fd, 0, constant.MaxTransactionSize, unix.PROT_WRITE|unix.PROT_READ, unix.MAP_SHARED) 154 | if err != nil { 155 | return nil, err 156 | } 157 | return &file{size: 0, buf: buf}, nil 158 | } 159 | 160 | func openFile(path string, flag int) (*file, error) { 161 | fd, err := unix.Open(path, flag, 0664) 162 | 
if err != nil { 163 | return nil, err 164 | } 165 | defer unix.Close(fd) 166 | buf, err := unix.Mmap(fd, 0, constant.MaxTransactionSize, unix.PROT_WRITE|unix.PROT_READ, unix.MAP_SHARED) 167 | if err != nil { 168 | return nil, err 169 | } 170 | return &file{size: 0, buf: buf}, nil 171 | } 172 | -------------------------------------------------------------------------------- /scheduler/scheduler.go: -------------------------------------------------------------------------------- 1 | package scheduler 2 | 3 | import ( 4 | "encoding/binary" 5 | "sort" 6 | "sync/atomic" 7 | "time" 8 | 9 | "github.com/infinivision/gaeadb/cache" 10 | "github.com/infinivision/gaeadb/constant" 11 | "github.com/infinivision/gaeadb/data" 12 | "github.com/infinivision/gaeadb/errmsg" 13 | "github.com/infinivision/gaeadb/scheduler/manager" 14 | "github.com/infinivision/gaeadb/wal" 15 | ) 16 | 17 | func New(ts uint64, d data.Data, c cache.Cache, w wal.Writer) *scheduler { 18 | return &scheduler{ 19 | ts: ts, 20 | mts: ts, 21 | xs: []*element{}, 22 | mgr: manager.New(), 23 | ch: make(chan struct{}), 24 | mp: make(map[string]*element), 25 | mch: make(chan *message, 1024), 26 | cp: &checkpoint{ 27 | c: c, 28 | d: d, 29 | w: w, 30 | s: true, 31 | t: time.Now(), 32 | mp: make(map[uint64]struct{}), 33 | mq: make(map[uint64]struct{}), 34 | }, 35 | } 36 | } 37 | 38 | func (s *scheduler) Run() { 39 | ticker := time.NewTicker(Cycle * time.Second) 40 | for { 41 | select { 42 | case <-s.ch: 43 | s.ch <- struct{}{} 44 | return 45 | case m := <-s.mch: 46 | s.process(m) 47 | case <-ticker.C: 48 | s.gc() 49 | } 50 | } 51 | } 52 | 53 | func (s *scheduler) Stop() { 54 | s.ch <- struct{}{} 55 | <-s.ch 56 | } 57 | 58 | func (s *scheduler) Start() uint64 { 59 | ts := atomic.LoadUint64(&s.ts) 60 | s.mch <- &message{t: S, ts: ts} 61 | return ts 62 | } 63 | 64 | func (s *scheduler) Done(ts uint64) error { 65 | rch := make(chan *result) 66 | s.mch <- &message{t: D, ts: ts, rch: rch} 67 | r := <-rch 68 | return r.err 69 | 
// process handles one message from the scheduler's queue. It is called from
// Run, so messages are handled one at a time.
//
//   - S registers the message's timestamp with the manager.
//   - D reports the timestamp to the checkpoint via endCKPT and replies with
//     the resulting error.
//   - C validates and commits a transaction: it replies with
//     errmsg.TransactionConflict if any key in the read set has been
//     committed with a timestamp newer than the one that was read; otherwise
//     it assigns a new commit timestamp, stamps every written key with it and
//     replies with that timestamp.
func (s *scheduler) process(m *message) {
	switch m.t {
	case S:
		s.mgr.Add(m.ts)
	case D:
		err := s.cp.endCKPT(m.ts)
		m.rch <- &result{err: err}
	case C:
		var err error

		// Conflict check: a key read at rts must not have been committed since.
		// NOTE(review): this early return skips s.mgr.Del(m.ts) below — confirm
		// the start timestamp is released elsewhere for conflicting transactions.
		for k, rts := range m.rmp {
			if e, ok := s.mp[k]; ok && e.ts > rts {
				err = errmsg.TransactionConflict
				m.rch <- &result{err: err}
				return
			}
		}
		ts := atomic.AddUint64(&s.ts, 1)
		// Stamp every written key with the commit timestamp, keeping s.xs
		// ordered by timestamp.
		for k, _ := range m.wmp {
			if e, ok := s.mp[k]; ok {
				e.ts = ts
				iSort(s.xs)
			} else {
				e = &element{k: k, ts: ts}
				s.mp[k] = e
				s.xs = push(e, s.xs)
			}
		}
		// s.mts is the bound gc uses when dropping old per-key entries.
		if s.mgr.Del(m.ts) {
			s.mts = m.ts
		}
		switch {
		case s.cp.s:
			// Track the commit in the current set and start a checkpoint once
			// the set is large enough or the checkpoint cycle has elapsed.
			s.cp.mp[ts] = struct{}{}
			if len(s.cp.mp) > CkptSize || time.Now().Sub(s.cp.t) > constant.CheckPointCycle {
				err = s.cp.startCKPT()
			}
		default:
			// A checkpoint is in progress: track the commit in the other set.
			s.cp.mq[ts] = struct{}{}
		}
		m.rch <- &result{err, ts}
	}
}
binary.LittleEndian.PutUint32(log[1:], uint32(len(c.mp))) 155 | i := 5 156 | for t, _ := range c.mp { 157 | binary.LittleEndian.PutUint64(log[i:], t) 158 | i += 8 159 | } 160 | c.w.StartCKPT() 161 | return c.w.Append(log) 162 | } 163 | 164 | func push(x *element, xs []*element) []*element { 165 | o := sort.Search(len(xs), func(i int) bool { return xs[i].ts >= x.ts }) 166 | xs = append(xs, &element{}) 167 | copy(xs[o+1:], xs[o:]) 168 | xs[o] = x 169 | return xs 170 | } 171 | 172 | func iSort(xs []*element) { 173 | n := len(xs) 174 | if n < 2 { 175 | return 176 | } 177 | for i := 1; i < n; i++ { 178 | for j := i - 1; j >= 0; j-- { 179 | if xs[j].ts > xs[j+1].ts { 180 | xs[j], xs[j+1] = xs[j+1], xs[j] 181 | } else { 182 | break 183 | } 184 | } 185 | } 186 | } 187 | -------------------------------------------------------------------------------- /cache/cache.go: -------------------------------------------------------------------------------- 1 | package cache 2 | 3 | import ( 4 | "container/list" 5 | "sync" 6 | "sync/atomic" 7 | "time" 8 | 9 | "github.com/infinivision/gaeadb/cache/scheduler" 10 | "github.com/infinivision/gaeadb/constant" 11 | "github.com/infinivision/gaeadb/disk" 12 | "github.com/nnsgmsone/damrey/logger" 13 | ) 14 | 15 | func New(limit int, d disk.Disk, log logger.Log) *cache { 16 | if limit < MinCacheSize { 17 | limit = MinCacheSize 18 | } 19 | return &cache{ 20 | log: log, 21 | mp: new(sync.Map), 22 | cq: new(list.List), 23 | hq: new(list.List), 24 | fq: new(list.List), 25 | sched: scheduler.New(d), 26 | ch: make(chan struct{}), 27 | n: limit / FreeMultiples, 28 | pch: make(chan *page, 1024), 29 | } 30 | } 31 | 32 | func (c *cache) Run() { 33 | { 34 | for i := constant.RootPage; i < constant.Preallocate; i++ { 35 | b, err := c.sched.Read(i) 36 | if err != nil { 37 | c.log.Fatalf("failed to load root page: %v\n", err) 38 | } 39 | c.ps[i] = &page{b: b, cp: c} 40 | } 41 | } 42 | cnt := 0 43 | freeSize := c.n * FreeMultiples 44 | ticker := 
// Get returns the page numbered pn with a reference held for the caller.
//
//   - pn == -1 asks the scheduler for a new block, registers the resulting
//     page under its own page number and returns it.
//   - pn below constant.Preallocate is served from the preloaded table c.ps.
//   - any other pn is looked up in the cache map and read through the
//     scheduler on a miss.
//
// Errors from the scheduler's Read are returned unchanged.
func (c *cache) Get(pn int64) (Page, error) {
	switch {
	case pn == -1:
		b, err := c.sched.Read(pn)
		if err != nil {
			return nil, err
		}
		pg := &page{b: b, n: 1, cp: c}
		c.mp.Store(pg.PageNumber(), pg)
		c.pch <- pg // hand the page to the Run loop for queue placement
		return pg, nil
	case pn < constant.Preallocate:
		return c.ps[pn], nil
	default:
		for {
			if v, ok := c.mp.Load(pn); ok {
				pg := v.(*page)
				// add refuses to take a reference once the counter is -1, i.e.
				// once gc has claimed the page for eviction.
				if add(&pg.n) > 0 {
					c.pch <- pg
					return pg, nil
				}
				// The page is being evicted: spin until gc has removed it from
				// the map, then fall through and load it again.
				for _, ok := c.mp.Load(pn); ok; _, ok = c.mp.Load(pn) { // wait for delete
				}
			}
			b, err := c.sched.Read(pn)
			if err != nil {
				return nil, err
			}
			pg := &page{b: b, n: 1, cp: c}
			// If another goroutine stored a page for pn first, retry the lookup.
			if _, ok := c.mp.LoadOrStore(pn, pg); !ok {
				return pg, nil
			}
		}
	}
}
c.reduce() 146 | } 147 | } 148 | 149 | func (c *cache) set(pg *page) { 150 | switch { 151 | case c.hq.Len() < c.n: 152 | pg.t = H 153 | pg.h = c.hq.PushFront(pg) 154 | case c.cq.Len() < c.n/ColdMultiples: 155 | pg.t = C 156 | pg.c = c.cq.PushFront(pg) 157 | default: 158 | c.release() 159 | pg.t = C 160 | pg.c = c.cq.PushFront(pg) 161 | } 162 | } 163 | func (c *cache) release() { 164 | if e := c.cq.Back(); e != nil { 165 | pg := e.Value.(*page) 166 | pg.c = nil 167 | c.cq.Remove(e) 168 | if pg.h != nil { 169 | c.hq.Remove(pg.h) 170 | pg.h = nil 171 | } 172 | pg.t = F 173 | pg.f = c.fq.PushFront(pg) 174 | } 175 | } 176 | 177 | func (c *cache) reduce() { 178 | for e := c.hq.Back(); e != nil; e = c.hq.Back() { 179 | pg := e.Value.(*page) 180 | if pg.t == H { 181 | return 182 | } 183 | pg.h = nil 184 | c.hq.Remove(e) 185 | } 186 | } 187 | 188 | func (c *cache) exchange() { 189 | if e := c.hq.Back(); e != nil { 190 | pg := e.Value.(*page) 191 | if pg.t != H { 192 | return 193 | } 194 | c.hq.Remove(e) 195 | pg.h = nil 196 | pg.t = C 197 | pg.c = c.cq.PushFront(pg) 198 | } 199 | } 200 | 201 | func (c *cache) gc() { 202 | cnt := MinCacheSize 203 | if n := c.fq.Len(); n < cnt { 204 | cnt = n 205 | } 206 | prev := c.fq.Back() 207 | for e := prev; e != nil; e = prev { 208 | if cnt == 0 { 209 | break 210 | } 211 | cnt-- 212 | pg := e.Value.(*page) 213 | if n := del(&pg.n); n >= 0 { 214 | continue 215 | } 216 | prev = e.Prev() 217 | pg.t = E 218 | c.fq.Remove(e) 219 | c.mp.Delete(pg.b.BlockNumber()) 220 | } 221 | } 222 | 223 | func add(x *int32) int32 { 224 | for { 225 | curr := atomic.LoadInt32(x) 226 | if curr == -1 { 227 | return -1 228 | } 229 | next := curr + 1 230 | if atomic.CompareAndSwapInt32(x, curr, next) { 231 | return next 232 | } 233 | } 234 | } 235 | 236 | func del(x *int32) int32 { 237 | var curr int32 238 | 239 | if curr = atomic.LoadInt32(x); curr != 0 { 240 | return 0 241 | } 242 | if atomic.CompareAndSwapInt32(x, curr, -1) { 243 | return -1 244 | } 245 | 
return 0 246 | } 247 | -------------------------------------------------------------------------------- /prefix/backward.go: -------------------------------------------------------------------------------- 1 | package prefix 2 | 3 | import ( 4 | "encoding/binary" 5 | 6 | "github.com/infinivision/gaeadb/cache" 7 | "github.com/infinivision/gaeadb/constant" 8 | "github.com/infinivision/gaeadb/suffix" 9 | ) 10 | 11 | func (itr *backwardIterator) Close() error { 12 | for { 13 | if itr.s.IsEmpty() { 14 | return nil 15 | } 16 | e := itr.s.Pop().(*backwardElement) 17 | switch e.typ { 18 | case R: 19 | case E: 20 | e.rsrc.le.RUnlock() 21 | itr.t.c.Release(e.rsrc.pg) 22 | case P: 23 | itr.t.c.Release(e.rsrc.pg) 24 | } 25 | } 26 | return nil 27 | } 28 | 29 | func (itr *backwardIterator) Next() error { 30 | for i := 0; ; i = 1 { 31 | if itr.s.IsEmpty() { 32 | return nil 33 | } 34 | e := itr.s.Peek().(*backwardElement) 35 | switch e.typ { 36 | case C: 37 | if i == 0 { 38 | itr.s.Pop() 39 | } else { 40 | itr.v = e.val 41 | itr.k = e.pref 42 | return nil 43 | } 44 | case S: 45 | if i == 0 { 46 | e.itr.Next() 47 | } 48 | if e.itr.Valid() { 49 | itr.v = e.itr.Value() 50 | itr.k = append(e.pref, e.itr.Key()...) 
51 | return nil 52 | } 53 | itr.s.Pop() 54 | case R: 55 | if e.cnt == -1 { 56 | itr.s.Pop() 57 | continue 58 | } 59 | if err := itr.down(byte(e.cnt), nil, e); err != nil { 60 | itr.Close() 61 | return err 62 | } 63 | case E: 64 | itr.s.Pop() 65 | e.rsrc.le.RUnlock() 66 | itr.t.c.Release(e.rsrc.pg) 67 | case P: 68 | if e.cnt == -1 { 69 | itr.s.Pop() 70 | itr.t.c.Release(e.rsrc.pg) 71 | continue 72 | } 73 | if err := itr.down(byte(e.cnt), e.rsrc.pg, e); err != nil { 74 | itr.Close() 75 | return err 76 | } 77 | } 78 | } 79 | } 80 | 81 | func (itr *backwardIterator) Valid() bool { 82 | return !itr.s.IsEmpty() 83 | } 84 | 85 | func (itr *backwardIterator) Key() []byte { 86 | if itr.s.IsEmpty() { 87 | return nil 88 | } 89 | return itr.k 90 | } 91 | 92 | func (itr *backwardIterator) Value() uint64 { 93 | if itr.s.IsEmpty() { 94 | return 0 95 | } 96 | return itr.v 97 | } 98 | 99 | func (itr *backwardIterator) seek() error { 100 | e := itr.s.Peek().(*backwardElement) 101 | switch e.typ { 102 | case S: 103 | itr.v = e.itr.Value() 104 | itr.k = append(e.pref, e.itr.Key()...) 
105 | return nil 106 | case C: 107 | itr.v = e.val 108 | itr.k = e.pref 109 | return nil 110 | default: 111 | return itr.Next() 112 | } 113 | } 114 | 115 | func (itr *backwardIterator) down(k byte, par cache.Page, e *backwardElement) error { 116 | if e.typ == R { 117 | e.cnt-- 118 | root, err := itr.t.c.Get(constant.RootPage) 119 | if err != nil { 120 | return err 121 | } 122 | le := itr.t.t.Get(uint64(constant.RootPage) | uint64(k)<= j; i-- { 178 | if v := binary.LittleEndian.Uint64(par.Buffer()[2048+i*8:]); v != constant.Cancel { 179 | vs = append(vs, v) 180 | ks = append(ks, append(e.pref, byte(i))) 181 | } 182 | } 183 | if fItr := suffix.NewForwardIterator(ks, vs, nil, pg); fItr.Valid() { 184 | itr.s.Push(&forwardElement{ 185 | typ: S, 186 | itr: fItr, 187 | pref: e.pref, 188 | }) 189 | } 190 | e.cnt = int(pg.Buffer()[3]) + 1 191 | itr.t.c.Release(pg) 192 | case constant.SN: 193 | e.cnt++ 194 | if fItr := suffix.NewForwardIterator(nil, nil, nil, pg); fItr.Valid() { 195 | itr.s.Push(&forwardElement{ 196 | typ: S, 197 | itr: fItr, 198 | pref: append(e.pref, k), 199 | }) 200 | } 201 | itr.t.c.Release(pg) 202 | if v := binary.LittleEndian.Uint64(par.Buffer()[2048+int(k)*8:]); v != constant.Cancel { 203 | itr.s.Push(&forwardElement{ 204 | typ: C, 205 | val: v, 206 | pref: append(e.pref, k), 207 | }) 208 | } 209 | default: 210 | e.cnt++ 211 | itr.t.c.Release(pg) 212 | if v := binary.LittleEndian.Uint64(par.Buffer()[2048+int(k)*8:]); v != constant.Cancel { 213 | itr.s.Push(&forwardElement{ 214 | typ: C, 215 | val: v, 216 | pref: append(e.pref, k), 217 | }) 218 | } 219 | } 220 | return nil 221 | } 222 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | github.com/AndreasBriese/bbloom v0.0.0-20190306092124-e2d15f34fcf9/go.mod h1:bOvUY6CB00SOBii9/FifXqc0awNKxLFCL/+pkDPuyl8= 2 | github.com/BurntSushi/toml v0.3.1/go.mod 
h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= 3 | github.com/armon/consul-api v0.0.0-20180202201655-eb2c6b5be1b6/go.mod h1:grANhF5doyWs3UAsr3K4I6qtAmlQcZDesFNEHPZAzj8= 4 | github.com/coreos/etcd v3.3.10+incompatible/go.mod h1:uF7uidLiAD3TWHmW31ZFd/JWoc32PjwdhPthX9715RE= 5 | github.com/coreos/go-etcd v2.0.0+incompatible/go.mod h1:Jez6KQU2B/sWsbdaef3ED8NzMklzPG4d5KIOhIy30Tk= 6 | github.com/coreos/go-semver v0.2.0/go.mod h1:nnelYz7RCh+5ahJtPPxZlU+153eP4D4r3EedlOD2RNk= 7 | github.com/cpuguy83/go-md2man v1.0.10/go.mod h1:SmD6nW6nTyfqj6ABTjUi3V3JVMnlJmwcJI5acqYI6dE= 8 | github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 9 | github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 10 | github.com/dgraph-io/badger v1.6.0/go.mod h1:zwt7syl517jmP8s94KqSxTlM6IMsdhYy6psNgSztDR4= 11 | github.com/dgryski/go-farm v0.0.0-20190423205320-6a90982ecee2/go.mod h1:SqUrOPUnsFjfmXRMNPybcSiG0BgUW2AuFH8PAnS2iTw= 12 | github.com/dustin/go-humanize v1.0.0/go.mod h1:HtrtbFcZ19U5GC7JDqmcUSB87Iq5E25KnS6fMYU6eOk= 13 | github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMoQvtojpjFo= 14 | github.com/golang/protobuf v1.3.1/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= 15 | github.com/hashicorp/hcl v1.0.0/go.mod h1:E5yfLk+7swimpb2L/Alb/PJmXilQ/rhwaUYs4T20WEQ= 16 | github.com/inconshreveable/mousetrap v1.0.0/go.mod h1:PxqpIevigyE2G7u3NXJIT2ANytuPF1OarO4DADm73n8= 17 | github.com/klauspost/compress v1.8.2/go.mod h1:RyIbtBH6LamlWaDj8nUwkbUhJ87Yi3uG0guNDohfE1A= 18 | github.com/klauspost/cpuid v1.2.1/go.mod h1:Pj4uuM528wm8OyEC2QMXAi2YiTZ96dNQPGgoMS4s3ek= 19 | github.com/magiconair/properties v1.8.0/go.mod h1:PppfXfuXeibc/6YijjN8zIbojt8czPbwD3XqdrwzmxQ= 20 | github.com/mitchellh/go-homedir v1.1.0/go.mod h1:SfyaCUpYCn1Vlf4IUYiD9fPX4A5wJrkLzIz1N1q0pr0= 21 | github.com/mitchellh/mapstructure v1.1.2/go.mod h1:FVVH3fgwuzCH5S8UJGiWEs2h04kUh9fWfEaFds41c1Y= 22 | github.com/nnsgmsone/damrey v1.0.2 
h1:Ms4905Wa7j5bIElQg2JGbbPdP4+plYpQNQiC4zrLw3w= 23 | github.com/nnsgmsone/damrey v1.0.2/go.mod h1:sRx8G+Ox98DPEaCO1j1wPT63auyaWrFqVYjCDUP2rQE= 24 | github.com/pelletier/go-toml v1.2.0/go.mod h1:5z9KED0ma1S8pY6P1sdut58dfprrGBbd/94hg7ilaic= 25 | github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= 26 | github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= 27 | github.com/russross/blackfriday v1.5.2/go.mod h1:JO/DiYxRf+HjHt06OyowR9PTA263kcR/rfWxYHBV53g= 28 | github.com/spf13/afero v1.1.2/go.mod h1:j4pytiNVoe2o6bmDsKpLACNPDBIoEAkihy7loJ1B0CQ= 29 | github.com/spf13/cast v1.3.0/go.mod h1:Qx5cxh0v+4UWYiBimWS+eyWzqEqokIECu5etghLkUJE= 30 | github.com/spf13/cobra v0.0.5/go.mod h1:3K3wKZymM7VvHMDS9+Akkh4K60UwM26emMESw8tLCHU= 31 | github.com/spf13/jwalterweatherman v1.0.0/go.mod h1:cQK4TGJAtQXfYWX+Ddv3mKDzgVb68N+wFjFa4jdeBTo= 32 | github.com/spf13/pflag v1.0.3/go.mod h1:DYY7MBk1bdzusC3SYhjObp+wFpr4gzcvqqNjLnInEg4= 33 | github.com/spf13/viper v1.3.2/go.mod h1:ZiWeW+zYFKm7srdB9IoDzzZXaJaI5eL9QjNiN/DMA2s= 34 | github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= 35 | github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= 36 | github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= 37 | github.com/ugorji/go/codec v0.0.0-20181204163529-d75b2dcb6bc8/go.mod h1:VFNgLljTbGfSG7qAOspJ7OScBnGdDN/yBr0sguwnwf0= 38 | github.com/valyala/bytebufferpool v1.0.0/go.mod h1:6bBcMArwyJ5K/AmCkWv1jt77kVWyCJ6HpOuEn7z0Csc= 39 | github.com/valyala/fasthttp v1.5.0/go.mod h1:eriCz9OhZjKCGfJ185a/IDgNl0bg9IbzfpcslMZXU1c= 40 | github.com/valyala/tcplisten v0.0.0-20161114210144-ceec8f93295a/go.mod h1:v3UYOV9WzVtRmSR+PDvWpU/qWl4Wa5LApYYX4ZtKbio= 41 | github.com/xordataexchange/crypt v0.0.3-0.20170626215501-b2862e3d0a77/go.mod h1:aYKd//L2LvnjZzWKhF00oedf4jCCReLcmhLdhm1A27Q= 42 | golang.org/x/crypto v0.0.0-20181203042331-505ab145d0a9/go.mod 
h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4= 43 | golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= 44 | golang.org/x/crypto v0.0.0-20191002192127-34f69633bfdc/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= 45 | golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= 46 | golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= 47 | golang.org/x/net v0.0.0-20190827160401-ba9fcec4b297/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= 48 | golang.org/x/sys v0.0.0-20181205085412-a5c9d58dba9a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= 49 | golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= 50 | golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 51 | golang.org/x/sys v0.0.0-20190626221950-04f50cda93cb/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 52 | golang.org/x/sys v0.0.0-20191210023423-ac6580df4449 h1:gSbV7h1NRL2G1xTg/owz62CST1oJBmxy4QpMMregXVQ= 53 | golang.org/x/sys v0.0.0-20191210023423-ac6580df4449/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 54 | golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= 55 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= 56 | gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= 57 | -------------------------------------------------------------------------------- /transaction/transaction.go: -------------------------------------------------------------------------------- 1 | package transaction 2 | 3 | import ( 4 | "encoding/binary" 5 | "sync/atomic" 6 | 7 | "github.com/infinivision/gaeadb/cache" 8 | "github.com/infinivision/gaeadb/constant" 9 | "github.com/infinivision/gaeadb/data" 10 | 
"github.com/infinivision/gaeadb/errmsg" 11 | "github.com/infinivision/gaeadb/mvcc" 12 | "github.com/infinivision/gaeadb/scheduler" 13 | "github.com/infinivision/gaeadb/wal" 14 | "github.com/nnsgmsone/damrey/logger" 15 | ) 16 | 17 | func New(ro bool, d data.Data, m mvcc.MVCC, w wal.Writer, log logger.Log, schd scheduler.Scheduler) *transaction { 18 | return &transaction{ 19 | s: 13, // timestamp size + one byte + key's number 20 | d: d, 21 | m: m, 22 | w: w, 23 | ro: ro, 24 | log: log, 25 | schd: schd, 26 | rts: schd.Start(), 27 | rmp: make(map[string]uint64), 28 | wmp: make(map[string][]byte), 29 | } 30 | } 31 | 32 | func (tx *transaction) Rollback() error { 33 | if del(&tx.n) >= 0 { 34 | return nil 35 | } 36 | return nil 37 | } 38 | 39 | func (tx *transaction) Commit() error { 40 | var err error 41 | var os []uint64 42 | var ks []string 43 | 44 | switch { 45 | case tx.ro: 46 | return errmsg.ReadOnlyTransaction 47 | case del(&tx.n) >= 0: 48 | return nil 49 | } 50 | tx.wts, err = tx.schd.Commit(tx.rts, tx.rmp, tx.wmp) 51 | if err != nil { 52 | return err 53 | } 54 | cnt := 0 55 | log := make([]byte, tx.s) 56 | { // commit 57 | log[0] = wal.ST 58 | binary.LittleEndian.PutUint64(log[1:], tx.wts) 59 | binary.LittleEndian.PutUint32(log[9:], uint32(len(tx.wmp))) 60 | i := 13 61 | for k, v := range tx.wmp { 62 | binary.LittleEndian.PutUint16(log[i:], uint16(len(k))) 63 | i += 2 64 | copy(log[i:], []byte(k)) 65 | i += len(k) 66 | binary.LittleEndian.PutUint16(log[i:], uint16(len(v))) 67 | i += 2 68 | if len(v) > 0 { 69 | cnt++ 70 | copy(log[i:], v) 71 | i += len(v) 72 | } 73 | } 74 | if err = tx.w.Append(log); err != nil { 75 | tx.log.Fatalf("transaction start failed: %v\n", err) 76 | } 77 | } 78 | { 79 | if len(log) < 9+4+cnt*8 { 80 | log = make([]byte, 9+4+cnt*8) 81 | } else { 82 | log = log[:9+4+cnt*8] 83 | } 84 | log[0] = wal.WD 85 | binary.LittleEndian.PutUint64(log[1:], tx.wts) 86 | binary.LittleEndian.PutUint32(log[9:], uint32(cnt)) 87 | i := 13 88 | for k, v := 
range tx.wmp { 89 | switch { 90 | case v == nil: 91 | continue 92 | case len(v) == 0: 93 | ks = append(ks, k) 94 | continue 95 | } 96 | if o, err := tx.d.Alloc(v); err != nil { 97 | tx.log.Fatalf("transaction alloc space for data failed: %v\n", err) 98 | } else { 99 | os = append(os, o) 100 | binary.LittleEndian.PutUint64(log[i:], o) 101 | i += 8 102 | } 103 | ks = append(ks, k) 104 | } 105 | if err = tx.w.Append(log); err != nil { 106 | tx.log.Fatalf("transaction append record failed: %v\n", err) 107 | } 108 | } 109 | w := &walWriter{ 110 | w: tx.w, 111 | ts: tx.wts, 112 | mp: make(map[int64]cache.Page), 113 | } 114 | for _, k := range ks { 115 | switch { 116 | case tx.wmp[k] == nil: 117 | if err := tx.m.Set([]byte(k), constant.Delete, tx.wts, w); err != nil { 118 | tx.log.Fatalf("transaction del '%s' failed: %v\n", k, err) 119 | } 120 | case len(tx.wmp[k]) == 0: 121 | if err := tx.m.Set([]byte(k), constant.Empty, tx.wts, w); err != nil { 122 | tx.log.Fatalf("transaction set '%s' failed: %v\n", k, err) 123 | } 124 | default: 125 | if err := tx.d.Write(os[0], tx.wmp[k]); err != nil { 126 | tx.log.Fatalf("transaction write data of '%s' failed: %v\n", k, err) 127 | } 128 | if err := tx.m.Set([]byte(k), os[0], tx.wts, w); err != nil { 129 | tx.log.Fatalf("transaction set '%s' failed: %v\n", k, err) 130 | } 131 | os = os[1:] 132 | } 133 | } 134 | { 135 | log = log[:9] 136 | log[0] = wal.CT 137 | binary.LittleEndian.PutUint64(log[1:], tx.wts) 138 | if err = tx.w.Append(log); err != nil { 139 | tx.log.Fatalf("transaction commit failed: %v\n", err) 140 | } 141 | } 142 | { 143 | for _, pg := range w.mp { 144 | if pg.s { 145 | pg.pg.Sync() 146 | } 147 | } 148 | } 149 | if err := tx.schd.Done(tx.wts); err != nil { 150 | tx.log.Fatalf("transaction done failed: %v\n", err) 151 | } 152 | return nil 153 | } 154 | 155 | func (tx *transaction) Del(k []byte) error { 156 | switch { 157 | case tx.ro: 158 | return errmsg.ReadOnlyTransaction 159 | case len(k) == 0: 160 | return 
errmsg.KeyIsEmpty 161 | case len(k) > constant.MaxKeySize: 162 | return errmsg.KeyTooLong 163 | } 164 | if tx.s += 4 + len(k); tx.s > constant.MaxTransactionSize { 165 | return errmsg.OutOfSpace 166 | } 167 | tx.wmp[string(k)] = nil 168 | return nil 169 | } 170 | 171 | func (tx *transaction) Set(k, v []byte) error { 172 | switch { 173 | case tx.ro: 174 | return errmsg.ReadOnlyTransaction 175 | case len(k) == 0: 176 | return errmsg.KeyIsEmpty 177 | case len(k) > constant.MaxKeySize: 178 | return errmsg.KeyTooLong 179 | case len(v) > constant.MaxValueSize: 180 | return errmsg.ValTooLong 181 | } 182 | if tx.s += 4 + len(k) + len(v); tx.s > constant.MaxTransactionSize { 183 | return errmsg.OutOfSpace 184 | } 185 | tx.wmp[string(k)] = v 186 | return nil 187 | } 188 | 189 | func (tx *transaction) Get(k []byte) ([]byte, error) { 190 | if len(k) == 0 { 191 | return nil, errmsg.KeyIsEmpty 192 | } 193 | if !tx.ro { 194 | if v, ok := tx.wmp[string(k)]; ok { 195 | if v == nil { 196 | return nil, errmsg.NotExist 197 | } 198 | return v, nil 199 | } 200 | } 201 | o, ts, err := tx.m.Get(k, tx.rts) 202 | if err != nil { 203 | return nil, err 204 | } 205 | switch { 206 | case o != constant.Empty: 207 | if v, err := tx.d.Read(o); err != nil { 208 | return nil, err 209 | } else { 210 | if !tx.ro { 211 | tx.rmp[string(k)] = ts 212 | } 213 | return v, nil 214 | } 215 | default: 216 | if !tx.ro { 217 | tx.rmp[string(k)] = ts 218 | } 219 | return []byte{}, nil 220 | } 221 | } 222 | 223 | func (tx *transaction) NewForwardIterator(pref []byte) (Iterator, error) { 224 | if itr, err := tx.m.NewForwardIterator(pref, tx.rts); err != nil { 225 | return nil, err 226 | } else { 227 | fitr := &forwardIterator{ 228 | tx: tx, 229 | itr: itr, 230 | kv: &kvList{ 231 | mp: make(map[string][]byte), 232 | omp: make(map[string]uint64), 233 | }, 234 | } 235 | return fitr, fitr.seek() 236 | } 237 | } 238 | 239 | func (tx *transaction) NewBackwardIterator(pref []byte) (Iterator, error) { 240 | if itr, err := 
tx.m.NewBackwardIterator(pref, tx.rts); err != nil { 241 | return nil, err 242 | } else { 243 | bitr := &forwardIterator{ 244 | tx: tx, 245 | itr: itr, 246 | kv: &kvList{ 247 | mp: make(map[string][]byte), 248 | omp: make(map[string]uint64), 249 | }, 250 | } 251 | return bitr, bitr.seek() 252 | } 253 | } 254 | 255 | func del(x *int32) int32 { 256 | var curr int32 257 | 258 | if curr = atomic.LoadInt32(x); curr != 0 { 259 | return 0 260 | } 261 | if atomic.CompareAndSwapInt32(x, curr, -1) { 262 | return -1 263 | } 264 | return 0 265 | } 266 | -------------------------------------------------------------------------------- /wal/recover.go: -------------------------------------------------------------------------------- 1 | package wal 2 | 3 | import ( 4 | "encoding/binary" 5 | "hash/crc32" 6 | "os" 7 | 8 | "github.com/infinivision/gaeadb/cache" 9 | "github.com/infinivision/gaeadb/constant" 10 | "github.com/infinivision/gaeadb/data" 11 | "github.com/infinivision/gaeadb/mvcc" 12 | "github.com/infinivision/gaeadb/sum" 13 | "golang.org/x/sys/unix" 14 | ) 15 | 16 | func Recover(dir string, d data.Data, m mvcc.MVCC, c cache.Cache) (uint64, error) { 17 | h, l, err := headAndLast(dir) 18 | if err != nil { 19 | return 0, err 20 | } 21 | if h < 0 { 22 | return 0, nil 23 | } 24 | ok, last, err := findCKPT(dir, h, l) 25 | if err != nil { 26 | return 0, err 27 | } 28 | switch { 29 | case !ok: 30 | return recoverFromStart(dir, h, l, d, m, c) 31 | default: 32 | return recoverFromCKPT(dir, h, last, d, m, c) 33 | } 34 | return 0, nil 35 | } 36 | 37 | func recoverFromCKPT(dir string, head, last int, d data.Data, m mvcc.MVCC, c cache.Cache) (uint64, error) { 38 | rs, err := loads(head, last, dir) 39 | if err != nil { 40 | return 0, err 41 | } 42 | ts, mp, mr, mq, rs := getTimestamp(rs, true) 43 | for i, j := 0, len(rs); i < j; i++ { 44 | switch r := rs[i].rc.(type) { 45 | case startTransaction: 46 | if _, ok := mr[r.ts]; ok { 47 | break 48 | } 49 | if _, ok := mp[r.ts]; ok { // redo 50 
| os := mq[r.ts].os 51 | for k, v := range r.mp { 52 | switch { 53 | case v == nil: 54 | if !m.Exist([]byte(k), r.ts) { 55 | if err := m.Del([]byte(k), r.ts, &recoverWriter{}); err != nil { 56 | return 0, err 57 | } 58 | } 59 | default: 60 | if err := d.Write(os[0], v); err != nil { 61 | return 0, err 62 | } 63 | if !m.Exist([]byte(k), r.ts) { 64 | if err := m.Set([]byte(k), os[0], r.ts, &recoverWriter{}); err != nil { 65 | return 0, err 66 | } 67 | } 68 | os = os[1:] 69 | } 70 | } 71 | } else { // undo 72 | if wd, ok := mq[r.ts]; ok { 73 | for _, o := range wd.os { 74 | if err := d.Del(o); err != nil { 75 | return 0, err 76 | } 77 | } 78 | } 79 | for k, _ := range r.mp { 80 | if m.Exist([]byte(k), r.ts) { 81 | if err := m.Set([]byte(k), constant.Cancel, r.ts, &recoverWriter{}); err != nil { 82 | return 0, err 83 | } 84 | } 85 | } 86 | } 87 | } 88 | } 89 | c.Flush() 90 | return ts, nil 91 | } 92 | 93 | func recoverFromStart(dir string, head, last int, d data.Data, m mvcc.MVCC, c cache.Cache) (uint64, error) { 94 | rs, err := loads(head, last, dir) 95 | if err != nil { 96 | return 0, err 97 | } 98 | ts, mp, _, mq, rs := getTimestamp(rs, false) 99 | for i, j := 0, len(rs); i < j; i++ { 100 | switch r := rs[i].rc.(type) { 101 | case startTransaction: 102 | if _, ok := mp[r.ts]; ok { // redo 103 | os := mq[r.ts].os 104 | for k, v := range r.mp { 105 | switch { 106 | case v == nil: 107 | if !m.Exist([]byte(k), r.ts) { 108 | if err := m.Del([]byte(k), r.ts, &recoverWriter{}); err != nil { 109 | return 0, err 110 | } 111 | } 112 | default: 113 | if err := d.Write(os[0], v); err != nil { 114 | return 0, err 115 | } 116 | if !m.Exist([]byte(k), r.ts) { 117 | if err := m.Set([]byte(k), os[0], r.ts, &recoverWriter{}); err != nil { 118 | return 0, err 119 | } 120 | } 121 | os = os[1:] 122 | } 123 | } 124 | } else { // undo 125 | if wd, ok := mq[r.ts]; ok { 126 | for _, o := range wd.os { 127 | if err := d.Del(o); err != nil { 128 | return 0, err 129 | } 130 | } 131 | } 132 | 
for k, _ := range r.mp { 133 | if m.Exist([]byte(k), r.ts) { 134 | if err := m.Set([]byte(k), constant.Cancel, r.ts, &recoverWriter{}); err != nil { 135 | return 0, err 136 | } 137 | } 138 | } 139 | } 140 | } 141 | } 142 | c.Flush() 143 | return ts, nil 144 | } 145 | 146 | func headAndLast(dir string) (int, int, error) { 147 | h := -1 148 | for i := 0; ; i++ { 149 | st, err := os.Stat(fileName(i, dir)) 150 | switch { 151 | case err == nil: 152 | if h == -1 && st.Size() != 0 { 153 | h = i 154 | } 155 | case os.IsNotExist(err): 156 | return h, i - 1, nil 157 | default: 158 | return -1, -1, err 159 | } 160 | } 161 | } 162 | 163 | func findCKPT(dir string, head, last int) (bool, int, error) { 164 | for last >= head { 165 | rs, err := load(fileName(last, dir)) 166 | if err != nil { 167 | return false, -1, err 168 | } 169 | for i := len(rs) - 1; i >= 0; i-- { 170 | switch rs[i].rc.(type) { 171 | case endCKPT: 172 | return true, last, nil 173 | } 174 | } 175 | last-- 176 | } 177 | return false, head, nil 178 | } 179 | 180 | func loads(head, last int, dir string) ([]*record, error) { 181 | var rs []*record 182 | 183 | for head <= last { 184 | s, err := load(fileName(head, dir)) 185 | if err != nil { 186 | return nil, err 187 | } 188 | rs = append(rs, s...) 189 | head++ 190 | } 191 | return rs, nil 192 | } 193 | 194 | func load(path string) ([]*record, error) { 195 | var rs []*record 196 | 197 | fp, err := openFile(path, unix.O_RDWR) 198 | if err != nil { 199 | return nil, err 200 | } 201 | defer fp.close() 202 | buf := append([]byte{}, fp.buf...) 
203 | for len(buf) > HeaderSize { 204 | n := int(binary.LittleEndian.Uint32(buf[SumSize:])) 205 | if len(buf[HeaderSize:]) < n { 206 | return rs, nil 207 | } 208 | if sum.Sum(crc32.New(crc32.MakeTable(crc32.Castagnoli)), buf[HeaderSize:HeaderSize+n]) != binary.LittleEndian.Uint32(buf) { 209 | return rs, nil 210 | } 211 | buf = buf[HeaderSize:] 212 | switch buf[0] { 213 | case EM: 214 | return rs, nil 215 | case EC: 216 | rs = append(rs, &record{endCKPT{}}) 217 | buf = buf[1:] 218 | case SC: 219 | if len(buf[1:]) < 4 { // incomplete record 220 | return rs, nil 221 | } 222 | n := int(binary.LittleEndian.Uint32(buf[1:])) 223 | if len(buf[5:]) < n*8 { // incomplete record 224 | return rs, nil 225 | } 226 | o := 5 227 | sc := startCKPT{} 228 | for i := 0; i < n; i++ { 229 | sc.ts = append(sc.ts, binary.LittleEndian.Uint64(buf[o:])) 230 | o += 8 231 | } 232 | rs = append(rs, &record{sc}) 233 | buf = buf[o:] 234 | case AT: 235 | if len(buf[1:]) < 8 { // incomplete record 236 | return rs, nil 237 | } 238 | rs = append(rs, &record{endTransaction{binary.LittleEndian.Uint64(buf[1:])}}) 239 | buf = buf[9:] 240 | case CT: 241 | if len(buf[1:]) < 8 { // incomplete record 242 | return rs, nil 243 | } 244 | rs = append(rs, &record{endTransaction{binary.LittleEndian.Uint64(buf[1:])}}) 245 | buf = buf[9:] 246 | case ST: 247 | if len(buf[1:]) < 8 { // incomplete record 248 | return rs, nil 249 | } 250 | st := startTransaction{} 251 | st.mp = make(map[string][]byte) 252 | st.ts = binary.LittleEndian.Uint64(buf[1:]) 253 | n := int(binary.LittleEndian.Uint32(buf[9:])) 254 | o := 13 255 | for i := 0; i < n; i++ { 256 | if len(buf[o:]) < 2 { 257 | return rs, nil 258 | } 259 | kn := int(binary.LittleEndian.Uint16(buf[o:])) 260 | o += 2 261 | if len(buf[o:]) < kn { 262 | return rs, nil 263 | } 264 | k := buf[o : o+kn] 265 | o += kn 266 | if len(buf[o:]) < 2 { 267 | return rs, nil 268 | } 269 | vn := int(binary.LittleEndian.Uint16(buf[o:])) 270 | o += 2 271 | if len(buf[o:]) < vn { 272 | 
return rs, nil 273 | } 274 | if vn > 0 { 275 | v := buf[o : o+vn] 276 | st.mp[string(k)] = v 277 | } else { 278 | st.mp[string(k)] = nil 279 | } 280 | o += vn 281 | } 282 | rs = append(rs, &record{st}) 283 | buf = buf[o:] 284 | case WD: 285 | if len(buf[1:]) < 8 { // incomplete record 286 | return rs, nil 287 | } 288 | if len(buf[9:]) < 4 { // incomplete record 289 | return rs, nil 290 | } 291 | n := int(binary.LittleEndian.Uint32(buf[9:])) 292 | if len(buf[13:]) < n*8 { // incomplete record 293 | return rs, nil 294 | } 295 | wd := writeData{} 296 | wd.ts = binary.LittleEndian.Uint64(buf[1:]) 297 | o := 13 298 | for i := 0; i < n; i++ { 299 | wd.os = append(wd.os, binary.LittleEndian.Uint64(buf[o:])) 300 | o += 8 301 | } 302 | rs = append(rs, &record{wd}) 303 | buf = buf[o:] 304 | case NP: 305 | if len(buf[1:]) < 30 { // incomplete record 306 | return rs, nil 307 | } 308 | on := int(binary.LittleEndian.Uint16(buf[27:])) 309 | vn := int(binary.LittleEndian.Uint16(buf[29:])) 310 | if len(buf[31:]) < on*2+vn*8 { // incomplete record 311 | return rs, nil 312 | } 313 | np := newPrefix{} 314 | np.ts = binary.LittleEndian.Uint64(buf[1:]) 315 | np.pn = binary.LittleEndian.Uint64(buf[9:]) 316 | np.val = binary.LittleEndian.Uint64(buf[17:]) 317 | np.off = binary.LittleEndian.Uint16(buf[25:]) 318 | o := 31 319 | for i := 0; i < on; i++ { 320 | np.os = append(np.os, binary.LittleEndian.Uint16(buf[o:])) 321 | o += 2 322 | } 323 | for i := 0; i < vn; i++ { 324 | np.vs = append(np.vs, binary.LittleEndian.Uint64(buf[o:])) 325 | o += 8 326 | } 327 | rs = append(rs, &record{np}) 328 | buf = buf[o:] 329 | case CP: 330 | if len(buf[1:]) < 20 { // incomplete record 331 | return rs, nil 332 | } 333 | on := int(binary.LittleEndian.Uint16(buf[17:])) 334 | vn := int(binary.LittleEndian.Uint16(buf[19:])) 335 | if len(buf[21:]) < on*2+vn*8 { // incomplete record 336 | return rs, nil 337 | } 338 | cp := chgPrefix{} 339 | cp.ts = binary.LittleEndian.Uint64(buf[3:]) 340 | cp.pn = 
binary.LittleEndian.Uint64(buf[11:]) 341 | o := 21 342 | for i := 0; i < on; i++ { 343 | cp.os = append(cp.os, binary.LittleEndian.Uint16(buf[o:])) 344 | o += 2 345 | } 346 | for i := 0; i < vn; i++ { 347 | cp.vs = append(cp.vs, binary.LittleEndian.Uint64(buf[o:])) 348 | o += 8 349 | } 350 | rs = append(rs, &record{cp}) 351 | buf = buf[o:] 352 | case NS: 353 | if len(buf[1:]) < 26 { // incomplete record 354 | return rs, nil 355 | } 356 | ns := newSuffix{} 357 | ns.start, ns.end = buf[1], buf[2] 358 | ns.ts = binary.LittleEndian.Uint64(buf[3:]) 359 | ns.pn = binary.LittleEndian.Uint64(buf[11:]) 360 | ns.val = binary.LittleEndian.Uint64(buf[19:]) 361 | rs = append(rs, &record{ns}) 362 | buf = buf[27:] 363 | } 364 | } 365 | return rs, nil 366 | } 367 | 368 | func recoverMetadata(rs []*record, c cache.Cache) error { 369 | for i, j := 0, len(rs); i < j; i++ { 370 | switch r := rs[i].rc.(type) { 371 | case newPrefix: 372 | par, err := c.Get(int64(r.pn)) 373 | if err != nil { 374 | return err 375 | } 376 | pg, err := c.Get(int64(r.val)) 377 | if err != nil { 378 | c.Release(par) 379 | return err 380 | } 381 | defer c.Release(pg) 382 | defer c.Release(par) 383 | binary.LittleEndian.PutUint64(par.Buffer()[r.off:], r.val|constant.PN<= 0; i-- { 419 | switch r := rs[i].rc.(type) { 420 | case writeData: 421 | mq[r.ts] = &r 422 | case startCKPT: 423 | if isCKPT { 424 | mr := make(map[uint64]struct{}) 425 | for _, t := range r.ts { 426 | mr[t] = struct{}{} 427 | } 428 | return ts, mp, mr, mq, rs[i+1:] 429 | } 430 | case endTransaction: 431 | if r.ts > ts { 432 | ts = r.ts 433 | } 434 | mp[r.ts] = struct{}{} 435 | } 436 | } 437 | return ts, mp, nil, mq, rs 438 | } 439 | -------------------------------------------------------------------------------- /prefix/prefix.go: -------------------------------------------------------------------------------- 1 | package prefix 2 | 3 | import ( 4 | "encoding/binary" 5 | 6 | "github.com/infinivision/gaeadb/cache" 7 | 
"github.com/infinivision/gaeadb/constant" 8 | "github.com/infinivision/gaeadb/errmsg" 9 | "github.com/infinivision/gaeadb/locker" 10 | "github.com/infinivision/gaeadb/stack" 11 | "github.com/infinivision/gaeadb/suffix" 12 | ) 13 | 14 | func New(c cache.Cache, t locker.Table) *tree { 15 | go t.Run() 16 | go c.Run() 17 | return &tree{c, t} 18 | } 19 | 20 | func (t *tree) Close() error { 21 | t.t.Stop() 22 | t.c.Stop() 23 | return nil 24 | } 25 | 26 | func (t *tree) Del(k []byte, w suffix.Writer) error { 27 | return t.Set(k, constant.Cancel, w) 28 | } 29 | 30 | func (t *tree) Get(k []byte) (uint64, error) { 31 | typ, pn, le, k, pg, err := t.down(k, false) 32 | if err != nil { 33 | return 0, err 34 | } 35 | defer t.c.Release(pg) 36 | defer le.RUnlock() 37 | switch typ { 38 | case constant.PN: 39 | if v := binary.LittleEndian.Uint64(pg.Buffer()[2048+int(k[0])*8:]); v == constant.Cancel { 40 | return 0, errmsg.NotExist 41 | } else { 42 | return v, nil 43 | } 44 | case constant.ES: 45 | return 0, nil 46 | case constant.MS: 47 | return t.find(k, pn) 48 | case constant.SN: 49 | return t.find(k[1:], pn) 50 | } 51 | return 0, nil 52 | } 53 | 54 | func (t *tree) Set(k []byte, v uint64, w suffix.Writer) error { 55 | typ, pn, le, k, pg, err := t.down(k, true) 56 | if err != nil { 57 | return err 58 | } 59 | defer t.c.Release(pg) 60 | defer le.Unlock() 61 | switch typ { 62 | case constant.PN: 63 | defer pg.Sync() 64 | binary.LittleEndian.PutUint64(pg.Buffer()[2048+int(k[0])*8:], v) 65 | return nil 66 | case constant.MS: 67 | return t.insert(k, v, w, pn, pg) 68 | case constant.SN: 69 | return t.insert(k[1:], v, w, pn, pg) 70 | case constant.ES: 71 | return t.insertByNewSuffix(k, v, w, pg) 72 | } 73 | return errmsg.UnknownError 74 | } 75 | 76 | func (t *tree) NewForwardIterator(pref []byte) (Iterator, error) { 77 | s := stack.New() 78 | switch { 79 | case len(pref) == 0: 80 | s.Push(&forwardElement{ 81 | typ: R, 82 | cnt: 0, 83 | pref: pref, 84 | }) 85 | default: 86 | typ, pn, le, 
suff, pg, err := t.down(pref, false) 87 | if err != nil { 88 | return nil, err 89 | } 90 | if err := t.newForwardElement(s, typ, pn, &resource{pg, le}, pref[:len(pref)-len(suff)], suff); err != nil { 91 | return nil, err 92 | } 93 | } 94 | itr := &forwardIterator{t: t, s: s} 95 | if err := itr.seek(); err != nil { 96 | itr.Close() 97 | return nil, err 98 | } 99 | return itr, nil 100 | } 101 | 102 | func (t *tree) NewBackwardIterator(pref []byte) (Iterator, error) { 103 | s := stack.New() 104 | switch { 105 | case len(pref) == 0: 106 | s.Push(&backwardElement{ 107 | typ: R, 108 | cnt: 255, 109 | pref: pref, 110 | }) 111 | default: 112 | typ, pn, le, suff, pg, err := t.down(pref, false) 113 | if err != nil { 114 | return nil, err 115 | } 116 | if err := t.newBackwardElement(s, typ, pn, &resource{pg, le}, pref[:len(pref)-len(suff)], suff); err != nil { 117 | return nil, err 118 | } 119 | } 120 | itr := &backwardIterator{t: t, s: s} 121 | if err := itr.seek(); err != nil { 122 | itr.Close() 123 | return nil, err 124 | } 125 | return itr, nil 126 | } 127 | 128 | func (t *tree) newForwardElement(s stack.Stack, typ int, pn int64, rsrc *resource, pref, suff []byte) error { 129 | for { 130 | switch typ { 131 | case constant.ES: 132 | s.Push(&forwardElement{ 133 | typ: E, 134 | rsrc: rsrc, 135 | }) 136 | if len(suff) == 1 && binary.LittleEndian.Uint64(rsrc.pg.Buffer()[2048+int(suff[0])*8:]) != constant.Cancel { 137 | s.Push(&forwardElement{ 138 | typ: C, 139 | pref: append(pref, suff[0]), 140 | val: binary.LittleEndian.Uint64(rsrc.pg.Buffer()[2048+int(suff[0])*8:]), 141 | }) 142 | } 143 | return nil 144 | case constant.MS: 145 | pg, err := t.c.Get(pn) 146 | if err != nil { 147 | defer t.c.Release(rsrc.pg) 148 | defer rsrc.le.RUnlock() 149 | return err 150 | } 151 | defer t.c.Release(pg) 152 | s.Push(&forwardElement{ 153 | typ: E, 154 | rsrc: rsrc, 155 | }) 156 | if fItr := suffix.NewForwardIterator(nil, nil, suff, pg); fItr.Valid() { 157 | s.Push(&forwardElement{ 158 | typ: 
S, 159 | itr: fItr, 160 | pref: pref, 161 | }) 162 | } 163 | if len(suff) == 1 && binary.LittleEndian.Uint64(rsrc.pg.Buffer()[2048+int(suff[0])*8:]) != constant.Cancel { 164 | s.Push(&forwardElement{ 165 | typ: C, 166 | pref: append(pref, suff[0]), 167 | val: binary.LittleEndian.Uint64(rsrc.pg.Buffer()[2048+int(suff[0])*8:]), 168 | }) 169 | } 170 | return nil 171 | case constant.SN: 172 | pg, err := t.c.Get(pn) 173 | if err != nil { 174 | defer t.c.Release(rsrc.pg) 175 | defer rsrc.le.RUnlock() 176 | return err 177 | } 178 | defer t.c.Release(pg) 179 | s.Push(&forwardElement{ 180 | typ: E, 181 | rsrc: rsrc, 182 | }) 183 | if fItr := suffix.NewForwardIterator(nil, nil, suff[1:], pg); fItr.Valid() { 184 | s.Push(&forwardElement{ 185 | typ: S, 186 | itr: fItr, 187 | pref: append(pref, suff[0]), 188 | }) 189 | } 190 | if len(suff) == 1 && binary.LittleEndian.Uint64(rsrc.pg.Buffer()[2048+int(suff[0])*8:]) != constant.Cancel { 191 | s.Push(&forwardElement{ 192 | typ: C, 193 | pref: append(pref, suff[0]), 194 | val: binary.LittleEndian.Uint64(rsrc.pg.Buffer()[2048+int(suff[0])*8:]), 195 | }) 196 | } 197 | return nil 198 | case constant.PN: 199 | pn, typ = branch(suff[0], rsrc.pg.Buffer()) 200 | if typ == constant.PN { 201 | pg, err := t.c.Get(pn) 202 | if err != nil { 203 | defer t.c.Release(rsrc.pg) 204 | defer rsrc.le.RUnlock() 205 | return err 206 | } 207 | s.Push(&forwardElement{ 208 | typ: E, 209 | rsrc: rsrc, 210 | }) 211 | s.Push(&forwardElement{ 212 | typ: P, 213 | cnt: 0, 214 | rsrc: &resource{pg: pg}, 215 | pref: append(pref, suff[0]), 216 | }) 217 | if binary.LittleEndian.Uint64(rsrc.pg.Buffer()[2048+int(suff[0])*8:]) != constant.Cancel { 218 | s.Push(&forwardElement{ 219 | typ: C, 220 | pref: append(pref, suff[0]), 221 | val: binary.LittleEndian.Uint64(rsrc.pg.Buffer()[2048+int(suff[0])*8:]), 222 | }) 223 | } 224 | return nil 225 | } 226 | } 227 | } 228 | return errmsg.UnknownError 229 | } 230 | 231 | func (t *tree) newBackwardElement(s stack.Stack, typ int, 
pn int64, rsrc *resource, pref, suff []byte) error { 232 | for { 233 | switch typ { 234 | case constant.ES: 235 | s.Push(&backwardElement{ 236 | typ: E, 237 | rsrc: rsrc, 238 | }) 239 | if len(suff) == 1 && binary.LittleEndian.Uint64(rsrc.pg.Buffer()[2048+int(suff[0])*8:]) != constant.Cancel { 240 | s.Push(&backwardElement{ 241 | typ: C, 242 | pref: append(pref, suff[0]), 243 | val: binary.LittleEndian.Uint64(rsrc.pg.Buffer()[2048+int(suff[0])*8:]), 244 | }) 245 | } 246 | return nil 247 | case constant.MS: 248 | pg, err := t.c.Get(pn) 249 | if err != nil { 250 | defer t.c.Release(rsrc.pg) 251 | defer rsrc.le.RUnlock() 252 | return err 253 | } 254 | defer t.c.Release(pg) 255 | s.Push(&backwardElement{ 256 | typ: E, 257 | rsrc: rsrc, 258 | }) 259 | if len(suff) == 1 && binary.LittleEndian.Uint64(rsrc.pg.Buffer()[2048+int(suff[0])*8:]) != constant.Cancel { 260 | s.Push(&backwardElement{ 261 | typ: C, 262 | pref: append(pref, suff[0]), 263 | val: binary.LittleEndian.Uint64(rsrc.pg.Buffer()[2048+int(suff[0])*8:]), 264 | }) 265 | } 266 | if bItr := suffix.NewBackwardIterator(nil, nil, suff, pg); bItr.Valid() { 267 | s.Push(&backwardElement{ 268 | typ: S, 269 | itr: bItr, 270 | pref: pref, 271 | }) 272 | } 273 | return nil 274 | case constant.SN: 275 | pg, err := t.c.Get(pn) 276 | if err != nil { 277 | defer t.c.Release(rsrc.pg) 278 | defer rsrc.le.RUnlock() 279 | return err 280 | } 281 | defer t.c.Release(pg) 282 | s.Push(&backwardElement{ 283 | typ: E, 284 | rsrc: rsrc, 285 | }) 286 | if len(suff) == 1 && binary.LittleEndian.Uint64(rsrc.pg.Buffer()[2048+int(suff[0])*8:]) != constant.Cancel { 287 | s.Push(&backwardElement{ 288 | typ: C, 289 | pref: append(pref, suff[0]), 290 | val: binary.LittleEndian.Uint64(rsrc.pg.Buffer()[2048+int(suff[0])*8:]), 291 | }) 292 | } 293 | if bItr := suffix.NewBackwardIterator(nil, nil, suff[1:], pg); bItr.Valid() { 294 | s.Push(&backwardElement{ 295 | typ: S, 296 | itr: bItr, 297 | pref: append(pref, suff[0]), 298 | }) 299 | } 300 | 
return nil 301 | case constant.PN: 302 | pn, typ = branch(suff[0], rsrc.pg.Buffer()) 303 | if typ == constant.PN { 304 | pg, err := t.c.Get(pn) 305 | if err != nil { 306 | defer t.c.Release(rsrc.pg) 307 | defer rsrc.le.RUnlock() 308 | return err 309 | } 310 | s.Push(&backwardElement{ 311 | typ: E, 312 | rsrc: rsrc, 313 | }) 314 | if binary.LittleEndian.Uint64(rsrc.pg.Buffer()[2048+int(suff[0])*8:]) != constant.Cancel { 315 | s.Push(&backwardElement{ 316 | typ: C, 317 | pref: append(pref, suff[0]), 318 | val: binary.LittleEndian.Uint64(rsrc.pg.Buffer()[2048+int(suff[0])*8:]), 319 | }) 320 | } 321 | s.Push(&backwardElement{ 322 | typ: P, 323 | cnt: 255, 324 | rsrc: &resource{pg: pg}, 325 | pref: append(pref, suff[0]), 326 | }) 327 | return nil 328 | } 329 | } 330 | } 331 | return errmsg.UnknownError 332 | } 333 | 334 | // return value 335 | // typ - node's type 336 | // pn - node's page number 337 | // le - locker 338 | // k - suffix 339 | // pg - parent node 340 | func (t *tree) down(k []byte, update bool) (int, int64, locker.Locker, []byte, cache.Page, error) { 341 | pn, typ := constant.RootPage, constant.PN 342 | le := t.t.Get(uint64(pn) | uint64(k[0])< 1 { 362 | if ok, rtyp, rpn, rpg, err := t.detect(k, pn); err != nil { 363 | goto ERR 364 | } else if ok { 365 | t.c.Release(pg) 366 | return rtyp, rpn, le, k, rpg, nil 367 | } 368 | } 369 | switch { 370 | case update: 371 | le.Unlock() 372 | default: 373 | le.RUnlock() 374 | } 375 | t.c.Release(pg) 376 | le = t.t.Get(uint64(pn) | uint64(k[0])<= 0; start-- { 429 | if _, typ := branch(start, buf); typ != constant.ES { 430 | start++ 431 | break 432 | } 433 | if start == 0 { 434 | break 435 | } 436 | } 437 | for end = k[0]; end <= 0xFF; end++ { 438 | if _, typ := branch(end, buf); typ != constant.ES { 439 | end-- 440 | break 441 | } 442 | if end == 0xFF { 443 | break 444 | } 445 | } 446 | pn := uint64(pg.PageNumber()) 447 | pg.Buffer()[2], pg.Buffer()[3] = start, end 448 | if start == end { 449 | pn |= constant.SN << 
constant.TypeOff 450 | } else { 451 | pn |= constant.MS << constant.TypeOff 452 | } 453 | for i := int(start); i <= int(end); i++ { 454 | binary.LittleEndian.PutUint64(buf[i*8:], pn) 455 | } 456 | } 457 | if err = w.NewSuffix(end, start, uint64(par.PageNumber()), uint64(pg.PageNumber())); err != nil { 458 | return err 459 | } 460 | par.Sync() 461 | if start == end { 462 | return suffix.Insert(k[1:], v, w, t.c, pg, par) 463 | } 464 | return suffix.Insert(k, v, w, t.c, pg, par) 465 | } 466 | 467 | func (t *tree) detect(k []byte, pn int64) (bool, int, int64, cache.Page, error) { 468 | pg, err := t.c.Get(pn) 469 | if err != nil { 470 | return false, 0, 0, nil, err 471 | } 472 | if pn, typ := branch(k[0], pg.Buffer()); typ == constant.MS || typ == constant.ES { 473 | return true, typ, pn, pg, nil 474 | } 475 | t.c.Release(pg) 476 | return false, 0, 0, nil, nil 477 | } 478 | 479 | func branch(k byte, buf []byte) (int64, int) { 480 | pn := binary.LittleEndian.Uint64(buf[int(k)*8:]) 481 | return int64(pn & constant.Mask), int((pn >> constant.TypeOff) & constant.TypeMask) 482 | } 483 | -------------------------------------------------------------------------------- /suffix/suffix.go: -------------------------------------------------------------------------------- 1 | package suffix 2 | 3 | import ( 4 | "bytes" 5 | "encoding/binary" 6 | "sort" 7 | 8 | "github.com/infinivision/gaeadb/cache" 9 | "github.com/infinivision/gaeadb/constant" 10 | ) 11 | 12 | func Find(k []byte, buf []byte) uint64 { 13 | if o := find(k, buf); o > 0 { 14 | return binary.LittleEndian.Uint64(buf[o:]) 15 | } 16 | return 0 17 | } 18 | 19 | func Insert(k []byte, v uint64, w Writer, c cache.Cache, pg, par cache.Page) error { 20 | return load(w, pg, true).insert(k, v, c, par) 21 | } 22 | 23 | func NewForwardIterator(ks [][]byte, vs []uint64, prefix []byte, pg cache.Page) Iterator { 24 | es := load(nil, pg, false).es 25 | for i := 0; i < len(ks); i++ { 26 | es = push(&element{vs[i], ks[i]}, es) 27 | } 28 | 
if len(prefix) > 0 { 29 | for len(es) > 0 { 30 | if bytes.HasPrefix(es[0].suff, prefix) { 31 | break 32 | } 33 | es = es[1:] 34 | } 35 | } 36 | return &forwardIterator{prefix, es} 37 | } 38 | 39 | func NewBackwardIterator(ks [][]byte, vs []uint64, prefix []byte, pg cache.Page) Iterator { 40 | es := load(nil, pg, false).es 41 | for i := 0; i < len(ks); i++ { 42 | es = push(&element{vs[i], ks[i]}, es) 43 | } 44 | if len(prefix) > 0 { 45 | for len(es) > 0 { 46 | if bytes.HasPrefix(es[len(es)-1].suff, prefix) { 47 | break 48 | } 49 | es = es[:len(es)-1] 50 | } 51 | } 52 | return &backwardIterator{prefix, es} 53 | } 54 | 55 | func (s *suffix) insert(k []byte, v uint64, c cache.Cache, par cache.Page) error { 56 | e := &element{v, k} 57 | if s.append(e) { 58 | return s.writeBack() 59 | } 60 | if s.pg.Buffer()[2] == s.pg.Buffer()[3] { 61 | return s.insertBySN(e, c, par) 62 | } 63 | return s.insertByMS(e, c, par) 64 | } 65 | 66 | // all prefix node are sorted 67 | func (s *suffix) insertBySN(e *element, c cache.Cache, par cache.Page) error { 68 | var os []uint16 69 | var vs []uint64 70 | var es []*element 71 | 72 | pg, err := c.Get(-1) 73 | if err != nil { 74 | return err 75 | } 76 | defer c.Release(pg) 77 | defer pg.Sync() 78 | defer par.Sync() 79 | for i, j := 0, len(s.es); i < j; i++ { 80 | switch { 81 | case len(s.es[i].suff) == 1: 82 | s.free += ElementHeaderSize + 1 83 | binary.LittleEndian.PutUint64(pg.Buffer()[2048+int(s.es[i].suff[0])*8:], s.es[i].off) 84 | vs = append(vs, s.es[i].off) 85 | os = append(os, uint16(s.es[i].suff[0])) 86 | default: 87 | es = append(es, s.es[i]) 88 | } 89 | } 90 | if len(s.es) != len(es) { 91 | s.es = es 92 | pg.Sync() 93 | s.pg.Sync() 94 | } 95 | binary.LittleEndian.PutUint64(par.Buffer()[int(s.pg.Buffer()[2])*8:], 96 | uint64(pg.PageNumber())|constant.PN< y: 129 | pg, err := c.Get(-1) 130 | if err != nil { 131 | return err 132 | } 133 | defer c.Release(pg) 134 | return s.splitR(e, pg, par) 135 | case x == y && x == e.suff[0]: 136 | 
var os []uint16 137 | var vs []uint64 138 | { 139 | for i, j := int(buf[2]), int(x); i < j; i++ { 140 | os = append(os, uint16(i)) 141 | vs = append(vs, constant.ES< 0 && x < k; n, x, size = rs.cutL() { 301 | s.free -= size 302 | s.es = append(s.es, rs.es[:n]...) 303 | rs.free += size 304 | rs.es = rs.es[n:] 305 | } 306 | s.pg.Buffer()[3] = k - 1 // ls.end = k - 1 307 | pg.Buffer()[2], pg.Buffer()[3] = k, k // rs.start = rs.end = k 308 | { // exchange 309 | s.pg, rs.pg = rs.pg, s.pg 310 | s.es, rs.es = rs.es, s.es 311 | s.free, rs.free = rs.free, s.free 312 | } 313 | s.reduce(par) 314 | rs.reduce(par) 315 | s.writeBack() 316 | rs.writeBack() 317 | return flush(s.w, []*suffix{s, rs}, par, os, vs) 318 | } 319 | 320 | func (s *suffix) splitRR(k byte, pg, par cache.Page, os []uint16, vs []uint64) error { 321 | rs := &suffix{ 322 | pg: pg, 323 | es: []*element{}, 324 | free: constant.BlockSize - HeaderSize, 325 | } 326 | pg.Buffer()[2], pg.Buffer()[3] = k+1, s.pg.Buffer()[3] // rs.start = k + 1, rs.end = s.end 327 | s.pg.Buffer()[2], s.pg.Buffer()[3] = k, k // s.start = s.end = k 328 | for n, x, size := s.cutR(); n > 0 && x > k; n, x, size = s.cutR() { 329 | rs.free -= size 330 | for i, j := len(s.es)-n, len(s.es); i < j; i++ { 331 | rs.es = push(s.es[i], rs.es) 332 | } 333 | s.free += size 334 | s.es = s.es[:len(s.es)-n] 335 | } 336 | s.reduce(par) 337 | rs.reduce(par) 338 | s.writeBack() 339 | rs.writeBack() 340 | return flush(s.w, []*suffix{s, rs}, par, os, vs) 341 | } 342 | 343 | // s ms rs 344 | func (s *suffix) splitM(e *element, mpg, rpg, par cache.Page) (*suffix, error) { 345 | rs := s.split(e.suff[0], rpg) 346 | ms := &suffix{ 347 | pg: mpg, 348 | es: []*element{}, 349 | free: constant.BlockSize - HeaderSize, 350 | } 351 | mpg.Buffer()[2], mpg.Buffer()[3] = e.suff[0], e.suff[0] 352 | if n, x, size := s.cutR(); x == e.suff[0] { 353 | ms.free -= size 354 | for i, j := len(s.es)-n, len(s.es); i < j; i++ { 355 | ms.es = push(s.es[i], ms.es) 356 | } 357 | s.free += 
size 358 | s.es = s.es[:len(s.es)-n] 359 | } 360 | s.reduce(par) 361 | ms.reduce(par) 362 | rs.reduce(par) 363 | s.writeBack() 364 | ms.writeBack() 365 | rs.writeBack() 366 | return ms, flush(s.w, []*suffix{s, ms, rs}, par, []uint16{}, []uint64{}) 367 | } 368 | 369 | func (s *suffix) cutL() (int, byte, int) { 370 | switch len(s.es) { 371 | case 0: 372 | return 0, 0, 0 373 | case 1: 374 | return 1, s.es[0].suff[0], len(s.es[0].suff) + ElementHeaderSize 375 | default: 376 | k := s.es[0].suff[0] 377 | size := len(s.es[0].suff) + ElementHeaderSize 378 | for i, j := 1, len(s.es); i < j; i++ { 379 | if k != s.es[i].suff[0] { 380 | return i, k, size 381 | } 382 | size += len(s.es[i].suff) + ElementHeaderSize 383 | } 384 | return len(s.es), k, size 385 | } 386 | } 387 | 388 | func (s *suffix) cutR() (int, byte, int) { 389 | switch len(s.es) { 390 | case 0: 391 | return 0, 0, 0 392 | case 1: 393 | return 1, s.es[0].suff[0], len(s.es[0].suff) + ElementHeaderSize 394 | default: 395 | n := len(s.es) 396 | k := s.es[n-1].suff[0] 397 | size := len(s.es[n-1].suff) + ElementHeaderSize 398 | for i := n - 2; i >= 0; i-- { 399 | if k != s.es[i].suff[0] { 400 | return n - i - 1, k, size 401 | } 402 | size += len(s.es[i].suff) + ElementHeaderSize 403 | } 404 | return len(s.es), k, size 405 | } 406 | } 407 | 408 | // s - k - rs 409 | func (s *suffix) split(k byte, pg cache.Page) *suffix { 410 | rs := &suffix{ 411 | pg: pg, 412 | es: []*element{}, 413 | free: constant.BlockSize - HeaderSize, 414 | } 415 | pg.Buffer()[2], pg.Buffer()[3] = k+1, s.pg.Buffer()[3] 416 | s.pg.Buffer()[3] = k - 1 417 | for n, x, size := s.cutR(); n > 0 && x > k; n, x, size = s.cutR() { 418 | s.free += size 419 | rs.free -= size 420 | for i, j := len(s.es)-n, len(s.es); i < j; i++ { 421 | rs.es = push(s.es[i], rs.es) 422 | } 423 | s.es = s.es[:len(s.es)-n] 424 | } 425 | return rs 426 | } 427 | 428 | func (s *suffix) reduce(pg cache.Page) { 429 | if s.pg.Buffer()[2] == s.pg.Buffer()[3] { 430 | for i, j := 0, 
len(s.es); i < j; i++ { 431 | s.free++ 432 | s.es[i].suff = s.es[i].suff[1:] 433 | } 434 | } 435 | } 436 | 437 | func dup(xs []byte) []byte { 438 | return append([]byte{}, xs...) 439 | } 440 | 441 | func push(e *element, es []*element) []*element { 442 | i := sort.Search(len(es), func(i int) bool { return bytes.Compare(es[i].suff, e.suff) >= 0 }) 443 | es = append(es, &element{}) 444 | copy(es[i+1:], es[i:]) 445 | es[i] = e 446 | return es 447 | } 448 | 449 | func find(k []byte, buf []byte) int { 450 | o := HeaderSize 451 | for i, j := 0, int(binary.LittleEndian.Uint16(buf)); i < j; i++ { 452 | n := int(binary.LittleEndian.Uint16(buf[o:])) 453 | if bytes.Compare(k, buf[ElementHeaderSize+o:ElementHeaderSize+n+o]) == 0 { 454 | return o + 2 455 | } 456 | o += ElementHeaderSize + n 457 | } 458 | return 0 459 | } 460 | 461 | func load(w Writer, pg cache.Page, update bool) *suffix { 462 | var s suffix 463 | 464 | s.w = w 465 | s.pg = pg 466 | o := HeaderSize 467 | buf := pg.Buffer() 468 | s.es = make([]*element, int(binary.LittleEndian.Uint16(buf))) 469 | for i, j := 0, int(binary.LittleEndian.Uint16(buf)); i < j; i++ { 470 | n := int(binary.LittleEndian.Uint16(buf[o:])) 471 | s.es[i] = new(element) 472 | s.es[i].off = binary.LittleEndian.Uint64(buf[o+2:]) 473 | if update { 474 | s.es[i].suff = dup(buf[ElementHeaderSize+o : ElementHeaderSize+n+o]) 475 | } else { 476 | s.es[i].suff = buf[ElementHeaderSize+o : ElementHeaderSize+n+o] 477 | } 478 | o += ElementHeaderSize + n 479 | } 480 | s.free = constant.BlockSize - o 481 | return &s 482 | } 483 | 484 | func flush(w Writer, xs []*suffix, pg cache.Page, os []uint16, vs []uint64) error { 485 | for _, s := range xs { 486 | start, end := s.pg.Buffer()[2], s.pg.Buffer()[3] 487 | pn := uint64(s.pg.PageNumber()) 488 | switch { 489 | case start == end: 490 | pn |= constant.SN << constant.TypeOff 491 | default: 492 | pn |= constant.MS << constant.TypeOff 493 | } 494 | for i, j := int(start), int(end)+1; i < j; i++ { 495 | vs = 
append(vs, pn) 496 | os = append(os, uint16(i)) 497 | binary.LittleEndian.PutUint64(pg.Buffer()[i*8:], pn) 498 | } 499 | } 500 | if len(os) > 0 { 501 | if err := w.ChgPrefix(uint64(pg.PageNumber()), os, vs); err != nil { 502 | return err 503 | } 504 | } 505 | pg.Sync() 506 | return nil 507 | } 508 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU GENERAL PUBLIC LICENSE 2 | Version 3, 29 June 2007 3 | 4 | Copyright (C) 2007 Free Software Foundation, Inc. 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 7 | 8 | Preamble 9 | 10 | The GNU General Public License is a free, copyleft license for 11 | software and other kinds of works. 12 | 13 | The licenses for most software and other practical works are designed 14 | to take away your freedom to share and change the works. By contrast, 15 | the GNU General Public License is intended to guarantee your freedom to 16 | share and change all versions of a program--to make sure it remains free 17 | software for all its users. We, the Free Software Foundation, use the 18 | GNU General Public License for most of our software; it applies also to 19 | any other work released this way by its authors. You can apply it to 20 | your programs, too. 21 | 22 | When we speak of free software, we are referring to freedom, not 23 | price. Our General Public Licenses are designed to make sure that you 24 | have the freedom to distribute copies of free software (and charge for 25 | them if you wish), that you receive source code or can get it if you 26 | want it, that you can change the software or use pieces of it in new 27 | free programs, and that you know you can do these things. 28 | 29 | To protect your rights, we need to prevent others from denying you 30 | these rights or asking you to surrender the rights. 
Therefore, you have 31 | certain responsibilities if you distribute copies of the software, or if 32 | you modify it: responsibilities to respect the freedom of others. 33 | 34 | For example, if you distribute copies of such a program, whether 35 | gratis or for a fee, you must pass on to the recipients the same 36 | freedoms that you received. You must make sure that they, too, receive 37 | or can get the source code. And you must show them these terms so they 38 | know their rights. 39 | 40 | Developers that use the GNU GPL protect your rights with two steps: 41 | (1) assert copyright on the software, and (2) offer you this License 42 | giving you legal permission to copy, distribute and/or modify it. 43 | 44 | For the developers' and authors' protection, the GPL clearly explains 45 | that there is no warranty for this free software. For both users' and 46 | authors' sake, the GPL requires that modified versions be marked as 47 | changed, so that their problems will not be attributed erroneously to 48 | authors of previous versions. 49 | 50 | Some devices are designed to deny users access to install or run 51 | modified versions of the software inside them, although the manufacturer 52 | can do so. This is fundamentally incompatible with the aim of 53 | protecting users' freedom to change the software. The systematic 54 | pattern of such abuse occurs in the area of products for individuals to 55 | use, which is precisely where it is most unacceptable. Therefore, we 56 | have designed this version of the GPL to prohibit the practice for those 57 | products. If such problems arise substantially in other domains, we 58 | stand ready to extend this provision to those domains in future versions 59 | of the GPL, as needed to protect the freedom of users. 60 | 61 | Finally, every program is threatened constantly by software patents. 
62 | States should not allow patents to restrict development and use of 63 | software on general-purpose computers, but in those that do, we wish to 64 | avoid the special danger that patents applied to a free program could 65 | make it effectively proprietary. To prevent this, the GPL assures that 66 | patents cannot be used to render the program non-free. 67 | 68 | The precise terms and conditions for copying, distribution and 69 | modification follow. 70 | 71 | TERMS AND CONDITIONS 72 | 73 | 0. Definitions. 74 | 75 | "This License" refers to version 3 of the GNU General Public License. 76 | 77 | "Copyright" also means copyright-like laws that apply to other kinds of 78 | works, such as semiconductor masks. 79 | 80 | "The Program" refers to any copyrightable work licensed under this 81 | License. Each licensee is addressed as "you". "Licensees" and 82 | "recipients" may be individuals or organizations. 83 | 84 | To "modify" a work means to copy from or adapt all or part of the work 85 | in a fashion requiring copyright permission, other than the making of an 86 | exact copy. The resulting work is called a "modified version" of the 87 | earlier work or a work "based on" the earlier work. 88 | 89 | A "covered work" means either the unmodified Program or a work based 90 | on the Program. 91 | 92 | To "propagate" a work means to do anything with it that, without 93 | permission, would make you directly or secondarily liable for 94 | infringement under applicable copyright law, except executing it on a 95 | computer or modifying a private copy. Propagation includes copying, 96 | distribution (with or without modification), making available to the 97 | public, and in some countries other activities as well. 98 | 99 | To "convey" a work means any kind of propagation that enables other 100 | parties to make or receive copies. Mere interaction with a user through 101 | a computer network, with no transfer of a copy, is not conveying. 
102 | 103 | An interactive user interface displays "Appropriate Legal Notices" 104 | to the extent that it includes a convenient and prominently visible 105 | feature that (1) displays an appropriate copyright notice, and (2) 106 | tells the user that there is no warranty for the work (except to the 107 | extent that warranties are provided), that licensees may convey the 108 | work under this License, and how to view a copy of this License. If 109 | the interface presents a list of user commands or options, such as a 110 | menu, a prominent item in the list meets this criterion. 111 | 112 | 1. Source Code. 113 | 114 | The "source code" for a work means the preferred form of the work 115 | for making modifications to it. "Object code" means any non-source 116 | form of a work. 117 | 118 | A "Standard Interface" means an interface that either is an official 119 | standard defined by a recognized standards body, or, in the case of 120 | interfaces specified for a particular programming language, one that 121 | is widely used among developers working in that language. 122 | 123 | The "System Libraries" of an executable work include anything, other 124 | than the work as a whole, that (a) is included in the normal form of 125 | packaging a Major Component, but which is not part of that Major 126 | Component, and (b) serves only to enable use of the work with that 127 | Major Component, or to implement a Standard Interface for which an 128 | implementation is available to the public in source code form. A 129 | "Major Component", in this context, means a major essential component 130 | (kernel, window system, and so on) of the specific operating system 131 | (if any) on which the executable work runs, or a compiler used to 132 | produce the work, or an object code interpreter used to run it. 
133 | 134 | The "Corresponding Source" for a work in object code form means all 135 | the source code needed to generate, install, and (for an executable 136 | work) run the object code and to modify the work, including scripts to 137 | control those activities. However, it does not include the work's 138 | System Libraries, or general-purpose tools or generally available free 139 | programs which are used unmodified in performing those activities but 140 | which are not part of the work. For example, Corresponding Source 141 | includes interface definition files associated with source files for 142 | the work, and the source code for shared libraries and dynamically 143 | linked subprograms that the work is specifically designed to require, 144 | such as by intimate data communication or control flow between those 145 | subprograms and other parts of the work. 146 | 147 | The Corresponding Source need not include anything that users 148 | can regenerate automatically from other parts of the Corresponding 149 | Source. 150 | 151 | The Corresponding Source for a work in source code form is that 152 | same work. 153 | 154 | 2. Basic Permissions. 155 | 156 | All rights granted under this License are granted for the term of 157 | copyright on the Program, and are irrevocable provided the stated 158 | conditions are met. This License explicitly affirms your unlimited 159 | permission to run the unmodified Program. The output from running a 160 | covered work is covered by this License only if the output, given its 161 | content, constitutes a covered work. This License acknowledges your 162 | rights of fair use or other equivalent, as provided by copyright law. 163 | 164 | You may make, run and propagate covered works that you do not 165 | convey, without conditions so long as your license otherwise remains 166 | in force. 
You may convey covered works to others for the sole purpose 167 | of having them make modifications exclusively for you, or provide you 168 | with facilities for running those works, provided that you comply with 169 | the terms of this License in conveying all material for which you do 170 | not control copyright. Those thus making or running the covered works 171 | for you must do so exclusively on your behalf, under your direction 172 | and control, on terms that prohibit them from making any copies of 173 | your copyrighted material outside their relationship with you. 174 | 175 | Conveying under any other circumstances is permitted solely under 176 | the conditions stated below. Sublicensing is not allowed; section 10 177 | makes it unnecessary. 178 | 179 | 3. Protecting Users' Legal Rights From Anti-Circumvention Law. 180 | 181 | No covered work shall be deemed part of an effective technological 182 | measure under any applicable law fulfilling obligations under article 183 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or 184 | similar laws prohibiting or restricting circumvention of such 185 | measures. 186 | 187 | When you convey a covered work, you waive any legal power to forbid 188 | circumvention of technological measures to the extent such circumvention 189 | is effected by exercising rights under this License with respect to 190 | the covered work, and you disclaim any intention to limit operation or 191 | modification of the work as a means of enforcing, against the work's 192 | users, your or third parties' legal rights to forbid circumvention of 193 | technological measures. 194 | 195 | 4. Conveying Verbatim Copies. 
196 | 197 | You may convey verbatim copies of the Program's source code as you 198 | receive it, in any medium, provided that you conspicuously and 199 | appropriately publish on each copy an appropriate copyright notice; 200 | keep intact all notices stating that this License and any 201 | non-permissive terms added in accord with section 7 apply to the code; 202 | keep intact all notices of the absence of any warranty; and give all 203 | recipients a copy of this License along with the Program. 204 | 205 | You may charge any price or no price for each copy that you convey, 206 | and you may offer support or warranty protection for a fee. 207 | 208 | 5. Conveying Modified Source Versions. 209 | 210 | You may convey a work based on the Program, or the modifications to 211 | produce it from the Program, in the form of source code under the 212 | terms of section 4, provided that you also meet all of these conditions: 213 | 214 | a) The work must carry prominent notices stating that you modified 215 | it, and giving a relevant date. 216 | 217 | b) The work must carry prominent notices stating that it is 218 | released under this License and any conditions added under section 219 | 7. This requirement modifies the requirement in section 4 to 220 | "keep intact all notices". 221 | 222 | c) You must license the entire work, as a whole, under this 223 | License to anyone who comes into possession of a copy. This 224 | License will therefore apply, along with any applicable section 7 225 | additional terms, to the whole of the work, and all its parts, 226 | regardless of how they are packaged. This License gives no 227 | permission to license the work in any other way, but it does not 228 | invalidate such permission if you have separately received it. 
229 | 230 | d) If the work has interactive user interfaces, each must display 231 | Appropriate Legal Notices; however, if the Program has interactive 232 | interfaces that do not display Appropriate Legal Notices, your 233 | work need not make them do so. 234 | 235 | A compilation of a covered work with other separate and independent 236 | works, which are not by their nature extensions of the covered work, 237 | and which are not combined with it such as to form a larger program, 238 | in or on a volume of a storage or distribution medium, is called an 239 | "aggregate" if the compilation and its resulting copyright are not 240 | used to limit the access or legal rights of the compilation's users 241 | beyond what the individual works permit. Inclusion of a covered work 242 | in an aggregate does not cause this License to apply to the other 243 | parts of the aggregate. 244 | 245 | 6. Conveying Non-Source Forms. 246 | 247 | You may convey a covered work in object code form under the terms 248 | of sections 4 and 5, provided that you also convey the 249 | machine-readable Corresponding Source under the terms of this License, 250 | in one of these ways: 251 | 252 | a) Convey the object code in, or embodied in, a physical product 253 | (including a physical distribution medium), accompanied by the 254 | Corresponding Source fixed on a durable physical medium 255 | customarily used for software interchange. 
256 | 257 | b) Convey the object code in, or embodied in, a physical product 258 | (including a physical distribution medium), accompanied by a 259 | written offer, valid for at least three years and valid for as 260 | long as you offer spare parts or customer support for that product 261 | model, to give anyone who possesses the object code either (1) a 262 | copy of the Corresponding Source for all the software in the 263 | product that is covered by this License, on a durable physical 264 | medium customarily used for software interchange, for a price no 265 | more than your reasonable cost of physically performing this 266 | conveying of source, or (2) access to copy the 267 | Corresponding Source from a network server at no charge. 268 | 269 | c) Convey individual copies of the object code with a copy of the 270 | written offer to provide the Corresponding Source. This 271 | alternative is allowed only occasionally and noncommercially, and 272 | only if you received the object code with such an offer, in accord 273 | with subsection 6b. 274 | 275 | d) Convey the object code by offering access from a designated 276 | place (gratis or for a charge), and offer equivalent access to the 277 | Corresponding Source in the same way through the same place at no 278 | further charge. You need not require recipients to copy the 279 | Corresponding Source along with the object code. If the place to 280 | copy the object code is a network server, the Corresponding Source 281 | may be on a different server (operated by you or a third party) 282 | that supports equivalent copying facilities, provided you maintain 283 | clear directions next to the object code saying where to find the 284 | Corresponding Source. Regardless of what server hosts the 285 | Corresponding Source, you remain obligated to ensure that it is 286 | available for as long as needed to satisfy these requirements. 
287 | 288 | e) Convey the object code using peer-to-peer transmission, provided 289 | you inform other peers where the object code and Corresponding 290 | Source of the work are being offered to the general public at no 291 | charge under subsection 6d. 292 | 293 | A separable portion of the object code, whose source code is excluded 294 | from the Corresponding Source as a System Library, need not be 295 | included in conveying the object code work. 296 | 297 | A "User Product" is either (1) a "consumer product", which means any 298 | tangible personal property which is normally used for personal, family, 299 | or household purposes, or (2) anything designed or sold for incorporation 300 | into a dwelling. In determining whether a product is a consumer product, 301 | doubtful cases shall be resolved in favor of coverage. For a particular 302 | product received by a particular user, "normally used" refers to a 303 | typical or common use of that class of product, regardless of the status 304 | of the particular user or of the way in which the particular user 305 | actually uses, or expects or is expected to use, the product. A product 306 | is a consumer product regardless of whether the product has substantial 307 | commercial, industrial or non-consumer uses, unless such uses represent 308 | the only significant mode of use of the product. 309 | 310 | "Installation Information" for a User Product means any methods, 311 | procedures, authorization keys, or other information required to install 312 | and execute modified versions of a covered work in that User Product from 313 | a modified version of its Corresponding Source. The information must 314 | suffice to ensure that the continued functioning of the modified object 315 | code is in no case prevented or interfered with solely because 316 | modification has been made. 
317 | 318 | If you convey an object code work under this section in, or with, or 319 | specifically for use in, a User Product, and the conveying occurs as 320 | part of a transaction in which the right of possession and use of the 321 | User Product is transferred to the recipient in perpetuity or for a 322 | fixed term (regardless of how the transaction is characterized), the 323 | Corresponding Source conveyed under this section must be accompanied 324 | by the Installation Information. But this requirement does not apply 325 | if neither you nor any third party retains the ability to install 326 | modified object code on the User Product (for example, the work has 327 | been installed in ROM). 328 | 329 | The requirement to provide Installation Information does not include a 330 | requirement to continue to provide support service, warranty, or updates 331 | for a work that has been modified or installed by the recipient, or for 332 | the User Product in which it has been modified or installed. Access to a 333 | network may be denied when the modification itself materially and 334 | adversely affects the operation of the network or violates the rules and 335 | protocols for communication across the network. 336 | 337 | Corresponding Source conveyed, and Installation Information provided, 338 | in accord with this section must be in a format that is publicly 339 | documented (and with an implementation available to the public in 340 | source code form), and must require no special password or key for 341 | unpacking, reading or copying. 342 | 343 | 7. Additional Terms. 344 | 345 | "Additional permissions" are terms that supplement the terms of this 346 | License by making exceptions from one or more of its conditions. 347 | Additional permissions that are applicable to the entire Program shall 348 | be treated as though they were included in this License, to the extent 349 | that they are valid under applicable law. 
If additional permissions 350 | apply only to part of the Program, that part may be used separately 351 | under those permissions, but the entire Program remains governed by 352 | this License without regard to the additional permissions. 353 | 354 | When you convey a copy of a covered work, you may at your option 355 | remove any additional permissions from that copy, or from any part of 356 | it. (Additional permissions may be written to require their own 357 | removal in certain cases when you modify the work.) You may place 358 | additional permissions on material, added by you to a covered work, 359 | for which you have or can give appropriate copyright permission. 360 | 361 | Notwithstanding any other provision of this License, for material you 362 | add to a covered work, you may (if authorized by the copyright holders of 363 | that material) supplement the terms of this License with terms: 364 | 365 | a) Disclaiming warranty or limiting liability differently from the 366 | terms of sections 15 and 16 of this License; or 367 | 368 | b) Requiring preservation of specified reasonable legal notices or 369 | author attributions in that material or in the Appropriate Legal 370 | Notices displayed by works containing it; or 371 | 372 | c) Prohibiting misrepresentation of the origin of that material, or 373 | requiring that modified versions of such material be marked in 374 | reasonable ways as different from the original version; or 375 | 376 | d) Limiting the use for publicity purposes of names of licensors or 377 | authors of the material; or 378 | 379 | e) Declining to grant rights under trademark law for use of some 380 | trade names, trademarks, or service marks; or 381 | 382 | f) Requiring indemnification of licensors and authors of that 383 | material by anyone who conveys the material (or modified versions of 384 | it) with contractual assumptions of liability to the recipient, for 385 | any liability that these contractual assumptions directly impose on 
386 | those licensors and authors. 387 | 388 | All other non-permissive additional terms are considered "further 389 | restrictions" within the meaning of section 10. If the Program as you 390 | received it, or any part of it, contains a notice stating that it is 391 | governed by this License along with a term that is a further 392 | restriction, you may remove that term. If a license document contains 393 | a further restriction but permits relicensing or conveying under this 394 | License, you may add to a covered work material governed by the terms 395 | of that license document, provided that the further restriction does 396 | not survive such relicensing or conveying. 397 | 398 | If you add terms to a covered work in accord with this section, you 399 | must place, in the relevant source files, a statement of the 400 | additional terms that apply to those files, or a notice indicating 401 | where to find the applicable terms. 402 | 403 | Additional terms, permissive or non-permissive, may be stated in the 404 | form of a separately written license, or stated as exceptions; 405 | the above requirements apply either way. 406 | 407 | 8. Termination. 408 | 409 | You may not propagate or modify a covered work except as expressly 410 | provided under this License. Any attempt otherwise to propagate or 411 | modify it is void, and will automatically terminate your rights under 412 | this License (including any patent licenses granted under the third 413 | paragraph of section 11). 414 | 415 | However, if you cease all violation of this License, then your 416 | license from a particular copyright holder is reinstated (a) 417 | provisionally, unless and until the copyright holder explicitly and 418 | finally terminates your license, and (b) permanently, if the copyright 419 | holder fails to notify you of the violation by some reasonable means 420 | prior to 60 days after the cessation. 
421 | 422 | Moreover, your license from a particular copyright holder is 423 | reinstated permanently if the copyright holder notifies you of the 424 | violation by some reasonable means, this is the first time you have 425 | received notice of violation of this License (for any work) from that 426 | copyright holder, and you cure the violation prior to 30 days after 427 | your receipt of the notice. 428 | 429 | Termination of your rights under this section does not terminate the 430 | licenses of parties who have received copies or rights from you under 431 | this License. If your rights have been terminated and not permanently 432 | reinstated, you do not qualify to receive new licenses for the same 433 | material under section 10. 434 | 435 | 9. Acceptance Not Required for Having Copies. 436 | 437 | You are not required to accept this License in order to receive or 438 | run a copy of the Program. Ancillary propagation of a covered work 439 | occurring solely as a consequence of using peer-to-peer transmission 440 | to receive a copy likewise does not require acceptance. However, 441 | nothing other than this License grants you permission to propagate or 442 | modify any covered work. These actions infringe copyright if you do 443 | not accept this License. Therefore, by modifying or propagating a 444 | covered work, you indicate your acceptance of this License to do so. 445 | 446 | 10. Automatic Licensing of Downstream Recipients. 447 | 448 | Each time you convey a covered work, the recipient automatically 449 | receives a license from the original licensors, to run, modify and 450 | propagate that work, subject to this License. You are not responsible 451 | for enforcing compliance by third parties with this License. 452 | 453 | An "entity transaction" is a transaction transferring control of an 454 | organization, or substantially all assets of one, or subdividing an 455 | organization, or merging organizations. 
If propagation of a covered 456 | work results from an entity transaction, each party to that 457 | transaction who receives a copy of the work also receives whatever 458 | licenses to the work the party's predecessor in interest had or could 459 | give under the previous paragraph, plus a right to possession of the 460 | Corresponding Source of the work from the predecessor in interest, if 461 | the predecessor has it or can get it with reasonable efforts. 462 | 463 | You may not impose any further restrictions on the exercise of the 464 | rights granted or affirmed under this License. For example, you may 465 | not impose a license fee, royalty, or other charge for exercise of 466 | rights granted under this License, and you may not initiate litigation 467 | (including a cross-claim or counterclaim in a lawsuit) alleging that 468 | any patent claim is infringed by making, using, selling, offering for 469 | sale, or importing the Program or any portion of it. 470 | 471 | 11. Patents. 472 | 473 | A "contributor" is a copyright holder who authorizes use under this 474 | License of the Program or a work on which the Program is based. The 475 | work thus licensed is called the contributor's "contributor version". 476 | 477 | A contributor's "essential patent claims" are all patent claims 478 | owned or controlled by the contributor, whether already acquired or 479 | hereafter acquired, that would be infringed by some manner, permitted 480 | by this License, of making, using, or selling its contributor version, 481 | but do not include claims that would be infringed only as a 482 | consequence of further modification of the contributor version. For 483 | purposes of this definition, "control" includes the right to grant 484 | patent sublicenses in a manner consistent with the requirements of 485 | this License. 
486 | 487 | Each contributor grants you a non-exclusive, worldwide, royalty-free 488 | patent license under the contributor's essential patent claims, to 489 | make, use, sell, offer for sale, import and otherwise run, modify and 490 | propagate the contents of its contributor version. 491 | 492 | In the following three paragraphs, a "patent license" is any express 493 | agreement or commitment, however denominated, not to enforce a patent 494 | (such as an express permission to practice a patent or covenant not to 495 | sue for patent infringement). To "grant" such a patent license to a 496 | party means to make such an agreement or commitment not to enforce a 497 | patent against the party. 498 | 499 | If you convey a covered work, knowingly relying on a patent license, 500 | and the Corresponding Source of the work is not available for anyone 501 | to copy, free of charge and under the terms of this License, through a 502 | publicly available network server or other readily accessible means, 503 | then you must either (1) cause the Corresponding Source to be so 504 | available, or (2) arrange to deprive yourself of the benefit of the 505 | patent license for this particular work, or (3) arrange, in a manner 506 | consistent with the requirements of this License, to extend the patent 507 | license to downstream recipients. "Knowingly relying" means you have 508 | actual knowledge that, but for the patent license, your conveying the 509 | covered work in a country, or your recipient's use of the covered work 510 | in a country, would infringe one or more identifiable patents in that 511 | country that you have reason to believe are valid. 
512 | 513 | If, pursuant to or in connection with a single transaction or 514 | arrangement, you convey, or propagate by procuring conveyance of, a 515 | covered work, and grant a patent license to some of the parties 516 | receiving the covered work authorizing them to use, propagate, modify 517 | or convey a specific copy of the covered work, then the patent license 518 | you grant is automatically extended to all recipients of the covered 519 | work and works based on it. 520 | 521 | A patent license is "discriminatory" if it does not include within 522 | the scope of its coverage, prohibits the exercise of, or is 523 | conditioned on the non-exercise of one or more of the rights that are 524 | specifically granted under this License. You may not convey a covered 525 | work if you are a party to an arrangement with a third party that is 526 | in the business of distributing software, under which you make payment 527 | to the third party based on the extent of your activity of conveying 528 | the work, and under which the third party grants, to any of the 529 | parties who would receive the covered work from you, a discriminatory 530 | patent license (a) in connection with copies of the covered work 531 | conveyed by you (or copies made from those copies), or (b) primarily 532 | for and in connection with specific products or compilations that 533 | contain the covered work, unless you entered into that arrangement, 534 | or that patent license was granted, prior to 28 March 2007. 535 | 536 | Nothing in this License shall be construed as excluding or limiting 537 | any implied license or other defenses to infringement that may 538 | otherwise be available to you under applicable patent law. 539 | 540 | 12. No Surrender of Others' Freedom. 541 | 542 | If conditions are imposed on you (whether by court order, agreement or 543 | otherwise) that contradict the conditions of this License, they do not 544 | excuse you from the conditions of this License. 
If you cannot convey a 545 | covered work so as to satisfy simultaneously your obligations under this 546 | License and any other pertinent obligations, then as a consequence you may 547 | not convey it at all. For example, if you agree to terms that obligate you 548 | to collect a royalty for further conveying from those to whom you convey 549 | the Program, the only way you could satisfy both those terms and this 550 | License would be to refrain entirely from conveying the Program. 551 | 552 | 13. Use with the GNU Affero General Public License. 553 | 554 | Notwithstanding any other provision of this License, you have 555 | permission to link or combine any covered work with a work licensed 556 | under version 3 of the GNU Affero General Public License into a single 557 | combined work, and to convey the resulting work. The terms of this 558 | License will continue to apply to the part which is the covered work, 559 | but the special requirements of the GNU Affero General Public License, 560 | section 13, concerning interaction through a network will apply to the 561 | combination as such. 562 | 563 | 14. Revised Versions of this License. 564 | 565 | The Free Software Foundation may publish revised and/or new versions of 566 | the GNU General Public License from time to time. Such new versions will 567 | be similar in spirit to the present version, but may differ in detail to 568 | address new problems or concerns. 569 | 570 | Each version is given a distinguishing version number. If the 571 | Program specifies that a certain numbered version of the GNU General 572 | Public License "or any later version" applies to it, you have the 573 | option of following the terms and conditions either of that numbered 574 | version or of any later version published by the Free Software 575 | Foundation. If the Program does not specify a version number of the 576 | GNU General Public License, you may choose any version ever published 577 | by the Free Software Foundation. 
578 | 579 | If the Program specifies that a proxy can decide which future 580 | versions of the GNU General Public License can be used, that proxy's 581 | public statement of acceptance of a version permanently authorizes you 582 | to choose that version for the Program. 583 | 584 | Later license versions may give you additional or different 585 | permissions. However, no additional obligations are imposed on any 586 | author or copyright holder as a result of your choosing to follow a 587 | later version. 588 | 589 | 15. Disclaimer of Warranty. 590 | 591 | THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY 592 | APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT 593 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY 594 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, 595 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 596 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM 597 | IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF 598 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 599 | 600 | 16. Limitation of Liability. 601 | 602 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 603 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS 604 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY 605 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE 606 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF 607 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD 608 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), 609 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF 610 | SUCH DAMAGES. 611 | 612 | 17. Interpretation of Sections 15 and 16. 
613 | 614 | If the disclaimer of warranty and limitation of liability provided 615 | above cannot be given local legal effect according to their terms, 616 | reviewing courts shall apply local law that most closely approximates 617 | an absolute waiver of all civil liability in connection with the 618 | Program, unless a warranty or assumption of liability accompanies a 619 | copy of the Program in return for a fee. 620 | 621 | END OF TERMS AND CONDITIONS 622 | 623 | How to Apply These Terms to Your New Programs 624 | 625 | If you develop a new program, and you want it to be of the greatest 626 | possible use to the public, the best way to achieve this is to make it 627 | free software which everyone can redistribute and change under these terms. 628 | 629 | To do so, attach the following notices to the program. It is safest 630 | to attach them to the start of each source file to most effectively 631 | state the exclusion of warranty; and each file should have at least 632 | the "copyright" line and a pointer to where the full notice is found. 633 | 634 | <one line to give the program's name and a brief idea of what it does.> 635 | Copyright (C) <year> <name of author> 636 | 637 | This program is free software: you can redistribute it and/or modify 638 | it under the terms of the GNU General Public License as published by 639 | the Free Software Foundation, either version 3 of the License, or 640 | (at your option) any later version. 641 | 642 | This program is distributed in the hope that it will be useful, 643 | but WITHOUT ANY WARRANTY; without even the implied warranty of 644 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 645 | GNU General Public License for more details. 646 | 647 | You should have received a copy of the GNU General Public License 648 | along with this program. If not, see <https://www.gnu.org/licenses/>. 649 | 650 | Also add information on how to contact you by electronic and paper mail.
651 | 652 | If the program does terminal interaction, make it output a short 653 | notice like this when it starts in an interactive mode: 654 | 655 | <program> Copyright (C) <year> <name of author> 656 | This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 657 | This is free software, and you are welcome to redistribute it 658 | under certain conditions; type `show c' for details. 659 | 660 | The hypothetical commands `show w' and `show c' should show the appropriate 661 | parts of the General Public License. Of course, your program's commands 662 | might be different; for a GUI interface, you would use an "about box". 663 | 664 | You should also get your employer (if you work as a programmer) or school, 665 | if any, to sign a "copyright disclaimer" for the program, if necessary. 666 | For more information on this, and how to apply and follow the GNU GPL, see 667 | <https://www.gnu.org/licenses/>. 668 | 669 | The GNU General Public License does not permit incorporating your program 670 | into proprietary programs. If your program is a subroutine library, you 671 | may consider it more useful to permit linking proprietary applications with 672 | the library. If this is what you want to do, use the GNU Lesser General 673 | Public License instead of this License. But first, please read 674 | <https://www.gnu.org/licenses/why-not-lgpl.html>. 675 | --------------------------------------------------------------------------------