├── .travis.yml
├── mm
│   ├── build.go
│   ├── malloc_test.go
│   ├── malloc.h
│   ├── malloc.go
│   └── malloc.c
├── hole_punch.go
├── hole_punch_linux.go
├── skiplist
│   ├── merger_test.go
│   ├── item.go
│   ├── batch_ops.go
│   ├── C
│   │   ├── main.cc
│   │   └── skiplist.hh
│   ├── merger.go
│   ├── node.go
│   ├── builder.go
│   ├── batch_ops_test.go
│   ├── stats.go
│   ├── node_amd64.go
│   ├── iterator.go
│   ├── skiplist_test.go
│   ├── node_alloc_amd64.go
│   ├── access_barrier.go
│   └── skiplist.go
├── util.go
├── nodelist_test.go
├── block.go
├── nodelist.go
├── supernitro
│   ├── merger.go
│   ├── supernitro.go
│   └── supernitro_test.go
├── README.md
├── file.go
├── item.go
├── block_manager.go
├── iterator.go
├── batch.go
├── nodetable
│   ├── table.go
│   └── table_test.go
├── LICENSE
├── nitro_test.go
└── nitro.go

/.travis.yml:
--------------------------------------------------------------------------------
1 | sudo: false
2 | language: go
3 | go:
4 | - 1.6
5 | 
6 | before_install:
7 | - go get github.com/axw/gocov/gocov
8 | - go get github.com/mattn/goveralls
9 | - go get golang.org/x/tools/cmd/cover
10 | 
11 | script:
12 | - go get ./...
13 | - go test -v ./...
14 | - $HOME/gopath/bin/goveralls -service=travis-ci
15 | 
16 | notifications:
17 |   email: false
18 | 
--------------------------------------------------------------------------------
/mm/build.go:
--------------------------------------------------------------------------------
1 | // +build jemalloc
2 | 
3 | // Copyright (c) 2016 Couchbase, Inc.
4 | // Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
5 | // except in compliance with the License. You may obtain a copy of the License at
6 | // http://www.apache.org/licenses/LICENSE-2.0
7 | // Unless required by applicable law or agreed to in writing, software distributed under the
8 | // License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
9 | // either express or implied. See the License for the specific language governing permissions
10 | // and limitations under the License.
11 | 
12 | package mm
13 | 
14 | // #cgo CFLAGS: -DJEMALLOC=1
15 | // #cgo LDFLAGS: -ljemalloc
16 | import "C"
17 | 
--------------------------------------------------------------------------------
/mm/malloc_test.go:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2016 Couchbase, Inc.
2 | // Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
3 | // except in compliance with the License. You may obtain a copy of the License at
4 | // http://www.apache.org/licenses/LICENSE-2.0
5 | // Unless required by applicable law or agreed to in writing, software distributed under the
6 | // License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
7 | // either express or implied. See the License for the specific language governing permissions
8 | // and limitations under the License.
9 | 
10 | package mm
11 | 
12 | import (
13 |     "fmt"
14 |     "testing"
15 | )
16 | 
17 | func TestMalloc(t *testing.T) {
18 |     Malloc(100 * 1024 * 1024)
19 |     fmt.Println("size:", Size())
20 |     fmt.Println(Stats())
21 | }
22 | 
--------------------------------------------------------------------------------
/hole_punch.go:
--------------------------------------------------------------------------------
1 | // +build !linux
2 | 
3 | // Copyright (c) 2016 Couchbase, Inc.
4 | // Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
5 | // except in compliance with the License. You may obtain a copy of the License at
6 | // http://www.apache.org/licenses/LICENSE-2.0
7 | // Unless required by applicable law or agreed to in writing, software distributed under the
8 | // License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
9 | // either express or implied. See the License for the specific language governing permissions
10 | // and limitations under the License.
11 | 
12 | package nitro
13 | 
14 | import "os"
15 | 
16 | // Dummy function to avoid compilation failure
17 | func punchHole(f *os.File, offset, size int64) error {
18 |     return nil
19 | }
20 | 
21 | func mmapPunchHole([]byte) error {
22 |     return nil
23 | }
24 | 
--------------------------------------------------------------------------------
/mm/malloc.h:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2016 Couchbase, Inc.
2 | // Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
3 | // except in compliance with the License. You may obtain a copy of the License at
4 | // http://www.apache.org/licenses/LICENSE-2.0
5 | // Unless required by applicable law or agreed to in writing, software distributed under the
6 | // License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
7 | // either express or implied. See the License for the specific language governing permissions
8 | // and limitations under the License.
9 | #ifndef MALLOC_MM_H
10 | #define MALLOC_MM_H
11 | 
12 | #include <stdlib.h>
13 | 
14 | typedef struct {
15 |     char *buf;
16 |     int offset;
17 |     int size;
18 | } stats_buf;
19 | 
20 | 
21 | void *mm_malloc(size_t);
22 | 
23 | void mm_free(void *);
24 | 
25 | char *mm_stats();
26 | 
27 | size_t mm_size();
28 | 
29 | int mm_free2os();
30 | 
31 | #endif
32 | 
--------------------------------------------------------------------------------
/hole_punch_linux.go:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2016 Couchbase, Inc.
2 | // Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
3 | // except in compliance with the License. You may obtain a copy of the License at
4 | // http://www.apache.org/licenses/LICENSE-2.0
5 | // Unless required by applicable law or agreed to in writing, software distributed under the
6 | // License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
7 | // either express or implied. See the License for the specific language governing permissions
8 | // and limitations under the License.
9 | 
10 | package nitro
11 | 
12 | import (
13 |     "io/ioutil"
14 |     "os"
15 |     "syscall"
16 | )
17 | 
18 | func init() {
19 |     if f, err := ioutil.TempFile("", "test_holepunch"); err == nil {
20 |         if _, err := f.Write(make([]byte, 4096, 4096)); err == nil {
21 |             if punchHole(f, 0, 4096) == nil {
22 |                 useLinuxHolePunch = true
23 |             }
24 |         }
25 | 
26 |         f.Close()
27 |         os.Remove(f.Name())
28 |     }
29 | }
30 | 
31 | func punchHole(f *os.File, offset, size int64) error {
32 |     return syscall.Fallocate(int(f.Fd()),
33 |         0x02 /*FALLOC_FL_PUNCH_HOLE*/ |0x01 /* FALLOC_FL_KEEP_SIZE */, offset,
34 |         size)
35 | }
36 | 
37 | func mmapPunchHole(b []byte) error {
38 |     return syscall.Madvise(b, syscall.MADV_WILLNEED)
39 | }
40 | 
--------------------------------------------------------------------------------
/skiplist/merger_test.go:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2016 Couchbase, Inc.
2 | // Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
3 | // except in compliance with the License. You may obtain a copy of the License at
4 | // http://www.apache.org/licenses/LICENSE-2.0
5 | // Unless required by applicable law or agreed to in writing, software distributed under the
6 | // License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
7 | // either express or implied. See the License for the specific language governing permissions
8 | // and limitations under the License.
9 | package skiplist
10 | 
11 | import "testing"
12 | import "fmt"
13 | 
14 | func TestMerger(t *testing.T) {
15 |     var lists []*Skiplist
16 |     var iters []*Iterator
17 | 
18 |     s := New()
19 |     cmp := CompareBytes
20 |     buf := s.MakeBuf()
21 |     defer s.FreeBuf(buf)
22 | 
23 |     n := 5
24 | 
25 |     for i := 0; i < n; i++ {
26 |         lists = append(lists, New())
27 |     }
28 | 
29 |     for i := 0; i < 10000; i++ {
30 |         if i >= 1000 && i <= 8000 && i%n == 0 {
31 |             continue
32 |         }
33 |         s := lists[i%n]
34 |         s.Insert(NewByteKeyItem([]byte(fmt.Sprintf("%010d", i))), cmp, buf, &s.Stats)
35 |     }
36 | 
37 |     for i := 0; i < n; i++ {
38 |         buf := s.MakeBuf()
39 |         iters = append(iters, lists[i].NewIterator(cmp, buf))
40 |     }
41 | 
42 |     mit := NewMergeIterator(iters)
43 | 
44 |     i := 0
45 |     for mit.SeekFirst(); mit.Valid(); i++ {
46 |         if i >= 1000 && i <= 8000 && i%n == 0 {
47 |             continue
48 |         }
49 |         expected := fmt.Sprintf("%010d", i)
50 |         got := string(*((*byteKeyItem)(mit.Get())))
51 |         if got != expected {
52 |             t.Errorf("Expected %s, got %v", expected, got)
53 |         }
54 |         mit.Next()
55 |     }
56 | }
57 | 
--------------------------------------------------------------------------------
/util.go:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2016 Couchbase, Inc.
2 | // Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
3 | // except in compliance with the License. You may obtain a copy of the License at
4 | // http://www.apache.org/licenses/LICENSE-2.0
5 | // Unless required by applicable law or agreed to in writing, software distributed under the
6 | // License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
7 | // either express or implied. See the License for the specific language governing permissions
8 | // and limitations under the License.
9 | 
10 | package nitro
11 | 
12 | import (
13 |     "unsafe"
14 | )
15 | 
16 | // partitionPivots returns pivot items which partition the items in the store
17 | // into nsplits partitions.
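// For illustration: a 4-way split typically yields a slice of the shape
// [nil, p1, p2, p3, nil] — the nil sentinels mark the open start and end of
// the key space, and each adjacent pair of entries bounds one partition.
// (Shape only; pivots that do not exceed their predecessor are dropped below.)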
18 | func (m *Nitro) partitionPivots(snap *Snapshot, nsplits int) []*Item { 19 | var pivotItems []*Item 20 | 21 | tmpIter := m.NewIterator(snap) 22 | if tmpIter == nil { 23 | panic("iterator cannot be nil") 24 | } 25 | defer tmpIter.Close() 26 | 27 | barrier := m.store.GetAccesBarrier() 28 | token := barrier.Acquire() 29 | defer barrier.Release(token) 30 | 31 | pivotItems = append(pivotItems, nil) // start item 32 | pivotPtrs := m.store.GetRangeSplitItems(nsplits) 33 | for _, itmPtr := range pivotPtrs { 34 | itm := m.ptrToItem(itmPtr) 35 | tmpIter.Seek(itm.Bytes()) 36 | if tmpIter.Valid() { 37 | prevItm := pivotItems[len(pivotItems)-1] 38 | // Find bigger item than prev pivot 39 | if prevItm == nil || m.insCmp(unsafe.Pointer(itm), unsafe.Pointer(prevItm)) > 0 { 40 | pivotItems = append(pivotItems, itm) 41 | } 42 | } 43 | } 44 | pivotItems = append(pivotItems, nil) // end item 45 | 46 | return pivotItems 47 | } 48 | -------------------------------------------------------------------------------- /nodelist_test.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2016 Couchbase, Inc. 2 | // Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file 3 | // except in compliance with the License. You may obtain a copy of the License at 4 | // http://www.apache.org/licenses/LICENSE-2.0 5 | // Unless required by applicable law or agreed to in writing, software distributed under the 6 | // License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 7 | // either express or implied. See the License for the specific language governing permissions 8 | // and limitations under the License. 9 | 10 | package nitro 11 | 12 | import ( 13 | "fmt" 14 | "testing" 15 | ) 16 | 17 | func TestNodeList(t *testing.T) { 18 | db := New() 19 | defer db.Close() 20 | 21 | n := 10 22 | var list *NodeList 23 | w := db.NewWriter() 24 | for i := 0; i < n; i++ { 25 | ptr := w.Put2([]byte(fmt.Sprintf("%010d", i))) 26 | if list == nil { 27 | list = NewNodeList(ptr) 28 | } else { 29 | list.Add(ptr) 30 | } 31 | } 32 | 33 | count := 0 34 | for i, k := range list.Keys() { 35 | expected := fmt.Sprintf("%010d", n-i-1) 36 | if expected != string(k) { 37 | t.Errorf("Expected %s, got %s", expected, string(k)) 38 | } 39 | count++ 40 | } 41 | 42 | if count != n { 43 | t.Errorf("Expected %d, got %d", n, count) 44 | } 45 | 46 | list.Remove([]byte(fmt.Sprintf("%010d", 2))) 47 | list.Remove([]byte(fmt.Sprintf("%010d", 5))) 48 | list.Remove([]byte(fmt.Sprintf("%010d", 8))) 49 | 50 | count = len(list.Keys()) 51 | if count != n-3 { 52 | t.Errorf("Expected %d, got %d", n-3, count) 53 | } 54 | 55 | for i := 10; i < 13; i++ { 56 | ptr := w.Put2([]byte(fmt.Sprintf("%010d", i))) 57 | list.Add(ptr) 58 | } 59 | 60 | count = len(list.Keys()) 61 | if count != n { 62 | t.Errorf("Expected %d, got %d", n, count) 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /block.go: -------------------------------------------------------------------------------- 1 | package nitro 2 | 3 | import ( 4 | "encoding/binary" 5 | "errors" 6 | ) 7 | 8 | var ( 9 | errBlockFull = errors.New("Block full") 10 | ) 11 | 12 | type blockPtr uint64 13 | 14 | type dataBlock struct { 15 | buf []byte 16 | offset int 17 | } 18 | 19 | func newDataBlock(bs []byte) *dataBlock { 20 | return &dataBlock{ 21 | buf: bs[:cap(bs)], 22 | } 23 | } 24 | 25 | func (db *dataBlock) Get() []byte { 26 | if db == nil { 27 | return nil 28 | } 29 
| 30 | if db.offset+2 < blockSize { 31 | l := int(binary.BigEndian.Uint16(db.buf[db.offset : db.offset+2])) 32 | if l == 0 { 33 | db.offset = blockSize 34 | return nil 35 | } 36 | db.offset += 2 37 | offset := db.offset 38 | db.offset += l 39 | return db.buf[offset : offset+l] 40 | } 41 | 42 | return nil 43 | } 44 | 45 | func (db *dataBlock) GetItems() [][]byte { 46 | var itms [][]byte 47 | 48 | for offset := 0; offset+2 < blockSize; { 49 | l := int(binary.BigEndian.Uint16(db.buf[offset : offset+2])) 50 | if l == 0 { 51 | break 52 | } 53 | offset += 2 54 | itms = append(itms, db.buf[offset:offset+l]) 55 | offset += l 56 | } 57 | 58 | return itms 59 | } 60 | 61 | func (db *dataBlock) Write(itm []byte) error { 62 | newLen := db.offset + 2 + len(itm) 63 | if newLen > len(db.buf) { 64 | return errBlockFull 65 | } 66 | 67 | binary.BigEndian.PutUint16(db.buf[db.offset:db.offset+2], uint16(len(itm))) 68 | db.offset += 2 69 | copy(db.buf[db.offset:db.offset+len(itm)], itm) 70 | db.offset += len(itm) 71 | 72 | return nil 73 | } 74 | 75 | func (db *dataBlock) IsEmpty() bool { 76 | return db.offset == 0 77 | } 78 | 79 | func (db *dataBlock) Reset() { 80 | db.offset = 0 81 | } 82 | 83 | func (db *dataBlock) Bytes() []byte { 84 | offset := db.offset 85 | // Set 2 byte len = 0 86 | if offset+1 < len(db.buf) { 87 | db.buf[offset] = 0 88 | db.buf[offset+1] = 0 89 | offset += 2 90 | } 91 | 92 | return db.buf[:offset] 93 | } 94 | -------------------------------------------------------------------------------- /nodelist.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2016 Couchbase, Inc. 2 | // Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file 3 | // except in compliance with the License. You may obtain a copy of the License at 4 | // http://www.apache.org/licenses/LICENSE-2.0 5 | // Unless required by applicable law or agreed to in writing, software distributed under the 6 | // License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 7 | // either express or implied. See the License for the specific language governing permissions 8 | // and limitations under the License. 
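// nodelist.go: a NodeList chains skiplist nodes together through their
// GC-link pointers (SetLink/GetLink), so maintaining the list needs no extra
// allocation per entry. Add pushes at the head, which is why Keys() yields
// keys in reverse insertion order (see TestNodeList above).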
9 | 10 | package nitro 11 | 12 | import ( 13 | "bytes" 14 | "github.com/t3rm1n4l/nitro/skiplist" 15 | ) 16 | 17 | // NodeList is a linked list of skiplist nodes 18 | type NodeList struct { 19 | head *skiplist.Node 20 | } 21 | 22 | // NewNodeList creates new node list 23 | func NewNodeList(head *skiplist.Node) *NodeList { 24 | return &NodeList{ 25 | head: head, 26 | } 27 | } 28 | 29 | // Keys returns all keys from the node list 30 | func (l *NodeList) Keys() (keys [][]byte) { 31 | node := l.head 32 | for node != nil { 33 | key := (*Item)(node.Item()).Bytes() 34 | keys = append(keys, key) 35 | node = node.GetLink() 36 | } 37 | 38 | return 39 | } 40 | 41 | // Remove a key from the node list 42 | func (l *NodeList) Remove(key []byte) *skiplist.Node { 43 | var prev *skiplist.Node 44 | node := l.head 45 | for node != nil { 46 | nodeKey := (*Item)(node.Item()).Bytes() 47 | if bytes.Equal(nodeKey, key) { 48 | if prev == nil { 49 | l.head = node.GetLink() 50 | return node 51 | } 52 | 53 | prev.SetLink(node.GetLink()) 54 | return node 55 | } 56 | prev = node 57 | node = node.GetLink() 58 | } 59 | 60 | return nil 61 | } 62 | 63 | // Add a key into the node list 64 | func (l *NodeList) Add(node *skiplist.Node) { 65 | node.SetLink(l.head) 66 | l.head = node 67 | } 68 | 69 | // Head returns head node from the list 70 | func (l *NodeList) Head() *skiplist.Node { 71 | return l.head 72 | } 73 | -------------------------------------------------------------------------------- /skiplist/item.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2016 Couchbase, Inc. 2 | // Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file 3 | // except in compliance with the License. You may obtain a copy of the License at 4 | // http://www.apache.org/licenses/LICENSE-2.0 5 | // Unless required by applicable law or agreed to in writing, software distributed under the 6 | // License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 7 | // either express or implied. See the License for the specific language governing permissions 8 | // and limitations under the License. 
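// item.go: MinItem and MaxItem act as -inf/+inf sentinel pointers. Compare
// orders them below and above every real item before invoking the
// user-supplied CompareFn, so sentinel items never reach user comparators.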
9 | 10 | package skiplist 11 | 12 | import ( 13 | "bytes" 14 | "fmt" 15 | "unsafe" 16 | ) 17 | 18 | var ( 19 | MinItem unsafe.Pointer 20 | MaxItem = unsafe.Pointer(^uintptr(0)) 21 | ) 22 | 23 | func Compare(cmp CompareFn, this, that unsafe.Pointer) int { 24 | if this == MinItem || that == MaxItem { 25 | return -1 26 | } 27 | 28 | if this == MaxItem || that == MinItem { 29 | return 1 30 | } 31 | 32 | return cmp(this, that) 33 | } 34 | 35 | type byteKeyItem []byte 36 | 37 | func (itm *byteKeyItem) String() string { 38 | return string(*itm) 39 | } 40 | 41 | func (itm byteKeyItem) Size() int { 42 | return len(itm) 43 | } 44 | 45 | // NewByteKeyItem creates a new item from bytes 46 | func NewByteKeyItem(k []byte) unsafe.Pointer { 47 | itm := byteKeyItem(k) 48 | return unsafe.Pointer(&itm) 49 | } 50 | 51 | // CompareBytes is a byte item comparator 52 | func CompareBytes(this, that unsafe.Pointer) int { 53 | thisItem := (*byteKeyItem)(this) 54 | thatItem := (*byteKeyItem)(that) 55 | return bytes.Compare([]byte(*thisItem), []byte(*thatItem)) 56 | } 57 | 58 | type intKeyItem int 59 | 60 | func (itm *intKeyItem) String() string { 61 | return fmt.Sprint(*itm) 62 | } 63 | 64 | func (itm intKeyItem) Size() int { 65 | return int(unsafe.Sizeof(itm)) 66 | } 67 | 68 | // CompareInt is a helper integer item comparator 69 | func CompareInt(this, that unsafe.Pointer) int { 70 | thisItem := (*intKeyItem)(this) 71 | thatItem := (*intKeyItem)(that) 72 | return int(*thisItem - *thatItem) 73 | } 74 | -------------------------------------------------------------------------------- /mm/malloc.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2016 Couchbase, Inc. 2 | // Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file 3 | // except in compliance with the License. You may obtain a copy of the License at 4 | // http://www.apache.org/licenses/LICENSE-2.0 5 | // Unless required by applicable law or agreed to in writing, software distributed under the 6 | // License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 7 | // either express or implied. See the License for the specific language governing permissions 8 | // and limitations under the License. 
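// malloc.go: package mm wraps a C allocator so items can live outside the Go
// heap. When built with the `jemalloc` build tag (see build.go),
// mm_malloc/mm_free route to jemalloc; otherwise they fall back to the libc
// allocator (see malloc.c).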
9 | 
10 | package mm
11 | 
12 | /*
13 | #include "malloc.h"
14 | */
15 | import "C"
16 | 
17 | import (
18 |     "fmt"
19 |     "sync"
20 |     "sync/atomic"
21 |     "unsafe"
22 | )
23 | 
24 | var (
25 |     // Debug enables debug stats
26 |     Debug = true
27 |     mu    sync.Mutex
28 | )
29 | 
30 | var stats struct {
31 |     allocs uint64
32 |     frees  uint64
33 | }
34 | 
35 | // Malloc implements a C-like memory allocator
36 | func Malloc(l int) unsafe.Pointer {
37 |     if Debug {
38 |         atomic.AddUint64(&stats.allocs, 1)
39 |     }
40 |     return C.mm_malloc(C.size_t(l))
41 | }
42 | 
43 | // Free implements a C-like memory deallocator
44 | func Free(p unsafe.Pointer) {
45 |     if Debug {
46 |         atomic.AddUint64(&stats.frees, 1)
47 |     }
48 |     C.mm_free(p)
49 | }
50 | 
51 | // Stats returns allocator statistics.
52 | // When built with jemalloc, this includes the jemalloc stats report.
53 | func Stats() string {
54 |     mu.Lock()
55 |     defer mu.Unlock()
56 | 
57 |     buf := C.mm_stats()
58 |     s := "==== Stats ====\n"
59 |     if Debug {
60 |         s += fmt.Sprintf("Mallocs = %d\n"+
61 |             "Frees = %d\n", stats.allocs, stats.frees)
62 |     }
63 | 
64 |     if buf != nil {
65 |         s += C.GoString(buf)
66 |         C.free(unsafe.Pointer(buf))
67 |     }
68 | 
69 |     return s
70 | }
71 | 
72 | // Size returns the total size allocated by the mm allocator
73 | func Size() uint64 {
74 |     return uint64(C.mm_size())
75 | }
76 | 
77 | // FreeOSMemory forces jemalloc to scrub memory and release it back to the OS
78 | func FreeOSMemory() error {
79 |     errCode := int(C.mm_free2os())
80 |     if errCode != 0 {
81 |         return fmt.Errorf("status: %d", errCode)
82 |     }
83 | 
84 |     return nil
85 | }
86 | 
--------------------------------------------------------------------------------
/skiplist/batch_ops.go:
--------------------------------------------------------------------------------
1 | package skiplist
2 | 
3 | import (
4 |     "unsafe"
5 | )
6 | 
7 | type BatchOpIterator interface {
8 |     Next()
9 |     Valid() bool
10 |     Item() unsafe.Pointer
11 | }
12 | 
13 | type AcceptFn func(unsafe.Pointer) bool
14 | 
15 | type BatchOpCallback func(*Node, CompareFn, unsafe.Pointer, BatchOpIterator) error
16 | 
17 | type ValidNodeFn func(*Node) bool
18 | 
19 | func defaultValidNode(*Node) bool {
20 |     return true
21 | }
22 | 
23 | func (s *Skiplist) ExecBatchOps(opItr BatchOpIterator, head, tail *Node,
24 |     callb BatchOpCallback, cmp CompareFn,
25 |     validNode ValidNodeFn, sts *Stats) error {
26 | 
27 |     // Max level for the traversal
28 |     var level int
29 | 
30 |     if validNode == nil {
31 |         validNode = defaultValidNode
32 |     }
33 | 
34 |     if head == nil {
35 |         head = s.head
36 |         level = int(s.level)
37 |     } else {
38 |         level = head.Level()
39 |     }
40 | 
41 |     if tail == nil {
42 |         tail = s.tail
43 |     }
44 | 
45 |     err := s.execBatchOpsInner(head, tail, level, opItr,
46 |         cmp, validNode, callb, sts)
47 | 
48 |     if err != nil {
49 |         return err
50 |     }
51 | 
52 |     if opItr.Valid() {
53 |         panic("non-zero items remaining")
54 |     }
55 | 
56 |     return err
57 | }
58 | 
59 | func (s *Skiplist) execBatchOpsInner(startNode, endNode *Node, level int,
60 |     opItr BatchOpIterator, cmp CompareFn, validNode ValidNodeFn,
61 |     callb BatchOpCallback, sts *Stats) (err error) {
62 | 
63 |     currNode := startNode
64 | 
65 |     // Iterate in the current level
66 |     for Compare(cmp, currNode.Item(), endNode.Item()) < 0 && opItr.Valid() {
67 |         var rightNode *Node
68 |         for rightNode, _ = currNode.getNext(level); !validNode(rightNode); {
69 |             rightNode, _ = rightNode.getNext(level)
70 |         }
71 | 
72 |         // Descend to the next level
73 |         if Compare(cmp, opItr.Item(), rightNode.Item()) < 0 {
74 |             if level == 0 {
75 |                 if err = callb(currNode, cmp, rightNode.Item(), opItr); err != nil {
76 |                     return
77 | } 78 | } else { 79 | if err = s.execBatchOpsInner(currNode, rightNode, level-1, opItr, 80 | cmp, validNode, callb, sts); err != nil { 81 | return 82 | } 83 | } 84 | } 85 | 86 | currNode = rightNode 87 | if currNode == nil { 88 | break 89 | } 90 | } 91 | 92 | return 93 | } 94 | -------------------------------------------------------------------------------- /supernitro/merger.go: -------------------------------------------------------------------------------- 1 | package supernitro 2 | 3 | import ( 4 | "bytes" 5 | "container/heap" 6 | "github.com/t3rm1n4l/nitro" 7 | ) 8 | 9 | type Iterator struct { 10 | iters []*nitro.Iterator 11 | h itmHeap 12 | curr []byte 13 | } 14 | 15 | func newMergeIterator(iters []*nitro.Iterator) *Iterator { 16 | return &Iterator{iters: iters} 17 | } 18 | 19 | type itmVal struct { 20 | iter *nitro.Iterator 21 | itm []byte 22 | prio int 23 | } 24 | 25 | type itmHeap []itmVal 26 | 27 | func (h itmHeap) Len() int { return len(h) } 28 | 29 | func (h itmHeap) Less(i, j int) bool { 30 | val := bytes.Compare(h[i].itm, h[j].itm) 31 | if val == 0 && h[i].prio-h[j].prio < 0 { 32 | return true 33 | } 34 | 35 | return val < 0 36 | } 37 | 38 | func (h itmHeap) Swap(i, j int) { h[i], h[j] = h[j], h[i] } 39 | 40 | func (h *itmHeap) Push(x interface{}) { 41 | *h = append(*h, x.(itmVal)) 42 | } 43 | 44 | func (h *itmHeap) Pop() interface{} { 45 | old := *h 46 | n := len(old) 47 | x := old[n-1] 48 | *h = old[0 : n-1] 49 | return x 50 | } 51 | 52 | func (it *Iterator) SeekFirst() { 53 | it.Seek(nil) 54 | } 55 | 56 | func (it *Iterator) Seek(itm []byte) { 57 | it.curr = nil 58 | it.h = nil 59 | for prio, subIt := range it.iters { 60 | if itm == nil { 61 | subIt.SeekFirst() 62 | } else { 63 | subIt.Seek(itm) 64 | } 65 | if subIt.Valid() { 66 | itm := append([]byte(nil), subIt.Get()...) 67 | it.h = append(it.h, itmVal{iter: subIt, itm: itm, prio: prio}) 68 | } 69 | } 70 | 71 | heap.Init(&it.h) 72 | it.Next() 73 | } 74 | 75 | func (it *Iterator) Valid() bool { 76 | return it.curr != nil 77 | } 78 | 79 | func (it *Iterator) Get() []byte { 80 | return it.curr 81 | } 82 | 83 | func (it *Iterator) Next() { 84 | var next []byte 85 | for next = it.next(); it.curr != nil && bytes.Equal(next, it.curr); next = it.next() { 86 | } 87 | 88 | it.curr = next 89 | } 90 | 91 | func (it *Iterator) next() []byte { 92 | if it.h.Len() == 0 { 93 | return nil 94 | } 95 | 96 | o := heap.Pop(&it.h) 97 | hi := o.(itmVal) 98 | curr := hi.itm 99 | hi.iter.Next() 100 | if hi.iter.Valid() { 101 | // Make explicit copy. Iterator may share the buffer 102 | hi.itm = append([]byte(nil), hi.iter.Get()...) 103 | heap.Push(&it.h, hi) 104 | } 105 | 106 | return curr 107 | } 108 | 109 | func (it *Iterator) Close() { 110 | for _, subIt := range it.iters { 111 | subIt.Close() 112 | } 113 | } 114 | -------------------------------------------------------------------------------- /mm/malloc.c: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2016 Couchbase, Inc. 2 | // Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file 3 | // except in compliance with the License. You may obtain a copy of the License at 4 | // http://www.apache.org/licenses/LICENSE-2.0 5 | // Unless required by applicable law or agreed to in writing, software distributed under the 6 | // License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 7 | // either express or implied. 
See the License for the specific language governing permissions
8 | // and limitations under the License.
9 | #include "malloc.h"
10 | #include <string.h>
11 | #include <stdio.h>
12 | 
13 | #ifdef JEMALLOC
14 | #include <jemalloc/jemalloc.h>
15 | 
16 | const char *je_malloc_conf = "narenas:2,lg_dirty_mult:5";
17 | 
18 | void writecb(void *ref, const char *s) {
19 |     stats_buf *buf = (stats_buf *)(ref);
20 |     int len;
21 |     len = strlen(s);
22 |     while (buf->offset + len >= buf->size) {
23 |         buf->size *= 2;
24 |         buf->buf = realloc(buf->buf, buf->size);
25 |     }
26 |     strncpy(buf->buf + buf->offset, s, len);
27 |     buf->offset += len;
28 | }
29 | 
30 | 
31 | char *doStats() {
32 |     stats_buf buf;
33 |     buf.size = 1024;
34 |     buf.buf = malloc(buf.size);
35 |     buf.offset = 0;
36 |     je_malloc_stats_print(writecb, &buf, NULL);
37 |     buf.buf[buf.offset] = 0;
38 |     return buf.buf;
39 | }
40 | 
41 | #endif
42 | 
43 | void *mm_malloc(size_t sz) {
44 | #ifdef JEMALLOC
45 |     return je_malloc(sz);
46 | #else
47 |     return malloc(sz);
48 | #endif
49 | }
50 | 
51 | void mm_free(void *p) {
52 | #ifdef JEMALLOC
53 |     return je_free(p);
54 | #else
55 |     return free(p);
56 | #endif
57 | }
58 | 
59 | char *mm_stats() {
60 | #ifdef JEMALLOC
61 |     return doStats();
62 | #else
63 |     return NULL;
64 | #endif
65 | }
66 | 
67 | size_t mm_size() {
68 | #ifdef JEMALLOC
69 |     size_t resident, sz;
70 |     // Force stats cache flush
71 |     uint64_t epoch = 1;
72 |     sz = sizeof(epoch);
73 |     je_mallctl("epoch", &epoch, &sz, &epoch, sz);
74 | 
75 |     sz = sizeof(resident);
76 |     je_mallctl("stats.resident", &resident, &sz, NULL, 0);
77 |     return resident;
78 | #else
79 |     return 0;
80 | #endif
81 | }
82 | 
83 | int mm_free2os() {
84 | #ifdef JEMALLOC
85 |     char buf[100];
86 |     unsigned int narenas;
87 |     size_t len = sizeof(narenas);
88 |     je_mallctl("arenas.narenas", &narenas, &len, NULL, 0);
89 |     sprintf(buf, "arena.%u.purge", narenas);
90 |     return je_mallctl(buf, NULL, NULL, NULL, 0);
91 | #endif
92 |     return 0;
93 | }
94 | 
--------------------------------------------------------------------------------
/skiplist/C/main.cc:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2016 Couchbase, Inc.
2 | // Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
3 | // except in compliance with the License. You may obtain a copy of the License at
4 | // http://www.apache.org/licenses/LICENSE-2.0
5 | // Unless required by applicable law or agreed to in writing, software distributed under the
6 | // License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
7 | // either express or implied. See the License for the specific language governing permissions
8 | // and limitations under the License.
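// Standalone micro-benchmark for the C++ skiplist port: bulk-load 10M
// sequential items single-threaded, then time 8 concurrent threads doing
// 1M random lookups (findPath) each.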
9 | #include "skiplist.hh" 10 | #include 11 | #include 12 | #include 13 | 14 | static unsigned int seed; 15 | 16 | void insert(Skiplist *s, int n, bool is_rand) { 17 | for (int x=0; x < n; x++) { 18 | unsigned r; 19 | if (is_rand) { 20 | r = rand_r(&seed); 21 | } else { 22 | r = x; 23 | } 24 | int *v = (int *) skiplist_malloc(sizeof(int)); 25 | *v = r; 26 | Item *itm = newItem(v, sizeof(int)); 27 | Skiplist_Insert(s, itm); 28 | } 29 | } 30 | 31 | void lookup(Skiplist *s, int n) { 32 | Node *preds[MaxLevel], *succs[MaxLevel]; 33 | for (int x=0; x < n; x++) { 34 | unsigned r = rand_r(&seed); 35 | int *v = (int *) skiplist_malloc(sizeof(int)); 36 | *v = r % n; 37 | Item *itm = newItem(v, sizeof(int)); 38 | Skiplist_findPath(s, itm, preds, succs); 39 | skiplist_free(itm); 40 | } 41 | } 42 | 43 | int main() { 44 | 45 | srand(time(NULL)); 46 | int i = 100; 47 | Skiplist *s = newSkiplist(); 48 | std::vector threads; 49 | 50 | insert(s, 10000000, false); 51 | 52 | time_t t0 = time(NULL); 53 | /* 54 | for (int x=0; x < 8; x++) { 55 | threads.push_back(std::thread(&insert,s, 1000000, true)); 56 | } 57 | */ 58 | for (int x=0; x < 8; x++) { 59 | threads.push_back(std::thread(&lookup,s, 1000000)); 60 | } 61 | 62 | for (auto& th : threads) th.join(); 63 | std::cout<<"took "<<(time(NULL)-t0)<<"s"<head; 68 | while (p) { 69 | if (p->itm->l == 4) { 70 | count++; 71 | // std::cout<<"itm "<itm->data))< 0 { 85 | itm := m.allocItem(int(l), m.useMemoryMgmt) 86 | data := itm.Bytes() 87 | _, err := io.ReadFull(r, data) 88 | return itm, err 89 | } 90 | 91 | return nil, nil 92 | } 93 | 94 | // Bytes return item data bytes 95 | func (itm *Item) Bytes() (bs []byte) { 96 | if itm == nil { 97 | return 98 | } 99 | 100 | l := itm.dataLen 101 | dataOffset := uintptr(unsafe.Pointer(itm)) + itemHeaderSize 102 | 103 | hdr := (*reflect.SliceHeader)(unsafe.Pointer(&bs)) 104 | hdr.Data = dataOffset 105 | hdr.Len = int(l) 106 | hdr.Cap = hdr.Len 107 | return 108 | } 109 | 110 | // ItemSize returns total bytes consumed by item representation 111 | func ItemSize(p unsafe.Pointer) int { 112 | itm := (*Item)(p) 113 | return int(itemHeaderSize + uintptr(itm.dataLen)) 114 | } 115 | -------------------------------------------------------------------------------- /skiplist/builder.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2016 Couchbase, Inc. 2 | // Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file 3 | // except in compliance with the License. You may obtain a copy of the License at 4 | // http://www.apache.org/licenses/LICENSE-2.0 5 | // Unless required by applicable law or agreed to in writing, software distributed under the 6 | // License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 7 | // either express or implied. See the License for the specific language governing permissions 8 | // and limitations under the License. 
9 | 10 | package skiplist 11 | 12 | import "math/rand" 13 | import "unsafe" 14 | 15 | // NodeCallback is used by segment builder 16 | type NodeCallback func(*Node) 17 | 18 | // Segment is a skiplist segment 19 | type Segment struct { 20 | builder *Builder 21 | tail []*Node 22 | head []*Node 23 | rand *rand.Rand 24 | callb NodeCallback 25 | count uint64 26 | 27 | sts Stats 28 | } 29 | 30 | // SetNodeCallback sets callback for segment builder 31 | func (s *Segment) SetNodeCallback(fn NodeCallback) { 32 | s.callb = fn 33 | } 34 | 35 | // Add an item into skiplist segment 36 | func (s *Segment) Add(itm unsafe.Pointer) { 37 | itemLevel := s.builder.store.NewLevel(s.rand.Float32) 38 | x := s.builder.store.newNode(itm, itemLevel) 39 | s.sts.AddInt64(&s.sts.nodeAllocs, 1) 40 | s.sts.AddInt64(&s.sts.levelNodesCount[itemLevel], 1) 41 | s.sts.AddInt64(&s.sts.usedBytes, int64(s.builder.store.Size(x))) 42 | 43 | for l := 0; l <= itemLevel; l++ { 44 | if s.tail[l] != nil { 45 | s.tail[l].setNext(l, x, false) 46 | } else { 47 | s.head[l] = x 48 | } 49 | s.tail[l] = x 50 | } 51 | 52 | if s.callb != nil { 53 | s.callb(x) 54 | } 55 | } 56 | 57 | // Builder performs concurrent bottom-up skiplist build 58 | type Builder struct { 59 | store *Skiplist 60 | } 61 | 62 | // SetItemSizeFunc configures items size function 63 | func (b *Builder) SetItemSizeFunc(fn ItemSizeFn) { 64 | b.store.ItemSize = fn 65 | } 66 | 67 | // NewSegment creates a new skiplist segment 68 | func (b *Builder) NewSegment() *Segment { 69 | seg := &Segment{tail: make([]*Node, MaxLevel+1), 70 | head: make([]*Node, MaxLevel+1), builder: b, 71 | rand: rand.New(rand.NewSource(int64(rand.Int()))), 72 | } 73 | 74 | seg.sts.IsLocal(true) 75 | return seg 76 | } 77 | 78 | // Assemble multiple skiplist segments and form a parent skiplist 79 | func (b *Builder) Assemble(segments ...*Segment) *Skiplist { 80 | tail := make([]*Node, MaxLevel+1) 81 | head := make([]*Node, MaxLevel+1) 82 | 83 | for _, seg := range segments { 84 | for l := 0; l <= MaxLevel; l++ { 85 | if tail[l] != nil && seg.head[l] != nil { 86 | tail[l].setNext(l, seg.head[l], false) 87 | } else if head[l] == nil && seg.head[l] != nil { 88 | head[l] = seg.head[l] 89 | } 90 | 91 | if seg.tail[l] != nil { 92 | tail[l] = seg.tail[l] 93 | } 94 | } 95 | } 96 | 97 | for l := 0; l <= MaxLevel; l++ { 98 | if head[l] != nil { 99 | b.store.head.setNext(l, head[l], false) 100 | } 101 | if tail[l] != nil { 102 | tail[l].setNext(l, b.store.tail, false) 103 | } 104 | } 105 | 106 | for _, seg := range segments { 107 | b.store.Stats.Merge(&seg.sts) 108 | } 109 | 110 | return b.store 111 | 112 | } 113 | 114 | // NewBuilder creates a builder based on default config 115 | func NewBuilder() *Builder { 116 | return NewBuilderWithConfig(DefaultConfig()) 117 | } 118 | 119 | // NewBuilderWithConfig creates a builder from a config 120 | func NewBuilderWithConfig(cfg Config) *Builder { 121 | return &Builder{store: NewWithConfig(cfg)} 122 | } 123 | -------------------------------------------------------------------------------- /skiplist/batch_ops_test.go: -------------------------------------------------------------------------------- 1 | package skiplist 2 | 3 | import "testing" 4 | import "fmt" 5 | import "unsafe" 6 | import "math/rand" 7 | import "sort" 8 | 9 | var hash map[uintptr][]unsafe.Pointer 10 | var maxItems = 1000 11 | 12 | func init() { 13 | hash = make(map[uintptr][]unsafe.Pointer) 14 | } 15 | 16 | func batchOps(itms []unsafe.Pointer) (ops []BatchOp) { 17 | for _, itm := range itms { 18 | ops = 
append(ops, BatchOp{itm: itm}) 19 | } 20 | 21 | return 22 | } 23 | 24 | func TestBatchInsert(t *testing.T) { 25 | s := New() 26 | cmp := CompareBytes 27 | buf := s.MakeBuf() 28 | defer s.FreeBuf(buf) 29 | var items []unsafe.Pointer 30 | 31 | sr := itemSorter{ 32 | cmp: cmp, 33 | } 34 | 35 | for i := 0; i < 5000000; i++ { 36 | items = append(items, NewByteKeyItem([]byte(fmt.Sprintf("%010d", rand.Int()%1000000000)))) 37 | } 38 | 39 | sr.itms = items 40 | sort.Sort(sr) 41 | 42 | s.ExecBatchOps(batchOps(sr.itms), buildLeafCallback, cmp, &s.Stats) 43 | 44 | dumpIt(s, cmp) 45 | items = nil 46 | for i := 0; i < 100000; i++ { 47 | items = append(items, NewByteKeyItem([]byte(fmt.Sprintf("%010d", rand.Int()%1000000000)))) 48 | } 49 | 50 | sr.itms = items 51 | sort.Sort(sr) 52 | s.ExecBatchOps(batchOps(sr.itms), buildLeafCallback, cmp, &s.Stats) 53 | 54 | dumpIt(s, cmp) 55 | 56 | fmt.Println(s.GetStats()) 57 | } 58 | 59 | func dumpIt(s *Skiplist, cmp CompareFn) { 60 | buf := s.MakeBuf() 61 | itr := s.NewIterator(cmp, buf) 62 | itr.SeekFirst() 63 | last := NewByteKeyItem([]byte("0")) 64 | for ; itr.Valid(); itr.Next() { 65 | itms := hash[uintptr(unsafe.Pointer(itr.GetNode()))] 66 | for _, itm := range itms { 67 | if cmp(last, itm) > 0 { 68 | panic(fmt.Sprintf("prob - %s > %s", string(*(*byteKeyItem)(last)), string(*(*byteKeyItem)(itm)))) 69 | } 70 | last = itm 71 | } 72 | } 73 | } 74 | 75 | type itemSorter struct { 76 | itms []unsafe.Pointer 77 | cmp CompareFn 78 | } 79 | 80 | func (s itemSorter) Len() int { 81 | return len(s.itms) 82 | } 83 | 84 | func (s itemSorter) Swap(i, j int) { 85 | s.itms[i], s.itms[j] = s.itms[j], s.itms[i] 86 | } 87 | 88 | func (s itemSorter) Less(i, j int) bool { 89 | return s.cmp(s.itms[i], s.itms[j]) < 0 90 | } 91 | 92 | func (s *itemSorter) Dedup() { 93 | j := 0 94 | i := 0 95 | for i = 0; i < len(s.itms); { 96 | for j < len(s.itms) && s.cmp(s.itms[i], s.itms[j]) == 0 { 97 | j++ 98 | } 99 | i++ 100 | if i < len(s.itms) && j < len(s.itms) { 101 | s.itms[i] = s.itms[j] 102 | } else { 103 | break 104 | } 105 | 106 | } 107 | 108 | s.itms = s.itms[:i] 109 | } 110 | 111 | func printItems(itms []unsafe.Pointer) { 112 | for _, itm := range itms { 113 | fmt.Println(string(*(*byteKeyItem)(itm))) 114 | } 115 | } 116 | 117 | func printBlock(itms []unsafe.Pointer) { 118 | fmt.Println("(", string(*(*byteKeyItem)(itms[0])), string(*(*byteKeyItem)(itms[len(itms)-1])), ")", len(itms)) 119 | } 120 | 121 | func printItem(itm unsafe.Pointer) string { 122 | if itm == minItem { 123 | return "min" 124 | } else if itm == maxItem { 125 | return "max" 126 | } 127 | return string(*(*byteKeyItem)(itm)) 128 | } 129 | 130 | func buildLeafCallback(s *Skiplist, node *Node, ops []BatchOp, cmp CompareFn) error { 131 | buf := s.MakeBuf() 132 | var items []unsafe.Pointer 133 | for _, op := range ops { 134 | items = append(items, op.itm) 135 | } 136 | 137 | var block []unsafe.Pointer 138 | if node.Item() != nil { 139 | ptr := uintptr(unsafe.Pointer(node)) 140 | items = append(items, hash[ptr]...) 
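        // Fold the items already stored against this node into the new batch;
        // the combined set is re-sorted, deduped, and re-split into blocks of
        // at most maxItems below.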
141 | 
142 |         sr := itemSorter{
143 |             cmp:  cmp,
144 |             itms: items,
145 |         }
146 |         sort.Sort(sr)
147 |         sr.Dedup()
148 |         items = sr.itms
149 | 
150 |         s.Delete(node.Item(), cmp, buf, &s.Stats)
151 |         delete(hash, ptr)
152 |     }
153 | 
154 |     for len(items) > 0 {
155 |         if len(items) >= maxItems {
156 |             block = items[:maxItems]
157 |             items = items[maxItems:]
158 |         } else {
159 |             block = items
160 |             items = nil
161 |         }
162 | 
163 |         k := block[0]
164 |         newnode, _ := s.Insert2(k, cmp, nil, buf, rand.Float32, &s.Stats)
165 |         hash[uintptr(unsafe.Pointer(newnode))] = block
166 |     }
167 | 
168 |     return nil
169 | }
170 | 
--------------------------------------------------------------------------------
/skiplist/stats.go:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2016 Couchbase, Inc.
2 | // Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
3 | // except in compliance with the License. You may obtain a copy of the License at
4 | // http://www.apache.org/licenses/LICENSE-2.0
5 | // Unless required by applicable law or agreed to in writing, software distributed under the
6 | // License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
7 | // either express or implied. See the License for the specific language governing permissions
8 | // and limitations under the License.
9 | 
10 | package skiplist
11 | 
12 | import "fmt"
13 | import "sync/atomic"
14 | 
15 | // StatsReport is used for reporting skiplist statistics
16 | type StatsReport struct {
17 |     ReadConflicts       uint64
18 |     InsertConflicts     uint64
19 |     NextPointersPerNode float64
20 |     NodeDistribution    [MaxLevel + 1]int64
21 |     NodeCount           int
22 |     SoftDeletes         int64
23 |     Memory              int64
24 | 
25 |     NodeAllocs int64
26 |     NodeFrees  int64
27 | }
28 | 
29 | // Apply updates the report with the provided partial stats
30 | func (report *StatsReport) Apply(s *Stats) {
31 |     var totalNextPtrs int
32 |     var totalNodes int
33 | 
34 |     report.ReadConflicts += s.readConflicts
35 |     report.InsertConflicts += s.insertConflicts
36 | 
37 |     for i, c := range s.levelNodesCount {
38 |         report.NodeDistribution[i] += c
39 |         nodesAtlevel := report.NodeDistribution[i]
40 |         totalNodes += int(nodesAtlevel)
41 |         totalNextPtrs += (i + 1) * int(nodesAtlevel)
42 |     }
43 | 
44 |     report.SoftDeletes += s.softDeletes
45 |     report.NodeCount = totalNodes
46 |     report.NextPointersPerNode = float64(totalNextPtrs) / float64(totalNodes)
47 |     report.NodeAllocs += s.nodeAllocs
48 |     report.NodeFrees += s.nodeFrees
49 |     report.Memory += s.usedBytes
50 | }
51 | 
52 | // Stats keeps stats for a skiplist instance
53 | type Stats struct {
54 |     insertConflicts       uint64
55 |     readConflicts         uint64
56 |     levelNodesCount       [MaxLevel + 1]int64
57 |     softDeletes           int64
58 |     nodeAllocs, nodeFrees int64
59 |     usedBytes             int64
60 | 
61 |     isLocal bool
62 | }
63 | 
64 | // IsLocal marks the stats as local (partial); local stats skip atomic updates
65 | func (s *Stats) IsLocal(flag bool) {
66 |     s.isLocal = flag
67 | }
68 | 
69 | // AddInt64 provides an atomic add (plain add for local stats)
70 | func (s *Stats) AddInt64(src *int64, val int64) {
71 |     if s.isLocal {
72 |         *src += val
73 |     } else {
74 |         atomic.AddInt64(src, val)
75 |     }
76 | }
77 | 
78 | // AddUint64 provides an atomic add (plain add for local stats)
79 | func (s *Stats) AddUint64(src *uint64, val uint64) {
80 |     if s.isLocal {
81 |         *src += val
82 |     } else {
83 |         atomic.AddUint64(src, val)
84 |     }
85 | }
86 | 
87 | // Merge updates global stats with partial stats and resets the partial stats
88 | func (s *Stats) Merge(sts *Stats) {
89 |     atomic.AddUint64(&s.insertConflicts,
sts.insertConflicts) 90 | sts.insertConflicts = 0 91 | atomic.AddUint64(&s.readConflicts, sts.readConflicts) 92 | sts.readConflicts = 0 93 | atomic.AddInt64(&s.softDeletes, sts.softDeletes) 94 | sts.softDeletes = 0 95 | atomic.AddInt64(&s.nodeAllocs, sts.nodeAllocs) 96 | sts.nodeAllocs = 0 97 | atomic.AddInt64(&s.nodeFrees, sts.nodeFrees) 98 | sts.nodeFrees = 0 99 | atomic.AddInt64(&s.usedBytes, sts.usedBytes) 100 | sts.usedBytes = 0 101 | 102 | for i, val := range sts.levelNodesCount { 103 | if val != 0 { 104 | atomic.AddInt64(&s.levelNodesCount[i], val) 105 | sts.levelNodesCount[i] = 0 106 | } 107 | } 108 | } 109 | 110 | func (report StatsReport) String() string { 111 | str := fmt.Sprintf( 112 | "node_count = %d\n"+ 113 | "soft_deletes = %d\n"+ 114 | "read_conflicts = %d\n"+ 115 | "insert_conflicts = %d\n"+ 116 | "next_pointers_per_node = %.4f\n"+ 117 | "memory_used = %d\n"+ 118 | "node_allocs = %d\n"+ 119 | "node_frees = %d\n\n", 120 | report.NodeCount, report.SoftDeletes, report.ReadConflicts, 121 | report.InsertConflicts, report.NextPointersPerNode, report.Memory, 122 | report.NodeAllocs, report.NodeFrees) 123 | 124 | str += "level_node_distribution:\n" 125 | 126 | for i, c := range report.NodeDistribution { 127 | str += fmt.Sprintf("level%d => %d\n", i, c) 128 | } 129 | 130 | return str 131 | } 132 | 133 | // GetStats returns skiplist stats 134 | func (s *Skiplist) GetStats() StatsReport { 135 | var report StatsReport 136 | report.Apply(&s.Stats) 137 | return report 138 | } 139 | 140 | // MemoryInUse returns memory used by skiplist 141 | func (s *Skiplist) MemoryInUse() int64 { 142 | return atomic.LoadInt64(&s.Stats.usedBytes) 143 | } 144 | -------------------------------------------------------------------------------- /skiplist/node_amd64.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2016 Couchbase, Inc. 2 | // Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file 3 | // except in compliance with the License. You may obtain a copy of the License at 4 | // http://www.apache.org/licenses/LICENSE-2.0 5 | // Unless required by applicable law or agreed to in writing, software distributed under the 6 | // License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 7 | // either express or implied. See the License for the specific language governing permissions 8 | // and limitations under the License. 
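// node_amd64.go implements the lock-free skiplist node for amd64: each
// forward pointer is packed together with its delete flag so that both can
// be loaded and CAS-updated as one 64-bit word (see the layout notes below).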
9 | 
10 | package skiplist
11 | 
12 | import (
13 |     "sync/atomic"
14 |     "unsafe"
15 | )
16 | 
17 | // Node structure overlaps with an array of NodeRef structs (the array begins
18 | // at the end of the 24-byte header, overlapping the 2-byte level field):
19 | //                       <Node struct>
20 | // +--------------+-----------------+------------------+----------------+
21 | // | itm - 8bytes | GClink - 8bytes | DataPtr - 8bytes | level = 2bytes |
22 | // +--------------+-----------------+------------------+----------------+----------------+---------------+-----
23 | //                                                     | flag - 8bytes                   | ptr - 8 bytes | ...
24 | //                                                     +---------------------------------+---------------+-----
25 | //                                                                  <[]NodeRef struct>
26 | var nodeHdrSize = unsafe.Sizeof(struct {
27 |     itm     unsafe.Pointer
28 |     GClink  *Node
29 |     DataPtr uint64
30 | }{})
31 | 
32 | var nodeRefSize = unsafe.Sizeof(NodeRef{})
33 | 
34 | var nodeRefFlagSize = unsafe.Sizeof(NodeRef{}.flag)
35 | 
36 | const deletedFlag = 0xff
37 | 
38 | // Node represents skiplist node header
39 | type Node struct {
40 |     itm     unsafe.Pointer
41 |     GClink  *Node
42 |     DataPtr uint64
43 |     level   uint16
44 | }
45 | 
46 | // Level returns the level of a node in the skiplist
47 | func (n Node) Level() int {
48 |     return int(n.level)
49 | }
50 | 
51 | // Size returns memory used by the node
52 | func (n Node) Size() int {
53 |     return int(nodeHdrSize + uintptr(n.level+1)*nodeRefSize)
54 | }
55 | 
56 | // Item returns the item held by the node
57 | func (n *Node) Item() unsafe.Pointer {
58 |     return n.itm
59 | }
60 | 
61 | // SetLink sets the link pointer for the node
62 | func (n *Node) SetLink(l *Node) {
63 |     n.GClink = l
64 | }
65 | 
66 | // GetLink returns the link pointer from the node
67 | func (n *Node) GetLink() *Node {
68 |     return n.GClink
69 | }
70 | 
71 | // NodeRef is a wrapper for the node pointer
72 | type NodeRef struct {
73 |     flag uint64
74 |     ptr  *Node
75 | }
76 | 
77 | func (n *Node) setNext(level int, ptr *Node, deleted bool) {
78 |     nlevel := n.level
79 |     ref := (*NodeRef)(unsafe.Pointer(uintptr(unsafe.Pointer(n)) + nodeHdrSize + nodeRefSize*uintptr(level)))
80 |     ref.ptr = ptr
81 |     ref.flag = 0
82 |     // Writing the level-0 flag zeroes the overlapping level field; restore it
83 |     if level == 0 {
84 |         n.level = nlevel
85 |     }
86 | }
87 | 
88 | func (n *Node) getNext(level int) (*Node, bool) {
89 |     nodeRefAddr := uintptr(unsafe.Pointer(n)) + nodeHdrSize + nodeRefSize*uintptr(level)
90 |     wordAddr := (*uint64)(unsafe.Pointer(nodeRefAddr + uintptr(7)))
91 | 
92 |     v := atomic.LoadUint64(wordAddr)
93 |     deleted := v&deletedFlag == deletedFlag
94 |     ptr := (*Node)(unsafe.Pointer(uintptr(v >> 8)))
95 |     return ptr, deleted
96 | }
97 | 
98 | // The node struct holds an array of NodeRefs. We assume that the
99 | // most-significant byte of a Go pointer is always unused. In the NodeRef
100 | // struct, the deleted flag and the *Node are packed one after the other.
101 | // Reading an 8-byte word starting 1 byte before the pointer (nodeRefAddr+7)
102 | // therefore yields one byte of the deleted flag and 7 bytes of the address
103 | // (the 8th byte of the address is always 0x00). A CAS at this location can
104 | // set the least-significant byte to 0xff (denotes deleted) while swapping
105 | // the pointer, and the same trick loads the flag and address atomically.
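// For illustration (little-endian amd64, hypothetical address): if the next
// pointer is 0x00007f4200001000, the word at nodeRefAddr+7 is
//     (0x00007f4200001000 << 8) | flagByte  =  0x007f4200001000ff
// when the deleted flag is set; v >> 8 recovers the pointer and v & 0xff
// tests the flag, so one atomic 64-bit load/CAS covers both fields.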
106 | func (n *Node) dcasNext(level int, prevPtr, newPtr *Node, prevIsdeleted, newIsdeleted bool) bool {
107 |     nodeRefAddr := uintptr(unsafe.Pointer(n)) + nodeHdrSize + nodeRefSize*uintptr(level)
108 |     wordAddr := (*uint64)(unsafe.Pointer(nodeRefAddr + uintptr(7)))
109 |     prevVal := uint64(uintptr(unsafe.Pointer(prevPtr)) << 8)
110 |     newVal := uint64(uintptr(unsafe.Pointer(newPtr)) << 8)
111 | 
112 |     if prevIsdeleted {
113 |         prevVal |= deletedFlag
114 |     }
115 | 
116 |     if newIsdeleted {
117 |         newVal |= deletedFlag
118 |     }
119 | 
120 |     swapped := atomic.CompareAndSwapUint64(wordAddr, prevVal, newVal)
121 | 
122 |     // This is required to make the go1.5+ concurrent garbage collector happy:
123 |     // it makes the write barrier mark newPtr as reachable
124 |     if swapped {
125 |         atomic.CompareAndSwapPointer((*unsafe.Pointer)(unsafe.Pointer(nodeRefAddr+nodeRefFlagSize)),
126 |             unsafe.Pointer(newPtr), unsafe.Pointer(newPtr))
127 |     }
128 | 
129 |     return swapped
130 | }
131 | 
--------------------------------------------------------------------------------
/skiplist/iterator.go:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2016 Couchbase, Inc.
2 | // Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
3 | // except in compliance with the License. You may obtain a copy of the License at
4 | // http://www.apache.org/licenses/LICENSE-2.0
5 | // Unless required by applicable law or agreed to in writing, software distributed under the
6 | // License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
7 | // either express or implied. See the License for the specific language governing permissions
8 | // and limitations under the License.
9 | 
10 | package skiplist
11 | 
12 | import "sync/atomic"
13 | import "unsafe"
14 | 
15 | // Iterator is used for lookup and range operations on the skiplist
16 | type Iterator struct {
17 |     cmp        CompareFn
18 |     s          *Skiplist
19 |     prev, curr *Node
20 |     valid      bool
21 |     buf        *ActionBuffer
22 |     deleted    bool
23 | 
24 |     bs *BarrierSession
25 | }
26 | 
27 | // NewIterator creates an iterator for the skiplist
28 | func (s *Skiplist) NewIterator(cmp CompareFn,
29 |     buf *ActionBuffer) *Iterator {
30 | 
31 |     return &Iterator{
32 |         cmp: cmp,
33 |         s:   s,
34 |         buf: buf,
35 |         bs:  s.barrier.Acquire(),
36 |     }
37 | }
38 | 
39 | // SeekFirst moves the cursor to the start
40 | func (it *Iterator) SeekFirst() {
41 |     it.prev = it.s.head
42 |     it.curr, _ = it.s.head.getNext(0)
43 |     it.valid = true
44 | }
45 | 
46 | // SeekWithCmp moves the iterator to the provided item using a custom comparator
47 | func (it *Iterator) SeekWithCmp(itm unsafe.Pointer, cmp CompareFn, eqCmp CompareFn) bool {
48 |     var found bool
49 |     if found = it.s.findPath(itm, cmp, it.buf, &it.s.Stats) != nil; found {
50 |         it.prev = it.buf.preds[0]
51 |         it.curr = it.buf.succs[0]
52 |     } else {
53 |         if found = eqCmp != nil && Compare(eqCmp, itm, it.buf.preds[0].Item()) == 0; found {
54 |             it.prev = nil
55 |             it.curr = it.buf.preds[0]
56 |         }
57 |     }
58 |     return found
59 | }
60 | 
61 | // SeekWithSkip performs Seek() with optional skipping of nodes encountered
62 | // while searching for the item
63 | func (it *Iterator) SeekWithSkip(itm unsafe.Pointer, skipItm func(unsafe.Pointer) bool) bool {
64 |     it.valid = true
65 |     found := it.s.findPath2(itm, it.cmp, skipItm, it.buf, &it.s.Stats) != nil
66 |     it.prev = it.buf.preds[0]
67 |     it.curr = it.buf.succs[0]
68 |     return found
69 | }
70 | 
71 | // Seek moves the iterator to the provided item
72 | func (it *Iterator) Seek(itm unsafe.Pointer) bool {
73 |     return it.SeekWithSkip(itm, nil)
74 | }
75 | 
76 | // SeekPrev moves the iterator to the provided item or an item less than the
77 | // lookup item. If the specified item is not found, it starts at the
78 | // predecessor node. This is used for implementing disk block based storage.
79 | func (it *Iterator) SeekPrev(itm unsafe.Pointer, skip func(unsafe.Pointer) bool) {
80 |     if !it.SeekWithSkip(itm, skip) && it.prev != it.s.head {
81 |         it.curr = it.prev
82 |         it.prev = nil
83 |     }
84 | }
85 | 
86 | // Valid returns whether the iterator is positioned at a valid item
87 | func (it *Iterator) Valid() bool {
88 |     if it.valid && it.curr == it.s.tail {
89 |         it.valid = false
90 |     }
91 | 
92 |     return it.valid
93 | }
94 | 
95 | // Get returns the current item
96 | func (it *Iterator) Get() unsafe.Pointer {
97 |     return it.curr.Item()
98 | }
99 | 
100 | // GetNode returns the node which holds the current item
101 | func (it *Iterator) GetNode() *Node {
102 |     return it.curr
103 | }
104 | 
105 | // Delete removes the current item from the skiplist
106 | func (it *Iterator) Delete() {
107 |     it.s.softDelete(it.curr, &it.s.Stats)
108 |     // Next will observe that the current item is deleted, run the delete
109 |     // helper, and move to the next available item
110 |     it.Next()
111 |     it.deleted = true
112 | }
113 | 
114 | // Next moves the iterator to the next item
115 | func (it *Iterator) Next() {
116 |     if it.deleted {
117 |         it.deleted = false
118 |         return
119 |     }
120 | 
121 |     if !it.Valid() {
122 |         return
123 |     }
124 | 
125 | retry:
126 |     it.valid = true
127 |     next, deleted := it.curr.getNext(0)
128 |     if deleted {
129 |         // The current node is deleted: try to unlink it from this level and
130 |         // make the next node the current node.
131 |         // If that fails, refresh the path buffer and obtain a new current node.
132 |         if it.prev != nil && it.s.helpDelete(0, it.prev, it.curr, next, &it.s.Stats) {
133 |             it.curr = next
134 |         } else {
135 |             atomic.AddUint64(&it.s.Stats.readConflicts, 1)
136 |             found := it.s.findPath(it.curr.Item(), it.cmp, it.buf, &it.s.Stats) != nil
137 |             last := it.curr
138 |             it.prev = it.buf.preds[0]
139 |             it.curr = it.buf.succs[0]
140 |             if found && last == it.curr {
141 |                 goto retry
142 |             }
143 |         }
144 |     } else {
145 |         it.prev = it.curr
146 |         it.curr = next
147 |     }
148 | }
149 | 
150 | // Close is a destructor
151 | func (it *Iterator) Close() {
152 |     it.s.barrier.Release(it.bs)
153 | }
154 | 
--------------------------------------------------------------------------------
/block_manager.go:
--------------------------------------------------------------------------------
1 | package nitro
2 | 
3 | import (
4 |     "fmt"
5 |     "io"
6 |     "os"
7 |     "path/filepath"
8 |     "sync"
9 |     "sync/atomic"
10 |     "syscall"
11 | )
12 | 
13 | var useLinuxHolePunch = false
14 | 
15 | // TODO: Reopen fds on error
16 | type BlockManager interface {
17 |     DeleteBlock(bptr blockPtr) error
18 |     WriteBlock(bs []byte, shard int) (blockPtr, error)
19 |     ReadBlock(bptr blockPtr, buf []byte) error
20 | }
21 | 
22 | func newBlockPtr(shard int, off int64) blockPtr {
23 |     off |= int64(shard) << 55
24 |     return blockPtr(off)
25 | }
26 | 
27 | func (ptr blockPtr) Offset() int64 {
28 |     off := int64(ptr) & ^(0xff << 55)
29 |     return off
30 | }
31 | 
32 | func (ptr blockPtr) Shard() int {
33 |     shard := int(int64(ptr) >> 55)
34 |     return shard
35 | }
36 | 
37 | type fileBlockManager struct {
38 |     wlocks []sync.Mutex
39 |     wfds   []*os.File
40 |     rfds   []*os.File
41 | 
42 |     wpos []int64
43 | 
44 |     freeBlocks [][]int64
45 | }
46 | 
47 | func newFileBlockManager(nfiles int, path string) (*fileBlockManager, error) {
48 |     var fd *os.File
49 |     var err error
50 | 
51 |     fbm := &fileBlockManager{}
52 |     defer func() {
53 |         if err != nil {
54 |             for _, wfd := range fbm.wfds {
55 |                 wfd.Close()
56 |             }
57 |             for _, rfd := range fbm.rfds {
58 |                 rfd.Close()
59 |             }
60 |         }
61 |     }()
62 | 
63 |     fbm.wlocks = make([]sync.Mutex, nfiles)
64 |     fbm.wpos = make([]int64, nfiles)
65 |     fbm.freeBlocks = make([][]int64, nfiles)
66 | 
67 |     for i := 0; i < nfiles; i++ {
68 |         fpath := filepath.Join(path, fmt.Sprintf("blockstore-%d.data", i))
69 |         fd, err = os.OpenFile(fpath, os.O_WRONLY|os.O_CREATE, 0755)
70 |         if err != nil {
71 |             return nil, err
72 |         }
73 |         fbm.wfds = append(fbm.wfds, fd)
74 | 
75 |         fbm.wpos[i], err = fd.Seek(0, 2)
76 |         if err != nil {
77 |             return nil, err
78 |         }
79 | 
80 |         fbm.wpos[i] += (blockSize - fbm.wpos[i]%blockSize) % blockSize // align up to a block boundary
81 |         fd, err = os.Open(fpath)
82 |         if err != nil {
83 |             return nil, err
84 |         }
85 |         fbm.rfds = append(fbm.rfds, fd)
86 |         fbm.freeBlocks[i] = make([]int64, 0)
87 |     }
88 | 
89 |     return fbm, err
90 | }
91 | 
92 | func (fbm *fileBlockManager) DeleteBlock(bptr blockPtr) error {
93 |     shard := bptr.Shard()
94 |     if useLinuxHolePunch {
95 |         return punchHole(fbm.wfds[shard], bptr.Offset(), blockSize)
96 |     }
97 | 
98 |     fbm.wlocks[shard].Lock()
99 |     defer fbm.wlocks[shard].Unlock()
100 |     fbm.freeBlocks[shard] = append(fbm.freeBlocks[shard], bptr.Offset())
101 | 
102 |     return nil
103 | }
104 | 
105 | func (fbm *fileBlockManager) WriteBlock(bs []byte, shard int) (blockPtr, error) {
106 |     shard = shard % len(fbm.wpos)
107 |     fbm.wlocks[shard].Lock()
108 |     var pos int64
109 | 
110 |     flist := fbm.freeBlocks[shard]
111 |     if !useLinuxHolePunch && len(flist) > 0 {
112 |         pos = flist[len(flist)-1]
113 |         flist = flist[0 : len(flist)-1]
114 |         fbm.freeBlocks[shard] = flist
115 |     } else {
116 |         pos
= fbm.wpos[shard] 117 | fbm.wpos[shard] += blockSize 118 | } 119 | fbm.wlocks[shard].Unlock() 120 | 121 | _, err := fbm.wfds[shard].WriteAt(bs, pos) 122 | if err != nil { 123 | return 0, err 124 | } 125 | 126 | bptr := newBlockPtr(shard, pos) 127 | return bptr, nil 128 | } 129 | 130 | func (fbm *fileBlockManager) ReadBlock(bptr blockPtr, buf []byte) error { 131 | shard := bptr.Shard() 132 | n, err := fbm.rfds[shard].ReadAt(buf, bptr.Offset()) 133 | if err == io.EOF { 134 | for ; n < len(buf); n++ { 135 | buf[n] = 0 136 | } 137 | err = nil 138 | } 139 | return err 140 | } 141 | 142 | type mmapBlockManager struct { 143 | file *os.File 144 | offset int64 145 | data []byte 146 | } 147 | 148 | const maxFileOffset = 16000000000000 // 16TB 149 | func newMmapBlockManager(dir string) (*mmapBlockManager, error) { 150 | // TODO: Ability to reuse file and update offset 151 | file := filepath.Join(dir, "blockstore-mmap.data") 152 | mbm := new(mmapBlockManager) 153 | if f, err := os.Create(file); err == nil { 154 | if _, err := f.WriteAt([]byte("EOF"), maxFileOffset); err != nil { 155 | return nil, err 156 | } 157 | f.Close() 158 | } 159 | 160 | if f, err := os.OpenFile(file, os.O_RDWR, 0755); err != nil { 161 | return nil, err 162 | } else { 163 | mbm.file = f 164 | mbm.data, err = syscall.Mmap(int(f.Fd()), 0, 165 | maxFileOffset, syscall.PROT_READ|syscall.PROT_WRITE, syscall.MAP_SHARED) 166 | if err != nil { 167 | return nil, err 168 | } 169 | } 170 | 171 | return mbm, nil 172 | } 173 | 174 | func (mbm *mmapBlockManager) WriteBlock(bs []byte, shard int) (blockPtr, error) { 175 | pos := atomic.AddInt64(&mbm.offset, blockSize) 176 | pos -= blockSize 177 | 178 | copy(mbm.data[pos:], bs) 179 | 180 | bptr := newBlockPtr(0, pos) 181 | return bptr, nil 182 | } 183 | 184 | func (mbm *mmapBlockManager) DeleteBlock(bptr blockPtr) error { 185 | pos := bptr.Offset() 186 | return mmapPunchHole(mbm.data[pos : pos+blockSize]) 187 | } 188 | 189 | func (mbm mmapBlockManager) ReadBlock(bptr blockPtr, buf []byte) error { 190 | pos := bptr.Offset() 191 | copy(buf[:blockSize], mbm.data[pos:pos+blockSize]) 192 | return nil 193 | } 194 | -------------------------------------------------------------------------------- /supernitro/supernitro.go: -------------------------------------------------------------------------------- 1 | package supernitro 2 | 3 | import ( 4 | "fmt" 5 | "github.com/t3rm1n4l/nitro" 6 | "github.com/t3rm1n4l/nitro/mm" 7 | "sync" 8 | "sync/atomic" 9 | "time" 10 | ) 11 | 12 | type Config struct { 13 | MaxMStoreSize int64 14 | BlockstorePath string 15 | NitroConfig nitro.Config 16 | Writers int 17 | } 18 | 19 | func DefaultConfig() Config { 20 | ncfg := nitro.DefaultConfig() 21 | ncfg.UseMemoryMgmt(mm.Malloc, mm.Free) 22 | 23 | return Config{ 24 | MaxMStoreSize: 1 * 1024 * 1024, 25 | BlockstorePath: ".", 26 | NitroConfig: ncfg, 27 | Writers: 8, 28 | } 29 | } 30 | 31 | type SuperNitro struct { 32 | sync.Mutex 33 | Config 34 | mstore *nitro.Nitro 35 | dstore *nitro.Nitro 36 | 37 | // Lower level immutable snapshots 38 | snaps []*nitro.Snapshot 39 | 40 | wrlist []*Writer 41 | 42 | isMergeRunning int32 43 | } 44 | 45 | func New() *SuperNitro { 46 | return &SuperNitro{ 47 | Config: DefaultConfig(), 48 | mstore: nitro.NewWithConfig(DefaultConfig().NitroConfig), 49 | } 50 | } 51 | 52 | func (m *SuperNitro) NewWriter() *Writer { 53 | m.Lock() 54 | defer m.Unlock() 55 | 56 | w := &Writer{ 57 | SuperNitro: m, 58 | mw: m.mstore.NewWriter(), 59 | } 60 | 61 | m.wrlist = append(m.wrlist, w) 62 | return w 63 | } 64 | 65 | 
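// A minimal end-to-end sketch of the SuperNitro API above (illustrative
// only; error handling elided, keys are arbitrary byte slices):
//
//	db := supernitro.New()
//	defer db.Close()
//	w := db.NewWriter()
//	w.Put([]byte("key1"))
//	snap, _ := db.NewSnapshot()
//	defer snap.Close()
//	itr := db.NewIterator(snap)
//	defer itr.Close()
//	for itr.SeekFirst(); itr.Valid(); itr.Next() {
//		_ = itr.Get()
//	}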
type Snapshot struct { 66 | snaps []*nitro.Snapshot 67 | } 68 | 69 | func (s *Snapshot) Open() bool { 70 | for i, snap := range s.snaps { 71 | if !snap.Open() { 72 | for x := 0; x < i; x++ { 73 | s.snaps[x].Close() 74 | } 75 | return false 76 | } 77 | } 78 | 79 | return true 80 | } 81 | 82 | func (s *Snapshot) Close() { 83 | for _, snap := range s.snaps { 84 | snap.Close() 85 | } 86 | } 87 | 88 | func (m *SuperNitro) NewIterator(snap *Snapshot) *Iterator { 89 | var iters []*nitro.Iterator 90 | for _, snap := range snap.snaps { 91 | iters = append(iters, snap.NewIterator()) 92 | } 93 | return newMergeIterator(iters) 94 | } 95 | 96 | func (m *SuperNitro) execMerge(msnap *nitro.Snapshot, store *nitro.Nitro) { 97 | fmt.Println("execMerge") 98 | go func() { 99 | defer func() { 100 | store.Close() 101 | if r := recover(); r != nil { 102 | panic(r) 103 | } 104 | }() 105 | 106 | if m.dstore == nil { 107 | dcfg := m.Config.NitroConfig 108 | dcfg.SetBlockStoreDir(m.Config.BlockstorePath) 109 | m.dstore = nitro.NewWithConfig(dcfg) 110 | } 111 | 112 | // Perform merge operation 113 | t0 := time.Now() 114 | stats, err := m.dstore.ApplyOps(msnap, m.Config.Writers) 115 | if err != nil { 116 | panic(err) 117 | } 118 | dur := time.Since(t0) 119 | fmt.Printf("\nexecMergeStats: took %v (%v items/sec)\n================\n%s\n\n", dur, float64(stats.ItemsInserted)/float64(dur.Seconds()), stats) 120 | dsnap, err := m.dstore.NewSnapshot() 121 | if err != nil { 122 | panic(err) 123 | } 124 | 125 | func() { 126 | m.Lock() 127 | defer m.Unlock() 128 | 129 | for _, snap := range m.snaps { 130 | snap.Close() 131 | } 132 | 133 | m.snaps = []*nitro.Snapshot{dsnap} 134 | atomic.CompareAndSwapInt32(&m.isMergeRunning, 1, 0) 135 | }() 136 | }() 137 | } 138 | 139 | func (m *SuperNitro) Sync() error { 140 | for !atomic.CompareAndSwapInt32(&m.isMergeRunning, 0, 1) { 141 | time.Sleep(time.Millisecond) 142 | } 143 | atomic.CompareAndSwapInt32(&m.isMergeRunning, 1, 0) 144 | 145 | snap, err := m.newSnapshot(0) 146 | if err == nil { 147 | snap.Close() 148 | } 149 | 150 | for !atomic.CompareAndSwapInt32(&m.isMergeRunning, 0, 1) { 151 | time.Sleep(time.Millisecond) 152 | } 153 | atomic.CompareAndSwapInt32(&m.isMergeRunning, 1, 0) 154 | return err 155 | } 156 | 157 | func (m *SuperNitro) NewSnapshot() (*Snapshot, error) { 158 | return m.newSnapshot(m.MaxMStoreSize) 159 | } 160 | 161 | func (m *SuperNitro) newSnapshot(MaxMStoreSize int64) (*Snapshot, error) { 162 | m.Lock() 163 | defer m.Unlock() 164 | 165 | snap := &Snapshot{} 166 | msnap, err := m.mstore.NewSnapshot() 167 | if err != nil { 168 | return nil, err 169 | } 170 | snaps := append([]*nitro.Snapshot{msnap}, m.snaps...) 
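	// The composite snapshot layers the fresh mstore snapshot over the
	// existing immutable lower-level snapshots (newest first); the loop
	// below takes an extra reference on each lower snapshot so that it
	// stays alive for the lifetime of this Snapshot.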
171 | for _, snap := range m.snaps { 172 | snap.Open() 173 | } 174 | snap.snaps = snaps 175 | 176 | fmt.Println("newsnap", m.mstore.MemoryInUse(), m.MaxMStoreSize, len(m.snaps)) 177 | if m.mstore.MemoryInUse() > MaxMStoreSize && atomic.CompareAndSwapInt32(&m.isMergeRunning, 0, 1) { 178 | msnap.Open() 179 | m.snaps = snaps 180 | mstoreOld := m.mstore 181 | m.mstore = nitro.NewWithConfig(m.Config.NitroConfig) 182 | for _, wr := range m.wrlist { 183 | wr.mw = m.mstore.NewWriter() 184 | } 185 | m.execMerge(msnap, mstoreOld) 186 | } 187 | 188 | return snap, nil 189 | } 190 | 191 | func (m *SuperNitro) Close() { 192 | for _, snap := range m.snaps { 193 | snap.Close() 194 | } 195 | m.mstore.Close() 196 | if m.dstore != nil { 197 | m.dstore.Close() 198 | } 199 | } 200 | 201 | type Writer struct { 202 | *SuperNitro 203 | mw *nitro.Writer 204 | } 205 | 206 | func (w *Writer) Put(bs []byte) bool { 207 | return w.mw.Put2(bs) != nil 208 | } 209 | 210 | func (w *Writer) Delete(bs []byte) bool { 211 | return w.mw.DeleteNonExist(bs) 212 | } 213 | -------------------------------------------------------------------------------- /iterator.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2016 Couchbase, Inc. 2 | // Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file 3 | // except in compliance with the License. You may obtain a copy of the License at 4 | // http://www.apache.org/licenses/LICENSE-2.0 5 | // Unless required by applicable law or agreed to in writing, software distributed under the 6 | // License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 7 | // either express or implied. See the License for the specific language governing permissions 8 | // and limitations under the License. 9 | 10 | package nitro 11 | 12 | import ( 13 | "github.com/t3rm1n4l/nitro/skiplist" 14 | "unsafe" 15 | ) 16 | 17 | // Iterator implements Nitro snapshot iterator 18 | type Iterator struct { 19 | count int 20 | refreshRate int 21 | 22 | snap *Snapshot 23 | iter *skiplist.Iterator 24 | buf *skiplist.ActionBuffer 25 | 26 | blockBuf []byte 27 | 28 | block dataBlock 29 | curr []byte 30 | 31 | endItm *Item 32 | } 33 | 34 | func (it *Iterator) skipItem(ptr unsafe.Pointer) bool { 35 | itm := (*Item)(ptr) 36 | if ptr != skiplist.MaxItem && itm.bornSn > it.snap.sn { 37 | return true 38 | } 39 | 40 | return false 41 | } 42 | 43 | func (it *Iterator) skipUnwanted() { 44 | loop: 45 | if !it.iter.Valid() { 46 | return 47 | } 48 | itm := (*Item)(it.iter.Get()) 49 | if itm.bornSn > it.snap.sn || (itm.deadSn > 0 && itm.deadSn <= it.snap.sn) { 50 | it.iter.Next() 51 | it.count++ 52 | goto loop 53 | } 54 | } 55 | 56 | func (it *Iterator) loadItems() { 57 | if it.snap.db.HasBlockStore() && it.iter.Valid() { 58 | n := it.GetNode() 59 | if err := it.snap.db.bm.ReadBlock(blockPtr(n.DataPtr), it.blockBuf); err != nil { 60 | panic(err) 61 | } 62 | 63 | it.block = *newDataBlock(it.blockBuf) 64 | it.curr = it.block.Get() 65 | } 66 | } 67 | 68 | // SeekFirst moves cursor to the beginning 69 | func (it *Iterator) SeekFirst() { 70 | it.iter.SeekFirst() 71 | it.skipUnwanted() 72 | it.loadItems() 73 | } 74 | 75 | // Seek to a specified key or the next bigger one if an item with key does not 76 | // exist. 
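// exist. A nil key is treated as a request to seek to the beginning.
//
// Illustrative usage (snapshot and keys are assumed to exist):
//
//	itr := db.NewIterator(snap)
//	itr.Seek([]byte("user-100")) // lands on "user-100" or the next larger key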
77 | func (it *Iterator) Seek(bs []byte) {
78 | 	if bs == nil {
79 | 		it.SeekFirst()
80 | 		return
81 | 	}
82 | 
83 | 	itm := it.snap.db.newItem(bs, false)
84 | 	if it.snap.db.HasBlockStore() {
85 | 		it.iter.SeekPrev(unsafe.Pointer(itm), it.skipItem)
86 | 		it.skipUnwanted()
87 | 		it.loadItems()
88 | 		for ; it.curr != nil && it.snap.db.keyCmp(it.curr, bs) < 0; it.curr = it.block.Get() {
89 | 		}
90 | 
91 | 		if it.curr == nil {
92 | 			it.Next()
93 | 		}
94 | 	} else {
95 | 		it.iter.Seek(unsafe.Pointer(itm))
96 | 		it.skipUnwanted()
97 | 	}
98 | }
99 | 
// SetEnd sets an exclusive end key; the iterator becomes invalid at or past it.
100 | func (it *Iterator) SetEnd(bs []byte) {
101 | 	if len(bs) > 0 {
102 | 		it.endItm = it.snap.db.newItem(bs, false)
103 | 	}
104 | }
105 | 
106 | // Valid returns false when the iterator has reached the end.
107 | func (it *Iterator) Valid() bool {
108 | 	if it.iter.Valid() {
109 | 		if it.endItm != nil && it.snap.db.iterCmp(it.iter.Get(), unsafe.Pointer(it.endItm)) >= 0 {
110 | 			return false
111 | 		}
112 | 		return true
113 | 	}
114 | 
115 | 	return false
116 | }
117 | 
118 | // Get returns the current item data from the iterator.
119 | func (it *Iterator) Get() []byte {
120 | 	if it.snap.db.HasBlockStore() {
121 | 		return it.curr
122 | 	}
123 | 	return (*Item)(it.iter.Get()).Bytes()
124 | }
125 | 
126 | // GetNode returns the current skiplist node which holds the current item.
127 | func (it *Iterator) GetNode() *skiplist.Node {
128 | 	return it.iter.GetNode()
129 | }
130 | 
131 | // Next moves iterator cursor to the next item
132 | func (it *Iterator) Next() {
133 | 	if it.snap.db.HasBlockStore() && it.iter.Valid() {
134 | 		if it.curr = it.block.Get(); it.curr != nil {
135 | 			return
136 | 		}
137 | 	}
138 | 
139 | 	it.iter.Next()
140 | 	it.count++
141 | 	it.skipUnwanted()
142 | 	if it.refreshRate > 0 && it.count > it.refreshRate {
143 | 		it.Refresh()
144 | 		it.count = 0
145 | 	}
146 | 	it.loadItems()
147 | }
148 | 
149 | // Refresh is a helper API to refresh SMR accessor tokens manually.
150 | // This would enable SMR to reclaim objects faster if an iterator is
151 | // alive for a longer duration of time.
152 | func (it *Iterator) Refresh() {
153 | 	if it.Valid() {
154 | 		itm := it.snap.db.ptrToItem(it.GetNode().Item())
155 | 		it.iter.Close()
156 | 		it.iter = it.snap.db.store.NewIterator(it.snap.db.iterCmp, it.buf)
157 | 		it.iter.Seek(unsafe.Pointer(itm))
158 | 	}
159 | }
160 | 
161 | // SetRefreshRate sets automatic refresh frequency. By default, it is unlimited.
162 | // If this is set, the iterator SMR accessor will be refreshed
163 | // after every `rate` items.
164 | func (it *Iterator) SetRefreshRate(rate int) {
165 | 	it.refreshRate = rate
166 | }
167 | 
168 | // Close executes destructor for iterator
169 | func (it *Iterator) Close() {
170 | 	it.snap.Close()
171 | 	it.snap.db.store.FreeBuf(it.buf)
172 | 	it.iter.Close()
173 | }
174 | 
175 | // NewIterator creates an iterator for a Nitro snapshot
176 | func (m *Nitro) NewIterator(snap *Snapshot) *Iterator {
177 | 	if !snap.Open() {
178 | 		return nil
179 | 	}
180 | 	buf := snap.db.store.MakeBuf()
181 | 	it := &Iterator{
182 | 		snap: snap,
183 | 		iter: m.store.NewIterator(m.iterCmp, buf),
184 | 		buf:  buf,
185 | 	}
186 | 
187 | 	if snap.db.HasBlockStore() {
188 | 		it.blockBuf = make([]byte, blockSize, blockSize)
189 | 	}
190 | 
191 | 	return it
192 | }
193 | 
--------------------------------------------------------------------------------
/skiplist/skiplist_test.go:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2016 Couchbase, Inc.
2 | // Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file 3 | // except in compliance with the License. You may obtain a copy of the License at 4 | // http://www.apache.org/licenses/LICENSE-2.0 5 | // Unless required by applicable law or agreed to in writing, software distributed under the 6 | // License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 7 | // either express or implied. See the License for the specific language governing permissions 8 | // and limitations under the License. 9 | package skiplist 10 | 11 | import "testing" 12 | import "fmt" 13 | import "math/rand" 14 | import "runtime" 15 | import "sync" 16 | import "time" 17 | import "unsafe" 18 | 19 | func TestInsert(t *testing.T) { 20 | s := New() 21 | cmp := CompareBytes 22 | buf := s.MakeBuf() 23 | defer s.FreeBuf(buf) 24 | 25 | for i := 0; i < 2000; i++ { 26 | s.Insert(NewByteKeyItem([]byte(fmt.Sprintf("%010d", i))), cmp, buf, &s.Stats) 27 | } 28 | 29 | for i := 1750; i < 2000; i++ { 30 | s.Delete(NewByteKeyItem([]byte(fmt.Sprintf("%010d", i))), cmp, buf, &s.Stats) 31 | } 32 | 33 | itr := s.NewIterator(cmp, buf) 34 | count := 0 35 | itr.SeekFirst() 36 | itr.Seek(NewByteKeyItem([]byte(fmt.Sprintf("%010d", 1500)))) 37 | for ; itr.Valid(); itr.Next() { 38 | expected := fmt.Sprintf("%010d", count+1500) 39 | got := string(*(*byteKeyItem)(itr.Get())) 40 | count++ 41 | if got != expected { 42 | t.Errorf("Expected %s, got %v", expected, got) 43 | } 44 | } 45 | 46 | if count != 250 { 47 | t.Errorf("Expected count = 250, got %v", count) 48 | } 49 | } 50 | 51 | func doInsert(sl *Skiplist, wg *sync.WaitGroup, n int, isRand bool) { 52 | defer wg.Done() 53 | buf := sl.MakeBuf() 54 | defer sl.FreeBuf(buf) 55 | 56 | cmp := CompareInt 57 | rnd := rand.New(rand.NewSource(int64(rand.Int()))) 58 | for i := 0; i < n; i++ { 59 | var val int 60 | if isRand { 61 | val = rnd.Int() 62 | } else { 63 | val = i 64 | } 65 | 66 | itm := intKeyItem(val) 67 | sl.Insert2(unsafe.Pointer(&itm), cmp, nil, buf, rnd.Float32, &sl.Stats) 68 | } 69 | } 70 | 71 | func doGet(sl *Skiplist, wg *sync.WaitGroup, n int) { 72 | defer wg.Done() 73 | rnd := rand.New(rand.NewSource(int64(rand.Int()))) 74 | cmp := CompareInt 75 | buf := sl.MakeBuf() 76 | defer sl.FreeBuf(buf) 77 | 78 | itr := sl.NewIterator(cmp, buf) 79 | for i := 0; i < n; i++ { 80 | val := rnd.Int() % n 81 | itm := intKeyItem(val) 82 | itr.Seek(unsafe.Pointer(&itm)) 83 | } 84 | 85 | } 86 | 87 | func TestInsertPerf(t *testing.T) { 88 | var wg sync.WaitGroup 89 | sl := New() 90 | n := 1000000 91 | t0 := time.Now() 92 | total := n * runtime.GOMAXPROCS(0) 93 | for i := 0; i < runtime.GOMAXPROCS(0); i++ { 94 | wg.Add(1) 95 | go doInsert(sl, &wg, n, true) 96 | } 97 | wg.Wait() 98 | 99 | dur := time.Since(t0) 100 | 101 | fmt.Printf("%d items took %v -> %v items/s conflicts %v\n", total, dur, float64(total)/float64(dur.Seconds()), sl.GetStats().InsertConflicts) 102 | } 103 | 104 | func TestGetPerf(t *testing.T) { 105 | var wg sync.WaitGroup 106 | sl := New() 107 | n := 1000000 108 | wg.Add(1) 109 | go doInsert(sl, &wg, n, false) 110 | wg.Wait() 111 | 112 | t0 := time.Now() 113 | total := n * runtime.GOMAXPROCS(0) 114 | for i := 0; i < runtime.GOMAXPROCS(0); i++ { 115 | wg.Add(1) 116 | go doGet(sl, &wg, n) 117 | } 118 | wg.Wait() 119 | dur := time.Since(t0) 120 | fmt.Printf("%d items took %v -> %v items/s\n", total, dur, float64(total)/float64(dur.Seconds())) 121 | 122 | } 123 | 124 | func TestGetRangeSplitItems(t *testing.T) { 125 | var wg 
sync.WaitGroup
126 | 	sl := New()
127 | 	n := 1000000
128 | 	wg.Add(1)
129 | 	go doInsert(sl, &wg, n, false)
130 | 	wg.Wait()
131 | 
132 | 	fmt.Println(sl.GetStats())
133 | 
134 | 	var keys []int
135 | 	var diff []int
136 | 	var curr int
137 | 	for i, itm := range sl.GetRangeSplitItems(8) {
138 | 		k := int(*(*intKeyItem)(itm))
139 | 		keys = append(keys, k)
140 | 		diff = append(diff, keys[i]-curr)
141 | 		curr = keys[i]
142 | 	}
143 | 
144 | 	diff = append(diff, n-keys[len(keys)-1])
145 | 
146 | 	fmt.Println("Split range keys", keys)
147 | 	fmt.Println("No of items in each range", diff)
148 | }
149 | 
150 | func TestBuilder(t *testing.T) {
151 | 	var wg sync.WaitGroup
152 | 
153 | 	n := 50000000
154 | 	nsplit := 8
155 | 	segs := make([]*Segment, nsplit)
156 | 	t0 := time.Now()
157 | 	b := NewBuilder()
158 | 	for i := 0; i < nsplit; i++ {
159 | 		segs[i] = b.NewSegment()
160 | 	}
161 | 
162 | 	perSplit := n / nsplit
163 | 	for i := 0; i < nsplit; i++ {
164 | 		wg.Add(1)
165 | 		go func(wg *sync.WaitGroup, shard int) {
166 | 			defer wg.Done()
167 | 			for x := 0; x < perSplit; x++ {
168 | 				itm := intKeyItem(perSplit*shard + x)
169 | 				segs[shard].Add(unsafe.Pointer(&itm))
170 | 			}
171 | 		}(&wg, i)
172 | 	}
173 | 
174 | 	wg.Wait()
175 | 
176 | 	sl := b.Assemble(segs...)
177 | 	fmt.Println(sl.GetStats())
178 | 	dur := time.Since(t0)
179 | 	fmt.Printf("Took %v to build %d items, %v items/sec\n", dur, n, float32(n)/float32(dur.Seconds()))
180 | 	buf := sl.MakeBuf()
181 | 	defer sl.FreeBuf(buf)
182 | 	count := 0
183 | 
184 | 	t0 = time.Now()
185 | 	itr := sl.NewIterator(CompareInt, buf)
186 | 	for itr.SeekFirst(); itr.Valid(); itr.Next() {
187 | 		if int(*(*intKeyItem)(itr.Get())) != count {
188 | 			t.Errorf("Expected %d, got %d", count, int(*(*intKeyItem)(itr.Get())))
189 | 		}
190 | 		count++
191 | 	}
192 | 	fmt.Printf("Took %v to iterate %d items\n", time.Since(t0), n)
193 | 
194 | 	if count != n {
195 | 		t.Errorf("Expected %d, got %d", n, count)
196 | 	}
197 | 
198 | }
199 | 
--------------------------------------------------------------------------------
/skiplist/C/skiplist.hh:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2016 Couchbase, Inc.
2 | // Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
3 | // except in compliance with the License. You may obtain a copy of the License at
4 | //   http://www.apache.org/licenses/LICENSE-2.0
5 | // Unless required by applicable law or agreed to in writing, software distributed under the
6 | // License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
7 | // either express or implied. See the License for the specific language governing permissions
8 | // and limitations under the License.
9 | #ifndef SKIPLIST_H
10 | #define SKIPLIST_H
11 | 
12 | #include <stdio.h>
13 | #include <stdlib.h>
14 | #include <string.h>
15 | #include <limits.h>
16 | #include <stdint.h>
17 | #include <time.h>
18 | #include <algorithm>
19 | 
20 | using namespace std;
21 | 
22 | static int MaxLevel = 32;
23 | static float p = 0.25;
24 | 
25 | void *skiplist_malloc(size_t sz) {
26 |     return malloc(sz);
27 | }
28 | 
29 | void skiplist_free(void *p) {
30 |     free(p);
31 | }
32 | 
33 | struct Node;
34 | 
35 | typedef struct NodeRef {
36 |     struct Node *ptr;
37 |     bool deleted;
38 | } NodeRef;
39 | 
40 | typedef struct Item {
41 |     int l;
42 |     void *data;
43 | } Item;
44 | 
45 | typedef struct Node {
46 |     NodeRef * volatile *next;
47 |     Item *itm;
48 |     uint16_t level;
49 | } Node;
50 | 
51 | typedef struct Skiplist {
52 |     Node *head;
53 |     Node *tail;
54 |     uint16_t level;
55 |     unsigned int randSeed;
56 | } Skiplist;
57 | 
58 | Item *newItem(void *data, int l) {
59 |     Item *i = (Item *) skiplist_malloc(sizeof(Item));
60 |     i->data = data;
61 |     i->l = l;
62 |     return i;
63 | }
64 | 
65 | int Item_Compare(Item *itm1, Item *itm2) {
66 |     if (itm2 == NULL || itm2->l == INT_MAX) {
67 |         return 1;
68 |     }
69 | 
70 |     if (itm1->l == INT_MIN) {
71 |         return -1;
72 |     }
73 | 
74 |     if (itm1->l == INT_MAX) {
75 |         return 1;
76 |     }
77 | 
78 |     int l = min(itm1->l, itm2->l);
79 |     return memcmp(itm1->data, itm2->data, l);
80 | }
81 | 
82 | NodeRef *newRef(Node *ptr, bool deleted) {
83 |     NodeRef *n = (NodeRef *) skiplist_malloc(sizeof(NodeRef));
84 |     n->ptr = ptr;
85 |     n->deleted = deleted;
86 |     return n;
87 | }
88 | 
89 | Node *newNode(Item *itm, int level) {
90 |     Node *n = (Node *) skiplist_malloc(sizeof(Node));
91 |     n->level = (uint16_t) level;
92 |     n->itm = itm;
93 |     n->next = (NodeRef **) skiplist_malloc(sizeof(NodeRef *) * (level + 1)); /* level+1 link slots (levels 0..level) */
    memset((void *) n->next, 0, sizeof(NodeRef *) * (level + 1)); /* unlinked levels must read as NULL */
94 | 
95 |     return n;
96 | }
97 | 
98 | void Node_setNext(Node *n, int level, Node *ptr, bool deleted) {
99 |     n->next[level] = newRef(ptr, deleted);
100 | }
101 | 
102 | NodeRef Node_getNext(Node *n, int level) {
103 |     NodeRef null = {NULL, false};
104 |     NodeRef *ref = (NodeRef *) __atomic_load_n(&n->next[level], __ATOMIC_RELAXED);
105 |     if (ref != NULL) {
106 |         return *ref;
107 |     }
108 | 
109 |     return null;
110 | }
111 | 
112 | bool Node_dcasNext(Node *n, int level, Node *prevPtr, Node *newPtr,
113 |                    bool prevIsdeleted, bool newIsdeleted) {
114 | 
115 |     bool swapped = false;
116 |     NodeRef * volatile *addr = &n->next[level];
117 |     NodeRef *ref = (NodeRef *) __atomic_load_n(addr, __ATOMIC_RELAXED);
118 | 
119 |     if (ref != NULL) {
120 |         if (ref->ptr == prevPtr && ref->deleted == prevIsdeleted) {
121 |             swapped = __sync_bool_compare_and_swap(addr, ref, newRef(newPtr, newIsdeleted));
122 |         }
123 |     }
124 | 
125 |     return swapped;
126 | }
127 | 
128 | Skiplist *newSkiplist() {
129 |     Skiplist *s;
130 |     Item *minItem, *maxItem;
131 |     Node *head, *tail;
132 | 
133 |     srand(time(NULL));
134 | 
135 |     minItem = newItem(NULL, INT_MIN);
136 |     maxItem = newItem(NULL, INT_MAX);
137 | 
138 |     head = newNode(minItem, MaxLevel);
139 |     tail = newNode(maxItem, MaxLevel);
140 | 
141 |     for (int i=0; i <= MaxLevel; i++) {
142 |         Node_setNext(head, i, tail, false);
143 |     }
144 | 
145 |     s = (Skiplist *) skiplist_malloc(sizeof(Skiplist));
146 |     s->head = head;
147 |     s->tail = tail;
148 |     s->level = 0;
    s->randSeed = (unsigned int) time(NULL); /* rand_r needs an initialized seed */
149 | 
150 |     return s;
151 | }
152 | 
153 | float Skiplist_randFloat(Skiplist *s) {
154 |     return (float)rand_r(&s->randSeed) / (float)RAND_MAX;
155 | }
156 | 
157 | int Skiplist_randomLevel(Skiplist *s) {
158 |     int nextLevel = 0;
159 |     int level;
160 | 
161 |     for (; Skiplist_randFloat(s) < p; nextLevel++) {
162 |     }
163 | 
164 |     if (nextLevel > MaxLevel) {
165 | 
nextLevel = MaxLevel;
166 |     }
167 | 
168 |     level = (int) __atomic_load_n(&s->level, __ATOMIC_RELAXED);
169 |     if (nextLevel > level) {
170 |         __sync_bool_compare_and_swap(&s->level, level, level+1);
171 |         nextLevel = level + 1;
172 |     }
173 |     return nextLevel;
174 | }
175 | 
176 | bool Skiplist_findPath(Skiplist *s, Item *itm, Node *preds[], Node *succs[]) {
177 |     int cmpVal = 1;
178 |     int level;
179 |     Node *prev, *curr;
180 |     NodeRef curRef, nextRef;
181 | 
182 | retry:
183 |     prev = s->head;
184 |     level = (int) __atomic_load_n(&s->level, __ATOMIC_RELAXED);
185 |     for (int i=level; i>=0; i--) {
186 |         curRef = Node_getNext(prev, i);
187 | levelSearch:
188 |         while (1) {
189 |             curr = curRef.ptr;
190 |             nextRef = Node_getNext(curr, i);
191 | 
192 |             cmpVal = Item_Compare(curr->itm, itm);
193 |             if (cmpVal < 0) {
194 |                 prev = curr;
195 |                 curRef = Node_getNext(prev, i);
196 |                 curr = curRef.ptr;
197 |             } else {
198 |                 break;
199 |             }
200 |         }
201 | 
202 |         preds[i] = prev;
203 |         succs[i] = curr;
204 |     }
205 | 
206 |     if (cmpVal == 0) {
207 |         return true;
208 |     }
209 | 
210 |     return false;
211 | }
212 | 
213 | 
214 | void Skiplist_Insert(Skiplist *s, Item *itm) {
215 |     int itemLevel = Skiplist_randomLevel(s);
216 |     Node *x = newNode(itm, itemLevel);
217 |     Node *preds[MaxLevel + 1], *succs[MaxLevel + 1]; /* levels run 0..MaxLevel inclusive */
218 | 
219 | retry:
220 |     Skiplist_findPath(s, itm, preds, succs);
221 | 
222 |     Node_setNext(x, 0, succs[0], false);
223 |     if (!Node_dcasNext(preds[0], 0, succs[0], x, false, false)) {
224 |         goto retry;
225 |     }
226 | 
227 |     for (int i=1; i <= itemLevel; i++) {
228 | fixThisLevel:
229 |         while (1) {
230 |             Node_setNext(x, i, succs[i], false);
231 |             if (Node_dcasNext(preds[i], i, succs[i], x, false, false)) {
232 |                 break;
233 |             }
234 |             Skiplist_findPath(s, itm, preds, succs);
235 |         }
236 |     }
237 | }
238 | 
239 | 
240 | #endif
241 | 
--------------------------------------------------------------------------------
/supernitro/supernitro_test.go:
--------------------------------------------------------------------------------
1 | package supernitro
2 | 
3 | import (
4 | 	"bytes"
5 | 	"encoding/binary"
6 | 	"fmt"
7 | 	"math/rand"
8 | 	"runtime"
9 | 	"sync"
10 | 	"testing"
11 | 	"time"
12 | )
13 | 
14 | /*
15 | 
16 | func TestInsert(t *testing.T) {
17 | 	db := New()
18 | 	defer db.Close()
19 | 
20 | 	w := db.NewWriter()
21 | 	for i := 0; i < 2000000; i++ {
22 | 		w.Put([]byte(fmt.Sprintf("%010d", i)))
23 | 	}
24 | 
25 | 	for i := 1750; i < 2000000; i++ {
26 | 		w.Delete([]byte(fmt.Sprintf("%010d", i)))
27 | 	}
28 | 	snap, _ := w.NewSnapshot()
29 | 	defer snap.Close()
30 | 
31 | 	for i := 2000; i < 2000000; i++ {
32 | 		w.Put([]byte(fmt.Sprintf("%010d", i)))
33 | 	}
34 | 
35 | 	snap2, _ := w.NewSnapshot()
36 | 	defer snap2.Close()
37 | 
38 | 	var snp *Snapshot
39 | 	for i := 0; i < 200; i++ {
40 | 		x, _ := w.NewSnapshot()
41 | 		for m := 0; m < 1000000; m++ {
42 | 			w.Put([]byte(fmt.Sprintf("%010d", i*1000000+m)))
43 | 		}
44 | 		//x.Close()
45 | 		snp = x
46 | 	}
47 | 
48 | 	itr := db.NewIterator(snp)
49 | 	count := 0
50 | 	for itr.SeekFirst(); itr.Valid(); itr.Next() {
51 | 		count++
52 | 
53 | 	}
54 | 	fmt.Println("count", count)
55 | 
56 | }
57 | 
58 | */
59 | 
60 | func TestInsert(t *testing.T) {
61 | 	db := New()
62 | 	defer db.Close()
63 | 
64 | 	w := db.NewWriter()
65 | 	for i := 0; i < 2000000; i++ {
66 | 		w.Put([]byte(fmt.Sprintf("%010d", i)))
67 | 	}
68 | 
69 | 	for i := 1750000; i < 2000000; i++ {
70 | 		w.Delete([]byte(fmt.Sprintf("%010d", i)))
71 | 	}
72 | 	snap, _ := w.NewSnapshot()
73 | 	defer snap.Close()
74 | 
75 | 	for i := 2000000; i < 5000000; i++ {
76 | 		w.Put([]byte(fmt.Sprintf("%010d", i)))
77 | 	}
78 | 
79 | 	snap2, _ := w.NewSnapshot()
80 | 	defer snap2.Close()
81 | 
82 | 	count := 0
83 | 	itr := db.NewIterator(snap)
84 | 	defer itr.Close()
85 | 
86 | 	itr.SeekFirst()
87 | 	itr.Seek([]byte(fmt.Sprintf("%010d", 1500000)))
88 | 	for ; itr.Valid(); itr.Next() {
89 | 		expected := fmt.Sprintf("%010d", count+1500000)
90 | 		got := string(itr.Get())
91 | 		count++
92 | 		if got != expected {
93 | 			t.Errorf("Expected %s, got %v", expected, got)
94 | 		}
95 | 	}
96 | 
97 | 	if count != 250000 {
98 | 		t.Errorf("Expected count = 250000, got %v", count)
99 | 	}
100 | }
101 | 
102 | func doInsert(id int, db *SuperNitro, ch chan bool, wg *sync.WaitGroup, n int, isRand bool, shouldSnap bool) {
103 | 	defer wg.Done()
104 | 	w := db.NewWriter()
105 | 	rnd := rand.New(rand.NewSource(int64(rand.Int())))
106 | 	for i := 0; i < n; i++ {
107 | 		var val int
108 | 		if isRand {
109 | 			val = rnd.Int()%1000000000 + id*10000000000
110 | 		} else {
111 | 			val = i + id*n
112 | 		}
113 | 		if shouldSnap && i%100000 == 0 {
114 | 			ch <- true
115 | 			<-ch
116 | 		}
117 | 		buf := make([]byte, 8)
118 | 		binary.BigEndian.PutUint64(buf, uint64(val)) // big-endian keeps byte order consistent with numeric order; readers below decode with BigEndian
119 | 		w.Put(buf)
120 | 	}
121 | }
122 | 
123 | func TestInsertPerf(t *testing.T) {
124 | 	var wg sync.WaitGroup
125 | 	db := New()
126 | 	defer db.Close()
127 | 
128 | 	workers := 8
129 | 	n := 20000000 / workers
130 | 	t0 := time.Now()
131 | 	total := n * workers
132 | 	ch := make([]chan bool, workers)
133 | 	for i := 0; i < workers; i++ {
134 | 		ch[i] = make(chan bool)
135 | 	}
136 | 	go func() {
137 | 
138 | 		for {
139 | 			for i := 0; i < workers; i++ {
140 | 				<-ch[i]
141 | 			}
142 | 			snap, _ := db.NewSnapshot()
143 | 			snap.Close()
144 | 
145 | 			for i := 0; i < workers; i++ {
146 | 				ch[i] <- true
147 | 			}
148 | 		}
149 | 	}()
150 | 
151 | 	for i := 0; i < workers; i++ {
152 | 		wg.Add(1)
153 | 		go doInsert(i, db, ch[i], &wg, n, false, true)
154 | 	}
155 | 	wg.Wait()
156 | 
157 | 	snap, _ := db.NewSnapshot()
158 | 	db.Sync()
159 | 	dur := time.Since(t0)
160 | 	fmt.Printf("%d items took %v -> %v items/s\n",
161 | 		total, dur, float64(total)/float64(dur.Seconds()))
162 | 
163 | 	itr := db.NewIterator(snap)
164 | 	fmt.Println("snap", snap)
165 | 	c := 0
166 | 	x := uint64(0)
167 | 	for itr.SeekFirst(); itr.Valid(); itr.Next() {
168 | 		v := binary.BigEndian.Uint64(itr.Get())
169 | 		if v < x {
170 | 			//panic(fmt.Sprint(x, v))
171 | 			fmt.Println("bad", x, v)
172 | 		}
173 | 		// fmt.Println(v)
174 | 		x = v
175 | 		c++
176 | 	}
177 | 	fmt.Println("count", c)
178 | 
179 | 	/*
180 | 		itr1 := snap.snaps[0].NewIterator()
181 | 		for itr1.SeekFirst(); itr1.Valid(); itr1.Next() {
182 | 			fmt.Println("first", binary.BigEndian.Uint64(itr1.Get()))
183 | 		}
184 | 		fmt.Println("")
185 | 
186 | 		itr1 = snap.snaps[1].NewIterator()
187 | 		for itr1.SeekFirst(); itr1.Valid(); itr1.Next() {
188 | 			fmt.Println("second", binary.BigEndian.Uint64(itr1.Get()))
189 | 		}
190 | 	*/
191 | 	itr.Close()
192 | 	snap.Close()
193 | }
194 | 
195 | func doGet(t *testing.T, db *SuperNitro, snap *Snapshot, wg *sync.WaitGroup, n int) {
196 | 	defer wg.Done()
197 | 	rnd := rand.New(rand.NewSource(int64(rand.Int())))
198 | 
199 | 	buf := make([]byte, 8)
200 | 	itr := db.NewIterator(snap)
201 | 	defer itr.Close()
202 | 	for i := 0; i < n; i++ {
203 | 		val := rnd.Int() % n
204 | 		//binary.BigEndian.PutUint64(buf, uint64(val))
205 | 		//exp := fmt.Sprintf("%010d", val)
206 | 		//itr.Seek(buf)
207 | 		itr.SeekFirst()
208 | 		if !itr.Valid() {
209 | 			t.Errorf("Expected to find %v", val)
210 | 		}
211 | 		if !bytes.Equal(buf, itr.Get()) {
212 | 			panic(string(itr.Get()))
213 | 		}
214 | 	}
215 | }
216 | 
217 | func TestGetPerf(t *testing.T) {
218 | 	var wg
sync.WaitGroup 219 | db := New() 220 | defer db.Close() 221 | n := 1000 222 | wg.Add(1) 223 | go doInsert(0, db, make(chan bool), &wg, n, false, false) 224 | wg.Wait() 225 | snap, _ := db.NewSnapshot() 226 | defer snap.Close() 227 | 228 | fmt.Println("built index") 229 | 230 | t0 := time.Now() 231 | total := n * runtime.GOMAXPROCS(0) 232 | //for i := 0; i < runtime.GOMAXPROCS(0); i++ { 233 | wg.Add(1) 234 | // go doGet(t, db, snap, &wg, n) 235 | //} 236 | //wg.Wait() 237 | doGet(t, db, snap, &wg, n) 238 | dur := time.Since(t0) 239 | fmt.Printf("%d items took %v -> %v items/s\n", total, dur, float64(total)/float64(dur.Seconds())) 240 | } 241 | 242 | func TestSimpleGet(t *testing.T) { 243 | db := New() 244 | w := db.NewWriter() 245 | 246 | n := 1000000 247 | buf := make([]byte, 8) 248 | for i := 0; i < n; i++ { 249 | binary.BigEndian.PutUint64(buf, uint64(i)) 250 | w.Put(buf) 251 | if i%100000 == 0 { 252 | snap, _ := w.NewSnapshot() 253 | snap.Close() 254 | time.Sleep(time.Second) 255 | } 256 | } 257 | 258 | snap, _ := w.NewSnapshot() 259 | itr := db.NewIterator(snap) 260 | 261 | for i := 0; i < n; i++ { 262 | binary.BigEndian.PutUint64(buf, uint64(i)) 263 | itr.Seek(buf) 264 | if !itr.Valid() { 265 | t.Errorf("invalid %v buf:%v", i, buf) 266 | continue 267 | } 268 | 269 | x := binary.BigEndian.Uint64(itr.Get()) 270 | if uint64(i) != x { 271 | t.Errorf("failed to lookup %v, got %v", i, x) 272 | } 273 | } 274 | } 275 | -------------------------------------------------------------------------------- /skiplist/node_alloc_amd64.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2016 Couchbase, Inc. 2 | // Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file 3 | // except in compliance with the License. You may obtain a copy of the License at 4 | // http://www.apache.org/licenses/LICENSE-2.0 5 | // Unless required by applicable law or agreed to in writing, software distributed under the 6 | // License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 7 | // either express or implied. See the License for the specific language governing permissions 8 | // and limitations under the License. 
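// This file enumerates one concrete struct type per possible skiplist node
// level (0..32). Each variant carries the fixed node header (itm, gc, dp)
// followed by a (level+1)-sized NodeRef array, so allocNode at the bottom of
// this file can pick the exact allocation size for a node of a given level
// via nodeTypes[level].Size(), either from the Go heap (reflect.New) or from
// a user-supplied malloc. A rough sketch of the size relationship
// (illustrative only):
//
//	// size(node_k) == size(header) + (k+1)*size(NodeRef)
//	sz := int(nodeTypes[level].Size())
//
// Because every field is a fixed-size native type, these allocations keep
// the node table cheap for the garbage collector to scan.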
9 | 10 | package skiplist 11 | 12 | import ( 13 | "reflect" 14 | "unsafe" 15 | ) 16 | 17 | var nodeTypes = [33]reflect.Type{ 18 | reflect.TypeOf(node0), 19 | reflect.TypeOf(node1), 20 | reflect.TypeOf(node2), 21 | reflect.TypeOf(node3), 22 | reflect.TypeOf(node4), 23 | reflect.TypeOf(node5), 24 | reflect.TypeOf(node6), 25 | reflect.TypeOf(node7), 26 | reflect.TypeOf(node8), 27 | reflect.TypeOf(node9), 28 | reflect.TypeOf(node10), 29 | reflect.TypeOf(node11), 30 | reflect.TypeOf(node12), 31 | reflect.TypeOf(node13), 32 | reflect.TypeOf(node14), 33 | reflect.TypeOf(node15), 34 | reflect.TypeOf(node16), 35 | reflect.TypeOf(node17), 36 | reflect.TypeOf(node18), 37 | reflect.TypeOf(node19), 38 | reflect.TypeOf(node20), 39 | reflect.TypeOf(node21), 40 | reflect.TypeOf(node22), 41 | reflect.TypeOf(node23), 42 | reflect.TypeOf(node24), 43 | reflect.TypeOf(node25), 44 | reflect.TypeOf(node26), 45 | reflect.TypeOf(node27), 46 | reflect.TypeOf(node28), 47 | reflect.TypeOf(node29), 48 | reflect.TypeOf(node30), 49 | reflect.TypeOf(node31), 50 | reflect.TypeOf(node32), 51 | } 52 | 53 | var node0 struct { 54 | itm unsafe.Pointer 55 | gc unsafe.Pointer 56 | dp uint64 57 | buf [1]NodeRef 58 | } 59 | 60 | var node1 struct { 61 | itm unsafe.Pointer 62 | gc unsafe.Pointer 63 | dp uint64 64 | buf [2]NodeRef 65 | } 66 | 67 | var node2 struct { 68 | itm unsafe.Pointer 69 | gc unsafe.Pointer 70 | dp uint64 71 | buf [3]NodeRef 72 | } 73 | 74 | var node3 struct { 75 | itm unsafe.Pointer 76 | gc unsafe.Pointer 77 | dp uint64 78 | buf [4]NodeRef 79 | } 80 | 81 | var node4 struct { 82 | itm unsafe.Pointer 83 | gc unsafe.Pointer 84 | dp uint64 85 | buf [5]NodeRef 86 | } 87 | 88 | var node5 struct { 89 | itm unsafe.Pointer 90 | gc unsafe.Pointer 91 | dp uint64 92 | buf [6]NodeRef 93 | } 94 | 95 | var node6 struct { 96 | itm unsafe.Pointer 97 | gc unsafe.Pointer 98 | dp uint64 99 | buf [7]NodeRef 100 | } 101 | 102 | var node7 struct { 103 | itm unsafe.Pointer 104 | gc unsafe.Pointer 105 | dp uint64 106 | buf [8]NodeRef 107 | } 108 | 109 | var node8 struct { 110 | itm unsafe.Pointer 111 | gc unsafe.Pointer 112 | dp uint64 113 | buf [9]NodeRef 114 | } 115 | 116 | var node9 struct { 117 | itm unsafe.Pointer 118 | gc unsafe.Pointer 119 | dp uint64 120 | buf [10]NodeRef 121 | } 122 | 123 | var node10 struct { 124 | itm unsafe.Pointer 125 | gc unsafe.Pointer 126 | dp uint64 127 | buf [11]NodeRef 128 | } 129 | var node11 struct { 130 | itm unsafe.Pointer 131 | gc unsafe.Pointer 132 | dp uint64 133 | buf [12]NodeRef 134 | } 135 | 136 | var node12 struct { 137 | itm unsafe.Pointer 138 | gc unsafe.Pointer 139 | dp uint64 140 | buf [13]NodeRef 141 | } 142 | 143 | var node13 struct { 144 | itm unsafe.Pointer 145 | gc unsafe.Pointer 146 | dp uint64 147 | buf [14]NodeRef 148 | } 149 | 150 | var node14 struct { 151 | itm unsafe.Pointer 152 | gc unsafe.Pointer 153 | dp uint64 154 | buf [15]NodeRef 155 | } 156 | 157 | var node15 struct { 158 | itm unsafe.Pointer 159 | gc unsafe.Pointer 160 | dp uint64 161 | buf [16]NodeRef 162 | } 163 | 164 | var node16 struct { 165 | itm unsafe.Pointer 166 | gc unsafe.Pointer 167 | dp uint64 168 | buf [17]NodeRef 169 | } 170 | 171 | var node17 struct { 172 | itm unsafe.Pointer 173 | gc unsafe.Pointer 174 | dp uint64 175 | buf [18]NodeRef 176 | } 177 | 178 | var node18 struct { 179 | itm unsafe.Pointer 180 | gc unsafe.Pointer 181 | dp uint64 182 | buf [19]NodeRef 183 | } 184 | 185 | var node19 struct { 186 | itm unsafe.Pointer 187 | gc unsafe.Pointer 188 | dp uint64 189 | buf [20]NodeRef 190 | } 191 | 
192 | var node20 struct { 193 | itm unsafe.Pointer 194 | gc unsafe.Pointer 195 | dp uint64 196 | buf [21]NodeRef 197 | } 198 | 199 | var node21 struct { 200 | itm unsafe.Pointer 201 | gc unsafe.Pointer 202 | dp uint64 203 | buf [22]NodeRef 204 | } 205 | 206 | var node22 struct { 207 | itm unsafe.Pointer 208 | gc unsafe.Pointer 209 | dp uint64 210 | buf [23]NodeRef 211 | } 212 | 213 | var node23 struct { 214 | itm unsafe.Pointer 215 | gc unsafe.Pointer 216 | dp uint64 217 | buf [24]NodeRef 218 | } 219 | 220 | var node24 struct { 221 | itm unsafe.Pointer 222 | gc unsafe.Pointer 223 | dp uint64 224 | buf [25]NodeRef 225 | } 226 | 227 | var node25 struct { 228 | itm unsafe.Pointer 229 | gc unsafe.Pointer 230 | dp uint64 231 | buf [26]NodeRef 232 | } 233 | 234 | var node26 struct { 235 | itm unsafe.Pointer 236 | gc unsafe.Pointer 237 | dp uint64 238 | buf [27]NodeRef 239 | } 240 | 241 | var node27 struct { 242 | itm unsafe.Pointer 243 | gc unsafe.Pointer 244 | dp uint64 245 | buf [28]NodeRef 246 | } 247 | 248 | var node28 struct { 249 | itm unsafe.Pointer 250 | gc unsafe.Pointer 251 | dp uint64 252 | buf [29]NodeRef 253 | } 254 | 255 | var node29 struct { 256 | itm unsafe.Pointer 257 | gc unsafe.Pointer 258 | dp uint64 259 | buf [30]NodeRef 260 | } 261 | 262 | var node30 struct { 263 | itm unsafe.Pointer 264 | gc unsafe.Pointer 265 | dp uint64 266 | buf [31]NodeRef 267 | } 268 | var node31 struct { 269 | itm unsafe.Pointer 270 | gc unsafe.Pointer 271 | dp uint64 272 | buf [32]NodeRef 273 | } 274 | 275 | var node32 struct { 276 | itm unsafe.Pointer 277 | gc unsafe.Pointer 278 | dp uint64 279 | buf [33]NodeRef 280 | } 281 | 282 | func allocNode(itm unsafe.Pointer, level int, malloc MallocFn) *Node { 283 | var block unsafe.Pointer 284 | if malloc == nil { 285 | block = unsafe.Pointer(reflect.New(nodeTypes[level]).Pointer()) 286 | } else { 287 | block = malloc(int(nodeTypes[level].Size())) 288 | } 289 | 290 | n := (*Node)(block) 291 | n.level = uint16(level) 292 | n.itm = itm 293 | n.DataPtr = 0 294 | n.GClink = nil 295 | return n 296 | } 297 | 298 | var freeBlockContent []byte 299 | 300 | func init() { 301 | l := int(nodeTypes[32].Size()) 302 | freeBlockContent = make([]byte, l) 303 | for i := 0; i < l; i++ { 304 | freeBlockContent[i] = 0xdd 305 | } 306 | } 307 | 308 | // Fill free blocks with a const 309 | // This can help debugging of memory reclaimer bugs 310 | func debugMarkFree(n *Node) { 311 | var block []byte 312 | l := int(nodeTypes[n.level].Size()) 313 | sh := (*reflect.SliceHeader)(unsafe.Pointer(&block)) 314 | sh.Data = uintptr(unsafe.Pointer(n)) 315 | sh.Len = l 316 | sh.Cap = l 317 | 318 | copy(block, freeBlockContent) 319 | } 320 | -------------------------------------------------------------------------------- /batch.go: -------------------------------------------------------------------------------- 1 | package nitro 2 | 3 | import ( 4 | "bytes" 5 | "fmt" 6 | "github.com/t3rm1n4l/nitro/skiplist" 7 | "unsafe" 8 | ) 9 | 10 | const blockSize = 4096 11 | 12 | type itemOp int 13 | 14 | const ( 15 | itemDeleteOp itemOp = iota 16 | itemInsertop 17 | ) 18 | 19 | type diskWriter struct { 20 | shard int 21 | w *Writer 22 | rbuf, wbuf []byte 23 | 24 | stats BatchOpStats 25 | } 26 | 27 | type BatchOpStats struct { 28 | BlocksWritten int64 29 | BlocksRemoved int64 30 | 31 | ItemsInserted int64 32 | ItemsWritten int64 33 | ItemsRemoved int64 34 | } 35 | 36 | func (b BatchOpStats) String() string { 37 | return fmt.Sprintf( 38 | "blocks_written = %d\n"+ 39 | "blocks_removed = %d\n"+ 40 | 
"items_inserted = %d\n"+ 41 | "items_written = %d\n"+ 42 | "items_removed = %d", 43 | b.BlocksWritten, b.BlocksRemoved, b.ItemsInserted, b.ItemsWritten, 44 | b.ItemsRemoved) 45 | } 46 | 47 | func (r *BatchOpStats) ApplyDiff(a, b BatchOpStats) { 48 | r.BlocksWritten += a.BlocksWritten - b.BlocksWritten 49 | r.BlocksRemoved += a.BlocksRemoved - b.BlocksRemoved 50 | r.ItemsInserted += a.ItemsInserted - b.ItemsInserted 51 | r.ItemsWritten += a.ItemsWritten - b.ItemsWritten 52 | r.ItemsRemoved += a.ItemsRemoved - b.ItemsRemoved 53 | } 54 | 55 | func (m *Nitro) newDiskWriter(shard int) *diskWriter { 56 | return &diskWriter{ 57 | rbuf: make([]byte, blockSize), 58 | wbuf: make([]byte, blockSize), 59 | w: m.NewWriter(), 60 | shard: shard, 61 | } 62 | } 63 | 64 | type nodeOpIterator struct { 65 | *Iterator 66 | } 67 | 68 | func NewOpIterator(itr *Iterator) BatchOpIterator { 69 | it := &nodeOpIterator{ 70 | Iterator: itr, 71 | } 72 | 73 | return it 74 | } 75 | 76 | func (it *nodeOpIterator) Item() unsafe.Pointer { 77 | return it.Iterator.GetNode().Item() 78 | } 79 | 80 | func (it *nodeOpIterator) Next() { 81 | it.Iterator.Next() 82 | } 83 | 84 | func (it *nodeOpIterator) Op() itemOp { 85 | itm := (*Item)(it.Iterator.GetNode().Item()) 86 | if itm.bornSn != 0 { 87 | return itemInsertop 88 | } else { 89 | return itemDeleteOp 90 | } 91 | } 92 | 93 | func (it *nodeOpIterator) Close() { 94 | it.Iterator.Close() 95 | } 96 | 97 | type BatchOpIterator interface { 98 | skiplist.BatchOpIterator 99 | Op() itemOp 100 | Close() 101 | } 102 | 103 | func (dw *diskWriter) batchModifyCallback(n *skiplist.Node, cmp skiplist.CompareFn, 104 | maxItem unsafe.Pointer, sOpItr skiplist.BatchOpIterator) error { 105 | 106 | var err error 107 | var indexItem []byte 108 | var db *dataBlock 109 | 110 | opItr := sOpItr.(BatchOpIterator) 111 | 112 | if n.Item() != skiplist.MinItem { 113 | dw.w.DeleteNode(n) 114 | dw.stats.BlocksRemoved++ 115 | err := dw.w.bm.ReadBlock(blockPtr(n.DataPtr), dw.rbuf) 116 | if err != nil { 117 | return err 118 | } 119 | db = newDataBlock(dw.rbuf) 120 | } 121 | 122 | wblock := newDataBlock(dw.wbuf) 123 | 124 | flushBlock := func() error { 125 | bptr, err := dw.w.bm.WriteBlock(wblock.Bytes(), dw.shard) 126 | if err == nil { 127 | indexNode := dw.w.Put2(indexItem) 128 | if indexNode == nil { 129 | panic("index node creation should not fail") 130 | } 131 | indexNode.DataPtr = uint64(bptr) 132 | wblock.Reset() 133 | dw.stats.BlocksWritten++ 134 | } 135 | 136 | return err 137 | } 138 | 139 | doWriteItem := func(itm []byte) error { 140 | if indexItem == nil { 141 | indexItem = itm 142 | } 143 | 144 | dw.stats.ItemsWritten++ 145 | if err := wblock.Write(itm); err == errBlockFull { 146 | if err := flushBlock(); err != nil { 147 | return err 148 | } 149 | 150 | indexItem = itm 151 | return wblock.Write(itm) 152 | } 153 | 154 | return nil 155 | } 156 | 157 | var nItm []byte 158 | for nItm = db.Get(); err == nil && opItr.Valid() && 159 | skiplist.Compare(cmp, opItr.Item(), maxItem) < 0 && nItm != nil; { 160 | opItm := (*Item)(opItr.Item()).Bytes() 161 | cmpval := bytes.Compare(nItm, opItm) 162 | switch { 163 | case cmpval < 0: 164 | err = doWriteItem(nItm) 165 | nItm = db.Get() 166 | break 167 | case cmpval == 0: 168 | if opItr.Op() == itemInsertop { 169 | err = doWriteItem(opItm) 170 | } else { 171 | dw.stats.ItemsRemoved++ 172 | } 173 | 174 | opItr.Next() 175 | nItm = db.Get() 176 | break 177 | default: 178 | if opItr.Op() == itemInsertop { 179 | err = doWriteItem(opItm) 180 | dw.stats.ItemsInserted++ 181 | 
opItr.Next() 182 | } 183 | } 184 | } 185 | 186 | for ; err == nil && opItr.Valid() && 187 | skiplist.Compare(cmp, opItr.Item(), maxItem) < 0; opItr.Next() { 188 | 189 | if opItr.Op() == itemInsertop { 190 | opItm := (*Item)(opItr.Item()).Bytes() 191 | err = doWriteItem(opItm) 192 | dw.stats.ItemsInserted++ 193 | } 194 | } 195 | 196 | for ; err == nil && nItm != nil; nItm = db.Get() { 197 | err = doWriteItem(nItm) 198 | } 199 | 200 | if err != nil { 201 | return err 202 | } 203 | 204 | if !wblock.IsEmpty() { 205 | return flushBlock() 206 | } 207 | 208 | return nil 209 | } 210 | 211 | type batchOpIterator struct { 212 | db *Nitro 213 | BatchOpIterator 214 | itm unsafe.Pointer 215 | } 216 | 217 | func (it *batchOpIterator) fillItem() { 218 | srcItm := (*Item)(it.BatchOpIterator.Item()) 219 | l := len(srcItm.Bytes()) 220 | dstItm := it.db.allocItem(l, false) 221 | copy(dstItm.Bytes(), srcItm.Bytes()) 222 | dstItm.bornSn = it.db.getCurrSn() 223 | it.itm = unsafe.Pointer(dstItm) 224 | } 225 | 226 | func (it *batchOpIterator) Next() { 227 | it.BatchOpIterator.Next() 228 | if it.BatchOpIterator.Valid() { 229 | it.fillItem() 230 | } 231 | } 232 | 233 | func (it *batchOpIterator) Item() unsafe.Pointer { 234 | return it.itm 235 | } 236 | 237 | func isValidNode(n *skiplist.Node) bool { 238 | itm := n.Item() 239 | 240 | // TODO: move this check to skiplist module 241 | if itm != skiplist.MaxItem { 242 | return (*Item)(itm).deadSn == 0 243 | } 244 | 245 | return true 246 | } 247 | 248 | func (m *Nitro) newBatchOpIterator(it *Iterator) BatchOpIterator { 249 | bItr := &batchOpIterator{ 250 | db: m, 251 | BatchOpIterator: NewOpIterator(it), 252 | } 253 | 254 | if bItr.Valid() { 255 | bItr.fillItem() 256 | } 257 | return bItr 258 | } 259 | 260 | func (m *Nitro) ApplyOps(snap *Snapshot, concurr int) (BatchOpStats, error) { 261 | var err error 262 | var stats BatchOpStats 263 | 264 | w := m.NewWriter() 265 | currSnap := &Snapshot{db: m, sn: m.getCurrSn(), refCount: 1} 266 | pivots := m.partitionPivots(currSnap, concurr) 267 | 268 | beforeStats := make([]BatchOpStats, len(pivots)-1) 269 | errors := make([]chan error, len(pivots)-1) 270 | 271 | for i := 0; i < len(pivots)-1; i++ { 272 | errors[i] = make(chan error, 1) 273 | beforeStats[i] = m.shardWrs[i].stats 274 | 275 | itr := snap.NewIterator() 276 | itr.Seek(pivots[i].Bytes()) 277 | itr.SetEnd(pivots[i+1].Bytes()) 278 | opItr := m.newBatchOpIterator(itr) 279 | defer opItr.Close() 280 | head := w.GetNode(pivots[i].Bytes()) 281 | tail := w.GetNode(pivots[i+1].Bytes()) 282 | 283 | if pivots[i] == nil { 284 | head = nil 285 | } 286 | 287 | if pivots[i+1] == nil { 288 | tail = nil 289 | } 290 | 291 | go func(id int, opItr BatchOpIterator, head, tail *skiplist.Node) { 292 | errors[id] <- m.store.ExecBatchOps(opItr, head, tail, m.shardWrs[id].batchModifyCallback, m.insCmp, isValidNode, &m.store.Stats) 293 | }(i, opItr, head, tail) 294 | } 295 | 296 | for i := 0; i < len(pivots)-1; i++ { 297 | if e := <-errors[i]; e != nil { 298 | err = e 299 | } 300 | 301 | stats.ApplyDiff(m.shardWrs[i].stats, beforeStats[i]) 302 | } 303 | 304 | return stats, err 305 | } 306 | -------------------------------------------------------------------------------- /skiplist/access_barrier.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2016 Couchbase, Inc. 2 | // Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file 3 | // except in compliance with the License. 
You may obtain a copy of the License at
4 | //   http://www.apache.org/licenses/LICENSE-2.0
5 | // Unless required by applicable law or agreed to in writing, software distributed under the
6 | // License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
7 | // either express or implied. See the License for the specific language governing permissions
8 | // and limitations under the License.
9 | 
10 | package skiplist
11 | 
12 | import (
13 | 	"math"
14 | 	"sync"
15 | 	"sync/atomic"
16 | 	"unsafe"
17 | )
18 | 
19 | /*
20 |  * Algorithm:
21 |  * The access barrier is used to facilitate safe memory reclamation in the lockfree
22 |  * skiplist. Every skiplist access passes through a gate which tracks the
23 |  * live accessors, so that we can figure out when it is safe to deallocate a
24 |  * skiplist node.
25 |  *
26 |  * Even though the lockfree skiplist deletion algorithm completely unlinks
27 |  * a node from the skiplist, there can still be a small period during which the
28 |  * deleted node remains visible to accessors that were already live. We need to wait
29 |  * for a safe period before the memory for the node can be deallocated.
30 |  *
31 |  * In this algorithm, the unit of the safety period is called a barrier session. All the
32 |  * live accessors of the skiplist are tracked by a barrier session. Whenever a
33 |  * skiplist delete or group of deletes is performed, the current barrier session is
34 |  * closed and a new barrier session is started. The previous barrier
35 |  * session tracks all of its live accessors until the session is closed. The right
36 |  * time to safely reclaim the node is when all of those accessors have finished. This makes
37 |  * sure that the unlinked node is invisible to everyone. The accessors in the
38 |  * barrier session cooperatively detect and mark when each of them terminates.
39 |  * When the last accessor leaves, it takes the action of calling the destructor
40 |  * for the node, and the barrier session terminates.
41 |  *
42 |  * Closing and installing a new barrier session:
43 |  * A session liveCount is incremented every time an accessor enters the skiplist
44 |  * and decremented when it leaves the skiplist. When a session is closed and a new
45 |  * one needs to be installed, we just swap the global barrier session reference.
46 |  * There could be race conditions while a session is being marked as closed: an
47 |  * ongoing skiplist accessor can still increment the counter of a session that was marked
48 |  * as closed. To detect those accessors and make them retry, we add a large number
49 |  * to the liveCount as part of the session close phase. When an accessor finds
50 |  * that the incremented result is greater than that large offset, it needs to back off
51 |  * from the current session and acquire the new session to increment the count. In this
52 |  * scheme, whoever decrements the count down to exactly the large offset
53 |  * is responsible for deallocation of the object.
54 |  *
55 |  * The algorithm has to consider one more condition before it can call the destructor for
56 |  * a session. Multiple closed sessions can be active at a time. We cannot call the
57 |  * destructor for a closed session while a previously closed session has still not terminated,
58 |  * because even though the accessor count of the later closed session has dropped to zero, accessors from the previously
59 |  * closed session may still be able to access items in the later closed session. For example, if session S1 closes before session S2, an accessor still live in S1 may hold references that reach nodes freed under S2, so S2's destructor must wait until S1 fully drains.
Hence, a closed session
60 |  * can be terminated only after termination of all previous closed sessions.
61 |  * */
62 | 
63 | // BarrierSessionDestructor is a callback for SMR based reclaim of objects
64 | type BarrierSessionDestructor func(objectRef unsafe.Pointer)
65 | 
66 | const barrierFlushOffset = math.MaxInt32 / 2
67 | 
68 | // BarrierSession handle tracks the live accessors of a barrier session
69 | type BarrierSession struct {
70 | 	liveCount *int32
71 | 	objectRef unsafe.Pointer
72 | 	seqno     uint64
73 | 	closed    int32
74 | }
75 | 
76 | // CompareBS is a barrier session comparator based on seqno
77 | func CompareBS(this, that unsafe.Pointer) int {
78 | 	thisItm := (*BarrierSession)(this)
79 | 	thatItm := (*BarrierSession)(that)
80 | 
81 | 	return int(thisItm.seqno) - int(thatItm.seqno)
82 | }
83 | 
84 | func newBarrierSession() *BarrierSession {
85 | 	bs := &BarrierSession{
86 | 		liveCount: new(int32),
87 | 	}
88 | 
89 | 	return bs
90 | }
91 | 
92 | // AccessBarrier is the SMR core data structure for the skiplist
93 | type AccessBarrier struct {
94 | 	activeSeqno uint64
95 | 	session     unsafe.Pointer
96 | 	callb       BarrierSessionDestructor
97 | 
98 | 	freeq               *Skiplist
99 | 	freeSeqno           uint64
100 | 	isDestructorRunning int32
101 | 
102 | 	active bool
103 | 	sync.Mutex
104 | }
105 | 
106 | func newAccessBarrier(active bool, callb BarrierSessionDestructor) *AccessBarrier {
107 | 	ab := &AccessBarrier{
108 | 		active:  active,
109 | 		session: unsafe.Pointer(newBarrierSession()),
110 | 		callb:   callb,
111 | 	}
112 | 	if active {
113 | 		ab.freeq = New()
114 | 	}
115 | 	return ab
116 | }
117 | 
118 | func (ab *AccessBarrier) doCleanup() {
119 | 	buf1 := ab.freeq.MakeBuf()
120 | 	buf2 := ab.freeq.MakeBuf()
121 | 	defer ab.freeq.FreeBuf(buf1)
122 | 	defer ab.freeq.FreeBuf(buf2)
123 | 
124 | 	iter := ab.freeq.NewIterator(CompareBS, buf1)
125 | 	defer iter.Close()
126 | 
127 | 	for iter.SeekFirst(); iter.Valid(); iter.Next() {
128 | 		node := iter.GetNode()
129 | 		bs := (*BarrierSession)(node.Item())
130 | 		if bs.seqno != ab.freeSeqno+1 {
131 | 			return
132 | 		}
133 | 
134 | 		ab.freeSeqno++
135 | 		ab.callb(bs.objectRef)
136 | 		ab.freeq.DeleteNode(node, CompareBS, buf2, &ab.freeq.Stats)
137 | 	}
138 | }
139 | 
140 | // Acquire marks the entry of an accessor into the skiplist
141 | func (ab *AccessBarrier) Acquire() *BarrierSession {
142 | 	if ab.active {
143 | 	retry:
144 | 		bs := (*BarrierSession)(atomic.LoadPointer(&ab.session))
145 | 		liveCount := atomic.AddInt32(bs.liveCount, 1)
146 | 		if liveCount > barrierFlushOffset {
147 | 			ab.Release(bs)
148 | 			goto retry
149 | 		}
150 | 
151 | 		return bs
152 | 	}
153 | 
154 | 	return nil
155 | }
156 | 
157 | // Release marks the exit of an accessor from the skiplist
158 | func (ab *AccessBarrier) Release(bs *BarrierSession) {
159 | 	if ab.active {
160 | 		liveCount := atomic.AddInt32(bs.liveCount, -1)
161 | 		if liveCount == barrierFlushOffset {
162 | 			buf := ab.freeq.MakeBuf()
163 | 			defer ab.freeq.FreeBuf(buf)
164 | 
165 | 			// Accessors which entered a closed barrier session step down automatically.
166 | 			// But they may try to close an already closed session.
167 | 			if atomic.AddInt32(&bs.closed, 1) == 1 {
168 | 				ab.freeq.Insert(unsafe.Pointer(bs), CompareBS, buf, &ab.freeq.Stats)
169 | 				if atomic.CompareAndSwapInt32(&ab.isDestructorRunning, 0, 1) {
170 | 					ab.doCleanup()
171 | 					atomic.CompareAndSwapInt32(&ab.isDestructorRunning, 1, 0)
172 | 				}
173 | 			}
174 | 		}
175 | 	}
176 | }
177 | 
178 | // FlushSession closes the current barrier session and installs a new one.
179 | // The caller provides the object reference that is handed to the destructor once the closed session drains.
180 | func (ab *AccessBarrier) FlushSession(ref unsafe.Pointer) {
181 | 	if ab.active {
182 | 		ab.Lock()
183 | 		defer ab.Unlock()
184 | 
185 | 		bsPtr := atomic.LoadPointer(&ab.session)
186 | 		newBsPtr := unsafe.Pointer(newBarrierSession())
187 | 		atomic.CompareAndSwapPointer(&ab.session, bsPtr, newBsPtr)
188 | 		bs := (*BarrierSession)(bsPtr)
189 | 		bs.objectRef = ref
190 | 		ab.activeSeqno++
191 | 		bs.seqno = ab.activeSeqno
192 | 
193 | 		atomic.AddInt32(bs.liveCount, barrierFlushOffset+1)
194 | 		ab.Release(bs)
195 | 	}
196 | }
197 | 
--------------------------------------------------------------------------------
/nodetable/table.go:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2016 Couchbase, Inc.
2 | // Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
3 | // except in compliance with the License. You may obtain a copy of the License at
4 | //   http://www.apache.org/licenses/LICENSE-2.0
5 | // Unless required by applicable law or agreed to in writing, software distributed under the
6 | // License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
7 | // either express or implied. See the License for the specific language governing permissions
8 | // and limitations under the License.
9 | 
10 | // Package nodetable implements a high-performance, GC-optimized node lookup table
11 | // for Nitro index storage. This table is not thread-safe.
12 | //
13 | // A Golang map does not need to pay the cost of GC scans if you have native
14 | // fixed size types for both key and value. We use two tables for the node
15 | // lookup table implementation: a fast table and a slow table. The fast table
16 | // maps crc32(key) to a uint64 whose value is a pointer to a skiplist node. The highest
17 | // bit is used to indicate whether there is any hash collision for the crc32
18 | // key used. If the bit is set, we need to look up the second table,
19 | // which is the slow table. The slow table holds all the entries that are mapped
20 | // by the same crc32 key.
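//
// A minimal usage sketch (illustrative; the hash and key-equality callbacks
// are supplied by the caller, e.g. crc32.ChecksumIEEE wrapped as a HashFn):
//
//	nt := nodetable.New(hashFn, equalFn)
//	updated, oldPtr := nt.Update(key, nodePtr) // insert or replace
//	ptr := nt.Get(key)                         // nil if absent
//	ok, removedPtr := nt.Remove(key)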
21 | package nodetable 22 | 23 | import "unsafe" 24 | import "fmt" 25 | import "github.com/t3rm1n4l/nitro/skiplist" 26 | 27 | var emptyResult ntResult 28 | 29 | const approxItemSize = 42 30 | 31 | var dbInstances *skiplist.Skiplist 32 | 33 | func init() { 34 | dbInstances = skiplist.New() 35 | } 36 | 37 | // EqualKeyFn implements key equality check 38 | type EqualKeyFn func(unsafe.Pointer, []byte) bool 39 | 40 | // HashFn implements 32bit hash function on a string 41 | type HashFn func([]byte) uint32 42 | 43 | // NodeTable describes lookup table 44 | type NodeTable struct { 45 | fastHT map[uint32]uint64 46 | slowHT map[uint32][]uint64 47 | fastHTCount uint64 48 | slowHTCount uint64 49 | conflicts uint64 50 | 51 | hash HashFn 52 | keyEqual EqualKeyFn 53 | 54 | res ntResult 55 | } 56 | 57 | // CompareNodeTable implements comparator for nodetable instances 58 | func CompareNodeTable(a, b unsafe.Pointer) int { 59 | return int(uintptr(a)) - int(uintptr(b)) 60 | } 61 | 62 | const ( 63 | ntNotFound = 0x00 64 | ntFoundInFast = 0x01 65 | ntFoundInSlow = 0x03 66 | ntFoundMask = 0x01 67 | ) 68 | 69 | type ntResult struct { 70 | status int 71 | hash uint32 72 | hasConflict bool 73 | fastHTHasEntry bool 74 | fastHTValue uint64 75 | slowHTValues []uint64 76 | slowHTPos int 77 | } 78 | 79 | // New creates a nodetable instance 80 | func New(hfn HashFn, kfn EqualKeyFn) *NodeTable { 81 | nt := &NodeTable{ 82 | fastHT: make(map[uint32]uint64), 83 | slowHT: make(map[uint32][]uint64), 84 | hash: hfn, 85 | keyEqual: kfn, 86 | } 87 | 88 | buf := dbInstances.MakeBuf() 89 | defer dbInstances.FreeBuf(buf) 90 | dbInstances.Insert(unsafe.Pointer(nt), CompareNodeTable, buf, &dbInstances.Stats) 91 | 92 | return nt 93 | } 94 | 95 | // Stats returns nodetable statistics 96 | func (nt *NodeTable) Stats() string { 97 | return fmt.Sprintf("\nFastHTCount = %d\n"+ 98 | "SlowHTCount = %d\n"+ 99 | "Conflicts = %d\n"+ 100 | "MemoryInUse = %d\n", 101 | nt.fastHTCount, nt.slowHTCount, nt.conflicts, nt.MemoryInUse()) 102 | } 103 | 104 | // MemoryInUse returns memory used by nodetable instance 105 | func (nt *NodeTable) MemoryInUse() int64 { 106 | return int64(approxItemSize * (nt.fastHTCount + nt.slowHTCount)) 107 | } 108 | 109 | // Get returns node pointer for the lookup key 110 | func (nt *NodeTable) Get(key []byte) unsafe.Pointer { 111 | res := nt.find(key) 112 | if res.status&ntFoundMask == ntFoundMask { 113 | if res.status == ntFoundInFast { 114 | return decodePointer(res.fastHTValue) 115 | } 116 | return decodePointer(res.slowHTValues[res.slowHTPos]) 117 | } 118 | 119 | return nil 120 | } 121 | 122 | // Update inserts or replaces an existing entry 123 | func (nt *NodeTable) Update(key []byte, nptr unsafe.Pointer) (updated bool, oldPtr unsafe.Pointer) { 124 | res := nt.find(key) 125 | if res.status&ntFoundMask == ntFoundMask { 126 | // Found key, replace old pointer value with new one 127 | updated = true 128 | if res.status == ntFoundInFast { 129 | oldPtr = decodePointer(res.fastHTValue) 130 | nt.fastHT[res.hash] = encodePointer(nptr, res.hasConflict) 131 | } else { 132 | oldPtr = decodePointer(res.slowHTValues[res.slowHTPos]) 133 | res.slowHTValues[res.slowHTPos] = encodePointer(nptr, true) 134 | } 135 | } else { 136 | // Insert new key 137 | updated = false 138 | newSlowValue := res.fastHTHasEntry && !res.hasConflict 139 | // Key needs to be inserted into slowHT 140 | if res.hasConflict || newSlowValue { 141 | slowHTValues := nt.slowHT[res.hash] 142 | slowHTValues = append(slowHTValues, encodePointer(nptr, false)) 143 | 
nt.slowHT[res.hash] = slowHTValues 144 | // There is already an entry in the fastHT for the same crc32 hash, and we 145 | // have just inserted the first entry into the slowHT. Mark the conflict bit. 146 | if newSlowValue { 147 | nt.fastHT[res.hash] = encodePointer(decodePointer(nt.fastHT[res.hash]), true) 148 | nt.conflicts++ 149 | } 150 | nt.slowHTCount++ 151 | } else { 152 | // Insert new item into fastHT 153 | nt.fastHT[res.hash] = encodePointer(nptr, false) 154 | nt.fastHTCount++ 155 | } 156 | } 157 | 158 | return 159 | } 160 | 161 | // Remove an item from the nodetable 162 | func (nt *NodeTable) Remove(key []byte) (success bool, nptr unsafe.Pointer) { 163 | res := nt.find(key) 164 | if res.status&ntFoundMask == ntFoundMask { 165 | success = true 166 | if res.status == ntFoundInFast { 167 | nptr = decodePointer(res.fastHTValue) 168 | // Key needs to be removed from fastHT. For that we need to move 169 | // an item present in slowHT and overwrite fastHT entry. 170 | if res.hasConflict { 171 | slowHTValues := nt.slowHT[res.hash] 172 | v := slowHTValues[0] // New fastHT candidate 173 | slowHTValues = append([]uint64(nil), slowHTValues[1:]...) 174 | nt.slowHTCount-- 175 | 176 | var conflict bool 177 | if len(slowHTValues) == 0 { 178 | delete(nt.slowHT, res.hash) 179 | nt.conflicts-- 180 | } else { 181 | conflict = true 182 | nt.slowHT[res.hash] = slowHTValues 183 | } 184 | 185 | nt.fastHT[res.hash] = encodePointer(decodePointer(v), conflict) 186 | } else { 187 | delete(nt.fastHT, res.hash) 188 | nt.fastHTCount-- 189 | } 190 | } else { 191 | nptr = decodePointer(res.slowHTValues[res.slowHTPos]) 192 | // Remove key from slowHT 193 | newSlowValue := append([]uint64(nil), res.slowHTValues[:res.slowHTPos]...) 194 | if res.slowHTPos+1 != len(res.slowHTValues) { 195 | newSlowValue = append(newSlowValue, res.slowHTValues[res.slowHTPos+1:]...)
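// The entry at slowHTPos is now spliced out; the surviving values are
// written back below, or the bucket is deleted if it became empty.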
196 | } 197 | nt.slowHTCount-- 198 | 199 | if len(newSlowValue) == 0 { 200 | delete(nt.slowHT, res.hash) 201 | nt.fastHT[res.hash] = encodePointer(decodePointer(nt.fastHT[res.hash]), false) 202 | nt.conflicts-- 203 | } else { nt.slowHT[res.hash] = newSlowValue } 204 | } 205 | } 206 | return 207 | } 208 | 209 | func decodePointer(v uint64) unsafe.Pointer { 210 | var x uintptr 211 | if unsafe.Sizeof(x) == 8 { 212 | ptr := uintptr(v & ^(uint64(1) << 63)) 213 | return unsafe.Pointer(ptr) 214 | } 215 | return unsafe.Pointer(uintptr(v & 0xffffffff)) 216 | } 217 | 218 | func encodePointer(p unsafe.Pointer, hasConflict bool) uint64 { 219 | v := uint64(uintptr(p)) 220 | if hasConflict { 221 | v |= 1 << 63 222 | } 223 | 224 | return v 225 | } 226 | 227 | func (nt *NodeTable) hasConflict(v uint64) bool { 228 | return v>>63 == 1 229 | } 230 | 231 | func (nt *NodeTable) isEqual(key []byte, v uint64) bool { 232 | p := decodePointer(v) 233 | return nt.keyEqual(p, key) 234 | } 235 | 236 | func (nt *NodeTable) find(key []byte) (res *ntResult) { 237 | nt.res = emptyResult 238 | res = &nt.res 239 | res.status = ntNotFound 240 | h := nt.hash(key) 241 | res.hash = h 242 | 243 | v, ok := nt.fastHT[h] 244 | res.fastHTHasEntry = ok 245 | if ok { 246 | res.hasConflict = nt.hasConflict(v) 247 | if nt.isEqual(key, v) { 248 | res.status = ntFoundInFast 249 | res.fastHTValue = v 250 | return 251 | } 252 | 253 | if res.hasConflict { 254 | if vs, ok := nt.slowHT[h]; ok { 255 | for i, v := range vs { 256 | if nt.isEqual(key, v) { 257 | res.slowHTPos = i 258 | res.slowHTValues = vs 259 | res.status = ntFoundInSlow 260 | return 261 | } 262 | } 263 | } 264 | } 265 | } 266 | 267 | return 268 | } 269 | 270 | // Close destroys the nodetable 271 | func (nt *NodeTable) Close() { 272 | nt.fastHTCount = 0 273 | nt.slowHTCount = 0 274 | nt.conflicts = 0 275 | nt.fastHT = make(map[uint32]uint64) 276 | nt.slowHT = make(map[uint32][]uint64) 277 | 278 | buf := dbInstances.MakeBuf() 279 | defer dbInstances.FreeBuf(buf) 280 | dbInstances.Delete(unsafe.Pointer(nt), CompareNodeTable, buf, &dbInstances.Stats) 281 | } 282 | 283 | // MemoryInUse returns total memory used by nodetables in a process 284 | func MemoryInUse() (sz int64) { 285 | buf := dbInstances.MakeBuf() 286 | defer dbInstances.FreeBuf(buf) 287 | iter := dbInstances.NewIterator(CompareNodeTable, buf) 288 | for iter.SeekFirst(); iter.Valid(); iter.Next() { 289 | db := (*NodeTable)(iter.Get()) 290 | sz += db.MemoryInUse() 291 | } 292 | 293 | return 294 | } 295 | -------------------------------------------------------------------------------- /nodetable/table_test.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2016 Couchbase, Inc. 2 | // Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file 3 | // except in compliance with the License. You may obtain a copy of the License at 4 | // http://www.apache.org/licenses/LICENSE-2.0 5 | // Unless required by applicable law or agreed to in writing, software distributed under the 6 | // License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 7 | // either express or implied. See the License for the specific language governing permissions 8 | // and limitations under the License.
9 | 10 | package nodetable 11 | 12 | import "testing" 13 | import "bytes" 14 | import "hash/crc32" 15 | import "unsafe" 16 | import "fmt" 17 | import "time" 18 | import "syscall" 19 | import "runtime/debug" 20 | 21 | type object struct { 22 | key []byte 23 | value int 24 | } 25 | 26 | func equalObject(p unsafe.Pointer, k []byte) bool { 27 | obj := (*object)(p) 28 | return bytes.Equal(obj.key, k) 29 | } 30 | 31 | func mkHashFun(h uint32) HashFn { 32 | return func([]byte) uint32 { 33 | return h 34 | } 35 | } 36 | 37 | func dumpTable(tab *NodeTable) { 38 | fmt.Println("==NodeTable==") 39 | count := 0 40 | for k, v := range tab.fastHT { 41 | o := (*object)(decodePointer(v)) 42 | fmt.Printf("hash:%d, keys:%s,", k, string(o.key)) 43 | count++ 44 | if vs, ok := tab.slowHT[k]; ok { 45 | for _, v := range vs { 46 | o := (*object)(decodePointer(v)) 47 | fmt.Printf("%s,", string(o.key)) 48 | count++ 49 | } 50 | } 51 | fmt.Println("") 52 | } 53 | 54 | fmt.Println("Total:", count) 55 | } 56 | 57 | func mkObject(key string, v int) *object { 58 | return &object{ 59 | key: []byte(key), 60 | value: v, 61 | } 62 | } 63 | 64 | func TestPointerEncode(t *testing.T) { 65 | o1 := unsafe.Pointer(mkObject("key", 1000)) 66 | v := encodePointer(o1, true) 67 | o2 := decodePointer(v) 68 | 69 | if o1 != o2 { 70 | t.Errorf("Expected encoded value to remain the same with conflict %p!=%p", o1, o2) 71 | } 72 | 73 | v = encodePointer(o1, false) 74 | o2 = decodePointer(v) 75 | 76 | if o1 != o2 { 77 | t.Errorf("Expected encoded value to remain the same without conflict %p!=%p", o1, o2) 78 | } 79 | } 80 | 81 | func TestInsertFastHT(t *testing.T) { 82 | table := New(mkHashFun(100), equalObject) 83 | o1 := mkObject("key", 1000) 84 | table.Update(o1.key, unsafe.Pointer(o1)) 85 | o2 := (*object)(table.Get(o1.key)) 86 | if o2 != o1 { 87 | t.Errorf("Expected same object") 88 | } 89 | } 90 | 91 | func TestInsertSlowHT(t *testing.T) { 92 | table := New(mkHashFun(100), equalObject) 93 | o1 := mkObject("key1", 1000) 94 | o2 := mkObject("key2", 2000) 95 | o3 := mkObject("key3", 3000) 96 | table.Update(o1.key, unsafe.Pointer(o1)) 97 | table.Update(o2.key, unsafe.Pointer(o2)) 98 | table.Update(o3.key, unsafe.Pointer(o3)) 99 | ro1 := (*object)(table.Get(o1.key)) 100 | ro2 := (*object)(table.Get(o2.key)) 101 | ro3 := (*object)(table.Get(o3.key)) 102 | if o1 != ro1 || o2 != ro2 || o3 != ro3 { 103 | t.Errorf("Expected same objects %p!=%p, %p!=%p, %p!=%p", o1, ro1, o2, ro2, o3, ro3) 104 | } 105 | } 106 | 107 | func TestUpdateFastHT(t *testing.T) { 108 | table := New(mkHashFun(100), equalObject) 109 | o1 := mkObject("key", 1000) 110 | o2 := mkObject("key", 2000) 111 | updated, old := table.Update(o1.key, unsafe.Pointer(o1)) 112 | if updated != false || old != nil { 113 | t.Errorf("Expected successful insert") 114 | } 115 | 116 | updated, old = table.Update(o2.key, unsafe.Pointer(o2)) 117 | if updated != true || (*object)(old) != o1 { 118 | t.Errorf("Expected old object to be returned") 119 | } 120 | } 121 | 122 | func TestUpdateSlowHT(t *testing.T) { 123 | table := New(mkHashFun(100), equalObject) 124 | o1 := mkObject("key0", 1000) 125 | o2 := mkObject("key1", 2000) 126 | o3 := mkObject("key1", 6000) 127 | updated, old := table.Update(o1.key, unsafe.Pointer(o1)) 128 | if updated != false || old != nil { 129 | t.Errorf("Expected successful insert") 130 | } 131 | 132 | table.Update(o2.key, unsafe.Pointer(o2)) 133 | updated, old = table.Update(o3.key, unsafe.Pointer(o3)) 134 | if updated != true || (*object)(old) != o2 { 135 | t.Errorf("Expected 
old object to be returned") 136 | } 137 | } 138 | 139 | func TestDeleteFastHT1(t *testing.T) { 140 | table := New(mkHashFun(100), equalObject) 141 | o1 := mkObject("key", 1000) 142 | table.Update(o1.key, unsafe.Pointer(o1)) 143 | o2 := (*object)(table.Get(o1.key)) 144 | if o2 != o1 { 145 | t.Errorf("Expected same object") 146 | } 147 | 148 | if success, _ := table.Remove(o1.key); success != true { 149 | t.Errorf("Expected successful remove") 150 | } 151 | 152 | o3 := (*object)(table.Get(o1.key)) 153 | if o3 != nil { 154 | t.Errorf("Expected not-found") 155 | } 156 | 157 | if success, _ := table.Remove(o1.key); success == true { 158 | t.Errorf("Expected remove fail") 159 | } 160 | } 161 | 162 | func TestDeleteFastHT2(t *testing.T) { 163 | table := New(mkHashFun(100), equalObject) 164 | o1 := mkObject("key1", 1000) 165 | o2 := mkObject("key2", 2000) 166 | o3 := mkObject("key3", 3000) 167 | table.Update(o1.key, unsafe.Pointer(o1)) 168 | table.Update(o2.key, unsafe.Pointer(o2)) 169 | table.Update(o3.key, unsafe.Pointer(o3)) 170 | 171 | if success, _ := table.Remove(o1.key); success != true { 172 | t.Errorf("Expected successful remove") 173 | } 174 | 175 | ro1 := (*object)(table.Get(o1.key)) 176 | ro2 := (*object)(table.Get(o2.key)) 177 | ro3 := (*object)(table.Get(o3.key)) 178 | 179 | if ro1 != nil { 180 | t.Errorf("Expected not found") 181 | } 182 | 183 | if ro2 != o2 || ro3 != o3 { 184 | t.Errorf("Expected to find those objects") 185 | } 186 | } 187 | 188 | func TestDeleteSlowHT1(t *testing.T) { 189 | table := New(mkHashFun(100), equalObject) 190 | o1 := mkObject("key1", 1000) 191 | o2 := mkObject("key2", 2000) 192 | o3 := mkObject("key3", 3000) 193 | table.Update(o1.key, unsafe.Pointer(o1)) 194 | table.Update(o2.key, unsafe.Pointer(o2)) 195 | table.Update(o3.key, unsafe.Pointer(o3)) 196 | 197 | if success, _ := table.Remove(o2.key); success != true { 198 | t.Errorf("Expected successful remove") 199 | } 200 | 201 | ro1 := (*object)(table.Get(o1.key)) 202 | ro2 := (*object)(table.Get(o2.key)) 203 | ro3 := (*object)(table.Get(o3.key)) 204 | 205 | if ro2 == nil { 206 | t.Errorf("Expected not found") 207 | } 208 | 209 | if ro1 != o1 || ro3 != o3 { 210 | t.Errorf("Expected to find those objects") 211 | } 212 | } 213 | 214 | func TestDeleteFastHT3(t *testing.T) { 215 | table := New(mkHashFun(100), equalObject) 216 | o1 := mkObject("key1", 1000) 217 | o2 := mkObject("key2", 2000) 218 | table.Update(o1.key, unsafe.Pointer(o1)) 219 | table.Update(o2.key, unsafe.Pointer(o2)) 220 | 221 | res := table.find(o1.key) 222 | if !table.hasConflict(res.fastHTValue) { 223 | t.Errorf("Expected conflict") 224 | } 225 | 226 | if success, _ := table.Remove(o2.key); success != true { 227 | t.Errorf("Expected successful remove") 228 | } 229 | 230 | ro1 := (*object)(table.Get(o1.key)) 231 | ro2 := (*object)(table.Get(o2.key)) 232 | 233 | if ro2 != nil { 234 | t.Errorf("Expected not found") 235 | } 236 | 237 | if ro1 != o1 { 238 | t.Errorf("Expected found") 239 | } 240 | 241 | res = table.find(o1.key) 242 | if table.hasConflict(res.fastHTValue) { 243 | t.Errorf("Expected no conflict") 244 | } 245 | 246 | } 247 | 248 | func TestSimple(t *testing.T) { 249 | table := New(crc32.ChecksumIEEE, equalObject) 250 | o1 := mkObject("key1", 100) 251 | o2 := mkObject("key1", 200) 252 | updated, old := table.Update(o1.key, unsafe.Pointer(o1)) 253 | if updated == true || old != nil { 254 | t.Errorf("Expected update=false, old=nil") 255 | } 256 | 257 | updated, old = table.Update(o2.key, unsafe.Pointer(o2)) 258 | if updated == false 
|| old == nil { t.Errorf("Expected update=true and a non-nil old pointer") 259 | } 260 | 261 | o3 := table.Get(o1.key) 262 | if o3 == nil { 263 | t.Errorf("Expected non nil") 264 | } else { 265 | o4 := (*object)(o3) 266 | if o4.value != 200 { 267 | t.Errorf("Expected value = 200") 268 | } 269 | } 270 | } 271 | 272 | func TestLargeConflicts(t *testing.T) { 273 | n := 100000 274 | hfn := func(k []byte) uint32 { 275 | return crc32.ChecksumIEEE(k) % 1000 276 | } 277 | table := New(hfn, equalObject) 278 | objects := make([]*object, n) 279 | for i := 0; i < n; i++ { 280 | objects[i] = mkObject(fmt.Sprintf("key-%d", i), i) 281 | updated, _ := table.Update(objects[i].key, unsafe.Pointer(objects[i])) 282 | if updated { 283 | t.Errorf("Expected insert") 284 | } 285 | ptr := table.Get(objects[i].key) 286 | if (*object)(ptr) != objects[i] { 287 | t.Errorf("%s Expected object %p, not %p", objects[i].key, objects[i], ptr) 288 | dumpTable(table) 289 | } 290 | } 291 | 292 | for i := 0; i < n; i++ { 293 | ptr := table.Get(objects[i].key) 294 | if (*object)(ptr) != objects[i] { 295 | t.Errorf("Expected to find the object %s %v", string(objects[i].key), ptr) 296 | res := table.find(objects[i].key) 297 | fmt.Println(res) 298 | fmt.Println(table.Stats()) 299 | dumpTable(table) 300 | t.Fatalf("failed") 301 | } 302 | } 303 | 304 | } 305 | 306 | func TestMemoryOverhead(t *testing.T) { 307 | n := 100000 308 | table := New(crc32.ChecksumIEEE, equalObject) 309 | objects := make([]*object, n) 310 | for i := 0; i < n; i++ { 311 | objects[i] = mkObject(fmt.Sprintf("key-%d", i), i) 312 | } 313 | 314 | var rusage1, rusage2 syscall.Rusage 315 | debug.FreeOSMemory() 316 | syscall.Getrusage(syscall.RUSAGE_SELF, &rusage1) 317 | for i := 0; i < n; i++ { 318 | table.Update(objects[i].key, unsafe.Pointer(objects[i])) 319 | } 320 | debug.FreeOSMemory() 321 | syscall.Getrusage(syscall.RUSAGE_SELF, &rusage2) 322 | 323 | rss := (rusage2.Maxrss - rusage1.Maxrss) 324 | fmt.Println("Memory used for hashtable:", rss) 325 | fmt.Println("Overhead per item:", float32(rss)/float32(n)) 326 | } 327 | 328 | func TestPerf(t *testing.T) { 329 | n := 10000000 330 | table := New(crc32.ChecksumIEEE, equalObject) 331 | objects := make([]*object, n) 332 | newobjects := make([]*object, n) 333 | for i := 0; i < n; i++ { 334 | objects[i] = mkObject(fmt.Sprintf("key-%d", i), i) 335 | newobjects[i] = mkObject(fmt.Sprintf("key-%d", i), i+100) 336 | } 337 | 338 | t0 := time.Now() 339 | for i := 0; i < n; i++ { 340 | updated, last := table.Update(objects[i].key, unsafe.Pointer(objects[i])) 341 | if updated == true || last != nil { 342 | t.Errorf("Expected updated=false") 343 | } 344 | } 345 | dur := time.Since(t0) 346 | fmt.Printf("Insert took %v for %v items, %v/s\n", dur, n, float32(n)/float32(dur.Seconds())) 347 | 348 | t0 = time.Now() 349 | for i := 0; i < n; i++ { 350 | ptr := table.Get(objects[i].key) 351 | if ptr == nil { 352 | t.Fatalf("Expected to find the object") 353 | } 354 | 355 | o := (*object)(ptr) 356 | if o != objects[i] { 357 | t.Errorf("Received unexpected object") 358 | } 359 | } 360 | dur = time.Since(t0) 361 | fmt.Printf("Get took %v for %v items, %v/s\n", dur, n, float32(n)/float32(dur.Seconds())) 362 | 363 | t0 = time.Now() 364 | for i := 0; i < n; i++ { 365 | updated, last := table.Update(objects[i].key, unsafe.Pointer(objects[i])) 366 | if updated == false || (*object)(last) != objects[i] { 367 | t.Errorf("Expected updated=true") 368 | } 369 | } 370 | dur = time.Since(t0) 371 | fmt.Printf("Update took %v for %v items, %v/s\n", dur, n, float32(n)/float32(dur.Seconds())) 372 | 
fmt.Println("Table stats:", table.Stats()) 373 | } 374 | -------------------------------------------------------------------------------- /skiplist/skiplist.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2016 Couchbase, Inc. 2 | // Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file 3 | // except in compliance with the License. You may obtain a copy of the License at 4 | // http://www.apache.org/licenses/LICENSE-2.0 5 | // Unless required by applicable law or agreed to in writing, software distributed under the 6 | // License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 7 | // either express or implied. See the License for the specific language governing permissions 8 | // and limitations under the License. 9 | 10 | package skiplist 11 | 12 | import ( 13 | "math/rand" 14 | "runtime" 15 | "sync/atomic" 16 | "unsafe" 17 | ) 18 | 19 | // Debug flag enables additional stats gathering 20 | var Debug bool 21 | 22 | // MaxLevel is the limit for the skiplist levels 23 | const MaxLevel = 32 24 | const p = 0.25 25 | 26 | // CompareFn is the skiplist item comparator 27 | type CompareFn func(unsafe.Pointer, unsafe.Pointer) int 28 | 29 | // ItemSizeFn returns size of a skiplist item 30 | type ItemSizeFn func(unsafe.Pointer) int 31 | 32 | func defaultItemSize(unsafe.Pointer) int { 33 | return 0 34 | } 35 | 36 | // MallocFn is a custom memory allocator 37 | type MallocFn func(int) unsafe.Pointer 38 | 39 | // FreeFn is a custom memory deallocator 40 | type FreeFn func(unsafe.Pointer) 41 | 42 | // Config holds skiplist configuration 43 | type Config struct { 44 | ItemSize ItemSizeFn 45 | 46 | UseMemoryMgmt bool 47 | Malloc MallocFn 48 | Free FreeFn 49 | BarrierDestructor BarrierSessionDestructor 50 | } 51 | 52 | // SetItemSizeFunc configures item size function 53 | func (cfg *Config) SetItemSizeFunc(fn ItemSizeFn) { 54 | cfg.ItemSize = fn 55 | } 56 | 57 | // DefaultConfig returns default skiplist configuration 58 | func DefaultConfig() Config { 59 | return Config{ 60 | ItemSize: defaultItemSize, 61 | UseMemoryMgmt: false, 62 | } 63 | } 64 | 65 | // Skiplist - core data structure 66 | type Skiplist struct { 67 | head *Node 68 | tail *Node 69 | level int32 70 | Stats Stats 71 | barrier *AccessBarrier 72 | 73 | newNode func(itm unsafe.Pointer, level int) *Node 74 | freeNode func(*Node) 75 | 76 | Config 77 | } 78 | 79 | // New creates a skiplist with default config 80 | func New() *Skiplist { 81 | return NewWithConfig(DefaultConfig()) 82 | } 83 | 84 | // NewWithConfig creates a config from given config 85 | func NewWithConfig(cfg Config) *Skiplist { 86 | if runtime.GOARCH != "amd64" { 87 | cfg.UseMemoryMgmt = false 88 | } 89 | 90 | s := &Skiplist{ 91 | Config: cfg, 92 | barrier: newAccessBarrier(cfg.UseMemoryMgmt, cfg.BarrierDestructor), 93 | } 94 | 95 | s.newNode = func(itm unsafe.Pointer, level int) *Node { 96 | return allocNode(itm, level, cfg.Malloc) 97 | } 98 | 99 | if cfg.UseMemoryMgmt { 100 | s.freeNode = func(n *Node) { 101 | if Debug { 102 | debugMarkFree(n) 103 | } 104 | cfg.Free(unsafe.Pointer(n)) 105 | } 106 | } else { 107 | s.freeNode = func(*Node) {} 108 | } 109 | 110 | head := allocNode(MinItem, MaxLevel, nil) 111 | tail := allocNode(MaxItem, MaxLevel, nil) 112 | 113 | for i := 0; i <= MaxLevel; i++ { 114 | head.setNext(i, tail, false) 115 | } 116 | 117 | s.head = head 118 | s.tail = tail 119 | 120 | return s 121 | } 122 | 123 | // GetAccesBarrier returns current 
active access barrier 124 | func (s *Skiplist) GetAccesBarrier() *AccessBarrier { 125 | return s.barrier 126 | } 127 | 128 | // FreeNode deallocates the skiplist node memory 129 | func (s *Skiplist) FreeNode(n *Node, sts *Stats) { 130 | s.freeNode(n) 131 | sts.AddInt64(&sts.nodeFrees, 1) 132 | } 133 | 134 | // ActionBuffer is a temporary buffer used by skiplist operations 135 | type ActionBuffer struct { 136 | preds []*Node 137 | succs []*Node 138 | } 139 | 140 | // MakeBuf creates an action buffer 141 | func (s *Skiplist) MakeBuf() *ActionBuffer { 142 | return &ActionBuffer{ 143 | preds: make([]*Node, MaxLevel+1), 144 | succs: make([]*Node, MaxLevel+1), 145 | } 146 | } 147 | 148 | // FreeBuf frees an action buffer 149 | func (s *Skiplist) FreeBuf(b *ActionBuffer) { 150 | } 151 | 152 | // Size returns the size of a node 153 | func (s *Skiplist) Size(n *Node) int { 154 | return s.ItemSize(n.Item()) + n.Size() 155 | } 156 | 157 | // NewLevel returns a random level for the next node 158 | func (s *Skiplist) NewLevel(randFn func() float32) int { 159 | var nextLevel int 160 | 161 | for ; randFn() < p; nextLevel++ { 162 | } 163 | 164 | if nextLevel > MaxLevel { 165 | nextLevel = MaxLevel 166 | } 167 | 168 | level := int(atomic.LoadInt32(&s.level)) 169 | if nextLevel > level { 170 | if atomic.CompareAndSwapInt32(&s.level, int32(level), int32(level+1)) { 171 | nextLevel = level + 1 172 | } else { 173 | nextLevel = level 174 | } 175 | } 176 | 177 | return nextLevel 178 | } 179 | 180 | func (s *Skiplist) helpDelete(level int, prev, curr, next *Node, sts *Stats) bool { 181 | success := prev.dcasNext(level, curr, next, false, false) 182 | if success && level == 0 { 183 | sts.AddInt64(&sts.softDeletes, -1) 184 | sts.AddInt64(&sts.levelNodesCount[curr.Level()], -1) 185 | sts.AddInt64(&sts.usedBytes, -int64(s.Size(curr))) 186 | } 187 | return success 188 | } 189 | 190 | func (s *Skiplist) findPath(itm unsafe.Pointer, cmp CompareFn, 191 | buf *ActionBuffer, sts *Stats) *Node { 192 | return s.findPath2(itm, cmp, nil, buf, sts) 193 | } 194 | 195 | func (s *Skiplist) findPath2(itm unsafe.Pointer, cmp CompareFn, 196 | skipItm func(unsafe.Pointer) bool, 197 | buf *ActionBuffer, sts *Stats) (foundNode *Node) { 198 | var cmpVal = 1 199 | 200 | retry: 201 | prev := s.head 202 | level := int(atomic.LoadInt32(&s.level)) 203 | for i := level; i >= 0; i-- { 204 | curr, _ := prev.getNext(i) 205 | levelSearch: 206 | for { 207 | skip: 208 | pred := prev 209 | next, deleted := curr.getNext(i) 210 | for deleted { 211 | if !s.helpDelete(i, pred, curr, next, sts) { 212 | sts.AddUint64(&sts.readConflicts, 1) 213 | goto retry 214 | } 215 | 216 | curr, _ = pred.getNext(i) 217 | next, deleted = curr.getNext(i) 218 | } 219 | 220 | if skipItm != nil && skipItm(curr.Item()) { 221 | pred = curr 222 | curr = next 223 | goto skip 224 | } 225 | 226 | cmpVal = Compare(cmp, curr.Item(), itm) 227 | if cmpVal < 0 { 228 | prev = curr 229 | curr = next 230 | } else { 231 | break levelSearch 232 | } 233 | } 234 | 235 | buf.preds[i] = prev 236 | buf.succs[i] = curr 237 | } 238 | 239 | if cmpVal == 0 { 240 | foundNode = buf.succs[0] 241 | } 242 | return 243 | } 244 | 245 | // Insert adds an item into the skiplist 246 | func (s *Skiplist) Insert(itm unsafe.Pointer, cmp CompareFn, 247 | buf *ActionBuffer, sts *Stats) (success bool) { 248 | _, success = s.Insert2(itm, cmp, nil, buf, rand.Float32, sts) 249 | return 250 | } 251 | 252 | // Insert2 is a more verbose version of Insert 253 | func (s *Skiplist) Insert2(itm unsafe.Pointer, inscmp 
CompareFn, eqCmp CompareFn, 254 | buf *ActionBuffer, randFn func() float32, sts *Stats) (*Node, bool) { 255 | itemLevel := s.NewLevel(randFn) 256 | return s.Insert3(itm, inscmp, eqCmp, buf, itemLevel, false, sts) 257 | } 258 | 259 | // Insert3 is more verbose version of Insert2 260 | func (s *Skiplist) Insert3(itm unsafe.Pointer, insCmp CompareFn, eqCmp CompareFn, 261 | buf *ActionBuffer, itemLevel int, skipFindPath bool, sts *Stats) (*Node, bool) { 262 | 263 | token := s.barrier.Acquire() 264 | defer s.barrier.Release(token) 265 | 266 | x := s.newNode(itm, itemLevel) 267 | 268 | retry: 269 | if skipFindPath { 270 | skipFindPath = false 271 | } else { 272 | if s.findPath(itm, insCmp, buf, sts) != nil || 273 | eqCmp != nil && Compare(eqCmp, itm, buf.preds[0].Item()) == 0 { 274 | 275 | s.freeNode(x) 276 | return nil, false 277 | } 278 | } 279 | 280 | // Set all next links for the node non-atomically 281 | for i := 0; i <= int(itemLevel); i++ { 282 | x.setNext(i, buf.succs[i], false) 283 | } 284 | 285 | // Now node is part of the skiplist 286 | if !buf.preds[0].dcasNext(0, buf.succs[0], x, false, false) { 287 | sts.AddUint64(&sts.insertConflicts, 1) 288 | goto retry 289 | } 290 | 291 | // Add to index levels 292 | for i := 1; i <= int(itemLevel); i++ { 293 | fixThisLevel: 294 | for { 295 | nodeNext, deleted := x.getNext(i) 296 | next := buf.succs[i] 297 | 298 | // Update the node's next pointer at current level if required. 299 | // This is the only thread which can modify next pointer at this level 300 | // The dcas operation can fail only if another thread marked delete 301 | if deleted || (nodeNext != next && !x.dcasNext(i, nodeNext, next, false, false)) { 302 | goto finished 303 | } 304 | 305 | if buf.preds[i].dcasNext(i, next, x, false, false) { 306 | break fixThisLevel 307 | } 308 | 309 | s.findPath(itm, insCmp, buf, sts) 310 | } 311 | } 312 | 313 | finished: 314 | sts.AddInt64(&sts.nodeAllocs, 1) 315 | sts.AddInt64(&sts.levelNodesCount[itemLevel], 1) 316 | sts.AddInt64(&sts.usedBytes, int64(s.Size(x))) 317 | return x, true 318 | } 319 | 320 | func (s *Skiplist) softDelete(delNode *Node, sts *Stats) bool { 321 | var marked bool 322 | 323 | targetLevel := delNode.Level() 324 | for i := targetLevel; i >= 0; i-- { 325 | next, deleted := delNode.getNext(i) 326 | for !deleted { 327 | if delNode.dcasNext(i, next, next, false, true) && i == 0 { 328 | sts.AddInt64(&sts.softDeletes, 1) 329 | marked = true 330 | } 331 | next, deleted = delNode.getNext(i) 332 | } 333 | } 334 | return marked 335 | } 336 | 337 | // Delete an item from the skiplist 338 | func (s *Skiplist) Delete(itm unsafe.Pointer, cmp CompareFn, 339 | buf *ActionBuffer, sts *Stats) bool { 340 | token := s.barrier.Acquire() 341 | defer s.barrier.Release(token) 342 | 343 | found := s.findPath(itm, cmp, buf, sts) != nil 344 | if !found { 345 | return false 346 | } 347 | 348 | delNode := buf.succs[0] 349 | return s.deleteNode(delNode, cmp, buf, sts) 350 | } 351 | 352 | // DeleteNode an item from the skiplist by specifying its node 353 | func (s *Skiplist) DeleteNode(n *Node, cmp CompareFn, 354 | buf *ActionBuffer, sts *Stats) bool { 355 | token := s.barrier.Acquire() 356 | defer s.barrier.Release(token) 357 | 358 | return s.deleteNode(n, cmp, buf, sts) 359 | } 360 | 361 | func (s *Skiplist) deleteNode(n *Node, cmp CompareFn, buf *ActionBuffer, sts *Stats) bool { 362 | itm := n.Item() 363 | if s.softDelete(n, sts) { 364 | s.findPath(itm, cmp, buf, sts) 365 | return true 366 | } 367 | 368 | return false 369 | } 370 | 371 | // 
GetRangeSplitItems returns `nways` split range pivots of the skiplist items 372 | // Explicit barrier and release should be used by the caller before 373 | // and after this function call 374 | func (s *Skiplist) GetRangeSplitItems(nways int) []unsafe.Pointer { 375 | var deleted bool 376 | repeat: 377 | var itms []unsafe.Pointer 378 | var finished bool 379 | 380 | l := int(atomic.LoadInt32(&s.level)) 381 | for ; l >= 0; l-- { 382 | c := int(atomic.LoadInt64(&s.Stats.levelNodesCount[l]) + 1) 383 | if c >= nways { 384 | perSplit := c / nways 385 | node := s.head 386 | for j := 0; node != s.tail && !finished; j++ { 387 | if j == perSplit { 388 | j = -1 389 | itms = append(itms, node.Item()) 390 | finished = len(itms) == nways-1 391 | } 392 | 393 | node, deleted = node.getNext(l) 394 | if deleted { 395 | goto repeat 396 | } 397 | } 398 | 399 | break 400 | } 401 | } 402 | 403 | return itms 404 | } 405 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 
48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. -------------------------------------------------------------------------------- /nitro_test.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2016 Couchbase, Inc. 2 | // Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file 3 | // except in compliance with the License. 
You may obtain a copy of the License at 4 | // http://www.apache.org/licenses/LICENSE-2.0 5 | // Unless required by applicable law or agreed to in writing, software distributed under the 6 | // License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 7 | // either express or implied. See the License for the specific language governing permissions 8 | // and limitations under the License. 9 | 10 | package nitro 11 | 12 | import "fmt" 13 | import "sync/atomic" 14 | import "os" 15 | import "testing" 16 | import "time" 17 | import "math/rand" 18 | import "sync" 19 | import "runtime" 20 | import "encoding/binary" 21 | import "github.com/t3rm1n4l/nitro/mm" 22 | 23 | var testConf Config 24 | 25 | func init() { 26 | testConf = DefaultConfig() 27 | testConf.UseMemoryMgmt(mm.Malloc, mm.Free) 28 | testConf.UseDeltaInterleaving() 29 | Debug(true) 30 | } 31 | 32 | func TestBlocksPerf(t *testing.T) { 33 | conf := testConf 34 | conf.blockStoreDir = "./data" 35 | db := NewWithConfig(conf) 36 | defer db.Close() 37 | 38 | cf := DefaultConfig() 39 | cf.UseMemoryMgmt(mm.Malloc, mm.Free) 40 | tdb := NewWithConfig(cf) 41 | 42 | total := 100000000 43 | thr := 8 44 | max := total / thr 45 | var wg sync.WaitGroup 46 | for i := 0; i < thr; i++ { 47 | wg.Add(1) 48 | go func(id int, n int, wg *sync.WaitGroup) { 49 | defer wg.Done() 50 | w := tdb.NewWriter() 51 | buf := make([]byte, 8) 52 | for x := 0; x < n; x++ { 53 | binary.LittleEndian.PutUint64(buf, uint64(x+id*n)) 54 | w.Put(buf) 55 | } 56 | }(i, max, &wg) 57 | } 58 | 59 | tx := time.Now() 60 | wg.Wait() 61 | fmt.Println(time.Since(tx), total) 62 | 63 | snp, _ := tdb.NewSnapshot() 64 | t0 := time.Now() 65 | fmt.Println(db.ApplyOps(snp, 8)) 66 | fmt.Println("took", time.Since(t0)) 67 | db.NewSnapshot() 68 | 69 | snp.Close() 70 | go tdb.Close() 71 | 72 | tdb = NewWithConfig(cf) 73 | 74 | max = max / 8 75 | for i := 0; i < thr; i++ { 76 | wg.Add(1) 77 | go func(id int, n int, wg *sync.WaitGroup) { 78 | defer wg.Done() 79 | w := tdb.NewWriter() 80 | buf := make([]byte, 8) 81 | for x := 0; x < n; x++ { 82 | binary.LittleEndian.PutUint64(buf, uint64(x+id*n+total)) 83 | w.Put(buf) 84 | } 85 | }(i, max, &wg) 86 | } 87 | 88 | tx = time.Now() 89 | wg.Wait() 90 | fmt.Println(time.Since(tx), total) 91 | 92 | snp, _ = tdb.NewSnapshot() 93 | t0 = time.Now() 94 | fmt.Println(db.ApplyOps(snp, 8)) 95 | db.NewSnapshot() 96 | fmt.Println("took", time.Since(t0)) 97 | 98 | } 99 | 100 | func TestBatchOps(t *testing.T) { 101 | conf := testConf 102 | conf.blockStoreDir = "/tmp/" 103 | db := NewWithConfig(conf) 104 | defer db.Close() 105 | 106 | n := 5000000 107 | 108 | var snap *Snapshot 109 | 110 | for x := 0; x < 10; x++ { 111 | keys := make([]int, n) 112 | cf := DefaultConfig() 113 | cf.UseMemoryMgmt(mm.Malloc, mm.Free) 114 | tdb := NewWithConfig(cf) 115 | w := tdb.NewWriter() 116 | 117 | for i := 0; i < n; i++ { 118 | //hh := rand.Int() % 5000000 119 | hh := i 120 | w.Put([]byte(fmt.Sprintf("%010d", hh))) 121 | keys[i] = hh 122 | } 123 | 124 | snp, _ := tdb.NewSnapshot() 125 | 126 | t0 := time.Now() 127 | fmt.Println(db.ApplyOps(snp, 8)) 128 | snp.Close() 129 | tdb.Close() 130 | 131 | fmt.Println("thr", float64(n)/float64(time.Since(t0).Seconds())) 132 | if snap != nil { 133 | snap.Close() 134 | } 135 | snap, _ = db.NewSnapshot() 136 | 137 | //fmt.Println(db.DumpStats()) 138 | 139 | it := snap.NewIterator() 140 | i := 0 141 | for it.SeekFirst(); it.Valid(); it.Next() { 142 | exp := fmt.Sprintf("%010d", keys[i]) 143 | if string(it.Get()) != exp { 144 
| t.Errorf("expected %s, got %s", exp, string(it.Get())) 145 | } 146 | i++ 147 | } 148 | 149 | /* 150 | exp := fmt.Sprintf("%010d", 1000) 151 | it.Seek([]byte(exp)) 152 | for i := 1000; i < 2000; i++ { 153 | exp := fmt.Sprintf("%010d", keys[i]) 154 | if string(it.Get()) != exp { 155 | t.Errorf("%s != %s", string(it.Get()), string(exp)) 156 | } 157 | it.Next() 158 | } 159 | */ 160 | 161 | var wg sync.WaitGroup 162 | t0 = time.Now() 163 | threads := 16 164 | total := n * threads 165 | for i := 0; i < threads; i++ { 166 | wg.Add(1) 167 | go doGet(t, db, snap, &wg, n/threads) 168 | } 169 | wg.Wait() 170 | dur := time.Since(t0) 171 | fmt.Printf("%d items took %v -> %v items/s\n", total, dur, float64(total)/float64(dur.Seconds())) 172 | 173 | it.Close() 174 | snap.Close() 175 | } 176 | 177 | snap, _ = db.NewSnapshot() 178 | fmt.Println(db.DumpStats()) 179 | snap.Close() 180 | } 181 | 182 | func TestInsert(t *testing.T) { 183 | db := NewWithConfig(testConf) 184 | defer db.Close() 185 | 186 | w := db.NewWriter() 187 | for i := 0; i < 2000; i++ { 188 | w.Put([]byte(fmt.Sprintf("%010d", i))) 189 | } 190 | 191 | for i := 1750; i < 2000; i++ { 192 | w.Delete([]byte(fmt.Sprintf("%010d", i))) 193 | } 194 | snap, _ := w.NewSnapshot() 195 | defer snap.Close() 196 | 197 | for i := 2000; i < 5000; i++ { 198 | w.Put([]byte(fmt.Sprintf("%010d", i))) 199 | } 200 | 201 | snap2, _ := w.NewSnapshot() 202 | defer snap2.Close() 203 | 204 | count := 0 205 | itr := db.NewIterator(snap) 206 | defer itr.Close() 207 | 208 | itr.SeekFirst() 209 | itr.Seek([]byte(fmt.Sprintf("%010d", 1500))) 210 | for ; itr.Valid(); itr.Next() { 211 | expected := fmt.Sprintf("%010d", count+1500) 212 | got := string(itr.Get()) 213 | count++ 214 | if got != expected { 215 | t.Errorf("Expected %s, got %v", expected, got) 216 | } 217 | } 218 | 219 | if count != 250 { 220 | t.Errorf("Expected count = 250, got %v", count) 221 | } 222 | } 223 | 224 | func doInsert(db *Nitro, wg *sync.WaitGroup, n int, isRand bool, shouldSnap bool) { 225 | defer wg.Done() 226 | w := db.NewWriter() 227 | rnd := rand.New(rand.NewSource(int64(rand.Int()))) 228 | for i := 0; i < n; i++ { 229 | var val int 230 | if isRand { 231 | val = rnd.Int() 232 | } else { 233 | val = i 234 | } 235 | if shouldSnap && i%100000 == 0 { 236 | s, _ := w.NewSnapshot() 237 | s.Close() 238 | } 239 | buf := make([]byte, 8) 240 | binary.BigEndian.PutUint64(buf, uint64(val)) 241 | w.Put(buf) 242 | } 243 | } 244 | 245 | func TestInsertPerf(t *testing.T) { 246 | var wg sync.WaitGroup 247 | db := NewWithConfig(testConf) 248 | defer db.Close() 249 | n := 20000000 / runtime.GOMAXPROCS(0) 250 | t0 := time.Now() 251 | total := n * runtime.GOMAXPROCS(0) 252 | for i := 0; i < runtime.GOMAXPROCS(0); i++ { 253 | wg.Add(1) 254 | go doInsert(db, &wg, n, true, true) 255 | } 256 | wg.Wait() 257 | 258 | snap, _ := db.NewSnapshot() 259 | defer snap.Close() 260 | dur := time.Since(t0) 261 | VerifyCount(snap, n*runtime.GOMAXPROCS(0), t) 262 | fmt.Printf("%d items took %v -> %v items/s snapshots_created %v live_snapshots %v\n", 263 | total, dur, float64(total)/float64(dur.Seconds()), db.getCurrSn(), len(db.GetSnapshots())) 264 | } 265 | 266 | func doGet(t *testing.T, db *Nitro, snap *Snapshot, wg *sync.WaitGroup, n int) { 267 | defer wg.Done() 268 | rnd := rand.New(rand.NewSource(int64(rand.Int()))) 269 | 270 | buf := make([]byte, 8) 271 | itr := db.NewIterator(snap) 272 | defer itr.Close() 273 | for i := 0; i < n; i++ { 274 | val := rnd.Int() % n 275 | binary.BigEndian.PutUint64(buf, uint64(val)) 276 | 
itr.Seek(buf) 277 | if !itr.Valid() { 278 | t.Errorf("Expected to find %v", val) 279 | } 280 | } 281 | } 282 | 283 | func TestInsertDuplicates(t *testing.T) { 284 | db := NewWithConfig(testConf) 285 | defer db.Close() 286 | 287 | w := db.NewWriter() 288 | for i := 0; i < 2000; i++ { 289 | w.Put([]byte(fmt.Sprintf("%010d", i))) 290 | } 291 | 292 | snap1, _ := w.NewSnapshot() 293 | defer snap1.Close() 294 | 295 | // Duplicate 296 | for i := 0; i < 2000; i++ { 297 | key := fmt.Sprintf("%010d", i) 298 | newNode := w.Put2([]byte(key)) 299 | if newNode != nil { 300 | t.Errorf("Duplicate unexpected for %s", key) 301 | } 302 | } 303 | 304 | for i := 1500; i < 2000; i++ { 305 | w.Delete([]byte(fmt.Sprintf("%010d", i))) 306 | } 307 | snap2, _ := w.NewSnapshot() 308 | defer snap2.Close() 309 | 310 | for i := 1500; i < 5000; i++ { 311 | key := fmt.Sprintf("%010d", i) 312 | newNode := w.Put2([]byte(key)) 313 | if newNode == nil { 314 | t.Errorf("Expected successful insert for %s", key) 315 | } 316 | } 317 | 318 | snap, _ := w.NewSnapshot() 319 | defer snap.Close() 320 | count := 0 321 | itr := db.NewIterator(snap) 322 | defer itr.Close() 323 | 324 | itr.SeekFirst() 325 | for ; itr.Valid(); itr.Next() { 326 | expected := fmt.Sprintf("%010d", count) 327 | got := string(itr.Get()) 328 | count++ 329 | if got != expected { 330 | t.Errorf("Expected %s, got %v", expected, got) 331 | } 332 | } 333 | 334 | if count != 5000 { 335 | t.Errorf("Expected count = 5000, got %v", count) 336 | } 337 | } 338 | 339 | func TestGetPerf(t *testing.T) { 340 | var wg sync.WaitGroup 341 | db := NewWithConfig(testConf) 342 | defer db.Close() 343 | n := 1000000 344 | wg.Add(1) 345 | go doInsert(db, &wg, n, false, true) 346 | wg.Wait() 347 | snap, _ := db.NewSnapshot() 348 | defer snap.Close() 349 | VerifyCount(snap, n, t) 350 | 351 | t0 := time.Now() 352 | total := n * runtime.GOMAXPROCS(0) 353 | for i := 0; i < runtime.GOMAXPROCS(0); i++ { 354 | wg.Add(1) 355 | go doGet(t, db, snap, &wg, n) 356 | } 357 | wg.Wait() 358 | dur := time.Since(t0) 359 | fmt.Printf("%d items took %v -> %v items/s\n", total, dur, float64(total)/float64(dur.Seconds())) 360 | } 361 | 362 | func VerifyCount(snap *Snapshot, n int, t *testing.T) { 363 | 364 | if c := CountItems(snap); c != n { 365 | t.Errorf("Expected count %d, got %d", n, c) 366 | } 367 | } 368 | 369 | func CountItems(snap *Snapshot) int { 370 | var count int 371 | itr := snap.NewIterator() 372 | for itr.SeekFirst(); itr.Valid(); itr.Next() { 373 | count++ 374 | } 375 | itr.Close() 376 | return count 377 | } 378 | 379 | func TestLoadStoreDisk(t *testing.T) { 380 | os.RemoveAll("db.dump") 381 | var wg sync.WaitGroup 382 | db := NewWithConfig(testConf) 383 | defer db.Close() 384 | n := 1000000 385 | t0 := time.Now() 386 | for i := 0; i < runtime.GOMAXPROCS(0); i++ { 387 | wg.Add(1) 388 | go doInsert(db, &wg, n/runtime.GOMAXPROCS(0), true, true) 389 | } 390 | wg.Wait() 391 | fmt.Printf("Inserting %v items took %v\n", n, time.Since(t0)) 392 | snap0, _ := db.NewSnapshot() 393 | defer snap0.Close() 394 | snap, _ := db.NewSnapshot() 395 | fmt.Println(db.DumpStats()) 396 | 397 | t0 = time.Now() 398 | err := db.StoreToDisk("db.dump", snap, 8, nil) 399 | if err != nil { 400 | t.Errorf("Expected no error. 
got=%v", err) 401 | } 402 | 403 | fmt.Printf("Storing to disk took %v\n", time.Since(t0)) 404 | 405 | snap.Close() 406 | db = NewWithConfig(testConf) 407 | defer db.Close() 408 | t0 = time.Now() 409 | snap, err = db.LoadFromDisk("db.dump", 8, nil) 410 | defer snap.Close() 411 | if err != nil { 412 | t.Errorf("Expected no error. got=%v", err) 413 | } 414 | fmt.Printf("Loading from disk took %v\n", time.Since(t0)) 415 | 416 | count := CountItems(snap) 417 | if count != n { 418 | t.Errorf("Expected %v, got %v", n, count) 419 | } 420 | 421 | count = int(snap.Count()) 422 | if count != n { 423 | t.Errorf("Count mismatch on snapshot. Expected %d, got %d", n, count) 424 | } 425 | fmt.Println(db.DumpStats()) 426 | } 427 | 428 | func TestStoreDiskShutdown(t *testing.T) { 429 | os.RemoveAll("db.dump") 430 | var wg sync.WaitGroup 431 | db := NewWithConfig(testConf) 432 | n := 1000000 433 | t0 := time.Now() 434 | for i := 0; i < runtime.GOMAXPROCS(0); i++ { 435 | wg.Add(1) 436 | go doInsert(db, &wg, n/runtime.GOMAXPROCS(0), true, true) 437 | } 438 | wg.Wait() 439 | fmt.Printf("Inserting %v items took %v\n", n, time.Since(t0)) 440 | snap0, _ := db.NewSnapshot() 441 | snap, _ := db.NewSnapshot() 442 | fmt.Println(db.DumpStats()) 443 | 444 | errch := make(chan error, 1) 445 | go func() { 446 | errch <- db.StoreToDisk("db.dump", snap, 8, nil) 447 | }() 448 | 449 | snap0.Close() 450 | snap.Close() 451 | db.Close() 452 | 453 | if err := <-errch; err != ErrShutdown { 454 | t.Errorf("Expected ErrShutdown. got=%v", err) 455 | } 456 | } 457 | 458 | func TestDelete(t *testing.T) { 459 | expected := 10 460 | db := NewWithConfig(testConf) 461 | defer db.Close() 462 | w := db.NewWriter() 463 | for i := 0; i < expected; i++ { 464 | w.Put([]byte(fmt.Sprintf("%010d", i))) 465 | } 466 | 467 | snap1, _ := w.NewSnapshot() 468 | got := CountItems(snap1) 469 | if got != expected { 470 | t.Errorf("Expected 2000, got %d", got) 471 | } 472 | fmt.Println(db.DumpStats()) 473 | 474 | for i := 0; i < expected; i++ { 475 | w.Delete([]byte(fmt.Sprintf("%010d", i))) 476 | } 477 | 478 | for i := 0; i < expected; i++ { 479 | w.Put([]byte(fmt.Sprintf("%010d", i))) 480 | } 481 | snap2, _ := w.NewSnapshot() 482 | snap1.Close() 483 | snap3, _ := w.NewSnapshot() 484 | snap2.Close() 485 | time.Sleep(time.Second) 486 | 487 | got = CountItems(snap3) 488 | snap3.Close() 489 | 490 | if got != expected { 491 | t.Errorf("Expected %d, got %d", expected, got) 492 | } 493 | fmt.Println(db.DumpStats()) 494 | } 495 | 496 | func doReplace(wg *sync.WaitGroup, t *testing.T, w *Writer, start, end int) { 497 | defer wg.Done() 498 | 499 | for ; start < end; start++ { 500 | w.Delete([]byte(fmt.Sprintf("%010d", start))) 501 | w.Put([]byte(fmt.Sprintf("%010d", start))) 502 | } 503 | } 504 | 505 | func TestGCPerf(t *testing.T) { 506 | var wg sync.WaitGroup 507 | var last *Snapshot 508 | 509 | db := NewWithConfig(testConf) 510 | defer db.Close() 511 | perW := 1000 512 | iterations := 1000 513 | nW := runtime.GOMAXPROCS(0) 514 | 515 | var ws []*Writer 516 | 517 | for i := 0; i < nW; i++ { 518 | ws = append(ws, db.NewWriter()) 519 | } 520 | 521 | nc := 0 522 | for x := 0; x < iterations; x++ { 523 | for i := 0; i < nW; i++ { 524 | wg.Add(1) 525 | go doReplace(&wg, t, ws[i], i*perW, i*perW+perW) 526 | } 527 | wg.Wait() 528 | curr, _ := db.NewSnapshot() 529 | if last != nil { 530 | last.Close() 531 | } 532 | 533 | last = curr 534 | nc += db.store.GetStats().NodeCount 535 | } 536 | 537 | snap, _ := db.NewSnapshot() 538 | defer snap.Close() 539 | last.Close() 540 | 
541 | waits := 0 542 | for db.store.GetStats().NodeCount > nW*perW { 543 | time.Sleep(time.Millisecond) 544 | waits++ 545 | } 546 | 547 | fmt.Printf("final_node_count = %v, average_live_node_count = %v, wait_time_for_collection = %vms\n", db.store.GetStats().NodeCount, nc/iterations, waits) 548 | } 549 | 550 | func TestMemoryInUse(t *testing.T) { 551 | db := NewWithConfig(testConf) 552 | defer db.Close() 553 | 554 | dumpStats := func() { 555 | fmt.Printf("ItemsCount: %v, MemoryInUse: %v, NodesCount: %v\n", db.ItemsCount(), MemoryInUse(), db.store.GetStats().NodeCount) 556 | } 557 | w := db.NewWriter() 558 | for i := 0; i < 5000; i++ { 559 | w.Put([]byte(fmt.Sprintf("%010d", i))) 560 | } 561 | snap1, _ := w.NewSnapshot() 562 | 563 | dumpStats() 564 | 565 | for i := 0; i < 5000; i++ { 566 | w.Delete([]byte(fmt.Sprintf("%010d", i))) 567 | } 568 | 569 | snap1.Close() 570 | snap2, _ := w.NewSnapshot() 571 | snap3, _ := w.NewSnapshot() 572 | defer snap3.Close() 573 | snap2.Close() 574 | time.Sleep(time.Second) 575 | dumpStats() 576 | } 577 | 578 | func TestFullScan(t *testing.T) { 579 | var wg sync.WaitGroup 580 | db := NewWithConfig(testConf) 581 | defer db.Close() 582 | n := 1000000 583 | t0 := time.Now() 584 | for i := 0; i < runtime.GOMAXPROCS(0); i++ { 585 | wg.Add(1) 586 | go doInsert(db, &wg, n/runtime.GOMAXPROCS(0), true, true) 587 | } 588 | wg.Wait() 589 | fmt.Printf("Inserting %v items took %v\n", n, time.Since(t0)) 590 | snap, _ := db.NewSnapshot() 591 | defer snap.Close() 592 | VerifyCount(snap, n, t) 593 | fmt.Println(db.DumpStats()) 594 | 595 | t0 = time.Now() 596 | c := CountItems(snap) 597 | fmt.Printf("Full iteration of %d items took %v\n", c, time.Since(t0)) 598 | } 599 | 600 | func TestVisitor(t *testing.T) { 601 | const shards = 32 602 | const concurrency = 8 603 | const n = 1000000 604 | 605 | var wg sync.WaitGroup 606 | db := NewWithConfig(testConf) 607 | defer db.Close() 608 | expectedSum := int64((n - 1) * (n / 2)) 609 | 610 | wg.Add(1) 611 | doInsert(db, &wg, n, false, false) 612 | snap, _ := db.NewSnapshot() 613 | defer snap.Close() 614 | fmt.Println(db.DumpStats()) 615 | 616 | var counts [shards]int64 617 | var startEndRange [shards][2]uint64 618 | var sum int64 619 | 620 | callb := func(itm *Item, shard int) error { 621 | v := binary.BigEndian.Uint64(itm.Bytes()) 622 | atomic.AddInt64(&sum, int64(v)) 623 | atomic.AddInt64(&counts[shard], 1) 624 | 625 | if shard > 0 && startEndRange[shard][0] == 0 { 626 | startEndRange[shard][0] = v 627 | } else { 628 | if startEndRange[shard][1] > v { 629 | t.Errorf("shard-%d validation of sort order %d > %d", shard, startEndRange[shard][1], v) 630 | } 631 | startEndRange[shard][1] = v 632 | } 633 | 634 | return nil 635 | } 636 | 637 | total := 0 638 | t0 := time.Now() 639 | db.Visitor(snap, callb, shards, concurrency) 640 | dur := time.Since(t0) 641 | fmt.Printf("Took %v to iterate %v items, %v items/s\n", dur, n, float32(n)/float32(dur.Seconds())) 642 | 643 | for i, v := range counts { 644 | fmt.Printf("shard - %d count = %d, range: %d-%d\n", i, v, startEndRange[i][0], startEndRange[i][1]) 645 | total += int(v) 646 | } 647 | 648 | if total != n { 649 | t.Errorf("Expected count %d, received %d", n, total) 650 | } 651 | 652 | if expectedSum != sum { 653 | t.Errorf("Expected sum %d, received %d", expectedSum, sum) 654 | } 655 | } 656 | 657 | func TestVisitorError(t *testing.T) { 658 | const n = 100000 659 | var wg sync.WaitGroup 660 | db := NewWithConfig(testConf) 661 | defer db.Close() 662 | 663 | wg.Add(1) 664 | doInsert(db, &wg, n, 
false, false) 665 | snap, _ := db.NewSnapshot() 666 | defer snap.Close() 667 | 668 | errVisitor := fmt.Errorf("visitor failed") 669 | callb := func(itm *Item, shard int) error { 670 | v := binary.BigEndian.Uint64(itm.Bytes()) 671 | if v == 90000 { 672 | return errVisitor 673 | } 674 | return nil 675 | } 676 | 677 | if db.Visitor(snap, callb, 4, 4) != errVisitor { 678 | t.Errorf("Expected error") 679 | } 680 | } 681 | 682 | func doUpdate(db *Nitro, wg *sync.WaitGroup, w *Writer, start, end int, version int) { 683 | defer wg.Done() 684 | for ; start < end; start++ { 685 | oldval := uint64(start) + uint64(version-1)*10000000 686 | val := uint64(start) + uint64(version)*10000000 687 | buf1 := make([]byte, 8) 688 | binary.BigEndian.PutUint64(buf1, uint64(val)) 689 | buf2 := make([]byte, 8) 690 | binary.BigEndian.PutUint64(buf2, uint64(oldval)) 691 | if version > 1 { 692 | if !w.Delete(buf2) { 693 | panic("delete failed") 694 | } 695 | } 696 | w.Put(buf1) 697 | } 698 | } 699 | 700 | func TestLoadDeltaStoreDisk(t *testing.T) { 701 | os.RemoveAll("db.dump") 702 | conf := DefaultConfig() 703 | conf.UseDeltaInterleaving() 704 | db := NewWithConfig(conf) 705 | 706 | var writers []*Writer 707 | for i := 0; i < runtime.GOMAXPROCS(0); i++ { 708 | writers = append(writers, db.NewWriter()) 709 | } 710 | 711 | n := 1000000 712 | chunk := n / runtime.GOMAXPROCS(0) 713 | version := 0 714 | 715 | doMutate := func() *Snapshot { 716 | var wg sync.WaitGroup 717 | version++ 718 | for i := 0; i < runtime.GOMAXPROCS(0); i++ { 719 | wg.Add(1) 720 | start := i * chunk 721 | end := start + chunk 722 | go doUpdate(db, &wg, writers[i], start, end, version) 723 | } 724 | wg.Wait() 725 | 726 | snap, _ := db.NewSnapshot() 727 | return snap 728 | } 729 | 730 | var snap, snapw *Snapshot 731 | for x := 0; x < 2; x++ { 732 | if snap != nil { 733 | snap.Close() 734 | } 735 | snap = doMutate() 736 | } 737 | 738 | waiter := make(chan bool) 739 | var wg2 sync.WaitGroup 740 | wg2.Add(1) 741 | go func() { 742 | defer wg2.Done() 743 | 744 | for x := 0; x < 10; x++ { 745 | if snapw != nil { 746 | snapw.Close() 747 | } 748 | 749 | snapw = doMutate() 750 | if x == 0 { 751 | close(waiter) 752 | } 753 | } 754 | 755 | snap.Close() 756 | count := db.gcsnapshots.GetStats().NodeCount 757 | 758 | for count > 5 { 759 | time.Sleep(time.Second) 760 | count = db.gcsnapshots.GetStats().NodeCount 761 | } 762 | }() 763 | 764 | callb := func(itm *ItemEntry) { 765 | <-waiter 766 | } 767 | 768 | t0 := time.Now() 769 | err := db.StoreToDisk("db.dump", snap, 8, callb) 770 | if err != nil { 771 | t.Errorf("Expected no error. got=%v", err) 772 | } 773 | 774 | fmt.Printf("Storing to disk took %v\n", time.Since(t0)) 775 | 776 | wg2.Wait() 777 | snapw.Close() 778 | db.Close() 779 | 780 | db = NewWithConfig(conf) 781 | defer db.Close() 782 | t0 = time.Now() 783 | snap, err = db.LoadFromDisk("db.dump", 8, nil) 784 | defer snap.Close() 785 | if err != nil { 786 | t.Errorf("Expected no error. got=%v", err) 787 | } 788 | fmt.Printf("Loading from disk took %v\n", time.Since(t0)) 789 | 790 | count := CountItems(snap) 791 | if count != n { 792 | t.Errorf("Expected %v, got %v", n, count) 793 | } 794 | 795 | count = int(snap.Count()) 796 | if count != n { 797 | t.Errorf("Count mismatch on snapshot. 
Expected %d, got %d", n, count) 798 | } 799 | 800 | itr := snap.NewIterator() 801 | i := 0 802 | for itr.SeekFirst(); itr.Valid(); itr.Next() { 803 | itm := itr.Get() 804 | val := binary.BigEndian.Uint64(itm) 805 | exp := uint64(i) + uint64(2)*10000000 806 | 807 | if val != exp { 808 | t.Errorf("expected %d, got %d", exp, val) 809 | } 810 | i++ 811 | } 812 | itr.Close() 813 | 814 | fmt.Println(db.DumpStats()) 815 | fmt.Println("Restored", db.DeltaRestored) 816 | fmt.Println("RestoredFailed", db.DeltaRestoreFailed) 817 | } 818 | 819 | func TestExecuteConcurrGCWorkers(t *testing.T) { 820 | db := NewWithConfig(testConf) 821 | defer db.Close() 822 | 823 | w := db.NewWriter() 824 | 825 | for x := 0; x < 40; x++ { 826 | db.NewWriter() 827 | } 828 | 829 | for i := 0; i < 200000; i++ { 830 | w.Put([]byte(fmt.Sprintf("%010d", i))) 831 | } 832 | snap, _ := w.NewSnapshot() 833 | snap.Close() 834 | 835 | var snaps []*Snapshot 836 | for i := 0; i < 200000; i++ { 837 | if i%1000 == 0 { 838 | snap, _ := w.NewSnapshot() 839 | snaps = append(snaps, snap) 840 | } 841 | w.Delete([]byte(fmt.Sprintf("%010d", i))) 842 | } 843 | snap, _ = w.NewSnapshot() 844 | snaps = append(snaps, snap) 845 | 846 | barrier := w.store.GetAccesBarrier() 847 | bs := barrier.Acquire() 848 | barrier.Release(bs) 849 | for _, snap := range snaps { 850 | snap.Close() 851 | } 852 | 853 | for db.store.GetStats().NodeFrees != 200000 { 854 | time.Sleep(time.Millisecond) 855 | } 856 | } 857 | 858 | func TestCloseWithActiveIterators(t *testing.T) { 859 | var wg sync.WaitGroup 860 | db := NewWithConfig(testConf) 861 | 862 | w := db.NewWriter() 863 | for i := 0; i < 200000; i++ { 864 | w.Put([]byte(fmt.Sprintf("%010d", i))) 865 | } 866 | 867 | snap, _ := w.NewSnapshot() 868 | for i := 0; i < 1000; i++ { 869 | wg.Add(1) 870 | go func(wg *sync.WaitGroup) { 871 | defer wg.Done() 872 | 873 | if itr := db.NewIterator(snap); itr != nil { 874 | for x := 0; x < 5; x++ { 875 | for itr.SeekFirst(); itr.Valid(); itr.Next() { 876 | } 877 | } 878 | itr.Close() 879 | } 880 | }(&wg) 881 | } 882 | 883 | snap.Close() 884 | db.Close() 885 | wg.Wait() 886 | 887 | } 888 | 889 | func TestSimpleGet(t *testing.T) { 890 | db := NewWithConfig(testConf) 891 | w := db.NewWriter() 892 | 893 | n := 1000000 894 | buf := make([]byte, 8) 895 | for i := 0; i < n; i++ { 896 | binary.BigEndian.PutUint64(buf, uint64(i)) 897 | w.Put(buf) 898 | } 899 | 900 | snap, _ := w.NewSnapshot() 901 | itr := snap.NewIterator() 902 | for i := 0; i < n; i++ { 903 | binary.BigEndian.PutUint64(buf, uint64(i)) 904 | itr.Seek(buf) 905 | if !itr.Valid() { 906 | t.Errorf("invalid item %v, %v", i, buf) 907 | continue 908 | } 909 | 910 | x := binary.BigEndian.Uint64(itr.Get()) 911 | if uint64(i) != x { 912 | t.Errorf("Failed to lookup %d, got %d", i, x) 913 | } 914 | } 915 | } 916 | -------------------------------------------------------------------------------- /nitro.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2016 Couchbase, Inc. 2 | // Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file 3 | // except in compliance with the License. You may obtain a copy of the License at 4 | // http://www.apache.org/licenses/LICENSE-2.0 5 | // Unless required by applicable law or agreed to in writing, software distributed under the 6 | // License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 7 | // either express or implied. 
See the License for the specific language governing permissions
8 | // and limitations under the License.
9 | 
10 | package nitro
11 | 
12 | import (
13 | 	"bytes"
14 | 	"encoding/binary"
15 | 	"encoding/json"
16 | 	"fmt"
17 | 	"github.com/t3rm1n4l/nitro/mm"
18 | 	"github.com/t3rm1n4l/nitro/skiplist"
19 | 	"io"
20 | 	"io/ioutil"
21 | 	"math"
22 | 	"math/rand"
23 | 	"os"
24 | 	"path/filepath"
25 | 	"runtime"
26 | 	"sync"
27 | 	"sync/atomic"
28 | 	"time"
29 | 	"unsafe"
30 | )
31 | 
32 | var (
33 | 	// ErrMaxSnapshotsLimitReached means the 32-bit snapshot number has overflowed
34 | 	ErrMaxSnapshotsLimitReached = fmt.Errorf("Maximum snapshots limit reached")
35 | 	// ErrShutdown means an operation was attempted on a Nitro instance that has been shut down
36 | 	ErrShutdown = fmt.Errorf("Nitro instance has been shutdown")
37 | )
38 | 
39 | // KeyCompare implements the item data key comparator
40 | type KeyCompare func([]byte, []byte) int
41 | 
42 | // VisitorCallback implements the Nitro snapshot visitor callback
43 | type VisitorCallback func(*Item, int) error
44 | 
45 | // ItemEntry is a wrapper item struct used by the backup-file-to-Nitro restore callback
46 | type ItemEntry struct {
47 | 	itm *Item
48 | 	n   *skiplist.Node
49 | }
50 | 
51 | // Item returns the Nitro item
52 | func (e *ItemEntry) Item() *Item {
53 | 	return e.itm
54 | }
55 | 
56 | // Node returns the skiplist node which holds the item
57 | func (e *ItemEntry) Node() *skiplist.Node {
58 | 	return e.n
59 | }
60 | 
61 | // ItemCallback implements the callback used by the backup-file-to-Nitro restore API
62 | type ItemCallback func(*ItemEntry)
63 | 
64 | const (
65 | 	defaultRefreshRate = 10000
66 | 	gcchanBufSize      = 256
67 | )
68 | 
69 | var (
70 | 	dbInstances      *skiplist.Skiplist
71 | 	dbInstancesCount int64
72 | )
73 | 
74 | func init() {
75 | 	dbInstances = skiplist.New()
76 | }
77 | 
78 | // CompareNitro implements the comparator for Nitro instances based on their ids
79 | func CompareNitro(this unsafe.Pointer, that unsafe.Pointer) int {
80 | 	thisItem := (*Nitro)(this)
81 | 	thatItem := (*Nitro)(that)
82 | 
83 | 	return int(thisItem.id - thatItem.id)
84 | }
85 | 
86 | // DefaultConfig returns the default Nitro configuration
87 | func DefaultConfig() Config {
88 | 	var cfg Config
89 | 	cfg.SetKeyComparator(defaultKeyCmp)
90 | 	cfg.fileType = RawdbFile
91 | 	cfg.useMemoryMgmt = false
92 | 	cfg.refreshRate = defaultRefreshRate
93 | 	// TODO: Remove this
94 | 	cfg.storageShards = 48
95 | 	return cfg
96 | }
97 | 
98 | func newInsertCompare(keyCmp KeyCompare) skiplist.CompareFn {
99 | 	return func(this, that unsafe.Pointer) int {
100 | 		var v int
101 | 		thisItem := (*Item)(this)
102 | 		thatItem := (*Item)(that)
103 | 		if v = keyCmp(thisItem.Bytes(), thatItem.Bytes()); v == 0 {
104 | 			var thisSn, thatSn uint32
105 | 
106 | 			if thisItem.bornSn == 0 {
107 | 				thisSn = thisItem.deadSn
108 | 			} else {
109 | 				thisSn = thisItem.bornSn
110 | 			}
111 | 
112 | 			if thatItem.bornSn == 0 {
113 | 				thatSn = thatItem.deadSn
114 | 			} else {
115 | 				thatSn = thatItem.bornSn
116 | 			}
117 | 			v = int(thisSn) - int(thatSn)
118 | 		}
119 | 
120 | 		return v
121 | 	}
122 | }
123 | 
124 | func newIterCompare(keyCmp KeyCompare) skiplist.CompareFn {
125 | 	return func(this, that unsafe.Pointer) int {
126 | 		thisItem := (*Item)(this)
127 | 		thatItem := (*Item)(that)
128 | 		return keyCmp(thisItem.Bytes(), thatItem.Bytes())
129 | 	}
130 | }
131 | 
132 | func newExistCompare(keyCmp KeyCompare) skiplist.CompareFn {
133 | 	return func(this, that unsafe.Pointer) int {
134 | 		thisItem := (*Item)(this)
135 | 		thatItem := (*Item)(that)
136 | 		if thisItem.deadSn != 0 || thatItem.deadSn != 0 {
137 | 			return 1
138 | 		}
139 | 		return keyCmp(thisItem.Bytes(), thatItem.Bytes())
140 | 	}
141 | }
142 | 
143 | func defaultKeyCmp(this, that []byte) int {
144 | 	return bytes.Compare(this, that)
145 | }
146 | 
147 | const (
148 | 	dwStateInactive = iota
149 | 	dwStateInit
150 | 	dwStateActive
151 | 	dwStateTerminate
152 | )
153 | 
154 | type deltaWrContext struct {
155 | 	state        int
156 | 	closed       chan struct{}
157 | 	notifyStatus chan error
158 | 	sn           uint32
159 | 	fw           FileWriter
160 | 	err          error
161 | }
162 | 
163 | func (ctx *deltaWrContext) Init() {
164 | 	ctx.state = dwStateInactive
165 | 	ctx.notifyStatus = make(chan error)
166 | 	ctx.closed = make(chan struct{})
167 | }
168 | 
169 | // Writer provides a handle for concurrent access.
170 | // A Nitro writer is not thread-safe: initialize a separate Writer for each
171 | // thread that needs to perform concurrent writes.
172 | type Writer struct {
173 | 	dwrCtx deltaWrContext // Used for cooperative disk snapshotting
174 | 
175 | 	rand   *rand.Rand
176 | 	buf    *skiplist.ActionBuffer
177 | 	gchead *skiplist.Node
178 | 	gctail *skiplist.Node
179 | 	next   *Writer
180 | 	// Local skiplist stats for writer, gcworker and freeworker
181 | 	slSts1, slSts2, slSts3 skiplist.Stats
182 | 	resSts                 restoreStats
183 | 	count                  int64
184 | 
185 | 	*Nitro
186 | 	fd     *os.File
187 | 	rfd    *os.File
188 | 	offset int
189 | }
190 | 
191 | func (w *Writer) doCheckpoint() {
192 | 	ctx := &w.dwrCtx
193 | 	switch ctx.state {
194 | 	case dwStateInit:
195 | 		ctx.state = dwStateActive
196 | 		ctx.notifyStatus <- nil
197 | 		ctx.err = nil
198 | 	case dwStateTerminate:
199 | 		ctx.state = dwStateInactive
200 | 		ctx.notifyStatus <- ctx.err
201 | 	}
202 | }
203 | 
204 | func (w *Writer) doDeltaWrite(itm *Item) {
205 | 	ctx := &w.dwrCtx
206 | 	if ctx.state == dwStateActive {
207 | 		if itm.bornSn <= ctx.sn && itm.deadSn > ctx.sn {
208 | 			if err := ctx.fw.WriteItem(itm); err != nil {
209 | 				ctx.err = err
210 | 			}
211 | 		}
212 | 	}
213 | }
214 | 
215 | // Put inserts an item into Nitro.
216 | // Put fails if the item already exists.
217 | func (w *Writer) Put(bs []byte) {
218 | 	w.Put2(bs)
219 | }
220 | 
221 | // Put2 returns the skiplist node of the item if Put() succeeds
222 | func (w *Writer) Put2(bs []byte) *skiplist.Node {
223 | 	return w.insert(bs, true)
224 | }
225 | 
226 | func (w *Writer) insert(bs []byte, isCreate bool) (n *skiplist.Node) {
227 | 	var success bool
228 | 	x := w.newItem(bs, w.useMemoryMgmt)
229 | 	if isCreate {
230 | 		x.bornSn = w.getCurrSn()
231 | 	} else {
232 | 		x.deadSn = w.getCurrSn()
233 | 	}
234 | 	n, success = w.store.Insert2(unsafe.Pointer(x), w.insCmp, w.existCmp, w.buf,
235 | 		w.rand.Float32, &w.slSts1)
236 | 	if success {
237 | 		w.count++
238 | 	} else {
239 | 		w.freeItem(x)
240 | 	}
241 | 	return
242 | }
243 | 
244 | // Delete deletes an item.
245 | // Delete always succeeds if the item exists.
246 | func (w *Writer) Delete(bs []byte) (success bool) {
247 | 	_, success = w.Delete2(bs)
248 | 	return
249 | }
250 | 
251 | // Delete2 is the same as Delete(), but additionally returns the deleted item's node
252 | func (w *Writer) Delete2(bs []byte) (n *skiplist.Node, success bool) {
253 | 	if n := w.GetNode(bs); n != nil {
254 | 		return n, w.DeleteNode(n)
255 | 	}
256 | 
257 | 	return nil, false
258 | }
259 | 
260 | // DeleteNode deletes an item by specifying its skiplist Node.
261 | // Using this API avoids an O(log n) lookup during Delete().
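//
// A minimal sketch of the intended use (editorial example, not from the
// original source; assumes "key" was inserted earlier via w.Put):
//
//	if n := w.GetNode([]byte("key")); n != nil {
//		w.DeleteNode(n) // reuses the located node, skipping the extra lookup
//	}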
262 | func (w *Writer) DeleteNode(x *skiplist.Node) (success bool) {
263 | 	defer func() {
264 | 		if success {
265 | 			w.count--
266 | 		}
267 | 	}()
268 | 
269 | 	x.GClink = nil
270 | 	sn := w.getCurrSn()
271 | 	gotItem := (*Item)(x.Item())
272 | 	if gotItem.bornSn == sn {
273 | 		success = w.store.DeleteNode(x, w.insCmp, w.buf, &w.slSts1)
274 | 
275 | 		barrier := w.store.GetAccesBarrier()
276 | 		barrier.FlushSession(unsafe.Pointer(x))
277 | 		return
278 | 	}
279 | 
280 | 	success = atomic.CompareAndSwapUint32(&gotItem.deadSn, 0, sn)
281 | 	if success {
282 | 		if w.gctail == nil {
283 | 			w.gctail = x
284 | 			w.gchead = w.gctail
285 | 		} else {
286 | 			w.gctail.GClink = x
287 | 			w.gctail = x
288 | 		}
289 | 	}
290 | 	return
291 | }
292 | 
293 | // DeleteNonExist creates a delete marker node if an item does not exist
294 | func (w *Writer) DeleteNonExist(bs []byte) bool {
295 | 	if n := w.GetNode(bs); n != nil {
296 | 		return w.DeleteNode(n)
297 | 	}
298 | 
299 | 	return w.insert(bs, false) != nil
300 | }
301 | 
302 | // GetNode looks up an item and returns its skiplist Node.
303 | // This API enables looking up an item without using a snapshot handle.
304 | func (w *Writer) GetNode(bs []byte) *skiplist.Node {
305 | 	iter := w.store.NewIterator(w.iterCmp, w.buf)
306 | 	defer iter.Close()
307 | 
308 | 	x := w.newItem(bs, false)
309 | 	x.bornSn = w.getCurrSn()
310 | 
311 | 	if found := iter.SeekWithCmp(unsafe.Pointer(x), w.insCmp, w.existCmp); found {
312 | 		return iter.GetNode()
313 | 	}
314 | 
315 | 	return nil
316 | }
317 | 
318 | // Config holds the Nitro instance configuration
319 | type Config struct {
320 | 	keyCmp   KeyCompare
321 | 	insCmp   skiplist.CompareFn
322 | 	iterCmp  skiplist.CompareFn
323 | 	existCmp skiplist.CompareFn
324 | 
325 | 	refreshRate int
326 | 	fileType    FileType
327 | 
328 | 	useMemoryMgmt bool
329 | 	useDeltaFiles bool
330 | 	mallocFun     skiplist.MallocFn
331 | 	freeFun       skiplist.FreeFn
332 | 	blockStoreDir string
333 | 	storageShards int
334 | }
335 | 
336 | // SetKeyComparator sets the key comparator for Nitro item data
337 | func (cfg *Config) SetKeyComparator(cmp KeyCompare) {
338 | 	cfg.keyCmp = cmp
339 | 	cfg.insCmp = newInsertCompare(cmp)
340 | 	cfg.iterCmp = newIterCompare(cmp)
341 | 	cfg.existCmp = newExistCompare(cmp)
342 | }
343 | func (cfg *Config) SetBlockStoreDir(p string) {
344 | 	cfg.blockStoreDir = p
345 | }
346 | 
347 | func (cfg *Config) HasBlockStore() bool {
348 | 	return cfg.blockStoreDir != ""
349 | }
350 | 
351 | // UseMemoryMgmt sets a custom memory allocator for Nitro item storage
352 | func (cfg *Config) UseMemoryMgmt(malloc skiplist.MallocFn, free skiplist.FreeFn) {
353 | 	if runtime.GOARCH == "amd64" {
354 | 		cfg.useMemoryMgmt = true
355 | 		cfg.mallocFun = malloc
356 | 		cfg.freeFun = free
357 | 	}
358 | }
359 | 
360 | // UseDeltaInterleaving enables a non-intrusive backup mode that avoids the
361 | // extra memory otherwise needed to pin older snapshots during a disk backup.
362 | // It eliminates the need to lock garbage-collectible old snapshots, but the
363 | // backup may consume additional disk space.
364 | func (cfg *Config) UseDeltaInterleaving() { 365 | cfg.useDeltaFiles = true 366 | } 367 | 368 | type restoreStats struct { 369 | DeltaRestored uint64 370 | DeltaRestoreFailed uint64 371 | } 372 | 373 | // Nitro instance 374 | type Nitro struct { 375 | id int 376 | store *skiplist.Skiplist 377 | currSn uint32 378 | snapshots *skiplist.Skiplist 379 | gcsnapshots *skiplist.Skiplist 380 | isGCRunning int32 381 | lastGCSn uint32 382 | leastUnrefSn uint32 383 | itemsCount int64 384 | 385 | // Used to push gclist from current snapshot. 386 | parentSnap *Snapshot 387 | 388 | wlist *Writer 389 | gcchan chan *skiplist.Node 390 | freechan chan *skiplist.Node 391 | 392 | shardWrs []*diskWriter 393 | bm BlockManager 394 | 395 | hasShutdown bool 396 | shutdownWg1 sync.WaitGroup // GC workers and StoreToDisk task 397 | shutdownWg2 sync.WaitGroup // Free workers 398 | 399 | Config 400 | restoreStats 401 | } 402 | 403 | // NewWithConfig creates a new Nitro instance based on provided configuration. 404 | func NewWithConfig(cfg Config) *Nitro { 405 | m := &Nitro{ 406 | snapshots: skiplist.New(), 407 | gcsnapshots: skiplist.New(), 408 | currSn: 1, 409 | Config: cfg, 410 | gcchan: make(chan *skiplist.Node, gcchanBufSize), 411 | id: int(atomic.AddInt64(&dbInstancesCount, 1)), 412 | } 413 | 414 | m.freechan = make(chan *skiplist.Node, gcchanBufSize) 415 | m.store = skiplist.NewWithConfig(m.newStoreConfig()) 416 | m.initSizeFuns() 417 | 418 | buf := dbInstances.MakeBuf() 419 | defer dbInstances.FreeBuf(buf) 420 | dbInstances.Insert(unsafe.Pointer(m), CompareNitro, buf, &dbInstances.Stats) 421 | 422 | if cfg.HasBlockStore() { 423 | var err error 424 | m.bm, err = newFileBlockManager(cfg.storageShards, cfg.blockStoreDir) 425 | if err != nil { 426 | panic(err) 427 | } 428 | 429 | for i := 0; i < cfg.storageShards; i++ { 430 | m.shardWrs = append(m.shardWrs, m.newDiskWriter(i)) 431 | } 432 | } 433 | 434 | return m 435 | 436 | } 437 | 438 | func (m *Nitro) newStoreConfig() skiplist.Config { 439 | slCfg := skiplist.DefaultConfig() 440 | if m.useMemoryMgmt { 441 | slCfg.UseMemoryMgmt = true 442 | slCfg.Malloc = m.mallocFun 443 | slCfg.Free = m.freeFun 444 | slCfg.BarrierDestructor = m.newBSDestructor() 445 | 446 | } 447 | return slCfg 448 | } 449 | 450 | func (m *Nitro) newBSDestructor() skiplist.BarrierSessionDestructor { 451 | return func(ref unsafe.Pointer) { 452 | // If gclist is not empty 453 | if ref != nil { 454 | freelist := (*skiplist.Node)(ref) 455 | m.freechan <- freelist 456 | } 457 | } 458 | } 459 | 460 | func (m *Nitro) initSizeFuns() { 461 | m.snapshots.SetItemSizeFunc(SnapshotSize) 462 | m.gcsnapshots.SetItemSizeFunc(SnapshotSize) 463 | m.store.SetItemSizeFunc(ItemSize) 464 | } 465 | 466 | // New creates a Nitro instance using default configuration 467 | func New() *Nitro { 468 | return NewWithConfig(DefaultConfig()) 469 | } 470 | 471 | // MemoryInUse returns total memory used by the Nitro instance. 
472 | func (m *Nitro) MemoryInUse() int64 { 473 | storeStats := m.aggrStoreStats() 474 | return storeStats.Memory + m.snapshots.MemoryInUse() + m.gcsnapshots.MemoryInUse() 475 | } 476 | 477 | // Close shuts down the nitro instance 478 | func (m *Nitro) Close() { 479 | if m.parentSnap != nil { 480 | m.parentSnap.Close() 481 | } 482 | 483 | // Wait until all snapshot iterators have finished 484 | for s := m.snapshots.GetStats(); int(s.NodeCount) != 0; s = m.snapshots.GetStats() { 485 | time.Sleep(time.Millisecond) 486 | } 487 | 488 | m.hasShutdown = true 489 | 490 | // Acquire gc chan ownership 491 | // This will make sure that no other goroutine will write to gcchan 492 | for !atomic.CompareAndSwapInt32(&m.isGCRunning, 0, 1) { 493 | time.Sleep(time.Millisecond) 494 | } 495 | close(m.gcchan) 496 | 497 | buf := dbInstances.MakeBuf() 498 | defer dbInstances.FreeBuf(buf) 499 | dbInstances.Delete(unsafe.Pointer(m), CompareNitro, buf, &dbInstances.Stats) 500 | 501 | if m.useMemoryMgmt { 502 | buf := m.snapshots.MakeBuf() 503 | defer m.snapshots.FreeBuf(buf) 504 | 505 | m.shutdownWg1.Wait() 506 | close(m.freechan) 507 | m.shutdownWg2.Wait() 508 | 509 | // Manually free up all nodes 510 | iter := m.store.NewIterator(m.iterCmp, buf) 511 | defer iter.Close() 512 | var lastNode *skiplist.Node 513 | 514 | iter.SeekFirst() 515 | if iter.Valid() { 516 | lastNode = iter.GetNode() 517 | iter.Next() 518 | } 519 | 520 | for lastNode != nil { 521 | m.freeItem((*Item)(lastNode.Item())) 522 | m.store.FreeNode(lastNode, &m.store.Stats) 523 | lastNode = nil 524 | 525 | if iter.Valid() { 526 | lastNode = iter.GetNode() 527 | iter.Next() 528 | } 529 | } 530 | } 531 | } 532 | 533 | func (m *Nitro) getCurrSn() uint32 { 534 | return atomic.LoadUint32(&m.currSn) 535 | } 536 | 537 | func (m *Nitro) newWriter() *Writer { 538 | w := &Writer{ 539 | rand: rand.New(rand.NewSource(int64(rand.Int()))), 540 | buf: m.store.MakeBuf(), 541 | Nitro: m, 542 | } 543 | 544 | w.slSts1.IsLocal(true) 545 | w.slSts2.IsLocal(true) 546 | w.slSts3.IsLocal(true) 547 | return w 548 | } 549 | 550 | // NewWriter creates a Nitro writer 551 | func (m *Nitro) NewWriter() *Writer { 552 | w := m.newWriter() 553 | w.next = m.wlist 554 | m.wlist = w 555 | w.dwrCtx.Init() 556 | 557 | m.shutdownWg1.Add(1) 558 | go m.collectionWorker(w) 559 | if m.useMemoryMgmt { 560 | m.shutdownWg2.Add(1) 561 | go m.freeWorker(w) 562 | } 563 | 564 | return w 565 | } 566 | 567 | // Snapshot describes Nitro immutable snapshot 568 | type Snapshot struct { 569 | sn uint32 570 | refCount int32 571 | db *Nitro 572 | count int64 573 | 574 | gclist *skiplist.Node 575 | } 576 | 577 | // SnapshotSize returns the memory used by Nitro snapshot metadata 578 | func SnapshotSize(p unsafe.Pointer) int { 579 | s := (*Snapshot)(p) 580 | return int(unsafe.Sizeof(s.sn) + unsafe.Sizeof(s.refCount) + unsafe.Sizeof(s.db) + 581 | unsafe.Sizeof(s.count) + unsafe.Sizeof(s.gclist)) 582 | } 583 | 584 | // Count returns the number of items in the Nitro snapshot 585 | func (s Snapshot) Count() int64 { 586 | return s.count 587 | } 588 | 589 | // Encode implements Binary encoder for snapshot metadata 590 | func (s *Snapshot) Encode(buf []byte, w io.Writer) error { 591 | l := 4 592 | if len(buf) < l { 593 | return errNotEnoughSpace 594 | } 595 | 596 | binary.BigEndian.PutUint32(buf[0:4], s.sn) 597 | if _, err := w.Write(buf[0:4]); err != nil { 598 | return err 599 | } 600 | 601 | return nil 602 | 603 | } 604 | 605 | // Decode implements binary decoder for snapshot metadata 606 | func (s *Snapshot) 
Decode(buf []byte, r io.Reader) error {
607 | 	if _, err := io.ReadFull(r, buf[0:4]); err != nil {
608 | 		return err
609 | 	}
610 | 	s.sn = binary.BigEndian.Uint32(buf[0:4])
611 | 	return nil
612 | }
613 | 
614 | // Open implements reference counting and garbage collection for snapshots.
615 | // When snapshots are shared by multiple threads, each thread should Open the
616 | // snapshot. This API internally tracks the reference count for the snapshot.
617 | func (s *Snapshot) Open() bool {
618 | 	if atomic.LoadInt32(&s.refCount) == 0 {
619 | 		return false
620 | 	}
621 | 	atomic.AddInt32(&s.refCount, 1)
622 | 	return true
623 | }
624 | 
625 | // Close is the snapshot destructor.
626 | // Once a thread has finished using a snapshot, it can be destroyed by calling
627 | // Close(). The internal garbage collector takes care of freeing the items.
628 | func (s *Snapshot) Close() {
629 | 	newRefcount := atomic.AddInt32(&s.refCount, -1)
630 | 	if newRefcount == 0 {
631 | 		buf := s.db.snapshots.MakeBuf()
632 | 		defer s.db.snapshots.FreeBuf(buf)
633 | 
634 | 		// Move from live snapshot list to dead list
635 | 		s.db.snapshots.Delete(unsafe.Pointer(s), CompareSnapshot, buf, &s.db.snapshots.Stats)
636 | 		s.db.gcsnapshots.Insert(unsafe.Pointer(s), CompareSnapshot, buf, &s.db.gcsnapshots.Stats)
637 | 		s.db.GC()
638 | 	}
639 | }
640 | 
641 | // NewIterator creates a new snapshot iterator
642 | func (s *Snapshot) NewIterator() *Iterator {
643 | 	return s.db.NewIterator(s)
644 | }
645 | 
646 | // CompareSnapshot implements the comparator for snapshots based on snapshot number
647 | func CompareSnapshot(this, that unsafe.Pointer) int {
648 | 	thisItem := (*Snapshot)(this)
649 | 	thatItem := (*Snapshot)(that)
650 | 
651 | 	return int(thisItem.sn) - int(thatItem.sn)
652 | }
653 | 
654 | // NewSnapshot creates a new Nitro snapshot.
655 | // This API is not thread-safe.
656 | // While this API is invoked, no other Nitro writer should concurrently call any
657 | // public APIs such as Put*() and Delete*().
658 | func (m *Nitro) NewSnapshot() (*Snapshot, error) { 659 | buf := m.snapshots.MakeBuf() 660 | defer m.snapshots.FreeBuf(buf) 661 | 662 | // Stitch all local gclists from all writers to create snapshot gclist 663 | var head, tail *skiplist.Node 664 | 665 | for w := m.wlist; w != nil; w = w.next { 666 | if tail == nil { 667 | head = w.gchead 668 | tail = w.gctail 669 | } else if w.gchead != nil { 670 | tail.GClink = w.gchead 671 | tail = w.gctail 672 | } 673 | 674 | w.gchead = nil 675 | w.gctail = nil 676 | 677 | // Update global stats 678 | m.store.Stats.Merge(&w.slSts1) 679 | atomic.AddInt64(&m.itemsCount, w.count) 680 | w.count = 0 681 | } 682 | 683 | snap := &Snapshot{db: m, sn: m.getCurrSn(), refCount: 2, count: m.ItemsCount()} 684 | m.snapshots.Insert(unsafe.Pointer(snap), CompareSnapshot, buf, &m.snapshots.Stats) 685 | if m.parentSnap != nil { 686 | m.parentSnap.gclist = head 687 | m.parentSnap.Close() 688 | } 689 | m.parentSnap = snap 690 | 691 | newSn := atomic.AddUint32(&m.currSn, 1) 692 | if newSn == math.MaxUint32 { 693 | return nil, ErrMaxSnapshotsLimitReached 694 | } 695 | 696 | return snap, nil 697 | } 698 | 699 | // ItemsCount returns the number of items in the Nitro instance 700 | func (m *Nitro) ItemsCount() int64 { 701 | return atomic.LoadInt64(&m.itemsCount) 702 | } 703 | 704 | func (m *Nitro) collectionWorker(w *Writer) { 705 | buf := m.store.MakeBuf() 706 | defer m.store.FreeBuf(buf) 707 | defer m.shutdownWg1.Done() 708 | 709 | for { 710 | select { 711 | case <-w.dwrCtx.notifyStatus: 712 | w.doCheckpoint() 713 | case gclist, ok := <-m.gcchan: 714 | if !ok { 715 | close(w.dwrCtx.closed) 716 | return 717 | } 718 | for n := gclist; n != nil; n = n.GClink { 719 | w.doDeltaWrite((*Item)(n.Item())) 720 | m.store.DeleteNode(n, m.insCmp, buf, &w.slSts2) 721 | } 722 | 723 | m.store.Stats.Merge(&w.slSts2) 724 | 725 | barrier := m.store.GetAccesBarrier() 726 | barrier.FlushSession(unsafe.Pointer(gclist)) 727 | } 728 | } 729 | } 730 | 731 | func (m *Nitro) freeWorker(w *Writer) { 732 | for freelist := range m.freechan { 733 | for n := freelist; n != nil; { 734 | dnode := n 735 | n = n.GClink 736 | 737 | if m.HasBlockStore() { 738 | m.bm.DeleteBlock(blockPtr(dnode.DataPtr)) 739 | } 740 | 741 | itm := (*Item)(dnode.Item()) 742 | m.freeItem(itm) 743 | m.store.FreeNode(dnode, &w.slSts3) 744 | } 745 | 746 | m.store.Stats.Merge(&w.slSts3) 747 | } 748 | 749 | m.shutdownWg2.Done() 750 | } 751 | 752 | // Invariant: Each snapshot n is dependent on snapshot n-1. 753 | // Unless snapshot n-1 is collected, snapshot n cannot be collected. 754 | func (m *Nitro) collectDead() { 755 | buf1 := m.snapshots.MakeBuf() 756 | buf2 := m.snapshots.MakeBuf() 757 | defer m.snapshots.FreeBuf(buf1) 758 | defer m.snapshots.FreeBuf(buf2) 759 | 760 | iter := m.gcsnapshots.NewIterator(CompareSnapshot, buf1) 761 | defer iter.Close() 762 | 763 | for iter.SeekFirst(); iter.Valid(); iter.Next() { 764 | node := iter.GetNode() 765 | sn := (*Snapshot)(node.Item()) 766 | if sn.sn != m.lastGCSn+1 { 767 | return 768 | } 769 | 770 | m.lastGCSn = sn.sn 771 | m.gcchan <- sn.gclist 772 | m.gcsnapshots.DeleteNode(node, CompareSnapshot, buf2, &m.gcsnapshots.Stats) 773 | } 774 | } 775 | 776 | // GC implements manual garbage collection of Nitro snapshots. 
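//
// Editorial note: GC also runs automatically from Snapshot.Close() above, so
// an explicit call is only a nudge; the CompareAndSwap below makes it a no-op
// while another GC pass is already in progress.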
777 | func (m *Nitro) GC() {
778 | 	if atomic.CompareAndSwapInt32(&m.isGCRunning, 0, 1) {
779 | 		m.collectDead()
780 | 		atomic.CompareAndSwapInt32(&m.isGCRunning, 1, 0)
781 | 	}
782 | }
783 | 
784 | // GetSnapshots returns the list of currently live snapshots.
785 | // This API is mainly for debugging purposes.
786 | func (m *Nitro) GetSnapshots() []*Snapshot {
787 | 	var snaps []*Snapshot
788 | 	buf := m.snapshots.MakeBuf()
789 | 	defer m.snapshots.FreeBuf(buf)
790 | 	iter := m.snapshots.NewIterator(CompareSnapshot, buf)
791 | 	iter.SeekFirst()
792 | 	for ; iter.Valid(); iter.Next() {
793 | 		snaps = append(snaps, (*Snapshot)(iter.Get()))
794 | 	}
795 | 
796 | 	return snaps
797 | }
798 | 
799 | func (m *Nitro) ptrToItem(itmPtr unsafe.Pointer) *Item {
800 | 	o := (*Item)(itmPtr)
801 | 	itm := m.newItem(o.Bytes(), false)
802 | 	*itm = *o
803 | 
804 | 	return itm
805 | }
806 | 
807 | // Visitor implements a concurrent Nitro snapshot visitor.
808 | // It divides the snapshot's key range into `shards` range partitions and
809 | // visits them using the specified number of concurrent workers.
810 | func (m *Nitro) Visitor(snap *Snapshot, callb VisitorCallback, shards int, concurrency int) error {
811 | 	var wg sync.WaitGroup
812 | 
813 | 	wch := make(chan int, shards)
814 | 
815 | 	if snap == nil {
816 | 		panic("snapshot cannot be nil")
817 | 	}
818 | 
819 | 	pivotItems := m.partitionPivots(snap, shards)
820 | 	errors := make([]error, len(pivotItems)-1)
821 | 
822 | 	// Run workers
823 | 	for i := 0; i < concurrency; i++ {
824 | 		wg.Add(1)
825 | 		go func(wg *sync.WaitGroup) {
826 | 			defer wg.Done()
827 | 
828 | 			for shard := range wch {
829 | 				startItem := pivotItems[shard]
830 | 				endItem := pivotItems[shard+1]
831 | 
832 | 				itr := m.NewIterator(snap)
833 | 				if itr == nil {
834 | 					panic("iterator cannot be nil")
835 | 				}
836 | 				defer itr.Close()
837 | 
838 | 				itr.SetRefreshRate(m.refreshRate)
839 | 				itr.Seek(startItem.Bytes())
840 | 				itr.SetEnd(endItem.Bytes())
841 | 
842 | 				for ; itr.Valid(); itr.Next() {
843 | 					itm := (*Item)(itr.GetNode().Item())
844 | 					if err := callb(itm, shard); err != nil {
845 | 						errors[shard] = err
846 | 						return
847 | 					}
848 | 				}
849 | 			}
850 | 		}(&wg)
851 | 	}
852 | 
853 | 	// Provide work and wait
854 | 	for shard := 0; shard < len(pivotItems)-1; shard++ {
855 | 		wch <- shard
856 | 	}
857 | 	close(wch)
858 | 
859 | 	wg.Wait()
860 | 
861 | 	for _, err := range errors {
862 | 		if err != nil {
863 | 			return err
864 | 		}
865 | 	}
866 | 
867 | 	return nil
868 | }
869 | 
870 | func (m *Nitro) numWriters() int {
871 | 	var count int
872 | 	for w := m.wlist; w != nil; w = w.next {
873 | 		count++
874 | 	}
875 | 
876 | 	return count
877 | }
878 | 
879 | func (m *Nitro) changeDeltaWrState(state int,
880 | 	writers []FileWriter, snap *Snapshot) error {
881 | 
882 | 	var err error
883 | 
884 | 	for id, w := 0, m.wlist; w != nil; w, id = w.next, id+1 {
885 | 		w.dwrCtx.state = state
886 | 		if state == dwStateInit {
887 | 			w.dwrCtx.sn = snap.sn
888 | 			w.dwrCtx.fw = writers[id]
889 | 		}
890 | 
891 | 		// send
892 | 		select {
893 | 		case w.dwrCtx.notifyStatus <- nil:
894 | 			break
895 | 		case <-w.dwrCtx.closed:
896 | 			return ErrShutdown
897 | 		}
898 | 
899 | 		// receive
900 | 		select {
901 | 		case e := <-w.dwrCtx.notifyStatus:
902 | 			if e != nil {
903 | 				err = e
904 | 			}
905 | 			break
906 | 		case <-w.dwrCtx.closed:
907 | 			return ErrShutdown
908 | 		}
909 | 	}
910 | 
911 | 	return err
912 | }
913 | 
914 | // StoreToDisk backs up a Nitro snapshot to disk.
915 | // Concurrent workers perform the backup; the level of concurrency can be specified.
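//
// A hedged usage sketch (editorial; "backup-dir" is an illustrative path).
// Note that StoreToDisk closes the snapshot on the caller's behalf:
//
//	snap, _ := db.NewSnapshot()
//	if err := db.StoreToDisk("backup-dir", snap, 8, nil); err != nil {
//		// handle the error
//	}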
916 | func (m *Nitro) StoreToDisk(dir string, snap *Snapshot, concurr int, itmCallback ItemCallback) (err error) { 917 | 918 | var snapClosed bool 919 | defer func() { 920 | if !snapClosed { 921 | snap.Close() 922 | } 923 | }() 924 | 925 | if m.useMemoryMgmt { 926 | m.shutdownWg1.Add(1) 927 | defer m.shutdownWg1.Done() 928 | } 929 | 930 | datadir := filepath.Join(dir, "data") 931 | os.MkdirAll(datadir, 0755) 932 | shards := runtime.NumCPU() 933 | 934 | writers := make([]FileWriter, shards) 935 | files := make([]string, shards) 936 | defer func() { 937 | for _, w := range writers { 938 | if w != nil { 939 | w.Close() 940 | } 941 | } 942 | }() 943 | 944 | for shard := 0; shard < shards; shard++ { 945 | w := m.newFileWriter(m.fileType) 946 | file := fmt.Sprintf("shard-%d", shard) 947 | datafile := filepath.Join(datadir, file) 948 | if err := w.Open(datafile); err != nil { 949 | return err 950 | } 951 | 952 | writers[shard] = w 953 | files[shard] = file 954 | } 955 | 956 | // Initialize and setup delta processing 957 | if m.useDeltaFiles { 958 | deltaWriters := make([]FileWriter, m.numWriters()) 959 | deltaFiles := make([]string, m.numWriters()) 960 | defer func() { 961 | for _, w := range deltaWriters { 962 | if w != nil { 963 | w.Close() 964 | } 965 | } 966 | }() 967 | 968 | deltadir := filepath.Join(dir, "delta") 969 | os.MkdirAll(deltadir, 0755) 970 | for id := 0; id < m.numWriters(); id++ { 971 | dw := m.newFileWriter(m.fileType) 972 | file := fmt.Sprintf("shard-%d", id) 973 | deltafile := filepath.Join(deltadir, file) 974 | if err = dw.Open(deltafile); err != nil { 975 | return err 976 | } 977 | deltaWriters[id] = dw 978 | deltaFiles[id] = file 979 | } 980 | 981 | if err = m.changeDeltaWrState(dwStateInit, deltaWriters, snap); err != nil { 982 | return err 983 | } 984 | 985 | // Create a placeholder snapshot object. We are decoupled from holding snapshot items 986 | // The fakeSnap object is to use the same iterator without any special handling for 987 | // usual refcount based freeing. 
988 | 989 | snap.Close() 990 | snapClosed = true 991 | fakeSnap := *snap 992 | fakeSnap.refCount = 1 993 | snap = &fakeSnap 994 | 995 | defer func() { 996 | if err = m.changeDeltaWrState(dwStateTerminate, nil, nil); err == nil { 997 | bs, _ := json.Marshal(deltaFiles) 998 | ioutil.WriteFile(filepath.Join(deltadir, "files.json"), bs, 0660) 999 | } 1000 | }() 1001 | } 1002 | 1003 | visitorCallback := func(itm *Item, shard int) error { 1004 | if m.hasShutdown { 1005 | return ErrShutdown 1006 | } 1007 | 1008 | w := writers[shard] 1009 | if err := w.WriteItem(itm); err != nil { 1010 | return err 1011 | } 1012 | 1013 | if itmCallback != nil { 1014 | itmCallback(&ItemEntry{itm: itm, n: nil}) 1015 | } 1016 | 1017 | return nil 1018 | } 1019 | 1020 | if err = m.Visitor(snap, visitorCallback, shards, concurr); err == nil { 1021 | bs, _ := json.Marshal(files) 1022 | ioutil.WriteFile(filepath.Join(datadir, "files.json"), bs, 0660) 1023 | } 1024 | 1025 | return err 1026 | } 1027 | 1028 | // LoadFromDisk restores Nitro from a disk backup 1029 | func (m *Nitro) LoadFromDisk(dir string, concurr int, callb ItemCallback) (*Snapshot, error) { 1030 | var wg sync.WaitGroup 1031 | var files []string 1032 | var bs []byte 1033 | var err error 1034 | datadir := filepath.Join(dir, "data") 1035 | 1036 | if bs, err = ioutil.ReadFile(filepath.Join(datadir, "files.json")); err != nil { 1037 | return nil, err 1038 | } 1039 | json.Unmarshal(bs, &files) 1040 | 1041 | var nodeCallb skiplist.NodeCallback 1042 | wchan := make(chan int) 1043 | b := skiplist.NewBuilderWithConfig(m.newStoreConfig()) 1044 | b.SetItemSizeFunc(ItemSize) 1045 | segments := make([]*skiplist.Segment, len(files)) 1046 | readers := make([]FileReader, len(files)) 1047 | errors := make([]error, len(files)) 1048 | 1049 | if callb != nil { 1050 | nodeCallb = func(n *skiplist.Node) { 1051 | callb(&ItemEntry{itm: (*Item)(n.Item()), n: n}) 1052 | } 1053 | } 1054 | 1055 | defer func() { 1056 | for _, r := range readers { 1057 | if r != nil { 1058 | r.Close() 1059 | } 1060 | } 1061 | }() 1062 | 1063 | for i, file := range files { 1064 | segments[i] = b.NewSegment() 1065 | segments[i].SetNodeCallback(nodeCallb) 1066 | r := m.newFileReader(m.fileType) 1067 | datafile := filepath.Join(datadir, file) 1068 | if err := r.Open(datafile); err != nil { 1069 | return nil, err 1070 | } 1071 | 1072 | readers[i] = r 1073 | } 1074 | 1075 | for i := 0; i < concurr; i++ { 1076 | wg.Add(1) 1077 | go func(wg *sync.WaitGroup) { 1078 | defer wg.Done() 1079 | 1080 | for shard := range wchan { 1081 | r := readers[shard] 1082 | loop: 1083 | for { 1084 | itm, err := r.ReadItem() 1085 | if err != nil { 1086 | errors[shard] = err 1087 | return 1088 | } 1089 | 1090 | if itm == nil { 1091 | break loop 1092 | } 1093 | segments[shard].Add(unsafe.Pointer(itm)) 1094 | } 1095 | } 1096 | }(&wg) 1097 | } 1098 | 1099 | for i := range files { 1100 | wchan <- i 1101 | } 1102 | close(wchan) 1103 | wg.Wait() 1104 | 1105 | for _, err := range errors { 1106 | if err != nil { 1107 | return nil, err 1108 | } 1109 | } 1110 | 1111 | m.store = b.Assemble(segments...) 
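
	// Editorial note: at this point the base data files have been bulk-loaded
	// into per-shard skiplist segments and assembled into a fresh store,
	// bypassing the regular write path; any delta files are replayed next.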
1112 | 1113 | // Delta processing 1114 | if m.useDeltaFiles { 1115 | m.DeltaRestoreFailed = 0 1116 | m.DeltaRestored = 0 1117 | 1118 | wchan := make(chan int) 1119 | deltadir := filepath.Join(dir, "delta") 1120 | var files []string 1121 | if bs, err := ioutil.ReadFile(filepath.Join(deltadir, "files.json")); err == nil { 1122 | json.Unmarshal(bs, &files) 1123 | } 1124 | 1125 | readers := make([]FileReader, len(files)) 1126 | errors := make([]error, len(files)) 1127 | writers := make([]*Writer, concurr) 1128 | 1129 | defer func() { 1130 | for _, r := range readers { 1131 | if r != nil { 1132 | r.Close() 1133 | } 1134 | } 1135 | }() 1136 | 1137 | for i, file := range files { 1138 | r := m.newFileReader(m.fileType) 1139 | deltafile := filepath.Join(deltadir, file) 1140 | if err := r.Open(deltafile); err != nil { 1141 | return nil, err 1142 | } 1143 | 1144 | readers[i] = r 1145 | } 1146 | 1147 | for i := 0; i < concurr; i++ { 1148 | writers[i] = m.newWriter() 1149 | wg.Add(1) 1150 | go func(wg *sync.WaitGroup, id int) { 1151 | defer wg.Done() 1152 | 1153 | for shard := range wchan { 1154 | r := readers[shard] 1155 | loop: 1156 | for { 1157 | itm, err := r.ReadItem() 1158 | if err != nil { 1159 | errors[shard] = err 1160 | return 1161 | } 1162 | 1163 | if itm == nil { 1164 | break loop 1165 | } 1166 | 1167 | w := writers[id] 1168 | if n, success := w.store.Insert2(unsafe.Pointer(itm), 1169 | w.insCmp, w.existCmp, w.buf, w.rand.Float32, &w.slSts1); success { 1170 | 1171 | w.resSts.DeltaRestored++ 1172 | if nodeCallb != nil { 1173 | nodeCallb(n) 1174 | } 1175 | } else { 1176 | w.freeItem(itm) 1177 | w.resSts.DeltaRestoreFailed++ 1178 | } 1179 | } 1180 | } 1181 | 1182 | // Aggregate stats 1183 | w := writers[id] 1184 | m.store.Stats.Merge(&w.slSts1) 1185 | atomic.AddUint64(&m.restoreStats.DeltaRestored, w.resSts.DeltaRestored) 1186 | atomic.AddUint64(&m.restoreStats.DeltaRestoreFailed, w.resSts.DeltaRestoreFailed) 1187 | }(&wg, i) 1188 | } 1189 | 1190 | for i := range files { 1191 | wchan <- i 1192 | } 1193 | close(wchan) 1194 | wg.Wait() 1195 | 1196 | for _, err := range errors { 1197 | if err != nil { 1198 | return nil, err 1199 | } 1200 | } 1201 | } 1202 | 1203 | stats := m.store.GetStats() 1204 | m.itemsCount = int64(stats.NodeCount) 1205 | return m.NewSnapshot() 1206 | } 1207 | 1208 | // DumpStats returns Nitro statistics 1209 | func (m *Nitro) DumpStats() string { 1210 | return m.aggrStoreStats().String() 1211 | } 1212 | 1213 | func (m *Nitro) aggrStoreStats() skiplist.StatsReport { 1214 | sts := m.store.GetStats() 1215 | for w := m.wlist; w != nil; w = w.next { 1216 | sts.Apply(&w.slSts1) 1217 | sts.Apply(&w.slSts2) 1218 | sts.Apply(&w.slSts3) 1219 | } 1220 | 1221 | return sts 1222 | } 1223 | 1224 | // MemoryInUse returns total memory used by all Nitro instances in the current process 1225 | func MemoryInUse() (sz int64) { 1226 | buf := dbInstances.MakeBuf() 1227 | defer dbInstances.FreeBuf(buf) 1228 | iter := dbInstances.NewIterator(CompareNitro, buf) 1229 | for iter.SeekFirst(); iter.Valid(); iter.Next() { 1230 | db := (*Nitro)(iter.Get()) 1231 | sz += db.MemoryInUse() 1232 | } 1233 | 1234 | return 1235 | } 1236 | 1237 | // Debug enables debug mode 1238 | // Additional details will be logged in the statistics 1239 | func Debug(flag bool) { 1240 | skiplist.Debug = flag 1241 | mm.Debug = flag 1242 | } 1243 | --------------------------------------------------------------------------------