├── .travis.yml ├── LICENSE.txt ├── README.md ├── examples └── kv.go ├── file.go ├── go.mod ├── item.go ├── iterator.go ├── licenses ├── APL2.txt └── BSL-Couchbase.txt ├── mm ├── build.go ├── malloc.c ├── malloc.go ├── malloc.h ├── malloc_perf_test.go └── malloc_test.go ├── nitro.go ├── nitro_test.go ├── nodelist.go ├── nodelist_test.go ├── nodetable ├── table.go └── table_test.go └── skiplist ├── C ├── main.cc └── skiplist.hh ├── access_barrier.go ├── builder.go ├── item.go ├── iterator.go ├── merger.go ├── merger_test.go ├── node.go ├── node_alloc_amd64.go ├── node_amd64.go ├── skiplist.go ├── skiplist_test.go └── stats.go /.travis.yml: -------------------------------------------------------------------------------- 1 | # Copyright 2016-Present Couchbase, Inc. 2 | # 3 | # Use of this software is governed by the Business Source License included in 4 | # the file licenses/BSL-Couchbase.txt. As of the Change Date specified in that 5 | # file, in accordance with the Business Source License, use of this software 6 | # will be governed by the Apache License, Version 2.0, included in the file 7 | # licenses/APL2.txt. 8 | 9 | sudo: false 10 | language: go 11 | go: 12 | - 1.6 13 | 14 | before_install: 15 | - go get github.com/axw/gocov/gocov 16 | - go get github.com/mattn/goveralls 17 | - go get golang.org/x/tools/cmd/cover 18 | 19 | script: 20 | - go get ./... 21 | - go test -v ./... 22 | - $HOME/gopath/bin/goveralls -service=travis-ci 23 | 24 | notifications: 25 | email: false 26 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Source code in this repository is licensed under various licenses. The 2 | Business Source License 1.1 (BSL) is one such license. Each file indicates in 3 | a section at the beginning of the file the name of the license that applies to 4 | it. All licenses used in this repository can be found in the top-level 5 | licenses directory. 6 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # nitro 2 | 3 | A high-performance in-memory key-value item storage engine written in Go. 4 | The storage engine is based on lock-free data structures and scales well with 5 | multicore CPUs. 6 | 7 | [![Build Status](https://travis-ci.org/couchbase/nitro.svg?branch=master)](https://travis-ci.org/couchbase/nitro) 8 | [![Go Report Card](https://goreportcard.com/badge/github.com/couchbase/nitro)](https://goreportcard.com/report/github.com/couchbase/nitro) 9 | [![GoDoc](https://godoc.org/github.com/couchbase/nitro?status.svg)](https://godoc.org/github.com/couchbase/nitro) 10 | 11 | 12 | ### Features 13 | 14 | - Operations: insert, delete, iterator (lookup, range queries) 15 | - Supports multiple concurrent readers and writers, which scale almost linearly 16 | - Database snapshots, which facilitate stable and repeatable scans 17 | - Lock-free data structures ensure that concurrent readers and writers don't 18 | block each other 19 | - The per-item metadata overhead is 64 bytes 20 | - Fast snapshotting: snapshots have minimal overhead and can be created frequently (e.g.,
every 10ms) 21 | - Optional memory manager based on jemalloc to avoid the Go garbage collector 22 | for higher performance 23 | - Custom key comparator 24 | - Fast backup and restore on disk 25 | 26 | ### Example usage 27 | 28 | // Create a nitro instance with default config 29 | db := nitro.New() 30 | defer db.Close() 31 | 32 | // Create a writer 33 | // A writer should be created for every concurrent thread 34 | w := db.NewWriter() 35 | for i := 0; i < 100; i++ { 36 | itm := []byte(fmt.Sprintf("item-%02d", i)) 37 | w.Put(itm) 38 | } 39 | 40 | // Create an immutable snapshot 41 | snap1, _ := db.NewSnapshot() 42 | 43 | for i := 0; i < 100; i++ { 44 | if i%2 == 0 { 45 | itm := []byte(fmt.Sprintf("item-%02d", i)) 46 | w.Delete(itm) 47 | } 48 | } 49 | 50 | // Create an immutable snapshot 51 | snap2, _ := db.NewSnapshot() 52 | 53 | // Create an iterator for a snapshot 54 | it1 := snap1.NewIterator() 55 | count1 := 0 56 | for it1.SeekFirst(); it1.Valid(); it1.Next() { 57 | fmt.Println("snap-1", string(it1.Get())) 58 | count1++ 59 | } 60 | 61 | // Close snapshot and iterator once you have finished using them 62 | it1.Close() 63 | snap1.Close() 64 | 65 | // Create an iterator for a snapshot 66 | it2 := snap2.NewIterator() 67 | count2 := 0 68 | for it2.SeekFirst(); it2.Valid(); it2.Next() { 69 | fmt.Println("snap-2", string(it2.Get())) 70 | count2++ 71 | } 72 | 73 | // Close snapshot and iterator once you have finished using them 74 | it2.Close() 75 | snap2.Close() 76 | 77 | fmt.Println(count2 == count1/2) 78 | 79 | 80 | ### License 81 | 82 | Apache 2.0 83 | -------------------------------------------------------------------------------- /examples/kv.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2016-Present Couchbase, Inc. 3 | 4 | Use of this software is governed by the Business Source License included in 5 | the file licenses/BSL-Couchbase.txt. As of the Change Date specified in that 6 | file, in accordance with the Business Source License, use of this software will 7 | be governed by the Apache License, Version 2.0, included in the file 8 | licenses/APL2.txt. 9 | */ 10 | 11 | package main 12 | 13 | import ( 14 | "fmt" 15 | "github.com/couchbase/nitro" 16 | ) 17 | 18 | func main() { 19 | cfg := nitro.DefaultConfig() 20 | cfg.SetKeyComparator(nitro.CompareKV) 21 | 22 | db := nitro.NewWithConfig(cfg) 23 | defer db.Close() 24 | 25 | w := db.NewWriter() 26 | 27 | w.Put(nitro.KVToBytes([]byte("key1"), []byte("value1"))) 28 | w.Put(nitro.KVToBytes([]byte("key2"), []byte("value2"))) 29 | snap1, _ := db.NewSnapshot() 30 | w.Delete(nitro.KVToBytes([]byte("key1"), nil)) 31 | w.Put(nitro.KVToBytes([]byte("key1"), []byte("value1-new"))) 32 | snap2, _ := db.NewSnapshot() 33 | 34 | fmt.Println("snapshot 1") 35 | itr := snap1.NewIterator() 36 | snap1.Close() 37 | for itr.SeekFirst(); itr.Valid(); itr.Next() { 38 | k, v := nitro.KVFromBytes(itr.Get()) 39 | fmt.Printf("%s = %s\n", k, v) 40 | } 41 | itr.Close() 42 | 43 | fmt.Println("snapshot 2") 44 | itr = snap2.NewIterator() 45 | snap2.Close() 46 | for itr.SeekFirst(); itr.Valid(); itr.Next() { 47 | k, v := nitro.KVFromBytes(itr.Get()) 48 | fmt.Printf("%s = %s\n", k, v) 49 | } 50 | itr.Close() 51 | } 52 | -------------------------------------------------------------------------------- /file.go: -------------------------------------------------------------------------------- 1 | // Copyright 2016-Present Couchbase, Inc.
2 | // 3 | // Use of this software is governed by the Business Source License included in 4 | // the file licenses/BSL-Couchbase.txt. As of the Change Date specified in that 5 | // file, in accordance with the Business Source License, use of this software 6 | // will be governed by the Apache License, Version 2.0, included in the file 7 | // licenses/APL2.txt. 8 | 9 | package nitro 10 | 11 | import "os" 12 | import "bufio" 13 | import "errors" 14 | 15 | var ( 16 | // DiskBlockSize - I/O buffer size used by the backup file reader and writer 17 | DiskBlockSize = 512 * 1024 18 | errNotEnoughSpace = errors.New("Not enough space in the buffer") 19 | ) 20 | 21 | // FileType describes the backup file format 22 | type FileType int 23 | 24 | const ( 25 | encodeBufSize = 4 26 | readerBufSize = 10000 27 | // RawdbFile - backup file storage format 28 | RawdbFile FileType = iota 29 | ) 30 | 31 | // FileWriter represents backup file writer 32 | type FileWriter interface { 33 | Open(path string) error 34 | WriteItem(*Item) error 35 | Checksum() uint32 36 | Close() error 37 | } 38 | 39 | // FileReader represents backup file reader 40 | type FileReader interface { 41 | Open(path string) error 42 | ReadItem() (*Item, error) 43 | Checksum() uint32 44 | Close() error 45 | } 46 | 47 | func (m *Nitro) newFileWriter(t FileType) FileWriter { 48 | var w FileWriter 49 | if t == RawdbFile { 50 | w = &rawFileWriter{db: m} 51 | } 52 | return w 53 | } 54 | 55 | func (m *Nitro) newFileReader(t FileType, ver int) FileReader { 56 | var r FileReader 57 | if t == RawdbFile { 58 | r = &rawFileReader{db: m, version: ver} 59 | } 60 | return r 61 | } 62 | 63 | type rawFileWriter struct { 64 | db *Nitro 65 | fd *os.File 66 | w *bufio.Writer 67 | buf []byte 68 | path string 69 | checksum uint32 70 | } 71 | 72 | func (f *rawFileWriter) Open(path string) error { 73 | var err error 74 | f.fd, err = os.OpenFile(path, os.O_WRONLY|os.O_CREATE, 0755) 75 | if err == nil { 76 | f.buf = make([]byte, encodeBufSize) 77 | f.w = bufio.NewWriterSize(f.fd, DiskBlockSize) 78 | } 79 | return err 80 | } 81 | 82 | func (f *rawFileWriter) WriteItem(itm *Item) error { 83 | checksum, err := f.db.EncodeItem(itm, f.buf, f.w) 84 | f.checksum = f.checksum ^ checksum 85 | return err 86 | } 87 | 88 | func (f *rawFileWriter) Checksum() uint32 { 89 | return f.checksum 90 | } 91 | 92 | func (f *rawFileWriter) Close() error { 93 | terminator := &Item{} 94 | 95 | if err := f.WriteItem(terminator); err != nil { 96 | return err 97 | } 98 | 99 | f.w.Flush() 100 | return f.fd.Close() 101 | } 102 | 103 | type rawFileReader struct { 104 | version int 105 | db *Nitro 106 | fd *os.File 107 | r *bufio.Reader 108 | buf []byte 109 | path string 110 | checksum uint32 111 | } 112 | 113 | func (f *rawFileReader) Open(path string) error { 114 | var err error 115 | f.fd, err = os.Open(path) 116 | if err == nil { 117 | f.buf = make([]byte, encodeBufSize) 118 | f.r = bufio.NewReaderSize(f.fd, DiskBlockSize) 119 | } 120 | return err 121 | } 122 | 123 | func (f *rawFileReader) ReadItem() (*Item, error) { 124 | itm, checksum, err := f.db.DecodeItem(f.version, f.buf, f.r) 125 | if itm != nil { // Checksum excludes terminal nil item 126 | f.checksum = f.checksum ^ checksum 127 | } 128 | return itm, err 129 | } 130 | 131 | func (f *rawFileReader) Checksum() uint32 { 132 | return f.checksum 133 | } 134 | 135 | func (f *rawFileReader) Close() error { 136 | return f.fd.Close() 137 | } 138 | -------------------------------------------------------------------------------- /go.mod:
-------------------------------------------------------------------------------- 1 | module github.com/couchbase/nitro 2 | 3 | go 1.18 4 | -------------------------------------------------------------------------------- /item.go: -------------------------------------------------------------------------------- 1 | // Copyright 2016-Present Couchbase, Inc. 2 | // 3 | // Use of this software is governed by the Business Source License included in 4 | // the file licenses/BSL-Couchbase.txt. As of the Change Date specified in that 5 | // file, in accordance with the Business Source License, use of this software 6 | // will be governed by the Apache License, Version 2.0, included in the file 7 | // licenses/APL2.txt. 8 | 9 | package nitro 10 | 11 | import ( 12 | "bytes" 13 | "encoding/binary" 14 | "hash/crc32" 15 | "io" 16 | "reflect" 17 | "unsafe" 18 | ) 19 | 20 | var itemHeaderSize = unsafe.Sizeof(Item{}) 21 | 22 | // Item represents the nitro item header. 23 | // The item data immediately follows the header. 24 | // Item data is a block of bytes. The user can store a key and value into a 25 | // block of bytes and provide a custom key comparator. 26 | type Item struct { 27 | bornSn uint32 28 | deadSn uint32 29 | dataLen uint32 30 | } 31 | 32 | func (m *Nitro) newItem(data []byte, useMM bool) (itm *Item) { 33 | l := len(data) 34 | itm = m.allocItem(l, useMM) 35 | copy(itm.Bytes(), data) 36 | return itm 37 | } 38 | 39 | func (m *Nitro) freeItem(itm *Item) { 40 | if m.useMemoryMgmt { 41 | m.freeFun(unsafe.Pointer(itm)) 42 | } 43 | } 44 | 45 | func (m *Nitro) allocItem(l int, useMM bool) (itm *Item) { 46 | blockSize := itemHeaderSize + uintptr(l) 47 | if useMM { 48 | itm = (*Item)(m.mallocFun(int(blockSize))) 49 | itm.deadSn = 0 50 | itm.bornSn = 0 51 | } else { 52 | block := make([]byte, blockSize) 53 | itm = (*Item)(unsafe.Pointer(&block[0])) 54 | } 55 | 56 | itm.dataLen = uint32(l) 57 | return 58 | } 59 | 60 | // EncodeItem encodes in [4 byte len][item_bytes] format. 61 | func (m *Nitro) EncodeItem(itm *Item, buf []byte, w io.Writer) ( 62 | checksum uint32, err error) { 63 | l := 4 64 | if len(buf) < l { 65 | return checksum, errNotEnoughSpace 66 | } 67 | 68 | binary.BigEndian.PutUint32(buf[0:4], uint32(itm.dataLen)) 69 | if _, err = w.Write(buf[0:4]); err != nil { 70 | return 71 | } 72 | checksum = crc32.ChecksumIEEE(buf[0:4]) 73 | itmBytes := itm.Bytes() 74 | if _, err = w.Write(itmBytes); err != nil { 75 | return 76 | } 77 | checksum = checksum ^ crc32.ChecksumIEEE(itmBytes) 78 | 79 | return 80 | } 81 | 82 | // DecodeItem decodes an encoded item. 83 | // v0: [2 byte len][item_bytes] format. 84 | // v1: [4 byte len][item_bytes] format.
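// Illustrative sketch (an assumption-free reading of EncodeItem above, but the
// worked example itself is not part of the original file): under the v1 format,
// a 5-byte item "hello" is written as nine bytes,
//
//	00 00 00 05 68 65 6c 6c 6f
//
// i.e. a big-endian 4-byte length prefix followed by the raw item bytes. The
// running checksum is crc32(length prefix) XOR crc32(item bytes), and a
// zero-length item marks the end of the stream (see rawFileWriter.Close).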
85 | func (m *Nitro) DecodeItem(ver int, buf []byte, r io.Reader) (*Item, uint32, error) { 86 | var l int 87 | var checksum uint32 88 | 89 | if ver == 0 { 90 | if _, err := io.ReadFull(r, buf[0:2]); err != nil { 91 | return nil, checksum, err 92 | } 93 | l = int(binary.BigEndian.Uint16(buf[0:2])) 94 | checksum = crc32.ChecksumIEEE(buf[0:2]) 95 | } else { 96 | if _, err := io.ReadFull(r, buf[0:4]); err != nil { 97 | return nil, checksum, err 98 | } 99 | l = int(binary.BigEndian.Uint32(buf[0:4])) 100 | checksum = crc32.ChecksumIEEE(buf[0:4]) 101 | } 102 | 103 | if l > 0 { 104 | itm := m.allocItem(l, m.useMemoryMgmt) 105 | data := itm.Bytes() 106 | _, err := io.ReadFull(r, data) 107 | if err == nil { 108 | checksum = checksum ^ crc32.ChecksumIEEE(data) 109 | } 110 | return itm, checksum, err 111 | } 112 | 113 | return nil, checksum, nil 114 | } 115 | 116 | // Bytes returns the item data bytes 117 | func (itm *Item) Bytes() (bs []byte) { 118 | l := itm.dataLen 119 | dataOffset := uintptr(unsafe.Pointer(itm)) + itemHeaderSize 120 | 121 | hdr := (*reflect.SliceHeader)(unsafe.Pointer(&bs)) 122 | hdr.Data = dataOffset 123 | hdr.Len = int(l) 124 | hdr.Cap = hdr.Len 125 | return 126 | } 127 | 128 | // ItemSize returns the total bytes consumed by an item representation 129 | func ItemSize(p unsafe.Pointer) int { 130 | itm := (*Item)(p) 131 | return int(itemHeaderSize + uintptr(itm.dataLen)) 132 | } 133 | 134 | // KVToBytes encodes a key-value pair to item bytes which can be passed 135 | // to the Put() and Delete() methods. 136 | func KVToBytes(k, v []byte) []byte { 137 | klen := len(k) 138 | buf := make([]byte, 2, len(k)+len(v)+2) 139 | binary.LittleEndian.PutUint16(buf[0:2], uint16(klen)) 140 | buf = append(buf, k...) 141 | buf = append(buf, v...) 142 | 143 | return buf 144 | } 145 | 146 | // KVFromBytes extracts the key-value pair from item bytes returned by an iterator 147 | func KVFromBytes(bs []byte) (k, v []byte) { 148 | klen := int(binary.LittleEndian.Uint16(bs[0:2])) 149 | return bs[2 : 2+klen], bs[2+klen:] 150 | } 151 | 152 | // CompareKV is a comparator for KV items 153 | func CompareKV(a []byte, b []byte) int { 154 | la := int(binary.LittleEndian.Uint16(a[0:2])) 155 | lb := int(binary.LittleEndian.Uint16(b[0:2])) 156 | 157 | return bytes.Compare(a[2:2+la], b[2:2+lb]) 158 | } 159 | -------------------------------------------------------------------------------- /iterator.go: -------------------------------------------------------------------------------- 1 | // Copyright 2016-Present Couchbase, Inc. 2 | // 3 | // Use of this software is governed by the Business Source License included in 4 | // the file licenses/BSL-Couchbase.txt. As of the Change Date specified in that 5 | // file, in accordance with the Business Source License, use of this software 6 | // will be governed by the Apache License, Version 2.0, included in the file 7 | // licenses/APL2.txt.
8 | 9 | package nitro 10 | 11 | import ( 12 | "github.com/couchbase/nitro/skiplist" 13 | "unsafe" 14 | ) 15 | 16 | // Iterator implements the Nitro snapshot iterator 17 | type Iterator struct { 18 | count int 19 | refreshRate int 20 | 21 | snap *Snapshot 22 | iter *skiplist.Iterator 23 | buf *skiplist.ActionBuffer 24 | } 25 | 26 | func (it *Iterator) skipUnwanted() { 27 | loop: 28 | if !it.iter.Valid() { 29 | return 30 | } 31 | itm := (*Item)(it.iter.Get()) 32 | if itm.bornSn > it.snap.sn || (itm.deadSn > 0 && itm.deadSn <= it.snap.sn) { 33 | it.iter.Next() 34 | it.count++ 35 | goto loop 36 | } 37 | } 38 | 39 | // SeekFirst moves the cursor to the beginning 40 | func (it *Iterator) SeekFirst() { 41 | it.iter.SeekFirst() 42 | it.skipUnwanted() 43 | } 44 | 45 | // Seek moves the cursor to a specified key, or to the next bigger key if an 46 | // item with the given key does not exist. 47 | func (it *Iterator) Seek(bs []byte) { 48 | itm := it.snap.db.newItem(bs, false) 49 | it.iter.Seek(unsafe.Pointer(itm)) 50 | it.skipUnwanted() 51 | } 52 | 53 | // Valid returns false when the iterator has reached the end. 54 | func (it *Iterator) Valid() bool { 55 | return it.iter.Valid() 56 | } 57 | 58 | // Get returns the current item data from the iterator. 59 | func (it *Iterator) Get() []byte { 60 | return (*Item)(it.iter.Get()).Bytes() 61 | } 62 | 63 | // GetNode returns the skiplist node which holds the current item. 64 | func (it *Iterator) GetNode() *skiplist.Node { 65 | return it.iter.GetNode() 66 | } 67 | 68 | // Next moves the iterator cursor to the next item 69 | func (it *Iterator) Next() { 70 | it.iter.Next() 71 | it.count++ 72 | it.skipUnwanted() 73 | if it.refreshRate > 0 && it.count > it.refreshRate { 74 | it.Refresh() 75 | it.count = 0 76 | } 77 | } 78 | 79 | // Refresh is a helper API to refresh the SMR accessor token manually. 80 | // This enables SMR to reclaim objects faster if an iterator stays 81 | // alive for a long duration. 82 | func (it *Iterator) Refresh() { 83 | if it.Valid() { 84 | itm := it.snap.db.ptrToItem(it.GetNode().Item()) 85 | it.iter.Close() 86 | it.iter = it.snap.db.store.NewIterator(it.snap.db.iterCmp, it.buf) 87 | it.iter.Seek(unsafe.Pointer(itm)) 88 | } 89 | } 90 | 91 | // SetRefreshRate sets the automatic refresh frequency. By default, there is 92 | // no automatic refresh. If this is set, the iterator SMR accessor will be 93 | // refreshed after every `rate` items. 94 | func (it *Iterator) SetRefreshRate(rate int) { 95 | it.refreshRate = rate 96 | } 97 | 98 | // Close executes the destructor for the iterator 99 | func (it *Iterator) Close() { 100 | it.snap.Close() 101 | it.snap.db.store.FreeBuf(it.buf) 102 | it.iter.Close() 103 | } 104 | 105 | // NewIterator creates an iterator for a Nitro snapshot 106 | func (m *Nitro) NewIterator(snap *Snapshot) *Iterator { 107 | if !snap.Open() { 108 | return nil 109 | } 110 | buf := snap.db.store.MakeBuf() 111 | return &Iterator{ 112 | snap: snap, 113 | iter: m.store.NewIterator(m.iterCmp, buf), 114 | buf: buf, 115 | } 116 | } 117 | -------------------------------------------------------------------------------- /licenses/APL2.txt: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document.
11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 
134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | -------------------------------------------------------------------------------- /licenses/BSL-Couchbase.txt: -------------------------------------------------------------------------------- 1 | COUCHBASE BUSINESS SOURCE LICENSE AGREEMENT 2 | 3 | Business Source License 1.1 4 | Licensor: Couchbase, Inc. 5 | Licensed Work: Couchbase Server Version 8.0 6 | The Licensed Work is © 2021-Present Couchbase, Inc. 
7 | 8 | Additional Use Grant: You may make production use of the Licensed Work, provided 9 | you comply with the following conditions: 10 | 11 | (i) You may not prepare a derivative work based upon the Licensed Work and 12 | distribute or otherwise offer such derivative work, whether on a standalone 13 | basis or in combination with other products, applications, or services 14 | (including in any "as-a-service" offering, such as, by way of example, a 15 | software-as-a-service, database-as-a-service, or infrastructure-as-a-service 16 | offering, or any other offering based on a cloud computing or other type of 17 | hosted distribution model (collectively, "Hosted Offerings")), for a fee or 18 | otherwise on a commercial or other for-profit basis. 19 | 20 | (ii) You may not link the Licensed Work to, or otherwise include the Licensed 21 | Work in or with, any product, application, or service (including in any Hosted 22 | Offering) that is distributed or otherwise offered, whether on a standalone 23 | basis or in combination with other products, applications, or services for a fee 24 | or otherwise on a commercial or other for-profit basis. Condition (ii) shall not 25 | limit the generality of condition (i) above. 26 | 27 | 28 | Change Date: March 1, 2029 29 | 30 | Change License: Apache License, Version 2.0 31 | 32 | 33 | Notice 34 | 35 | The Business Source License (this document, or the "License") is not an Open 36 | Source license. However, the Licensed Work will eventually be made available 37 | under an Open Source License, as stated in this License. License text copyright 38 | © 2017 MariaDB Corporation Ab, All Rights Reserved. "Business Source License" is 39 | a trademark of MariaDB Corporation Ab. 40 | 41 | Terms 42 | 43 | The Licensor hereby grants You the right to copy, modify, create derivative 44 | works, redistribute, and make non-production use of the Licensed Work. The 45 | Licensor may make an Additional Use Grant, above, permitting limited production 46 | use. 47 | 48 | Effective on the Change Date, or the fourth anniversary of the first publicly 49 | available distribution of a specific version of the Licensed Work under this 50 | License, whichever comes first, the Licensor hereby grants you rights under the 51 | terms of the Change License, and the rights granted in the paragraph above 52 | terminate. 53 | 54 | If your use of the Licensed Work does not comply with the requirements currently 55 | in effect as described in this License, you must purchase a commercial license 56 | from the Licensor, its affiliated entities, or authorized resellers, or you must 57 | refrain from using the Licensed Work. 58 | 59 | All copies of the original and modified Licensed Work, and derivative works of 60 | the Licensed Work, are subject to this License. This License applies separately 61 | for each version of the Licensed Work and the Change Date may vary for each 62 | version of the Licensed Work released by Licensor. 63 | 64 | You must conspicuously display this License on each original or modified copy of 65 | the Licensed Work. If you receive the Licensed Work in original or modified form 66 | from a third party, the terms and conditions set forth in this License apply to 67 | your use of that work. 68 | 69 | Any use of the Licensed Work in violation of this License will automatically 70 | terminate your rights under this License for the current and all other versions 71 | of the Licensed Work. 
72 | 73 | This License does not grant you any right in any trademark or logo of Licensor 74 | or its affiliates (provided that you may use a trademark or logo of Licensor as 75 | expressly required by this License). 76 | 77 | TO THE EXTENT PERMITTED BY APPLICABLE LAW, THE LICENSED WORK IS PROVIDED ON AN 78 | "AS IS" BASIS. LICENSOR HEREBY DISCLAIMS ALL WARRANTIES AND CONDITIONS, EXPRESS 79 | OR IMPLIED, INCLUDING (WITHOUT LIMITATION) WARRANTIES OF MERCHANTABILITY, 80 | FITNESS FOR A PARTICULAR PURPOSE, NON-INFRINGEMENT, AND TITLE. 81 | 82 | MariaDB hereby grants you permission to use this License's text to license your 83 | works, and to refer to it using the trademark "Business Source License", as long 84 | as you comply with the Covenants of Licensor below. 85 | 86 | Covenants of Licensor 87 | 88 | In consideration of the right to use this License's text and the "Business 89 | Source License" name and trademark, Licensor covenants to MariaDB, and to all 90 | other recipients of the licensed work to be provided by Licensor: 91 | 92 | 1. To specify as the Change License the GPL Version 2.0 or any later version, or 93 | a license that is compatible with GPL Version 2.0 or a later version, where 94 | "compatible" means that software provided under the Change License can be 95 | included in a program with software provided under GPL Version 2.0 or a later 96 | version. Licensor may specify additional Change Licenses without limitation. 97 | 98 | 2. To either: (a) specify an additional grant of rights to use that does not 99 | impose any additional restriction on the right granted in this License, as the 100 | Additional Use Grant; or (b) insert the text "None". 101 | 102 | 3. To specify a Change Date. 103 | 104 | 4. Not to modify this License in any other way. 105 | -------------------------------------------------------------------------------- /mm/build.go: -------------------------------------------------------------------------------- 1 | // +build jemalloc 2 | 3 | // Copyright 2016-Present Couchbase, Inc. 4 | // 5 | // Use of this software is governed by the Business Source License included in 6 | // the file licenses/BSL-Couchbase.txt. As of the Change Date specified in that 7 | // file, in accordance with the Business Source License, use of this software 8 | // will be governed by the Apache License, Version 2.0, included in the file 9 | // licenses/APL2.txt. 10 | 11 | package mm 12 | 13 | // #cgo CFLAGS: -DJEMALLOC=1 14 | // #cgo LDFLAGS: -ljemalloc 15 | import "C" 16 | -------------------------------------------------------------------------------- /mm/malloc.c: -------------------------------------------------------------------------------- 1 | // Copyright 2016-Present Couchbase, Inc. 2 | // 3 | // Use of this software is governed by the Business Source License included in 4 | // the file licenses/BSL-Couchbase.txt. As of the Change Date specified in that 5 | // file, in accordance with the Business Source License, use of this software 6 | // will be governed by the Apache License, Version 2.0, included in the file 7 | // licenses/APL2.txt. 8 | #include "malloc.h" 9 | #include <errno.h> 10 | #include <stdio.h> 11 | #include <string.h> 12 | 13 | #ifdef JEMALLOC 14 | #include <jemalloc/jemalloc.h> 15 | 16 | // a) do not disable tcache; the indexer may run into thread limit exhaustion 17 | // b) Windows does not honor the config string; see MB-63068 18 | const char* je_malloc_conf = "narenas:2" 19 | 20 | // Enable profiling, but keep it deactivated. Profiling is supported only on 21 | // Linux.
22 | #ifdef __linux__ 23 | ",prof:true,prof_active:false" 24 | #endif 25 | ; 26 | 27 | // number of user arenas 28 | #define MAX_USER_ARENAS 2U 29 | static unsigned int user_arenas[MAX_USER_ARENAS]; 30 | static unsigned int user_arenas_init; 31 | 32 | #if defined(__linux__) || defined(__APPLE__) 33 | # include <stdatomic.h> // C11 34 | 35 | static atomic_uint counter = 0; 36 | static __thread unsigned int tsd = 0; 37 | 38 | // key is ignored for thread-based assignment 39 | static inline unsigned int assign_arena(unsigned short key) { 40 | if (tsd == 0) { 41 | unsigned int x = 1; 42 | tsd = (unsigned int)(atomic_fetch_add(&counter, x)) + 1; 43 | } 44 | return user_arenas[tsd % MAX_USER_ARENAS]; 45 | } 46 | #else 47 | // on Windows the C11 thread-local specifier has portability issues 48 | // https://github.com/golang/go/issues/20982 49 | static inline unsigned int assign_arena(unsigned short key) { 50 | return user_arenas[key % MAX_USER_ARENAS]; 51 | } 52 | #endif 53 | 54 | // not thread safe 55 | static void reset_user_arena_info() { 56 | #ifdef JEMALLOC 57 | for (unsigned int i = 0; i < MAX_USER_ARENAS; i++) { 58 | if (user_arenas[i] > 0) { 59 | user_arenas[i] = 0; 60 | } 61 | } 62 | user_arenas_init = 0; 63 | #endif 64 | } 65 | 66 | static int is_auto_arena(unsigned int arena) { 67 | #ifdef JEMALLOC 68 | if (user_arenas_init > 0) { 69 | for (unsigned int i = 0; i < MAX_USER_ARENAS; i++) { 70 | if ((user_arenas[i] > 0) && (user_arenas[i] == arena)) { 71 | return 0; 72 | } 73 | } 74 | } 75 | return 1; 76 | #else 77 | return 1; 78 | #endif 79 | } 80 | 81 | // writecb is a callback passed to jemalloc to process a chunk of 82 | // stats text. It is in charge of making sure that the buffer is 83 | // sufficiently sized. 84 | void writecb(void* ref, const char* s) { 85 | stats_buf* buf = (stats_buf*)(ref); 86 | int len; 87 | len = strlen(s); 88 | if (buf->offset + len >= buf->size) { 89 | // Buffer is too small, resize it to fit at least len and string 90 | // terminator 91 | buf->size += len + 2; 92 | buf->buf = realloc(buf->buf, buf->size); 93 | } 94 | strncpy(buf->buf + buf->offset, s, len); 95 | buf->offset += len; 96 | } 97 | 98 | // doStats returns a string with jemalloc stats. 99 | // The caller is responsible for calling free on the string buffer.
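// Illustrative usage sketch (not part of the original file):
//
//   char *s = doStats(NULL);   // human-readable report; doStats("J") yields JSON
//   fputs(s, stderr);
//   free(s);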
100 | char *doStats(char *opts) { 101 | stats_buf buf; 102 | buf.size = 1024; 103 | buf.buf = malloc(buf.size); 104 | buf.offset = 0; 105 | je_malloc_stats_print(writecb, &buf, opts); 106 | buf.buf[buf.offset] = 0; 107 | return buf.buf; 108 | } 109 | 110 | #endif 111 | 112 | static int mm_create_arena(unsigned int* arena) { 113 | #ifdef JEMALLOC 114 | if (arena == NULL) { 115 | return -EINVAL; 116 | } 117 | size_t sz = sizeof(unsigned); 118 | return je_mallctl("arenas.create", (void*)arena, &sz, NULL, 0); 119 | #else 120 | return -ENOTSUP; 121 | #endif 122 | } 123 | 124 | // not thread safe 125 | int mm_create_arenas() { 126 | #ifdef JEMALLOC 127 | if (user_arenas_init > 0) { 128 | return -1; 129 | } 130 | 131 | for (unsigned int i = 0; i < MAX_USER_ARENAS; i++) { 132 | int ret = mm_create_arena(&user_arenas[i]); 133 | if (ret != 0) { 134 | reset_user_arena_info(); 135 | return ret; 136 | } 137 | } 138 | user_arenas_init = 1; 139 | return 0; 140 | #else 141 | return -ENOTSUP; 142 | #endif 143 | } 144 | 145 | // count may remain same even after arena destroy 146 | unsigned int mm_narenas() { 147 | #ifdef JEMALLOC 148 | unsigned int narenas = 0; 149 | size_t sz = sizeof(unsigned); 150 | int ret = je_mallctl("arenas.narenas", &narenas, &sz, NULL, 0); 151 | if (ret == 0) { 152 | return narenas; 153 | } 154 | return 0; 155 | #else 156 | return 0; 157 | #endif 158 | } 159 | 160 | unsigned int mm_user_narenas() { 161 | #ifdef JEMALLOC 162 | if (user_arenas_init == 0) { 163 | return 0; 164 | } 165 | 166 | return MAX_USER_ARENAS; 167 | #else 168 | return 0; 169 | #endif 170 | } 171 | 172 | // mm_arenas_nbins returns the stat nbins which is the 173 | // number of bin size classes. 174 | unsigned int mm_arenas_nbins() { 175 | #ifdef JEMALLOC 176 | unsigned int nbins = 0; 177 | size_t sz = sizeof(unsigned int); 178 | je_mallctl("arenas.nbins", &nbins, &sz, NULL, 0); 179 | 180 | return nbins; 181 | #else 182 | return 0; 183 | #endif 184 | } 185 | 186 | // Should be used only when value is expected to be a size_t. 187 | size_t mm_arenas_i_stat(unsigned int i, const char* stat) { 188 | #ifdef JEMALLOC 189 | if (stat == NULL) { 190 | return 0; 191 | } 192 | size_t stat_val = 0; 193 | size_t sz = sizeof(size_t); 194 | char ctl[128]; 195 | snprintf(ctl, 128, "stats.arenas.%u.%s", i, stat); 196 | je_mallctl(ctl, &stat_val, &sz, NULL, 0); 197 | 198 | return stat_val; 199 | #else 200 | return 0; 201 | #endif 202 | } 203 | 204 | // mm_arenas_bin_i_stat returns the value of the stat `stat` which is 205 | // something valid which can be used in arenas.bin... 206 | // Should be used only when value is expected to be a size_t. 207 | size_t mm_arenas_bin_i_stat(unsigned int i, const char *stat) { 208 | #ifdef JEMALLOC 209 | if (stat == NULL) { 210 | return 0; 211 | } 212 | size_t stat_val = 0; 213 | size_t sz = sizeof(size_t); 214 | char ctl[128]; 215 | snprintf(ctl, 128, "arenas.bin.%d.%s", i, stat); 216 | je_mallctl(ctl, &stat_val, &sz, NULL, 0); 217 | 218 | return stat_val; 219 | #else 220 | return 0; 221 | #endif 222 | } 223 | 224 | // mm_stats_arenas_merged_bins_j_stat returns the value of the stat `stat` merged across arenas 225 | // The `stat` should be something valid which can be used in stats.arenas..bins... 226 | // Should be used only when value is expected to be a size_t. 
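// Illustrative example (not part of the original file): calling
// mm_stats_arenas_merged_bins_j_stat(3, "curregs") reads the mallctl
// "stats.arenas.<MALLCTL_ARENAS_ALL>.bins.3.curregs", i.e. the number of
// currently used regions in size-class bin 3, merged across all arenas.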
227 | size_t mm_stats_arenas_merged_bins_j_stat(unsigned int j, const char *stat) { 228 | #ifdef JEMALLOC 229 | if (stat == NULL) { 230 | return 0; 231 | } 232 | size_t stat_val = 0; 233 | size_t sz = sizeof(size_t); 234 | char ctl[128]; 235 | snprintf(ctl, 128, "stats.arenas.%d.bins.%d.%s", MALLCTL_ARENAS_ALL, j, stat); 236 | je_mallctl(ctl, &stat_val, &sz, NULL, 0); 237 | 238 | return stat_val; 239 | #else 240 | return 0; 241 | #endif 242 | } 243 | 244 | void* mm_malloc(size_t sz) { 245 | #ifdef JEMALLOC 246 | return je_calloc(1, sz); 247 | #else 248 | return calloc(1, sz); 249 | #endif 250 | } 251 | 252 | void* mm_malloc_user_arena(size_t sz, unsigned short key) { 253 | #ifdef JEMALLOC 254 | unsigned int arena = assign_arena(key); 255 | return je_mallocx(sz, MALLOCX_ARENA(arena) | MALLOCX_ZERO); 256 | #else 257 | return calloc(1, sz); 258 | #endif 259 | } 260 | 261 | // jemalloc uses radix tree to identify the associated extent 262 | // a) free first releases to tcache bins 263 | // b) on full (CACHE_BIN_NCACHED_MAX) or GC, entries are flushed 264 | // to the respective extents for use by other threads 265 | // c) tcache can have pointers from extents of arenas other than one associated 266 | void mm_free(void *p) { 267 | #ifdef JEMALLOC 268 | je_free(p); 269 | #else 270 | free(p); 271 | #endif 272 | } 273 | 274 | char *mm_stats() { 275 | #ifdef JEMALLOC 276 | return doStats(NULL); 277 | #else 278 | return NULL; 279 | #endif 280 | } 281 | 282 | char *mm_stats_json() { 283 | #ifdef JEMALLOC 284 | return doStats("J"); 285 | #else 286 | return NULL; 287 | #endif 288 | } 289 | 290 | size_t mm_sizeat(void *p) { 291 | #ifdef JEMALLOC 292 | return je_sallocx(p, 0); 293 | #else 294 | return 0; 295 | #endif 296 | } 297 | 298 | // merged stat 299 | size_t mm_size() { 300 | size_t resident, sz; 301 | sz = sizeof(size_t); 302 | #ifdef JEMALLOC 303 | // Force stats cache flush 304 | uint64_t epoch = 1; 305 | sz = sizeof(epoch); 306 | je_mallctl("epoch", &epoch, &sz, &epoch, sz); 307 | 308 | je_mallctl("stats.resident", &resident, &sz, NULL, 0); 309 | return resident; 310 | #else 311 | return 0; 312 | #endif 313 | } 314 | 315 | // dedicated arena for oversized allocations is not required for user arenas. 
316 | // see opt.oversize_threshold 317 | size_t mm_size_user_arena() { 318 | #ifdef JEMALLOC 319 | if (user_arenas_init == 0) { 320 | return 0; 321 | } 322 | 323 | char ctl[128]; 324 | uint64_t epoch = 1; 325 | size_t sz = sizeof(epoch); 326 | je_mallctl("epoch", &epoch, &sz, &epoch, sz); 327 | 328 | size_t resident = 0; 329 | sz = sizeof(resident); 330 | for (unsigned int i = 0; i < MAX_USER_ARENAS; i++) { 331 | size_t res = 0; 332 | snprintf(ctl, 128, "stats.arenas.%u.resident", user_arenas[i]); 333 | (void)je_mallctl(ctl, &res, &sz, NULL, 0); 334 | resident += res; 335 | } 336 | return resident; 337 | #else 338 | return 0; 339 | #endif 340 | } 341 | 342 | size_t mm_size_auto_arena() { 343 | #ifdef JEMALLOC 344 | char ctl[128]; 345 | uint64_t epoch = 1; 346 | size_t sz = sizeof(epoch); 347 | je_mallctl("epoch", &epoch, &sz, &epoch, sz); 348 | 349 | size_t resident = 0; 350 | sz = sizeof(resident); 351 | unsigned int n = mm_narenas(); 352 | for (unsigned int i = 0; i < n; i++) { 353 | if (is_auto_arena(i)) { 354 | size_t res = 0; 355 | snprintf(ctl, 128, "stats.arenas.%u.resident", i); 356 | (void)je_mallctl(ctl, &res, &sz, NULL, 0); 357 | resident += res; 358 | } 359 | } 360 | return resident; 361 | #else 362 | return 0; 363 | #endif 364 | } 365 | 366 | size_t mm_alloc_size() { 367 | size_t allocated, sz; 368 | sz = sizeof(size_t); 369 | #ifdef JEMALLOC 370 | // Force stats cache flush 371 | uint64_t epoch = 1; 372 | sz = sizeof(epoch); 373 | je_mallctl("epoch", &epoch, &sz, &epoch, sz); 374 | 375 | je_mallctl("stats.allocated", &allocated, &sz, NULL, 0); 376 | return allocated; 377 | #else 378 | return 0; 379 | #endif 380 | } 381 | 382 | size_t mm_alloc_size_user_arena() { 383 | #ifdef JEMALLOC 384 | if (user_arenas_init == 0) { 385 | return 0; 386 | } 387 | 388 | char ctl[128]; 389 | uint64_t epoch = 1; 390 | size_t sz = sizeof(epoch); 391 | je_mallctl("epoch", &epoch, &sz, &epoch, sz); 392 | 393 | size_t allocated = 0; 394 | sz = sizeof(allocated); 395 | for (unsigned int i = 0; i < MAX_USER_ARENAS; i++) { 396 | size_t alloc = 0; 397 | snprintf(ctl, 128, "stats.arenas.%u.small.allocated", user_arenas[i]); 398 | je_mallctl(ctl, &alloc, &sz, NULL, 0); 399 | allocated += alloc; 400 | snprintf(ctl, 128, "stats.arenas.%u.large.allocated", user_arenas[i]); 401 | alloc = 0; 402 | je_mallctl(ctl, &alloc, &sz, NULL, 0); 403 | allocated += alloc; 404 | } 405 | return allocated; 406 | #else 407 | return 0; 408 | #endif 409 | } 410 | 411 | size_t mm_alloc_size_auto_arena() { 412 | #ifdef JEMALLOC 413 | char ctl[128]; 414 | uint64_t epoch = 1; 415 | size_t sz = sizeof(epoch); 416 | je_mallctl("epoch", &epoch, &sz, &epoch, sz); 417 | 418 | size_t allocated = 0; 419 | sz = sizeof(allocated); 420 | unsigned int n = mm_narenas(); 421 | for (unsigned int i = 0; i < n; i++) { 422 | if (is_auto_arena(i)) { 423 | size_t alloc = 0; 424 | snprintf(ctl, 128, "stats.arenas.%u.small.allocated", i); 425 | je_mallctl(ctl, &alloc, &sz, NULL, 0); 426 | allocated += alloc; 427 | snprintf(ctl, 128, "stats.arenas.%u.large.allocated", i); 428 | alloc = 0; 429 | je_mallctl(ctl, &alloc, &sz, NULL, 0); 430 | allocated += alloc; 431 | } 432 | } 433 | return allocated; 434 | #else 435 | return 0; 436 | #endif 437 | } 438 | 439 | size_t mm_dirty_size() { 440 | #ifdef JEMALLOC 441 | // Force stats cache flush 442 | uint64_t epoch = 1; 443 | size_t sz = sizeof(epoch); 444 | je_mallctl("epoch", &epoch, &sz, &epoch, sz); 445 | 446 | // Just enough to hold "stats.arenas.%d.pdirty" formatted with max uint64 447 | char ctl[42]; 
448 | snprintf(ctl, 42, "stats.arenas.%d.pdirty", MALLCTL_ARENAS_ALL); 449 | 450 | // Get page size 451 | size_t pageSize = 0; 452 | sz = sizeof(size_t); 453 | je_mallctl("arenas.page", &pageSize, &sz, NULL, 0); 454 | 455 | // Get number of dirty pages 456 | size_t pdirty = 0; 457 | je_mallctl(ctl, &pdirty, &sz, NULL, 0); 458 | 459 | // Return number of dirty bytes 460 | return pdirty * pageSize; 461 | #else 462 | return 0; 463 | #endif 464 | } 465 | 466 | // dirty memory for user arenas 467 | size_t mm_dirty_size_user_arena() { 468 | #ifdef JEMALLOC 469 | if (user_arenas_init == 0) { 470 | return 0; 471 | } 472 | 473 | char ctl[128]; 474 | uint64_t epoch = 1; 475 | size_t sz = sizeof(epoch); 476 | je_mallctl("epoch", &epoch, &sz, &epoch, sz); 477 | 478 | // Get page size 479 | size_t pageSize = 0; 480 | sz = sizeof(pageSize); 481 | je_mallctl("arenas.page", &pageSize, &sz, NULL, 0); 482 | 483 | size_t pdirty = 0; 484 | for (unsigned int i = 0; i < MAX_USER_ARENAS; i++) { 485 | size_t pd = 0; 486 | snprintf(ctl, 128, "stats.arenas.%u.pdirty", user_arenas[i]); 487 | je_mallctl(ctl, &pd, &sz, NULL, 0); 488 | pdirty += pd; 489 | } 490 | // Return number of dirty bytes 491 | return pdirty * pageSize; 492 | #else 493 | return 0; 494 | #endif 495 | } 496 | 497 | size_t mm_dirty_size_auto_arena() { 498 | #ifdef JEMALLOC 499 | char ctl[128]; 500 | uint64_t epoch = 1; 501 | size_t sz = sizeof(epoch); 502 | je_mallctl("epoch", &epoch, &sz, &epoch, sz); 503 | 504 | // Get page size 505 | size_t pageSize = 0; 506 | sz = sizeof(pageSize); 507 | je_mallctl("arenas.page", &pageSize, &sz, NULL, 0); 508 | 509 | size_t pdirty = 0; 510 | unsigned int n = mm_narenas(); 511 | for (unsigned int i = 0; i < n; i++) { 512 | if (is_auto_arena(i)) { 513 | size_t pd = 0; 514 | snprintf(ctl, 128, "stats.arenas.%u.pdirty", i); 515 | je_mallctl(ctl, &pd, &sz, NULL, 0); 516 | pdirty += pd; 517 | } 518 | } 519 | // Return number of dirty bytes 520 | return pdirty * pageSize; 521 | #else 522 | return 0; 523 | #endif 524 | } 525 | 526 | size_t mm_active_size() { 527 | size_t active, sz; 528 | sz = sizeof(size_t); 529 | #ifdef JEMALLOC 530 | // Force stats cache flush 531 | uint64_t epoch = 1; 532 | sz = sizeof(epoch); 533 | je_mallctl("epoch", &epoch, &sz, &epoch, sz); 534 | 535 | je_mallctl("stats.active", &active, &sz, NULL, 0); 536 | return active; 537 | #else 538 | return 0; 539 | #endif 540 | } 541 | 542 | size_t mm_active_size_user_arena() { 543 | #ifdef JEMALLOC 544 | if (user_arenas_init == 0) { 545 | return 0; 546 | } 547 | 548 | char ctl[128]; 549 | uint64_t epoch = 1; 550 | size_t sz = sizeof(epoch); 551 | je_mallctl("epoch", &epoch, &sz, &epoch, sz); 552 | 553 | // Get page size 554 | size_t pageSize = 0; 555 | sz = sizeof(pageSize); 556 | je_mallctl("arenas.page", &pageSize, &sz, NULL, 0); 557 | 558 | size_t pactive = 0; 559 | for (unsigned int i = 0; i < MAX_USER_ARENAS; i++) { 560 | size_t pactv = 0; 561 | snprintf(ctl, 128, "stats.arenas.%u.pactive", user_arenas[i]); 562 | je_mallctl(ctl, &pactv, &sz, NULL, 0); 563 | pactive += pactv; 564 | } 565 | return pactive * pageSize; 566 | #else 567 | return 0; 568 | #endif 569 | } 570 | 571 | size_t mm_active_size_auto_arena() { 572 | #ifdef JEMALLOC 573 | char ctl[128]; 574 | uint64_t epoch = 1; 575 | size_t sz = sizeof(epoch); 576 | je_mallctl("epoch", &epoch, &sz, &epoch, sz); 577 | 578 | // Get page size 579 | size_t pageSize = 0; 580 | sz = sizeof(size_t); 581 | je_mallctl("arenas.page", &pageSize, &sz, NULL, 0); 582 | 583 | size_t pactive = 0; 584 | unsigned 
int n = mm_narenas(); 585 | for (unsigned int i = 0; i < n; i++) { 586 | if (is_auto_arena(i)) { 587 | size_t pactv = 0; 588 | snprintf(ctl, 128, "stats.arenas.%u.pactive", i); 589 | je_mallctl(ctl, &pactv, &sz, NULL, 0); 590 | pactive += pactv; 591 | } 592 | } 593 | return pactive * pageSize; 594 | #else 595 | return 0; 596 | #endif 597 | } 598 | 599 | int mm_free2os() { 600 | #ifdef JEMALLOC 601 | char buf[100]; 602 | sprintf(buf, "arena.%u.purge", MALLCTL_ARENAS_ALL); 603 | return je_mallctl(buf, NULL, NULL, NULL, 0); 604 | #endif 605 | return 0; 606 | } 607 | 608 | int mm_free2os_user_arena(unsigned int idx) { 609 | #ifdef JEMALLOC 610 | if ((user_arenas_init == 0) || (idx >= MAX_USER_ARENAS)) { 611 | return 0; 612 | } 613 | 614 | char ctl[100]; 615 | snprintf(ctl, 100, "arena.%u.purge", user_arenas[idx]); 616 | return je_mallctl(ctl, NULL, NULL, NULL, 0); 617 | #endif 618 | return 0; 619 | } 620 | 621 | int mm_prof_activate() { 622 | #if defined(JEMALLOC) && defined(__linux__) 623 | bool active = true; 624 | return je_mallctl("prof.active", NULL, NULL, &active, sizeof(active)); 625 | #endif 626 | return ENOTSUP; 627 | } 628 | 629 | int mm_prof_deactivate() { 630 | #if defined(JEMALLOC) && defined(__linux__) 631 | bool active = false; 632 | return je_mallctl("prof.active", NULL, NULL, &active, sizeof(active)); 633 | #endif 634 | return ENOTSUP; 635 | } 636 | 637 | int mm_prof_dump(char* filePath) { 638 | #if defined(JEMALLOC) && defined(__linux__) 639 | return je_mallctl("prof.dump", NULL, NULL, &filePath, sizeof(const char *)); 640 | #endif 641 | return ENOTSUP; 642 | } 643 | -------------------------------------------------------------------------------- /mm/malloc.go: -------------------------------------------------------------------------------- 1 | // Copyright 2016-Present Couchbase, Inc. 2 | // 3 | // Use of this software is governed by the Business Source License included in 4 | // the file licenses/BSL-Couchbase.txt. As of the Change Date specified in that 5 | // file, in accordance with the Business Source License, use of this software 6 | // will be governed by the Apache License, Version 2.0, included in the file 7 | // licenses/APL2.txt. 
8 | 9 | package mm 10 | 11 | /* 12 | #include "malloc.h" 13 | #include <string.h> 14 | */ 15 | import "C" 16 | 17 | import ( 18 | "encoding/json" 19 | "fmt" 20 | "sync" 21 | "sync/atomic" 22 | "unsafe" 23 | ) 24 | 25 | var ( 26 | // Debug enables debug stats 27 | Debug = true 28 | mu sync.Mutex 29 | 30 | statSize *C.char 31 | statNregs *C.char 32 | statCurregs *C.char 33 | statCurslabs *C.char 34 | ) 35 | 36 | var stats struct { 37 | allocs uint64 38 | frees uint64 39 | } 40 | 41 | func init() { 42 | statSize = C.CString(C.MM_STAT_SIZE) 43 | statNregs = C.CString(C.MM_STAT_NREGS) 44 | statCurregs = C.CString(C.MM_STAT_CURREGS) 45 | statCurslabs = C.CString(C.MM_STAT_CURSLABS) 46 | } 47 | 48 | // user arenas (note: narenas in je_malloc_conf are auto/default arenas) 49 | func CreateArenas() int { 50 | mu.Lock() 51 | defer mu.Unlock() 52 | 53 | return int(C.mm_create_arenas()) 54 | } 55 | 56 | // Malloc implements a C-like memory allocator 57 | func Malloc(l int) unsafe.Pointer { 58 | if Debug { 59 | atomic.AddUint64(&stats.allocs, 1) 60 | } 61 | return C.mm_malloc(C.size_t(l)) 62 | } 63 | 64 | func MallocArena(l int) unsafe.Pointer { 65 | if Debug { 66 | atomic.AddUint64(&stats.allocs, 1) 67 | } 68 | return C.mm_malloc_user_arena(C.size_t(l), 0) 69 | } 70 | 71 | // key is mainly for Windows, where we do not do thread-based arena assignment 72 | func MallocArenaByKey(l int, key uint16) unsafe.Pointer { 73 | if Debug { 74 | atomic.AddUint64(&stats.allocs, 1) 75 | } 76 | return C.mm_malloc_user_arena(C.size_t(l), C.ushort(key)) 77 | } 78 | 79 | // Free implements a C-like memory deallocator 80 | func Free(p unsafe.Pointer) { 81 | if Debug { 82 | atomic.AddUint64(&stats.frees, 1) 83 | } 84 | C.mm_free(p) 85 | } 86 | 87 | // SizeAt returns the real allocated size for an allocated pointer 88 | func SizeAt(p unsafe.Pointer) int { 89 | return int(C.mm_sizeat(p)) 90 | } 91 | 92 | // Stats returns allocator statistics 93 | // (the jemalloc stats report, plus debug counters when Debug is set) 94 | func Stats() string { 95 | mu.Lock() 96 | defer mu.Unlock() 97 | 98 | buf := C.mm_stats() 99 | s := "---- Stats ----\n" 100 | if Debug { 101 | s += fmt.Sprintf("Mallocs = %d\n"+ 102 | "Frees = %d\n", stats.allocs, stats.frees) 103 | } 104 | 105 | if buf != nil { 106 | s += C.GoString(buf) 107 | C.free(unsafe.Pointer(buf)) 108 | } 109 | 110 | return s 111 | } 112 | 113 | func ArenaStats(i int, str string) uint64 { 114 | buf := C.CString(str) 115 | if buf == nil { 116 | return 0 117 | } 118 | defer C.free(unsafe.Pointer(buf)) 119 | 120 | return uint64(C.mm_arenas_i_stat(C.uint(i), buf)) 121 | } 122 | 123 | func NArenas() int { 124 | return int(C.mm_narenas()) 125 | } 126 | 127 | func NArenasUser() int { 128 | return int(C.mm_user_narenas()) 129 | } 130 | 131 | type JemallocBinStats struct { 132 | FragPercent uint64 133 | Resident uint64 134 | } 135 | 136 | func computeBinFrag(curregs, curslabs, nregs uint64) uint64 { 137 | if curslabs <= 0 || nregs <= 0 { 138 | return 0 139 | } 140 | 141 | return 100 - ((100 * curregs) / (curslabs * nregs)) 142 | } 143 | 144 | func computeBinResident(curslabs, nregs, size uint64) uint64 { 145 | return curslabs * nregs * size 146 | } 147 | 148 | func getBinsStats() map[string]JemallocBinStats { 149 | nbins := uint64(C.mm_arenas_nbins()) 150 | bs := make(map[string]JemallocBinStats) 151 | 152 | for i := uint64(0); i < nbins; i++ { 153 | binInd := C.uint(i) 154 | size := uint64(C.mm_arenas_bin_i_stat(binInd, statSize)) 155 | nregs := uint64(C.mm_arenas_bin_i_stat(binInd, statNregs)) 156 | curregs :=
uint64(C.mm_stats_arenas_merged_bins_j_stat(binInd, statCurregs)) 157 | curslabs := uint64(C.mm_stats_arenas_merged_bins_j_stat(binInd, statCurslabs)) 158 | 159 | sts := JemallocBinStats{} 160 | sts.FragPercent = computeBinFrag(curregs, curslabs, nregs) 161 | sts.Resident = computeBinResident(curslabs, nregs, size) 162 | 163 | bs[fmt.Sprintf("bin_%d", size)] = sts 164 | } 165 | 166 | return bs 167 | } 168 | 
// StatsJson returns the jemalloc stats as JSON, augmented with derived per-bin stats
169 | func StatsJson() string { 170 | mu.Lock() 171 | defer mu.Unlock() 172 | 173 | buf := C.mm_stats_json() 174 | 175 | s := "" 176 | if buf != nil { 177 | s += C.GoString(buf) 178 | C.free(unsafe.Pointer(buf)) 179 | } 180 | 181 | // Unmarshal json and add derived stats to it 182 | stsJson := make(map[string]interface{}) 183 | err := json.Unmarshal([]byte(s), &stsJson) 184 | if err != nil { 185 | return s 186 | } 187 | stsJson["bin_stats"] = getBinsStats() 188 | 189 | data, err := json.Marshal(stsJson) 190 | if err != nil { 191 | return s 192 | } 193 | 194 | return string(data) 195 | } 196 | 197 | // Size returns total resident size merged across all arenas 198 | func Size() uint64 { 199 | return uint64(C.mm_size()) 200 | } 201 | 
// SizeUser returns resident size of the user arenas
202 | func SizeUser() uint64 { 203 | return uint64(C.mm_size_user_arena()) 204 | } 205 | 
// SizeAuto returns resident size of the auto arenas
206 | func SizeAuto() uint64 { 207 | return uint64(C.mm_size_auto_arena()) 208 | } 209 | 
// AllocSize returns allocated bytes merged across all arenas
210 | func AllocSize() uint64 { 211 | return uint64(C.mm_alloc_size()) 212 | } 213 | 
// AllocSizeUser returns allocated bytes in the user arenas
214 | func AllocSizeUser() uint64 { 215 | return uint64(C.mm_alloc_size_user_arena()) 216 | } 217 | 
// AllocSizeAuto returns allocated bytes in the auto arenas
218 | func AllocSizeAuto() uint64 { 219 | return uint64(C.mm_alloc_size_auto_arena()) 220 | } 221 | 
// DirtySize returns dirty-page bytes merged across all arenas
222 | func DirtySize() uint64 { 223 | return uint64(C.mm_dirty_size()) 224 | } 225 | 
// DirtySizeUser returns dirty-page bytes in the user arenas
226 | func DirtySizeUser() uint64 { 227 | return uint64(C.mm_dirty_size_user_arena()) 228 | } 229 | 
// DirtySizeAuto returns dirty-page bytes in the auto arenas
230 | func DirtySizeAuto() uint64 { 231 | return uint64(C.mm_dirty_size_auto_arena()) 232 | } 233 | 
// ActiveSize returns active-page bytes merged across all arenas
234 | func ActiveSize() uint64 { 235 | return uint64(C.mm_active_size()) 236 | } 237 | 
// ActiveSizeUser returns active-page bytes in the user arenas
238 | func ActiveSizeUser() uint64 { 239 | return uint64(C.mm_active_size_user_arena()) 240 | } 241 | 
// ActiveSizeAuto returns active-page bytes in the auto arenas
242 | func ActiveSizeAuto() uint64 { 243 | return uint64(C.mm_active_size_auto_arena()) 244 | } 245 | 
// GetAllocStats returns this package's malloc and free counters
246 | func GetAllocStats() (uint64, uint64) { 247 | return atomic.LoadUint64(&stats.allocs), atomic.LoadUint64(&stats.frees) 248 | } 249 | 250 | // FreeOSMemory forces jemalloc to scrub memory and release back to OS 251 | func FreeOSMemory() error { 252 | errCode := int(C.mm_free2os()) 253 | if errCode != 0 { 254 | return fmt.Errorf("status: %d", errCode) 255 | } 256 | 257 | return nil 258 | } 259 | 260 | // FreeOSMemoryUser scrubs the user arenas' memory in parallel and releases it back to the OS; the first non-zero status is reported (err is mutex-guarded because the workers run concurrently) 261 | func FreeOSMemoryUser() error { 262 | var err error
var errMu sync.Mutex
263 | var wg sync.WaitGroup 264 | for i := 0; i < NArenasUser(); i++ { 265 | wg.Add(1) 266 | go func(id int) { 267 | defer wg.Done() 268 | 269 | errCode := int(C.mm_free2os_user_arena(C.uint(id))) 270 | if errCode != 0 {
errMu.Lock()
271 | if err == nil { err = fmt.Errorf("status: %d", errCode) }
errMu.Unlock()
272 | } 273 | }(i) 274 | } 275 | 276 | wg.Wait() 277 | return err 278 | } 279 | 
// ProfActivate enables jemalloc heap profiling (Linux-only)
280 | func ProfActivate() error { 281 | if errCode := int(C.mm_prof_activate()); errCode != 0 { 282 | return fmt.Errorf("Error during jemalloc profile activate. err = [%v]", 283 | C.GoString(C.strerror(C.int(errCode)))) 284 | } 285 | 286 | return nil 287 | } 288 | 
// ProfDeactivate disables jemalloc heap profiling (Linux-only)
289 | func ProfDeactivate() error { 290 | if errCode := int(C.mm_prof_deactivate()); errCode != 0 { 291 | return fmt.Errorf("Error during jemalloc profile deactivate. 
err = [%v]", 292 | C.GoString(C.strerror(C.int(errCode)))) 293 | } 294 | 295 | return nil 296 | } 297 | 298 | func ProfDump(filePath string) error { 299 | filePathAsCString := C.CString(filePath) 300 | defer C.free(unsafe.Pointer(filePathAsCString)) 301 | 302 | if errCode := int(C.mm_prof_dump(filePathAsCString)); errCode != 0 { 303 | return fmt.Errorf("Error during jemalloc profile dump. err = [%v]", 304 | C.GoString(C.strerror(C.int(errCode)))) 305 | } 306 | 307 | return nil 308 | } 309 | -------------------------------------------------------------------------------- /mm/malloc.h: -------------------------------------------------------------------------------- 1 | // Copyright 2016-Present Couchbase, Inc. 2 | // 3 | // Use of this software is governed by the Business Source License included in 4 | // the file licenses/BSL-Couchbase.txt. As of the Change Date specified in that 5 | // file, in accordance with the Business Source License, use of this software 6 | // will be governed by the Apache License, Version 2.0, included in the file 7 | // licenses/APL2.txt. 8 | #ifndef MALLOC_MM_H 9 | #define MALLOC_MM_H 10 | 11 | #include 12 | 13 | typedef struct { 14 | char *buf; 15 | int offset; 16 | int size; 17 | } stats_buf; 18 | 19 | #define MM_STAT_SIZE "size" 20 | #define MM_STAT_NREGS "nregs" 21 | #define MM_STAT_CURREGS "curregs" 22 | #define MM_STAT_CURSLABS "curslabs" 23 | 24 | int mm_create_arenas(); 25 | 26 | void *mm_malloc(size_t); 27 | 28 | void* mm_malloc_user_arena(size_t, unsigned short); 29 | 30 | void mm_free(void *); 31 | 32 | char *mm_stats(); 33 | 34 | char *mm_stats_json(); 35 | 36 | unsigned int mm_narenas(); 37 | 38 | unsigned int mm_user_narenas(); 39 | 40 | size_t mm_arenas_i_stat(unsigned int, const char*); 41 | 42 | unsigned int mm_arenas_nbins(); 43 | 44 | size_t mm_arenas_bin_i_stat(unsigned int, const char *); 45 | 46 | size_t mm_stats_arenas_merged_bins_j_stat(unsigned int, const char *); 47 | 48 | size_t mm_sizeat(void *); 49 | 50 | size_t mm_size(); 51 | 52 | size_t mm_size_user_arena(); 53 | 54 | size_t mm_size_auto_arena(); 55 | 56 | size_t mm_alloc_size(); 57 | 58 | size_t mm_alloc_size_user_arena(); 59 | 60 | size_t mm_alloc_size_auto_arena(); 61 | 62 | size_t mm_dirty_size(); 63 | 64 | size_t mm_dirty_size_user_arena(); 65 | 66 | size_t mm_dirty_size_auto_arena(); 67 | 68 | size_t mm_active_size(); 69 | 70 | size_t mm_active_size_user_arena(); 71 | 72 | size_t mm_active_size_auto_arena(); 73 | 74 | int mm_free2os(); 75 | 76 | int mm_free2os_user_arena(unsigned int); 77 | 78 | int mm_prof_activate(); 79 | 80 | int mm_prof_deactivate(); 81 | 82 | int mm_prof_dump(char* filePath); 83 | 84 | #endif 85 | -------------------------------------------------------------------------------- /mm/malloc_perf_test.go: -------------------------------------------------------------------------------- 1 | //go:build perf 2 | // +build perf 3 | 4 | // Copyright 2024-Present Couchbase, Inc. 5 | // 6 | // Use of this software is governed by the Business Source License included in 7 | // the file licenses/BSL-Couchbase.txt. As of the Change Date specified in that 8 | // file, in accordance with the Business Source License, use of this software 9 | // will be governed by the Apache License, Version 2.0, included in the file 10 | // licenses/APL2.txt. 
11 | 12 | package mm 13 | 14 | import ( 15 | "fmt" 16 | "testing" 17 | ) 18 | 19 | func BenchmarkArenaMalloc(b *testing.B) { 20 | CreateArenas() 21 | defer func() { 22 | fmt.Println(Stats()) 23 | }() 24 | 25 | sz := 128 26 | 27 | b.Run("Malloc", func(b *testing.B) { 28 | for i := 0; i < b.N; i++ { 29 | Malloc(sz) 30 | } 31 | }) 32 | 33 | b.Run("MallocArena", func(b *testing.B) { 34 | for i := 0; i < b.N; i++ { 35 | MallocArena(sz) 36 | } 37 | }) 38 | } 39 | -------------------------------------------------------------------------------- /mm/malloc_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2016-Present Couchbase, Inc. 2 | // 3 | // Use of this software is governed by the Business Source License included in 4 | // the file licenses/BSL-Couchbase.txt. As of the Change Date specified in that 5 | // file, in accordance with the Business Source License, use of this software 6 | // will be governed by the Apache License, Version 2.0, included in the file 7 | // licenses/APL2.txt. 8 | 9 | package mm 10 | 11 | import ( 12 | "encoding/json" 13 | "fmt" 14 | "os" 15 | "runtime" 16 | "sync" 17 | "testing" 18 | "time" 19 | "unsafe" 20 | ) 21 | 22 | func TestJeMalloc(t *testing.T) { 23 | Malloc(100 * 1024 * 1024) 24 | fmt.Println("size:", Size()) 25 | fmt.Println(Stats()) 26 | 27 | stsJsonStr := StatsJson() 28 | unmarshaledSts := new(map[string]interface{}) 29 | 30 | if err := json.Unmarshal([]byte(stsJsonStr), unmarshaledSts); err != nil { 31 | t.Errorf("Failed to unmarshal json stats: %v", err) 32 | } 33 | 34 | buf, err := json.MarshalIndent(unmarshaledSts, "", " ") 35 | if err != nil { 36 | t.Errorf("Failed to marshal again: %v", err) 37 | } 38 | fmt.Println(string(buf)) 39 | } 40 | 41 | func TestJeMallocSizeAt(t *testing.T) { 42 | p := Malloc(89) 43 | if sz := SizeAt(p); sz != 96 { 44 | t.Errorf("Expected sizeclass 96, but got %d", sz) 45 | } 46 | } 47 | 48 | func TestJeMallocProf(t *testing.T) { 49 | profPath := "TestProf.prof" 50 | 51 | if runtime.GOOS != "linux" { 52 | return 53 | } 54 | 55 | if err := os.Remove(profPath); err != nil && !os.IsNotExist(err) { 56 | t.Errorf("Could not remove old profile: err[%v]", err) 57 | return 58 | } 59 | 60 | if err := ProfActivate(); err != nil { 61 | t.Error(err) 62 | return 63 | } 64 | 65 | defer func() { 66 | if err := ProfDeactivate(); err != nil { 67 | t.Error(err) 68 | } 69 | }() 70 | 71 | if err := ProfDump(profPath); err != nil { 72 | t.Error(err) 73 | return 74 | } 75 | } 76 | 77 | func TestJeMallocArenaCreate(t *testing.T) { 78 | n := NArenas() 79 | if CreateArenas() == 0 { 80 | n2 := NArenas() 81 | t.Log(n, n2) 82 | if n >= n2 { 83 | t.Error("error") 84 | } 85 | } 86 | } 87 | 88 | // run this test separately as arena stats can get polluted 89 | func TestJeMallocArenaStatsFromZero(t *testing.T) { 90 | n := 100000 91 | nThreads := runtime.GOMAXPROCS(0) * 2 // 2 * G 92 | var wg sync.WaitGroup 93 | 94 | dArenas := NArenas() 95 | 96 | // check user arena configuration 97 | if err := CreateArenas(); err != 0 { 98 | return 99 | } 100 | 101 | defer func() { 102 | fmt.Println(Stats()) 103 | }() 104 | 105 | ptrs := make([][]unsafe.Pointer, nThreads) 106 | 107 | // arena stats are not updated immediately (thread based) 108 | doWait := func() { 109 | for i := 0; i < 1000; i++ { 110 | AllocSize() 111 | time.Sleep(time.Millisecond * 10) 112 | } 113 | } 114 | 115 | doAllocs := func(useX bool) { 116 | // check thread assignment 117 | t0 := time.Now() 118 | for i := 0; i < nThreads; i++ { 119 | 
wg.Add(1) 120 | ptrs[i] = make([]unsafe.Pointer, 0) 121 | go func(k int) { 122 | defer wg.Done() 123 | sz := 1024 124 | for j := 0; j < n; j++ { 125 | if useX { 126 | ptrs[k] = append(ptrs[k], MallocArena(sz)) 127 | } else { 128 | ptrs[k] = append(ptrs[k], Malloc(sz)) 129 | } 130 | } 131 | }(i) 132 | } 133 | wg.Wait() 134 | t.Logf("doAllocs Malloc(%v): %v", useX, time.Since(t0)) 135 | doWait() 136 | } 137 | 138 | doFree := func() { 139 | t0 := time.Now() 140 | for i := 0; i < nThreads; i++ { 141 | wg.Add(1) 142 | go func(k int) { 143 | defer wg.Done() 144 | for j := 0; j < len(ptrs[k]); j++ { 145 | Free(ptrs[k][j]) 146 | } 147 | ptrs[k] = nil 148 | }(i) 149 | } 150 | wg.Wait() 151 | t.Logf("doFree Free:%v", time.Since(t0)) 152 | doWait() 153 | } 154 | 155 | // check default allocations 156 | doAllocs(false) 157 | 158 | t.Run("auto_arena_alloc_stats", func(t *testing.T) { 159 | t.Log("Auto Arena Resident", SizeAuto(), "Alloc", AllocSizeAuto(), "Active", ActiveSizeAuto()) 160 | 161 | if SizeAuto() == 0 || AllocSizeAuto() == 0 { 162 | t.Error("error") 163 | return 164 | } 165 | 166 | if SizeAuto() < SizeUser() { 167 | t.Error("error") 168 | return 169 | } 170 | 171 | if AllocSizeUser() > 0 { 172 | t.Error("error") 173 | return 174 | } 175 | 176 | // no active pages for user arenas yet 177 | if ActiveSizeUser() > 0 { 178 | t.Error("error") 179 | return 180 | } 181 | 182 | // merged stat should always be ge than user arena stats 183 | if AllocSize() < AllocSizeAuto() { 184 | t.Error("error") 185 | return 186 | } 187 | }) 188 | 189 | doFree() 190 | 191 | t.Run("auto_arena_free_stats", func(t *testing.T) { 192 | for i := dArenas; i < NArenas(); i++ { 193 | if ArenaStats(i, "small.ndalloc")+ArenaStats(i, "large.ndalloc") > 0 { 194 | t.Error("error") 195 | return 196 | } 197 | } 198 | }) 199 | 200 | // check user arena allocations 201 | doAllocs(true) 202 | 203 | m := uint64(0) 204 | t.Run("user_arena_alloc_stats", func(t *testing.T) { 205 | t.Log("User Resident", SizeUser(), "Alloc", AllocSizeUser(), "Active", ActiveSizeUser()) 206 | 207 | if SizeUser() == 0 || AllocSizeUser() == 0 || ActiveSizeUser() == 0 { 208 | t.Error("error") 209 | return 210 | } 211 | m = AllocSizeUser() 212 | 213 | if Size() < SizeUser() { 214 | t.Error("error") 215 | return 216 | } 217 | 218 | for i := dArenas; i < NArenas(); i++ { 219 | if ArenaStats(i, "small.nmalloc") + ArenaStats(i, "large.nmalloc") == 0 { 220 | t.Error("error") 221 | return 222 | } 223 | } 224 | }) 225 | 226 | doFree() 227 | 228 | t.Run("user_arena_free_stats", func(t *testing.T) { 229 | if Size() < SizeUser() { 230 | t.Error("error") 231 | return 232 | } 233 | 234 | if m <= AllocSizeUser() { 235 | t.Error("error") 236 | return 237 | } 238 | 239 | for i := dArenas; i < NArenas(); i++ { 240 | if ArenaStats(i, "small.nmalloc") < ArenaStats(i, "large.nmalloc") { 241 | t.Error("error") 242 | return 243 | } 244 | } 245 | }) 246 | 247 | t.Run("purge_arena", func(t *testing.T) { 248 | m, mx, mg := Size(), SizeUser(), SizeAuto() 249 | 250 | t0 := time.Now() 251 | FreeOSMemoryUser() 252 | fmt.Println("User Arena Purge Duration:", time.Since(t0)) 253 | doWait() 254 | 255 | // merged resident should drop 256 | if Size() >= m { 257 | t.Error("error") 258 | return 259 | } 260 | // user arena resident should drop 261 | if SizeUser() >= mx { 262 | t.Error("error") 263 | return 264 | } 265 | 266 | // auto arena purged 267 | t0 = time.Now() 268 | FreeOSMemory() 269 | fmt.Println("Auto Arena Purge Duration", time.Since(t0)) 270 | doWait() 271 | 272 | if SizeAuto() >= 
mg { 273 | t.Error("error") 274 | return 275 | } 276 | }) 277 | 278 | t.Run("decay_statistics", func(t *testing.T) { 279 | l1 := ArenaStats(0, "dirty_decay_ms") 280 | l2 := ArenaStats(0, "muzzy_decay_ms") 281 | for i := 0; i < NArenas(); i++ { 282 | ms := ArenaStats(i, "dirty_decay_ms") 283 | if ms > l1 { 284 | t.Error("error", ms, l1) 285 | } 286 | ms = ArenaStats(i, "muzzy_decay_ms") 287 | if ms > l2 { 288 | t.Error("error", ms, l2) 289 | } 290 | t.Log("DirtyPurged:", ArenaStats(i, "dirty_purged")) 291 | } 292 | }) 293 | } 294 | 295 | // test dedicated arena is not used for large allocations for user arenas 296 | func TestJeMallocArenaLarge(t *testing.T) { 297 | nThreads := runtime.GOMAXPROCS(0) 298 | var wg sync.WaitGroup 299 | 300 | defer func() { 301 | fmt.Println(Stats()) 302 | }() 303 | 304 | if err := CreateArenas(); err != 0 && err != -1 { 305 | t.Error("error", err) 306 | return 307 | } 308 | 309 | ptrs := make([][]unsafe.Pointer, nThreads) 310 | 311 | sz := 10 * 1024 * 1024 312 | iters := 1000 313 | doAllocs := func() { 314 | for i := 0; i < nThreads; i++ { 315 | wg.Add(1) 316 | ptrs[i] = make([]unsafe.Pointer, 0) 317 | go func(k int) { 318 | defer wg.Done() 319 | for j := 0; j < iters; j++ { 320 | ptrs[k] = append(ptrs[k], MallocArena(sz)) 321 | } 322 | }(i) 323 | } 324 | wg.Wait() 325 | } 326 | 327 | doFree := func() { 328 | for i := 0; i < nThreads; i++ { 329 | wg.Add(1) 330 | go func(k int) { 331 | defer wg.Done() 332 | for j := 0; j < len(ptrs[k]); j++ { 333 | Free(ptrs[k][j]) 334 | } 335 | ptrs[k] = nil 336 | }(i) 337 | } 338 | wg.Wait() 339 | } 340 | 341 | for i := 0; i < 5; i++ { 342 | doAllocs() 343 | t.Log("User Arena Resident", SizeUser(), AllocSizeUser(), (sz * iters * nThreads)) 344 | doFree() 345 | FreeOSMemoryUser() 346 | t.Log("User Arena Resident", SizeUser(), AllocSizeUser(), (sz * iters * nThreads)) 347 | } 348 | } 349 | -------------------------------------------------------------------------------- /nitro_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2016-Present Couchbase, Inc. 2 | // 3 | // Use of this software is governed by the Business Source License included in 4 | // the file licenses/BSL-Couchbase.txt. As of the Change Date specified in that 5 | // file, in accordance with the Business Source License, use of this software 6 | // will be governed by the Apache License, Version 2.0, included in the file 7 | // licenses/APL2.txt. 
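The tests above exercise the raw allocator; below is a minimal sketch of what using it for an off-Go-heap byte buffer looks like (assumes Go 1.17+ for unsafe.Slice and a jemalloc-enabled build of `mm`):

    package main

    import (
        "fmt"
        "unsafe"

        "github.com/couchbase/nitro/mm"
    )

    func main() {
        p := mm.Malloc(89)
        defer mm.Free(p) // caller owns the lifetime; nothing here is GC-managed

        // jemalloc rounds the request up to a size class: 96 bytes for an
        // 89-byte request, which is what TestJeMallocSizeAt above asserts.
        n := mm.SizeAt(p)
        buf := unsafe.Slice((*byte)(p), n)
        copy(buf, "hello")
        fmt.Println(n, string(buf[:5]))
    }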
8 | 9 | package nitro 10 | 11 | import ( 12 | "encoding/binary" 13 | "fmt" 14 | "math/rand" 15 | "os" 16 | "path/filepath" 17 | "runtime" 18 | "sync" 19 | "sync/atomic" 20 | "testing" 21 | "time" 22 | 23 | "github.com/couchbase/nitro/mm" 24 | ) 25 | 26 | var testConf Config 27 | 28 | func init() { 29 | testConf = DefaultConfig() 30 | testConf.UseMemoryMgmt(mm.MallocArena, mm.Free) 31 | testConf.UseDeltaInterleaving() 32 | Debug(true) 33 | } 34 | 35 | func TestInsert(t *testing.T) { 36 | db := NewWithConfig(testConf) 37 | defer db.Close() 38 | 39 | w := db.NewWriter() 40 | for i := 0; i < 2000; i++ { 41 | w.Put([]byte(fmt.Sprintf("%010d", i))) 42 | } 43 | 44 | for i := 1750; i < 2000; i++ { 45 | w.Delete([]byte(fmt.Sprintf("%010d", i))) 46 | } 47 | snap, _ := w.NewSnapshot() 48 | defer snap.Close() 49 | 50 | for i := 2000; i < 5000; i++ { 51 | w.Put([]byte(fmt.Sprintf("%010d", i))) 52 | } 53 | 54 | snap2, _ := w.NewSnapshot() 55 | defer snap2.Close() 56 | 57 | count := 0 58 | itr := db.NewIterator(snap) 59 | defer itr.Close() 60 | 61 | itr.SeekFirst() 62 | itr.Seek([]byte(fmt.Sprintf("%010d", 1500))) 63 | for ; itr.Valid(); itr.Next() { 64 | expected := fmt.Sprintf("%010d", count+1500) 65 | got := string(itr.Get()) 66 | count++ 67 | if got != expected { 68 | t.Errorf("Expected %s, got %v", expected, got) 69 | } 70 | } 71 | 72 | if count != 250 { 73 | t.Errorf("Expected count = 250, got %v", count) 74 | } 75 | } 76 | 77 | func doInsert(db *Nitro, w *Writer, wg *sync.WaitGroup, n int, isRand bool, shouldSnap bool) { 78 | defer wg.Done() 79 | rnd := rand.New(rand.NewSource(int64(rand.Int()))) 80 | for i := 0; i < n; i++ { 81 | var val int 82 | if isRand { 83 | val = rnd.Int() 84 | } else { 85 | val = i 86 | } 87 | if shouldSnap && i%100000 == 0 { 88 | s, _ := w.NewSnapshot() 89 | s.Close() 90 | } 91 | buf := make([]byte, 8) 92 | binary.BigEndian.PutUint64(buf, uint64(val)) 93 | w.Put(buf) 94 | } 95 | } 96 | 97 | func TestInsertPerf(t *testing.T) { 98 | var wg sync.WaitGroup 99 | db := NewWithConfig(testConf) 100 | defer db.Close() 101 | n := (20000000 / runtime.GOMAXPROCS(0)) * runtime.GOMAXPROCS(0) 102 | t0 := time.Now() 103 | for i := 0; i < runtime.GOMAXPROCS(0); i++ { 104 | wg.Add(1) 105 | w := db.NewWriter() 106 | go doInsert(db, w, &wg, n/runtime.GOMAXPROCS(0), true, false) 107 | } 108 | wg.Wait() 109 | 110 | snap, _ := db.NewSnapshot() 111 | defer snap.Close() 112 | dur := time.Since(t0) 113 | VerifyCount(snap, n, t) 114 | fmt.Printf("%d items took %v -> %v items/s snapshots_created %v live_snapshots %v\n", 115 | n, dur, float64(n)/float64(dur.Seconds()), db.GetCurrSn(), len(db.GetSnapshots())) 116 | } 117 | 118 | func doGet(t *testing.T, db *Nitro, snap *Snapshot, wg *sync.WaitGroup, n int) { 119 | defer wg.Done() 120 | rnd := rand.New(rand.NewSource(int64(rand.Int()))) 121 | 122 | buf := make([]byte, 8) 123 | itr := db.NewIterator(snap) 124 | defer itr.Close() 125 | for i := 0; i < n; i++ { 126 | val := rnd.Int() % n 127 | binary.BigEndian.PutUint64(buf, uint64(val)) 128 | itr.Seek(buf) 129 | if !itr.Valid() { 130 | t.Errorf("Expected to find %v", val) 131 | } 132 | } 133 | } 134 | 135 | func TestInsertDuplicates(t *testing.T) { 136 | db := NewWithConfig(testConf) 137 | defer db.Close() 138 | 139 | w := db.NewWriter() 140 | for i := 0; i < 2000; i++ { 141 | w.Put([]byte(fmt.Sprintf("%010d", i))) 142 | } 143 | 144 | snap1, _ := w.NewSnapshot() 145 | defer snap1.Close() 146 | 147 | // Duplicate 148 | for i := 0; i < 2000; i++ { 149 | key := fmt.Sprintf("%010d", i) 150 | newNode := 
w.Put2([]byte(key)) 151 | if newNode != nil { 152 | t.Errorf("Duplicate unexpected for %s", key) 153 | } 154 | } 155 | 156 | for i := 1500; i < 2000; i++ { 157 | w.Delete([]byte(fmt.Sprintf("%010d", i))) 158 | } 159 | snap2, _ := w.NewSnapshot() 160 | defer snap2.Close() 161 | 162 | for i := 1500; i < 5000; i++ { 163 | key := fmt.Sprintf("%010d", i) 164 | newNode := w.Put2([]byte(key)) 165 | if newNode == nil { 166 | t.Errorf("Expected successful insert for %s", key) 167 | } 168 | } 169 | 170 | snap, _ := w.NewSnapshot() 171 | defer snap.Close() 172 | count := 0 173 | itr := db.NewIterator(snap) 174 | defer itr.Close() 175 | 176 | itr.SeekFirst() 177 | for ; itr.Valid(); itr.Next() { 178 | expected := fmt.Sprintf("%010d", count) 179 | got := string(itr.Get()) 180 | count++ 181 | if got != expected { 182 | t.Errorf("Expected %s, got %v", expected, got) 183 | } 184 | } 185 | 186 | if count != 5000 { 187 | t.Errorf("Expected count = 5000, got %v", count) 188 | } 189 | } 190 | 191 | func TestGetPerf(t *testing.T) { 192 | var wg sync.WaitGroup 193 | db := NewWithConfig(testConf) 194 | defer db.Close() 195 | n := 1000000 196 | wg.Add(1) 197 | w := db.NewWriter() 198 | go doInsert(db, w, &wg, n, false, true) 199 | wg.Wait() 200 | snap, _ := db.NewSnapshot() 201 | defer snap.Close() 202 | VerifyCount(snap, n, t) 203 | 204 | t0 := time.Now() 205 | total := n * runtime.GOMAXPROCS(0) 206 | for i := 0; i < runtime.GOMAXPROCS(0); i++ { 207 | wg.Add(1) 208 | go doGet(t, db, snap, &wg, n) 209 | } 210 | wg.Wait() 211 | dur := time.Since(t0) 212 | fmt.Printf("%d items took %v -> %v items/s\n", total, dur, float64(total)/float64(dur.Seconds())) 213 | } 214 | 215 | func VerifyCount(snap *Snapshot, n int, t *testing.T) { 216 | 217 | if c := CountItems(snap); c != n { 218 | t.Errorf("Expected count %d, got %d", n, c) 219 | } 220 | } 221 | 222 | func CountItems(snap *Snapshot) int { 223 | var count int 224 | itr := snap.NewIterator() 225 | for itr.SeekFirst(); itr.Valid(); itr.Next() { 226 | count++ 227 | } 228 | itr.Close() 229 | return count 230 | } 231 | 232 | func TestLoadStoreDisk(t *testing.T) { 233 | os.RemoveAll("db.dump") 234 | var wg sync.WaitGroup 235 | db := NewWithConfig(testConf) 236 | defer db.Close() 237 | n := (1000000 / runtime.GOMAXPROCS(0)) * runtime.GOMAXPROCS(0) 238 | t0 := time.Now() 239 | for i := 0; i < runtime.GOMAXPROCS(0); i++ { 240 | wg.Add(1) 241 | w := db.NewWriter() 242 | go doInsert(db, w, &wg, n/runtime.GOMAXPROCS(0), true, false) 243 | } 244 | wg.Wait() 245 | fmt.Printf("Inserting %v items took %v\n", n, time.Since(t0)) 246 | snap0, _ := db.NewSnapshot() 247 | defer snap0.Close() 248 | snap, _ := db.NewSnapshot() 249 | fmt.Println(db.DumpStats()) 250 | 251 | t0 = time.Now() 252 | err := db.StoreToDisk("db.dump", snap, 8, nil) 253 | if err != nil { 254 | t.Errorf("Expected no error. got=%v", err) 255 | } 256 | 257 | fmt.Printf("Storing to disk took %v\n", time.Since(t0)) 258 | 259 | snap.Close() 260 | db = NewWithConfig(testConf) 261 | defer db.Close() 262 | t0 = time.Now() 263 | snap, err = db.LoadFromDisk("db.dump", 8, nil) 264 | defer snap.Close() 265 | if err != nil { 266 | t.Errorf("Expected no error. got=%v", err) 267 | } 268 | fmt.Printf("Loading from disk took %v\n", time.Since(t0)) 269 | 270 | count := CountItems(snap) 271 | if count != n { 272 | t.Errorf("Expected %v, got %v", n, count) 273 | } 274 | 275 | count = int(snap.Count()) 276 | if count != n { 277 | t.Errorf("Count mismatch on snapshot. 
Expected %d, got %d", n, count) 278 | } 279 | fmt.Println(db.DumpStats()) 280 | } 281 | 282 | func TestStoreDiskShutdown(t *testing.T) { 283 | os.RemoveAll("db.dump") 284 | var wg sync.WaitGroup 285 | db := NewWithConfig(testConf) 286 | n := (1000000 / runtime.GOMAXPROCS(0)) * runtime.GOMAXPROCS(0) 287 | t0 := time.Now() 288 | for i := 0; i < runtime.GOMAXPROCS(0); i++ { 289 | wg.Add(1) 290 | w := db.NewWriter() 291 | go doInsert(db, w, &wg, n/runtime.GOMAXPROCS(0), true, false) 292 | } 293 | wg.Wait() 294 | fmt.Printf("Inserting %v items took %v\n", n, time.Since(t0)) 295 | snap0, _ := db.NewSnapshot() 296 | snap, _ := db.NewSnapshot() 297 | fmt.Println(db.DumpStats()) 298 | 299 | errch := make(chan error, 1) 300 | go func() { 301 | errch <- db.StoreToDisk("db.dump", snap, 8, nil) 302 | }() 303 | 304 | snap0.Close() 305 | snap.Close() 306 | db.Close() 307 | 308 | if err := <-errch; err != ErrShutdown { 309 | t.Errorf("Expected ErrShutdown. got=%v", err) 310 | } 311 | } 312 | 313 | func TestDelete(t *testing.T) { 314 | expected := 10 315 | db := NewWithConfig(testConf) 316 | defer db.Close() 317 | w := db.NewWriter() 318 | for i := 0; i < expected; i++ { 319 | w.Put([]byte(fmt.Sprintf("%010d", i))) 320 | } 321 | 322 | snap1, _ := w.NewSnapshot() 323 | got := CountItems(snap1) 324 | if got != expected { 325 | t.Errorf("Expected 2000, got %d", got) 326 | } 327 | fmt.Println(db.DumpStats()) 328 | 329 | for i := 0; i < expected; i++ { 330 | w.Delete([]byte(fmt.Sprintf("%010d", i))) 331 | } 332 | 333 | for i := 0; i < expected; i++ { 334 | w.Put([]byte(fmt.Sprintf("%010d", i))) 335 | } 336 | snap2, _ := w.NewSnapshot() 337 | snap1.Close() 338 | snap3, _ := w.NewSnapshot() 339 | snap2.Close() 340 | time.Sleep(time.Second) 341 | 342 | got = CountItems(snap3) 343 | snap3.Close() 344 | 345 | if got != expected { 346 | t.Errorf("Expected %d, got %d", expected, got) 347 | } 348 | fmt.Println(db.DumpStats()) 349 | } 350 | 351 | func doReplace(wg *sync.WaitGroup, t *testing.T, w *Writer, start, end int) { 352 | defer wg.Done() 353 | 354 | for ; start < end; start++ { 355 | w.Delete([]byte(fmt.Sprintf("%010d", start))) 356 | w.Put([]byte(fmt.Sprintf("%010d", start))) 357 | } 358 | } 359 | 360 | func TestGCPerf(t *testing.T) { 361 | var wg sync.WaitGroup 362 | var last *Snapshot 363 | 364 | db := NewWithConfig(testConf) 365 | defer db.Close() 366 | perW := 1000 367 | iterations := 1000 368 | nW := runtime.GOMAXPROCS(0) 369 | 370 | var ws []*Writer 371 | 372 | for i := 0; i < nW; i++ { 373 | ws = append(ws, db.NewWriter()) 374 | } 375 | 376 | nc := 0 377 | for x := 0; x < iterations; x++ { 378 | for i := 0; i < nW; i++ { 379 | wg.Add(1) 380 | go doReplace(&wg, t, ws[i], i*perW, i*perW+perW) 381 | } 382 | wg.Wait() 383 | curr, _ := db.NewSnapshot() 384 | if last != nil { 385 | last.Close() 386 | } 387 | 388 | last = curr 389 | nc += db.store.GetStats().NodeCount 390 | } 391 | 392 | snap, _ := db.NewSnapshot() 393 | defer snap.Close() 394 | last.Close() 395 | 396 | waits := 0 397 | for db.store.GetStats().NodeCount > nW*perW { 398 | time.Sleep(time.Millisecond) 399 | waits++ 400 | } 401 | 402 | fmt.Printf("final_node_count = %v, average_live_node_count = %v, wait_time_for_collection = %vms\n", db.store.GetStats().NodeCount, nc/iterations, waits) 403 | } 404 | 405 | func TestMemoryInUse(t *testing.T) { 406 | db := NewWithConfig(testConf) 407 | defer db.Close() 408 | 409 | dumpStats := func() { 410 | fmt.Printf("ItemsCount: %v, MemoryInUse: %v, NodesCount: %v\n", db.ItemsCount(), MemoryInUse(), 
db.store.GetStats().NodeCount) 411 | } 412 | w := db.NewWriter() 413 | for i := 0; i < 5000; i++ { 414 | w.Put([]byte(fmt.Sprintf("%010d", i))) 415 | } 416 | snap1, _ := w.NewSnapshot() 417 | 418 | dumpStats() 419 | 420 | for i := 0; i < 5000; i++ { 421 | w.Delete([]byte(fmt.Sprintf("%010d", i))) 422 | } 423 | 424 | snap1.Close() 425 | snap2, _ := w.NewSnapshot() 426 | snap3, _ := w.NewSnapshot() 427 | defer snap3.Close() 428 | snap2.Close() 429 | time.Sleep(time.Second) 430 | dumpStats() 431 | } 432 | 433 | func TestFullScan(t *testing.T) { 434 | var wg sync.WaitGroup 435 | db := NewWithConfig(testConf) 436 | defer db.Close() 437 | n := (1000000 / runtime.GOMAXPROCS(0)) * runtime.GOMAXPROCS(0) 438 | t0 := time.Now() 439 | for i := 0; i < runtime.GOMAXPROCS(0); i++ { 440 | wg.Add(1) 441 | w := db.NewWriter() 442 | go doInsert(db, w, &wg, n/runtime.GOMAXPROCS(0), true, false) 443 | } 444 | wg.Wait() 445 | fmt.Printf("Inserting %v items took %v\n", n, time.Since(t0)) 446 | snap, _ := db.NewSnapshot() 447 | defer snap.Close() 448 | VerifyCount(snap, n, t) 449 | nc := db.store.GetStats().NodeCount 450 | if n != nc { 451 | t.Errorf("skiplist statsReport NodeCount mismatch, got: %d, expected: %d", nc, n) 452 | } 453 | fmt.Println(db.DumpStats()) 454 | t0 = time.Now() 455 | c := CountItems(snap) 456 | fmt.Printf("Full iteration of %d items took %v\n", c, time.Since(t0)) 457 | } 458 | 459 | func TestVisitor(t *testing.T) { 460 | const shards = 32 461 | const concurrency = 8 462 | const n = 1000000 463 | 464 | var wg sync.WaitGroup 465 | db := NewWithConfig(testConf) 466 | defer db.Close() 467 | expectedSum := int64((n - 1) * (n / 2)) 468 | 469 | wg.Add(1) 470 | w := db.NewWriter() 471 | doInsert(db, w, &wg, n, false, false) 472 | snap, _ := db.NewSnapshot() 473 | defer snap.Close() 474 | fmt.Println(db.DumpStats()) 475 | 476 | var counts [shards]int64 477 | var startEndRange [shards][2]uint64 478 | var sum int64 479 | 480 | callb := func(itm *Item, shard int) error { 481 | v := binary.BigEndian.Uint64(itm.Bytes()) 482 | atomic.AddInt64(&sum, int64(v)) 483 | atomic.AddInt64(&counts[shard], 1) 484 | 485 | if shard > 0 && startEndRange[shard][0] == 0 { 486 | startEndRange[shard][0] = v 487 | } else { 488 | if startEndRange[shard][1] > v { 489 | t.Errorf("shard-%d validation of sort order %d > %d", shard, startEndRange[shard][1], v) 490 | } 491 | startEndRange[shard][1] = v 492 | } 493 | 494 | return nil 495 | } 496 | 497 | total := 0 498 | t0 := time.Now() 499 | db.Visitor(snap, callb, shards, concurrency) 500 | dur := time.Since(t0) 501 | fmt.Printf("Took %v to iterate %v items, %v items/s\n", dur, n, float32(n)/float32(dur.Seconds())) 502 | 503 | for i, v := range counts { 504 | fmt.Printf("shard - %d count = %d, range: %d-%d\n", i, v, startEndRange[i][0], startEndRange[i][1]) 505 | total += int(v) 506 | } 507 | 508 | if total != n { 509 | t.Errorf("Expected count %d, received %d", n, total) 510 | } 511 | 512 | if expectedSum != sum { 513 | t.Errorf("Expected sum %d, received %d", expectedSum, sum) 514 | } 515 | } 516 | 517 | func TestVisitorError(t *testing.T) { 518 | const n = 100000 519 | var wg sync.WaitGroup 520 | db := NewWithConfig(testConf) 521 | defer db.Close() 522 | 523 | wg.Add(1) 524 | w := db.NewWriter() 525 | doInsert(db, w, &wg, n, false, false) 526 | snap, _ := db.NewSnapshot() 527 | defer snap.Close() 528 | 529 | errVisitor := fmt.Errorf("visitor failed") 530 | callb := func(itm *Item, shard int) error { 531 | v := binary.BigEndian.Uint64(itm.Bytes()) 532 | if v == 90000 { 
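// Fail deliberately partway through the scan; Visitor must stop and
// hand this error back to the caller, as asserted below.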
return errVisitor 534 | } 535 | return nil 536 | } 537 | 538 | if db.Visitor(snap, callb, 4, 4) != errVisitor { 539 | t.Errorf("Expected error") 540 | } 541 | } 542 | 543 | func doUpdate(db *Nitro, wg *sync.WaitGroup, w *Writer, start, end int, version int) { 544 | defer wg.Done() 545 | for ; start < end; start++ { 546 | oldval := uint64(start) + uint64(version-1)*10000000 547 | val := uint64(start) + uint64(version)*10000000 548 | buf1 := make([]byte, 8) 549 | binary.BigEndian.PutUint64(buf1, uint64(val)) 550 | buf2 := make([]byte, 8) 551 | binary.BigEndian.PutUint64(buf2, uint64(oldval)) 552 | if version > 1 { 553 | if !w.Delete(buf2) { 554 | panic("delete failed") 555 | } 556 | } 557 | w.Put(buf1) 558 | } 559 | } 560 | 561 | func TestLoadDeltaStoreDisk(t *testing.T) { 562 | os.RemoveAll("db.dump") 563 | conf := DefaultConfig() 564 | conf.UseDeltaInterleaving() 565 | db := NewWithConfig(conf) 566 | 567 | var writers []*Writer 568 | for i := 0; i < runtime.GOMAXPROCS(0); i++ { 569 | writers = append(writers, db.NewWriter()) 570 | } 571 | 572 | chunk := 1000000 / runtime.GOMAXPROCS(0) 573 | total := chunk * runtime.GOMAXPROCS(0) 574 | version := 0 575 | 576 | doMutate := func() *Snapshot { 577 | var wg sync.WaitGroup 578 | version++ 579 | for i := 0; i < runtime.GOMAXPROCS(0); i++ { 580 | wg.Add(1) 581 | start := i * chunk 582 | end := start + chunk 583 | go doUpdate(db, &wg, writers[i], start, end, version) 584 | } 585 | wg.Wait() 586 | 587 | snap, _ := db.NewSnapshot() 588 | return snap 589 | } 590 | 591 | var snap, snapw *Snapshot 592 | for x := 0; x < 2; x++ { 593 | if snap != nil { 594 | snap.Close() 595 | } 596 | snap = doMutate() 597 | } 598 | 599 | waiter := make(chan bool) 600 | var wg2 sync.WaitGroup 601 | wg2.Add(1) 602 | go func() { 603 | defer wg2.Done() 604 | 605 | for x := 0; x < 10; x++ { 606 | if snapw != nil { 607 | snapw.Close() 608 | } 609 | 610 | snapw = doMutate() 611 | if x == 0 { 612 | close(waiter) 613 | } 614 | } 615 | 616 | snap.Close() 617 | count := db.gcsnapshots.GetStats().NodeCount 618 | 619 | for count > 5 { 620 | time.Sleep(time.Second) 621 | count = db.gcsnapshots.GetStats().NodeCount 622 | } 623 | }() 624 | 625 | callb := func(itm *ItemEntry) { 626 | <-waiter 627 | } 628 | 629 | t0 := time.Now() 630 | err := db.StoreToDisk("db.dump", snap, 8, callb) 631 | if err != nil { 632 | t.Errorf("Expected no error. got=%v", err) 633 | } 634 | 635 | fmt.Printf("Storing to disk took %v\n", time.Since(t0)) 636 | 637 | wg2.Wait() 638 | snapw.Close() 639 | db.Close() 640 | 641 | db = NewWithConfig(conf) 642 | defer db.Close() 643 | t0 = time.Now() 644 | snap, err = db.LoadFromDisk("db.dump", 8, nil) 645 | defer snap.Close() 646 | if err != nil { 647 | t.Errorf("Expected no error. got=%v", err) 648 | } 649 | fmt.Printf("Loading from disk took %v\n", time.Since(t0)) 650 | 651 | count := CountItems(snap) 652 | if count != total { 653 | t.Errorf("Expected %v, got %v", total, count) 654 | } 655 | 656 | count = int(snap.Count()) 657 | if count != total { 658 | t.Errorf("Count mismatch on snapshot. 
Expected %d, got %d", total, count) 659 | } 660 | 661 | itr := snap.NewIterator() 662 | i := 0 663 | for itr.SeekFirst(); itr.Valid(); itr.Next() { 664 | itm := itr.Get() 665 | val := binary.BigEndian.Uint64(itm) 666 | exp := uint64(i) + uint64(2)*10000000 667 | 668 | if val != exp { 669 | t.Errorf("expected %d, got %d", exp, val) 670 | } 671 | i++ 672 | } 673 | itr.Close() 674 | 675 | fmt.Println(db.DumpStats()) 676 | fmt.Println("Restored", db.DeltaRestored) 677 | fmt.Println("RestoredFailed", db.DeltaRestoreFailed) 678 | } 679 | 680 | func TestExecuteConcurrGCWorkers(t *testing.T) { 681 | db := NewWithConfig(testConf) 682 | defer db.Close() 683 | 684 | w := db.NewWriter() 685 | 686 | // the test is valid only for UseMemoryMgmt since 687 | // nodefree is invoked only for memory mgmt case 688 | if !w.store.UseMemoryMgmt { 689 | return 690 | } 691 | 692 | for x := 0; x < 40; x++ { 693 | db.NewWriter() 694 | } 695 | 696 | for i := 0; i < 200000; i++ { 697 | w.Put([]byte(fmt.Sprintf("%010d", i))) 698 | } 699 | snap, _ := w.NewSnapshot() 700 | snap.Close() 701 | 702 | var snaps []*Snapshot 703 | for i := 0; i < 200000; i++ { 704 | if i%1000 == 0 { 705 | snap, _ := w.NewSnapshot() 706 | snaps = append(snaps, snap) 707 | } 708 | w.Delete([]byte(fmt.Sprintf("%010d", i))) 709 | } 710 | snap, _ = w.NewSnapshot() 711 | snaps = append(snaps, snap) 712 | 713 | barrier := w.store.GetAccesBarrier() 714 | bs := barrier.Acquire() 715 | barrier.Release(bs) 716 | for _, snap := range snaps { 717 | snap.Close() 718 | } 719 | 720 | for db.store.GetStats().NodeFrees != 200000 { 721 | time.Sleep(time.Millisecond) 722 | } 723 | } 724 | 725 | func TestCloseWithActiveIterators(t *testing.T) { 726 | var wg sync.WaitGroup 727 | db := NewWithConfig(testConf) 728 | 729 | w := db.NewWriter() 730 | for i := 0; i < 200000; i++ { 731 | w.Put([]byte(fmt.Sprintf("%010d", i))) 732 | } 733 | 734 | snap, _ := w.NewSnapshot() 735 | for i := 0; i < 1000; i++ { 736 | wg.Add(1) 737 | go func(wg *sync.WaitGroup) { 738 | defer wg.Done() 739 | 740 | if itr := db.NewIterator(snap); itr != nil { 741 | for x := 0; x < 5; x++ { 742 | for itr.SeekFirst(); itr.Valid(); itr.Next() { 743 | } 744 | } 745 | itr.Close() 746 | } 747 | }(&wg) 748 | } 749 | 750 | snap.Close() 751 | db.Close() 752 | wg.Wait() 753 | 754 | } 755 | 756 | func TestDiskCorruption(t *testing.T) { 757 | os.RemoveAll("db.dump") 758 | var wg sync.WaitGroup 759 | db := NewWithConfig(testConf) 760 | defer db.Close() 761 | n := (100000 / runtime.GOMAXPROCS(0)) * runtime.GOMAXPROCS(0) 762 | t0 := time.Now() 763 | for i := 0; i < runtime.GOMAXPROCS(0); i++ { 764 | wg.Add(1) 765 | w := db.NewWriter() 766 | go doInsert(db, w, &wg, n/runtime.GOMAXPROCS(0), true, false) 767 | } 768 | wg.Wait() 769 | fmt.Printf("Inserting %v items took %v\n", n, time.Since(t0)) 770 | snap0, _ := db.NewSnapshot() 771 | defer snap0.Close() 772 | snap, _ := db.NewSnapshot() 773 | fmt.Println(db.DumpStats()) 774 | 775 | t0 = time.Now() 776 | err := db.StoreToDisk("db.dump", snap, 8, nil) 777 | if err != nil { 778 | t.Errorf("Expected no error. 
got=%v", err) 779 | } 780 | 781 | fmt.Printf("Storing to disk took %v\n", time.Since(t0)) 782 | snap.Close() 783 | 784 | // Now open a shard file and corrupt it 785 | shard0 := filepath.Join("db.dump", "data") 786 | shard0 = filepath.Join(shard0, "shard-0") 787 | if cwr, err := os.OpenFile(shard0, os.O_WRONLY, 0755); err != nil { 788 | panic(err) 789 | } else { 790 | cwr.WriteAt([]byte("corrupt"), 100) 791 | cwr.Close() 792 | } 793 | 794 | db = NewWithConfig(testConf) 795 | defer db.Close() 796 | t0 = time.Now() 797 | snap, err = db.LoadFromDisk("db.dump", 8, nil) 798 | if err != ErrCorruptSnapshot { 799 | t.Errorf("Expected corrupted snapshot! got=%v", err) 800 | } 801 | fmt.Printf("Loading from disk took %v\n", time.Since(t0)) 802 | } 803 | 804 | func TestSnapshotStats(t *testing.T) { 805 | db := NewWithConfig(testConf) 806 | defer db.Close() 807 | 808 | var snaps []*Snapshot 809 | n := 2000 810 | snapFreq := 25 811 | 812 | w := db.NewWriter() 813 | for i := 0; i < n; i++ { 814 | w.Put([]byte(fmt.Sprintf("%010d", i))) 815 | 816 | if i % snapFreq == 0 { 817 | snap, _ := w.NewSnapshot() 818 | snaps = append(snaps, snap) 819 | } 820 | } 821 | 822 | var currSn, lastGCSn uint32 823 | 824 | if currSn = db.GetCurrSn(); currSn != uint32(1+n/snapFreq) { 825 | t.Errorf("Wrong currSn. Expected [%d], got [%d]", 1+n/snapFreq, currSn) 826 | } 827 | 828 | if lastGCSn = db.GetLastGCSn(); lastGCSn != 0 { 829 | t.Errorf("Wrong lastGCSn. Expected [%d], got [%d]", 0, lastGCSn) 830 | } 831 | 832 | // Close half of the snapshots 833 | numSnapsToClose := uint32(n/snapFreq/2) 834 | for _, snap := range snaps[:numSnapsToClose] { 835 | snap.Close() 836 | } 837 | 838 | if lastGCSn = db.GetLastGCSn(); lastGCSn != numSnapsToClose { 839 | t.Errorf("Wrong lastGCSn. Expected [%d], got [%d]", numSnapsToClose, lastGCSn) 840 | } 841 | 842 | // close remaining snapshots 843 | for _, snap := range snaps[numSnapsToClose:] { 844 | snap.Close() 845 | } 846 | 847 | if lastGCSn = db.GetLastGCSn(); lastGCSn != uint32(n/snapFreq) { 848 | t.Errorf("Wrong lastGCSn. 
Expected [%d], got [%d]", n/snapFreq, lastGCSn) 849 | } 850 | } 851 | 852 | func TestInsertDeleteConcurrent(t *testing.T) { 853 | var wgInsert, wgDelete sync.WaitGroup 854 | // in case of leaks from any prev test case 855 | oldAllocs, oldFrees := mm.GetAllocStats() 856 | 857 | db := NewWithConfig(testConf) 858 | //debug.SetGCPercent(-1) 859 | 860 | rand.Seed(time.Now().UnixNano()) 861 | tmin := 50 862 | tmax := 300 863 | 864 | n := (10000000 / runtime.GOMAXPROCS(0)) * runtime.GOMAXPROCS(0) 865 | chunk := n / runtime.GOMAXPROCS(0) 866 | 867 | var iwriters, dwriters []*Writer 868 | 869 | for i := 0; i < runtime.GOMAXPROCS(0); i++ { 870 | iwriters = append(iwriters, db.NewWriter()) 871 | } 872 | 873 | for i := 0; i < runtime.GOMAXPROCS(0); i++ { 874 | dwriters = append(dwriters, db.NewWriter()) 875 | } 876 | 877 | for i := 0; i < runtime.GOMAXPROCS(0); i++ { 878 | wgInsert.Add(1) 879 | go func(db *Nitro, w *Writer, wg *sync.WaitGroup, min_key, max_key int) { 880 | defer wg.Done() 881 | for val := min_key; val < max_key; val++ { 882 | buf := make([]byte, 8) 883 | binary.BigEndian.PutUint64(buf, uint64(val)) 884 | w.Put(buf) 885 | } 886 | x := rand.Intn(tmax-tmin+1) + tmin 887 | time.Sleep(time.Duration(x) * time.Microsecond) 888 | //runtime.GC() 889 | }(db, iwriters[i], &wgInsert, i*chunk, (i+1)*chunk) 890 | } 891 | 892 | var del_count int64 = 0 893 | 894 | for i := 0; i < runtime.GOMAXPROCS(0); i++ { 895 | wgDelete.Add(1) 896 | go func(db *Nitro, w *Writer, wg *sync.WaitGroup, min_key, max_key int) { 897 | defer wg.Done() 898 | for val := min_key; val < max_key; val++ { 899 | buf := make([]byte, 8) 900 | binary.BigEndian.PutUint64(buf, uint64(val)) 901 | if w.Delete(buf) { 902 | atomic.AddInt64(&del_count, 1) 903 | } 904 | } 905 | x := rand.Intn(tmax-tmin+1) + tmin + 10 906 | time.Sleep(time.Duration(x) * time.Microsecond) 907 | //runtime.GC() 908 | }(db, dwriters[i], &wgDelete, i*chunk/2, (i+1)*chunk/2) 909 | } 910 | 911 | wgInsert.Wait() 912 | wgDelete.Wait() 913 | 914 | // Verify Snapshot Scan after deletes 915 | 916 | snap, _ := db.NewSnapshot() 917 | // scans items 918 | got1 := CountItems(snap) 919 | // from snapshot info 920 | got2 := (int)(snap.Count()) 921 | fmt.Println("total items in snapshot:", got1, " items deleted:", del_count) 922 | if got1 != got2 { 923 | t.Errorf("snapshot count inconsistent, got1: %d got2: %d", 924 | got1, got2) 925 | } 926 | 927 | // Verify Skiplist Stats 928 | 929 | // snapshot item count should match node count 930 | nc := db.store.GetStats().NodeCount 931 | if got1 != nc { 932 | t.Errorf("snapshot count mismatch with node count, got1: %d nc: %d", 933 | got1, nc) 934 | } 935 | 936 | na := db.store.GetStats().NodeAllocs 937 | // node count should match node allocs - deleted items 938 | if na - del_count != int64(nc) { 939 | t.Errorf("node count :%d does not match nodeAllocs - deleted items %d-%d", 940 | nc, na, del_count) 941 | } 942 | snap.Close() 943 | db.Close() 944 | 945 | fmt.Println(db.DumpStats()) 946 | 947 | // Verify Memory Leaks 948 | 949 | a, b := mm.GetAllocStats() 950 | a = a - oldAllocs 951 | b = b - oldFrees 952 | if a-b != 0 { 953 | t.Errorf("Found memory leak: allocs %d, freed %d, delta %d", a, b, a-b) 954 | } else { 955 | fmt.Printf("allocs: %d frees: %d\n", a, b) 956 | } 957 | } 958 | -------------------------------------------------------------------------------- /nodelist.go: -------------------------------------------------------------------------------- 1 | // Copyright 2016-Present Couchbase, Inc. 
2 | // 3 | // Use of this software is governed by the Business Source License included in 4 | // the file licenses/BSL-Couchbase.txt. As of the Change Date specified in that 5 | // file, in accordance with the Business Source License, use of this software 6 | // will be governed by the Apache License, Version 2.0, included in the file 7 | // licenses/APL2.txt. 8 | 9 | package nitro 10 | 11 | import ( 12 | "bytes" 13 | "github.com/couchbase/nitro/skiplist" 14 | ) 15 | 16 | // NodeList is a linked list of skiplist nodes 17 | type NodeList struct { 18 | head *skiplist.Node 19 | } 20 | 21 | // NewNodeList creates new node list 22 | func NewNodeList(head *skiplist.Node) *NodeList { 23 | return &NodeList{ 24 | head: head, 25 | } 26 | } 27 | 28 | // Keys returns all keys from the node list 29 | func (l *NodeList) Keys() (keys [][]byte) { 30 | node := l.head 31 | for node != nil { 32 | key := (*Item)(node.Item()).Bytes() 33 | keys = append(keys, key) 34 | node = node.GetLink() 35 | } 36 | 37 | return 38 | } 39 | 40 | // Remove a key from the node list 41 | func (l *NodeList) Remove(key []byte) *skiplist.Node { 42 | var prev *skiplist.Node 43 | node := l.head 44 | for node != nil { 45 | nodeKey := (*Item)(node.Item()).Bytes() 46 | if bytes.Equal(nodeKey, key) { 47 | if prev == nil { 48 | l.head = node.GetLink() 49 | return node 50 | } 51 | 52 | prev.SetLink(node.GetLink()) 53 | return node 54 | } 55 | prev = node 56 | node = node.GetLink() 57 | } 58 | 59 | return nil 60 | } 61 | 62 | // Add a key into the node list 63 | func (l *NodeList) Add(node *skiplist.Node) { 64 | node.SetLink(l.head) 65 | l.head = node 66 | } 67 | 68 | // Head returns head node from the list 69 | func (l *NodeList) Head() *skiplist.Node { 70 | return l.head 71 | } 72 | -------------------------------------------------------------------------------- /nodelist_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2016-Present Couchbase, Inc. 2 | // 3 | // Use of this software is governed by the Business Source License included in 4 | // the file licenses/BSL-Couchbase.txt. As of the Change Date specified in that 5 | // file, in accordance with the Business Source License, use of this software 6 | // will be governed by the Apache License, Version 2.0, included in the file 7 | // licenses/APL2.txt. 
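NodeList threads skiplist nodes through their per-node link pointer, so a chain costs no extra allocations; Keys walks newest-first and Remove is a linear unlink. A short usage sketch (Put2 returns the underlying node, as the test below also shows):

    package main

    import (
        "fmt"

        "github.com/couchbase/nitro"
    )

    func main() {
        db := nitro.New()
        defer db.Close()

        w := db.NewWriter()
        list := nitro.NewNodeList(w.Put2([]byte("a")))
        list.Add(w.Put2([]byte("b")))
        list.Add(w.Put2([]byte("c")))

        for _, k := range list.Keys() { // newest first: c, b, a
            fmt.Println(string(k))
        }

        list.Remove([]byte("b"))
        fmt.Println(len(list.Keys())) // 2
    }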
8 | 9 | package nitro 10 | 11 | import ( 12 | "fmt" 13 | "testing" 14 | ) 15 | 16 | func TestNodeList(t *testing.T) { 17 | db := New() 18 | defer db.Close() 19 | 20 | n := 10 21 | var list *NodeList 22 | w := db.NewWriter() 23 | for i := 0; i < n; i++ { 24 | ptr := w.Put2([]byte(fmt.Sprintf("%010d", i))) 25 | if list == nil { 26 | list = NewNodeList(ptr) 27 | } else { 28 | list.Add(ptr) 29 | } 30 | } 31 | 32 | count := 0 33 | for i, k := range list.Keys() { 34 | expected := fmt.Sprintf("%010d", n-i-1) 35 | if expected != string(k) { 36 | t.Errorf("Expected %s, got %s", expected, string(k)) 37 | } 38 | count++ 39 | } 40 | 41 | if count != n { 42 | t.Errorf("Expected %d, got %d", n, count) 43 | } 44 | 45 | list.Remove([]byte(fmt.Sprintf("%010d", 2))) 46 | list.Remove([]byte(fmt.Sprintf("%010d", 5))) 47 | list.Remove([]byte(fmt.Sprintf("%010d", 8))) 48 | 49 | count = len(list.Keys()) 50 | if count != n-3 { 51 | t.Errorf("Expected %d, got %d", n-3, count) 52 | } 53 | 54 | for i := 10; i < 13; i++ { 55 | ptr := w.Put2([]byte(fmt.Sprintf("%010d", i))) 56 | list.Add(ptr) 57 | } 58 | 59 | count = len(list.Keys()) 60 | if count != n { 61 | t.Errorf("Expected %d, got %d", n, count) 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /nodetable/table.go: -------------------------------------------------------------------------------- 1 | // Copyright 2016-Present Couchbase, Inc. 2 | // 3 | // Use of this software is governed by the Business Source License included in 4 | // the file licenses/BSL-Couchbase.txt. As of the Change Date specified in that 5 | // file, in accordance with the Business Source License, use of this software 6 | // will be governed by the Apache License, Version 2.0, included in the file 7 | // licenses/APL2.txt. 8 | 9 | // Package nodetable implements high performance GC optimized Node lookup table 10 | // for Nitro index storage. This table is not thread-safe 11 | // 12 | // Golang map does not need to pay the cost of GC scans if you have native 13 | // fixed size types for both key and value. We use two tables for the node 14 | // lookup table implementation. Fast table and slow table. Fast table stores 15 | // maps crc32(key) to a uint64. Value is a pointer to a skiplist node. Highest 16 | // bit is used to indicate whether there is any hash collision for the crc32 17 | // key used. If the bit is set, that means we need to lookup second table, 18 | // which is the slow table. Slow table has multiple entries which are mapped 19 | // by the same crc32 key. 
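The fast table's value packs the node pointer and the collision flag into a single uint64, which is what keeps both maps free of GC-scannable pointers. A standalone sketch of that encoding, mirroring the encodePointer/decodePointer helpers below; the round-trip through uintptr is for illustration only — the real table stores pointers to skiplist nodes allocated outside the Go heap, which is what makes the integer round-trip safe:

    package main

    import (
        "fmt"
        "unsafe"
    )

    // encode stores the pointer in the low bits and the collision flag in bit 63.
    func encode(p unsafe.Pointer, conflict bool) uint64 {
        v := uint64(uintptr(p))
        if conflict {
            v |= 1 << 63
        }
        return v
    }

    // decode strips the flag bit and recovers both the pointer and the flag.
    func decode(v uint64) (unsafe.Pointer, bool) {
        return unsafe.Pointer(uintptr(v &^ (1 << 63))), v>>63 == 1
    }

    func main() {
        x := 42
        v := encode(unsafe.Pointer(&x), true)
        p, conflict := decode(v)
        fmt.Println(*(*int)(p), conflict) // 42 true
    }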
20 | package nodetable 21 | 22 | import "unsafe" 23 | import "fmt" 24 | import "github.com/couchbase/nitro/skiplist" 25 | 26 | var emptyResult ntResult 27 | 28 | const approxItemSize = 42 29 | 30 | var dbInstances *skiplist.Skiplist 31 | 32 | func init() { 33 | dbInstances = skiplist.New() 34 | } 35 | 36 | // EqualKeyFn implements key equality check 37 | type EqualKeyFn func(unsafe.Pointer, []byte) bool 38 | 39 | // HashFn implements 32bit hash function on a string 40 | type HashFn func([]byte) uint32 41 | 42 | // NodeTable describes lookup table 43 | type NodeTable struct { 44 | fastHT map[uint32]uint64 45 | slowHT map[uint32][]uint64 46 | fastHTCount uint64 47 | slowHTCount uint64 48 | conflicts uint64 49 | 50 | hash HashFn 51 | keyEqual EqualKeyFn 52 | 53 | res ntResult 54 | } 55 | 56 | // CompareNodeTable implements comparator for nodetable instances 57 | func CompareNodeTable(a, b unsafe.Pointer) int { 58 | return int(uintptr(a)) - int(uintptr(b)) 59 | } 60 | 61 | const ( 62 | ntNotFound = 0x00 63 | ntFoundInFast = 0x01 64 | ntFoundInSlow = 0x03 65 | ntFoundMask = 0x01 66 | ) 67 | 68 | type ntResult struct { 69 | status int 70 | hash uint32 71 | hasConflict bool 72 | fastHTHasEntry bool 73 | fastHTValue uint64 74 | slowHTValues []uint64 75 | slowHTPos int 76 | } 77 | 78 | // New creates a nodetable instance 79 | func New(hfn HashFn, kfn EqualKeyFn) *NodeTable { 80 | nt := &NodeTable{ 81 | fastHT: make(map[uint32]uint64), 82 | slowHT: make(map[uint32][]uint64), 83 | hash: hfn, 84 | keyEqual: kfn, 85 | } 86 | 87 | buf := dbInstances.MakeBuf() 88 | defer dbInstances.FreeBuf(buf) 89 | dbInstances.Insert(unsafe.Pointer(nt), CompareNodeTable, buf, &dbInstances.Stats) 90 | 91 | return nt 92 | } 93 | 94 | // Stats returns nodetable statistics 95 | func (nt *NodeTable) Stats() string { 96 | return fmt.Sprintf("{\n"+ 97 | `"FastHTCount": %d,`+"\n"+ 98 | `"SlowHTCount": %d,`+"\n"+ 99 | `"Conflicts": %d,`+"\n"+ 100 | `"MemoryInUse": %d`+"\n}", 101 | nt.fastHTCount, nt.slowHTCount, nt.conflicts, nt.MemoryInUse()) 102 | } 103 | 104 | // MemoryInUse returns memory used by nodetable instance 105 | func (nt *NodeTable) MemoryInUse() int64 { 106 | return int64(approxItemSize * (nt.fastHTCount + nt.slowHTCount)) 107 | } 108 | 109 | // Get returns node pointer for the lookup key 110 | func (nt *NodeTable) Get(key []byte) unsafe.Pointer { 111 | res := nt.find(key) 112 | if res.status&ntFoundMask == ntFoundMask { 113 | if res.status == ntFoundInFast { 114 | return decodePointer(res.fastHTValue) 115 | } 116 | return decodePointer(res.slowHTValues[res.slowHTPos]) 117 | } 118 | 119 | return nil 120 | } 121 | 122 | // Update inserts or replaces an existing entry 123 | func (nt *NodeTable) Update(key []byte, nptr unsafe.Pointer) (updated bool, oldPtr unsafe.Pointer) { 124 | res := nt.find(key) 125 | if res.status&ntFoundMask == ntFoundMask { 126 | // Found key, replace old pointer value with new one 127 | updated = true 128 | if res.status == ntFoundInFast { 129 | oldPtr = decodePointer(res.fastHTValue) 130 | nt.fastHT[res.hash] = encodePointer(nptr, res.hasConflict) 131 | } else { 132 | oldPtr = decodePointer(res.slowHTValues[res.slowHTPos]) 133 | res.slowHTValues[res.slowHTPos] = encodePointer(nptr, true) 134 | } 135 | } else { 136 | // Insert new key 137 | updated = false 138 | newSlowValue := res.fastHTHasEntry && !res.hasConflict 139 | // Key needs to be inserted into slowHT 140 | if res.hasConflict || newSlowValue { 141 | slowHTValues := nt.slowHT[res.hash] 142 | slowHTValues = append(slowHTValues, 
encodePointer(nptr, false)) 143 | nt.slowHT[res.hash] = slowHTValues 144 | // There is an entry already in the fastHT for same crc32 hash 145 | // We have inserted first entry into the slowHT. Now mark conflict bit. 146 | if newSlowValue { 147 | nt.fastHT[res.hash] = encodePointer(decodePointer(nt.fastHT[res.hash]), true) 148 | nt.conflicts++ 149 | } 150 | nt.slowHTCount++ 151 | } else { 152 | // Insert new item into fastHT 153 | nt.fastHT[res.hash] = encodePointer(nptr, false) 154 | nt.fastHTCount++ 155 | } 156 | } 157 | 158 | return 159 | } 160 | 161 | // Remove an item from the nodetable 162 | func (nt *NodeTable) Remove(key []byte) (success bool, nptr unsafe.Pointer) { 163 | res := nt.find(key) 164 | if res.status&ntFoundMask == ntFoundMask { 165 | success = true 166 | if res.status == ntFoundInFast { 167 | nptr = decodePointer(res.fastHTValue) 168 | // Key needs to be removed from fastHT. For that we need to move 169 | // an item present in slowHT and overwrite fastHT entry. 170 | if res.hasConflict { 171 | slowHTValues := nt.slowHT[res.hash] 172 | v := slowHTValues[0] // New fastHT candidate 173 | slowHTValues = append([]uint64(nil), slowHTValues[1:]...) 174 | nt.slowHTCount-- 175 | 176 | var conflict bool 177 | if len(slowHTValues) == 0 { 178 | delete(nt.slowHT, res.hash) 179 | nt.conflicts-- 180 | } else { 181 | conflict = true 182 | nt.slowHT[res.hash] = slowHTValues 183 | } 184 | 185 | nt.fastHT[res.hash] = encodePointer(decodePointer(v), conflict) 186 | } else { 187 | delete(nt.fastHT, res.hash) 188 | nt.fastHTCount-- 189 | } 190 | } else { 191 | nptr = decodePointer(res.slowHTValues[res.slowHTPos]) 192 | // Remove key from slowHT 193 | newSlowValue := append([]uint64(nil), res.slowHTValues[:res.slowHTPos]...) 194 | if res.slowHTPos+1 != len(res.slowHTValues) { 195 | newSlowValue = append(newSlowValue, res.slowHTValues[res.slowHTPos+1:]...) 
196 | } 197 | nt.slowHTCount-- 198 | 199 | if len(newSlowValue) == 0 { 200 | delete(nt.slowHT, res.hash) 201 | nt.fastHT[res.hash] = encodePointer(decodePointer(nt.fastHT[res.hash]), false) 202 | nt.conflicts-- 203 | } else { 204 | nt.slowHT[res.hash] = newSlowValue 205 | } 206 | } 207 | } 208 | return 209 | } 210 | 211 | func (nt *NodeTable) ItemsCount() int64 { 212 | return int64(nt.fastHTCount + nt.slowHTCount) 213 | } 214 | 215 | func decodePointer(v uint64) unsafe.Pointer { 216 | var x uintptr 217 | if unsafe.Sizeof(x) == 8 { 218 | ptr := uintptr(v & ^(uint64(1) << 63)) 219 | return unsafe.Pointer(ptr) 220 | } 221 | return unsafe.Pointer(uintptr(v & 0xffffffff)) 222 | } 223 | 224 | func encodePointer(p unsafe.Pointer, hasConflict bool) uint64 { 225 | v := uint64(uintptr(p)) 226 | if hasConflict { 227 | v |= 1 << 63 228 | } 229 | 230 | return v 231 | } 232 | 233 | func (nt *NodeTable) hasConflict(v uint64) bool { 234 | return v>>63 == 1 235 | } 236 | 237 | func (nt *NodeTable) isEqual(key []byte, v uint64) bool { 238 | p := decodePointer(v) 239 | return nt.keyEqual(p, key) 240 | } 241 | 242 | func (nt *NodeTable) find(key []byte) (res *ntResult) { 243 | nt.res = emptyResult 244 | res = &nt.res 245 | res.status = ntNotFound 246 | h := nt.hash(key) 247 | res.hash = h 248 | 249 | v, ok := nt.fastHT[h] 250 | res.fastHTHasEntry = ok 251 | if ok { 252 | res.hasConflict = nt.hasConflict(v) 253 | if nt.isEqual(key, v) { 254 | res.status = ntFoundInFast 255 | res.fastHTValue = v 256 | return 257 | } 258 | 259 | if res.hasConflict { 260 | if vs, ok := nt.slowHT[h]; ok { 261 | for i, v := range vs { 262 | if nt.isEqual(key, v) { 263 | res.slowHTPos = i 264 | res.slowHTValues = vs 265 | res.status = ntFoundInSlow 266 | return 267 | } 268 | } 269 | } 270 | } 271 | } 272 | 273 | return 274 | } 275 | 276 | // Close destroys the nodetable 277 | func (nt *NodeTable) Close() { 278 | nt.fastHTCount = 0 279 | nt.slowHTCount = 0 280 | nt.conflicts = 0 281 | nt.fastHT = make(map[uint32]uint64) 282 | nt.slowHT = make(map[uint32][]uint64) 283 | 284 | buf := dbInstances.MakeBuf() 285 | defer dbInstances.FreeBuf(buf) 286 | dbInstances.Delete(unsafe.Pointer(nt), CompareNodeTable, buf, &dbInstances.Stats) 287 | } 288 | 289 | // MemoryInUse returns total memory used by nodetables in a process 290 | func MemoryInUse() (sz int64) { 291 | buf := dbInstances.MakeBuf() 292 | defer dbInstances.FreeBuf(buf) 293 | iter := dbInstances.NewIterator(CompareNodeTable, buf) 294 | defer iter.Close() 295 | for iter.SeekFirst(); iter.Valid(); iter.Next() { 296 | db := (*NodeTable)(iter.Get()) 297 | sz += db.MemoryInUse() 298 | } 299 | 300 | return 301 | } 302 | -------------------------------------------------------------------------------- /nodetable/table_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2016-Present Couchbase, Inc. 2 | // 3 | // Use of this software is governed by the Business Source License included in 4 | // the file licenses/BSL-Couchbase.txt. As of the Change Date specified in that 5 | // file, in accordance with the Business Source License, use of this software 6 | // will be governed by the Apache License, Version 2.0, included in the file 7 | // licenses/APL2.txt. 
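Putting the pieces together: a NodeTable is driven entirely by the caller's hash and key-equality functions, stores only unsafe.Pointer values, and is not safe for concurrent use. A minimal usage sketch (the stored pointer here is an ordinary Go object purely for illustration; nitro stores skiplist nodes):

    package main

    import (
        "bytes"
        "fmt"
        "hash/crc32"
        "unsafe"

        "github.com/couchbase/nitro/nodetable"
    )

    type node struct{ key []byte }

    func main() {
        equal := func(p unsafe.Pointer, k []byte) bool {
            return bytes.Equal((*node)(p).key, k)
        }
        tab := nodetable.New(crc32.ChecksumIEEE, equal)
        defer tab.Close()

        n1 := &node{key: []byte("k1")}
        if updated, _ := tab.Update(n1.key, unsafe.Pointer(n1)); updated {
            fmt.Println("unexpected: key already present")
        }

        if p := tab.Get([]byte("k1")); p != nil {
            fmt.Println(string((*node)(p).key)) // k1
        }

        tab.Remove([]byte("k1"))
        fmt.Println(tab.ItemsCount()) // 0
    }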
8 | 9 | package nodetable 10 | 11 | import "testing" 12 | import "bytes" 13 | import "hash/crc32" 14 | import "unsafe" 15 | import "fmt" 16 | import "time" 17 | import "syscall" 18 | import "runtime/debug" 19 | 20 | type object struct { 21 | key []byte 22 | value int 23 | } 24 | 25 | func equalObject(p unsafe.Pointer, k []byte) bool { 26 | obj := (*object)(p) 27 | return bytes.Equal(obj.key, k) 28 | } 29 | 30 | func mkHashFun(h uint32) HashFn { 31 | return func([]byte) uint32 { 32 | return h 33 | } 34 | } 35 | 36 | func dumpTable(tab *NodeTable) { 37 | fmt.Println("==NodeTable==") 38 | count := 0 39 | for k, v := range tab.fastHT { 40 | o := (*object)(decodePointer(v)) 41 | fmt.Printf("hash:%d, keys:%s,", k, string(o.key)) 42 | count++ 43 | if vs, ok := tab.slowHT[k]; ok { 44 | for _, v := range vs { 45 | o := (*object)(decodePointer(v)) 46 | fmt.Printf("%s,", string(o.key)) 47 | count++ 48 | } 49 | } 50 | fmt.Println("") 51 | } 52 | 53 | fmt.Println("Total:", count) 54 | } 55 | 56 | func mkObject(key string, v int) *object { 57 | return &object{ 58 | key: []byte(key), 59 | value: v, 60 | } 61 | } 62 | 63 | func TestPointerEncode(t *testing.T) { 64 | o1 := unsafe.Pointer(mkObject("key", 1000)) 65 | v := encodePointer(o1, true) 66 | o2 := decodePointer(v) 67 | 68 | if o1 != o2 { 69 | t.Errorf("Expected encoded value to remain the same with conflict %p!=%p", o1, o2) 70 | } 71 | 72 | v = encodePointer(o1, false) 73 | o2 = decodePointer(v) 74 | 75 | if o1 != o2 { 76 | t.Errorf("Expected encoded value to remain the same without conflict %p!=%p", o1, o2) 77 | } 78 | } 79 | 80 | func TestInsertFastHT(t *testing.T) { 81 | table := New(mkHashFun(100), equalObject) 82 | o1 := mkObject("key", 1000) 83 | table.Update(o1.key, unsafe.Pointer(o1)) 84 | o2 := (*object)(table.Get(o1.key)) 85 | if o2 != o1 { 86 | t.Errorf("Expected same object") 87 | } 88 | } 89 | 90 | func TestInsertSlowHT(t *testing.T) { 91 | table := New(mkHashFun(100), equalObject) 92 | o1 := mkObject("key1", 1000) 93 | o2 := mkObject("key2", 2000) 94 | o3 := mkObject("key3", 3000) 95 | table.Update(o1.key, unsafe.Pointer(o1)) 96 | table.Update(o2.key, unsafe.Pointer(o2)) 97 | table.Update(o3.key, unsafe.Pointer(o3)) 98 | ro1 := (*object)(table.Get(o1.key)) 99 | ro2 := (*object)(table.Get(o2.key)) 100 | ro3 := (*object)(table.Get(o3.key)) 101 | if o1 != ro1 || o2 != ro2 || o3 != ro3 { 102 | t.Errorf("Expected same objects %p!=%p, %p!=%p, %p!=%p", o1, ro1, o2, ro2, o3, ro3) 103 | } 104 | } 105 | 106 | func TestUpdateFastHT(t *testing.T) { 107 | table := New(mkHashFun(100), equalObject) 108 | o1 := mkObject("key", 1000) 109 | o2 := mkObject("key", 2000) 110 | updated, old := table.Update(o1.key, unsafe.Pointer(o1)) 111 | if updated != false || old != nil { 112 | t.Errorf("Expected successful insert") 113 | } 114 | 115 | updated, old = table.Update(o2.key, unsafe.Pointer(o2)) 116 | if updated != true || (*object)(old) != o1 { 117 | t.Errorf("Expected old object to be returned") 118 | } 119 | } 120 | 121 | func TestUpdateSlowHT(t *testing.T) { 122 | table := New(mkHashFun(100), equalObject) 123 | o1 := mkObject("key0", 1000) 124 | o2 := mkObject("key1", 2000) 125 | o3 := mkObject("key1", 6000) 126 | updated, old := table.Update(o1.key, unsafe.Pointer(o1)) 127 | if updated != false || old != nil { 128 | t.Errorf("Expected successful insert") 129 | } 130 | 131 | table.Update(o2.key, unsafe.Pointer(o2)) 132 | updated, old = table.Update(o3.key, unsafe.Pointer(o3)) 133 | if updated != true || (*object)(old) != o2 { 134 | t.Errorf("Expected old 
object to be returned") 135 | } 136 | } 137 | 138 | func TestDeleteFastHT1(t *testing.T) { 139 | table := New(mkHashFun(100), equalObject) 140 | o1 := mkObject("key", 1000) 141 | table.Update(o1.key, unsafe.Pointer(o1)) 142 | o2 := (*object)(table.Get(o1.key)) 143 | if o2 != o1 { 144 | t.Errorf("Expected same object") 145 | } 146 | 147 | if success, _ := table.Remove(o1.key); success != true { 148 | t.Errorf("Expected successful remove") 149 | } 150 | 151 | o3 := (*object)(table.Get(o1.key)) 152 | if o3 != nil { 153 | t.Errorf("Expected not-found") 154 | } 155 | 156 | if success, _ := table.Remove(o1.key); success == true { 157 | t.Errorf("Expected remove fail") 158 | } 159 | } 160 | 161 | func TestDeleteFastHT2(t *testing.T) { 162 | table := New(mkHashFun(100), equalObject) 163 | o1 := mkObject("key1", 1000) 164 | o2 := mkObject("key2", 2000) 165 | o3 := mkObject("key3", 3000) 166 | table.Update(o1.key, unsafe.Pointer(o1)) 167 | table.Update(o2.key, unsafe.Pointer(o2)) 168 | table.Update(o3.key, unsafe.Pointer(o3)) 169 | 170 | if success, _ := table.Remove(o1.key); success != true { 171 | t.Errorf("Expected successful remove") 172 | } 173 | 174 | ro1 := (*object)(table.Get(o1.key)) 175 | ro2 := (*object)(table.Get(o2.key)) 176 | ro3 := (*object)(table.Get(o3.key)) 177 | 178 | if ro1 != nil { 179 | t.Errorf("Expected not found") 180 | } 181 | 182 | if ro2 != o2 || ro3 != o3 { 183 | t.Errorf("Expected to find those objects") 184 | } 185 | } 186 | 187 | func TestDeleteSlowHT1(t *testing.T) { 188 | table := New(mkHashFun(100), equalObject) 189 | o1 := mkObject("key1", 1000) 190 | o2 := mkObject("key2", 2000) 191 | o3 := mkObject("key3", 3000) 192 | table.Update(o1.key, unsafe.Pointer(o1)) 193 | table.Update(o2.key, unsafe.Pointer(o2)) 194 | table.Update(o3.key, unsafe.Pointer(o3)) 195 | 196 | if success, _ := table.Remove(o2.key); success != true { 197 | t.Errorf("Expected successful remove") 198 | } 199 | 200 | ro1 := (*object)(table.Get(o1.key)) 201 | ro2 := (*object)(table.Get(o2.key)) 202 | ro3 := (*object)(table.Get(o3.key)) 203 | 204 | if ro2 != nil { 205 | t.Errorf("Expected not found") 206 | } 207 | 208 | if ro1 != o1 || ro3 != o3 { 209 | t.Errorf("Expected to find those objects") 210 | } 211 | } 212 | 213 | func TestDeleteFastHT3(t *testing.T) { 214 | table := New(mkHashFun(100), equalObject) 215 | o1 := mkObject("key1", 1000) 216 | o2 := mkObject("key2", 2000) 217 | table.Update(o1.key, unsafe.Pointer(o1)) 218 | table.Update(o2.key, unsafe.Pointer(o2)) 219 | 220 | res := table.find(o1.key) 221 | if !table.hasConflict(res.fastHTValue) { 222 | t.Errorf("Expected conflict") 223 | } 224 | 225 | if success, _ := table.Remove(o2.key); success != true { 226 | t.Errorf("Expected successful remove") 227 | } 228 | 229 | ro1 := (*object)(table.Get(o1.key)) 230 | ro2 := (*object)(table.Get(o2.key)) 231 | 232 | if ro2 != nil { 233 | t.Errorf("Expected not found") 234 | } 235 | 236 | if ro1 != o1 { 237 | t.Errorf("Expected found") 238 | } 239 | 240 | res = table.find(o1.key) 241 | if table.hasConflict(res.fastHTValue) { 242 | t.Errorf("Expected no conflict") 243 | } 244 | 245 | } 246 | 247 | func TestSimple(t *testing.T) { 248 | table := New(crc32.ChecksumIEEE, equalObject) 249 | o1 := mkObject("key1", 100) 250 | o2 := mkObject("key1", 200) 251 | updated, old := table.Update(o1.key, unsafe.Pointer(o1)) 252 | if updated == true || old != nil { 253 | t.Errorf("Expected update=false, old=nil") 254 | } 255 | 256 | updated, old = table.Update(o2.key, unsafe.Pointer(o2)) 257 | if updated == false || 
old == nil { 258 | t.Errorf("Expected update=true, old object to be returned") 259 | } 260 | o3 := table.Get(o1.key) 261 | if o3 == nil { 262 | t.Errorf("Expected non nil") 263 | } else { 264 | o4 := (*object)(o3) 265 | if o4.value != 200 { 266 | t.Errorf("Expected value = 200") 267 | } 268 | } 269 | } 270 | 271 | func TestLargeConflicts(t *testing.T) { 272 | n := 100000 273 | hfn := func(k []byte) uint32 { 274 | return crc32.ChecksumIEEE(k) % 1000 275 | } 276 | table := New(hfn, equalObject) 277 | objects := make([]*object, n) 278 | for i := 0; i < n; i++ { 279 | objects[i] = mkObject(fmt.Sprintf("key-%d", i), i) 280 | updated, _ := table.Update(objects[i].key, unsafe.Pointer(objects[i])) 281 | if updated { 282 | t.Errorf("Expected insert") 283 | } 284 | ptr := table.Get(objects[i].key) 285 | if (*object)(ptr) != objects[i] { 286 | t.Errorf("%s Expected object %p, not %p", objects[i].key, objects[i], ptr) 287 | dumpTable(table) 288 | } 289 | } 290 | 291 | for i := 0; i < n; i++ { 292 | ptr := table.Get(objects[i].key) 293 | if (*object)(ptr) != objects[i] { 294 | t.Errorf("Expected to find the object %s %v", string(objects[i].key), ptr) 295 | res := table.find(objects[i].key) 296 | fmt.Println(res) 297 | fmt.Println(table.Stats()) 298 | dumpTable(table) 299 | t.Fatalf("failed") 300 | } 301 | } 302 | 303 | } 304 | 305 | func TestMemoryOverhead(t *testing.T) { 306 | n := 100000 307 | table := New(crc32.ChecksumIEEE, equalObject) 308 | objects := make([]*object, n) 309 | for i := 0; i < n; i++ { 310 | objects[i] = mkObject(fmt.Sprintf("key-%d", i), i) 311 | } 312 | 313 | var rusage1, rusage2 syscall.Rusage 314 | debug.FreeOSMemory() 315 | syscall.Getrusage(syscall.RUSAGE_SELF, &rusage1) 316 | for i := 0; i < n; i++ { 317 | table.Update(objects[i].key, unsafe.Pointer(objects[i])) 318 | } 319 | debug.FreeOSMemory() 320 | syscall.Getrusage(syscall.RUSAGE_SELF, &rusage2) 321 | 322 | rss := (rusage2.Maxrss - rusage1.Maxrss) 323 | fmt.Println("Memory used for hashtable:", rss) 324 | fmt.Println("Overhead per item:", float32(rss)/float32(n)) 325 | } 326 | 327 | func TestPerf(t *testing.T) { 328 | n := 10000000 329 | table := New(crc32.ChecksumIEEE, equalObject) 330 | objects := make([]*object, n) 331 | newobjects := make([]*object, n) 332 | for i := 0; i < n; i++ { 333 | objects[i] = mkObject(fmt.Sprintf("key-%d", i), i) 334 | newobjects[i] = mkObject(fmt.Sprintf("key-%d", i), i+100) 335 | } 336 | 337 | t0 := time.Now() 338 | for i := 0; i < n; i++ { 339 | updated, last := table.Update(objects[i].key, unsafe.Pointer(objects[i])) 340 | if updated == true || last != nil { 341 | t.Errorf("Expected updated=false") 342 | } 343 | } 344 | dur := time.Since(t0) 345 | fmt.Printf("Insert took %v for %v items, %v/s\n", dur, n, float32(n)/float32(dur.Seconds())) 346 | 347 | t0 = time.Now() 348 | for i := 0; i < n; i++ { 349 | ptr := table.Get(objects[i].key) 350 | if ptr == nil { 351 | t.Fatalf("Expected to find the object") 352 | } 353 | 354 | o := (*object)(ptr) 355 | if o != objects[i] { 356 | t.Errorf("Received unexpected object") 357 | } 358 | } 359 | dur = time.Since(t0) 360 | fmt.Printf("Get took %v for %v items, %v/s\n", dur, n, float32(n)/float32(dur.Seconds())) 361 | 362 | t0 = time.Now() 363 | for i := 0; i < n; i++ { 364 | updated, last := table.Update(objects[i].key, unsafe.Pointer(objects[i])) 365 | if updated == false || (*object)(last) != objects[i] { 366 | t.Errorf("Expected updated=true") 367 | } 368 | } 369 | dur = time.Since(t0) 370 | fmt.Printf("Update took %v for %v items, %v/s\n", dur, n, float32(n)/float32(dur.Seconds())) 371 | 
fmt.Println("Table stats:", table.Stats()) 372 | } 373 | 374 | func TestItemsCount(t *testing.T) { 375 | testItemsCount := func(hashFn HashFn) { 376 | table := New(hashFn, equalObject) 377 | o1 := mkObject("key1", 1000) 378 | o2 := mkObject("key2", 2000) 379 | o3 := mkObject("key3", 3000) 380 | table.Update(o1.key, unsafe.Pointer(o1)) 381 | table.Update(o2.key, unsafe.Pointer(o2)) 382 | table.Update(o3.key, unsafe.Pointer(o3)) 383 | 384 | itemsCount := table.ItemsCount() 385 | if itemsCount != 3 { 386 | t.Errorf("Expected 3 items but got %d", itemsCount) 387 | } 388 | 389 | if success, _ := table.Remove(o2.key); success != true { 390 | t.Errorf("Expected successful remove") 391 | } 392 | 393 | itemsCount = table.ItemsCount() 394 | if itemsCount != 2 { 395 | t.Errorf("Expected 2 items but got %d", itemsCount) 396 | } 397 | } 398 | 399 | // Test with collisions 400 | testItemsCount(mkHashFun(100)) 401 | 402 | // Test without collisions 403 | testItemsCount(crc32.ChecksumIEEE) 404 | } 405 | -------------------------------------------------------------------------------- /skiplist/C/main.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2016-Present Couchbase, Inc. 2 | // 3 | // Use of this software is governed by the Business Source License included in 4 | // the file licenses/BSL-Couchbase.txt. As of the Change Date specified in that 5 | // file, in accordance with the Business Source License, use of this software 6 | // will be governed by the Apache License, Version 2.0, included in the file 7 | // licenses/APL2.txt. 8 | #include "skiplist.hh" 9 | #include 10 | #include 11 | #include 12 | 13 | static unsigned int seed; 14 | 15 | void insert(Skiplist *s, int n, bool is_rand) { 16 | for (int x=0; x < n; x++) { 17 | unsigned r; 18 | if (is_rand) { 19 | r = rand_r(&seed); 20 | } else { 21 | r = x; 22 | } 23 | int *v = (int *) skiplist_malloc(sizeof(int)); 24 | *v = r; 25 | Item *itm = newItem(v, sizeof(int)); 26 | Skiplist_Insert(s, itm); 27 | } 28 | } 29 | 30 | void lookup(Skiplist *s, int n) { 31 | Node *preds[MaxLevel], *succs[MaxLevel]; 32 | for (int x=0; x < n; x++) { 33 | unsigned r = rand_r(&seed); 34 | int *v = (int *) skiplist_malloc(sizeof(int)); 35 | *v = r % n; 36 | Item *itm = newItem(v, sizeof(int)); 37 | Skiplist_findPath(s, itm, preds, succs); 38 | skiplist_free(itm); 39 | } 40 | } 41 | 42 | int main() { 43 | 44 | srand(time(NULL)); 45 | int i = 100; 46 | Skiplist *s = newSkiplist(); 47 | std::vector threads; 48 | 49 | insert(s, 10000000, false); 50 | 51 | time_t t0 = time(NULL); 52 | /* 53 | for (int x=0; x < 8; x++) { 54 | threads.push_back(std::thread(&insert,s, 1000000, true)); 55 | } 56 | */ 57 | for (int x=0; x < 8; x++) { 58 | threads.push_back(std::thread(&lookup,s, 1000000)); 59 | } 60 | 61 | for (auto& th : threads) th.join(); 62 | std::cout<<"took "<<(time(NULL)-t0)<<"s"<head; 67 | while (p) { 68 | if (p->itm->l == 4) { 69 | count++; 70 | // std::cout<<"itm "<itm->data))< 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | 19 | using namespace std; 20 | 21 | static int MaxLevel = 32; 22 | static float p = 0.25; 23 | 24 | void *skiplist_malloc(size_t sz) { 25 | return malloc(sz); 26 | } 27 | 28 | void skiplist_free(void *p) { 29 | free(p); 30 | } 31 | 32 | struct Node; 33 | 34 | typedef struct NodeRef { 35 | struct Node *ptr; 36 | bool deleted; 37 | } NodeRef; 38 | 39 | typedef struct Item { 40 | int l; 41 | void *data; 42 | } Item; 43 | 44 | typedef struct Node { 45 | NodeRef * 
volatile *next; 46 | Item *itm; 47 | uint16_t level; 48 | } Node; 49 | 50 | typedef struct Skiplist { 51 | Node *head; 52 | Node *tail; 53 | uint16_t level; 54 | unsigned int randSeed; 55 | } Skiplist; 56 | 57 | Item *newItem(void *data, int l) { 58 | Item *i = (Item *) skiplist_malloc(sizeof(Item)); 59 | i->data = data; 60 | i->l = l; 61 | return i; 62 | } 63 | 64 | int Item_Compare(Item *itm1, Item *itm2) { 65 | if (itm2 == NULL || itm2->l == INT_MAX) { 66 | return 1; 67 | } 68 | 69 | if (itm1->l == INT_MIN) { 70 | return -1; 71 | } 72 | 73 | if (itm1->l == INT_MAX) { 74 | return 1; 75 | } 76 | 77 | int l = min(itm1->l,itm2->l); 78 | return memcmp(itm1->data, itm2->data, l); 79 | } 80 | 81 | NodeRef *newRef(Node *ptr, bool deleted) { 82 | NodeRef *n = (NodeRef *) skiplist_malloc(sizeof(NodeRef)); 83 | n->ptr = ptr; 84 | n->deleted = deleted; 85 | return n; 86 | } 87 | 88 | Node *newNode(Item *itm, int level) { 89 | Node *n = (Node *) skiplist_malloc(sizeof(Node)); 90 | n->level = (uint16_t) level; 91 | n->itm = itm; 92 | n->next = (NodeRef **) skiplist_malloc(sizeof(NodeRef*) * (level+1)); 93 | 94 | return n; 95 | } 96 | 97 | void Node_setNext(Node *n, int level, Node *ptr, bool deleted) { 98 | n->next[level] = newRef(ptr, deleted); 99 | } 100 | 101 | NodeRef Node_getNext(Node *n, int level) { 102 | NodeRef null = {NULL, false}; 103 | NodeRef *ref = (NodeRef *) __atomic_load_n(&n->next[level], __ATOMIC_RELAXED); 104 | if (ref != NULL) { 105 | return *ref; 106 | } 107 | 108 | return null; 109 | } 110 | 111 | bool Node_dcasNext(Node *n, int level, Node *prevPtr, Node *newPtr, 112 | bool prevIsdeleted, bool newIsdeleted) { 113 | 114 | bool swapped = false; 115 | NodeRef * volatile *addr = &n->next[level]; 116 | NodeRef *ref = (NodeRef *) __atomic_load_n(addr, __ATOMIC_RELAXED); 117 | 118 | if (ref != NULL) { 119 | if (ref->ptr == prevPtr && ref->deleted == prevIsdeleted) { 120 | swapped = __sync_bool_compare_and_swap(addr, ref, newRef(newPtr, newIsdeleted)); 121 | } 122 | } 123 | 124 | return swapped; 125 | } 126 | 127 | Skiplist *newSkiplist() { 128 | Skiplist *s; 129 | Item *minItem, *maxItem; 130 | Node *head, *tail; 131 | 132 | srand(time(NULL)); 133 | 134 | minItem = newItem(NULL, INT_MIN); 135 | maxItem = newItem(NULL, INT_MAX); 136 | 137 | head = newNode(minItem, MaxLevel); 138 | tail = newNode(maxItem, MaxLevel); 139 | 140 | for (int i=0; i <= MaxLevel; i++) { 141 | Node_setNext(head, i, tail, false); Node_setNext(tail, i, NULL, false); 142 | } 143 | 144 | s = (Skiplist *) skiplist_malloc(sizeof(Skiplist)); 145 | s->head = head; 146 | s->tail = tail; 147 | s->level = 0; 148 | s->randSeed = (unsigned int) time(NULL); 149 | return s; 150 | } 151 | 152 | float Skiplist_randFloat(Skiplist *s) { 153 | return (float)rand_r(&s->randSeed) / (float)RAND_MAX; 154 | } 155 | 156 | int Skiplist_randomLevel(Skiplist *s) { 157 | int nextLevel = 0; 158 | int level; 159 | 160 | for (; Skiplist_randFloat(s) < p; nextLevel++) { 161 | } 162 | 163 | if (nextLevel > MaxLevel) { 164 | nextLevel = MaxLevel; 165 | } 166 | 167 | level = (int) __atomic_load_n(&s->level, __ATOMIC_RELAXED); 168 | if (nextLevel > level) { 169 | __sync_bool_compare_and_swap(&s->level, level, level+1); 170 | nextLevel = level + 1; 171 | } 172 | return nextLevel; 173 | } 174 | 175 | bool Skiplist_findPath(Skiplist *s, Item *itm, Node *preds[], Node *succs[]) { 176 | int cmpVal = 1; 177 | int level; 178 | Node *prev, *curr; 179 | NodeRef curRef, nextRef; 180 | 181 | retry: 182 | prev = s->head; 183 | level = (int) __atomic_load_n(&s->level, __ATOMIC_RELAXED); 184 | for (int i=level; i>=0; i--) { 185 | curRef = 
Node_getNext(prev, i); 186 | levelSearch: 187 | while (1) { 188 | curr = curRef.ptr; 189 | nextRef = Node_getNext(curr, i); 190 | 191 | cmpVal = Item_Compare(curr->itm, itm); 192 | if (cmpVal < 0) { 193 | prev = curr; 194 | curRef = Node_getNext(prev, i); 195 | curr = curRef.ptr; 196 | } else { 197 | break; 198 | } 199 | } 200 | 201 | preds[i] = prev; 202 | succs[i] = curr; 203 | } 204 | 205 | if (cmpVal == 0) { 206 | return true; 207 | } 208 | 209 | return false; 210 | } 211 | 212 | 213 | void Skiplist_Insert(Skiplist *s, Item *itm) { 214 | int itemLevel = Skiplist_randomLevel(s); 215 | Node *x = newNode(itm, itemLevel); 216 | Node *preds[MaxLevel+1], *succs[MaxLevel+1]; 217 | 218 | retry: 219 | Skiplist_findPath(s, itm, preds, succs); 220 | 221 | Node_setNext(x, 0, succs[0], false); 222 | if (!Node_dcasNext(preds[0], 0, succs[0], x, false, false)) { 223 | goto retry; 224 | } 225 | 226 | for (int i=1; i <= int(itemLevel); i++) { 227 | fixThisLevel: 228 | while (1) { 229 | Node_setNext(x, i, succs[i], false); 230 | if (Node_dcasNext(preds[i], i, succs[i], x, false, false)) { 231 | break; 232 | } 233 | Skiplist_findPath(s, itm, preds, succs); 234 | } 235 | } 236 | } 237 | 238 | 239 | #endif 240 | -------------------------------------------------------------------------------- /skiplist/access_barrier.go: -------------------------------------------------------------------------------- 1 | // Copyright 2016-Present Couchbase, Inc. 2 | // 3 | // Use of this software is governed by the Business Source License included in 4 | // the file licenses/BSL-Couchbase.txt. As of the Change Date specified in that 5 | // file, in accordance with the Business Source License, use of this software 6 | // will be governed by the Apache License, Version 2.0, included in the file 7 | // licenses/APL2.txt. 8 | 9 | package skiplist 10 | 11 | import ( 12 | "math" 13 | "sync" 14 | "sync/atomic" 15 | "unsafe" 16 | ) 17 | 18 | /* 19 | * Algorithm: 20 | * The access barrier is used to facilitate safe memory reclamation in the lockfree 21 | * skiplist. Every skiplist access needs to pass through a gate which tracks 22 | * the safety primitives to figure out the right time to deallocate a 23 | * skiplist node. 24 | * 25 | * Even though the lockfree skiplist deletion algorithm takes care of completely unlinking 26 | * a node from the skiplist, there could still be a small period during which the 27 | * deleted node is accessible to already live skiplist accessors. We need to wait for 28 | * a safe period before the memory for the node can be deallocated. 29 | * 30 | * In this algorithm, the unit of the safety period is called a barrier session. All the 31 | * live accessors of the skiplist are tracked in a barrier session. Whenever a 32 | * skiplist delete or a group of deletes is performed, the current barrier session is 33 | * closed and a new barrier session is started. The previous barrier 34 | * session tracks all the live accessors until the session is closed. The right 35 | * time to safely reclaim the node is when all of its accessors become dead. This makes 36 | * sure that the unlinked node is invisible to everyone. The accessors in the 37 | * barrier session can cooperatively detect and mark when each of them terminates. 38 | * When the last accessor leaves, it can take the action of calling the destructor 39 | * for the node, and the barrier session terminates. 
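 * An illustrative timeline, added for clarity (the session and reader names below are hypothetical):
 *   t0: session S1 is active; readers R1 and R2 acquire S1 (liveCount=2)
 *   t1: a delete unlinks node N; S1 is closed, a new session S2 is installed,
 *       and N is queued for reclamation against S1
 *   t2: newly arriving readers acquire S2; R1 releases S1 (liveCount=1)
 *   t3: R2 releases S1 (liveCount=0); the destructor can now safely free N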
 40 | * 41 | * Closing and installing a new barrier session: 42 | * A session's liveCount is incremented every time an accessor enters the skiplist 43 | * and decremented when it leaves. When a session is closed and a new 44 | * one needs to be installed, we just swap the global barrier session reference. 45 | * There could be race conditions while a session is being marked as closed. An 46 | * ongoing skiplist accessor can still increment the counter of a session which was marked 47 | * as closed. To detect those accessors and make them retry, we add a large number 48 | * to the liveCount as part of the session close phase. When an accessor finds 49 | * that the incremented result is greater than that large offset, it needs to back off 50 | * from the current session and acquire the new session to increment the count. In this 51 | * scheme, whoever decrements the count and finds it equal to the large offset 52 | * is responsible for deallocation of the object. 53 | * 54 | * The algorithm has to consider one more condition before it can call the destructor for 55 | * the session. Multiple closed sessions can be active at a time. We cannot call the 56 | * destructor for a closed session while a previous closed session has still not terminated. 57 | * Because, even though the accessor count of a closed session has dropped to zero, accessors from a previous 58 | * closed session could still access items in the later closed session. Hence, a closed session 59 | * can be terminated only after the termination of all previous closed sessions. 60 | * */ 61 | 62 | // BarrierSessionDestructor is a callback for SMR based reclaim of objects 63 | type BarrierSessionDestructor func(objectRef unsafe.Pointer) 64 | 65 | const barrierFlushOffset = math.MaxInt32 / 2 66 | 67 | // BarrierSession handle tracks the live accessors of a barrier session 68 | type BarrierSession struct { 69 | liveCount *int32 70 | objectRef unsafe.Pointer 71 | seqno uint64 72 | closed int32 73 | } 74 | 75 | // CompareBS is a barrier session comparator based on seqno 76 | func CompareBS(this, that unsafe.Pointer) int { 77 | thisItm := (*BarrierSession)(this) 78 | thatItm := (*BarrierSession)(that) 79 | 80 | return int(thisItm.seqno) - int(thatItm.seqno) 81 | } 82 | 83 | func newBarrierSession() *BarrierSession { 84 | bs := &BarrierSession{ 85 | liveCount: new(int32), 86 | } 87 | 88 | return bs 89 | } 90 | 91 | // AccessBarrier is the SMR core data structure for the skiplist 92 | type AccessBarrier struct { 93 | activeSeqno uint64 94 | session unsafe.Pointer 95 | callb BarrierSessionDestructor 96 | 97 | freeq *Skiplist 98 | freeSeqno uint64 99 | isDestructorRunning int32 100 | 101 | numAllocated int64 102 | numFreed int64 103 | 104 | active bool 105 | sync.Mutex 106 | } 107 | 108 | func newAccessBarrier(active bool, callb BarrierSessionDestructor) *AccessBarrier { 109 | ab := &AccessBarrier{ 110 | active: active, 111 | session: unsafe.Pointer(newBarrierSession()), 112 | callb: callb, 113 | numAllocated: 1, 114 | } 115 | if active { 116 | ab.freeq = New() 117 | } 118 | return ab 119 | } 120 | 121 | func (ab *AccessBarrier) GetStats() (int64, int64, int64, uint64) { 122 | if ab.freeq != nil { 123 | return ab.numAllocated, ab.numFreed, int64(ab.freeq.GetStats().NodeCount), ab.freeSeqno 124 | } 125 | return ab.numAllocated, ab.numFreed, 0, ab.freeSeqno 126 | } 127 | 128 | func (ab *AccessBarrier) doCleanup() { 129 | buf1 := ab.freeq.MakeBuf() 130 | buf2 := ab.freeq.MakeBuf() 131 | defer ab.freeq.FreeBuf(buf1) 132 | defer 
ab.freeq.FreeBuf(buf2) 133 | 134 | iter := ab.freeq.NewIterator(CompareBS, buf1) 135 | defer iter.Close() 136 | 137 | for iter.SeekFirst(); iter.Valid(); iter.Next() { 138 | node := iter.GetNode() 139 | bs := (*BarrierSession)(node.Item()) 140 | if bs.seqno != ab.freeSeqno+1 { 141 | return 142 | } 143 | 144 | ab.freeSeqno++ 145 | ab.callb(bs.objectRef) 146 | ab.freeq.DeleteNode(node, CompareBS, buf2, &ab.freeq.Stats) 147 | ab.numFreed++ 148 | } 149 | } 150 | 151 | // Acquire marks the entry of an accessor into the skiplist 152 | func (ab *AccessBarrier) Acquire() *BarrierSession { 153 | if ab.active { 154 | retry: 155 | bs := (*BarrierSession)(atomic.LoadPointer(&ab.session)) 156 | liveCount := atomic.AddInt32(bs.liveCount, 1) 157 | if liveCount > barrierFlushOffset { 158 | ab.Release(bs) 159 | goto retry 160 | } 161 | 162 | return bs 163 | } 164 | 165 | return nil 166 | } 167 | 168 | // Release marks the exit of an accessor from the skiplist 169 | func (ab *AccessBarrier) Release(bs *BarrierSession) { 170 | if ab.active { 171 | liveCount := atomic.AddInt32(bs.liveCount, -1) 172 | if liveCount == barrierFlushOffset { 173 | buf := ab.freeq.MakeBuf() 174 | defer ab.freeq.FreeBuf(buf) 175 | 176 | // Accessors which entered a closed barrier session step down automatically. 177 | // But they may try to close an already closed session. 178 | if atomic.AddInt32(&bs.closed, 1) == 1 { 179 | if !ab.freeq.Insert(unsafe.Pointer(bs), CompareBS, buf, &ab.freeq.Stats) { 180 | panic("unable to insert barrier session into free list") 181 | } 182 | if atomic.CompareAndSwapInt32(&ab.isDestructorRunning, 0, 1) { 183 | ab.doCleanup() 184 | atomic.CompareAndSwapInt32(&ab.isDestructorRunning, 1, 0) 185 | } 186 | } 187 | } else if liveCount < 0 || liveCount == barrierFlushOffset-1 { 188 | panic("Unsafe memory reclamation detected") 189 | } 190 | } 191 | } 192 | 193 | // FlushSession closes the current barrier session and starts a new session. 194 | // The caller should provide the object reference to be freed once the closed session terminates. 195 | func (ab *AccessBarrier) FlushSession(ref unsafe.Pointer) { 196 | if ab.active { 197 | ab.Lock() 198 | defer ab.Unlock() 199 | 200 | bsPtr := atomic.LoadPointer(&ab.session) 201 | newBsPtr := unsafe.Pointer(newBarrierSession()) 202 | atomic.CompareAndSwapPointer(&ab.session, bsPtr, newBsPtr) 203 | bs := (*BarrierSession)(bsPtr) 204 | bs.objectRef = ref 205 | ab.activeSeqno++ 206 | bs.seqno = ab.activeSeqno 207 | ab.numAllocated++ 208 | 209 | atomic.AddInt32(bs.liveCount, barrierFlushOffset+1) 210 | ab.Release(bs) 211 | } 212 | } 213 | -------------------------------------------------------------------------------- /skiplist/builder.go: -------------------------------------------------------------------------------- 1 | // Copyright 2016-Present Couchbase, Inc. 2 | // 3 | // Use of this software is governed by the Business Source License included in 4 | // the file licenses/BSL-Couchbase.txt. As of the Change Date specified in that 5 | // file, in accordance with the Business Source License, use of this software 6 | // will be governed by the Apache License, Version 2.0, included in the file 7 | // licenses/APL2.txt. 
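The builder below performs a concurrent bottom-up build: each worker appends pre-sorted keys to its own Segment, and Assemble stitches the segments together, in key order, into a single skiplist. A minimal usage sketch (not part of the source; it assumes the package is imported as skiplist, a sync.WaitGroup is available, and the workers receive disjoint, ascending key ranges):

    b := skiplist.NewBuilder()
    segs := make([]*skiplist.Segment, 4)
    var wg sync.WaitGroup
    for i := range segs {
        segs[i] = b.NewSegment()
        wg.Add(1)
        go func(i int) {
            defer wg.Done()
            // Each segment gets its own ascending, disjoint key range.
            for k := i * 1000; k < (i+1)*1000; k++ {
                segs[i].Add(skiplist.NewIntKeyItem(k))
            }
        }(i)
    }
    wg.Wait()
    s := b.Assemble(segs...) // link segments head-to-tail into one skiplist
    _ = s

Since Add simply appends at each segment's tail and Assemble concatenates segments level by level, the sorted-and-disjoint input assumption is what keeps the resulting skiplist ordered.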
8 | 9 | package skiplist 10 | 11 | import "math/rand" 12 | import "unsafe" 13 | 14 | // NodeCallback is used by segment builder 15 | type NodeCallback func(*Node) 16 | 17 | // Segment is a skiplist segment 18 | type Segment struct { 19 | builder *Builder 20 | tail []*Node 21 | head []*Node 22 | rand *rand.Rand 23 | callb NodeCallback 24 | count uint64 25 | 26 | sts Stats 27 | } 28 | 29 | // SetNodeCallback sets callback for segment builder 30 | func (s *Segment) SetNodeCallback(fn NodeCallback) { 31 | s.callb = fn 32 | } 33 | 34 | // Add an item into skiplist segment 35 | func (s *Segment) Add(itm unsafe.Pointer) { 36 | itemLevel := s.builder.store.NewLevel(s.rand.Float32) 37 | x := s.builder.store.newNode(itm, itemLevel) 38 | s.sts.AddInt64(&s.sts.nodeAllocs, 1) 39 | s.sts.AddInt64(&s.sts.levelNodesCount[itemLevel], 1) 40 | s.sts.AddInt64(&s.sts.usedBytes, int64(s.builder.store.Size(x))) 41 | 42 | for l := 0; l <= itemLevel; l++ { 43 | if s.tail[l] != nil { 44 | s.tail[l].setNext(l, x, false) 45 | } else { 46 | s.head[l] = x 47 | } 48 | s.tail[l] = x 49 | } 50 | 51 | if s.callb != nil { 52 | s.callb(x) 53 | } 54 | } 55 | 56 | // Builder performs concurrent bottom-up skiplist build 57 | type Builder struct { 58 | store *Skiplist 59 | } 60 | 61 | // SetItemSizeFunc configures items size function 62 | func (b *Builder) SetItemSizeFunc(fn ItemSizeFn) { 63 | b.store.ItemSize = fn 64 | } 65 | 66 | // NewSegment creates a new skiplist segment 67 | func (b *Builder) NewSegment() *Segment { 68 | seg := &Segment{tail: make([]*Node, MaxLevel+1), 69 | head: make([]*Node, MaxLevel+1), builder: b, 70 | rand: rand.New(rand.NewSource(int64(rand.Int()))), 71 | } 72 | 73 | seg.sts.IsLocal(true) 74 | return seg 75 | } 76 | 77 | // Assemble multiple skiplist segments and form a parent skiplist 78 | func (b *Builder) Assemble(segments ...*Segment) *Skiplist { 79 | tail := make([]*Node, MaxLevel+1) 80 | head := make([]*Node, MaxLevel+1) 81 | 82 | for _, seg := range segments { 83 | for l := 0; l <= MaxLevel; l++ { 84 | if tail[l] != nil && seg.head[l] != nil { 85 | tail[l].setNext(l, seg.head[l], false) 86 | } else if head[l] == nil && seg.head[l] != nil { 87 | head[l] = seg.head[l] 88 | } 89 | 90 | if seg.tail[l] != nil { 91 | tail[l] = seg.tail[l] 92 | } 93 | } 94 | } 95 | 96 | for l := 0; l <= MaxLevel; l++ { 97 | if head[l] != nil { 98 | b.store.head.setNext(l, head[l], false) 99 | } 100 | if tail[l] != nil { 101 | tail[l].setNext(l, b.store.tail, false) 102 | } 103 | } 104 | 105 | for _, seg := range segments { 106 | b.store.Stats.Merge(&seg.sts) 107 | } 108 | 109 | return b.store 110 | 111 | } 112 | 113 | // NewBuilder creates a builder based on default config 114 | func NewBuilder() *Builder { 115 | return NewBuilderWithConfig(DefaultConfig()) 116 | } 117 | 118 | // NewBuilderWithConfig creates a builder from a config 119 | func NewBuilderWithConfig(cfg Config) *Builder { 120 | return &Builder{store: NewWithConfig(cfg)} 121 | } 122 | -------------------------------------------------------------------------------- /skiplist/item.go: -------------------------------------------------------------------------------- 1 | // Copyright 2016-Present Couchbase, Inc. 2 | // 3 | // Use of this software is governed by the Business Source License included in 4 | // the file licenses/BSL-Couchbase.txt. 
As of the Change Date specified in that 5 | // file, in accordance with the Business Source License, use of this software 6 | // will be governed by the Apache License, Version 2.0, included in the file 7 | // licenses/APL2.txt. 8 | 9 | package skiplist 10 | 11 | import ( 12 | "bytes" 13 | "fmt" 14 | "unsafe" 15 | ) 16 | 17 | var ( 18 | MinItem unsafe.Pointer 19 | MaxItem = unsafe.Pointer(^uintptr(0)) 20 | ) 21 | 22 | func compare(cmp CompareFn, this, that unsafe.Pointer) int { 23 | if this == MinItem || that == MaxItem { 24 | return -1 25 | } 26 | 27 | if this == MaxItem || that == MinItem { 28 | return 1 29 | } 30 | 31 | return cmp(this, that) 32 | } 33 | 34 | type byteKeyItem []byte 35 | 36 | func (itm *byteKeyItem) String() string { 37 | return string(*itm) 38 | } 39 | 40 | func (itm byteKeyItem) Size() int { 41 | return len(itm) 42 | } 43 | 44 | // NewByteKeyItem creates a new item from bytes 45 | func NewByteKeyItem(k []byte) unsafe.Pointer { 46 | itm := byteKeyItem(k) 47 | return unsafe.Pointer(&itm) 48 | } 49 | 50 | func NewIntKeyItem(x int) unsafe.Pointer { 51 | p := new(int) 52 | *p = x 53 | return unsafe.Pointer(p) 54 | } 55 | 56 | func IntFromItem(itm unsafe.Pointer) int { 57 | return int(*(*IntKeyItem)(itm)) 58 | } 59 | 60 | // CompareBytes is a byte item comparator 61 | func CompareBytes(this, that unsafe.Pointer) int { 62 | thisItem := (*byteKeyItem)(this) 63 | thatItem := (*byteKeyItem)(that) 64 | return bytes.Compare([]byte(*thisItem), []byte(*thatItem)) 65 | } 66 | 67 | type IntKeyItem int 68 | 69 | func (itm *IntKeyItem) String() string { 70 | return fmt.Sprint(*itm) 71 | } 72 | 73 | func (itm IntKeyItem) Size() int { 74 | return int(unsafe.Sizeof(itm)) 75 | } 76 | 77 | // CompareInt is a helper integer item comparator 78 | func CompareInt(this, that unsafe.Pointer) int { 79 | if this == MinItem || that == MaxItem { 80 | return -1 81 | } 82 | 83 | if this == MaxItem || that == MinItem { 84 | return 1 85 | } 86 | 87 | thisItem := (*IntKeyItem)(this) 88 | thatItem := (*IntKeyItem)(that) 89 | return int(*thisItem - *thatItem) 90 | } 91 | -------------------------------------------------------------------------------- /skiplist/iterator.go: -------------------------------------------------------------------------------- 1 | // Copyright 2016-Present Couchbase, Inc. 2 | // 3 | // Use of this software is governed by the Business Source License included in 4 | // the file licenses/BSL-Couchbase.txt. As of the Change Date specified in that 5 | // file, in accordance with the Business Source License, use of this software 6 | // will be governed by the Apache License, Version 2.0, included in the file 7 | // licenses/APL2.txt. 
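The iterator below pairs every lookup or scan with an SMR barrier session: NewIterator acquires a session and Close releases it. A small usage sketch (illustrative only; assumes the package is imported as skiplist and fmt is available), mirroring the pattern used in merger_test.go:

    s := skiplist.New()
    buf := s.MakeBuf()
    defer s.FreeBuf(buf)
    for i := 0; i < 10; i++ {
        s.Insert(skiplist.NewIntKeyItem(i), skiplist.CompareInt, buf, &s.Stats)
    }

    it := s.NewIterator(skiplist.CompareInt, s.MakeBuf())
    defer it.Close() // releases the barrier session
    for it.SeekFirst(); it.Valid(); it.Next() {
        fmt.Println(skiplist.IntFromItem(it.Get()))
    }

For long-running scans, SetRefreshInterval (below) makes Next periodically re-acquire the barrier session and re-seek, so a single iterator does not pin old sessions indefinitely.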
 8 | 9 | package skiplist 10 | 11 | import "sync/atomic" 12 | import "unsafe" 13 | 14 | // Iterator is used for lookup and range operations on skiplist 15 | type Iterator struct { 16 | cmp CompareFn 17 | s *Skiplist 18 | prev, curr *Node 19 | valid bool 20 | buf *ActionBuffer 21 | deleted bool 22 | 23 | bs *BarrierSession 24 | count uint 25 | smrInterval uint 26 | } 27 | 28 | // NewIterator creates an iterator for skiplist 29 | func (s *Skiplist) NewIterator(cmp CompareFn, 30 | buf *ActionBuffer) *Iterator { 31 | it := s.NewIterator2(cmp, buf) 32 | it.bs = s.barrier.Acquire() 33 | return it 34 | } 35 | // NewIterator2 creates an iterator for skiplist without acquiring an SMR barrier session 36 | func (s *Skiplist) NewIterator2(cmp CompareFn, 37 | buf *ActionBuffer) *Iterator { 38 | return &Iterator{ 39 | cmp: cmp, 40 | s: s, 41 | buf: buf, 42 | smrInterval: ^uint(0), 43 | } 44 | } 45 | 46 | // SeekFirst moves cursor to the start 47 | func (it *Iterator) SeekFirst() { 48 | it.prev = it.s.head 49 | it.curr, _ = it.s.head.getNext(0) 50 | it.valid = true 51 | } 52 | 53 | // SeekWithCmp moves iterator to a provided item by using custom comparator 54 | func (it *Iterator) SeekWithCmp(itm unsafe.Pointer, cmp CompareFn, eqCmp CompareFn) bool { 55 | var found bool 56 | if found = it.s.findPath(itm, cmp, it.buf, &it.s.Stats) != nil; found { 57 | it.prev = it.buf.preds[0] 58 | it.curr = it.buf.succs[0] 59 | } else { 60 | if found = eqCmp != nil && compare(eqCmp, itm, it.buf.preds[0].Item()) == 0; found { 61 | it.prev = nil 62 | it.curr = it.buf.preds[0] 63 | } 64 | } 65 | return found 66 | } 67 | 68 | // Seek moves iterator to a provided item 69 | func (it *Iterator) Seek(itm unsafe.Pointer) bool { 70 | it.valid = true 71 | found := it.s.findPath(itm, it.cmp, it.buf, &it.s.Stats) != nil 72 | it.prev = it.buf.preds[0] 73 | it.curr = it.buf.succs[0] 74 | return found 75 | } 76 | 77 | // Valid returns false once the iterator has reached the end 78 | func (it *Iterator) Valid() bool { 79 | if it.valid && it.curr == it.s.tail { 80 | it.valid = false 81 | } 82 | 83 | return it.valid 84 | } 85 | 86 | // Get returns the current item 87 | func (it *Iterator) Get() unsafe.Pointer { 88 | return it.curr.Item() 89 | } 90 | 91 | // GetNode returns node which holds the current item 92 | func (it *Iterator) GetNode() *Node { 93 | return it.curr 94 | } 95 | 96 | // Next moves iterator to the next item 97 | func (it *Iterator) Next() { 98 | if it.deleted { 99 | it.deleted = false 100 | return 101 | } 102 | 103 | retry: 104 | it.valid = true 105 | next, deleted := it.curr.getNext(0) 106 | if deleted { 107 | // Current node is deleted. Unlink current node from the level 108 | // and make next node as current node. 109 | // If it fails, refresh the path buffer and obtain new current node. 
110 | if it.s.helpDelete(0, it.prev, it.curr, next, &it.s.Stats) { 111 | it.curr = next 112 | } else { 113 | atomic.AddUint64(&it.s.Stats.readConflicts, 1) 114 | found := it.s.findPath(it.curr.Item(), it.cmp, it.buf, &it.s.Stats) != nil 115 | last := it.curr 116 | it.prev = it.buf.preds[0] 117 | it.curr = it.buf.succs[0] 118 | if found && last == it.curr { 119 | goto retry 120 | } 121 | } 122 | } else { 123 | it.prev = it.curr 124 | it.curr = next 125 | } 126 | 127 | it.count++ 128 | if it.count%it.smrInterval == 0 { 129 | it.Refresh() 130 | } 131 | } 132 | 133 | // Close is a destructor 134 | func (it *Iterator) Close() { 135 | if it.bs != nil { 136 | it.s.barrier.Release(it.bs) 137 | } 138 | } 139 | 140 | func (it *Iterator) SetRefreshInterval(interval int) { 141 | it.smrInterval = uint(interval) 142 | } 143 | 144 | func (it *Iterator) Refresh() { 145 | if it.Valid() { 146 | currBs := it.bs 147 | itm := it.Get() 148 | it.bs = it.s.barrier.Acquire() 149 | it.Seek(itm) 150 | it.s.barrier.Release(currBs) 151 | } 152 | } 153 | 154 | func (it *Iterator) Pause() { 155 | if it.bs != nil { 156 | it.s.barrier.Release(it.bs) 157 | it.bs = nil 158 | } 159 | } 160 | 161 | func (it *Iterator) Resume() { 162 | it.bs = it.s.barrier.Acquire() 163 | } 164 | -------------------------------------------------------------------------------- /skiplist/merger.go: -------------------------------------------------------------------------------- 1 | // Copyright 2016-Present Couchbase, Inc. 2 | // 3 | // Use of this software is governed by the Business Source License included in 4 | // the file licenses/BSL-Couchbase.txt. As of the Change Date specified in that 5 | // file, in accordance with the Business Source License, use of this software 6 | // will be governed by the Apache License, Version 2.0, included in the file 7 | // licenses/APL2.txt. 
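merger.go below implements a k-way merge over several sorted skiplists using container/heap: SeekFirst pushes each iterator's head node onto a min-heap ordered by the item comparator, and Next pops the smallest item and re-pushes that iterator's successor. A usage sketch (illustrative only; lists is assumed to be a slice of *skiplist.Skiplist populated with byte keys):

    var iters []*skiplist.Iterator
    for _, sl := range lists {
        iters = append(iters, sl.NewIterator(skiplist.CompareBytes, sl.MakeBuf()))
    }
    mit := skiplist.NewMergeIterator(iters)
    for mit.SeekFirst(); mit.Valid(); mit.Next() {
        itm := mit.Get() // items arrive in globally sorted order
        _ = itm
    }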
8 | 9 | package skiplist 10 | 11 | import "container/heap" 12 | import "unsafe" 13 | 14 | // MergeIterator aggregates multiple iterators 15 | type MergeIterator struct { 16 | iters []*Iterator 17 | h nodeHeap 18 | curr *Node 19 | } 20 | 21 | type heapItem struct { 22 | iter *Iterator 23 | n *Node 24 | } 25 | 26 | type nodeHeap []heapItem 27 | 28 | func (h nodeHeap) Len() int { return len(h) } 29 | func (h nodeHeap) Less(i, j int) bool { return h[i].iter.cmp(h[i].n.Item(), h[j].n.Item()) < 0 } 30 | func (h nodeHeap) Swap(i, j int) { h[i], h[j] = h[j], h[i] } 31 | 32 | func (h *nodeHeap) Push(x interface{}) { 33 | *h = append(*h, x.(heapItem)) 34 | } 35 | 36 | func (h *nodeHeap) Pop() interface{} { 37 | old := *h 38 | n := len(old) 39 | x := old[n-1] 40 | *h = old[0 : n-1] 41 | return x 42 | } 43 | 44 | // NewMergeIterator creates an iterator that merges multiple iterators 45 | func NewMergeIterator(iters []*Iterator) *MergeIterator { 46 | return &MergeIterator{ 47 | iters: iters, 48 | } 49 | } 50 | 51 | // SeekFirst moves cursor to the first item 52 | func (mit *MergeIterator) SeekFirst() { 53 | for _, it := range mit.iters { 54 | it.SeekFirst() 55 | if it.Valid() { 56 | n := it.GetNode() 57 | mit.h = append(mit.h, heapItem{iter: it, n: n}) 58 | } 59 | } 60 | 61 | heap.Init(&mit.h) 62 | mit.Next() 63 | } 64 | 65 | // Valid returns false when cursor reaches end 66 | func (mit *MergeIterator) Valid() bool { 67 | return mit.curr != nil 68 | } 69 | 70 | // Next moves cursor to the next item 71 | func (mit *MergeIterator) Next() { 72 | mit.curr = nil 73 | if mit.h.Len() == 0 { 74 | return 75 | } 76 | 77 | o := heap.Pop(&mit.h) 78 | hi := o.(heapItem) 79 | mit.curr = hi.n 80 | hi.iter.Next() 81 | if hi.iter.Valid() { 82 | hi.n = hi.iter.GetNode() 83 | heap.Push(&mit.h, hi) 84 | } 85 | } 86 | 87 | // Seek moves cursor to the specified item, if present 88 | func (mit *MergeIterator) Seek(itm unsafe.Pointer) bool { 89 | var found bool 90 | for _, it := range mit.iters { 91 | if it.Seek(itm) { 92 | found = true 93 | } 94 | if it.Valid() { 95 | n := it.GetNode() 96 | mit.h = append(mit.h, heapItem{iter: it, n: n}) 97 | } 98 | } 99 | 100 | heap.Init(&mit.h) 101 | mit.Next() 102 | 103 | return found 104 | } 105 | 106 | // Get returns current item 107 | func (mit *MergeIterator) Get() unsafe.Pointer { 108 | return mit.curr.Item() 109 | } 110 | 111 | // GetNode returns node for the current item 112 | func (mit *MergeIterator) GetNode() *Node { 113 | return mit.curr 114 | } 115 | -------------------------------------------------------------------------------- /skiplist/merger_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2016-Present Couchbase, Inc. 2 | // 3 | // Use of this software is governed by the Business Source License included in 4 | // the file licenses/BSL-Couchbase.txt. As of the Change Date specified in that 5 | // file, in accordance with the Business Source License, use of this software 6 | // will be governed by the Apache License, Version 2.0, included in the file 7 | // licenses/APL2.txt. 
8 | package skiplist 9 | 10 | import ( 11 | "fmt" 12 | "testing" 13 | "unsafe" 14 | ) 15 | 16 | func TestMerger(t *testing.T) { 17 | var lists []*Skiplist 18 | var iters []*Iterator 19 | 20 | s := New() 21 | cmp := CompareBytes 22 | buf := s.MakeBuf() 23 | defer s.FreeBuf(buf) 24 | 25 | n := 5 26 | 27 | for i := 0; i < n; i++ { 28 | lists = append(lists, New()) 29 | } 30 | 31 | for i := 0; i < 10000; i++ { 32 | if i >= 1000 && i <= 8000 && i%n == 0 { 33 | continue 34 | } 35 | s := lists[i%n] 36 | s.Insert(NewByteKeyItem([]byte(fmt.Sprintf("%010d", i))), cmp, buf, &s.Stats) 37 | } 38 | 39 | for i := 0; i < n; i++ { 40 | buf := s.MakeBuf() 41 | iters = append(iters, lists[i].NewIterator(cmp, buf)) 42 | } 43 | 44 | mit := NewMergeIterator(iters) 45 | var seekPtr unsafe.Pointer 46 | var seekPt int 47 | 48 | i := 0 49 | for mit.SeekFirst(); mit.Valid(); i++ { 50 | if i >= 1000 && i <= 8000 && i%n == 0 { 51 | continue 52 | } 53 | expected := fmt.Sprintf("%010d", i) 54 | seekPtr = mit.Get() 55 | seekPt = i 56 | got := string(*((*byteKeyItem)(seekPtr))) 57 | if got != expected { 58 | t.Errorf("Expected %s, got %v", expected, got) 59 | } 60 | mit.Next() 61 | } 62 | ok := mit.Seek(seekPtr) 63 | if !ok { 64 | t.Errorf("Expected seek to work") 65 | } 66 | seekPtr = mit.Get() 67 | got := string(*((*byteKeyItem)(seekPtr))) 68 | 69 | expected := fmt.Sprintf("%010d", seekPt) 70 | if got != expected { 71 | t.Errorf("Expected %s, got %v", expected, got) 72 | } 73 | node := mit.GetNode() 74 | if node == nil { 75 | t.Errorf("Expected getNode to work") 76 | } 77 | 78 | } 79 | -------------------------------------------------------------------------------- /skiplist/node.go: -------------------------------------------------------------------------------- 1 | // +build !amd64 2 | 3 | // Copyright 2016-Present Couchbase, Inc. 4 | // 5 | // Use of this software is governed by the Business Source License included in 6 | // the file licenses/BSL-Couchbase.txt. As of the Change Date specified in that 7 | // file, in accordance with the Business Source License, use of this software 8 | // will be governed by the Apache License, Version 2.0, included in the file 9 | // licenses/APL2.txt. 10 | 11 | package skiplist 12 | 13 | import ( 14 | "reflect" 15 | "sync/atomic" 16 | "unsafe" 17 | ) 18 | 19 | // 20 | // The default skiplist implementation : 21 | // a) should support 64-bit platforms including aarch64 which has alignment restrictions 22 | // (on arm64, 64-bit words accessed atomically should have 64-bit address alignment 23 | // otherwise will result in alignment fault.) 24 | // b) and can work with both golang memory (garbage collection safe) as well as user 25 | // managed memory (e.g. jemalloc) 26 | // 27 | // Node layout: 28 | // Nitro's two-phase deletion approach requires us to atomically update both the 29 | // next pointer as well as the state of the current node. 30 | // 31 | // For user-managed memory, this can be addressed by using tagged pointers. However 32 | // this relies on the fact that the platform's virtual addresses do not consume the 33 | // entire 64 bits. To our knowledge, amd64 uses 48-bits VA addresses and for ARMV8.3 34 | // supporting large VA addressing mode (64K page size), it can go upto 52-bits. This 35 | // has the advantage of using word aligned operations which is a requirement for 36 | // certain platforms. 
37 | // 38 | // Below is the node layout we use for user managed memory 39 | // 40 | // 41 | // +------------+-----------+-----------+-------------+ 42 | // | level - 8b | itm - 8b | Link - 8b| Cache - 8b | 43 | // +------------+-----------+-----------+-------------+-------------+--------------+ 44 | // | tag ptr- 8b| tag ptr - 8b| 45 | // +-------------+--------------+ 46 | // 47 | // For golang memory, the same can be addressed by using an indirection pointer. A 48 | // NodeRef pointer is stored in skiplist levels which point to an object which contains 49 | // both the state & the next pointer. This is the existing implementation. 50 | // 51 | // Below is the node layout we use for golang memory 52 | // 53 | // 54 | // +------------+------------+-----------+-------------+-------------+ 55 | // | level - 8b | next - 8b | itm - 8b | Link - 8b | Cache - 8b | 56 | // +------------+------------+-----------+-------------+-------------+ 57 | // | ----- |------------------+----------------+ 58 | // | NodeRef ptr - 8b | NodeRefptr - 8b| 59 | // |------------------+----------------+ 60 | // 61 | // Note: Although golang indirection approach can work with user managed memory, 62 | // but it comes with an overhead of constant memory allocation/deallocation in 63 | // case of conflicts and also SMR will not be straightforward. Also reclaim in SMR 64 | // becomes easy if we allocate node memory as a single blob (NodeMM). 65 | // 66 | // Based on memory config used for skiplist, we cache the type information in the 67 | // MSB of level field to save extra bytes. Currently MaxLevel is 32. But it can go 68 | // up to 2^63 -1 69 | // 70 | 71 | // 52-bit Large VA address capability is supported from ARMv8.2 onwards (64KB page size) 72 | const deletedFlag = uint64(1) << 52 73 | const deletedFlagMask = ^deletedFlag 74 | 75 | // memory management type, bit set for user managed memory 76 | const mmFlag = int(1) << 62 77 | const mmFlagMask = (^mmFlag) 78 | 79 | var nodeHdrSizeMM = unsafe.Sizeof(NodeMM{}) 80 | var nodeRefSizeMM = unsafe.Sizeof(NodeRefMM{}) 81 | 82 | // Node represents skiplist entry 83 | // This should reside in a single cache line (L1 cache 64bytes) 84 | type Node struct { 85 | level int // we use the 2nd highest bit to store memory type 86 | itm unsafe.Pointer 87 | Link unsafe.Pointer 88 | Cache int64 // needed by plasma 89 | next unsafe.Pointer // Points to [level+1]unsafe.Pointer 90 | } 91 | 92 | // NodeRef is a wrapper for node pointer 93 | type NodeRef struct { 94 | deleted bool 95 | ptr *Node 96 | } 97 | 98 | // NodeMM represents skiplist entry from user managed memory. 
99 | // We skips the next pointer in Node struct to save bytes 100 | type NodeMM struct { 101 | level int // // we use the 63rd bit to store node type 102 | itm unsafe.Pointer 103 | Link unsafe.Pointer 104 | Cache int64 // needed by plasma 105 | } 106 | 107 | // NodeRefMM is a wrapper for Node(MM) pointer tagged with deletedFlag 108 | type NodeRefMM struct { 109 | tagptr uint64 110 | } 111 | 112 | // for user managed memory 113 | func (n *Node) setMM() { 114 | n.level |= mmFlag 115 | } 116 | 117 | // this is inlined by go as seen from profile 118 | func (n *Node) usesMM() bool { 119 | return (n.level & mmFlag) != 0 120 | } 121 | 122 | // get a slice of NodeRef's containing golang pointers 123 | func (n *Node) nextArray() (s []unsafe.Pointer) { 124 | hdr := (*reflect.SliceHeader)(unsafe.Pointer(&s)) 125 | hdr.Data = uintptr(n.next) 126 | hdr.Len = n.Level() + 1 127 | hdr.Cap = hdr.Len 128 | return 129 | } 130 | 131 | // Level returns the level of a node in the skiplist 132 | func (n Node) Level() int { 133 | return n.level & mmFlagMask 134 | } 135 | 136 | // Size returns memory used by the node 137 | func (n Node) Size() int { 138 | if n.usesMM() { 139 | return int(nodeHdrSizeMM + uintptr(n.Level()+1)*nodeRefSizeMM) 140 | } else { 141 | return int(unsafe.Sizeof(n) + 142 | uintptr(n.Level()+1)*(unsafe.Sizeof(unsafe.Pointer(nil))+ 143 | unsafe.Sizeof(NodeRef{}))) 144 | } 145 | } 146 | 147 | // Item returns item held by the node 148 | func (n *Node) Item() unsafe.Pointer { 149 | return n.itm 150 | } 151 | 152 | // SetItem sets itm ptr 153 | func (n *Node) SetItem(itm unsafe.Pointer) { 154 | n.itm = itm 155 | } 156 | 157 | // SetLink can be used to set link pointer for the node 158 | func (n *Node) SetLink(l *Node) { 159 | n.Link = unsafe.Pointer(l) 160 | } 161 | 162 | // GetLink returns link pointer from the node 163 | func (n *Node) GetLink() *Node { 164 | return (*Node)(n.Link) 165 | } 166 | 167 | func allocNode(itm unsafe.Pointer, level int, fn MallocFn) *Node { 168 | var n *Node 169 | // we reserve level's MSB bit to cache node type 170 | if level < 0 || level >= mmFlag { 171 | return nil 172 | } 173 | if fn == nil { 174 | next := make([]unsafe.Pointer, level+1) 175 | n = &Node{ 176 | level: level, 177 | next: unsafe.Pointer(&next[0]), 178 | } 179 | } else { 180 | // NodeMM is casted as Node (NodeMM is not undersized) 181 | n = (*Node)(fn(int(nodeHdrSizeMM + uintptr(level+1)*nodeRefSizeMM))) 182 | if n == nil { 183 | return nil 184 | } 185 | n.level = level 186 | n.Link = nil 187 | n.setMM() // malloced memory 188 | } 189 | 190 | n.Cache = 0 191 | n.itm = itm 192 | return n 193 | } 194 | 195 | func (n *Node) setNext(level int, ptr *Node, deleted bool) { 196 | if n.usesMM() { 197 | nodeRefAddr := uintptr(unsafe.Pointer(uintptr(unsafe.Pointer(n)) + 198 | nodeHdrSizeMM + nodeRefSizeMM*uintptr(level))) 199 | wordAddr := (*uint64)(unsafe.Pointer(nodeRefAddr)) 200 | tag := uint64(uintptr(unsafe.Pointer(ptr))) 201 | if deleted { 202 | tag |= deletedFlag 203 | } 204 | atomic.StoreUint64(wordAddr, tag) 205 | } else { 206 | next := n.nextArray() 207 | next[level] = unsafe.Pointer(&NodeRef{ptr: ptr, deleted: deleted}) 208 | } 209 | } 210 | 211 | // GetNext returns next node in level 0 212 | func (n *Node) GetNext() *Node { 213 | var next *Node 214 | var del bool 215 | 216 | for next, del = n.getNext(0); del; next, del = next.getNext(0) { 217 | } 218 | return next 219 | } 220 | 221 | func (n *Node) getNext(level int) (*Node, bool) { 222 | if n.usesMM() { 223 | nodeRefAddr := uintptr(unsafe.Pointer(n)) 
+ nodeHdrSizeMM + nodeRefSizeMM*uintptr(level) 224 | wordAddr := (*uint64)(unsafe.Pointer(nodeRefAddr)) 225 | v := atomic.LoadUint64(wordAddr) 226 | ptr := (*Node)(unsafe.Pointer(uintptr(v) & uintptr(deletedFlagMask))) 227 | if ptr != nil { 228 | return ptr, (v&deletedFlag != uint64(0)) 229 | } 230 | } else { 231 | next := n.nextArray() 232 | ref := (*NodeRef)(atomic.LoadPointer(&next[level])) 233 | if ref != nil { 234 | return ref.ptr, ref.deleted 235 | } 236 | } 237 | return nil, false 238 | } 239 | 240 | func (n *Node) dcasNext(level int, prevPtr, newPtr *Node, prevIsdeleted, newIsdeleted bool) bool { 241 | var swapped bool 242 | if n.usesMM() { 243 | nodeRefAddr := uintptr(unsafe.Pointer(n)) + nodeHdrSizeMM + nodeRefSizeMM*uintptr(level) 244 | wordAddr := (*uint64)(unsafe.Pointer(nodeRefAddr)) 245 | prevVal := uint64(uintptr(unsafe.Pointer(prevPtr))) 246 | newVal := uint64(uintptr(unsafe.Pointer(newPtr))) 247 | 248 | if prevIsdeleted { 249 | prevVal |= deletedFlag 250 | } 251 | 252 | if newIsdeleted { 253 | newVal |= deletedFlag 254 | } 255 | swapped = atomic.CompareAndSwapUint64(wordAddr, prevVal, newVal) 256 | } else { 257 | next := n.nextArray() 258 | addr := &next[level] 259 | ref := (*NodeRef)(atomic.LoadPointer(addr)) 260 | if (ref == nil) || (ref.ptr == prevPtr && ref.deleted == prevIsdeleted) { 261 | swapped = atomic.CompareAndSwapPointer(addr, unsafe.Pointer(ref), 262 | unsafe.Pointer(&NodeRef{ptr: newPtr, deleted: newIsdeleted})) 263 | } 264 | } 265 | 266 | return swapped 267 | } 268 | 269 | // This can help debugging of memory reclaimer bugs 270 | func debugMarkFree(n *Node) { 271 | } 272 | -------------------------------------------------------------------------------- /skiplist/node_alloc_amd64.go: -------------------------------------------------------------------------------- 1 | // 2 | // Copyright 2016-Present Couchbase, Inc. 3 | // 4 | // Use of this software is governed by the Business Source License included in 5 | // the file licenses/BSL-Couchbase.txt. As of the Change Date specified in that 6 | // file, in accordance with the Business Source License, use of this software 7 | // will be governed by the Apache License, Version 2.0, included in the file 8 | // licenses/APL2.txt. 
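node_alloc_amd64.go below avoids per-level pointer-array allocations by pre-declaring 33 anonymous struct layouts, one per possible node level; reflect.New(nodeTypes[level]) then yields a single zeroed, garbage-collector-visible block with exactly level+1 NodeRef slots, which allocNode reinterprets as a *Node. A minimal illustration of the pattern (hypothetical two-level type, not from the source):

    type nodeLevel2 struct {
        itm   unsafe.Pointer
        gc    unsafe.Pointer
        cache int64
        buf   [3]NodeRef // slots for levels 0..2
    }
    // One GC-scannable block holds the header and all next-pointer slots.
    block := unsafe.Pointer(reflect.New(reflect.TypeOf(nodeLevel2{})).Pointer())
    n := (*Node)(block)
    _ = n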
9 | 10 | package skiplist 11 | 12 | import ( 13 | "reflect" 14 | "unsafe" 15 | ) 16 | 17 | var nodeTypes = [33]reflect.Type{ 18 | reflect.TypeOf(node0), 19 | reflect.TypeOf(node1), 20 | reflect.TypeOf(node2), 21 | reflect.TypeOf(node3), 22 | reflect.TypeOf(node4), 23 | reflect.TypeOf(node5), 24 | reflect.TypeOf(node6), 25 | reflect.TypeOf(node7), 26 | reflect.TypeOf(node8), 27 | reflect.TypeOf(node9), 28 | reflect.TypeOf(node10), 29 | reflect.TypeOf(node11), 30 | reflect.TypeOf(node12), 31 | reflect.TypeOf(node13), 32 | reflect.TypeOf(node14), 33 | reflect.TypeOf(node15), 34 | reflect.TypeOf(node16), 35 | reflect.TypeOf(node17), 36 | reflect.TypeOf(node18), 37 | reflect.TypeOf(node19), 38 | reflect.TypeOf(node20), 39 | reflect.TypeOf(node21), 40 | reflect.TypeOf(node22), 41 | reflect.TypeOf(node23), 42 | reflect.TypeOf(node24), 43 | reflect.TypeOf(node25), 44 | reflect.TypeOf(node26), 45 | reflect.TypeOf(node27), 46 | reflect.TypeOf(node28), 47 | reflect.TypeOf(node29), 48 | reflect.TypeOf(node30), 49 | reflect.TypeOf(node31), 50 | reflect.TypeOf(node32), 51 | } 52 | 53 | var node0 struct { 54 | itm unsafe.Pointer 55 | gc unsafe.Pointer 56 | cache int64 57 | buf [1]NodeRef 58 | } 59 | 60 | var node1 struct { 61 | itm unsafe.Pointer 62 | gc unsafe.Pointer 63 | cache int64 64 | buf [2]NodeRef 65 | } 66 | 67 | var node2 struct { 68 | itm unsafe.Pointer 69 | gc unsafe.Pointer 70 | cache int64 71 | buf [3]NodeRef 72 | } 73 | 74 | var node3 struct { 75 | itm unsafe.Pointer 76 | gc unsafe.Pointer 77 | cache int64 78 | buf [4]NodeRef 79 | } 80 | 81 | var node4 struct { 82 | itm unsafe.Pointer 83 | gc unsafe.Pointer 84 | cache int64 85 | buf [5]NodeRef 86 | } 87 | 88 | var node5 struct { 89 | itm unsafe.Pointer 90 | gc unsafe.Pointer 91 | cache int64 92 | buf [6]NodeRef 93 | } 94 | 95 | var node6 struct { 96 | itm unsafe.Pointer 97 | gc unsafe.Pointer 98 | cache int64 99 | buf [7]NodeRef 100 | } 101 | 102 | var node7 struct { 103 | itm unsafe.Pointer 104 | gc unsafe.Pointer 105 | cache int64 106 | buf [8]NodeRef 107 | } 108 | 109 | var node8 struct { 110 | itm unsafe.Pointer 111 | gc unsafe.Pointer 112 | cache int64 113 | buf [9]NodeRef 114 | } 115 | 116 | var node9 struct { 117 | itm unsafe.Pointer 118 | gc unsafe.Pointer 119 | cache int64 120 | buf [10]NodeRef 121 | } 122 | 123 | var node10 struct { 124 | itm unsafe.Pointer 125 | gc unsafe.Pointer 126 | cache int64 127 | buf [11]NodeRef 128 | } 129 | var node11 struct { 130 | itm unsafe.Pointer 131 | gc unsafe.Pointer 132 | cache int64 133 | buf [12]NodeRef 134 | } 135 | 136 | var node12 struct { 137 | itm unsafe.Pointer 138 | gc unsafe.Pointer 139 | cache int64 140 | buf [13]NodeRef 141 | } 142 | 143 | var node13 struct { 144 | itm unsafe.Pointer 145 | gc unsafe.Pointer 146 | cache int64 147 | buf [14]NodeRef 148 | } 149 | 150 | var node14 struct { 151 | itm unsafe.Pointer 152 | gc unsafe.Pointer 153 | cache int64 154 | buf [15]NodeRef 155 | } 156 | 157 | var node15 struct { 158 | itm unsafe.Pointer 159 | gc unsafe.Pointer 160 | cache int64 161 | buf [16]NodeRef 162 | } 163 | 164 | var node16 struct { 165 | itm unsafe.Pointer 166 | gc unsafe.Pointer 167 | cache int64 168 | buf [17]NodeRef 169 | } 170 | 171 | var node17 struct { 172 | itm unsafe.Pointer 173 | gc unsafe.Pointer 174 | cache int64 175 | buf [18]NodeRef 176 | } 177 | 178 | var node18 struct { 179 | itm unsafe.Pointer 180 | gc unsafe.Pointer 181 | cache int64 182 | buf [19]NodeRef 183 | } 184 | 185 | var node19 struct { 186 | itm unsafe.Pointer 187 | gc unsafe.Pointer 188 | cache 
int64 189 | buf [20]NodeRef 190 | } 191 | 192 | var node20 struct { 193 | itm unsafe.Pointer 194 | gc unsafe.Pointer 195 | cache int64 196 | buf [21]NodeRef 197 | } 198 | 199 | var node21 struct { 200 | itm unsafe.Pointer 201 | gc unsafe.Pointer 202 | cache int64 203 | buf [22]NodeRef 204 | } 205 | 206 | var node22 struct { 207 | itm unsafe.Pointer 208 | gc unsafe.Pointer 209 | cache int64 210 | buf [23]NodeRef 211 | } 212 | 213 | var node23 struct { 214 | itm unsafe.Pointer 215 | gc unsafe.Pointer 216 | cache int64 217 | buf [24]NodeRef 218 | } 219 | 220 | var node24 struct { 221 | itm unsafe.Pointer 222 | gc unsafe.Pointer 223 | cache int64 224 | buf [25]NodeRef 225 | } 226 | 227 | var node25 struct { 228 | itm unsafe.Pointer 229 | gc unsafe.Pointer 230 | cache int64 231 | buf [26]NodeRef 232 | } 233 | 234 | var node26 struct { 235 | itm unsafe.Pointer 236 | gc unsafe.Pointer 237 | cache int64 238 | buf [27]NodeRef 239 | } 240 | 241 | var node27 struct { 242 | itm unsafe.Pointer 243 | gc unsafe.Pointer 244 | cache int64 245 | buf [28]NodeRef 246 | } 247 | 248 | var node28 struct { 249 | itm unsafe.Pointer 250 | gc unsafe.Pointer 251 | cache int64 252 | buf [29]NodeRef 253 | } 254 | 255 | var node29 struct { 256 | itm unsafe.Pointer 257 | gc unsafe.Pointer 258 | cache int64 259 | buf [30]NodeRef 260 | } 261 | 262 | var node30 struct { 263 | itm unsafe.Pointer 264 | gc unsafe.Pointer 265 | cache int64 266 | buf [31]NodeRef 267 | } 268 | var node31 struct { 269 | itm unsafe.Pointer 270 | gc unsafe.Pointer 271 | cache int64 272 | buf [32]NodeRef 273 | } 274 | 275 | var node32 struct { 276 | itm unsafe.Pointer 277 | gc unsafe.Pointer 278 | cache int64 279 | buf [33]NodeRef 280 | } 281 | 282 | func allocNode(itm unsafe.Pointer, level int, malloc MallocFn) *Node { 283 | var block unsafe.Pointer 284 | if malloc == nil { 285 | block = unsafe.Pointer(reflect.New(nodeTypes[level]).Pointer()) 286 | } else { 287 | block = malloc(int(nodeTypes[level].Size())) 288 | } 289 | 290 | n := (*Node)(block) 291 | n.level = uint16(level) 292 | n.itm = itm 293 | n.Link = nil 294 | n.Cache = 0 295 | return n 296 | } 297 | 298 | var freeBlockContent []byte 299 | 300 | func init() { 301 | l := int(nodeTypes[32].Size()) 302 | freeBlockContent = make([]byte, l) 303 | for i := 0; i < l; i++ { 304 | freeBlockContent[i] = 0xdd 305 | } 306 | } 307 | 308 | // Fill free blocks with a const 309 | // This can help debugging of memory reclaimer bugs 310 | func debugMarkFree(n *Node) { 311 | var block []byte 312 | l := int(nodeTypes[n.level].Size()) 313 | sh := (*reflect.SliceHeader)(unsafe.Pointer(&block)) 314 | sh.Data = uintptr(unsafe.Pointer(n)) 315 | sh.Len = l 316 | sh.Cap = l 317 | 318 | copy(block, freeBlockContent) 319 | } 320 | -------------------------------------------------------------------------------- /skiplist/node_amd64.go: -------------------------------------------------------------------------------- 1 | // Copyright 2016-Present Couchbase, Inc. 2 | // 3 | // Use of this software is governed by the Business Source License included in 4 | // the file licenses/BSL-Couchbase.txt. As of the Change Date specified in that 5 | // file, in accordance with the Business Source License, use of this software 6 | // will be governed by the Apache License, Version 2.0, included in the file 7 | // licenses/APL2.txt. 
8 | 9 | package skiplist 10 | 11 | import ( 12 | "sync/atomic" 13 | "unsafe" 14 | ) 15 | 16 | // Node structure overlaps with an array of NodeRef struct 17 | // 18 | // 19 | // +--------------+-----------------+----------------+ 20 | // | itm - 8bytes | GClink - 8bytes | level = 2 bytes| <[]NodeRef struct> 21 | // +--------------+-----------------+----------------+-----+--------------+--------------+--------------+ 22 | // | flag - 8bytes | ptr - 8 bytes| flag - 8bytes| ptr - 8 bytes| 23 | // +----------------------+--------------+--------------+--------------+ 24 | 25 | var nodeHdrSize = unsafe.Sizeof(struct { 26 | itm unsafe.Pointer 27 | GClink *Node 28 | DataPtr unsafe.Pointer 29 | }{}) 30 | 31 | var nodeRefSize = unsafe.Sizeof(NodeRef{}) 32 | 33 | var nodeRefFlagSize = unsafe.Sizeof(NodeRef{}.flag) 34 | 35 | const deletedFlag = 0xff 36 | 37 | // Node represents skiplist node header 38 | type Node struct { 39 | itm unsafe.Pointer 40 | Link unsafe.Pointer 41 | Cache int64 42 | level uint16 43 | } 44 | 45 | // Level returns the level of a node in the skiplist 46 | func (n Node) Level() int { 47 | return int(n.level) 48 | } 49 | 50 | // Size returns memory used by the node 51 | func (n Node) Size() int { 52 | return int(nodeHdrSize + uintptr(n.level+1)*nodeRefSize) 53 | } 54 | 55 | // Item returns item held by the node 56 | func (n *Node) Item() unsafe.Pointer { 57 | return n.itm 58 | } 59 | 60 | // SetItem sets itm ptr 61 | func (n *Node) SetItem(itm unsafe.Pointer) { 62 | n.itm = itm 63 | } 64 | 65 | // SetLink can be used to set link pointer for the node 66 | func (n *Node) SetLink(l *Node) { 67 | n.Link = unsafe.Pointer(l) 68 | } 69 | 70 | // GetLink returns link pointer from the node 71 | func (n *Node) GetLink() *Node { 72 | return (*Node)(n.Link) 73 | } 74 | 75 | // GetNext returns next node in level 0 76 | func (n *Node) GetNext() *Node { 77 | var next *Node 78 | var del bool 79 | 80 | for next, del = n.getNext(0); del; next, del = next.getNext(0) { 81 | } 82 | 83 | return next 84 | } 85 | 86 | // NodeRef is a wrapper for node pointer 87 | type NodeRef struct { 88 | flag uint64 89 | ptr *Node 90 | } 91 | 92 | func (n *Node) setNext(level int, ptr *Node, deleted bool) { 93 | nlevel := n.level 94 | ref := (*NodeRef)(unsafe.Pointer(uintptr(unsafe.Pointer(n)) + nodeHdrSize + nodeRefSize*uintptr(level))) 95 | ref.ptr = ptr 96 | ref.flag = 0 97 | // Setting flag for level 0 will require reseting of level 98 | if level == 0 { 99 | n.level = nlevel 100 | } 101 | } 102 | 103 | func (n *Node) getNext(level int) (*Node, bool) { 104 | nodeRefAddr := uintptr(unsafe.Pointer(n)) + nodeHdrSize + nodeRefSize*uintptr(level) 105 | wordAddr := (*uint64)(unsafe.Pointer(nodeRefAddr + uintptr(7))) 106 | 107 | v := atomic.LoadUint64(wordAddr) 108 | deleted := v&deletedFlag == deletedFlag 109 | ptr := (*Node)(unsafe.Pointer(uintptr(v >> 8))) 110 | return ptr, deleted 111 | } 112 | 113 | // The node struct holds a slice of NodeRef. We assume that the 114 | // most-significant-byte of the golang pointer is always unused. In NodeRef 115 | // struct, deleted flag and *Node are packed one after the other. 116 | // If we shift the node address 1 byte to the left. The shifted 8 byte word will have 117 | // a byte from the deleted flag and 7 bytes from the address (8th byte of the address 118 | // is always 0x00). CAS operation can be performed at this location to set 119 | // least-significant to 0xff (denotes deleted). Same applies for loading delete 120 | // flag and the address atomically. 
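//
// For illustration (the address below is hypothetical): if ptr = 0x00007f0012345678
// and the node is marked deleted, the 64-bit word read at (nodeRefAddr + 7) is
//     v = (uint64(uintptr(ptr)) << 8) | 0xff = 0x007f0012345678ff
// so uintptr(v >> 8) recovers the pointer, and v&0xff == 0xff reports deleted.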
121 | func (n *Node) dcasNext(level int, prevPtr, newPtr *Node, prevIsdeleted, newIsdeleted bool) bool { 122 | nodeRefAddr := uintptr(unsafe.Pointer(n)) + nodeHdrSize + nodeRefSize*uintptr(level) 123 | wordAddr := (*uint64)(unsafe.Pointer(nodeRefAddr + uintptr(7))) 124 | prevVal := uint64(uintptr(unsafe.Pointer(prevPtr)) << 8) 125 | newVal := uint64(uintptr(unsafe.Pointer(newPtr)) << 8) 126 | 127 | if newIsdeleted { 128 | newVal |= deletedFlag 129 | } 130 | 131 | swapped := atomic.CompareAndSwapUint64(wordAddr, prevVal, newVal) 132 | 133 | // This is required to keep the go1.5+ concurrent garbage collector happy: 134 | // it triggers a write barrier that marks newPtr as reachable 135 | if swapped { 136 | atomic.CompareAndSwapPointer((*unsafe.Pointer)(unsafe.Pointer(nodeRefAddr+nodeRefFlagSize)), 137 | unsafe.Pointer(newPtr), unsafe.Pointer(newPtr)) 138 | } 139 | 140 | return swapped 141 | } 142 | -------------------------------------------------------------------------------- /skiplist/skiplist.go: -------------------------------------------------------------------------------- 1 | // Copyright 2016-Present Couchbase, Inc. 2 | // 3 | // Use of this software is governed by the Business Source License included in 4 | // the file licenses/BSL-Couchbase.txt. As of the Change Date specified in that 5 | // file, in accordance with the Business Source License, use of this software 6 | // will be governed by the Apache License, Version 2.0, included in the file 7 | // licenses/APL2.txt. 8 | 9 | package skiplist 10 | 11 | import ( 12 | "math/rand" 13 | "runtime" 14 | "sync/atomic" 15 | "unsafe" 16 | ) 17 | 18 | // Debug flag enables additional stats gathering 19 | var Debug bool 20 | 21 | // MaxLevel is the limit for the skiplist levels 22 | const MaxLevel = 32 23 | const p = 0.25 24 | 25 | // CompareFn is the skiplist item comparator 26 | type CompareFn func(unsafe.Pointer, unsafe.Pointer) int 27 | 28 | // ItemSizeFn returns the size of a skiplist item 29 | type ItemSizeFn func(unsafe.Pointer) int 30 | 31 | func defaultItemSize(unsafe.Pointer) int { 32 | return 0 33 | } 34 | 35 | // MallocFn is a custom memory allocator 36 | type MallocFn func(int) unsafe.Pointer 37 | 38 | // FreeFn is a custom memory deallocator 39 | type FreeFn func(unsafe.Pointer) 40 | 41 | // Config holds skiplist configuration 42 | type Config struct { 43 | ItemSize ItemSizeFn 44 | 45 | UseMemoryMgmt bool 46 | Malloc MallocFn 47 | Free FreeFn 48 | BarrierDestructor BarrierSessionDestructor 49 | } 50 | 51 | // SetItemSizeFunc configures the item size function 52 | func (cfg *Config) SetItemSizeFunc(fn ItemSizeFn) { 53 | cfg.ItemSize = fn 54 | } 55 | 56 | // DefaultConfig returns the default skiplist configuration 57 | func DefaultConfig() Config { 58 | return Config{ 59 | ItemSize: defaultItemSize, 60 | UseMemoryMgmt: false, 61 | } 62 | } 63 | 64 | // Skiplist - core data structure 65 | type Skiplist struct { 66 | head *Node 67 | tail *Node 68 | level int32 69 | Stats Stats 70 | barrier *AccessBarrier 71 | 72 | newNode func(itm unsafe.Pointer, level int) *Node 73 | freeNode func(*Node) 74 | 75 | Config 76 | } 77 | 78 | // New creates a skiplist with the default config 79 | func New() *Skiplist { 80 | return NewWithConfig(DefaultConfig()) 81 | } 82 | 83 | // NewWithConfig creates a skiplist from the given config 84 | func NewWithConfig(cfg Config) *Skiplist { 85 | if runtime.GOARCH != "amd64" && runtime.GOARCH != "arm64" { 86 | cfg.UseMemoryMgmt = false 87 | } 88 | 89 | s := &Skiplist{ 90 | Config: cfg, 91 | barrier: newAccessBarrier(cfg.UseMemoryMgmt,
cfg.BarrierDestructor), 92 | } 93 | 94 | 95 | if cfg.UseMemoryMgmt { 96 | s.newNode = func(itm unsafe.Pointer, level int) *Node { 97 | return allocNode(itm, level, cfg.Malloc) 98 | } 99 | 100 | s.freeNode = func(n *Node) { 101 | if Debug { 102 | debugMarkFree(n) 103 | } 104 | cfg.Free(unsafe.Pointer(n)) 105 | } 106 | } else { 107 | s.newNode = func(itm unsafe.Pointer, level int) *Node { 108 | return allocNode(itm, level, nil) 109 | } 110 | 111 | s.freeNode = func(*Node) {} 112 | } 113 | 114 | head := s.newNode(MinItem, MaxLevel) 115 | tail := s.newNode(MaxItem, MaxLevel) 116 | 117 | for i := 0; i <= MaxLevel; i++ { 118 | head.setNext(i, tail, false) 119 | tail.setNext(i, nil, false) 120 | } 121 | 122 | s.head = head 123 | s.tail = tail 124 | 125 | return s 126 | } 127 | 128 | // GetAccesBarrier returns current active access barrier 129 | func (s *Skiplist) GetAccesBarrier() *AccessBarrier { 130 | return s.barrier 131 | } 132 | 133 | // FreeNode deallocates the skiplist node memory 134 | func (s *Skiplist) FreeNode(n *Node, sts *Stats) { 135 | s.freeNode(n) 136 | sts.AddInt64(&sts.nodeFrees, 1) 137 | } 138 | 139 | func (s *Skiplist) NewNode(level int) *Node { 140 | return s.newNode(nil, level) 141 | } 142 | 143 | func (s *Skiplist) HeadNode() *Node { 144 | return s.head 145 | } 146 | 147 | func (s *Skiplist) TailNode() *Node { 148 | return s.tail 149 | } 150 | 151 | // ActionBuffer is a temporary buffer used by skiplist operations 152 | type ActionBuffer struct { 153 | preds []*Node 154 | succs []*Node 155 | } 156 | 157 | // MakeBuf creates an action buffer 158 | func (s *Skiplist) MakeBuf() *ActionBuffer { 159 | return &ActionBuffer{ 160 | preds: make([]*Node, MaxLevel+1), 161 | succs: make([]*Node, MaxLevel+1), 162 | } 163 | } 164 | 165 | // FreeBuf frees an action buffer 166 | func (s *Skiplist) FreeBuf(b *ActionBuffer) { 167 | } 168 | 169 | // Size returns the size of a node 170 | func (s *Skiplist) Size(n *Node) int { 171 | return s.ItemSize(n.Item()) + n.Size() 172 | } 173 | 174 | // NewLevel returns a random level for the next node 175 | func (s *Skiplist) NewLevel(randFn func() float32) int { 176 | var nextLevel int 177 | 178 | for ; randFn() < p; nextLevel++ { 179 | } 180 | 181 | if nextLevel > MaxLevel { 182 | nextLevel = MaxLevel 183 | } 184 | 185 | level := int(atomic.LoadInt32(&s.level)) 186 | if nextLevel > level { 187 | if atomic.CompareAndSwapInt32(&s.level, int32(level), int32(level+1)) { 188 | nextLevel = level + 1 189 | } else { 190 | nextLevel = level 191 | } 192 | } 193 | 194 | return nextLevel 195 | } 196 | 197 | func (s *Skiplist) helpDelete(level int, prev, curr, next *Node, sts *Stats) bool { 198 | success := prev.dcasNext(level, curr, next, false, false) 199 | if success && level == 0 { 200 | sts.AddInt64(&sts.softDeletes, -1) 201 | sts.AddInt64(&sts.levelNodesCount[curr.Level()], -1) 202 | sts.AddInt64(&sts.usedBytes, -int64(s.Size(curr))) 203 | } 204 | return success 205 | } 206 | 207 | func (s *Skiplist) Lookup(itm unsafe.Pointer, cmp CompareFn, buf *ActionBuffer, sts *Stats) (pred *Node, curr *Node, found bool) { 208 | found = s.findPath(itm, cmp, buf, sts) != nil 209 | pred = buf.preds[0] 210 | curr = buf.succs[0] 211 | return 212 | } 213 | 214 | func (s *Skiplist) findPath(itm unsafe.Pointer, cmp CompareFn, 215 | buf *ActionBuffer, sts *Stats) (foundNode *Node) { 216 | var cmpVal = 1 217 | 218 | retry: 219 | prev := s.head 220 | level := int(atomic.LoadInt32(&s.level)) 221 | for i := level; i >= 0; i-- { 222 | curr, _ := prev.getNext(i) 223 | levelSearch: 224 
| for { 225 | next, deleted := curr.getNext(i) 226 | for deleted { 227 | if !s.helpDelete(i, prev, curr, next, sts) { 228 | sts.AddUint64(&sts.readConflicts, 1) 229 | goto retry 230 | } 231 | 232 | curr, _ = prev.getNext(i) 233 | next, deleted = curr.getNext(i) 234 | } 235 | 236 | cmpVal = compare(cmp, curr.Item(), itm) 237 | if cmpVal < 0 { 238 | prev = curr 239 | curr = next 240 | } else { 241 | break levelSearch 242 | } 243 | } 244 | 245 | buf.preds[i] = prev 246 | buf.succs[i] = curr 247 | } 248 | 249 | if cmpVal == 0 { 250 | foundNode = buf.succs[0] 251 | } 252 | return 253 | } 254 | 255 | // Insert adds an item into the skiplist 256 | func (s *Skiplist) Insert(itm unsafe.Pointer, cmp CompareFn, 257 | buf *ActionBuffer, sts *Stats) (success bool) { 258 | _, success = s.Insert2(itm, cmp, nil, buf, rand.Float32, sts) 259 | return 260 | } 261 | 262 | // Insert2 is a more verbose version of Insert 263 | func (s *Skiplist) Insert2(itm unsafe.Pointer, inscmp CompareFn, eqCmp CompareFn, 264 | buf *ActionBuffer, randFn func() float32, sts *Stats) (*Node, bool) { 265 | itemLevel := s.NewLevel(randFn) 266 | return s.Insert3(itm, inscmp, eqCmp, buf, itemLevel, false, sts) 267 | } 268 | 269 | // Insert3 is a more verbose version of Insert2 270 | func (s *Skiplist) Insert3(itm unsafe.Pointer, insCmp CompareFn, eqCmp CompareFn, 271 | buf *ActionBuffer, itemLevel int, skipFindPath bool, sts *Stats) (*Node, bool) { 272 | 273 | token := s.barrier.Acquire() 274 | defer s.barrier.Release(token) 275 | 276 | x := s.newNode(itm, itemLevel) 277 | return s.Insert4(x, insCmp, eqCmp, buf, itemLevel, skipFindPath, true, sts) 278 | } 279 | 280 | func (s *Skiplist) Insert4(x *Node, insCmp CompareFn, eqCmp CompareFn, buf *ActionBuffer, 281 | itemLevel int, skipFindPath bool, dealloc bool, sts *Stats) (*Node, bool) { 282 | 283 | itm := x.Item() 284 | 285 | retry: 286 | if skipFindPath { 287 | skipFindPath = false 288 | } else { 289 | var foundNode *Node 290 | 291 | if foundNode = s.findPath(itm, insCmp, buf, sts); foundNode == nil { 292 | if eqCmp != nil && compare(eqCmp, itm, buf.preds[0].Item()) == 0 { 293 | foundNode = buf.preds[0] 294 | } 295 | } 296 | 297 | if foundNode != nil { 298 | if dealloc { 299 | s.freeNode(x) 300 | } 301 | return foundNode, false 302 | } 303 | } 304 | 305 | // Set all next links for the node non-atomically 306 | for i := 0; i <= int(itemLevel); i++ { 307 | x.setNext(i, buf.succs[i], false) 308 | } 309 | 310 | // Once this dcas succeeds, the node is part of the skiplist 311 | if !buf.preds[0].dcasNext(0, buf.succs[0], x, false, false) { 312 | sts.AddUint64(&sts.insertConflicts, 1) 313 | goto retry 314 | } 315 | 316 | // Add to index levels 317 | for i := 1; i <= int(itemLevel); i++ { 318 | fixThisLevel: 319 | for { 320 | nodeNext, deleted := x.getNext(i) 321 | next := buf.succs[i] 322 | 323 | // Update the node's next pointer at current level if required.
324 | // This is the only thread that can modify the next pointer at this level. 325 | // The dcas operation can fail only if another thread has marked the node deleted 326 | if deleted || (nodeNext != next && !x.dcasNext(i, nodeNext, next, false, false)) { 327 | goto finished 328 | } 329 | 330 | if buf.preds[i].dcasNext(i, next, x, false, false) { 331 | break fixThisLevel 332 | } 333 | 334 | s.findPath(itm, insCmp, buf, sts) 335 | } 336 | } 337 | 338 | finished: 339 | sts.AddInt64(&sts.nodeAllocs, 1) 340 | sts.AddInt64(&sts.levelNodesCount[itemLevel], 1) 341 | sts.AddInt64(&sts.usedBytes, int64(s.Size(x))) 342 | return x, true 343 | } 344 | 345 | func (s *Skiplist) softDelete(delNode *Node, sts *Stats) bool { 346 | var marked bool 347 | 348 | targetLevel := delNode.Level() 349 | for i := targetLevel; i >= 0; i-- { 350 | next, deleted := delNode.getNext(i) 351 | for !deleted { 352 | if delNode.dcasNext(i, next, next, false, true) && i == 0 { 353 | sts.AddInt64(&sts.softDeletes, 1) 354 | marked = true 355 | } 356 | next, deleted = delNode.getNext(i) 357 | } 358 | } 359 | return marked 360 | } 361 | 362 | // Delete an item from the skiplist 363 | func (s *Skiplist) Delete(itm unsafe.Pointer, cmp CompareFn, 364 | buf *ActionBuffer, sts *Stats) bool { 365 | token := s.barrier.Acquire() 366 | defer s.barrier.Release(token) 367 | 368 | found := s.findPath(itm, cmp, buf, sts) != nil 369 | if !found { 370 | return false 371 | } 372 | 373 | delNode := buf.succs[0] 374 | return s.deleteNode(delNode, cmp, buf, sts) 375 | } 376 | 377 | 378 | // DeleteNode deletes an item from the skiplist by specifying its node 379 | func (s *Skiplist) DeleteNode(n *Node, cmp CompareFn, 380 | buf *ActionBuffer, sts *Stats) bool { 381 | token := s.barrier.Acquire() 382 | defer s.barrier.Release(token) 383 | return s.DeleteNode2(n, cmp, buf, sts) 384 | } 385 | 386 | func (s *Skiplist) DeleteNode2(n *Node, cmp CompareFn, 387 | buf *ActionBuffer, sts *Stats) bool { 388 | return s.deleteNode(n, cmp, buf, sts) 389 | } 390 | 391 | func (s *Skiplist) deleteNode(n *Node, cmp CompareFn, buf *ActionBuffer, sts *Stats) bool { 392 | itm := n.Item() 393 | if s.softDelete(n, sts) { 394 | s.findPath(itm, cmp, buf, sts) 395 | return true 396 | } 397 | 398 | return false 399 | } 400 | 401 | // GetRangeSplitItems returns pivot items that split the skiplist into `nways` ranges 402 | // The caller should explicitly acquire the access barrier before 403 | // this call and release it afterwards 404 | func (s *Skiplist) GetRangeSplitItems(nways int) []unsafe.Pointer { 405 | var deleted bool 406 | repeat: 407 | var itms []unsafe.Pointer 408 | var finished bool 409 | 410 | l := int(atomic.LoadInt32(&s.level)) 411 | for ; l >= 0; l-- { 412 | c := int(atomic.LoadInt64(&s.Stats.levelNodesCount[l]) + 1) 413 | if c >= nways { 414 | perSplit := c / nways 415 | node := s.head 416 | for j := 0; node != s.tail && !finished; j++ { 417 | if j == perSplit { 418 | j = -1 419 | itms = append(itms, node.Item()) 420 | finished = len(itms) == nways-1 421 | } 422 | 423 | node, deleted = node.getNext(l) 424 | if deleted { 425 | goto repeat 426 | } 427 | } 428 | 429 | break 430 | } 431 | } 432 | 433 | return itms 434 | } 435 | -------------------------------------------------------------------------------- /skiplist/skiplist_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2016-Present Couchbase, Inc. 2 | // 3 | // Use of this software is governed by the Business Source License included in 4 | // the file licenses/BSL-Couchbase.txt.
As of the Change Date specified in that 5 | // file, in accordance with the Business Source License, use of this software 6 | // will be governed by the Apache License, Version 2.0, included in the file 7 | // licenses/APL2.txt. 8 | package skiplist 9 | 10 | import "testing" 11 | import "fmt" 12 | import "math/rand" 13 | import "runtime" 14 | import "sync" 15 | import "time" 16 | import "unsafe" 17 | import "github.com/couchbase/nitro/mm" 18 | 19 | func TestInsert(t *testing.T) { 20 | s := New() 21 | cmp := CompareBytes 22 | buf := s.MakeBuf() 23 | defer s.FreeBuf(buf) 24 | 25 | for i := 0; i < 2000; i++ { 26 | s.Insert(NewByteKeyItem([]byte(fmt.Sprintf("%010d", i))), cmp, buf, &s.Stats) 27 | } 28 | 29 | for i := 1750; i < 2000; i++ { 30 | s.Delete(NewByteKeyItem([]byte(fmt.Sprintf("%010d", i))), cmp, buf, &s.Stats) 31 | } 32 | 33 | itr := s.NewIterator(cmp, buf) 34 | itr.SetRefreshInterval(50) 35 | count := 0 36 | itr.SeekFirst() 37 | itr.Seek(NewByteKeyItem([]byte(fmt.Sprintf("%010d", 1500)))) 38 | var seekItm unsafe.Pointer 39 | for ; itr.Valid(); itr.Next() { 40 | expected := fmt.Sprintf("%010d", count+1500) 41 | seekItm = itr.Get() 42 | got := string(*(*byteKeyItem)(seekItm)) 43 | count++ 44 | if got != expected { 45 | t.Errorf("Expected %s, got %v", expected, got) 46 | } 47 | } 48 | 49 | if count != 250 { 50 | t.Errorf("Expected count = 250, got %v", count) 51 | } 52 | 53 | got := itr.SeekWithCmp(seekItm, CompareInt, CompareInt) 54 | if !got { 55 | t.Errorf("Expected seekWithCmp to work") 56 | } 57 | itr.Close() 58 | 59 | // Reopen iterator with smaller refresh interval 60 | itr = s.NewIterator(cmp, buf) 61 | count = 0 62 | itr.SeekFirst() 63 | itr.Seek(NewByteKeyItem([]byte(fmt.Sprintf("%010d", 1500)))) 64 | for ; itr.Valid(); itr.Next() { 65 | expected := fmt.Sprintf("%010d", count+1500) 66 | seekItm = itr.Get() 67 | got := string(*(*byteKeyItem)(seekItm)) 68 | count++ 69 | if got != expected { 70 | t.Errorf("Expected %s, got %v", expected, got) 71 | } 72 | if count == 124 { 73 | itr.Refresh() 74 | } 75 | } 76 | } 77 | 78 | func doInsert(sl *Skiplist, wg *sync.WaitGroup, n int, isRand bool) { 79 | defer wg.Done() 80 | buf := sl.MakeBuf() 81 | defer sl.FreeBuf(buf) 82 | 83 | cmp := CompareInt 84 | rnd := rand.New(rand.NewSource(int64(rand.Int()))) 85 | for i := 0; i < n; i++ { 86 | var val int 87 | if isRand { 88 | val = rnd.Int() 89 | } else { 90 | val = i 91 | } 92 | 93 | itm := IntKeyItem(val) 94 | sl.Insert2(unsafe.Pointer(&itm), cmp, nil, buf, rnd.Float32, &sl.Stats) 95 | } 96 | } 97 | 98 | func doGet(sl *Skiplist, wg *sync.WaitGroup, n int) { 99 | defer wg.Done() 100 | rnd := rand.New(rand.NewSource(int64(rand.Int()))) 101 | cmp := CompareInt 102 | buf := sl.MakeBuf() 103 | defer sl.FreeBuf(buf) 104 | 105 | itr := sl.NewIterator(cmp, buf) 106 | for i := 0; i < n; i++ { 107 | val := rnd.Int() % n 108 | itm := IntKeyItem(val) 109 | itr.Seek(unsafe.Pointer(&itm)) 110 | } 111 | 112 | } 113 | 114 | func TestInsertPerf(t *testing.T) { 115 | var wg sync.WaitGroup 116 | sl := New() 117 | n := 1000 118 | t0 := time.Now() 119 | total := n * runtime.GOMAXPROCS(0) 120 | for i := 0; i < runtime.GOMAXPROCS(0); i++ { 121 | wg.Add(1) 122 | go doInsert(sl, &wg, n, true) 123 | } 124 | wg.Wait() 125 | 126 | dur := time.Since(t0) 127 | 128 | fmt.Printf("%d items took %v -> %v items/s conflicts %v\n", total, dur, float64(total)/float64(dur.Seconds()), sl.GetStats().InsertConflicts) 129 | } 130 | 131 | func TestGetPerf(t *testing.T) { 132 | var wg sync.WaitGroup 133 | sl := New() 134 | n := 1000 
135 | wg.Add(1) 136 | go doInsert(sl, &wg, n, false) 137 | wg.Wait() 138 | 139 | t0 := time.Now() 140 | total := n * runtime.GOMAXPROCS(0) 141 | for i := 0; i < runtime.GOMAXPROCS(0); i++ { 142 | wg.Add(1) 143 | go doGet(sl, &wg, n) 144 | } 145 | wg.Wait() 146 | dur := time.Since(t0) 147 | fmt.Printf("%d items took %v -> %v items/s\n", total, dur, float64(total)/float64(dur.Seconds())) 148 | 149 | } 150 | 151 | func TestGetRangeSplitItems(t *testing.T) { 152 | var wg sync.WaitGroup 153 | sl := New() 154 | n := 1000 155 | wg.Add(1) 156 | go doInsert(sl, &wg, n, false) 157 | wg.Wait() 158 | 159 | fmt.Println(sl.GetStats()) 160 | 161 | var keys []int 162 | var diff []int 163 | var curr int 164 | for i, itm := range sl.GetRangeSplitItems(8) { 165 | k := int(*(*IntKeyItem)(itm)) 166 | keys = append(keys, k) 167 | diff = append(diff, keys[i]-curr) 168 | curr = keys[i] 169 | } 170 | 171 | diff = append(diff, n-keys[len(keys)-1]) 172 | 173 | fmt.Println("Split range keys", keys) 174 | fmt.Println("No of items in each range", diff) 175 | } 176 | 177 | func TestBuilder(t *testing.T) { 178 | var wg sync.WaitGroup 179 | 180 | n := 50000 181 | nsplit := 8 182 | segs := make([]*Segment, nsplit) 183 | t0 := time.Now() 184 | b := NewBuilder() 185 | for i := 0; i < nsplit; i++ { 186 | segs[i] = b.NewSegment() 187 | } 188 | 189 | perSplit := n / nsplit 190 | for i := 0; i < nsplit; i++ { 191 | wg.Add(1) 192 | go func(wg *sync.WaitGroup, shard int) { 193 | defer wg.Done() 194 | for x := 0; x < perSplit; x++ { 195 | itm := IntKeyItem(perSplit*shard + x) 196 | segs[shard].Add(unsafe.Pointer(&itm)) 197 | } 198 | }(&wg, i) 199 | } 200 | 201 | wg.Wait() 202 | 203 | sl := b.Assemble(segs...) 204 | fmt.Println(sl.GetStats()) 205 | dur := time.Since(t0) 206 | fmt.Printf("Took %v to build %d items, %v items/sec\n", dur, n, float32(n)/float32(dur.Seconds())) 207 | buf := sl.MakeBuf() 208 | defer sl.FreeBuf(buf) 209 | count := 0 210 | 211 | t0 = time.Now() 212 | itr := sl.NewIterator(CompareInt, buf) 213 | for itr.SeekFirst(); itr.Valid(); itr.Next() { 214 | if int(*(*IntKeyItem)(itr.Get())) != count { 215 | t.Errorf("Expected %d, got %d", count, int(*(*IntKeyItem)(itr.Get()))) 216 | } 217 | count++ 218 | } 219 | fmt.Printf("Took %v to iterate %d items\n", time.Since(t0), n) 220 | 221 | if count != n { 222 | t.Errorf("Expected %d, got %d", n, count) 223 | } 224 | 225 | } 226 | 227 | func TestNodeDCAS(t *testing.T) { 228 | level := 0 229 | 230 | // golang-managed memory 231 | 232 | pval1 := new(int) 233 | p1 := allocNode(unsafe.Pointer(pval1), level, nil) 234 | 235 | pval2 := new(int) 236 | p2 := allocNode(unsafe.Pointer(pval2), level, nil) 237 | 238 | pval3 := new(int) 239 | p3 := allocNode(unsafe.Pointer(pval3), level, nil) 240 | 241 | // initialize 242 | if !p1.dcasNext(level, nil, p2, false, false) { 243 | t.Errorf("dcas failed!") 244 | return 245 | } 246 | 247 | if !p1.dcasNext(level, p2, p2, false, false) { 248 | t.Errorf("dcas failed!") 249 | return 250 | } 251 | 252 | // not a valid transition, but it marks the node as deleted 253 | if !p2.dcasNext(level, nil, p3, false, true) { 254 | t.Errorf("dcas failed!") 255 | return 256 | } 257 | 258 | // already deleted 259 | if p2.dcasNext(level, p3, p3, false, false) { 260 | t.Errorf("dcas should fail!") 261 | return 262 | } 263 | 264 | if !p1.dcasNext(level, p2, p3, false, false) { 265 | t.Errorf("dcas failed!") 266 | return 267 | } 268 | 269 | // soft delete 270 | if !p1.dcasNext(level, p3, p3, false, true) { 271 | t.Errorf("dcas failed!") 272 | return 273 | } 274 | 275 | // user-managed memory (custom allocator) 276 | 277 | qval1 :=
mm.Malloc(8) 278 | q1 := allocNode(unsafe.Pointer(qval1), level, mm.Malloc) 279 | 280 | qval2 := mm.Malloc(8) 281 | q2 := allocNode(unsafe.Pointer(qval2), level, mm.Malloc) 282 | 283 | qval3 := mm.Malloc(8) 284 | q3 := allocNode(unsafe.Pointer(qval3), level, mm.Malloc) 285 | 286 | // initialize 287 | if !q1.dcasNext(level, nil, q2, false, false) { 288 | t.Errorf("dcas failed!") 289 | return 290 | } 291 | 292 | if !q1.dcasNext(level, q2, q2, false, false) { 293 | t.Errorf("dcas failed!") 294 | return 295 | } 296 | 297 | // not a valid transition, but it marks the node as deleted 298 | if !q2.dcasNext(level, nil, q3, false, true) { 299 | t.Errorf("dcas failed!") 300 | return 301 | } 302 | 303 | // already deleted 304 | if q2.dcasNext(level, q3, q3, false, false) { 305 | t.Errorf("dcas should fail!") 306 | return 307 | } 308 | 309 | if !q1.dcasNext(level, q2, q3, false, false) { 310 | t.Errorf("dcas failed!") 311 | return 312 | } 313 | 314 | // soft delete 315 | if !q1.dcasNext(level, q3, q3, false, true) { 316 | t.Errorf("dcas failed!") 317 | return 318 | } 319 | 320 | mm.Free(qval1) 321 | mm.Free(qval2) 322 | mm.Free(qval3) 323 | } 324 | -------------------------------------------------------------------------------- /skiplist/stats.go: -------------------------------------------------------------------------------- 1 | // Copyright 2016-Present Couchbase, Inc. 2 | // 3 | // Use of this software is governed by the Business Source License included in 4 | // the file licenses/BSL-Couchbase.txt. As of the Change Date specified in that 5 | // file, in accordance with the Business Source License, use of this software 6 | // will be governed by the Apache License, Version 2.0, included in the file 7 | // licenses/APL2.txt. 8 | 9 | package skiplist 10 | 11 | import "fmt" 12 | import "sync/atomic" 13 | 14 | // StatsReport is used for reporting skiplist statistics 15 | type StatsReport struct { 16 | ReadConflicts uint64 17 | InsertConflicts uint64 18 | NextPointersPerNode float64 19 | NodeDistribution [MaxLevel + 1]int64 20 | NodeCount int 21 | SoftDeletes int64 22 | Memory int64 23 | 24 | NodeAllocs int64 25 | NodeFrees int64 26 | } 27 | 28 | // Apply updates the report with the provided partial stats 29 | func (report *StatsReport) Apply(s *Stats) { 30 | var totalNextPtrs int 31 | var totalNodes int 32 | 33 | report.ReadConflicts += s.readConflicts 34 | report.InsertConflicts += s.insertConflicts 35 | 36 | for i, c := range s.levelNodesCount { 37 | report.NodeDistribution[i] += c 38 | nodesAtlevel := report.NodeDistribution[i] 39 | totalNodes += int(nodesAtlevel) 40 | totalNextPtrs += (i + 1) * int(nodesAtlevel) 41 | } 42 | 43 | report.SoftDeletes += s.softDeletes 44 | report.NodeCount = totalNodes 45 | report.NodeAllocs += s.nodeAllocs 46 | report.NodeFrees += s.nodeFrees 47 | report.Memory += s.usedBytes 48 | if totalNodes != 0 { 49 | report.NextPointersPerNode = float64(totalNextPtrs) / float64(totalNodes) 50 | } 51 | } 52 | 53 | // Stats keeps stats for a skiplist instance 54 | type Stats struct { 55 | insertConflicts uint64 56 | readConflicts uint64 57 | levelNodesCount [MaxLevel + 1]int64 58 | softDeletes int64 59 | nodeAllocs, nodeFrees int64 60 | usedBytes int64 61 | 62 | isLocal bool 63 | } 64 | 65 | // IsLocal sets whether the stats are local (partial) and updated without atomics 66 | func (s *Stats) IsLocal(flag bool) { 67 | s.isLocal = flag 68 | } 69 | 70 | // AddInt64 provides atomic add 71 | func (s *Stats) AddInt64(src *int64, val int64) { 72 | if s.isLocal { 73 | *src += val 74 | } else { 75 | atomic.AddInt64(src, val) 76 | } 77 | } 78 |
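[Editor's note] AddInt64 above (and AddUint64 below) switch between a plain add and an atomic add based on the isLocal flag. A hedged sketch of the intended pattern, assuming package-internal access; Merge, defined below, folds a local Stats into the shared one and resets it (here s stands for a *Skiplist as in skiplist.go):

    var local Stats
    local.IsLocal(true)                  // updates become plain, non-atomic adds
    local.AddInt64(&local.nodeAllocs, 1) // cheap per-writer bookkeeping
    s.Stats.Merge(&local)                // publish into the shared stats, reset local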
79 | // AddUint64 provides atomic add 80 | func (s *Stats) AddUint64(src *uint64, val uint64) { 81 | if s.isLocal { 82 | *src += val 83 | } else { 84 | atomic.AddUint64(src, val) 85 | } 86 | } 87 | 88 | // Merge updates global stats with partial stats and resets partial stats 89 | func (s *Stats) Merge(sts *Stats) { 90 | atomic.AddUint64(&s.insertConflicts, sts.insertConflicts) 91 | sts.insertConflicts = 0 92 | atomic.AddUint64(&s.readConflicts, sts.readConflicts) 93 | sts.readConflicts = 0 94 | atomic.AddInt64(&s.softDeletes, sts.softDeletes) 95 | sts.softDeletes = 0 96 | atomic.AddInt64(&s.nodeAllocs, sts.nodeAllocs) 97 | sts.nodeAllocs = 0 98 | atomic.AddInt64(&s.nodeFrees, sts.nodeFrees) 99 | sts.nodeFrees = 0 100 | atomic.AddInt64(&s.usedBytes, sts.usedBytes) 101 | sts.usedBytes = 0 102 | 103 | for i, val := range sts.levelNodesCount { 104 | if val != 0 { 105 | atomic.AddInt64(&s.levelNodesCount[i], val) 106 | sts.levelNodesCount[i] = 0 107 | } 108 | } 109 | } 110 | 111 | func (s StatsReport) String() string { 112 | str := fmt.Sprintf("{\n"+ 113 | `"node_count": %d,`+"\n"+ 114 | `"soft_deletes": %d,`+"\n"+ 115 | `"read_conflicts": %d,`+"\n"+ 116 | `"insert_conflicts": %d,`+"\n"+ 117 | `"next_pointers_per_node": %.4f,`+"\n"+ 118 | `"memory_used": %d,`+"\n"+ 119 | `"node_allocs": %d,`+"\n"+ 120 | `"node_frees": %d,`+"\n", 121 | s.NodeCount, s.SoftDeletes, s.ReadConflicts, s.InsertConflicts, 122 | s.NextPointersPerNode, s.Memory, s.NodeAllocs, s.NodeFrees) 123 | 124 | str += `"level_node_distribution":` + "{\n" 125 | 126 | for i, c := range s.NodeDistribution { 127 | if i > 0 { 128 | str += fmt.Sprintf(",\n") 129 | } 130 | str += fmt.Sprintf(`"level%d": %d`, i, c) 131 | } 132 | str += "\n}\n}" 133 | return str 134 | } 135 | 136 | // GetStats returns skiplist stats 137 | func (s *Skiplist) GetStats() StatsReport { 138 | var report StatsReport 139 | report.Apply(&s.Stats) 140 | return report 141 | } 142 | 143 | // MemoryInUse returns memory used by skiplist 144 | func (s *Skiplist) MemoryInUse() int64 { 145 | return atomic.LoadInt64(&s.Stats.usedBytes) 146 | } 147 | --------------------------------------------------------------------------------
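[Editor's addition] To tie this section together, here is an end-to-end sketch using only APIs that appear above (skiplist.New, MakeBuf, Insert, Delete, NewIterator, GetStats, MemoryInUse; NewByteKeyItem and CompareBytes are used the same way in skiplist_test.go). It is illustrative, not part of the repository:

    package main

    import (
        "fmt"

        "github.com/couchbase/nitro/skiplist"
    )

    func main() {
        s := skiplist.New()
        buf := s.MakeBuf()
        defer s.FreeBuf(buf)

        // Items are passed as unsafe.Pointer; NewByteKeyItem wraps a byte key.
        for i := 0; i < 1000; i++ {
            itm := skiplist.NewByteKeyItem([]byte(fmt.Sprintf("%010d", i)))
            s.Insert(itm, skiplist.CompareBytes, buf, &s.Stats)
        }

        // Delete the first half again.
        for i := 0; i < 500; i++ {
            itm := skiplist.NewByteKeyItem([]byte(fmt.Sprintf("%010d", i)))
            s.Delete(itm, skiplist.CompareBytes, buf, &s.Stats)
        }

        // Iterate what remains and report stats.
        itr := s.NewIterator(skiplist.CompareBytes, buf)
        count := 0
        for itr.SeekFirst(); itr.Valid(); itr.Next() {
            count++
        }
        itr.Close()

        fmt.Println("live items:", count)   // expected: 500
        fmt.Println("memory in use:", s.MemoryInUse())
        fmt.Println(s.GetStats())           // JSON-style report from stats.go
    }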