├── utils ├── cache │ ├── cache.s │ ├── cache_test.go │ ├── lru.go │ ├── cmSketch.go │ ├── s2lru.go │ ├── bloom.go │ └── cache.go ├── slice.go ├── tools.go ├── iterator.go ├── entry_test.go ├── closer.go ├── rand.go ├── mmap │ ├── mmap_darwin.go │ ├── mmap_linux.go │ ├── darwin.go │ └── linux.go ├── const.go ├── map.go ├── throttle.go ├── key.go ├── bloom.go ├── error.go ├── wal.go ├── file.go ├── bloom_test.go ├── value.go ├── skiplist_test.go ├── entry.go └── arena.go ├── .gitignore ├── README.md ├── go.mod ├── debug.sh ├── gen.sh ├── LICENSE ├── file ├── file.go ├── wal.go ├── sstable_linux.go ├── sstable_darwin.go ├── vlog.go ├── mmap_darwin.go ├── mmap_linux.go └── manifest.go ├── stats.go ├── lsm ├── cache.go ├── manifest_test.go ├── lsm.go ├── memtable.go ├── lsm_test.go ├── levels.go ├── table.go └── iterator.go ├── options.go ├── pb └── pb.proto ├── db_test.go ├── iterator.go ├── go.sum ├── vlog_test.go └── db.go /utils/cache/cache.s: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .vscode 2 | work_test 3 | testdata 4 | .idea -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # corekv 2 | 3 | corekv 是一个用来高效率的验证kv引擎feature的项目。 -------------------------------------------------------------------------------- /utils/slice.go: -------------------------------------------------------------------------------- 1 | package utils 2 | 3 | // Slice holds a reusable buf, will reallocate if you request a larger size than ever before. 4 | // One problem is with n distinct sizes in random order it'll reallocate log(n) times. 
5 | type Slice struct { 6 | buf []byte 7 | } 8 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/hardcore-os/corekv 2 | 3 | go 1.16 4 | 5 | require ( 6 | github.com/cespare/xxhash/v2 v2.1.2 7 | github.com/davecgh/go-spew v1.1.1 // indirect 8 | github.com/golang/protobuf v1.5.2 9 | github.com/pkg/errors v0.9.1 10 | github.com/stretchr/testify v1.7.0 11 | golang.org/x/sys v0.0.0-20210910150752-751e447fb3d0 12 | golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 // indirect 13 | google.golang.org/protobuf v1.27.1 // indirect 14 | ) 15 | -------------------------------------------------------------------------------- /debug.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | ### 3 | # Copyright 2021 logicrec Project Authors 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License") 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | ### 17 | 18 | dlv test -test.run=$1 -------------------------------------------------------------------------------- /utils/cache/cache_test.go: -------------------------------------------------------------------------------- 1 | package cache 2 | 3 | import ( 4 | "fmt" 5 | "testing" 6 | 7 | "github.com/stretchr/testify/assert" 8 | ) 9 | 10 | func TestCacheBasicCRUD(t *testing.T) { 11 | cache := NewCache(5) 12 | for i := 0; i < 10; i++ { 13 | key := fmt.Sprintf("key%d", i) 14 | val := fmt.Sprintf("val%d", i) 15 | cache.Set(key, val) 16 | fmt.Printf("set %s: %s\n", key, cache) 17 | } 18 | 19 | for i := 0; i < 1000; i++ { 20 | key := fmt.Sprintf("key%d", i) 21 | val := fmt.Sprintf("val%d", i) 22 | res, ok := cache.Get(key) 23 | if ok { 24 | fmt.Printf("get %s: %s\n", key, cache) 25 | assert.Equal(t, val, res) 26 | continue 27 | } 28 | assert.Equal(t, res, nil) 29 | } 30 | fmt.Printf("at last: %s\n", cache) 31 | } 32 | -------------------------------------------------------------------------------- /gen.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | ### Copyright hardcore-os Project Authors 3 | ### 4 | # Licensed under the Apache License, Version 2.0 (the "License") 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | protoDir="pb" 16 | outDir="pb" 17 | protoc -I ${protoDir}/ ${protoDir}/pb.proto --gofast_out=plugins=grpc:${outDir} -------------------------------------------------------------------------------- /utils/tools.go: -------------------------------------------------------------------------------- 1 | // Copyright 2021 bardcckre-os Project Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License") 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package utils 16 | 17 | func ValueSize(value []byte) int64 { 18 | return 0 19 | } 20 | 21 | // Copy copies a byte slice and returns the copied slice. 22 | func Copy(a []byte) []byte { 23 | b := make([]byte, len(a)) 24 | copy(b, a) 25 | return b 26 | } 27 | -------------------------------------------------------------------------------- /utils/iterator.go: -------------------------------------------------------------------------------- 1 | // Copyright 2021 hardcore-os Project Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License") 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package utils 16 | 17 | // Iterator 迭代器 18 | type Iterator interface { 19 | Next() 20 | Valid() bool 21 | Rewind() 22 | Item() Item 23 | Close() error 24 | Seek(key []byte) 25 | } 26 | 27 | // Item _ 28 | type Item interface { 29 | Entry() *Entry 30 | } 31 | 32 | // Options _ 33 | // TODO 可能被重构 34 | type Options struct { 35 | Prefix []byte 36 | IsAsc bool 37 | } 38 | -------------------------------------------------------------------------------- /utils/entry_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2021 hardcore-os Project Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License") 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package utils 16 | 17 | import ( 18 | "testing" 19 | 20 | "github.com/stretchr/testify/assert" 21 | ) 22 | 23 | func TestValueStruct(t *testing.T) { 24 | v := ValueStruct{ 25 | Value: []byte("硬核课堂"), 26 | Meta: 2, 27 | ExpiresAt: 213123123123, 28 | } 29 | data := make([]byte, v.EncodedSize()) 30 | v.EncodeValue(data) 31 | var vv ValueStruct 32 | vv.DecodeValue(data) 33 | assert.Equal(t, vv, v) 34 | } 35 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 hardcore-os 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /file/file.go: -------------------------------------------------------------------------------- 1 | // Copyright 2021 hardcore-os Project Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License") 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package file 16 | 17 | import "io" 18 | 19 | // Options 20 | type Options struct { 21 | FID uint64 22 | FileName string 23 | Dir string 24 | Path string 25 | Flag int 26 | MaxSz int 27 | } 28 | 29 | type CoreFile interface { 30 | Close() error 31 | Truncature(n int64) error 32 | ReName(name string) error 33 | NewReader(offset int) io.Reader 34 | Bytes(off, sz int) ([]byte, error) 35 | AllocateSlice(sz, offset int) ([]byte, int, error) 36 | Sync() error 37 | Delete() error 38 | Slice(offset int) []byte 39 | } 40 | -------------------------------------------------------------------------------- /stats.go: -------------------------------------------------------------------------------- 1 | // Copyright 2021 logicrec Project Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License") 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package corekv 16 | 17 | import "github.com/hardcore-os/corekv/utils" 18 | 19 | type Stats struct { 20 | closer *utils.Closer 21 | EntryNum int64 // 存储多少个kv数据 22 | } 23 | 24 | // Close 25 | func (s *Stats) close() error { 26 | return nil 27 | } 28 | 29 | // StartStats 30 | func (s *Stats) StartStats() { 31 | defer s.closer.Done() 32 | for { 33 | select { 34 | case <-s.closer.CloseSignal: 35 | return 36 | } 37 | // stats logic... 38 | } 39 | } 40 | 41 | // NewStats 42 | func newStats(opt *Options) *Stats { 43 | s := &Stats{} 44 | s.closer = utils.NewCloser() 45 | s.EntryNum = 1 // 这里直接写 46 | return s 47 | } 48 | -------------------------------------------------------------------------------- /utils/closer.go: -------------------------------------------------------------------------------- 1 | // Copyright 2021 hardcore-os Project Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License") 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package utils 16 | 17 | import "sync" 18 | 19 | // Closer _用于资源回收的信号控制 20 | type Closer struct { 21 | waiting sync.WaitGroup 22 | CloseSignal chan struct{} 23 | } 24 | 25 | // NewCloser _ 26 | func NewCloser() *Closer { 27 | closer := &Closer{waiting: sync.WaitGroup{}} 28 | closer.CloseSignal = make(chan struct{}) 29 | return closer 30 | } 31 | 32 | // Close 上游通知下游协程进行资源回收,并等待协程通知回收完毕 33 | func (c *Closer) Close() { 34 | close(c.CloseSignal) 35 | c.waiting.Wait() 36 | } 37 | 38 | // Done 标示协程已经完成资源回收,通知上游正式关闭 39 | func (c *Closer) Done() { 40 | c.waiting.Done() 41 | } 42 | 43 | // Add 添加wait 计数 44 | func (c *Closer) Add(n int) { 45 | c.waiting.Add(n) 46 | } 47 | -------------------------------------------------------------------------------- /lsm/cache.go: -------------------------------------------------------------------------------- 1 | // Copyright 2021 hardcore-os Project Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License") 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package lsm 16 | 17 | import ( 18 | coreCache "github.com/hardcore-os/corekv/utils/cache" 19 | ) 20 | 21 | type cache struct { 22 | indexs *coreCache.Cache // key fid, value table 23 | blocks *coreCache.Cache // key fid_blockOffset value block []byte 24 | } 25 | 26 | type blockBuffer struct { 27 | b []byte 28 | } 29 | 30 | const defaultCacheSize = 1024 31 | 32 | // close 33 | func (c *cache) close() error { 34 | return nil 35 | } 36 | 37 | // newCache 38 | func newCache(opt *Options) *cache { 39 | return &cache{indexs: coreCache.NewCache(defaultCacheSize), blocks: coreCache.NewCache(defaultCacheSize)} 40 | } 41 | 42 | // TODO fid 使用字符串是不是会有性能损耗 43 | func (c *cache) addIndex(fid uint64, t *table) { 44 | c.indexs.Set(fid, t) 45 | } 46 | -------------------------------------------------------------------------------- /options.go: -------------------------------------------------------------------------------- 1 | // Copyright 2021 hardcore-o Project Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License") 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package corekv 16 | 17 | import "github.com/hardcore-os/corekv/utils" 18 | 19 | // Options corekv 总的配置文件 20 | type Options struct { 21 | ValueThreshold int64 22 | WorkDir string 23 | MemTableSize int64 24 | SSTableMaxSz int64 25 | MaxBatchCount int64 26 | MaxBatchSize int64 // max batch size in bytes 27 | ValueLogFileSize int 28 | VerifyValueChecksum bool 29 | ValueLogMaxEntries uint32 30 | LogRotatesToFlush int32 31 | MaxTableSize int64 32 | } 33 | 34 | // NewDefaultOptions 返回默认的options 35 | func NewDefaultOptions() *Options { 36 | opt := &Options{ 37 | WorkDir: "./work_test", 38 | MemTableSize: 1024, 39 | SSTableMaxSz: 1 << 30, 40 | } 41 | opt.ValueThreshold = utils.DefaultValueThreshold 42 | return opt 43 | } 44 | -------------------------------------------------------------------------------- /utils/cache/lru.go: -------------------------------------------------------------------------------- 1 | package cache 2 | 3 | import ( 4 | "container/list" 5 | "fmt" 6 | ) 7 | 8 | type windowLRU struct { 9 | data map[uint64]*list.Element 10 | cap int 11 | list *list.List 12 | } 13 | 14 | type storeItem struct { 15 | stage int 16 | key uint64 17 | conflict uint64 18 | value interface{} 19 | } 20 | 21 | func newWindowLRU(size int, data map[uint64]*list.Element) *windowLRU { 22 | return &windowLRU{ 23 | data: data, 24 | cap: size, 25 | list: list.New(), 26 | } 27 | } 28 | 29 | func (lru *windowLRU) add(newitem storeItem) (eitem storeItem, evicted bool) { 30 | // 如果 window 部分容量未满,直接插入 31 | if lru.list.Len() < lru.cap { 32 | lru.data[newitem.key] = lru.list.PushFront(&newitem) 33 | return storeItem{}, false 34 | } 35 | //如果 widow 部分容量已满,按照 lru 规则从尾部淘汰 36 | evictItem := lru.list.Back() 37 | item := evictItem.Value.(*storeItem) 38 | 39 | // 从 slice 中删除该条数据 40 | delete(lru.data, item.key) 41 | 42 | // 这里直接对 evictItem 和 *item 赋值,避免向runtime 再次申请空间 43 | eitem, *item = *item, newitem 44 | 45 | lru.data[item.key] = evictItem 46 | lru.list.MoveToFront(evictItem) 47 | return 
eitem, true 48 | } 49 | 50 | func (lru *windowLRU) get(v *list.Element) { 51 | lru.list.MoveToFront(v) 52 | } 53 | 54 | func (lru *windowLRU) String() string { 55 | var s string 56 | for e := lru.list.Front(); e != nil; e = e.Next() { 57 | s += fmt.Sprintf("%v,", e.Value.(*storeItem).value) 58 | } 59 | return s 60 | } 61 | -------------------------------------------------------------------------------- /utils/rand.go: -------------------------------------------------------------------------------- 1 | package utils 2 | 3 | import ( 4 | "fmt" 5 | "math/rand" 6 | "sync" 7 | "time" 8 | ) 9 | 10 | var ( 11 | r = rand.New(rand.NewSource(time.Now().UnixNano())) 12 | mu sync.Mutex 13 | ) 14 | 15 | func Int63n(n int64) int64 { 16 | mu.Lock() 17 | res := r.Int63n(n) 18 | mu.Unlock() 19 | return res 20 | } 21 | 22 | func RandN(n int) int { 23 | mu.Lock() 24 | res := r.Intn(n) 25 | mu.Unlock() 26 | return res 27 | } 28 | 29 | func Float64() float64 { 30 | mu.Lock() 31 | res := r.Float64() 32 | mu.Unlock() 33 | return res 34 | } 35 | 36 | // 生成随机字符串作为key和value 37 | func randStr(length int) string { 38 | // 包括特殊字符,进行测试 39 | str := "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ~=+%^*/()[]{}/!@#$?|©®😁😭🉑️🐂㎡硬核课堂" 40 | bytes := []byte(str) 41 | result := []byte{} 42 | rand.Seed(time.Now().UnixNano() + int64(rand.Intn(100))) 43 | for i := 0; i < length; i++ { 44 | result = append(result, bytes[rand.Intn(len(bytes))]) 45 | } 46 | return string(result) 47 | } 48 | 49 | // 构建entry对象 50 | func BuildEntry() *Entry { 51 | rand.Seed(time.Now().Unix()) 52 | key := []byte(fmt.Sprintf("%s%s", randStr(16), "12345678")) 53 | value := []byte(randStr(128)) 54 | // key := []byte(fmt.Sprintf("%s%s", "硬核课堂", "12345678")) 55 | // value := []byte("硬核😁课堂") 56 | expiresAt := uint64(time.Now().Add(12*time.Hour).UnixNano() / 1e6) 57 | return &Entry{ 58 | Key: key, 59 | Value: value, 60 | ExpiresAt: expiresAt, 61 | } 62 | } 63 | 
-------------------------------------------------------------------------------- /utils/mmap/mmap_darwin.go: -------------------------------------------------------------------------------- 1 | // +build darwin 2 | 3 | // Copyright 2021 hardcore-os Project Authors 4 | // 5 | // Licensed under the Apache License, Version 2.0 (the "License") 6 | // you may not use this file except in compliance with the License. 7 | // You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, software 12 | // distributed under the License is distributed on an "AS IS" BASIS, 13 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | // See the License for the specific language governing permissions and 15 | // limitations under the License. 16 | // mmap api 17 | 18 | // Mmap uses the mmap system call to memory-map a file. If writable is true, 19 | // memory protection of the pages is set so that they may be written to as well. 20 | package mmap 21 | 22 | import ( 23 | "os" 24 | ) 25 | 26 | func Mmap(fd *os.File, writable bool, size int64) ([]byte, error) { 27 | return mmap(fd, writable, size) 28 | } 29 | 30 | // Munmap unmaps a previously mapped slice. 31 | func Munmap(b []byte) error { 32 | return munmap(b) 33 | } 34 | 35 | // Madvise uses the madvise system call to give advise about the use of memory 36 | // when using a slice that is memory-mapped to a file. Set the readahead flag to 37 | // false if page references are expected in random order. 38 | func Madvise(b []byte, readahead bool) error { 39 | return madvise(b, readahead) 40 | } 41 | 42 | // Msync would call sync on the mmapped data. 
43 | func Msync(b []byte) error { 44 | return msync(b) 45 | } 46 | -------------------------------------------------------------------------------- /utils/mmap/mmap_linux.go: -------------------------------------------------------------------------------- 1 | // +build linux 2 | 3 | // Copyright 2021 hardcore-os Project Authors 4 | // 5 | // Licensed under the Apache License, Version 2.0 (the "License") 6 | // you may not use this file except in compliance with the License. 7 | // You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, software 12 | // distributed under the License is distributed on an "AS IS" BASIS, 13 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | // See the License for the specific language governing permissions and 15 | // limitations under the License. 16 | // mmap api 17 | 18 | // Mmap uses the mmap system call to memory-map a file. If writable is true, 19 | // memory protection of the pages is set so that they may be written to as well. 20 | package mmap 21 | 22 | import ( 23 | "os" 24 | ) 25 | 26 | func Mmap(fd *os.File, writable bool, size int64) ([]byte, error) { 27 | return mmap(fd, writable, size) 28 | } 29 | 30 | // Munmap unmaps a previously mapped slice. 31 | func Munmap(b []byte) error { 32 | return munmap(b) 33 | } 34 | 35 | // Madvise uses the madvise system call to give advise about the use of memory 36 | // when using a slice that is memory-mapped to a file. Set the readahead flag to 37 | // false if page references are expected in random order. 38 | func Madvise(b []byte, readahead bool) error { 39 | return madvise(b, readahead) 40 | } 41 | 42 | // Msync would call sync on the mmapped data. 
43 | func Msync(b []byte) error { 44 | return msync(b) 45 | } 46 | 47 | // Mremap unmmap and mmap 48 | func Mremap(data []byte, size int) ([]byte, error) { 49 | return mremap(data, size) 50 | } 51 | -------------------------------------------------------------------------------- /utils/mmap/darwin.go: -------------------------------------------------------------------------------- 1 | // +build darwin 2 | 3 | /* 4 | * Copyright 2019 Dgraph Labs, Inc. and Contributors 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | package mmap 20 | 21 | import ( 22 | "os" 23 | "syscall" 24 | "unsafe" 25 | 26 | "golang.org/x/sys/unix" 27 | ) 28 | 29 | // Mmap uses the mmap system call to memory-map a file. If writable is true, 30 | // memory protection of the pages is set so that they may be written to as well. 31 | func mmap(fd *os.File, writable bool, size int64) ([]byte, error) { 32 | mtype := unix.PROT_READ 33 | if writable { 34 | mtype |= unix.PROT_WRITE 35 | } 36 | return unix.Mmap(int(fd.Fd()), 0, int(size), mtype, unix.MAP_SHARED) 37 | } 38 | 39 | // Munmap unmaps a previously mapped slice. 40 | func munmap(b []byte) error { 41 | return unix.Munmap(b) 42 | } 43 | 44 | // This is required because the unix package does not support the madvise system call on OS X. 
45 | func madvise(b []byte, readahead bool) error { 46 | advice := unix.MADV_NORMAL 47 | if !readahead { 48 | advice = unix.MADV_RANDOM 49 | } 50 | 51 | _, _, e1 := syscall.Syscall(syscall.SYS_MADVISE, uintptr(unsafe.Pointer(&b[0])), 52 | uintptr(len(b)), uintptr(advice)) 53 | if e1 != 0 { 54 | return e1 55 | } 56 | return nil 57 | } 58 | 59 | func msync(b []byte) error { 60 | return unix.Msync(b, unix.MS_SYNC) 61 | } 62 | -------------------------------------------------------------------------------- /pb/pb.proto: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright hardcore-os Project Authors 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License") 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | // Use protos/gen.sh to generate .pb.go files. 17 | syntax = "proto3"; 18 | 19 | package pb; 20 | 21 | message KV { 22 | bytes key = 1; 23 | bytes value = 2; 24 | bytes user_meta = 3; 25 | uint64 version = 4; 26 | uint64 expires_at = 5; 27 | bytes meta = 6; 28 | 29 | // Stream id is used to identify which stream the KV came from. 30 | uint32 stream_id = 10; 31 | } 32 | 33 | message KVList { 34 | repeated KV kv = 1; 35 | } 36 | 37 | message ManifestChangeSet { 38 | // A set of changes that are applied atomically. 
39 | repeated ManifestChange changes = 1; 40 | } 41 | 42 | message ManifestChange { 43 | uint64 Id = 1; 44 | enum Operation { 45 | CREATE = 0; 46 | DELETE = 1; 47 | } 48 | Operation Op = 2; 49 | uint32 Level = 3; // Only used for CREATE 50 | bytes Checksum = 4; // Only used for CREATE 51 | } 52 | message TableIndex{ 53 | repeated BlockOffset offsets = 1; 54 | bytes bloomFilter = 2; 55 | uint64 maxVersion = 3; 56 | uint32 keyCount = 4; 57 | uint32 staleDataSize = 5; 58 | } 59 | 60 | message BlockOffset{ 61 | bytes key = 1; 62 | uint32 offset = 2; 63 | uint32 len = 3; 64 | } -------------------------------------------------------------------------------- /utils/const.go: -------------------------------------------------------------------------------- 1 | // Copyright 2021 hardcore-os Project Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License") 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package utils 16 | 17 | import ( 18 | "hash/crc32" 19 | "math" 20 | "os" 21 | ) 22 | 23 | const ( 24 | // MaxLevelNum _ 25 | MaxLevelNum = 7 26 | // DefaultValueThreshold _ 27 | DefaultValueThreshold = 1024 28 | ) 29 | 30 | // file 31 | const ( 32 | ManifestFilename = "MANIFEST" 33 | ManifestRewriteFilename = "REWRITEMANIFEST" 34 | ManifestDeletionsRewriteThreshold = 10000 35 | ManifestDeletionsRatio = 10 36 | DefaultFileFlag = os.O_RDWR | os.O_CREATE | os.O_APPEND 37 | DefaultFileMode = 0666 38 | MaxValueLogSize = 10 << 20 39 | // This is O_DSYNC (datasync) on platforms that support it -- see file_unix.go 40 | datasyncFileFlag = 0x0 41 | // 基于可变长编码,其最可能的编码 42 | MaxHeaderSize = 21 43 | VlogHeaderSize = 0 44 | MaxVlogFileSize uint32 = math.MaxUint32 45 | Mi int64 = 1 << 20 46 | KVWriteChCapacity = 1000 47 | ) 48 | 49 | // meta 50 | const ( 51 | BitDelete byte = 1 << 0 // Set if the key has been deleted. 52 | BitValuePointer byte = 1 << 1 // Set if the value is NOT stored directly next to key. 53 | ) 54 | 55 | // codec 56 | var ( 57 | MagicText = [4]byte{'H', 'A', 'R', 'D'} 58 | MagicVersion = uint32(1) 59 | // CastagnoliCrcTable is a CRC32 polynomial table 60 | CastagnoliCrcTable = crc32.MakeTable(crc32.Castagnoli) 61 | ) 62 | -------------------------------------------------------------------------------- /utils/map.go: -------------------------------------------------------------------------------- 1 | // Copyright 2021 hardcore-os Project Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License") 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package utils 16 | 17 | import ( 18 | "reflect" 19 | "sync" 20 | 21 | "github.com/pkg/errors" 22 | ) 23 | 24 | // CoreMap _ 25 | type CoreMap struct { 26 | m sync.Map 27 | } 28 | 29 | // NewMap _ 30 | func NewMap() *CoreMap { 31 | return &CoreMap{m: sync.Map{}} 32 | } 33 | 34 | // Get _ 35 | func (c *CoreMap) Get(key interface{}) (interface{}, bool) { 36 | hashKey := c.keyToHash(key) 37 | return c.m.Load(hashKey) 38 | } 39 | 40 | // Set _ 41 | func (c *CoreMap) Set(key, value interface{}) { 42 | hashKey := c.keyToHash(key) 43 | c.m.Store(hashKey, value) 44 | } 45 | 46 | // Del _ 47 | func (c *CoreMap) Del(key interface{}) { 48 | hashKey := c.keyToHash(key) 49 | c.m.Delete(hashKey) 50 | } 51 | 52 | // Range _ 53 | func (c *CoreMap) Range(f func(key, value interface{}) bool) { 54 | c.m.Range(f) 55 | } 56 | 57 | func (c *CoreMap) keyToHash(key interface{}) uint64 { 58 | if key == nil { 59 | return 0 60 | } 61 | switch k := key.(type) { 62 | case []byte: 63 | return MemHash(k) 64 | case uint32: 65 | return uint64(k) 66 | case string: 67 | return MemHashString(k) 68 | case uint64: 69 | return k 70 | case byte: 71 | return uint64(k) 72 | case int: 73 | return uint64(k) 74 | case int32: 75 | return uint64(k) 76 | 77 | case int64: 78 | return uint64(k) 79 | default: 80 | CondPanic(true, errors.Errorf("Key:[%+v] type not supported", reflect.TypeOf(k))) 81 | } 82 | return 0 83 | } 84 | -------------------------------------------------------------------------------- /lsm/manifest_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2021 hardcore-os Project Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License") 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | package lsm 15 | 16 | import ( 17 | "os" 18 | "path/filepath" 19 | "testing" 20 | 21 | "github.com/hardcore-os/corekv/utils" 22 | "github.com/stretchr/testify/require" 23 | ) 24 | 25 | // TestBaseManifest manifest 文件整体性测试 26 | func TestBaseManifest(t *testing.T) { 27 | clearDir() 28 | recovery := func() { 29 | // 每次运行都是相当于意外重启 30 | lsm := buildLSM() 31 | // 测试正确性 32 | baseTest(t, lsm, 128) 33 | lsm.Close() 34 | } 35 | // 运行这个闭包5次进行测试 36 | runTest(5, recovery) 37 | } 38 | 39 | func TestManifestMagic(t *testing.T) { 40 | helpTestManifestFileCorruption(t, 3, "bad magic") 41 | } 42 | 43 | func TestManifestVersion(t *testing.T) { 44 | helpTestManifestFileCorruption(t, 4, "unsupported version") 45 | } 46 | 47 | func TestManifestChecksum(t *testing.T) { 48 | helpTestManifestFileCorruption(t, 15, "bad check sum") 49 | } 50 | 51 | func helpTestManifestFileCorruption(t *testing.T, off int64, errorContent string) { 52 | clearDir() 53 | // 创建lsm,然后再将其关闭 54 | { 55 | lsm := buildLSM() 56 | require.NoError(t, lsm.Close()) 57 | } 58 | fp, err := os.OpenFile(filepath.Join(opt.WorkDir, utils.ManifestFilename), os.O_RDWR, 0) 59 | require.NoError(t, err) 60 | // 写入一个错误的值 61 | _, err = fp.WriteAt([]byte{'X'}, off) 62 | require.NoError(t, err) 63 | require.NoError(t, fp.Close()) 64 | defer func() { 65 | if err := recover(); err != nil { 66 | require.Contains(t, err.(error).Error(), errorContent) 67 | } 68 | }() 69 | // 在此打开 lsm 此时会panic 70 | lsm := buildLSM() 71 | require.NoError(t, lsm.Close()) 72 | } 73 | 
-------------------------------------------------------------------------------- /db_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2021 hardcore-os Project Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License") 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package corekv 16 | 17 | import ( 18 | "fmt" 19 | "testing" 20 | "time" 21 | 22 | "github.com/hardcore-os/corekv/utils" 23 | ) 24 | 25 | func TestAPI(t *testing.T) { 26 | clearDir() 27 | db := Open(opt) 28 | defer func() { _ = db.Close() }() 29 | // 写入 30 | for i := 0; i < 50; i++ { 31 | key, val := fmt.Sprintf("key%d", i), fmt.Sprintf("val%d", i) 32 | e := utils.NewEntry([]byte(key), []byte(val)).WithTTL(1000 * time.Second) 33 | if err := db.Set(e); err != nil { 34 | t.Fatal(err) 35 | } 36 | // 查询 37 | if entry, err := db.Get([]byte(key)); err != nil { 38 | t.Fatal(err) 39 | } else { 40 | t.Logf("db.Get key=%s, value=%s, expiresAt=%d", entry.Key, entry.Value, entry.ExpiresAt) 41 | } 42 | } 43 | 44 | for i := 0; i < 40; i++ { 45 | key, _ := fmt.Sprintf("key%d", i), fmt.Sprintf("val%d", i) 46 | if err := db.Del([]byte(key)); err != nil { 47 | t.Fatal(err) 48 | } 49 | } 50 | 51 | // 迭代器 52 | iter := db.NewIterator(&utils.Options{ 53 | Prefix: []byte("hello"), 54 | IsAsc: false, 55 | }) 56 | defer func() { _ = iter.Close() }() 57 | defer func() { _ = iter.Close() }() 58 | for iter.Rewind(); iter.Valid(); iter.Next() { 59 | it := iter.Item() 60 
| t.Logf("db.NewIterator key=%s, value=%s, expiresAt=%d", it.Entry().Key, it.Entry().Value, it.Entry().ExpiresAt) 61 | } 62 | t.Logf("db.Stats.EntryNum=%+v", db.Info().EntryNum) 63 | // 删除 64 | if err := db.Del([]byte("hello")); err != nil { 65 | t.Fatal(err) 66 | } 67 | 68 | for i := 0; i < 10; i++ { 69 | key, val := fmt.Sprintf("key%d", i), fmt.Sprintf("val%d", i) 70 | e := utils.NewEntry([]byte(key), []byte(val)).WithTTL(1000 * time.Second) 71 | if err := db.Set(e); err != nil { 72 | t.Fatal(err) 73 | } 74 | // 查询 75 | if entry, err := db.Get([]byte(key)); err != nil { 76 | t.Fatal(err) 77 | } else { 78 | t.Logf("db.Get key=%s, value=%s, expiresAt=%d", entry.Key, entry.Value, entry.ExpiresAt) 79 | } 80 | } 81 | 82 | } 83 | -------------------------------------------------------------------------------- /iterator.go: -------------------------------------------------------------------------------- 1 | // Copyright 2021 logicrec Project Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License") 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package corekv 16 | 17 | import ( 18 | "github.com/hardcore-os/corekv/lsm" 19 | "github.com/hardcore-os/corekv/utils" 20 | ) 21 | 22 | type DBIterator struct { 23 | iitr utils.Iterator 24 | vlog *valueLog 25 | } 26 | type Item struct { 27 | e *utils.Entry 28 | } 29 | 30 | func (it *Item) Entry() *utils.Entry { 31 | return it.e 32 | } 33 | func (db *DB) NewIterator(opt *utils.Options) utils.Iterator { 34 | iters := make([]utils.Iterator, 0) 35 | iters = append(iters, db.lsm.NewIterators(opt)...) 36 | 37 | res := &DBIterator{ 38 | vlog: db.vlog, 39 | iitr: lsm.NewMergeIterator(iters, opt.IsAsc), 40 | } 41 | return res 42 | } 43 | 44 | func (iter *DBIterator) Next() { 45 | iter.iitr.Next() 46 | for ; iter.Valid() && iter.Item() == nil; iter.iitr.Next() { 47 | } 48 | } 49 | func (iter *DBIterator) Valid() bool { 50 | return iter.iitr.Valid() 51 | } 52 | func (iter *DBIterator) Rewind() { 53 | iter.iitr.Rewind() 54 | for ; iter.Valid() && iter.Item() == nil; iter.iitr.Next() { 55 | } 56 | } 57 | func (iter *DBIterator) Item() utils.Item { 58 | // 检查从lsm拿到的value是否是value ptr,是则从vlog中拿值 59 | e := iter.iitr.Item().Entry() 60 | var value []byte 61 | 62 | if e != nil && utils.IsValuePtr(e) { 63 | var vp utils.ValuePtr 64 | vp.Decode(e.Value) 65 | result, cb, err := iter.vlog.read(&vp) 66 | defer utils.RunCallback(cb) 67 | if err != nil { 68 | return nil 69 | } 70 | value = utils.SafeCopy(nil, result) 71 | } 72 | 73 | if e.IsDeletedOrExpired() || value == nil { 74 | return nil 75 | } 76 | 77 | res := &utils.Entry{ 78 | Key: e.Key, 79 | Value: value, 80 | ExpiresAt: e.ExpiresAt, 81 | Meta: e.Meta, 82 | Version: e.Version, 83 | Offset: e.Offset, 84 | Hlen: e.Hlen, 85 | ValThreshold: e.ValThreshold, 86 | } 87 | return res 88 | } 89 | func (iter *DBIterator) Close() error { 90 | return iter.iitr.Close() 91 | } 92 | func (iter *DBIterator) Seek(key []byte) { 93 | } 94 | -------------------------------------------------------------------------------- /utils/throttle.go: 
// Throttle bounds the number of workers running at once. It also collects
// errors reported by finished workers and lets callers wait for all of
// them to complete.
type Throttle struct {
	once      sync.Once
	wg        sync.WaitGroup
	ch        chan struct{}
	errCh     chan error
	finishErr error
}

// NewThrottle creates a Throttle that admits at most max concurrent workers.
func NewThrottle(max int) *Throttle {
	t := &Throttle{}
	t.ch = make(chan struct{}, max)
	t.errCh = make(chan error, max)
	return t
}

// Do must be called by a worker before it starts. It blocks while the
// maximum number of workers are already running. If a previously finished
// worker reported an error, Do returns that error instead of admitting.
func (t *Throttle) Do() error {
	for {
		select {
		case t.ch <- struct{}{}:
			t.wg.Add(1)
			return nil
		case err := <-t.errCh:
			if err != nil {
				return err
			}
		}
	}
}
56 | func (t *Throttle) Done(err error) { 57 | if err != nil { 58 | t.errCh <- err 59 | } 60 | select { 61 | case <-t.ch: 62 | default: 63 | panic("Throttle Do Done mismatch") 64 | } 65 | t.wg.Done() 66 | } 67 | 68 | // Finish waits until all workers have finished working. It would return any error passed by Done. 69 | // If Finish is called multiple time, it will wait for workers to finish only once(first time). 70 | // From next calls, it will return same error as found on first call. 71 | func (t *Throttle) Finish() error { 72 | t.once.Do(func() { 73 | t.wg.Wait() 74 | close(t.ch) 75 | close(t.errCh) 76 | for err := range t.errCh { 77 | if err != nil { 78 | t.finishErr = err 79 | return 80 | } 81 | } 82 | }) 83 | 84 | return t.finishErr 85 | } 86 | -------------------------------------------------------------------------------- /utils/cache/cmSketch.go: -------------------------------------------------------------------------------- 1 | package cache 2 | 3 | import ( 4 | "fmt" 5 | "math/rand" 6 | "time" 7 | ) 8 | 9 | const ( 10 | cmDepth = 4 11 | ) 12 | 13 | type cmSketch struct { 14 | rows [cmDepth]cmRow 15 | seed [cmDepth]uint64 16 | mask uint64 17 | } 18 | 19 | func newCmSketch(numCounters int64) *cmSketch { 20 | if numCounters == 0 { 21 | panic("cmSketch: invalid numCounters") 22 | } 23 | 24 | // numCounters 一定是二次幂,也就一定是1后面有 n 个 0 25 | numCounters = next2Power(numCounters) 26 | // mask 一定是0111...111 27 | sketch := &cmSketch{mask: uint64(numCounters - 1)} 28 | source := rand.New(rand.NewSource(time.Now().UnixNano())) 29 | 30 | // 初始化4行 31 | // 0000,0000|0000,0000|0000,0000 32 | // 0000,0000|0000,0000|0000,0000 33 | // 0000,0000|0000,0000|0000,0000 34 | // 0000,0000|0000,0000|0000,0000 35 | 36 | for i := 0; i < cmDepth; i++ { 37 | sketch.seed[i] = source.Uint64() 38 | sketch.rows[i] = newCmRow(numCounters) 39 | } 40 | 41 | return sketch 42 | } 43 | 44 | func (s *cmSketch) Increment(hashed uint64) { 45 | // 对于每一行进行相同操作 46 | for i := range s.rows { 47 | 
s.rows[i].increment((hashed ^ s.seed[i]) & s.mask) 48 | } 49 | } 50 | 51 | func (s *cmSketch) Estimate(hashed uint64) int64 { 52 | min := byte(255) 53 | for i := range s.rows { 54 | val := s.rows[i].get((hashed ^ s.seed[i]) & s.mask) 55 | if val < min { 56 | min = val 57 | } 58 | } 59 | 60 | return int64(min) 61 | } 62 | 63 | // Reset halves all counter values. 64 | func (s *cmSketch) Reset() { 65 | for _, r := range s.rows { 66 | r.reset() 67 | } 68 | } 69 | 70 | // Clear zeroes all counters. 71 | func (s *cmSketch) Clear() { 72 | for _, r := range s.rows { 73 | r.clear() 74 | } 75 | } 76 | 77 | // 快速计算大于 X,且最接近 X 的二次幂 78 | func next2Power(x int64) int64 { 79 | x-- 80 | x |= x >> 1 81 | x |= x >> 2 82 | x |= x >> 4 83 | x |= x >> 8 84 | x |= x >> 16 85 | x |= x >> 32 86 | x++ 87 | return x 88 | } 89 | 90 | type cmRow []byte 91 | 92 | func newCmRow(numCounters int64) cmRow { 93 | return make(cmRow, numCounters/2) 94 | } 95 | 96 | func (r cmRow) get(n uint64) byte { 97 | return r[n/2] >> ((n & 1) * 4) & 0x0f 98 | } 99 | 100 | func (r cmRow) increment(n uint64) { 101 | i := n / 2 102 | s := (n & 1) * 4 103 | v := (r[i] >> s) & 0x0f 104 | if v < 15 { 105 | r[i] += 1 << s 106 | } 107 | } 108 | 109 | func (r cmRow) reset() { 110 | for i := range r { 111 | r[i] = (r[i] >> 1) & 0x77 112 | } 113 | } 114 | 115 | func (r cmRow) clear() { 116 | for i := range r { 117 | r[i] = 0 118 | } 119 | } 120 | 121 | func (r cmRow) string() string { 122 | s := "" 123 | for i := uint64(0); i < uint64(len(r)*2); i++ { 124 | s += fmt.Sprintf("%02d ", (r[(i/2)]>>((i&1)*4))&0x0f) 125 | } 126 | s = s[:len(s)-1] 127 | return s 128 | } 129 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | github.com/cespare/xxhash/v2 v2.1.2 h1:YRXhKfTDauu4ajMg1TPgFO5jnlC2HCbmLXMcTG5cbYE= 2 | github.com/cespare/xxhash/v2 v2.1.2/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= 3 | 
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 4 | github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= 5 | github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 6 | github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk= 7 | github.com/golang/protobuf v1.5.2 h1:ROPKBNFfQgOUMifHyP+KYbvpjbdoFNs+aK7DXlji0Tw= 8 | github.com/golang/protobuf v1.5.2/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY= 9 | github.com/google/go-cmp v0.5.5 h1:Khx7svrCpmxxtHBq5j2mp/xVjsi8hQMfNLvJFAlrGgU= 10 | github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= 11 | github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= 12 | github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= 13 | github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= 14 | github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= 15 | github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= 16 | github.com/stretchr/testify v1.7.0 h1:nwc3DEeHmmLAfoZucVR881uASk0Mfjw8xYJ99tb5CcY= 17 | github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= 18 | golang.org/x/sys v0.0.0-20210910150752-751e447fb3d0 h1:xrCZDmdtoloIiooiA9q0OQb9r8HejIHYoHGhGCe1pGg= 19 | golang.org/x/sys v0.0.0-20210910150752-751e447fb3d0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 20 | golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= 21 | golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 h1:go1bK/D/BFZV2I8cIQd1NKEZ+0owSTG1fDTci4IqFcE= 22 | golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= 23 | google.golang.org/protobuf v1.26.0-rc.1/go.mod 
// stringStruct mirrors the runtime's internal string header so that
// arbitrary byte/string data can be passed to runtime.memhash.
type stringStruct struct {
	str unsafe.Pointer
	len int
}

// memhash links directly to the Go runtime's hash function, which uses
// hardware AES instructions when available. NOTE(review): the seed changes
// per process, so results must never be persisted.
//go:noescape
//go:linkname memhash runtime.memhash
func memhash(p unsafe.Pointer, h, s uintptr) uintptr
35 | func ParseKey(key []byte) []byte { 36 | if len(key) < 8 { 37 | return key 38 | } 39 | 40 | return key[:len(key)-8] 41 | } 42 | 43 | // ParseTs parses the timestamp from the key bytes. 44 | func ParseTs(key []byte) uint64 { 45 | if len(key) <= 8 { 46 | return 0 47 | } 48 | return math.MaxUint64 - binary.BigEndian.Uint64(key[len(key)-8:]) 49 | } 50 | 51 | // SameKey checks for key equality ignoring the version timestamp suffix. 52 | func SameKey(src, dst []byte) bool { 53 | if len(src) != len(dst) { 54 | return false 55 | } 56 | return bytes.Equal(ParseKey(src), ParseKey(dst)) 57 | } 58 | 59 | // KeyWithTs generates a new key by appending ts to key. 60 | func KeyWithTs(key []byte, ts uint64) []byte { 61 | out := make([]byte, len(key)+8) 62 | copy(out, key) 63 | binary.BigEndian.PutUint64(out[len(key):], math.MaxUint64-ts) 64 | return out 65 | } 66 | 67 | // MemHash is the hash function used by go map, it utilizes available hardware instructions(behaves 68 | // as aeshash if aes instruction is available). 69 | // NOTE: The hash seed changes for every process. So, this cannot be used as a persistent hash. 70 | func MemHash(data []byte) uint64 { 71 | ss := (*stringStruct)(unsafe.Pointer(&data)) 72 | return uint64(memhash(ss.str, 0, uintptr(ss.len))) 73 | } 74 | 75 | // MemHashString is the hash function used by go map, it utilizes available hardware instructions 76 | // (behaves as aeshash if aes instruction is available). 77 | // NOTE: The hash seed changes for every process. So, this cannot be used as a persistent hash. 78 | func MemHashString(str string) uint64 { 79 | ss := (*stringStruct)(unsafe.Pointer(&str)) 80 | return uint64(memhash(ss.str, 0, uintptr(ss.len))) 81 | } 82 | 83 | // SafeCopy does append(a[:0], src...). 84 | func SafeCopy(a, src []byte) []byte { 85 | return append(a[:0], src...) 
86 | } 87 | 88 | func NewCurVersion() uint64 { 89 | return uint64(time.Now().UnixNano() / 1e9) 90 | } 91 | -------------------------------------------------------------------------------- /utils/cache/s2lru.go: -------------------------------------------------------------------------------- 1 | package cache 2 | 3 | import ( 4 | "container/list" 5 | "fmt" 6 | ) 7 | 8 | type segmentedLRU struct { 9 | data map[uint64]*list.Element 10 | stageOneCap, stageTwoCap int 11 | stageOne, stageTwo *list.List 12 | } 13 | 14 | const ( 15 | STAGE_ONE = iota + 1 16 | STAGE_TWO 17 | ) 18 | 19 | func newSLRU(data map[uint64]*list.Element, stageOneCap, stageTwoCap int) *segmentedLRU { 20 | return &segmentedLRU{ 21 | data: data, 22 | stageOneCap: stageOneCap, 23 | stageTwoCap: stageTwoCap, 24 | stageOne: list.New(), 25 | stageTwo: list.New(), 26 | } 27 | } 28 | 29 | func (slru *segmentedLRU) add(newitem storeItem) { 30 | // 先进来的都放 stageOne 31 | newitem.stage = 1 32 | 33 | // 如果 stageOne 没满,整个 LFU 区域也没满 34 | if slru.stageOne.Len() < slru.stageOneCap || slru.Len() < slru.stageOneCap+slru.stageTwoCap { 35 | slru.data[newitem.key] = slru.stageOne.PushFront(&newitem) 36 | return 37 | } 38 | 39 | //走到这里说明 StageOne 满了,或者整个 LFU都满了 40 | //那么需要从 StageOne 淘汰数据了 41 | e := slru.stageOne.Back() 42 | item := e.Value.(*storeItem) 43 | 44 | //这里淘汰就是真的淘汰了 45 | delete(slru.data, item.key) 46 | 47 | *item = newitem 48 | 49 | slru.data[item.key] = e 50 | slru.stageOne.MoveToFront(e) 51 | } 52 | 53 | func (slru *segmentedLRU) get(v *list.Element) { 54 | item := v.Value.(*storeItem) 55 | 56 | // 若访问的缓存数据,已经在 StageTwo,只需要按照 LRU 规则提前即可 57 | if item.stage == STAGE_TWO { 58 | slru.stageTwo.MoveToFront(v) 59 | return 60 | } 61 | 62 | // 若访问的数据还在 StageOne,那么再次被访问到,就需要提升到 StageTwo 阶段了 63 | if slru.stageTwo.Len() < slru.stageTwoCap { 64 | slru.stageOne.Remove(v) 65 | item.stage = STAGE_TWO 66 | slru.data[item.key] = slru.stageTwo.PushFront(item) 67 | return 68 | } 69 | 70 | // 新数据加入 StageTwo,需要淘汰旧数据 71 | // 
StageTwo 中淘汰的数据不会消失,会进入 StageOne 72 | // StageOne 中,访问频率更低的数据,有可能会被淘汰 73 | back := slru.stageTwo.Back() 74 | bitem := back.Value.(*storeItem) 75 | 76 | *bitem, *item = *item, *bitem 77 | 78 | bitem.stage = STAGE_TWO 79 | item.stage = STAGE_ONE 80 | 81 | slru.data[item.key] = v 82 | slru.data[bitem.key] = back 83 | 84 | slru.stageOne.MoveToFront(v) 85 | slru.stageTwo.MoveToFront(back) 86 | } 87 | 88 | func (slru *segmentedLRU) Len() int { 89 | return slru.stageTwo.Len() + slru.stageOne.Len() 90 | } 91 | 92 | func (slru *segmentedLRU) victim() *storeItem { 93 | //如果 slru 的容量未满,不需要淘汰 94 | if slru.Len() < slru.stageOneCap+slru.stageTwoCap { 95 | return nil 96 | } 97 | 98 | // 如果已经满了,则需要从20%的区域淘汰数据,这里直接从尾部拿最后一个元素即可 99 | v := slru.stageOne.Back() 100 | return v.Value.(*storeItem) 101 | } 102 | 103 | func (slru *segmentedLRU) String() string { 104 | var s string 105 | for e := slru.stageTwo.Front(); e != nil; e = e.Next() { 106 | s += fmt.Sprintf("%v,", e.Value.(*storeItem).value) 107 | } 108 | s += fmt.Sprintf(" | ") 109 | for e := slru.stageOne.Front(); e != nil; e = e.Next() { 110 | s += fmt.Sprintf("%v,", e.Value.(*storeItem).value) 111 | } 112 | return s 113 | } 114 | -------------------------------------------------------------------------------- /utils/mmap/linux.go: -------------------------------------------------------------------------------- 1 | // +build linux 2 | 3 | // Copyright 2021 logicrec Project Authors 4 | // 5 | // Licensed under the Apache License, Version 2.0 (the "License") 6 | // you may not use this file except in compliance with the License. 7 | // You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, software 12 | // distributed under the License is distributed on an "AS IS" BASIS, 13 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
// mmap uses the mmap system call to memory-map a file. If writable is true,
// memory protection of the pages is set so that they may be written to as well.
func mmap(fd *os.File, writable bool, size int64) ([]byte, error) {
	mtype := unix.PROT_READ
	if writable {
		mtype |= unix.PROT_WRITE
	}
	// MAP_SHARED: writes through the mapping are carried back to the file.
	return unix.Mmap(int(fd.Fd()), 0, int(size), mtype, unix.MAP_SHARED)
}

// mremap is a Linux-specific system call to remap pages in memory. This can
// be used in place of munmap + mmap. On success the returned slice aliases
// the (possibly relocated) mapping with len and cap both set to size.
func mremap(data []byte, size int) ([]byte, error) {
	// MREMAP_MAYMOVE lets the kernel relocate the mapping when it cannot be
	// resized in place.
	const MREMAP_MAYMOVE = 0x1

	// Poke the new address/length directly into the slice header.
	header := (*reflect.SliceHeader)(unsafe.Pointer(&data))
	mmapAddr, _, errno := unix.Syscall6(
		unix.SYS_MREMAP,
		header.Data,
		uintptr(header.Len),
		uintptr(size),
		uintptr(MREMAP_MAYMOVE),
		0,
		0,
	)
	if errno != 0 {
		return nil, errno
	}

	header.Data = mmapAddr
	header.Cap = size
	header.Len = size
	return data, nil
}

// munmap unmaps a previously mapped slice.
//
// unix.Munmap maintains an internal list of mmapped addresses, and only calls
// munmap if the address is present in that list. If we use mremap, this list
// is not updated. To bypass this, we call munmap ourselves.
func munmap(data []byte) error {
	// Only a full, unsliced mapping may be unmapped.
	if len(data) == 0 || len(data) != cap(data) {
		return unix.EINVAL
	}
	_, _, errno := unix.Syscall(
		unix.SYS_MUNMAP,
		uintptr(unsafe.Pointer(&data[0])),
		uintptr(len(data)),
		0,
	)
	if errno != 0 {
		return errno
	}
	return nil
}

// madvise uses the madvise system call to give advise about the use of memory
// when using a slice that is memory-mapped to a file. Set the readahead flag
// to false if page references are expected in random order.
func madvise(b []byte, readahead bool) error {
	flags := unix.MADV_NORMAL
	if !readahead {
		flags = unix.MADV_RANDOM
	}
	return unix.Madvise(b, flags)
}

// msync writes any modified data in the mapping back to persistent storage.
func msync(b []byte) error {
	return unix.Msync(b, unix.MS_SYNC)
}
// Filter is a Bloom filter encoded as a byte slice: a bit array followed
// by one trailing byte that records the probe count k.
type Filter []byte

// MayContainKey reports whether the filter may contain key k.
func (f Filter) MayContainKey(k []byte) bool {
	return f.MayContain(Hash(k))
}

// MayContain reports whether the filter may contain the hashed key h.
// False positives are possible; false negatives are not.
func (f Filter) MayContain(h uint32) bool {
	if len(f) < 2 {
		return false
	}
	k := f[len(f)-1]
	if k > 30 {
		// Reserved for potentially new encodings of short Bloom filters;
		// treat such filters as matching everything.
		return true
	}
	nBits := uint32(8 * (len(f) - 1))
	delta := h>>17 | h<<15 // 32-bit rotate, used for double hashing
	for probe := uint8(0); probe < k; probe++ {
		pos := h % nBits
		if f[pos/8]&(1<<(pos%8)) == 0 {
			return false
		}
		h += delta
	}
	return true
}

// NewFilter encodes the pre-hashed keys into a Bloom filter using roughly
// bitsPerKey bits per key. A bitsPerKey of 10 yields ~1% false positives.
func NewFilter(keys []uint32, bitsPerKey int) Filter {
	return Filter(appendFilter(keys, bitsPerKey))
}

// BloomBitsPerKey derives the bits-per-key needed to achieve the false
// positive rate fp for numEntries keys.
func BloomBitsPerKey(numEntries int, fp float64) int {
	size := -1 * float64(numEntries) * math.Log(fp) / math.Pow(float64(0.69314718056), 2)
	locs := math.Ceil(size / float64(numEntries))
	return int(locs)
}

// appendFilter builds the filter bytes for keys using double hashing.
func appendFilter(keys []uint32, bitsPerKey int) []byte {
	if bitsPerKey < 0 {
		bitsPerKey = 0
	}
	// 0.69 approximates ln(2); k is the near-optimal probe count, clamped
	// to [1, 30].
	k := uint32(float64(bitsPerKey) * 0.69)
	switch {
	case k < 1:
		k = 1
	case k > 30:
		k = 30
	}

	nBits := len(keys) * bitsPerKey
	// Small key sets see a very high false positive rate; enforce a
	// minimum filter length to keep it sane.
	if nBits < 64 {
		nBits = 64
	}
	nBytes := (nBits + 7) / 8
	nBits = nBytes * 8
	filter := make([]byte, nBytes+1)

	for _, h := range keys {
		delta := h>>17 | h<<15
		for probe := uint32(0); probe < k; probe++ {
			pos := h % uint32(nBits)
			filter[pos/8] |= 1 << (pos % 8)
			h += delta
		}
	}

	// The final byte records k so MayContain can decode the filter.
	filter[nBytes] = uint8(k)
	return filter
}

// Hash implements a hashing algorithm similar to the Murmur hash.
func Hash(b []byte) uint32 {
	const (
		seed = 0xbc9f1d34
		m    = 0xc6a4a793
	)
	h := uint32(seed) ^ uint32(len(b))*m
	for ; len(b) >= 4; b = b[4:] {
		h += uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24
		h *= m
		h ^= h >> 16
	}
	// Fold in the trailing 1-3 bytes.
	switch len(b) {
	case 3:
		h += uint32(b[2]) << 16
		fallthrough
	case 2:
		h += uint32(b[1]) << 8
		fallthrough
	case 1:
		h += uint32(b[0])
		h *= m
		h ^= h >> 24
	}
	return h
}
// Set writes entry into the LSM tree: first into the WAL-backed active
// memtable, rotating it to the immutable list when full, then flushes any
// pending immutables to level 0.
// Returns utils.ErrEmptyKey for a nil entry or empty key.
func (lsm *LSM) Set(entry *utils.Entry) (err error) {
	if entry == nil || len(entry.Key) == 0 {
		return utils.ErrEmptyKey
	}
	// Graceful shutdown: register this in-flight call with the closer so
	// Close() can wait for it to finish.
	lsm.closer.Add(1)
	defer lsm.closer.Done()
	// If the active memtable cannot absorb this entry's WAL encoding without
	// exceeding MemTableSize, rotate: push it onto immutables and start a
	// fresh memtable; otherwise write into the current one.
	if int64(lsm.memTable.wal.Size())+
		int64(utils.EstimateWalCodecSize(entry)) > lsm.option.MemTableSize {
		lsm.Rotate()
	}

	if err = lsm.memTable.set(entry); err != nil {
		return err
	}
	// Flush every pending immutable memtable to disk (level 0).
	// NOTE(review): this loop runs synchronously on the write path and is not
	// guarded by a lock — concurrent Set/Rotate can race on lsm.immutables.
	for _, immutable := range lsm.immutables {
		if err = lsm.levels.flush(immutable); err != nil {
			return err
		}
		// TODO: this is problematic — the immutable should be reclaimed via
		// reference counting rather than closed eagerly here (readers may
		// still hold it).
		err = immutable.close()
		utils.Panic(err)
	}
	if len(lsm.immutables) != 0 {
		// TODO: reset the immutables queue. Could be optimized to save memory,
		// e.g. by reusing the slice or capping the immutable list at a fixed size.
		lsm.immutables = make([]*memTable, 0)
	}
	return err
}
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // Copyright 2021 hardcore-os Project Authors 16 | // 17 | // Licensed under the Apache License, Version 2.0 (the "License") 18 | // you may not use this file except in compliance with the License. 19 | // You may obtain a copy of the License at 20 | // 21 | // http://www.apache.org/licenses/LICENSE-2.0 22 | // 23 | // Unless required by applicable law or agreed to in writing, software 24 | // distributed under the License is distributed on an "AS IS" BASIS, 25 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 26 | // See the License for the specific language governing permissions and 27 | // limitations under the License. 28 | 29 | package utils 30 | 31 | import ( 32 | "errors" 33 | "fmt" 34 | "os" 35 | "path" 36 | "path/filepath" 37 | "runtime" 38 | "strconv" 39 | "strings" 40 | ) 41 | 42 | var ( 43 | gopath = path.Join(os.Getenv("GOPATH"), "src") + "/" 44 | ) 45 | 46 | // NotFoundKey 找不到key 47 | var ( 48 | // ErrKeyNotFound is returned when key isn't found on a txn.Get. 49 | ErrKeyNotFound = errors.New("Key not found") 50 | // ErrEmptyKey is returned if an empty key is passed on an update function. 51 | ErrEmptyKey = errors.New("Key cannot be empty") 52 | // ErrReWriteFailure reWrite failure 53 | ErrReWriteFailure = errors.New("reWrite failure") 54 | // ErrBadMagic bad magic 55 | ErrBadMagic = errors.New("bad magic") 56 | // ErrBadChecksum bad check sum 57 | ErrBadChecksum = errors.New("bad check sum") 58 | // ErrChecksumMismatch is returned at checksum mismatch. 
59 | ErrChecksumMismatch = errors.New("checksum mismatch") 60 | 61 | ErrTruncate = errors.New("Do truncate") 62 | ErrStop = errors.New("Stop") 63 | 64 | // compact 65 | ErrFillTables = errors.New("Unable to fill tables") 66 | 67 | ErrBlockedWrites = errors.New("Writes are blocked, possibly due to DropAll or Close") 68 | ErrTxnTooBig = errors.New("Txn is too big to fit into one request") 69 | ErrDeleteVlogFile = errors.New("Delete vlog file") 70 | ErrNoRoom = errors.New("No room for write") 71 | 72 | // ErrInvalidRequest is returned if the user request is invalid. 73 | ErrInvalidRequest = errors.New("Invalid request") 74 | // ErrNoRewrite is returned if a call for value log GC doesn't result in a log file rewrite. 75 | ErrNoRewrite = errors.New("Value log GC attempt didn't result in any cleanup") 76 | 77 | // ErrRejected is returned if a value log GC is called either while another GC is running, or 78 | // after DB::Close has been called. 79 | ErrRejected = errors.New("Value log GC request rejected") 80 | ) 81 | 82 | // Panic 如果err 不为nil 则panicc 83 | func Panic(err error) { 84 | if err != nil { 85 | panic(err) 86 | } 87 | } 88 | 89 | // Panic2 _ 90 | func Panic2(_ interface{}, err error) { 91 | Panic(err) 92 | } 93 | 94 | // Err err 95 | func Err(err error) error { 96 | if err != nil { 97 | fmt.Printf("%s %s\n", location(2, true), err) 98 | } 99 | return err 100 | } 101 | 102 | // WarpErr err 103 | func WarpErr(format string, err error) error { 104 | if err != nil { 105 | fmt.Printf("%s %s %s", format, location(2, true), err) 106 | } 107 | return err 108 | } 109 | func location(deep int, fullPath bool) string { 110 | _, file, line, ok := runtime.Caller(deep) 111 | if !ok { 112 | file = "???" 
113 | line = 0 114 | } 115 | 116 | if fullPath { 117 | if strings.HasPrefix(file, gopath) { 118 | file = file[len(gopath):] 119 | } 120 | } else { 121 | file = filepath.Base(file) 122 | } 123 | return file + ":" + strconv.Itoa(line) 124 | } 125 | 126 | // CondPanic e 127 | func CondPanic(condition bool, err error) { 128 | if condition { 129 | Panic(err) 130 | } 131 | } 132 | -------------------------------------------------------------------------------- /utils/wal.go: -------------------------------------------------------------------------------- 1 | // Copyright 2021 logicrec Project Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License") 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
// WalHeader is the fixed metadata written in front of every WAL record.
// All four fields are encoded as uvarints, so the on-disk size varies
// per entry (at most maxHeaderSize bytes).
type WalHeader struct {
	KeyLen    uint32
	ValueLen  uint32
	Meta      byte
	ExpiresAt uint64
}

// Encode serializes the header into out as four consecutive uvarints
// (key length, value length, meta byte, expiry) and returns the number
// of bytes written. out must be at least maxHeaderSize bytes long.
func (h WalHeader) Encode(out []byte) int {
	n := binary.PutUvarint(out, uint64(h.KeyLen))
	n += binary.PutUvarint(out[n:], uint64(h.ValueLen))
	n += binary.PutUvarint(out[n:], uint64(h.Meta))
	n += binary.PutUvarint(out[n:], h.ExpiresAt)
	return n
}
// HashReader wraps an io.Reader and maintains a running CRC32 (Castagnoli)
// checksum plus a count of bytes consumed, so WAL readers can verify the
// trailing checksum after decoding a record.
type HashReader struct {
	R         io.Reader
	H         hash.Hash32
	BytesRead int // Number of bytes read.
}

// NewHashReader returns a HashReader over r with a fresh Castagnoli CRC32.
func NewHashReader(r io.Reader) *HashReader {
	hash := crc32.New(CastagnoliCrcTable)
	return &HashReader{
		R: r,
		H: hash,
	}
}

// Read reads len(p) bytes from the reader. Returns the number of bytes read, error on failure.
// Every byte successfully read is also fed into the running hash and counted
// in BytesRead. On a read error the bytes are NOT hashed or counted.
func (t *HashReader) Read(p []byte) (int, error) {
	n, err := t.R.Read(p)
	if err != nil {
		return n, err
	}
	t.BytesRead += n
	// hash.Hash.Write never returns an error, so this returns (n, nil).
	return t.H.Write(p[:n])
}

// ReadByte reads exactly one byte from the reader. Returns error on failure.
// Satisfies io.ByteReader so binary.ReadUvarint can decode headers from t.
// NOTE(review): a reader that returns (0, nil) would yield a spurious zero
// byte here — presumably underlying readers never do; confirm.
func (t *HashReader) ReadByte() (byte, error) {
	b := make([]byte, 1)
	_, err := t.Read(b)
	return b[0], err
}

// Sum32 returns the sum32 of the underlying hash.
func (t *HashReader) Sum32() uint32 {
	return t.H.Sum32()
}
// VlogFilePath builds the path of a value-log file inside dirPath: the fid is
// zero-padded to five digits and given the ".vlog" extension, e.g. "00042.vlog".
func VlogFilePath(dirPath string, fid uint32) string {
	return fmt.Sprintf("%s%c%05d.vlog", dirPath, os.PathSeparator, fid)
}
71 | func SyncDir(dir string) error { 72 | f, err := openDir(dir) 73 | if err != nil { 74 | return errors.Wrapf(err, "While opening directory: %s.", dir) 75 | } 76 | err = f.Sync() 77 | closeErr := f.Close() 78 | if err != nil { 79 | return errors.Wrapf(err, "While syncing directory: %s.", dir) 80 | } 81 | return errors.Wrapf(closeErr, "While closing directory: %s.", dir) 82 | } 83 | 84 | // LoadIDMap Get the id of all sst files in the current folder 85 | func LoadIDMap(dir string) map[uint64]struct{} { 86 | fileInfos, err := ioutil.ReadDir(dir) 87 | Err(err) 88 | idMap := make(map[uint64]struct{}) 89 | for _, info := range fileInfos { 90 | if info.IsDir() { 91 | continue 92 | } 93 | fileID := FID(info.Name()) 94 | if fileID != 0 { 95 | idMap[fileID] = struct{}{} 96 | } 97 | } 98 | return idMap 99 | } 100 | 101 | // CompareKeys checks the key without timestamp and checks the timestamp if keyNoTs 102 | // is same. 103 | // a would be sorted higher than aa if we use bytes.compare 104 | // All keys should have timestamp. 
// CompareKeys checks the key without timestamp and checks the timestamp if keyNoTs
// is same.
// a would be sorted higher than aa if we use bytes.compare
// All keys should have timestamp.
// Both keys must carry an 8-byte timestamp suffix; anything 8 bytes or
// shorter panics (via CondPanic), since stripping the suffix would be invalid.
func CompareKeys(key1, key2 []byte) int {
	CondPanic((len(key1) <= 8 || len(key2) <= 8), fmt.Errorf("%s,%s < 8", string(key1), string(key2)))
	// Compare the user keys first (everything before the 8-byte suffix)...
	if cmp := bytes.Compare(key1[:len(key1)-8], key2[:len(key2)-8]); cmp != 0 {
		return cmp
	}
	// ...and only break ties on the timestamp suffix itself.
	return bytes.Compare(key1[len(key1)-8:], key2[len(key2)-8:])
}
14 | package utils 15 | 16 | import ( 17 | "testing" 18 | ) 19 | 20 | func (f Filter) String() string { 21 | s := make([]byte, 8*len(f)) 22 | for i, x := range f { 23 | for j := 0; j < 8; j++ { 24 | if x&(1<> 0) 81 | b[1] = uint8(uint32(i) >> 8) 82 | b[2] = uint8(uint32(i) >> 16) 83 | b[3] = uint8(uint32(i) >> 24) 84 | return b 85 | } 86 | 87 | nMediocreFilters, nGoodFilters := 0, 0 88 | loop: 89 | for length := 1; length <= 10000; length = nextLength(length) { 90 | keys := make([][]byte, 0, length) 91 | for i := 0; i < length; i++ { 92 | keys = append(keys, le32(i)) 93 | } 94 | var hashes []uint32 95 | for _, key := range keys { 96 | hashes = append(hashes, Hash(key)) 97 | } 98 | f := NewFilter(hashes, 10) 99 | 100 | if len(f) > (length*10/8)+40 { 101 | t.Errorf("length=%d: len(f)=%d is too large", length, len(f)) 102 | continue 103 | } 104 | 105 | // All added keys must match. 106 | for _, key := range keys { 107 | if !f.MayContainKey(key) { 108 | t.Errorf("length=%d: did not contain key %q", length, key) 109 | continue loop 110 | } 111 | } 112 | 113 | // Check false positive rate. 114 | nFalsePositive := 0 115 | for i := 0; i < 10000; i++ { 116 | if f.MayContainKey(le32(1e9 + i)) { 117 | nFalsePositive++ 118 | } 119 | } 120 | if nFalsePositive > 0.02*10000 { 121 | t.Errorf("length=%d: %d false positives in 10000", length, nFalsePositive) 122 | continue 123 | } 124 | if nFalsePositive > 0.0125*10000 { 125 | nMediocreFilters++ 126 | } else { 127 | nGoodFilters++ 128 | } 129 | } 130 | 131 | if nMediocreFilters > nGoodFilters/5 { 132 | t.Errorf("%d mediocre filters but only %d good filters", nMediocreFilters, nGoodFilters) 133 | } 134 | } 135 | 136 | func TestHash(t *testing.T) { 137 | // The magic want numbers come from running the C++ leveldb code in hash.cc. 
138 | testCases := []struct { 139 | s string 140 | want uint32 141 | }{ 142 | {"", 0xbc9f1d34}, 143 | {"g", 0xd04a8bda}, 144 | {"go", 0x3e0b0745}, 145 | {"gop", 0x0c326610}, 146 | {"goph", 0x8c9d6390}, 147 | {"gophe", 0x9bfd4b0a}, 148 | {"gopher", 0xa78edc7c}, 149 | {"I had a dream it would end this way.", 0xe14a9db9}, 150 | } 151 | for _, tc := range testCases { 152 | if got := Hash([]byte(tc.s)); got != tc.want { 153 | t.Errorf("s=%q: got 0x%08x, want 0x%08x", tc.s, got, tc.want) 154 | } 155 | } 156 | } 157 | -------------------------------------------------------------------------------- /utils/value.go: -------------------------------------------------------------------------------- 1 | // Copyright 2021 hardcore-os Project Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License") 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package utils 16 | 17 | import ( 18 | "encoding/binary" 19 | "reflect" 20 | "time" 21 | "unsafe" 22 | ) 23 | 24 | const ( 25 | // size of vlog header. 
const (
	// ValueLogHeaderSize is the size of the vlog file header:
	// +----------------+------------------+
	// | keyID(8 bytes) | baseIV(12 bytes)|
	// +----------------+------------------+
	ValueLogHeaderSize = 20
	vptrSize           = unsafe.Sizeof(ValuePtr{})
)

// ValuePtr locates a value inside the value log: which file (Fid), where the
// record starts (Offset) and how many bytes it spans (Len).
type ValuePtr struct {
	Len    uint32
	Offset uint32
	Fid    uint32
}

// Less reports whether p orders strictly before o, comparing by Fid, then
// Offset, then Len. A nil o never compares as greater.
func (p ValuePtr) Less(o *ValuePtr) bool {
	switch {
	case o == nil:
		return false
	case p.Fid != o.Fid:
		return p.Fid < o.Fid
	case p.Offset != o.Offset:
		return p.Offset < o.Offset
	default:
		return p.Len < o.Len
	}
}

// IsZero reports whether p is the zero pointer (no file, offset or length).
func (p ValuePtr) IsZero() bool {
	return p.Fid == 0 && p.Offset == 0 && p.Len == 0
}

// Encode encodes Pointer into byte buffer: the struct is copied verbatim
// (native layout) into a fresh slice of vptrSize bytes.
func (p ValuePtr) Encode() []byte {
	out := make([]byte, vptrSize)
	*(*ValuePtr)(unsafe.Pointer(&out[0])) = p
	return out
}
// BytesToU32 decodes the first four bytes of b as a big-endian uint32.
func BytesToU32(b []byte) uint32 {
	return binary.BigEndian.Uint32(b)
}

// BytesToU64 decodes the first eight bytes of b as a big-endian uint64.
func BytesToU64(b []byte) uint64 {
	return binary.BigEndian.Uint64(b)
}

// U32SliceToBytes reinterprets u32s as a byte slice without copying.
// The result aliases the same memory in native byte order; it is only
// valid while u32s is alive.
func U32SliceToBytes(u32s []uint32) []byte {
	if len(u32s) == 0 {
		return nil
	}
	var out []byte
	header := (*reflect.SliceHeader)(unsafe.Pointer(&out))
	header.Len = len(u32s) * 4
	header.Cap = header.Len
	header.Data = uintptr(unsafe.Pointer(&u32s[0]))
	return out
}

// U32ToBytes encodes v as four big-endian bytes.
func U32ToBytes(v uint32) []byte {
	var out [4]byte
	binary.BigEndian.PutUint32(out[:], v)
	return out[:]
}

// U64ToBytes encodes v as eight big-endian bytes.
func U64ToBytes(v uint64) []byte {
	var out [8]byte
	binary.BigEndian.PutUint64(out[:], v)
	return out[:]
}

// BytesToU32Slice reinterprets b as a []uint32 without copying.
// The result aliases b's memory in native byte order; b's length should be
// a multiple of four (any remainder bytes are dropped).
func BytesToU32Slice(b []byte) []uint32 {
	if len(b) == 0 {
		return nil
	}
	var out []uint32
	header := (*reflect.SliceHeader)(unsafe.Pointer(&out))
	header.Len = len(b) / 4
	header.Cap = header.Len
	header.Data = uintptr(unsafe.Pointer(&b[0]))
	return out
}
<= uint64(time.Now().Unix()) 143 | } 144 | 145 | func DiscardEntry(e, vs *Entry) bool { 146 | // TODO 版本这个信息应该被弱化掉 在后面上MVCC或者多版本查询的时候再考虑 147 | // if vs.Version != ParseTs(e.Key) { 148 | // // Version not found. Discard. 149 | // return true 150 | // } 151 | if IsDeletedOrExpired(vs.Meta, vs.ExpiresAt) { 152 | return true 153 | } 154 | if (vs.Meta & BitValuePointer) == 0 { 155 | // Key also stores the value in LSM. Discard. 156 | return true 157 | } 158 | return false 159 | } 160 | -------------------------------------------------------------------------------- /vlog_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2021 hardcore-os Project Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License") 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | package corekv 15 | 16 | import ( 17 | "bytes" 18 | "math/rand" 19 | "os" 20 | "testing" 21 | 22 | "github.com/hardcore-os/corekv/utils" 23 | "github.com/stretchr/testify/require" 24 | ) 25 | 26 | var ( 27 | // 初始化opt 28 | opt = &Options{ 29 | WorkDir: "./work_test", 30 | SSTableMaxSz: 1 << 10, 31 | MemTableSize: 1 << 10, 32 | ValueLogFileSize: 1 << 20, 33 | ValueThreshold: 0, 34 | MaxBatchCount: 10, 35 | MaxBatchSize: 1 << 20, 36 | } 37 | ) 38 | 39 | func TestVlogBase(t *testing.T) { 40 | // 清理目录 41 | clearDir() 42 | // 打开DB 43 | db := Open(opt) 44 | defer db.Close() 45 | log := db.vlog 46 | var err error 47 | // 创建一个简单的kv entry对象 48 | const val1 = "sampleval012345678901234567890123" 49 | const val2 = "samplevalb012345678901234567890123" 50 | require.True(t, int64(len(val1)) >= db.opt.ValueThreshold) 51 | 52 | e1 := &utils.Entry{ 53 | Key: []byte("samplekey"), 54 | Value: []byte(val1), 55 | Meta: utils.BitValuePointer, 56 | } 57 | e2 := &utils.Entry{ 58 | Key: []byte("samplekeyb"), 59 | Value: []byte(val2), 60 | Meta: utils.BitValuePointer, 61 | } 62 | 63 | // 构建一个批量请求的request 64 | b := new(request) 65 | b.Entries = []*utils.Entry{e1, e2} 66 | 67 | // 直接写入vlog中 68 | log.write([]*request{b}) 69 | require.Len(t, b.Ptrs, 2) 70 | t.Logf("Pointer written: %+v %+v\n", b.Ptrs[0], b.Ptrs[1]) 71 | 72 | // 从vlog中使用 value ptr指针中查询写入的分段vlog文件 73 | buf1, lf1, err1 := log.readValueBytes(b.Ptrs[0]) 74 | buf2, lf2, err2 := log.readValueBytes(b.Ptrs[1]) 75 | require.NoError(t, err1) 76 | require.NoError(t, err2) 77 | // 关闭会调的锁 78 | defer utils.RunCallback(log.getUnlockCallback(lf1)) 79 | defer utils.RunCallback((log.getUnlockCallback(lf2))) 80 | e1, err = lf1.DecodeEntry(buf1, b.Ptrs[0].Offset) 81 | require.NoError(t, err) 82 | // 从vlog文件中通过指指针反序列化回 entry对象 83 | e2, err = lf1.DecodeEntry(buf2, b.Ptrs[1].Offset) 84 | require.NoError(t, err) 85 | 86 | // 比较entry对象是否相等 87 | readEntries := []utils.Entry{*e1, *e2} 88 | require.EqualValues(t, []utils.Entry{ 89 | { 90 | Key: 
[]byte("samplekey"), 91 | Value: []byte(val1), 92 | Meta: utils.BitValuePointer, 93 | Offset: b.Ptrs[0].Offset, 94 | }, 95 | { 96 | Key: []byte("samplekeyb"), 97 | Value: []byte(val2), 98 | Meta: utils.BitValuePointer, 99 | Offset: b.Ptrs[1].Offset, 100 | }, 101 | }, readEntries) 102 | } 103 | 104 | func clearDir() { 105 | _, err := os.Stat(opt.WorkDir) 106 | if err == nil { 107 | os.RemoveAll(opt.WorkDir) 108 | } 109 | os.Mkdir(opt.WorkDir, os.ModePerm) 110 | } 111 | 112 | func TestValueGC(t *testing.T) { 113 | clearDir() 114 | opt.ValueLogFileSize = 1 << 20 115 | kv := Open(opt) 116 | defer kv.Close() 117 | sz := 32 << 10 118 | kvList := []*utils.Entry{} 119 | for i := 0; i < 100; i++ { 120 | e := newRandEntry(sz) 121 | kvList = append(kvList, &utils.Entry{ 122 | Key: e.Key, 123 | Value: e.Value, 124 | Meta: e.Meta, 125 | ExpiresAt: e.ExpiresAt, 126 | }) 127 | require.NoError(t, kv.Set(e)) 128 | } 129 | kv.RunValueLogGC(0.9) 130 | for _, e := range kvList { 131 | item, err := kv.Get(e.Key) 132 | require.NoError(t, err) 133 | val := getItemValue(t, item) 134 | require.NotNil(t, val) 135 | require.True(t, bytes.Equal(item.Key, e.Key), "key not equal: e:%s, v:%s", e.Key, item.Key) 136 | require.True(t, bytes.Equal(item.Value, e.Value), "value not equal: e:%s, v:%s", e.Value, item.Key) 137 | } 138 | } 139 | 140 | func newRandEntry(sz int) *utils.Entry { 141 | v := make([]byte, sz) 142 | rand.Read(v[:rand.Intn(sz)]) 143 | e := utils.BuildEntry() 144 | e.Value = v 145 | return e 146 | } 147 | func getItemValue(t *testing.T, item *utils.Entry) (val []byte) { 148 | t.Helper() 149 | if item == nil { 150 | return nil 151 | } 152 | var v []byte 153 | v = append(v, item.Value...) 
154 | if v == nil { 155 | return nil 156 | } 157 | return v 158 | } 159 | -------------------------------------------------------------------------------- /utils/skiplist_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2021 hardcore-os Project Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License") 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package utils 16 | 17 | import ( 18 | "fmt" 19 | "strconv" 20 | "strings" 21 | "sync" 22 | "testing" 23 | 24 | "github.com/stretchr/testify/assert" 25 | "github.com/stretchr/testify/require" 26 | ) 27 | 28 | func RandString(len int) string { 29 | bytes := make([]byte, len) 30 | for i := 0; i < len; i++ { 31 | b := r.Intn(26) + 65 32 | bytes[i] = byte(b) 33 | } 34 | return string(bytes) 35 | } 36 | 37 | func TestSkipListBasicCRUD(t *testing.T) { 38 | list := NewSkiplist(1000) 39 | 40 | //Put & Get 41 | entry1 := NewEntry([]byte(RandString(10)), []byte("Val1")) 42 | list.Add(entry1) 43 | vs := list.Search(entry1.Key) 44 | assert.Equal(t, entry1.Value, vs.Value) 45 | 46 | entry2 := NewEntry([]byte(RandString(10)), []byte("Val2")) 47 | list.Add(entry2) 48 | vs = list.Search(entry2.Key) 49 | assert.Equal(t, entry2.Value, vs.Value) 50 | 51 | //Get a not exist entry 52 | assert.Nil(t, list.Search([]byte(RandString(10))).Value) 53 | 54 | //Update a entry 55 | entry2_new := NewEntry(entry1.Key, []byte("Val1+1")) 56 | list.Add(entry2_new) 57 | assert.Equal(t, 
entry2_new.Value, list.Search(entry2_new.Key).Value) 58 | } 59 | 60 | func Benchmark_SkipListBasicCRUD(b *testing.B) { 61 | list := NewSkiplist(100000000) 62 | key, val := "", "" 63 | maxTime := 1000 64 | for i := 0; i < maxTime; i++ { 65 | //number := rand.Intn(10000) 66 | key, val = RandString(10), fmt.Sprintf("Val%d", i) 67 | entry := NewEntry([]byte(key), []byte(val)) 68 | list.Add(entry) 69 | searchVal := list.Search([]byte(key)) 70 | assert.Equal(b, searchVal.Value, []byte(val)) 71 | } 72 | } 73 | 74 | func TestDrawList(t *testing.T) { 75 | list := NewSkiplist(1000) 76 | n := 12 77 | for i:=0; i>= 7 58 | if x == 0 { 59 | break 60 | } 61 | } 62 | return n 63 | } 64 | 65 | //Entry _ 最外层写入的结构体 66 | type Entry struct { 67 | Key []byte 68 | Value []byte 69 | ExpiresAt uint64 70 | 71 | Meta byte 72 | Version uint64 73 | Offset uint32 74 | Hlen int // Length of the header. 75 | ValThreshold int64 76 | } 77 | 78 | // NewEntry_ 79 | func NewEntry(key, value []byte) *Entry { 80 | return &Entry{ 81 | Key: key, 82 | Value: value, 83 | } 84 | } 85 | 86 | // Entry_ 87 | func (e *Entry) Entry() *Entry { 88 | return e 89 | } 90 | 91 | func (e *Entry) IsDeletedOrExpired() bool { 92 | if e.Value == nil { 93 | return true 94 | } 95 | 96 | if e.ExpiresAt == 0 { 97 | return false 98 | } 99 | 100 | return e.ExpiresAt <= uint64(time.Now().Unix()) 101 | } 102 | 103 | // WithTTL _ 104 | func (e *Entry) WithTTL(dur time.Duration) *Entry { 105 | e.ExpiresAt = uint64(time.Now().Add(dur).Unix()) 106 | return e 107 | } 108 | 109 | // EncodedSize is the size of the ValueStruct when encoded 110 | func (e *Entry) EncodedSize() uint32 { 111 | sz := len(e.Value) 112 | enc := sizeVarint(uint64(e.Meta)) 113 | enc += sizeVarint(e.ExpiresAt) 114 | return uint32(sz + enc) 115 | } 116 | 117 | // EstimateSize 118 | func (e *Entry) EstimateSize(threshold int) int { 119 | // TODO: 是否考虑 user meta? 
120 | if len(e.Value) < threshold { 121 | return len(e.Key) + len(e.Value) + 1 // Meta 122 | } 123 | return len(e.Key) + 12 + 1 // 12 for ValuePointer, 2 for meta. 124 | } 125 | 126 | // header 对象 127 | // header is used in value log as a header before Entry. 128 | type Header struct { 129 | KLen uint32 130 | VLen uint32 131 | ExpiresAt uint64 132 | Meta byte 133 | } 134 | 135 | // +------+----------+------------+--------------+-----------+ 136 | // | Meta | UserMeta | Key Length | Value Length | ExpiresAt | 137 | // +------+----------+------------+--------------+-----------+ 138 | func (h Header) Encode(out []byte) int { 139 | out[0] = h.Meta 140 | index := 1 141 | index += binary.PutUvarint(out[index:], uint64(h.KLen)) 142 | index += binary.PutUvarint(out[index:], uint64(h.VLen)) 143 | index += binary.PutUvarint(out[index:], h.ExpiresAt) 144 | return index 145 | } 146 | 147 | // Decode decodes the given header from the provided byte slice. 148 | // Returns the number of bytes read. 149 | func (h *Header) Decode(buf []byte) int { 150 | h.Meta = buf[0] 151 | index := 1 152 | klen, count := binary.Uvarint(buf[index:]) 153 | h.KLen = uint32(klen) 154 | index += count 155 | vlen, count := binary.Uvarint(buf[index:]) 156 | h.VLen = uint32(vlen) 157 | index += count 158 | h.ExpiresAt, count = binary.Uvarint(buf[index:]) 159 | return index + count 160 | } 161 | 162 | // DecodeFrom reads the header from the hashReader. 163 | // Returns the number of bytes read. 
164 | func (h *Header) DecodeFrom(reader *HashReader) (int, error) { 165 | var err error 166 | h.Meta, err = reader.ReadByte() 167 | if err != nil { 168 | return 0, err 169 | } 170 | klen, err := binary.ReadUvarint(reader) 171 | if err != nil { 172 | return 0, err 173 | } 174 | h.KLen = uint32(klen) 175 | vlen, err := binary.ReadUvarint(reader) 176 | if err != nil { 177 | return 0, err 178 | } 179 | h.VLen = uint32(vlen) 180 | h.ExpiresAt, err = binary.ReadUvarint(reader) 181 | if err != nil { 182 | return 0, err 183 | } 184 | return reader.BytesRead, nil 185 | } 186 | -------------------------------------------------------------------------------- /utils/cache/bloom.go: -------------------------------------------------------------------------------- 1 | // Copyright 2021 hardcore-os Project Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License") 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package cache 16 | 17 | import "math" 18 | 19 | // Filter is an encoded set of []byte keys. 20 | type Filter []byte 21 | 22 | type BloomFilter struct { 23 | bitmap Filter 24 | k uint8 25 | } 26 | 27 | // MayContainKey _ 28 | func (f *BloomFilter) MayContainKey(k []byte) bool { 29 | return f.MayContain(Hash(k)) 30 | } 31 | 32 | // MayContain returns whether the filter may contain given key. False positives 33 | // are possible, where it returns true for keys not in the original set. 
34 | func (f *BloomFilter) MayContain(h uint32) bool { 35 | if f.Len() < 2 { 36 | return false 37 | } 38 | k := f.k 39 | if k > 30 { 40 | // This is reserved for potentially new encodings for short Bloom filters. 41 | // Consider it a match. 42 | return true 43 | } 44 | nBits := uint32(8 * (f.Len() - 1)) 45 | delta := h>>17 | h<<15 46 | for j := uint8(0); j < k; j++ { 47 | bitPos := h % nBits 48 | if f.bitmap[bitPos/8]&(1<<(bitPos%8)) == 0 { 49 | return false 50 | } 51 | h += delta 52 | } 53 | return true 54 | } 55 | 56 | func (f *BloomFilter) Len() int32 { 57 | return int32(len(f.bitmap)) 58 | } 59 | 60 | func (f *BloomFilter) InsertKey(k []byte) bool { 61 | return f.Insert(Hash(k)) 62 | } 63 | 64 | func (f *BloomFilter) Insert(h uint32) bool { 65 | if f.Len() < 2 { return false } // degenerate filter has no bit space; mirrors MayContain and avoids h % 0 panic below 66 | k := f.k 67 | if k > 30 { 68 | // Reserved for potentially new encodings for short Bloom filters; consider it a match. 69 | return true 70 | } 71 | nBits := uint32(8 * (f.Len() - 1)) 72 | delta := h>>17 | h<<15 73 | for j := uint8(0); j < k; j++ { 74 | bitPos := h % nBits 75 | f.bitmap[bitPos/8] |= 1 << (bitPos % 8) 76 | h += delta 77 | } 78 | return true 79 | } 80 | 81 | func (f *BloomFilter) AllowKey(k []byte) bool { 82 | if f == nil { 83 | return true 84 | } 85 | already := f.MayContainKey(k) 86 | if !already { 87 | f.InsertKey(k) 88 | } 89 | return already 90 | } 91 | 92 | func (f *BloomFilter) Allow(h uint32) bool { 93 | if f == nil { 94 | return true 95 | } 96 | already := f.MayContain(h) 97 | if !already { 98 | f.Insert(h) 99 | } 100 | return already 101 | } 102 | 103 | func (f *BloomFilter) reset() { 104 | if f == nil { 105 | return 106 | } 107 | for i := range f.bitmap { 108 | f.bitmap[i] = 0 109 | } 110 | } 111 | 112 | // NewFilter returns a new Bloom filter that encodes a set of []byte keys with 113 | // the given number of bits per key, approximately. 114 | // 115 | // A good bitsPerKey value is 10, which yields a filter with ~ 1% false 116 | // positive rate. 
117 | func newFilter(numEntries int, falsePositive float64) *BloomFilter { 118 | bitsPerKey := bloomBitsPerKey(numEntries, falsePositive) 119 | return initFilter(numEntries, bitsPerKey) 120 | } 121 | 122 | // BloomBitsPerKey returns the bits per key required by bloomfilter based on 123 | // the false positive rate. 124 | func bloomBitsPerKey(numEntries int, fp float64) int { 125 | size := -1 * float64(numEntries) * math.Log(fp) / math.Pow(float64(0.69314718056), 2) 126 | locs := math.Ceil(size / float64(numEntries)) 127 | return int(locs) 128 | } 129 | 130 | func initFilter(numEntries int, bitsPerKey int) *BloomFilter { 131 | bf := &BloomFilter{} 132 | if bitsPerKey < 0 { 133 | bitsPerKey = 0 134 | } 135 | // 0.69 is approximately ln(2). 136 | k := uint32(float64(bitsPerKey) * 0.69) 137 | if k < 1 { 138 | k = 1 139 | } 140 | if k > 30 { 141 | k = 30 142 | } 143 | bf.k = uint8(k) 144 | 145 | nBits := numEntries * int(bitsPerKey) 146 | // For small len(keys), we can see a very high false positive rate. Fix it 147 | // by enforcing a minimum bloom filter length. 148 | if nBits < 64 { 149 | nBits = 64 150 | } 151 | nBytes := (nBits + 7) / 8 152 | nBits = nBytes * 8 153 | filter := make([]byte, nBytes+1) 154 | 155 | //record the K value of this Bloom Filter 156 | filter[nBytes] = uint8(k) 157 | 158 | bf.bitmap = filter 159 | return bf 160 | } 161 | 162 | // Hash implements a hashing algorithm similar to the Murmur hash. 
163 | func Hash(b []byte) uint32 { 164 | const ( 165 | seed = 0xbc9f1d34 166 | m = 0xc6a4a793 167 | ) 168 | h := uint32(seed) ^ uint32(len(b))*m 169 | for ; len(b) >= 4; b = b[4:] { 170 | h += uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24 171 | h *= m 172 | h ^= h >> 16 173 | } 174 | switch len(b) { 175 | case 3: 176 | h += uint32(b[2]) << 16 177 | fallthrough 178 | case 2: 179 | h += uint32(b[1]) << 8 180 | fallthrough 181 | case 1: 182 | h += uint32(b[0]) 183 | h *= m 184 | h ^= h >> 24 185 | } 186 | return h 187 | } 188 | -------------------------------------------------------------------------------- /file/wal.go: -------------------------------------------------------------------------------- 1 | // Copyright 2021 hardcore-os Project Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License") 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package file 16 | 17 | import ( 18 | "bufio" 19 | "bytes" 20 | "fmt" 21 | "hash/crc32" 22 | "io" 23 | "os" 24 | "sync" 25 | 26 | "github.com/hardcore-os/corekv/utils" 27 | "github.com/pkg/errors" 28 | ) 29 | 30 | // WalFile _ 31 | type WalFile struct { 32 | lock *sync.RWMutex 33 | f *MmapFile 34 | opts *Options 35 | buf *bytes.Buffer 36 | size uint32 37 | writeAt uint32 38 | } 39 | 40 | // Fid _ 41 | func (wf *WalFile) Fid() uint64 { 42 | return wf.opts.FID 43 | } 44 | 45 | // Close _ 46 | func (wf *WalFile) Close() error { 47 | fileName := wf.f.Fd.Name() 48 | if err := wf.f.Close(); err != nil { 49 | return err 50 | } 51 | return os.Remove(fileName) 52 | } 53 | 54 | // Name _ 55 | func (wf *WalFile) Name() string { 56 | return wf.f.Fd.Name() 57 | } 58 | 59 | // Size 当前已经被写入的数据 60 | func (wf *WalFile) Size() uint32 { 61 | return wf.writeAt 62 | } 63 | 64 | // OpenWalFile _ 65 | func OpenWalFile(opt *Options) *WalFile { 66 | omf, err := OpenMmapFile(opt.FileName, os.O_CREATE|os.O_RDWR, opt.MaxSz) 67 | wf := &WalFile{f: omf, lock: &sync.RWMutex{}, opts: opt} 68 | wf.buf = &bytes.Buffer{} 69 | wf.size = uint32(len(wf.f.Data)) 70 | utils.Err(err) 71 | return wf 72 | } 73 | 74 | func (wf *WalFile) Write(entry *utils.Entry) error { 75 | // 落预写日志简单的同步写即可 76 | // 序列化为磁盘结构 77 | wf.lock.Lock() 78 | plen := utils.WalCodec(wf.buf, entry) 79 | buf := wf.buf.Bytes() 80 | utils.Panic(wf.f.AppendBuffer(wf.writeAt, buf)) 81 | wf.writeAt += uint32(plen) 82 | wf.lock.Unlock() 83 | return nil 84 | } 85 | 86 | // Iterate 从磁盘中遍历wal,获得数据 87 | func (wf *WalFile) Iterate(readOnly bool, offset uint32, fn utils.LogEntry) (uint32, error) { 88 | // For now, read directly from file, because it allows 89 | reader := bufio.NewReader(wf.f.NewReader(int(offset))) 90 | read := SafeRead{ 91 | K: make([]byte, 10), 92 | V: make([]byte, 10), 93 | RecordOffset: offset, 94 | LF: wf, 95 | } 96 | var validEndOffset uint32 = offset 97 | loop: 98 | for { 99 | e, err := read.MakeEntry(reader) 100 | 
switch { 101 | case err == io.EOF: 102 | break loop 103 | case err == io.ErrUnexpectedEOF || err == utils.ErrTruncate: 104 | break loop 105 | case err != nil: 106 | return 0, err 107 | case e.IsZero(): 108 | break loop 109 | } 110 | 111 | var vp utils.ValuePtr // 给kv分离的设计留下扩展,可以不用考虑其作用 112 | size := uint32(int(e.LogHeaderLen()) + len(e.Key) + len(e.Value) + crc32.Size) 113 | read.RecordOffset += size 114 | validEndOffset = read.RecordOffset 115 | if err := fn(e, &vp); err != nil { 116 | if err == utils.ErrStop { 117 | break 118 | } 119 | return 0, errors.WithMessage(err, "Iteration function") 120 | } 121 | } 122 | return validEndOffset, nil 123 | } 124 | 125 | // Truncate _ 126 | // TODO Truncate 函数 127 | func (wf *WalFile) Truncate(end int64) error { 128 | if end <= 0 { 129 | return nil 130 | } 131 | if fi, err := wf.f.Fd.Stat(); err != nil { 132 | return fmt.Errorf("while file.stat on file: %s, error: %v\n", wf.Name(), err) 133 | } else if fi.Size() == end { 134 | return nil 135 | } 136 | wf.size = uint32(end) 137 | return wf.f.Truncature(end) 138 | } 139 | 140 | // 封装kv分离的读操作 141 | type SafeRead struct { 142 | K []byte 143 | V []byte 144 | 145 | RecordOffset uint32 146 | LF *WalFile 147 | } 148 | 149 | // MakeEntry _ 150 | func (r *SafeRead) MakeEntry(reader io.Reader) (*utils.Entry, error) { 151 | tee := utils.NewHashReader(reader) 152 | var h utils.WalHeader 153 | hlen, err := h.Decode(tee) 154 | if err != nil { 155 | return nil, err 156 | } 157 | if h.KeyLen > uint32(1<<16) { // Key length must be below uint16. 
158 | return nil, utils.ErrTruncate 159 | } 160 | kl := int(h.KeyLen) 161 | if cap(r.K) < kl { 162 | r.K = make([]byte, 2*kl) 163 | } 164 | vl := int(h.ValueLen) 165 | if cap(r.V) < vl { 166 | r.V = make([]byte, 2*vl) 167 | } 168 | 169 | e := &utils.Entry{} 170 | e.Offset = r.RecordOffset 171 | e.Hlen = hlen 172 | buf := make([]byte, h.KeyLen+h.ValueLen) 173 | if _, err := io.ReadFull(tee, buf[:]); err != nil { 174 | if err == io.EOF { 175 | err = utils.ErrTruncate 176 | } 177 | return nil, err 178 | } 179 | e.Key = buf[:h.KeyLen] 180 | e.Value = buf[h.KeyLen:] 181 | var crcBuf [crc32.Size]byte 182 | if _, err := io.ReadFull(reader, crcBuf[:]); err != nil { 183 | if err == io.EOF { 184 | err = utils.ErrTruncate 185 | } 186 | return nil, err 187 | } 188 | crc := utils.BytesToU32(crcBuf[:]) 189 | if crc != tee.Sum32() { 190 | return nil, utils.ErrTruncate 191 | } 192 | e.ExpiresAt = h.ExpiresAt 193 | return e, nil 194 | } 195 | -------------------------------------------------------------------------------- /lsm/memtable.go: -------------------------------------------------------------------------------- 1 | // Copyright 2021 hardcore-os Project Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License") 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package lsm 16 | 17 | import ( 18 | "bytes" 19 | "fmt" 20 | "io/ioutil" 21 | "os" 22 | "path/filepath" 23 | "sort" 24 | "strconv" 25 | "strings" 26 | "sync/atomic" 27 | 28 | "github.com/hardcore-os/corekv/file" 29 | "github.com/hardcore-os/corekv/utils" 30 | "github.com/pkg/errors" 31 | ) 32 | 33 | const walFileExt string = ".wal" 34 | 35 | // MemTable 36 | type memTable struct { 37 | lsm *LSM 38 | wal *file.WalFile 39 | sl *utils.Skiplist 40 | buf *bytes.Buffer 41 | maxVersion uint64 42 | } 43 | 44 | // NewMemtable _ 45 | func (lsm *LSM) NewMemtable() *memTable { 46 | newFid := atomic.AddUint64(&(lsm.levels.maxFID), 1) 47 | fileOpt := &file.Options{ 48 | Dir: lsm.option.WorkDir, 49 | Flag: os.O_CREATE | os.O_RDWR, 50 | MaxSz: int(lsm.option.MemTableSize), //TODO wal 要设置多大比较合理? 姑且跟sst一样大 51 | FID: newFid, 52 | FileName: mtFilePath(lsm.option.WorkDir, newFid), 53 | } 54 | return &memTable{wal: file.OpenWalFile(fileOpt), sl: utils.NewSkiplist(int64(1 << 20)), lsm: lsm} 55 | } 56 | 57 | // Close 58 | func (m *memTable) close() error { 59 | if err := m.wal.Close(); err != nil { 60 | return err 61 | } 62 | 63 | return nil 64 | } 65 | 66 | func (m *memTable) set(entry *utils.Entry) error { 67 | // 写到wal 日志中,防止崩溃 68 | if err := m.wal.Write(entry); err != nil { 69 | return err 70 | } 71 | // 写到memtable中 72 | m.sl.Add(entry) 73 | return nil 74 | } 75 | 76 | func (m *memTable) Get(key []byte) (*utils.Entry, error) { 77 | // 索引检查当前的key是否在表中 O(1) 的时间复杂度 78 | // 从内存表中获取数据 79 | vs := m.sl.Search(key) 80 | 81 | e := &utils.Entry{ 82 | Key: key, 83 | Value: vs.Value, 84 | ExpiresAt: vs.ExpiresAt, 85 | Meta: vs.Meta, 86 | Version: vs.Version, 87 | } 88 | 89 | return e, nil 90 | 91 | } 92 | 93 | func (m *memTable) Size() int64 { 94 | return m.sl.MemSize() 95 | } 96 | 97 | //recovery 98 | func (lsm *LSM) recovery() (*memTable, []*memTable) { 99 | // 从 工作目录中获取所有文件 100 | files, err := ioutil.ReadDir(lsm.option.WorkDir) 101 | if err != nil { 102 | utils.Panic(err) 103 | return 
nil, nil 104 | } 105 | var fids []uint64 106 | maxFid := lsm.levels.maxFID 107 | // 识别 后缀为.wal的文件 108 | for _, file := range files { 109 | if !strings.HasSuffix(file.Name(), walFileExt) { 110 | continue 111 | } 112 | fsz := len(file.Name()) 113 | fid, err := strconv.ParseUint(file.Name()[:fsz-len(walFileExt)], 10, 64) 114 | if err != nil { // check the parse error before fid is used; on failure fid is 0 and must not feed maxFid/fids 115 | utils.Panic(err) 116 | return nil, nil 117 | } 118 | // 考虑 wal文件的存在 更新maxFid 119 | if maxFid < fid { 120 | maxFid = fid 121 | } 122 | fids = append(fids, fid) 123 | } 124 | // 排序一下子 125 | sort.Slice(fids, func(i, j int) bool { 126 | return fids[i] < fids[j] 127 | }) 128 | imms := []*memTable{} 129 | // 遍历fid 做处理 130 | for _, fid := range fids { 131 | mt, err := lsm.openMemTable(fid) 132 | utils.CondPanic(err != nil, err) 133 | if mt.sl.MemSize() == 0 { 134 | // mt.DecrRef() 135 | continue 136 | } 137 | // TODO 如果最后一个跳表没写满会怎么样?这不就浪费空间了吗 138 | imms = append(imms, mt) 139 | } 140 | // 更新最终的maxfid,初始化一定是串行执行的,因此不需要原子操作 141 | lsm.levels.maxFID = maxFid 142 | return lsm.NewMemtable(), imms 143 | } 144 | 145 | func (lsm *LSM) openMemTable(fid uint64) (*memTable, error) { 146 | fileOpt := &file.Options{ 147 | Dir: lsm.option.WorkDir, 148 | Flag: os.O_CREATE | os.O_RDWR, 149 | MaxSz: int(lsm.option.MemTableSize), 150 | FID: fid, 151 | FileName: mtFilePath(lsm.option.WorkDir, fid), 152 | } 153 | s := utils.NewSkiplist(int64(1 << 20)) 154 | mt := &memTable{ 155 | sl: s, 156 | buf: &bytes.Buffer{}, 157 | lsm: lsm, 158 | } 159 | mt.wal = file.OpenWalFile(fileOpt) 160 | err := mt.UpdateSkipList() 161 | utils.CondPanic(err != nil, errors.WithMessage(err, "while updating skiplist")) 162 | return mt, nil 163 | } 164 | func mtFilePath(dir string, fid uint64) string { 165 | return filepath.Join(dir, fmt.Sprintf("%05d%s", fid, walFileExt)) 166 | } 167 | 168 | func (m *memTable) UpdateSkipList() error { 169 | if m.wal == nil || m.sl == nil { 170 | return nil 171 | } 172 | endOff, err := m.wal.Iterate(true, 0, m.replayFunction(m.lsm.option)) 173 
| if err != nil { 174 | return errors.WithMessage(err, fmt.Sprintf("while iterating wal: %s", m.wal.Name())) 175 | } 176 | // if endOff < m.wal.Size() { 177 | // return errors.WithMessage(utils.ErrTruncate, fmt.Sprintf("end offset: %d < size: %d", endOff, m.wal.Size())) 178 | // } 179 | return m.wal.Truncate(int64(endOff)) 180 | } 181 | 182 | func (m *memTable) replayFunction(opt *Options) func(*utils.Entry, *utils.ValuePtr) error { 183 | return func(e *utils.Entry, _ *utils.ValuePtr) error { // Function for replaying. 184 | if ts := utils.ParseTs(e.Key); ts > m.maxVersion { 185 | m.maxVersion = ts 186 | } 187 | m.sl.Add(e) 188 | return nil 189 | } 190 | } 191 | -------------------------------------------------------------------------------- /file/sstable_linux.go: -------------------------------------------------------------------------------- 1 | // +build linux 2 | 3 | // Copyright 2021 hardcore-os Project Authors 4 | // 5 | // Licensed under the Apache License, Version 2.0 (the "License") 6 | // you may not use this file except in compliance with the License. 7 | // You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, software 12 | // distributed under the License is distributed on an "AS IS" BASIS, 13 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | // See the License for the specific language governing permissions and 15 | // limitations under the License. 
16 | 17 | package file 18 | 19 | import ( 20 | "io" 21 | "os" 22 | "sync" 23 | "syscall" 24 | "time" 25 | 26 | "github.com/golang/protobuf/proto" 27 | "github.com/hardcore-os/corekv/pb" 28 | "github.com/hardcore-os/corekv/utils" 29 | "github.com/pkg/errors" 30 | ) 31 | 32 | // SSTable 文件的内存封装 33 | type SSTable struct { 34 | lock *sync.RWMutex 35 | f *MmapFile 36 | maxKey []byte 37 | minKey []byte 38 | idxTables *pb.TableIndex 39 | hasBloomFilter bool 40 | idxLen int 41 | idxStart int 42 | fid uint64 43 | createdAt time.Time 44 | } 45 | 46 | // OpenSStable 打开一个 sst文件 47 | func OpenSStable(opt *Options) *SSTable { 48 | omf, err := OpenMmapFile(opt.FileName, os.O_CREATE|os.O_RDWR, opt.MaxSz) 49 | utils.Err(err) 50 | return &SSTable{f: omf, fid: opt.FID, lock: &sync.RWMutex{}} 51 | } 52 | 53 | // Init 初始化 54 | func (ss *SSTable) Init() error { 55 | var ko *pb.BlockOffset 56 | var err error 57 | if ko, err = ss.initTable(); err != nil { 58 | return err 59 | } 60 | // 从文件中获取创建时间 61 | stat, _ := ss.f.Fd.Stat() 62 | statType := stat.Sys().(*syscall.Stat_t) 63 | ss.createdAt = time.Unix(statType.Ctim.Sec, statType.Ctim.Nsec) 64 | // init min key 65 | keyBytes := ko.GetKey() 66 | minKey := make([]byte, len(keyBytes)) 67 | copy(minKey, keyBytes) 68 | ss.minKey = minKey 69 | ss.maxKey = minKey 70 | return nil 71 | } 72 | 73 | // SetMaxKey max 需要使用table的迭代器,来获取最后一个block的最后一个key 74 | func (ss *SSTable) SetMaxKey(maxKey []byte) { 75 | ss.maxKey = maxKey 76 | } 77 | func (ss *SSTable) initTable() (bo *pb.BlockOffset, err error) { 78 | readPos := len(ss.f.Data) 79 | 80 | // Read checksum len from the last 4 bytes. 81 | readPos -= 4 82 | buf := ss.readCheckError(readPos, 4) 83 | checksumLen := int(utils.BytesToU32(buf)) 84 | if checksumLen < 0 { 85 | return nil, errors.New("checksum length less than zero. Data corrupted") 86 | } 87 | 88 | // Read checksum. 
89 | readPos -= checksumLen 90 | expectedChk := ss.readCheckError(readPos, checksumLen) 91 | 92 | // Read index size from the footer. 93 | readPos -= 4 94 | buf = ss.readCheckError(readPos, 4) 95 | ss.idxLen = int(utils.BytesToU32(buf)) 96 | 97 | // Read index. 98 | readPos -= ss.idxLen 99 | ss.idxStart = readPos 100 | data := ss.readCheckError(readPos, ss.idxLen) 101 | if err := utils.VerifyChecksum(data, expectedChk); err != nil { 102 | return nil, errors.Wrapf(err, "failed to verify checksum for table: %s", ss.f.Fd.Name()) 103 | } 104 | indexTable := &pb.TableIndex{} 105 | if err := proto.Unmarshal(data, indexTable); err != nil { 106 | return nil, err 107 | } 108 | ss.idxTables = indexTable 109 | 110 | ss.hasBloomFilter = len(indexTable.BloomFilter) > 0 111 | if len(indexTable.GetOffsets()) > 0 { 112 | return indexTable.GetOffsets()[0], nil 113 | } 114 | return nil, errors.New("read index fail, offset is nil") 115 | } 116 | 117 | // Close 关闭 118 | func (ss *SSTable) Close() error { 119 | return ss.f.Close() 120 | } 121 | 122 | // Indexs _ 123 | func (ss *SSTable) Indexs() *pb.TableIndex { 124 | return ss.idxTables 125 | } 126 | 127 | // MaxKey 当前最大的key 128 | func (ss *SSTable) MaxKey() []byte { 129 | return ss.maxKey 130 | } 131 | 132 | // MinKey 当前最小的key 133 | func (ss *SSTable) MinKey() []byte { 134 | return ss.minKey 135 | } 136 | 137 | // FID 获取fid 138 | func (ss *SSTable) FID() uint64 { 139 | return ss.fid 140 | } 141 | 142 | // HasBloomFilter _ 143 | func (ss *SSTable) HasBloomFilter() bool { 144 | return ss.hasBloomFilter 145 | } 146 | 147 | func (ss *SSTable) read(off, sz int) ([]byte, error) { 148 | if len(ss.f.Data) > 0 { 149 | if len(ss.f.Data[off:]) < sz { 150 | return nil, io.EOF 151 | } 152 | return ss.f.Data[off : off+sz], nil 153 | } 154 | 155 | res := make([]byte, sz) 156 | _, err := ss.f.Fd.ReadAt(res, int64(off)) 157 | return res, err 158 | } 159 | func (ss *SSTable) readCheckError(off, sz int) []byte { 160 | buf, err := ss.read(off, sz) 161 | 
utils.Panic(err) 162 | return buf 163 | } 164 | 165 | // Bytes returns data starting from offset off of size sz. If there's not enough data, it would 166 | // return nil slice and io.EOF. 167 | func (ss *SSTable) Bytes(off, sz int) ([]byte, error) { 168 | return ss.f.Bytes(off, sz) 169 | } 170 | 171 | // Size 返回底层文件的尺寸 172 | func (ss *SSTable) Size() int64 { 173 | fileStats, err := ss.f.Fd.Stat() 174 | utils.Panic(err) 175 | return fileStats.Size() 176 | } 177 | 178 | // GetCreatedAt _ 179 | func (ss *SSTable) GetCreatedAt() *time.Time { 180 | return &ss.createdAt 181 | } 182 | 183 | // SetCreatedAt _ 184 | func (ss *SSTable) SetCreatedAt(t *time.Time) { 185 | ss.createdAt = *t 186 | } 187 | 188 | // Detele _ 189 | func (ss *SSTable) Detele() error { 190 | return ss.f.Delete() 191 | } 192 | 193 | // Truncature _ 194 | func (ss *SSTable) Truncature(size int64) error { 195 | return ss.f.Truncature(size) 196 | } 197 | -------------------------------------------------------------------------------- /file/sstable_darwin.go: -------------------------------------------------------------------------------- 1 | // +build darwin 2 | 3 | // Copyright 2021 hardcore-os Project Authors 4 | // 5 | // Licensed under the Apache License, Version 2.0 (the "License") 6 | // you may not use this file except in compliance with the License. 7 | // You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, software 12 | // distributed under the License is distributed on an "AS IS" BASIS, 13 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | // See the License for the specific language governing permissions and 15 | // limitations under the License. 
16 | 17 | package file 18 | 19 | import ( 20 | "io" 21 | "os" 22 | "sync" 23 | "syscall" 24 | "time" 25 | 26 | "github.com/golang/protobuf/proto" 27 | "github.com/hardcore-os/corekv/pb" 28 | "github.com/hardcore-os/corekv/utils" 29 | "github.com/pkg/errors" 30 | ) 31 | 32 | // SSTable 文件的内存封装 33 | type SSTable struct { 34 | lock *sync.RWMutex 35 | f *MmapFile 36 | maxKey []byte 37 | minKey []byte 38 | idxTables *pb.TableIndex 39 | hasBloomFilter bool 40 | idxLen int 41 | idxStart int 42 | fid uint64 43 | createdAt time.Time 44 | } 45 | 46 | // OpenSStable 打开一个 sst文件 47 | func OpenSStable(opt *Options) *SSTable { 48 | omf, err := OpenMmapFile(opt.FileName, os.O_CREATE|os.O_RDWR, opt.MaxSz) 49 | utils.Err(err) 50 | return &SSTable{f: omf, fid: opt.FID, lock: &sync.RWMutex{}} 51 | } 52 | 53 | // Init 初始化 54 | func (ss *SSTable) Init() error { 55 | var ko *pb.BlockOffset 56 | var err error 57 | if ko, err = ss.initTable(); err != nil { 58 | return err 59 | } 60 | // 从文件中获取创建时间 61 | stat, _ := ss.f.Fd.Stat() 62 | statType := stat.Sys().(*syscall.Stat_t) 63 | ss.createdAt = time.Unix(statType.Ctimespec.Sec, statType.Ctimespec.Nsec) // use ctime (Ctimespec) to match the linux build's Ctim; Atimespec is access time and changes on every read 64 | // init min key 65 | keyBytes := ko.GetKey() 66 | minKey := make([]byte, len(keyBytes)) 67 | copy(minKey, keyBytes) 68 | ss.minKey = minKey 69 | ss.maxKey = minKey 70 | return nil 71 | } 72 | 73 | // SetMaxKey max 需要使用table的迭代器,来获取最后一个block的最后一个key 74 | func (ss *SSTable) SetMaxKey(maxKey []byte) { 75 | ss.maxKey = maxKey 76 | } 77 | func (ss *SSTable) initTable() (bo *pb.BlockOffset, err error) { 78 | readPos := len(ss.f.Data) 79 | 80 | // Read checksum len from the last 4 bytes. 81 | readPos -= 4 82 | buf := ss.readCheckError(readPos, 4) 83 | checksumLen := int(utils.BytesToU32(buf)) 84 | if checksumLen < 0 { 85 | return nil, errors.New("checksum length less than zero. Data corrupted") 86 | } 87 | 88 | // Read checksum. 
89 | readPos -= checksumLen 90 | expectedChk := ss.readCheckError(readPos, checksumLen) 91 | 92 | // Read index size from the footer. 93 | readPos -= 4 94 | buf = ss.readCheckError(readPos, 4) 95 | ss.idxLen = int(utils.BytesToU32(buf)) 96 | 97 | // Read index. 98 | readPos -= ss.idxLen 99 | ss.idxStart = readPos 100 | data := ss.readCheckError(readPos, ss.idxLen) 101 | if err := utils.VerifyChecksum(data, expectedChk); err != nil { 102 | return nil, errors.Wrapf(err, "failed to verify checksum for table: %s", ss.f.Fd.Name()) 103 | } 104 | indexTable := &pb.TableIndex{} 105 | if err := proto.Unmarshal(data, indexTable); err != nil { 106 | return nil, err 107 | } 108 | ss.idxTables = indexTable 109 | 110 | ss.hasBloomFilter = len(indexTable.BloomFilter) > 0 111 | if len(indexTable.GetOffsets()) > 0 { 112 | return indexTable.GetOffsets()[0], nil 113 | } 114 | return nil, errors.New("read index fail, offset is nil") 115 | } 116 | 117 | // Close 关闭 118 | func (ss *SSTable) Close() error { 119 | return ss.f.Close() 120 | } 121 | 122 | // Indexs _ 123 | func (ss *SSTable) Indexs() *pb.TableIndex { 124 | return ss.idxTables 125 | } 126 | 127 | // MaxKey 当前最大的key 128 | func (ss *SSTable) MaxKey() []byte { 129 | return ss.maxKey 130 | } 131 | 132 | // MinKey 当前最小的key 133 | func (ss *SSTable) MinKey() []byte { 134 | return ss.minKey 135 | } 136 | 137 | // FID 获取fid 138 | func (ss *SSTable) FID() uint64 { 139 | return ss.fid 140 | } 141 | 142 | // HasBloomFilter _ 143 | func (ss *SSTable) HasBloomFilter() bool { 144 | return ss.hasBloomFilter 145 | } 146 | 147 | func (ss *SSTable) read(off, sz int) ([]byte, error) { 148 | if len(ss.f.Data) > 0 { 149 | if len(ss.f.Data[off:]) < sz { 150 | return nil, io.EOF 151 | } 152 | return ss.f.Data[off : off+sz], nil 153 | } 154 | 155 | res := make([]byte, sz) 156 | _, err := ss.f.Fd.ReadAt(res, int64(off)) 157 | return res, err 158 | } 159 | func (ss *SSTable) readCheckError(off, sz int) []byte { 160 | buf, err := ss.read(off, sz) 161 | 
utils.Panic(err) 162 | return buf 163 | } 164 | 165 | // Bytes returns data starting from offset off of size sz. If there's not enough data, it would 166 | // return nil slice and io.EOF. 167 | func (ss *SSTable) Bytes(off, sz int) ([]byte, error) { 168 | return ss.f.Bytes(off, sz) 169 | } 170 | 171 | // Size 返回底层文件的尺寸 172 | func (ss *SSTable) Size() int64 { 173 | fileStats, err := ss.f.Fd.Stat() 174 | utils.Panic(err) 175 | return fileStats.Size() 176 | } 177 | 178 | // GetCreatedAt _ 179 | func (ss *SSTable) GetCreatedAt() *time.Time { 180 | return &ss.createdAt 181 | } 182 | 183 | // SetCreatedAt _ 184 | func (ss *SSTable) SetCreatedAt(t *time.Time) { 185 | ss.createdAt = *t 186 | } 187 | 188 | // Detele _ 189 | func (ss *SSTable) Detele() error { 190 | return ss.f.Delete() 191 | } 192 | 193 | // Truncature _ 194 | func (ss *SSTable) Truncature(size int64) error { 195 | return ss.f.Truncature(size) 196 | } 197 | -------------------------------------------------------------------------------- /utils/arena.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Dgraph Labs, Inc. and Contributors 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package utils 18 | 19 | import ( 20 | "log" 21 | "sync/atomic" 22 | "unsafe" 23 | 24 | "github.com/pkg/errors" 25 | ) 26 | 27 | const ( 28 | offsetSize = int(unsafe.Sizeof(uint32(0))) 29 | 30 | // Always align nodes on 64-bit boundaries, even on 32-bit architectures, 31 | // so that the node.value field is 64-bit aligned. This is necessary because 32 | // node.getValueOffset uses atomic.LoadUint64, which expects its input 33 | // pointer to be 64-bit aligned. 34 | nodeAlign = int(unsafe.Sizeof(uint64(0))) - 1 35 | 36 | MaxNodeSize = int(unsafe.Sizeof(node{})) 37 | ) 38 | 39 | // Arena should be lock-free. 40 | type Arena struct { 41 | n uint32 42 | shouldGrow bool 43 | buf []byte 44 | } 45 | 46 | // newArena returns a new arena. 47 | func newArena(n int64) *Arena { 48 | // Don't store data at position 0 in order to reserve offset=0 as a kind 49 | // of nil pointer. 50 | out := &Arena{ 51 | n: 1, 52 | buf: make([]byte, n), 53 | } 54 | return out 55 | } 56 | 57 | func (s *Arena) allocate(sz uint32) uint32 { 58 | offset := atomic.AddUint32(&s.n, sz) 59 | if !s.shouldGrow { 60 | AssertTrue(int(offset) <= len(s.buf)) 61 | return offset - sz 62 | } 63 | 64 | // We are keeping extra bytes in the end so that the checkptr doesn't fail. We apply some 65 | // intelligence to reduce the size of the node by only keeping towers upto valid height and not 66 | // maxHeight. This reduces the node's size, but checkptr doesn't know about its reduced size. 
67 | // checkptr tries to verify that the node of size MaxNodeSize resides on a single heap 68 | // allocation which causes this error: checkptr:converted pointer straddles multiple allocations 69 | if int(offset) > len(s.buf)-MaxNodeSize { 70 | growBy := uint32(len(s.buf)) 71 | if growBy > 1<<30 { 72 | growBy = 1 << 30 73 | } 74 | if growBy < sz { 75 | growBy = sz 76 | } 77 | newBuf := make([]byte, len(s.buf)+int(growBy)) 78 | AssertTrue(len(s.buf) == copy(newBuf, s.buf)) 79 | s.buf = newBuf 80 | // fmt.Print(len(s.buf), " ") 81 | } 82 | return offset - sz 83 | } 84 | 85 | func (s *Arena) size() int64 { 86 | return int64(atomic.LoadUint32(&s.n)) 87 | } 88 | 89 | // putNode allocates a node in the arena. The node is aligned on a pointer-sized 90 | // boundary. The arena offset of the node is returned. 91 | func (s *Arena) putNode(height int) uint32 { 92 | // Compute the amount of the tower that will never be used, since the height 93 | // is less than maxHeight. 94 | unusedSize := (maxHeight - height) * offsetSize 95 | 96 | // Pad the allocation with enough bytes to ensure pointer alignment. 97 | l := uint32(MaxNodeSize - unusedSize + nodeAlign) 98 | n := s.allocate(l) 99 | 100 | // Return the aligned offset. 101 | m := (n + uint32(nodeAlign)) & ^uint32(nodeAlign) 102 | return m 103 | } 104 | 105 | // Put will *copy* val into arena. To make better use of this, reuse your input 106 | // val buffer. Returns an offset into buf. User is responsible for remembering 107 | // size of val. We could also store this size inside arena but the encoding and 108 | // decoding will incur some overhead. 
109 | func (s *Arena) putVal(v ValueStruct) uint32 { 110 | l := uint32(v.EncodedSize()) 111 | offset := s.allocate(l) 112 | v.EncodeValue(s.buf[offset:]) 113 | return offset 114 | } 115 | 116 | func (s *Arena) putKey(key []byte) uint32 { 117 | keySz := uint32(len(key)) 118 | offset := s.allocate(keySz) 119 | buf := s.buf[offset : offset+keySz] 120 | AssertTrue(len(key) == copy(buf, key)) 121 | return offset 122 | } 123 | 124 | // getNode returns a pointer to the node located at offset. If the offset is 125 | // zero, then the nil node pointer is returned. 126 | func (s *Arena) getNode(offset uint32) *node { 127 | if offset == 0 { 128 | return nil 129 | } 130 | return (*node)(unsafe.Pointer(&s.buf[offset])) 131 | } 132 | 133 | // getKey returns byte slice at offset. 134 | func (s *Arena) getKey(offset uint32, size uint16) []byte { 135 | return s.buf[offset : offset+uint32(size)] 136 | } 137 | 138 | // getVal returns byte slice at offset. The given size should be just the value 139 | // size and should NOT include the meta bytes. 140 | func (s *Arena) getVal(offset uint32, size uint32) (ret ValueStruct) { 141 | ret.DecodeValue(s.buf[offset : offset+size]) 142 | return 143 | } 144 | 145 | // getNodeOffset returns the offset of node in the arena. If the node pointer is 146 | // nil, then the zero offset is returned. 147 | func (s *Arena) getNodeOffset(nd *node) uint32 { 148 | if nd == nil { 149 | return 0 //返回空指针 150 | } 151 | //implement me here!!! 152 | //获取某个节点,在 arena 当中的偏移量 153 | //unsafe.Pointer等价于void*,uintptr可以专门把void*的对于地址转化为数值型变量 154 | return uint32(uintptr(unsafe.Pointer(nd)) - uintptr(unsafe.Pointer(&s.buf[0]))) 155 | } 156 | 157 | // AssertTrue asserts that b is true. Otherwise, it would log fatal. 
158 | func AssertTrue(b bool) { 159 | if !b { 160 | log.Fatalf("%+v", errors.Errorf("Assert failed")) 161 | } 162 | } 163 | -------------------------------------------------------------------------------- /utils/cache/cache.go: -------------------------------------------------------------------------------- 1 | package cache 2 | 3 | import ( 4 | "container/list" 5 | xxhash "github.com/cespare/xxhash/v2" 6 | "sync" 7 | "unsafe" 8 | ) 9 | 10 | type Cache struct { 11 | m sync.RWMutex 12 | lru *windowLRU 13 | slru *segmentedLRU 14 | door *BloomFilter 15 | c *cmSketch 16 | t int32 17 | threshold int32 18 | data map[uint64]*list.Element 19 | } 20 | 21 | type Options struct { 22 | lruPct uint8 23 | } 24 | 25 | // NewCache size 指的是要缓存的数据个数 26 | func NewCache(size int) *Cache { 27 | //定义 window 部分缓存所占百分比,这里定义为1% 28 | const lruPct = 1 29 | //计算出来 widow 部分的容量 30 | lruSz := (lruPct * size) / 100 31 | 32 | if lruSz < 1 { 33 | lruSz = 1 34 | } 35 | 36 | // 计算 LFU 部分的缓存容量 37 | slruSz := int(float64(size) * ((100 - lruPct) / 100.0)) 38 | 39 | if slruSz < 1 { 40 | slruSz = 1 41 | } 42 | 43 | //LFU 分为两部分,stageOne 部分占比20% 44 | slruO := int(0.2 * float64(slruSz)) 45 | 46 | if slruO < 1 { 47 | slruO = 1 48 | } 49 | 50 | data := make(map[uint64]*list.Element, size) 51 | 52 | return &Cache{ 53 | lru: newWindowLRU(lruSz, data), 54 | slru: newSLRU(data, slruO, slruSz-slruO), 55 | door: newFilter(size, 0.01), //布隆过滤器设置误差率为0.01 56 | c: newCmSketch(int64(size)), 57 | data: data, //共用同一个 map 存储数据 58 | } 59 | 60 | } 61 | 62 | func (c *Cache) Set(key interface{}, value interface{}) bool { 63 | c.m.Lock() 64 | defer c.m.Unlock() 65 | return c.set(key, value) 66 | } 67 | 68 | func (c *Cache) set(key, value interface{}) bool { 69 | // keyHash 用来快速定位,conflice 用来判断冲突 70 | keyHash, conflictHash := c.keyToHash(key) 71 | 72 | // 刚放进去的缓存都先放到 window lru 中,所以 stage = 0 73 | i := storeItem{ 74 | stage: 0, 75 | key: keyHash, 76 | conflict: conflictHash, 77 | value: value, 78 | } 79 | 80 | // 如果 window 
已满,要返回被淘汰的数据
	eitem, evicted := c.lru.add(i)

	// No eviction from the window: done.
	if !evicted {
		return true
	}

	// The window evicted an entry; pick a victim from the LFU's stageOne
	// segment and let the two entries compete for the slot.
	victim := c.slru.victim()

	// LFU is not full yet, so the window's evictee moves straight into stageOne.
	if victim == nil {
		c.slru.add(eitem)
		return true
	}

	// Only entries already seen by the bloom filter (i.e. accessed at least
	// twice) are allowed to compete for the slot.
	if !c.door.Allow(uint32(eitem.key)) {
		return true
	}

	// Estimate historical access frequency of both candidates; the more
	// frequently accessed entry is considered more worth keeping.
	vcount := c.c.Estimate(victim.key)
	ocount := c.c.Estimate(eitem.key)

	if ocount < vcount {
		return true
	}

	// The window's evictee won the comparison and enters stageOne.
	c.slru.add(eitem)
	return true
}

// Get looks up key, refreshing its recency/frequency bookkeeping.
//
// BUG FIX: this previously took only c.m.RLock(), but get mutates shared
// state (c.t, the count-min sketch, the bloom filter, and the LRU list
// positions via lru.get/slru.get), so concurrent Gets raced. A write lock
// is required.
func (c *Cache) Get(key interface{}) (interface{}, bool) {
	c.m.Lock()
	defer c.m.Unlock()
	return c.get(key)
}

// get is the inner lookup; callers must hold c.m.
func (c *Cache) get(key interface{}) (interface{}, bool) {
	// Periodically halve the frequency sketch and clear the bloom filter so
	// that stale access history decays.
	c.t++
	if c.t == c.threshold {
		c.c.Reset()
		c.door.reset()
		c.t = 0
	}

	keyHash, conflictHash := c.keyToHash(key)

	val, ok := c.data[keyHash]
	if !ok {
		// Record the miss so a future Set can win its admission contest.
		c.door.Allow(uint32(keyHash))
		c.c.Increment(keyHash)
		return nil, false
	}

	item := val.Value.(*storeItem)

	// Same keyHash but a different conflict hash: treat as a miss.
	if item.conflict != conflictHash {
		c.door.Allow(uint32(keyHash))
		c.c.Increment(keyHash)
		return nil, false
	}
	c.door.Allow(uint32(keyHash))
	c.c.Increment(item.key)

	v := item.value

	// Refresh recency in whichever segment currently holds the entry.
	if item.stage == 0 {
		c.lru.get(val)
	} else {
		c.slru.get(val)
	}

	return v, true
}

// Del removes key from the cache, returning its conflict hash on success.
func (c *Cache) Del(key interface{}) (interface{}, bool) {
	c.m.Lock()
	defer c.m.Unlock()
	return c.del(key)
}

// del is the inner delete; callers must hold c.m.
func (c *Cache) del(key interface{}) (interface{}, bool) {
	keyHash, conflictHash := c.keyToHash(key)

	val, ok := c.data[keyHash]
	if !ok {
		return 0, false
	}

	item := val.Value.(*storeItem)

	// A non-zero conflict hash that disagrees means this is a different key
	// that merely collided on keyHash.
	if conflictHash != 0 && (conflictHash != item.conflict) {
		return 0, false
	}

	delete(c.data, keyHash)
	return item.conflict, true
}

// keyToHash maps an arbitrary key to a (keyHash, conflictHash) pair.
// Integer-like keys hash to themselves with no conflict hash; strings and
// byte slices get a fast runtime memhash plus an xxhash for conflict
// detection. Unsupported key types panic.
func (c *Cache) keyToHash(key interface{}) (uint64, uint64) {
	if key == nil {
		return 0, 0
	}
	switch k := key.(type) {
	case uint64:
		return k, 0
	case string:
		return MemHashString(k), xxhash.Sum64String(k)
	case []byte:
		return MemHash(k), xxhash.Sum64(k)
	case byte:
		return uint64(k), 0
	case int:
		return uint64(k), 0
	case int32:
		return uint64(k), 0
	case uint32:
		return uint64(k), 0
	case int64:
		return uint64(k), 0
	default:
		panic("Key type not supported")
	}
}

// stringStruct mirrors the runtime's string header so memhash can be fed the
// data pointer and length directly.
type stringStruct struct {
	str unsafe.Pointer
	len int
}

//go:noescape
//go:linkname memhash runtime.memhash
func memhash(p unsafe.Pointer, h, s uintptr) uintptr

// MemHashString is the hash function used by go map, it utilizes available hardware instructions
// (behaves as aeshash if aes instruction is available).
// NOTE: The hash seed changes for every process. So, this cannot be used as a persistent hash.
225 | func MemHashString(str string) uint64 { 226 | ss := (*stringStruct)(unsafe.Pointer(&str)) 227 | return uint64(memhash(ss.str, 0, uintptr(ss.len))) 228 | } 229 | 230 | func MemHash(data []byte) uint64 { 231 | ss := (*stringStruct)(unsafe.Pointer(&data)) 232 | return uint64(memhash(ss.str, 0, uintptr(ss.len))) 233 | } 234 | 235 | func (c *Cache) String() string { 236 | var s string 237 | s += c.lru.String() + " | " + c.slru.String() 238 | return s 239 | } 240 | -------------------------------------------------------------------------------- /file/vlog.go: -------------------------------------------------------------------------------- 1 | package file 2 | 3 | import ( 4 | "bytes" 5 | "encoding/binary" 6 | "fmt" 7 | "hash/crc32" 8 | "io" 9 | "math" 10 | "os" 11 | "sync" 12 | "sync/atomic" 13 | 14 | "github.com/hardcore-os/corekv/utils" 15 | "github.com/pkg/errors" 16 | ) 17 | 18 | type LogFile struct { 19 | Lock sync.RWMutex 20 | FID uint32 21 | size uint32 22 | f *MmapFile 23 | } 24 | 25 | func (lf *LogFile) Open(opt *Options) error { 26 | var err error 27 | lf.FID = uint32(opt.FID) 28 | lf.Lock = sync.RWMutex{} 29 | lf.f, err = OpenMmapFile(opt.FileName, os.O_CREATE|os.O_RDWR, opt.MaxSz) 30 | utils.Panic2(nil, err) 31 | fi, err := lf.f.Fd.Stat() 32 | if err != nil { 33 | return utils.WarpErr("Unable to run file.Stat", err) 34 | } 35 | // 获取文件尺寸 36 | sz := fi.Size() 37 | utils.CondPanic(sz > math.MaxUint32, fmt.Errorf("file size: %d greater than %d", 38 | uint32(sz), uint32(math.MaxUint32))) 39 | lf.size = uint32(sz) 40 | // TODO 是否要在这里弄一个header放一些元数据呢? 41 | return nil 42 | } 43 | 44 | // Acquire lock on mmap/file if you are calling this 45 | func (lf *LogFile) Read(p *utils.ValuePtr) (buf []byte, err error) { 46 | offset := p.Offset 47 | // Do not convert size to uint32, because the lf.fmap can be of size 48 | // 4GB, which overflows the uint32 during conversion to make the size 0, 49 | // causing the read to fail with ErrEOF. See issue #585. 
50 | size := int64(len(lf.f.Data)) 51 | valsz := p.Len 52 | lfsz := atomic.LoadUint32(&lf.size) 53 | if int64(offset) >= size || int64(offset+valsz) > size || 54 | // Ensure that the read is within the file's actual size. It might be possible that 55 | // the offset+valsz length is beyond the file's actual size. This could happen when 56 | // dropAll and iterations are running simultaneously. 57 | int64(offset+valsz) > int64(lfsz) { 58 | err = io.EOF 59 | } else { 60 | buf, err = lf.f.Bytes(int(offset), int(valsz)) 61 | } 62 | return buf, err 63 | } 64 | 65 | func (lf *LogFile) DoneWriting(offset uint32) error { 66 | // Sync before acquiring lock. (We call this from write() and thus know we have shared access 67 | // to the fd.) 68 | if err := lf.f.Sync(); err != nil { 69 | return errors.Wrapf(err, "Unable to sync value log: %q", lf.FileName()) 70 | } 71 | 72 | // 写嘛 总是要锁一下的 73 | lf.Lock.Lock() 74 | defer lf.Lock.Unlock() 75 | 76 | // TODO: Confirm if we need to run a file sync after truncation. 77 | // Truncation must run after unmapping, otherwise Windows would crap itself. 78 | if err := lf.f.Truncature(int64(offset)); err != nil { 79 | return errors.Wrapf(err, "Unable to truncate file: %q", lf.FileName()) 80 | } 81 | 82 | // Reinitialize the log file. This will mmap the entire file. 83 | if err := lf.Init(); err != nil { 84 | return errors.Wrapf(err, "failed to initialize file %s", lf.FileName()) 85 | } 86 | 87 | // Previously we used to close the file after it was written and reopen it in read-only mode. 88 | // We no longer open files in read-only mode. We keep all vlog files open in read-write mode. 
89 | return nil 90 | } 91 | func (lf *LogFile) Write(offset uint32, buf []byte) (err error) { 92 | return lf.f.AppendBuffer(offset, buf) 93 | } 94 | func (lf *LogFile) Truncate(offset int64) error { 95 | return lf.f.Truncature(offset) 96 | } 97 | func (lf *LogFile) Close() error { 98 | return lf.f.Close() 99 | } 100 | 101 | func (lf *LogFile) Size() int64 { 102 | return int64(atomic.LoadUint32(&lf.size)) 103 | } 104 | func (lf *LogFile) AddSize(offset uint32) { 105 | atomic.StoreUint32(&lf.size, offset) 106 | } 107 | 108 | // 完成log文件的初始化 109 | func (lf *LogFile) Bootstrap() error { 110 | // TODO 是否需要初始化一些内容给vlog文件? 111 | return nil 112 | } 113 | 114 | func (lf *LogFile) Init() error { 115 | fstat, err := lf.f.Fd.Stat() 116 | if err != nil { 117 | return errors.Wrapf(err, "Unable to check stat for %q", lf.FileName()) 118 | } 119 | sz := fstat.Size() 120 | if sz == 0 { 121 | // File is empty. We don't need to mmap it. Return. 122 | return nil 123 | } 124 | utils.CondPanic(sz > math.MaxUint32, fmt.Errorf("[LogFile.Init] sz > math.MaxUint32")) 125 | lf.size = uint32(sz) 126 | return nil 127 | } 128 | func (lf *LogFile) FileName() string { 129 | return lf.f.Fd.Name() 130 | } 131 | 132 | func (lf *LogFile) Seek(offset int64, whence int) (ret int64, err error) { 133 | return lf.f.Fd.Seek(offset, whence) 134 | } 135 | 136 | func (lf *LogFile) FD() *os.File { 137 | return lf.f.Fd 138 | } 139 | 140 | // You must hold lf.lock to sync() 141 | func (lf *LogFile) Sync() error { 142 | return lf.f.Sync() 143 | } 144 | 145 | // encodeEntry will encode entry to the buf 146 | // layout of entry 147 | // +--------+-----+-------+-------+ 148 | // | header | key | value | crc32 | 149 | // +--------+-----+-------+-------+ 150 | func (lf *LogFile) EncodeEntry(e *utils.Entry, buf *bytes.Buffer, offset uint32) (int, error) { 151 | h := utils.Header{ 152 | KLen: uint32(len(e.Key)), 153 | VLen: uint32(len(e.Value)), 154 | ExpiresAt: e.ExpiresAt, 155 | Meta: e.Meta, 156 | } 157 | 158 | hash := 
crc32.New(utils.CastagnoliCrcTable) 159 | writer := io.MultiWriter(buf, hash) 160 | 161 | // encode header. 162 | var headerEnc [utils.MaxHeaderSize]byte 163 | sz := h.Encode(headerEnc[:]) 164 | utils.Panic2(writer.Write(headerEnc[:sz])) 165 | // Encryption is disabled so writing directly to the buffer. 166 | utils.Panic2(writer.Write(e.Key)) 167 | utils.Panic2(writer.Write(e.Value)) 168 | // write crc32 hash. 169 | var crcBuf [crc32.Size]byte 170 | binary.BigEndian.PutUint32(crcBuf[:], hash.Sum32()) 171 | utils.Panic2(buf.Write(crcBuf[:])) 172 | // return encoded length. 173 | return len(headerEnc[:sz]) + len(e.Key) + len(e.Value) + len(crcBuf), nil 174 | } 175 | func (lf *LogFile) DecodeEntry(buf []byte, offset uint32) (*utils.Entry, error) { 176 | var h utils.Header 177 | hlen := h.Decode(buf) 178 | kv := buf[hlen:] 179 | e := &utils.Entry{ 180 | Meta: h.Meta, 181 | ExpiresAt: h.ExpiresAt, 182 | Offset: offset, 183 | Key: kv[:h.KLen], 184 | Value: kv[h.KLen : h.KLen+h.VLen], 185 | } 186 | return e, nil 187 | } 188 | -------------------------------------------------------------------------------- /file/mmap_darwin.go: -------------------------------------------------------------------------------- 1 | //go:build darwin 2 | // +build darwin 3 | 4 | // Copyright 2021 hardcore-os Project Authors 5 | // 6 | // Licensed under the Apache License, Version 2.0 (the "License") 7 | // you may not use this file except in compliance with the License. 8 | // You may obtain a copy of the License at 9 | // 10 | // http://www.apache.org/licenses/LICENSE-2.0 11 | // 12 | // Unless required by applicable law or agreed to in writing, software 13 | // distributed under the License is distributed on an "AS IS" BASIS, 14 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | // See the License for the specific language governing permissions and 16 | // limitations under the License. 
17 | 18 | package file 19 | 20 | import ( 21 | "encoding/binary" 22 | "fmt" 23 | "io" 24 | "os" 25 | "path/filepath" 26 | 27 | "github.com/hardcore-os/corekv/utils/mmap" 28 | "github.com/pkg/errors" 29 | ) 30 | 31 | // MmapFile represents an mmapd file and includes both the buffer to the data and the file descriptor. 32 | type MmapFile struct { 33 | Data []byte 34 | Fd *os.File 35 | } 36 | 37 | // OpenMmapFileUsing os 38 | func OpenMmapFileUsing(fd *os.File, sz int, writable bool) (*MmapFile, error) { 39 | filename := fd.Name() 40 | fi, err := fd.Stat() 41 | if err != nil { 42 | return nil, errors.Wrapf(err, "cannot stat file: %s", filename) 43 | } 44 | 45 | var rerr error 46 | fileSize := fi.Size() 47 | if sz > 0 && fileSize == 0 { 48 | // If file is empty, truncate it to sz. 49 | if err := fd.Truncate(int64(sz)); err != nil { 50 | return nil, errors.Wrapf(err, "error while truncation") 51 | } 52 | fileSize = int64(sz) 53 | } 54 | 55 | // fmt.Printf("Mmaping file: %s with writable: %v filesize: %d\n", fd.Name(), writable, fileSize) 56 | buf, err := mmap.Mmap(fd, writable, fileSize) // Mmap up to file size. 57 | if err != nil { 58 | return nil, errors.Wrapf(err, "while mmapping %s with size: %d", fd.Name(), fileSize) 59 | } 60 | 61 | if fileSize == 0 { 62 | dir, _ := filepath.Split(filename) 63 | go SyncDir(dir) 64 | } 65 | return &MmapFile{ 66 | Data: buf, 67 | Fd: fd, 68 | }, rerr 69 | } 70 | 71 | // OpenMmapFile opens an existing file or creates a new file. If the file is 72 | // created, it would truncate the file to maxSz. In both cases, it would mmap 73 | // the file to maxSz and returned it. In case the file is created, z.NewFile is 74 | // returned. 
75 | func OpenMmapFile(filename string, flag int, maxSz int) (*MmapFile, error) { 76 | // fmt.Printf("opening file %s with flag: %v\n", filename, flag) 77 | fd, err := os.OpenFile(filename, flag, 0666) 78 | if err != nil { 79 | return nil, errors.Wrapf(err, "unable to open: %s", filename) 80 | } 81 | writable := true 82 | if flag == os.O_RDONLY { 83 | writable = false 84 | } 85 | return OpenMmapFileUsing(fd, maxSz, writable) 86 | } 87 | 88 | type mmapReader struct { 89 | Data []byte 90 | offset int 91 | } 92 | 93 | func (mr *mmapReader) Read(buf []byte) (int, error) { 94 | if mr.offset > len(mr.Data) { 95 | return 0, io.EOF 96 | } 97 | n := copy(buf, mr.Data[mr.offset:]) 98 | mr.offset += n 99 | if n < len(buf) { 100 | return n, io.EOF 101 | } 102 | return n, nil 103 | } 104 | 105 | func (m *MmapFile) NewReader(offset int) io.Reader { 106 | return &mmapReader{ 107 | Data: m.Data, 108 | offset: offset, 109 | } 110 | } 111 | 112 | // Bytes returns data starting from offset off of size sz. If there's not enough data, it would 113 | // return nil slice and io.EOF. 114 | func (m *MmapFile) Bytes(off, sz int) ([]byte, error) { 115 | if len(m.Data[off:]) < sz { 116 | return nil, io.EOF 117 | } 118 | return m.Data[off : off+sz], nil 119 | } 120 | 121 | // Slice returns the slice at the given offset. 122 | func (m *MmapFile) Slice(offset int) []byte { 123 | sz := binary.BigEndian.Uint32(m.Data[offset:]) 124 | start := offset + 4 125 | next := start + int(sz) 126 | if next > len(m.Data) { 127 | return []byte{} 128 | } 129 | res := m.Data[start:next] 130 | return res 131 | } 132 | 133 | // AllocateSlice allocates a slice of the given size at the given offset. 134 | func (m *MmapFile) AllocateSlice(sz, offset int) ([]byte, int, error) { 135 | start := offset + 4 136 | 137 | // If the file is too small, double its size or increase it by 1GB, whichever is smaller. 
138 | if start+sz > len(m.Data) { 139 | const oneGB = 1 << 30 140 | growBy := len(m.Data) 141 | if growBy > oneGB { 142 | growBy = oneGB 143 | } 144 | if growBy < sz+4 { 145 | growBy = sz + 4 146 | } 147 | if err := m.Truncature(int64(len(m.Data) + growBy)); err != nil { 148 | return nil, 0, err 149 | } 150 | } 151 | 152 | binary.BigEndian.PutUint32(m.Data[offset:], uint32(sz)) 153 | return m.Data[start : start+sz], start + sz, nil 154 | } 155 | 156 | const oneGB = 1 << 30 157 | 158 | // AppendBuffer 向内存中追加一个buffer,如果空间不足则重新映射,扩大空间 159 | func (m *MmapFile) AppendBuffer(offset uint32, buf []byte) error { 160 | size := len(m.Data) 161 | needSize := len(buf) 162 | end := int(offset) + needSize 163 | if end > size { 164 | growBy := size 165 | if growBy > oneGB { 166 | growBy = oneGB 167 | } 168 | if growBy < needSize { 169 | growBy = needSize 170 | } 171 | if err := m.Truncature(int64(end)); err != nil { 172 | return err 173 | } 174 | } 175 | dLen := copy(m.Data[offset:end], buf) 176 | if dLen != needSize { 177 | return errors.Errorf("dLen != needSize AppendBuffer failed") 178 | } 179 | return nil 180 | } 181 | 182 | func (m *MmapFile) Sync() error { 183 | if m == nil { 184 | return nil 185 | } 186 | return mmap.Msync(m.Data) 187 | } 188 | 189 | func (m *MmapFile) Delete() error { 190 | if m.Fd == nil { 191 | return nil 192 | } 193 | 194 | if err := mmap.Munmap(m.Data); err != nil { 195 | return fmt.Errorf("while munmap file: %s, error: %v\n", m.Fd.Name(), err) 196 | } 197 | m.Data = nil 198 | if err := m.Fd.Truncate(0); err != nil { 199 | return fmt.Errorf("while truncate file: %s, error: %v\n", m.Fd.Name(), err) 200 | } 201 | if err := m.Fd.Close(); err != nil { 202 | return fmt.Errorf("while close file: %s, error: %v\n", m.Fd.Name(), err) 203 | } 204 | return os.Remove(m.Fd.Name()) 205 | } 206 | 207 | // Close would close the file. It would also truncate the file if maxSz >= 0. 
208 | func (m *MmapFile) Close() error { 209 | if m.Fd == nil { 210 | return nil 211 | } 212 | if err := m.Sync(); err != nil { 213 | return fmt.Errorf("while sync file: %s, error: %v\n", m.Fd.Name(), err) 214 | } 215 | if err := mmap.Munmap(m.Data); err != nil { 216 | return fmt.Errorf("while munmap file: %s, error: %v\n", m.Fd.Name(), err) 217 | } 218 | return m.Fd.Close() 219 | } 220 | 221 | func SyncDir(dir string) error { 222 | df, err := os.Open(dir) 223 | if err != nil { 224 | return errors.Wrapf(err, "while opening %s", dir) 225 | } 226 | if err := df.Sync(); err != nil { 227 | return errors.Wrapf(err, "while syncing %s", dir) 228 | } 229 | if err := df.Close(); err != nil { 230 | return errors.Wrapf(err, "while closing %s", dir) 231 | } 232 | return nil 233 | } 234 | 235 | // Truncature 兼容接口 236 | func (m *MmapFile) Truncature(maxSz int64) error { 237 | if err := m.Sync(); err != nil { 238 | return fmt.Errorf("while sync file: %s, error: %v\n", m.Fd.Name(), err) 239 | } 240 | if err := mmap.Munmap(m.Data); err != nil { 241 | return fmt.Errorf("while munmap file: %s, error: %v\n", m.Fd.Name(), err) 242 | } 243 | if err := m.Fd.Truncate(maxSz); err != nil { 244 | return fmt.Errorf("while truncate file: %s, error: %v\n", m.Fd.Name(), err) 245 | } 246 | var err error 247 | m.Data, err = mmap.Mmap(m.Fd, true, maxSz) // Mmap up to max size. 248 | return err 249 | } 250 | 251 | // ReName 兼容接口 252 | func (m *MmapFile) ReName(name string) error { 253 | return nil 254 | } 255 | -------------------------------------------------------------------------------- /file/mmap_linux.go: -------------------------------------------------------------------------------- 1 | // +build linux 2 | 3 | // Copyright 2021 hardcore-os Project Authors 4 | // 5 | // Licensed under the Apache License, Version 2.0 (the "License") 6 | // you may not use this file except in compliance with the License. 
7 | // You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, software 12 | // distributed under the License is distributed on an "AS IS" BASIS, 13 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | // See the License for the specific language governing permissions and 15 | // limitations under the License. 16 | 17 | package file 18 | 19 | import ( 20 | "encoding/binary" 21 | "fmt" 22 | "io" 23 | "os" 24 | "path/filepath" 25 | 26 | "github.com/hardcore-os/corekv/utils/mmap" 27 | "github.com/pkg/errors" 28 | ) 29 | 30 | // MmapFile represents an mmapd file and includes both the buffer to the data and the file descriptor. 31 | type MmapFile struct { 32 | Data []byte 33 | Fd *os.File 34 | } 35 | 36 | // OpenMmapFileUsing os 37 | func OpenMmapFileUsing(fd *os.File, sz int, writable bool) (*MmapFile, error) { 38 | filename := fd.Name() 39 | fi, err := fd.Stat() 40 | if err != nil { 41 | return nil, errors.Wrapf(err, "cannot stat file: %s", filename) 42 | } 43 | 44 | var rerr error 45 | fileSize := fi.Size() 46 | if sz > 0 && fileSize == 0 { 47 | // If file is empty, truncate it to sz. 48 | if err := fd.Truncate(int64(sz)); err != nil { 49 | return nil, errors.Wrapf(err, "error while truncation") 50 | } 51 | fileSize = int64(sz) 52 | } 53 | 54 | // fmt.Printf("Mmaping file: %s with writable: %v filesize: %d\n", fd.Name(), writable, fileSize) 55 | buf, err := mmap.Mmap(fd, writable, fileSize) // Mmap up to file size. 56 | if err != nil { 57 | return nil, errors.Wrapf(err, "while mmapping %s with size: %d", fd.Name(), fileSize) 58 | } 59 | 60 | if fileSize == 0 { 61 | dir, _ := filepath.Split(filename) 62 | go SyncDir(dir) 63 | } 64 | return &MmapFile{ 65 | Data: buf, 66 | Fd: fd, 67 | }, rerr 68 | } 69 | 70 | // OpenMmapFile opens an existing file or creates a new file. 
If the file is 71 | // created, it would truncate the file to maxSz. In both cases, it would mmap 72 | // the file to maxSz and returned it. In case the file is created, z.NewFile is 73 | // returned. 74 | func OpenMmapFile(filename string, flag int, maxSz int) (*MmapFile, error) { 75 | // fmt.Printf("opening file %s with flag: %v\n", filename, flag) 76 | fd, err := os.OpenFile(filename, flag, 0666) 77 | if err != nil { 78 | return nil, errors.Wrapf(err, "unable to open: %s", filename) 79 | } 80 | writable := true 81 | if flag == os.O_RDONLY { 82 | writable = false 83 | } 84 | // 如果 sst文件层被打开过,则使用其文件原来的大小 85 | if fileInfo, err := fd.Stat(); err == nil && fileInfo != nil && fileInfo.Size() > 0 { 86 | maxSz = int(fileInfo.Size()) 87 | } 88 | return OpenMmapFileUsing(fd, maxSz, writable) 89 | } 90 | 91 | type mmapReader struct { 92 | Data []byte 93 | offset int 94 | } 95 | 96 | func (mr *mmapReader) Read(buf []byte) (int, error) { 97 | if mr.offset > len(mr.Data) { 98 | return 0, io.EOF 99 | } 100 | n := copy(buf, mr.Data[mr.offset:]) 101 | mr.offset += n 102 | if n < len(buf) { 103 | return n, io.EOF 104 | } 105 | return n, nil 106 | } 107 | 108 | func (m *MmapFile) NewReader(offset int) io.Reader { 109 | return &mmapReader{ 110 | Data: m.Data, 111 | offset: offset, 112 | } 113 | } 114 | 115 | // Bytes returns data starting from offset off of size sz. If there's not enough data, it would 116 | // return nil slice and io.EOF. 117 | func (m *MmapFile) Bytes(off, sz int) ([]byte, error) { 118 | if len(m.Data[off:]) < sz { 119 | return nil, io.EOF 120 | } 121 | return m.Data[off : off+sz], nil 122 | } 123 | 124 | // Slice returns the slice at the given offset. 
125 | func (m *MmapFile) Slice(offset int) []byte { 126 | sz := binary.BigEndian.Uint32(m.Data[offset:]) 127 | start := offset + 4 128 | next := start + int(sz) 129 | if next > len(m.Data) { 130 | return []byte{} 131 | } 132 | res := m.Data[start:next] 133 | return res 134 | } 135 | 136 | // AllocateSlice allocates a slice of the given size at the given offset. 137 | func (m *MmapFile) AllocateSlice(sz, offset int) ([]byte, int, error) { 138 | start := offset + 4 139 | 140 | // If the file is too small, double its size or increase it by 1GB, whichever is smaller. 141 | if start+sz > len(m.Data) { 142 | const oneGB = 1 << 30 143 | growBy := len(m.Data) 144 | if growBy > oneGB { 145 | growBy = oneGB 146 | } 147 | if growBy < sz+4 { 148 | growBy = sz + 4 149 | } 150 | if err := m.Truncature(int64(len(m.Data) + growBy)); err != nil { 151 | return nil, 0, err 152 | } 153 | } 154 | 155 | binary.BigEndian.PutUint32(m.Data[offset:], uint32(sz)) 156 | return m.Data[start : start+sz], start + sz, nil 157 | } 158 | 159 | const oneGB = 1 << 30 160 | 161 | // AppendBuffer 向内存中追加一个buffer,如果空间不足则重新映射,扩大空间 162 | func (m *MmapFile) AppendBuffer(offset uint32, buf []byte) error { 163 | size := len(m.Data) 164 | needSize := len(buf) 165 | end := int(offset) + needSize 166 | if end > size { 167 | growBy := size 168 | if growBy > oneGB { 169 | growBy = oneGB 170 | } 171 | if growBy < needSize { 172 | growBy = needSize 173 | } 174 | if err := m.Truncature(int64(end)); err != nil { 175 | return err 176 | } 177 | } 178 | dLen := copy(m.Data[offset:end], buf) 179 | if dLen != needSize { 180 | return errors.Errorf("dLen != needSize AppendBuffer failed") 181 | } 182 | return nil 183 | } 184 | 185 | func (m *MmapFile) Sync() error { 186 | if m == nil { 187 | return nil 188 | } 189 | return mmap.Msync(m.Data) 190 | } 191 | 192 | func (m *MmapFile) Delete() error { 193 | if m.Fd == nil { 194 | return nil 195 | } 196 | 197 | if err := mmap.Munmap(m.Data); err != nil { 198 | return 
fmt.Errorf("while munmap file: %s, error: %v\n", m.Fd.Name(), err) 199 | } 200 | m.Data = nil 201 | if err := m.Fd.Truncate(0); err != nil { 202 | return fmt.Errorf("while truncate file: %s, error: %v\n", m.Fd.Name(), err) 203 | } 204 | if err := m.Fd.Close(); err != nil { 205 | return fmt.Errorf("while close file: %s, error: %v\n", m.Fd.Name(), err) 206 | } 207 | return os.Remove(m.Fd.Name()) 208 | } 209 | 210 | // Close would close the file. It would also truncate the file if maxSz >= 0. 211 | func (m *MmapFile) Close() error { 212 | if m.Fd == nil { 213 | return nil 214 | } 215 | if err := m.Sync(); err != nil { 216 | return fmt.Errorf("while sync file: %s, error: %v\n", m.Fd.Name(), err) 217 | } 218 | if err := mmap.Munmap(m.Data); err != nil { 219 | return fmt.Errorf("while munmap file: %s, error: %v\n", m.Fd.Name(), err) 220 | } 221 | return m.Fd.Close() 222 | } 223 | 224 | func SyncDir(dir string) error { 225 | df, err := os.Open(dir) 226 | if err != nil { 227 | return errors.Wrapf(err, "while opening %s", dir) 228 | } 229 | if err := df.Sync(); err != nil { 230 | return errors.Wrapf(err, "while syncing %s", dir) 231 | } 232 | if err := df.Close(); err != nil { 233 | return errors.Wrapf(err, "while closing %s", dir) 234 | } 235 | return nil 236 | } 237 | 238 | // Truncature 兼容接口 239 | func (m *MmapFile) Truncature(maxSz int64) error { 240 | if err := m.Sync(); err != nil { 241 | return fmt.Errorf("while sync file: %s, error: %v\n", m.Fd.Name(), err) 242 | } 243 | if err := m.Fd.Truncate(maxSz); err != nil { 244 | return fmt.Errorf("while truncate file: %s, error: %v\n", m.Fd.Name(), err) 245 | } 246 | 247 | var err error 248 | m.Data, err = mmap.Mremap(m.Data, int(maxSz)) // Mmap up to max size. 
249 | return err 250 | } 251 | 252 | // ReName 兼容接口 253 | func (m *MmapFile) ReName(name string) error { 254 | return nil 255 | } 256 | -------------------------------------------------------------------------------- /lsm/lsm_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2021 hardcore-os Project Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License") 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package lsm 16 | 17 | import ( 18 | "bytes" 19 | "fmt" 20 | "os" 21 | "testing" 22 | "time" 23 | 24 | "github.com/hardcore-os/corekv/utils" 25 | ) 26 | 27 | var ( 28 | // 初始化opt 29 | opt = &Options{ 30 | WorkDir: "../work_test", 31 | SSTableMaxSz: 1024, 32 | MemTableSize: 1024, 33 | BlockSize: 1024, 34 | BloomFalsePositive: 0, 35 | BaseLevelSize: 10 << 20, 36 | LevelSizeMultiplier: 10, 37 | BaseTableSize: 2 << 20, 38 | TableSizeMultiplier: 2, 39 | NumLevelZeroTables: 15, 40 | MaxLevelNum: 7, 41 | NumCompactors: 3, 42 | } 43 | ) 44 | 45 | // TestBase 正确性测试 46 | func TestBase(t *testing.T) { 47 | clearDir() 48 | lsm := buildLSM() 49 | test := func() { 50 | // 基准测试 51 | baseTest(t, lsm, 128) 52 | } 53 | // 运行N次测试多个sst的影响 54 | runTest(1, test) 55 | } 56 | 57 | // TestClose 测试优雅关闭 58 | func TestClose(t *testing.T) { 59 | clearDir() 60 | lsm := buildLSM() 61 | lsm.StartCompacter() 62 | test := func() { 63 | baseTest(t, lsm, 128) 64 | utils.Err(lsm.Close()) 65 | // 重启后可正常工作才算成功 66 | lsm = buildLSM() 67 | 
baseTest(t, lsm, 128) 68 | } 69 | // 运行N次测试多个sst的影响 70 | runTest(1, test) 71 | } 72 | 73 | // 命中不同存储介质的逻辑分支测试 74 | func TestHitStorage(t *testing.T) { 75 | clearDir() 76 | lsm := buildLSM() 77 | e := utils.BuildEntry() 78 | lsm.Set(e) 79 | // 命中内存表 80 | hitMemtable := func() { 81 | v, err := lsm.memTable.Get(e.Key) 82 | utils.Err(err) 83 | utils.CondPanic(!bytes.Equal(v.Value, e.Value), fmt.Errorf("[hitMemtable] !equal(v.Value, e.Value)")) 84 | } 85 | // 命中L0层 86 | hitL0 := func() { 87 | // baseTest的测试就包含 在命中L0的sst查询 88 | baseTest(t, lsm, 128) 89 | } 90 | // 命中非L0层 91 | hitNotL0 := func() { 92 | // 通过压缩将compact生成非L0数据, 会命中l6层 93 | lsm.levels.runOnce(0) 94 | baseTest(t, lsm, 128) 95 | } 96 | // 命中bf 97 | hitBloom := func() { 98 | ee := utils.BuildEntry() 99 | // 查询不存在的key 如果命中则说明一定不存在 100 | v, err := lsm.levels.levels[0].tables[0].Serach(ee.Key, &ee.Version) 101 | utils.CondPanic(v != nil, fmt.Errorf("[hitBloom] v != nil")) 102 | utils.CondPanic(err != utils.ErrKeyNotFound, fmt.Errorf("[hitBloom] err != utils.ErrKeyNotFound")) 103 | } 104 | 105 | runTest(1, hitMemtable, hitL0, hitNotL0, hitBloom) 106 | } 107 | 108 | // Testparameter 测试异常参数 109 | func TestPsarameter(t *testing.T) { 110 | clearDir() 111 | lsm := buildLSM() 112 | testNil := func() { 113 | utils.CondPanic(lsm.Set(nil) != utils.ErrEmptyKey, fmt.Errorf("[testNil] lsm.Set(nil) != err")) 114 | _, err := lsm.Get(nil) 115 | utils.CondPanic(err != utils.ErrEmptyKey, fmt.Errorf("[testNil] lsm.Set(nil) != err")) 116 | } 117 | // TODO p2 优先级的case先忽略 118 | runTest(1, testNil) 119 | } 120 | 121 | // TestCompact 测试L0到Lmax压缩 122 | func TestCompact(t *testing.T) { 123 | clearDir() 124 | lsm := buildLSM() 125 | ok := false 126 | l0TOLMax := func() { 127 | // 正常触发即可 128 | baseTest(t, lsm, 128) 129 | // 直接触发压缩执行 130 | fid := lsm.levels.maxFID + 1 131 | lsm.levels.runOnce(1) 132 | for _, t := range lsm.levels.levels[6].tables { 133 | if t.fid == fid { 134 | ok = true 135 | } 136 | } 137 | utils.CondPanic(!ok, 
fmt.Errorf("[l0TOLMax] fid not found")) 138 | } 139 | l0ToL0 := func() { 140 | // 先写一些数据进来 141 | baseTest(t, lsm, 128) 142 | fid := lsm.levels.maxFID + 1 143 | cd := buildCompactDef(lsm, 0, 0, 0) 144 | // 非常tricky的处理方法,为了能通过检查 145 | tricky(cd.thisLevel.tables) 146 | ok := lsm.levels.fillTablesL0ToL0(cd) 147 | utils.CondPanic(!ok, fmt.Errorf("[l0ToL0] lsm.levels.fillTablesL0ToL0(cd) ret == false")) 148 | err := lsm.levels.runCompactDef(0, 0, *cd) 149 | // 删除全局状态,便于下游测试逻辑 150 | lsm.levels.compactState.delete(*cd) 151 | utils.Err(err) 152 | ok = false 153 | for _, t := range lsm.levels.levels[0].tables { 154 | if t.fid == fid { 155 | ok = true 156 | } 157 | } 158 | utils.CondPanic(!ok, fmt.Errorf("[l0ToL0] fid not found")) 159 | } 160 | nextCompact := func() { 161 | baseTest(t, lsm, 128) 162 | fid := lsm.levels.maxFID + 1 163 | cd := buildCompactDef(lsm, 0, 0, 1) 164 | // 非常tricky的处理方法,为了能通过检查 165 | tricky(cd.thisLevel.tables) 166 | ok := lsm.levels.fillTables(cd) 167 | utils.CondPanic(!ok, fmt.Errorf("[nextCompact] lsm.levels.fillTables(cd) ret == false")) 168 | err := lsm.levels.runCompactDef(0, 0, *cd) 169 | // 删除全局状态,便于下游测试逻辑 170 | lsm.levels.compactState.delete(*cd) 171 | utils.Err(err) 172 | ok = false 173 | for _, t := range lsm.levels.levels[1].tables { 174 | if t.fid == fid { 175 | ok = true 176 | } 177 | } 178 | utils.CondPanic(!ok, fmt.Errorf("[nextCompact] fid not found")) 179 | } 180 | 181 | maxToMax := func() { 182 | baseTest(t, lsm, 128) 183 | fid := lsm.levels.maxFID + 1 184 | cd := buildCompactDef(lsm, 6, 6, 6) 185 | // 非常tricky的处理方法,为了能通过检查 186 | tricky(cd.thisLevel.tables) 187 | ok := lsm.levels.fillTables(cd) 188 | utils.CondPanic(!ok, fmt.Errorf("[maxToMax] lsm.levels.fillTables(cd) ret == false")) 189 | err := lsm.levels.runCompactDef(0, 6, *cd) 190 | // 删除全局状态,便于下游测试逻辑 191 | lsm.levels.compactState.delete(*cd) 192 | utils.Err(err) 193 | ok = false 194 | for _, t := range lsm.levels.levels[6].tables { 195 | if t.fid == fid { 196 | ok = true 197 | 
} 198 | } 199 | utils.CondPanic(!ok, fmt.Errorf("[maxToMax] fid not found")) 200 | } 201 | parallerCompact := func() { 202 | baseTest(t, lsm, 128) 203 | cd := buildCompactDef(lsm, 0, 0, 1) 204 | // 非常tricky的处理方法,为了能通过检查 205 | tricky(cd.thisLevel.tables) 206 | ok := lsm.levels.fillTables(cd) 207 | utils.CondPanic(!ok, fmt.Errorf("[parallerCompact] lsm.levels.fillTables(cd) ret == false")) 208 | // 构建完全相同两个压缩计划的执行,以便于百分比构建 压缩冲突 209 | go lsm.levels.runCompactDef(0, 0, *cd) 210 | lsm.levels.runCompactDef(0, 0, *cd) 211 | // 检查compact status状态查看是否在执行并行压缩 212 | isParaller := false 213 | for _, state := range lsm.levels.compactState.levels { 214 | if len(state.ranges) != 0 { 215 | isParaller = true 216 | } 217 | } 218 | utils.CondPanic(!isParaller, fmt.Errorf("[parallerCompact] not is paralle")) 219 | } 220 | // 运行N次测试多个sst的影响 221 | runTest(1, l0TOLMax, l0ToL0, nextCompact, maxToMax, parallerCompact) 222 | } 223 | 224 | // 正确性测试 225 | func baseTest(t *testing.T, lsm *LSM, n int) { 226 | // 用来跟踪调试的 227 | e := &utils.Entry{ 228 | Key: []byte("CRTS😁硬核课堂MrGSBtL12345678"), 229 | Value: []byte("我草了"), 230 | ExpiresAt: 123, 231 | } 232 | //caseList := make([]*utils.Entry, 0) 233 | //caseList = append(caseList, e) 234 | 235 | // 随机构建数据进行测试 236 | lsm.Set(e) 237 | for i := 1; i < n; i++ { 238 | ee := utils.BuildEntry() 239 | lsm.Set(ee) 240 | // caseList = append(caseList, ee) 241 | } 242 | // 从levels中进行GET 243 | v, err := lsm.Get(e.Key) 244 | utils.Panic(err) 245 | utils.CondPanic(!bytes.Equal(e.Value, v.Value), fmt.Errorf("lsm.Get(e.Key) value not equal !!!")) 246 | // TODO range功能待完善 247 | //retList := make([]*utils.Entry, 0) 248 | // testRange := func(isAsc bool) { 249 | // // Range 确保写入进去的每个lsm都可以被读取到 250 | // iter := lsm.NewIterator(&utils.Options{IsAsc: true}) 251 | // for iter.Rewind(); iter.Valid(); iter.Next() { 252 | // e := iter.Item().Entry() 253 | // retList = append(retList, e) 254 | // } 255 | // utils.CondPanic(len(retList) != len(caseList), 
fmt.Errorf("len(retList) != len(caseList)")) 256 | // sort.Slice(retList, func(i, j int) bool { 257 | // return utils.CompareKeys(retList[i].Key, retList[j].Key) > 1 258 | // }) 259 | // for i := 0; i < len(caseList); i++ { 260 | // a, b := caseList[i], retList[i] 261 | // if !equal(a.Key, b.Key) || !equal(a.Value, b.Value) || a.ExpiresAt != b.ExpiresAt { 262 | // utils.Panic(fmt.Errorf("lsm.Get(e.Key) kv disagreement !!!")) 263 | // } 264 | // } 265 | // } 266 | // // 测试升序 267 | // testRange(true) 268 | // // 测试降序 269 | // testRange(false) 270 | } 271 | 272 | // 驱动模块 273 | func buildLSM() *LSM { 274 | // init DB Basic Test 275 | c := make(chan map[uint32]int64, 16) 276 | opt.DiscardStatsCh = &c 277 | lsm := NewLSM(opt) 278 | return lsm 279 | } 280 | 281 | // 运行测试用例 282 | func runTest(n int, testFunList ...func()) { 283 | for _, f := range testFunList { 284 | for i := 0; i < n; i++ { 285 | f() 286 | } 287 | } 288 | } 289 | 290 | // 构建compactDef对象 291 | func buildCompactDef(lsm *LSM, id, thisLevel, nextLevel int) *compactDef { 292 | t := targets{ 293 | targetSz: []int64{0, 10485760, 10485760, 10485760, 10485760, 10485760, 10485760}, 294 | fileSz: []int64{1024, 2097152, 2097152, 2097152, 2097152, 2097152, 2097152}, 295 | baseLevel: nextLevel, 296 | } 297 | def := &compactDef{ 298 | compactorId: id, 299 | thisLevel: lsm.levels.levels[thisLevel], 300 | nextLevel: lsm.levels.levels[nextLevel], 301 | t: t, 302 | p: buildCompactionPriority(lsm, thisLevel, t), 303 | } 304 | return def 305 | } 306 | 307 | // 构建CompactionPriority对象 308 | func buildCompactionPriority(lsm *LSM, thisLevel int, t targets) compactionPriority { 309 | return compactionPriority{ 310 | level: thisLevel, 311 | score: 8.6, 312 | adjusted: 860, 313 | t: t, 314 | } 315 | } 316 | 317 | func tricky(tables []*table) { 318 | // 非常tricky的处理方法,为了能通过检查,检查所有逻辑分支 319 | for _, table := range tables { 320 | table.ss.Indexs().StaleDataSize = 10 << 20 321 | t, _ := time.Parse("2006-01-02 15:04:05", "1995-08-10 
00:00:00") 322 | table.ss.SetCreatedAt(&t) 323 | } 324 | } 325 | func clearDir() { 326 | _, err := os.Stat(opt.WorkDir) 327 | if err == nil { 328 | os.RemoveAll(opt.WorkDir) 329 | } 330 | os.Mkdir(opt.WorkDir, os.ModePerm) 331 | } 332 | -------------------------------------------------------------------------------- /db.go: -------------------------------------------------------------------------------- 1 | package corekv 2 | 3 | import ( 4 | "expvar" 5 | "fmt" 6 | "math" 7 | "sync" 8 | "sync/atomic" 9 | "time" 10 | 11 | "github.com/hardcore-os/corekv/lsm" 12 | "github.com/hardcore-os/corekv/utils" 13 | "github.com/pkg/errors" 14 | ) 15 | 16 | type ( 17 | // coreKV对外提供的功能集合 18 | CoreAPI interface { 19 | Set(data *utils.Entry) error 20 | Get(key []byte) (*utils.Entry, error) 21 | Del(key []byte) error 22 | NewIterator(opt *utils.Options) utils.Iterator 23 | Info() *Stats 24 | Close() error 25 | } 26 | 27 | // DB 对外暴露的接口对象 全局唯一,持有各种资源句柄 28 | DB struct { 29 | sync.RWMutex 30 | opt *Options 31 | lsm *lsm.LSM 32 | vlog *valueLog 33 | stats *Stats 34 | flushChan chan flushTask // For flushing memtables. 35 | writeCh chan *request 36 | blockWrites int32 37 | vhead *utils.ValuePtr 38 | logRotates int32 39 | } 40 | ) 41 | 42 | var ( 43 | head = []byte("!corekv!head") // For storing value offset for replay. 44 | ) 45 | 46 | /** 47 | SSTableMaxSz: 1024, 48 | MemTableSize: 1024, 49 | BlockSize: 1024, 50 | BloomFalsePositive: 0, 51 | BaseLevelSize: 10 << 20, 52 | LevelSizeMultiplier: 10, 53 | BaseTableSize: 2 << 20, 54 | TableSizeMultiplier: 2, 55 | NumLevelZeroTables: 15, 56 | MaxLevelNum: 7, 57 | NumCompactors: 3, 58 | */ 59 | // Open DB 60 | // TODO 这里是不是要上一个目录锁比较好,防止多个进程打开同一个目录? 
61 | func Open(opt *Options) *DB { 62 | c := utils.NewCloser() 63 | db := &DB{opt: opt} 64 | // 初始化vlog结构 65 | db.initVLog() 66 | // 初始化LSM结构 67 | db.lsm = lsm.NewLSM(&lsm.Options{ 68 | WorkDir: opt.WorkDir, 69 | MemTableSize: opt.MemTableSize, 70 | SSTableMaxSz: opt.SSTableMaxSz, 71 | BlockSize: 8 * 1024, 72 | BloomFalsePositive: 0, //0.01, 73 | BaseLevelSize: 10 << 20, 74 | LevelSizeMultiplier: 10, 75 | BaseTableSize: 5 << 20, 76 | TableSizeMultiplier: 2, 77 | NumLevelZeroTables: 15, 78 | MaxLevelNum: 7, 79 | NumCompactors: 1, 80 | DiscardStatsCh: &(db.vlog.lfDiscardStats.flushChan), 81 | }) 82 | // 初始化统计信息 83 | db.stats = newStats(opt) 84 | // 启动 sstable 的合并压缩过程 85 | go db.lsm.StartCompacter() 86 | // 准备vlog gc 87 | c.Add(1) 88 | db.writeCh = make(chan *request) 89 | db.flushChan = make(chan flushTask, 16) 90 | go db.doWrites(c) 91 | // 启动 info 统计过程 92 | go db.stats.StartStats() 93 | return db 94 | } 95 | 96 | func (db *DB) Close() error { 97 | db.vlog.lfDiscardStats.closer.Close() 98 | if err := db.lsm.Close(); err != nil { 99 | return err 100 | } 101 | if err := db.vlog.close(); err != nil { 102 | return err 103 | } 104 | if err := db.stats.close(); err != nil { 105 | return err 106 | } 107 | return nil 108 | } 109 | 110 | func (db *DB) Del(key []byte) error { 111 | // 写入一个值为nil的entry 作为墓碑消息实现删除 112 | return db.Set(&utils.Entry{ 113 | Key: key, 114 | Value: nil, 115 | ExpiresAt: 0, 116 | }) 117 | } 118 | func (db *DB) Set(data *utils.Entry) error { 119 | if data == nil || len(data.Key) == 0 { 120 | return utils.ErrEmptyKey 121 | } 122 | // 做一些必要性的检查 123 | // 如果value 大于一个阈值 则创建值指针,并将其写入vlog中 124 | var ( 125 | vp *utils.ValuePtr 126 | err error 127 | ) 128 | data.Key = utils.KeyWithTs(data.Key, math.MaxUint32) 129 | // 如果value不应该直接写入LSM 则先写入 vlog文件,这时必须保证vlog具有重放功能 130 | // 以便于崩溃后恢复数据 131 | if !db.shouldWriteValueToLSM(data) { 132 | if vp, err = db.vlog.newValuePtr(data); err != nil { 133 | return err 134 | } 135 | data.Meta |= utils.BitValuePointer 136 | 
data.Value = vp.Encode() 137 | } 138 | return db.lsm.Set(data) 139 | } 140 | func (db *DB) Get(key []byte) (*utils.Entry, error) { 141 | if len(key) == 0 { 142 | return nil, utils.ErrEmptyKey 143 | } 144 | 145 | originKey := key 146 | var ( 147 | entry *utils.Entry 148 | err error 149 | ) 150 | key = utils.KeyWithTs(key, math.MaxUint32) 151 | // 从LSM中查询entry,这时不确定entry是不是值指针 152 | if entry, err = db.lsm.Get(key); err != nil { 153 | return entry, err 154 | } 155 | // 检查从lsm拿到的value是否是value ptr,是则从vlog中拿值 156 | if entry != nil && utils.IsValuePtr(entry) { 157 | var vp utils.ValuePtr 158 | vp.Decode(entry.Value) 159 | result, cb, err := db.vlog.read(&vp) 160 | defer utils.RunCallback(cb) 161 | if err != nil { 162 | return nil, err 163 | } 164 | entry.Value = utils.SafeCopy(nil, result) 165 | } 166 | 167 | if lsm.IsDeletedOrExpired(entry) { 168 | return nil, utils.ErrKeyNotFound 169 | } 170 | entry.Key = originKey 171 | return entry, nil 172 | } 173 | 174 | func (db *DB) Info() *Stats { 175 | // 读取stats结构,打包数据并返回 176 | return db.stats 177 | } 178 | 179 | // RunValueLogGC triggers a value log garbage collection. 
180 | func (db *DB) RunValueLogGC(discardRatio float64) error { 181 | if discardRatio >= 1.0 || discardRatio <= 0.0 { 182 | return utils.ErrInvalidRequest 183 | } 184 | // Find head on disk 185 | headKey := utils.KeyWithTs(head, math.MaxUint64) 186 | val, err := db.lsm.Get(headKey) 187 | if err != nil { 188 | if err == utils.ErrKeyNotFound { 189 | val = &utils.Entry{ 190 | Key: headKey, 191 | Value: []byte{}, 192 | } 193 | } else { 194 | return errors.Wrap(err, "Retrieving head from on-disk LSM") 195 | } 196 | } 197 | 198 | // 内部key head 一定是value ptr 不需要检查内容 199 | var head utils.ValuePtr 200 | if len(val.Value) > 0 { 201 | head.Decode(val.Value) 202 | } 203 | 204 | // Pick a log file and run GC 205 | return db.vlog.runGC(discardRatio, &head) 206 | } 207 | 208 | func (db *DB) shouldWriteValueToLSM(e *utils.Entry) bool { 209 | return int64(len(e.Value)) < db.opt.ValueThreshold 210 | } 211 | 212 | func (db *DB) sendToWriteCh(entries []*utils.Entry) (*request, error) { 213 | if atomic.LoadInt32(&db.blockWrites) == 1 { 214 | return nil, utils.ErrBlockedWrites 215 | } 216 | var count, size int64 217 | for _, e := range entries { 218 | size += int64(e.EstimateSize(int(db.opt.ValueThreshold))) 219 | count++ 220 | } 221 | if count >= db.opt.MaxBatchCount || size >= db.opt.MaxBatchSize { 222 | return nil, utils.ErrTxnTooBig 223 | } 224 | 225 | // TODO 尝试使用对象复用,后面entry对象也应该使用 226 | req := requestPool.Get().(*request) 227 | req.reset() 228 | req.Entries = entries 229 | req.Wg.Add(1) 230 | req.IncrRef() // for db write 231 | db.writeCh <- req // Handled in doWrites. 
232 | return req, nil 233 | } 234 | 235 | // Check(kv.BatchSet(entries)) 236 | func (db *DB) batchSet(entries []*utils.Entry) error { 237 | req, err := db.sendToWriteCh(entries) 238 | if err != nil { 239 | return err 240 | } 241 | 242 | return req.Wait() 243 | } 244 | 245 | func (db *DB) doWrites(lc *utils.Closer) { 246 | defer lc.Done() 247 | pendingCh := make(chan struct{}, 1) 248 | 249 | writeRequests := func(reqs []*request) { 250 | if err := db.writeRequests(reqs); err != nil { 251 | utils.Err(fmt.Errorf("writeRequests: %v", err)) 252 | } 253 | <-pendingCh 254 | } 255 | 256 | // This variable tracks the number of pending writes. 257 | reqLen := new(expvar.Int) 258 | 259 | reqs := make([]*request, 0, 10) 260 | for { 261 | var r *request 262 | select { 263 | case r = <-db.writeCh: 264 | case <-lc.CloseSignal: 265 | goto closedCase 266 | } 267 | 268 | for { 269 | reqs = append(reqs, r) 270 | reqLen.Set(int64(len(reqs))) 271 | 272 | if len(reqs) >= 3*utils.KVWriteChCapacity { 273 | pendingCh <- struct{}{} // blocking. 274 | goto writeCase 275 | } 276 | 277 | select { 278 | // Either push to pending, or continue to pick from writeCh. 279 | case r = <-db.writeCh: 280 | case pendingCh <- struct{}{}: 281 | goto writeCase 282 | case <-lc.CloseSignal: 283 | goto closedCase 284 | } 285 | } 286 | 287 | closedCase: 288 | // All the pending request are drained. 289 | // Don't close the writeCh, because it has be used in several places. 290 | for { 291 | select { 292 | case r = <-db.writeCh: 293 | reqs = append(reqs, r) 294 | default: 295 | pendingCh <- struct{}{} // Push to pending before doing a write. 296 | writeRequests(reqs) 297 | return 298 | } 299 | } 300 | 301 | writeCase: 302 | go writeRequests(reqs) 303 | reqs = make([]*request, 0, 10) 304 | reqLen.Set(0) 305 | } 306 | } 307 | 308 | // writeRequests is called serially by only one goroutine. 
309 | func (db *DB) writeRequests(reqs []*request) error { 310 | if len(reqs) == 0 { 311 | return nil 312 | } 313 | 314 | done := func(err error) { 315 | for _, r := range reqs { 316 | r.Err = err 317 | r.Wg.Done() 318 | } 319 | } 320 | err := db.vlog.write(reqs) 321 | if err != nil { 322 | done(err) 323 | return err 324 | } 325 | var count int 326 | for _, b := range reqs { 327 | if len(b.Entries) == 0 { 328 | continue 329 | } 330 | count += len(b.Entries) 331 | if err != nil { 332 | done(err) 333 | return errors.Wrap(err, "writeRequests") 334 | } 335 | if err := db.writeToLSM(b); err != nil { 336 | done(err) 337 | return errors.Wrap(err, "writeRequests") 338 | } 339 | db.Lock() 340 | db.updateHead(b.Ptrs) 341 | db.Unlock() 342 | } 343 | done(nil) 344 | return nil 345 | } 346 | func (db *DB) writeToLSM(b *request) error { 347 | if len(b.Ptrs) != len(b.Entries) { 348 | return errors.Errorf("Ptrs and Entries don't match: %+v", b) 349 | } 350 | 351 | for i, entry := range b.Entries { 352 | if db.shouldWriteValueToLSM(entry) { // Will include deletion / tombstone case. 353 | entry.Meta = entry.Meta &^ utils.BitValuePointer 354 | } else { 355 | entry.Meta = entry.Meta | utils.BitValuePointer 356 | entry.Value = b.Ptrs[i].Encode() 357 | } 358 | db.lsm.Set(entry) 359 | } 360 | return nil 361 | } 362 | func (req *request) IncrRef() { 363 | atomic.AddInt32(&req.ref, 1) 364 | } 365 | 366 | func (req *request) DecrRef() { 367 | nRef := atomic.AddInt32(&req.ref, -1) 368 | if nRef > 0 { 369 | return 370 | } 371 | req.Entries = nil 372 | requestPool.Put(req) 373 | } 374 | 375 | func (req *request) Wait() error { 376 | req.Wg.Wait() 377 | err := req.Err 378 | req.DecrRef() // DecrRef after writing to DB. 379 | return err 380 | } 381 | 382 | // 结构体 383 | type flushTask struct { 384 | mt *utils.Skiplist 385 | vptr *utils.ValuePtr 386 | dropPrefixes [][]byte 387 | } 388 | 389 | func (db *DB) pushHead(ft flushTask) error { 390 | // Ensure we never push a zero valued head pointer. 
391 | if ft.vptr.IsZero() { 392 | return errors.New("Head should not be zero") 393 | } 394 | 395 | fmt.Printf("Storing value log head: %+v\n", ft.vptr) 396 | val := ft.vptr.Encode() 397 | 398 | // Pick the max commit ts, so in case of crash, our read ts would be higher than all the 399 | // commits. 400 | headTs := utils.KeyWithTs(head, uint64(time.Now().Unix()/1e9)) 401 | ft.mt.Add(&utils.Entry{ 402 | Key: headTs, 403 | Value: val, 404 | }) 405 | return nil 406 | } 407 | -------------------------------------------------------------------------------- /lsm/levels.go: -------------------------------------------------------------------------------- 1 | package lsm 2 | 3 | import ( 4 | "bytes" 5 | "sort" 6 | "sync" 7 | "sync/atomic" 8 | 9 | "github.com/hardcore-os/corekv/file" 10 | "github.com/hardcore-os/corekv/utils" 11 | ) 12 | 13 | // initLevelManager 初始化函数 14 | func (lsm *LSM) initLevelManager(opt *Options) *levelManager { 15 | lm := &levelManager{lsm: lsm} // 反引用 16 | lm.compactState = lsm.newCompactStatus() 17 | lm.opt = opt 18 | // 读取manifest文件构建管理器 19 | if err := lm.loadManifest(); err != nil { 20 | panic(err) 21 | } 22 | lm.build() 23 | return lm 24 | } 25 | 26 | type levelManager struct { 27 | maxFID uint64 // 已经分配出去的最大fid,只要创建了memtable 就算已分配 28 | opt *Options 29 | cache *cache 30 | manifestFile *file.ManifestFile 31 | levels []*levelHandler 32 | lsm *LSM 33 | compactState *compactStatus 34 | } 35 | 36 | func (lm *levelManager) close() error { 37 | if err := lm.cache.close(); err != nil { 38 | return err 39 | } 40 | if err := lm.manifestFile.Close(); err != nil { 41 | return err 42 | } 43 | for i := range lm.levels { 44 | if err := lm.levels[i].close(); err != nil { 45 | return err 46 | } 47 | } 48 | return nil 49 | } 50 | 51 | func (lm *levelManager) iterators() []utils.Iterator { 52 | 53 | itrs := make([]utils.Iterator, 0, len(lm.levels)) 54 | for _, level := range lm.levels { 55 | itrs = append(itrs, level.iterators()...) 
56 | } 57 | return itrs 58 | } 59 | 60 | func (lm *levelManager) Get(key []byte) (*utils.Entry, error) { 61 | var ( 62 | entry *utils.Entry 63 | err error 64 | ) 65 | // L0层查询 66 | if entry, err = lm.levels[0].Get(key); entry != nil { 67 | return entry, err 68 | } 69 | // L1-7层查询 70 | for level := 1; level < lm.opt.MaxLevelNum; level++ { 71 | ld := lm.levels[level] 72 | if entry, err = ld.Get(key); entry != nil { 73 | return entry, err 74 | } 75 | } 76 | return entry, utils.ErrKeyNotFound 77 | } 78 | 79 | func (lm *levelManager) loadCache() { 80 | 81 | } 82 | func (lm *levelManager) loadManifest() (err error) { 83 | lm.manifestFile, err = file.OpenManifestFile(&file.Options{Dir: lm.opt.WorkDir}) 84 | return err 85 | } 86 | func (lm *levelManager) build() error { 87 | lm.levels = make([]*levelHandler, 0, lm.opt.MaxLevelNum) 88 | for i := 0; i < lm.opt.MaxLevelNum; i++ { 89 | lm.levels = append(lm.levels, &levelHandler{ 90 | levelNum: i, 91 | tables: make([]*table, 0), 92 | lm: lm, 93 | }) 94 | } 95 | 96 | manifest := lm.manifestFile.GetManifest() 97 | // 对比manifest 文件的正确性 98 | if err := lm.manifestFile.RevertToManifest(utils.LoadIDMap(lm.opt.WorkDir)); err != nil { 99 | return err 100 | } 101 | // 逐一加载sstable 的index block 构建cache 102 | lm.cache = newCache(lm.opt) 103 | // TODO 初始化的时候index 结构放在了table中,相当于全部加载到了内存,减少了一次读磁盘,但增加了内存消耗 104 | var maxFID uint64 105 | for fID, tableInfo := range manifest.Tables { 106 | fileName := utils.FileNameSSTable(lm.opt.WorkDir, fID) 107 | if fID > maxFID { 108 | maxFID = fID 109 | } 110 | t := openTable(lm, fileName, nil) 111 | lm.levels[tableInfo.Level].add(t) 112 | lm.levels[tableInfo.Level].addSize(t) // 记录一个level的文件总大小 113 | } 114 | // 对每一层进行排序 115 | for i := 0; i < lm.opt.MaxLevelNum; i++ { 116 | lm.levels[i].Sort() 117 | } 118 | // 得到最大的fid值 119 | atomic.AddUint64(&lm.maxFID, maxFID) 120 | return nil 121 | } 122 | 123 | // 向L0层flush一个sstable 124 | func (lm *levelManager) flush(immutable *memTable) (err error) { 125 | // 分配一个fid 
126 | fid := immutable.wal.Fid() 127 | sstName := utils.FileNameSSTable(lm.opt.WorkDir, fid) 128 | 129 | // 构建一个 builder 130 | builder := newTableBuiler(lm.opt) 131 | iter := immutable.sl.NewSkipListIterator() 132 | for iter.Rewind(); iter.Valid(); iter.Next() { 133 | entry := iter.Item().Entry() 134 | builder.add(entry, false) 135 | } 136 | // 创建一个 table 对象 137 | table := openTable(lm, sstName, builder) 138 | err = lm.manifestFile.AddTableMeta(0, &file.TableMeta{ 139 | ID: fid, 140 | Checksum: []byte{'m', 'o', 'c', 'k'}, 141 | }) 142 | // manifest写入失败直接panic 143 | utils.Panic(err) 144 | // 更新manifest文件 145 | lm.levels[0].add(table) 146 | return 147 | } 148 | 149 | //--------- level处理器 ------- 150 | type levelHandler struct { 151 | sync.RWMutex 152 | levelNum int 153 | tables []*table 154 | totalSize int64 155 | totalStaleSize int64 156 | lm *levelManager 157 | } 158 | 159 | func (lh *levelHandler) close() error { 160 | for i := range lh.tables { 161 | if err := lh.tables[i].ss.Close(); err != nil { 162 | return err 163 | } 164 | } 165 | return nil 166 | } 167 | func (lh *levelHandler) add(t *table) { 168 | lh.Lock() 169 | defer lh.Unlock() 170 | lh.tables = append(lh.tables, t) 171 | } 172 | func (lh *levelHandler) addBatch(ts []*table) { 173 | lh.Lock() 174 | defer lh.Unlock() 175 | lh.tables = append(lh.tables, ts...) 
176 | } 177 | 178 | func (lh *levelHandler) getTotalSize() int64 { 179 | lh.RLock() 180 | defer lh.RUnlock() 181 | return lh.totalSize 182 | } 183 | 184 | func (lh *levelHandler) addSize(t *table) { 185 | lh.totalSize += t.Size() 186 | lh.totalStaleSize += int64(t.StaleDataSize()) 187 | } 188 | 189 | func (lh *levelHandler) subtractSize(t *table) { 190 | lh.totalSize -= t.Size() 191 | lh.totalStaleSize -= int64(t.StaleDataSize()) 192 | } 193 | 194 | func (lh *levelHandler) numTables() int { 195 | lh.RLock() 196 | defer lh.RUnlock() 197 | return len(lh.tables) 198 | } 199 | 200 | func (lh *levelHandler) Get(key []byte) (*utils.Entry, error) { 201 | // 如果是第0层文件则进行特殊处理 202 | if lh.levelNum == 0 { 203 | // TODO: logic... 204 | // 获取可能存在key的sst 205 | return lh.searchL0SST(key) 206 | } else { 207 | // TODO: logic... 208 | return lh.searchLNSST(key) 209 | } 210 | } 211 | 212 | func (lh *levelHandler) Sort() { 213 | lh.Lock() 214 | defer lh.Unlock() 215 | if lh.levelNum == 0 { 216 | // Key range will overlap. Just sort by fileID in ascending order 217 | // because newer tables are at the end of level 0. 218 | sort.Slice(lh.tables, func(i, j int) bool { 219 | return lh.tables[i].fid < lh.tables[j].fid 220 | }) 221 | } else { 222 | // Sort tables by keys. 
223 | sort.Slice(lh.tables, func(i, j int) bool { 224 | return utils.CompareKeys(lh.tables[i].ss.MinKey(), lh.tables[j].ss.MinKey()) < 0 225 | }) 226 | } 227 | } 228 | 229 | func (lh *levelHandler) searchL0SST(key []byte) (*utils.Entry, error) { 230 | var version uint64 231 | for _, table := range lh.tables { 232 | if entry, err := table.Serach(key, &version); err == nil { 233 | return entry, nil 234 | } 235 | } 236 | return nil, utils.ErrKeyNotFound 237 | } 238 | func (lh *levelHandler) searchLNSST(key []byte) (*utils.Entry, error) { 239 | table := lh.getTable(key) 240 | var version uint64 241 | if table == nil { 242 | return nil, utils.ErrKeyNotFound 243 | } 244 | if entry, err := table.Serach(key, &version); err == nil { 245 | return entry, nil 246 | } 247 | return nil, utils.ErrKeyNotFound 248 | } 249 | func (lh *levelHandler) getTable(key []byte) *table { 250 | if len(lh.tables) > 0 && (bytes.Compare(key, lh.tables[0].ss.MinKey()) < 0 || bytes.Compare(key, lh.tables[len(lh.tables)-1].ss.MaxKey()) > 0) { 251 | return nil 252 | } else { 253 | for i := len(lh.tables) - 1; i >= 0; i-- { 254 | if bytes.Compare(key, lh.tables[i].ss.MinKey()) > -1 && 255 | bytes.Compare(key, lh.tables[i].ss.MaxKey()) < 1 { 256 | return lh.tables[i] 257 | } 258 | } 259 | } 260 | return nil 261 | } 262 | func (lh *levelHandler) isLastLevel() bool { 263 | return lh.levelNum == lh.lm.opt.MaxLevelNum-1 264 | } 265 | 266 | type levelHandlerRLocked struct{} 267 | 268 | // overlappingTables returns the tables that intersect with key range. Returns a half-interval. 269 | // This function should already have acquired a read lock, and this is so important the caller must 270 | // pass an empty parameter declaring such. 
271 | func (lh *levelHandler) overlappingTables(_ levelHandlerRLocked, kr keyRange) (int, int) { 272 | if len(kr.left) == 0 || len(kr.right) == 0 { 273 | return 0, 0 274 | } 275 | left := sort.Search(len(lh.tables), func(i int) bool { 276 | return utils.CompareKeys(kr.left, lh.tables[i].ss.MaxKey()) <= 0 277 | }) 278 | right := sort.Search(len(lh.tables), func(i int) bool { 279 | return utils.CompareKeys(kr.right, lh.tables[i].ss.MaxKey()) < 0 280 | }) 281 | return left, right 282 | } 283 | 284 | // replaceTables will replace tables[left:right] with newTables. Note this EXCLUDES tables[right]. 285 | // You must call decr() to delete the old tables _after_ writing the update to the manifest. 286 | func (lh *levelHandler) replaceTables(toDel, toAdd []*table) error { 287 | // Need to re-search the range of tables in this level to be replaced as other goroutines might 288 | // be changing it as well. (They can't touch our tables, but if they add/remove other tables, 289 | // the indices get shifted around.) 290 | lh.Lock() // We s.Unlock() below. 291 | 292 | toDelMap := make(map[uint64]struct{}) 293 | for _, t := range toDel { 294 | toDelMap[t.fid] = struct{}{} 295 | } 296 | var newTables []*table 297 | for _, t := range lh.tables { 298 | _, found := toDelMap[t.fid] 299 | if !found { 300 | newTables = append(newTables, t) 301 | continue 302 | } 303 | lh.subtractSize(t) 304 | } 305 | 306 | // Increase totalSize first. 307 | for _, t := range toAdd { 308 | lh.addSize(t) 309 | t.IncrRef() 310 | newTables = append(newTables, t) 311 | } 312 | 313 | // Assign tables. 314 | lh.tables = newTables 315 | sort.Slice(lh.tables, func(i, j int) bool { 316 | return utils.CompareKeys(lh.tables[i].ss.MinKey(), lh.tables[i].ss.MinKey()) < 0 317 | }) 318 | lh.Unlock() // s.Unlock before we DecrRef tables -- that can be slow. 319 | return decrRefs(toDel) 320 | } 321 | 322 | // deleteTables remove tables idx0, ..., idx1-1. 
323 | func (lh *levelHandler) deleteTables(toDel []*table) error { 324 | lh.Lock() // s.Unlock() below 325 | 326 | toDelMap := make(map[uint64]struct{}) 327 | for _, t := range toDel { 328 | toDelMap[t.fid] = struct{}{} 329 | } 330 | 331 | // Make a copy as iterators might be keeping a slice of tables. 332 | var newTables []*table 333 | for _, t := range lh.tables { 334 | _, found := toDelMap[t.fid] 335 | if !found { 336 | newTables = append(newTables, t) 337 | continue 338 | } 339 | lh.subtractSize(t) 340 | } 341 | lh.tables = newTables 342 | 343 | lh.Unlock() // Unlock s _before_ we DecrRef our tables, which can be slow. 344 | 345 | return decrRefs(toDel) 346 | } 347 | 348 | func (lh *levelHandler) iterators() []utils.Iterator { 349 | lh.RLock() 350 | defer lh.RUnlock() 351 | topt := &utils.Options{IsAsc: true} 352 | if lh.levelNum == 0 { 353 | return iteratorsReversed(lh.tables, topt) 354 | } 355 | 356 | if len(lh.tables) == 0 { 357 | return nil 358 | } 359 | return []utils.Iterator{NewConcatIterator(lh.tables, topt)} 360 | } 361 | -------------------------------------------------------------------------------- /lsm/table.go: -------------------------------------------------------------------------------- 1 | // Copyright 2021 hardcore-os Project Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License") 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package lsm 16 | 17 | import ( 18 | "encoding/binary" 19 | "fmt" 20 | "io" 21 | "math" 22 | "os" 23 | "sort" 24 | "strings" 25 | "sync/atomic" 26 | "time" 27 | 28 | "github.com/hardcore-os/corekv/file" 29 | "github.com/hardcore-os/corekv/pb" 30 | "github.com/hardcore-os/corekv/utils" 31 | "github.com/pkg/errors" 32 | ) 33 | 34 | type table struct { 35 | ss *file.SSTable 36 | lm *levelManager 37 | fid uint64 38 | ref int32 // For file garbage collection. Atomic. 39 | } 40 | 41 | func openTable(lm *levelManager, tableName string, builder *tableBuilder) *table { 42 | sstSize := int(lm.opt.SSTableMaxSz) 43 | if builder != nil { 44 | sstSize = int(builder.done().size) 45 | } 46 | var ( 47 | t *table 48 | err error 49 | ) 50 | fid := utils.FID(tableName) 51 | // 对builder存在的情况 把buf flush到磁盘 52 | if builder != nil { 53 | if t, err = builder.flush(lm, tableName); err != nil { 54 | utils.Err(err) 55 | return nil 56 | } 57 | } else { 58 | t = &table{lm: lm, fid: fid} 59 | // 如果没有builder 则创打开一个已经存在的sst文件 60 | t.ss = file.OpenSStable(&file.Options{ 61 | FileName: tableName, 62 | Dir: lm.opt.WorkDir, 63 | Flag: os.O_CREATE | os.O_RDWR, 64 | MaxSz: int(sstSize)}) 65 | } 66 | // 先要引用一下,否则后面使用迭代器会导致引用状态错误 67 | t.IncrRef() 68 | // 初始化sst文件,把index加载进来 69 | if err := t.ss.Init(); err != nil { 70 | utils.Err(err) 71 | return nil 72 | } 73 | 74 | // 获取sst的最大key 需要使用迭代器 75 | itr := t.NewIterator(&utils.Options{}) // 默认是降序 76 | defer itr.Close() 77 | // 定位到初始位置就是最大的key 78 | itr.Rewind() 79 | utils.CondPanic(!itr.Valid(), errors.Errorf("failed to read index, form maxKey")) 80 | maxKey := itr.Item().Entry().Key 81 | t.ss.SetMaxKey(maxKey) 82 | 83 | return t 84 | } 85 | 86 | // Serach 从table中查找key 87 | func (t *table) Serach(key []byte, maxVs *uint64) (entry *utils.Entry, err error) { 88 | t.IncrRef() 89 | defer t.DecrRef() 90 | // 获取索引 91 | idx := t.ss.Indexs() 92 | // 检查key是否存在 93 | bloomFilter := utils.Filter(idx.BloomFilter) 94 | if t.ss.HasBloomFilter() && 
!bloomFilter.MayContainKey(key) { 95 | return nil, utils.ErrKeyNotFound 96 | } 97 | iter := t.NewIterator(&utils.Options{}) 98 | defer iter.Close() 99 | 100 | iter.Seek(key) 101 | if !iter.Valid() { 102 | return nil, utils.ErrKeyNotFound 103 | } 104 | 105 | if utils.SameKey(key, iter.Item().Entry().Key) { 106 | if version := utils.ParseTs(iter.Item().Entry().Key); *maxVs < version { 107 | *maxVs = version 108 | return iter.Item().Entry(), nil 109 | } 110 | } 111 | return nil, utils.ErrKeyNotFound 112 | } 113 | 114 | func (t *table) indexKey() uint64 { 115 | return t.fid 116 | } 117 | func (t *table) getEntry(key, block []byte, idx int) (entry *utils.Entry, err error) { 118 | if len(block) == 0 { 119 | return nil, utils.ErrKeyNotFound 120 | } 121 | dataStr := string(block) 122 | blocks := strings.Split(dataStr, ",") 123 | if idx >= 0 && idx < len(blocks) { 124 | return &utils.Entry{ 125 | Key: key, 126 | Value: []byte(blocks[idx]), 127 | }, nil 128 | } 129 | return nil, utils.ErrKeyNotFound 130 | } 131 | 132 | // 去加载sst对应的block 133 | func (t *table) block(idx int) (*block, error) { 134 | utils.CondPanic(idx < 0, fmt.Errorf("idx=%d", idx)) 135 | if idx >= len(t.ss.Indexs().Offsets) { 136 | return nil, errors.New("block out of index") 137 | } 138 | var b *block 139 | key := t.blockCacheKey(idx) 140 | blk, ok := t.lm.cache.blocks.Get(key) 141 | if ok && blk != nil { 142 | b, _ = blk.(*block) 143 | return b, nil 144 | } 145 | 146 | var ko pb.BlockOffset 147 | utils.CondPanic(!t.offsets(&ko, idx), fmt.Errorf("block t.offset id=%d", idx)) 148 | b = &block{ 149 | offset: int(ko.GetOffset()), 150 | } 151 | 152 | var err error 153 | if b.data, err = t.read(b.offset, int(ko.GetLen())); err != nil { 154 | return nil, errors.Wrapf(err, 155 | "failed to read from sstable: %d at offset: %d, len: %d", 156 | t.ss.FID(), b.offset, ko.GetLen()) 157 | } 158 | 159 | readPos := len(b.data) - 4 // First read checksum length. 
160 | b.chkLen = int(utils.BytesToU32(b.data[readPos : readPos+4])) 161 | 162 | if b.chkLen > len(b.data) { 163 | return nil, errors.New("invalid checksum length. Either the data is " + 164 | "corrupted or the table options are incorrectly set") 165 | } 166 | 167 | readPos -= b.chkLen 168 | b.checksum = b.data[readPos : readPos+b.chkLen] 169 | 170 | b.data = b.data[:readPos] 171 | 172 | if err = b.verifyCheckSum(); err != nil { 173 | return nil, err 174 | } 175 | 176 | readPos -= 4 177 | numEntries := int(utils.BytesToU32(b.data[readPos : readPos+4])) 178 | entriesIndexStart := readPos - (numEntries * 4) 179 | entriesIndexEnd := entriesIndexStart + numEntries*4 180 | 181 | b.entryOffsets = utils.BytesToU32Slice(b.data[entriesIndexStart:entriesIndexEnd]) 182 | 183 | b.entriesIndexStart = entriesIndexStart 184 | 185 | t.lm.cache.blocks.Set(key, b) 186 | 187 | return b, nil 188 | } 189 | 190 | func (t *table) read(off, sz int) ([]byte, error) { 191 | return t.ss.Bytes(off, sz) 192 | } 193 | 194 | // blockCacheKey is used to store blocks in the block cache. 195 | func (t *table) blockCacheKey(idx int) []byte { 196 | utils.CondPanic(t.fid >= math.MaxUint32, fmt.Errorf("t.fid >= math.MaxUint32")) 197 | utils.CondPanic(uint32(idx) >= math.MaxUint32, fmt.Errorf("uint32(idx) >= math.MaxUint32")) 198 | 199 | buf := make([]byte, 8) 200 | // Assume t.ID does not overflow uint32. 
201 | binary.BigEndian.PutUint32(buf[:4], uint32(t.fid)) 202 | binary.BigEndian.PutUint32(buf[4:], uint32(idx)) 203 | return buf 204 | } 205 | 206 | type tableIterator struct { 207 | it utils.Item 208 | opt *utils.Options 209 | t *table 210 | blockPos int 211 | bi *blockIterator 212 | err error 213 | } 214 | 215 | func (t *table) NewIterator(options *utils.Options) utils.Iterator { 216 | t.IncrRef() 217 | return &tableIterator{ 218 | opt: options, 219 | t: t, 220 | bi: &blockIterator{}, 221 | } 222 | } 223 | func (it *tableIterator) Next() { 224 | it.err = nil 225 | 226 | if it.blockPos >= len(it.t.ss.Indexs().GetOffsets()) { 227 | it.err = io.EOF 228 | return 229 | } 230 | 231 | if len(it.bi.data) == 0 { 232 | block, err := it.t.block(it.blockPos) 233 | if err != nil { 234 | it.err = err 235 | return 236 | } 237 | it.bi.tableID = it.t.fid 238 | it.bi.blockID = it.blockPos 239 | it.bi.setBlock(block) 240 | it.bi.seekToFirst() 241 | it.err = it.bi.Error() 242 | return 243 | } 244 | 245 | it.bi.Next() 246 | if !it.bi.Valid() { 247 | it.blockPos++ 248 | it.bi.data = nil 249 | it.Next() 250 | return 251 | } 252 | it.it = it.bi.it 253 | } 254 | func (it *tableIterator) Valid() bool { 255 | return it.err != io.EOF // 如果没有的时候 则是EOF 256 | } 257 | func (it *tableIterator) Rewind() { 258 | if it.opt.IsAsc { 259 | it.seekToFirst() 260 | } else { 261 | it.seekToLast() 262 | } 263 | } 264 | func (it *tableIterator) Item() utils.Item { 265 | return it.it 266 | } 267 | func (it *tableIterator) Close() error { 268 | it.bi.Close() 269 | return it.t.DecrRef() 270 | } 271 | func (it *tableIterator) seekToFirst() { 272 | numBlocks := len(it.t.ss.Indexs().Offsets) 273 | if numBlocks == 0 { 274 | it.err = io.EOF 275 | return 276 | } 277 | it.blockPos = 0 278 | block, err := it.t.block(it.blockPos) 279 | if err != nil { 280 | it.err = err 281 | return 282 | } 283 | it.bi.tableID = it.t.fid 284 | it.bi.blockID = it.blockPos 285 | it.bi.setBlock(block) 286 | it.bi.seekToFirst() 287 | it.it 
= it.bi.Item() 288 | it.err = it.bi.Error() 289 | } 290 | 291 | func (it *tableIterator) seekToLast() { 292 | numBlocks := len(it.t.ss.Indexs().Offsets) 293 | if numBlocks == 0 { 294 | it.err = io.EOF 295 | return 296 | } 297 | it.blockPos = numBlocks - 1 298 | block, err := it.t.block(it.blockPos) 299 | if err != nil { 300 | it.err = err 301 | return 302 | } 303 | it.bi.tableID = it.t.fid 304 | it.bi.blockID = it.blockPos 305 | it.bi.setBlock(block) 306 | it.bi.seekToLast() 307 | it.it = it.bi.Item() 308 | it.err = it.bi.Error() 309 | } 310 | 311 | // Seek 312 | // 二分法搜索 offsets 313 | // 如果idx == 0 说明key只能在第一个block中 block[0].MinKey <= key 314 | // 否则 block[0].MinKey > key 315 | // 如果在 idx-1 的block中未找到key 那才可能在 idx 中 316 | // 如果都没有,则当前key不再此table 317 | func (it *tableIterator) Seek(key []byte) { 318 | var ko pb.BlockOffset 319 | idx := sort.Search(len(it.t.ss.Indexs().GetOffsets()), func(idx int) bool { 320 | utils.CondPanic(!it.t.offsets(&ko, idx), fmt.Errorf("tableutils.Seek idx < 0 || idx > len(index.GetOffsets()")) 321 | if idx == len(it.t.ss.Indexs().GetOffsets()) { 322 | return true 323 | } 324 | return utils.CompareKeys(ko.GetKey(), key) > 0 325 | }) 326 | if idx == 0 { 327 | it.seekHelper(0, key) 328 | return 329 | } 330 | it.seekHelper(idx-1, key) 331 | } 332 | 333 | func (it *tableIterator) seekHelper(blockIdx int, key []byte) { 334 | it.blockPos = blockIdx 335 | block, err := it.t.block(blockIdx) 336 | if err != nil { 337 | it.err = err 338 | return 339 | } 340 | it.bi.tableID = it.t.fid 341 | it.bi.blockID = it.blockPos 342 | it.bi.setBlock(block) 343 | it.bi.seek(key) 344 | it.err = it.bi.Error() 345 | it.it = it.bi.Item() 346 | } 347 | 348 | func (t *table) offsets(ko *pb.BlockOffset, i int) bool { 349 | index := t.ss.Indexs() 350 | if i < 0 || i > len(index.GetOffsets()) { 351 | return false 352 | } 353 | if i == len(index.GetOffsets()) { 354 | return true 355 | } 356 | *ko = *index.GetOffsets()[i] 357 | return true 358 | } 359 | 360 | // Size is its 
file size in bytes 361 | func (t *table) Size() int64 { return int64(t.ss.Size()) } 362 | 363 | // GetCreatedAt 364 | func (t *table) GetCreatedAt() *time.Time { 365 | return t.ss.GetCreatedAt() 366 | } 367 | func (t *table) Delete() error { 368 | return t.ss.Detele() 369 | } 370 | 371 | // StaleDataSize is the amount of stale data (that can be dropped by a compaction )in this SST. 372 | func (t *table) StaleDataSize() uint32 { return t.ss.Indexs().StaleDataSize } 373 | 374 | // DecrRef decrements the refcount and possibly deletes the table 375 | func (t *table) DecrRef() error { 376 | newRef := atomic.AddInt32(&t.ref, -1) 377 | if newRef == 0 { 378 | // TODO 从缓存中删除 379 | for i := 0; i < len(t.ss.Indexs().GetOffsets()); i++ { 380 | t.lm.cache.blocks.Del(t.blockCacheKey(i)) 381 | } 382 | if err := t.Delete(); err != nil { 383 | return err 384 | } 385 | } 386 | return nil 387 | } 388 | 389 | func (t *table) IncrRef() { 390 | atomic.AddInt32(&t.ref, 1) 391 | } 392 | func decrRefs(tables []*table) error { 393 | for _, table := range tables { 394 | if err := table.DecrRef(); err != nil { 395 | return err 396 | } 397 | } 398 | return nil 399 | } 400 | -------------------------------------------------------------------------------- /lsm/iterator.go: -------------------------------------------------------------------------------- 1 | // Copyright 2021 logicrec Project Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License") 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package lsm 16 | 17 | import ( 18 | "bytes" 19 | "fmt" 20 | "sort" 21 | 22 | "github.com/hardcore-os/corekv/utils" 23 | ) 24 | 25 | type Iterator struct { 26 | it Item 27 | iters []utils.Iterator 28 | } 29 | type Item struct { 30 | e *utils.Entry 31 | } 32 | 33 | func (it *Item) Entry() *utils.Entry { 34 | return it.e 35 | } 36 | 37 | // 创建迭代器 38 | func (lsm *LSM) NewIterators(opt *utils.Options) []utils.Iterator { 39 | iter := &Iterator{} 40 | iter.iters = make([]utils.Iterator, 0) 41 | iter.iters = append(iter.iters, lsm.memTable.NewIterator(opt)) 42 | for _, imm := range lsm.immutables { 43 | iter.iters = append(iter.iters, imm.NewIterator(opt)) 44 | } 45 | iter.iters = append(iter.iters, lsm.levels.iterators()...) 46 | return iter.iters 47 | } 48 | func (iter *Iterator) Next() { 49 | iter.iters[0].Next() 50 | } 51 | func (iter *Iterator) Valid() bool { 52 | return iter.iters[0].Valid() 53 | } 54 | func (iter *Iterator) Rewind() { 55 | iter.iters[0].Rewind() 56 | } 57 | func (iter *Iterator) Item() utils.Item { 58 | return iter.iters[0].Item() 59 | } 60 | func (iter *Iterator) Close() error { 61 | return nil 62 | } 63 | 64 | func (iter *Iterator) Seek(key []byte) { 65 | } 66 | 67 | // 内存表迭代器 68 | type memIterator struct { 69 | innerIter utils.Iterator 70 | } 71 | 72 | func (m *memTable) NewIterator(opt *utils.Options) utils.Iterator { 73 | return &memIterator{innerIter: m.sl.NewSkipListIterator()} 74 | } 75 | func (iter *memIterator) Next() { 76 | iter.innerIter.Next() 77 | } 78 | func (iter *memIterator) Valid() bool { 79 | return iter.innerIter.Valid() 80 | } 81 | func (iter *memIterator) Rewind() { 82 | iter.innerIter.Rewind() 83 | } 84 | func (iter *memIterator) Item() utils.Item { 85 | return iter.innerIter.Item() 86 | } 87 | func (iter *memIterator) Close() error { 88 | return iter.innerIter.Close() 89 | } 90 | func (iter *memIterator) Seek(key []byte) { 91 | } 92 | 93 | // levelManager上的迭代器 94 | type levelIterator struct { 95 | it *utils.Item 96 
| iters []*Iterator 97 | } 98 | 99 | func (lm *levelManager) NewIterators(options *utils.Options) []utils.Iterator { 100 | return lm.iterators() 101 | } 102 | func (iter *levelIterator) Next() { 103 | } 104 | func (iter *levelIterator) Valid() bool { 105 | return false 106 | } 107 | func (iter *levelIterator) Rewind() { 108 | 109 | } 110 | func (iter *levelIterator) Item() utils.Item { 111 | return &Item{} 112 | } 113 | func (iter *levelIterator) Close() error { 114 | return nil 115 | } 116 | 117 | func (iter *levelIterator) Seek(key []byte) { 118 | } 119 | 120 | // ConcatIterator 将table 数组链接成一个迭代器,这样迭代效率更高 121 | type ConcatIterator struct { 122 | idx int // Which iterator is active now. 123 | cur utils.Iterator 124 | iters []utils.Iterator // Corresponds to tables. 125 | tables []*table // Disregarding reversed, this is in ascending order. 126 | options *utils.Options // Valid options are REVERSED and NOCACHE. 127 | } 128 | 129 | // NewConcatIterator creates a new concatenated iterator 130 | func NewConcatIterator(tbls []*table, opt *utils.Options) *ConcatIterator { 131 | iters := make([]utils.Iterator, len(tbls)) 132 | return &ConcatIterator{ 133 | options: opt, 134 | iters: iters, 135 | tables: tbls, 136 | idx: -1, // Not really necessary because s.it.Valid()=false, but good to have. 
137 | } 138 | } 139 | 140 | func (s *ConcatIterator) setIdx(idx int) { 141 | s.idx = idx 142 | if idx < 0 || idx >= len(s.iters) { 143 | s.cur = nil 144 | return 145 | } 146 | if s.iters[idx] == nil { 147 | s.iters[idx] = s.tables[idx].NewIterator(s.options) 148 | } 149 | s.cur = s.iters[s.idx] 150 | } 151 | 152 | // Rewind implements Interface 153 | func (s *ConcatIterator) Rewind() { 154 | if len(s.iters) == 0 { 155 | return 156 | } 157 | if !s.options.IsAsc { 158 | s.setIdx(0) 159 | } else { 160 | s.setIdx(len(s.iters) - 1) 161 | } 162 | s.cur.Rewind() 163 | } 164 | 165 | // Valid implements y.Interface 166 | func (s *ConcatIterator) Valid() bool { 167 | return s.cur != nil && s.cur.Valid() 168 | } 169 | 170 | // Item _ 171 | func (s *ConcatIterator) Item() utils.Item { 172 | return s.cur.Item() 173 | } 174 | 175 | // Seek brings us to element >= key if reversed is false. Otherwise, <= key. 176 | func (s *ConcatIterator) Seek(key []byte) { 177 | var idx int 178 | if s.options.IsAsc { 179 | idx = sort.Search(len(s.tables), func(i int) bool { 180 | return utils.CompareKeys(s.tables[i].ss.MaxKey(), key) >= 0 181 | }) 182 | } else { 183 | n := len(s.tables) 184 | idx = n - 1 - sort.Search(n, func(i int) bool { 185 | return utils.CompareKeys(s.tables[n-1-i].ss.MinKey(), key) <= 0 186 | }) 187 | } 188 | if idx >= len(s.tables) || idx < 0 { 189 | s.setIdx(-1) 190 | return 191 | } 192 | // For reversed=false, we know s.tables[i-1].Biggest() < key. Thus, the 193 | // previous table cannot possibly contain key. 194 | s.setIdx(idx) 195 | s.cur.Seek(key) 196 | } 197 | 198 | // Next advances our concat iterator. 199 | func (s *ConcatIterator) Next() { 200 | s.cur.Next() 201 | if s.cur.Valid() { 202 | // Nothing to do. Just stay with the current table. 203 | return 204 | } 205 | for { // In case there are empty tables. 206 | if !s.options.IsAsc { 207 | s.setIdx(s.idx + 1) 208 | } else { 209 | s.setIdx(s.idx - 1) 210 | } 211 | if s.cur == nil { 212 | // End of list. 
Valid will become false. 213 | return 214 | } 215 | s.cur.Rewind() 216 | if s.cur.Valid() { 217 | break 218 | } 219 | } 220 | } 221 | 222 | // Close implements y.Interface. 223 | func (s *ConcatIterator) Close() error { 224 | for _, it := range s.iters { 225 | if it == nil { 226 | continue 227 | } 228 | if err := it.Close(); err != nil { 229 | return fmt.Errorf("ConcatIterator:%+v", err) 230 | } 231 | } 232 | return nil 233 | } 234 | 235 | // MergeIterator 多路合并迭代器 236 | // NOTE: MergeIterator owns the array of iterators and is responsible for closing them. 237 | type MergeIterator struct { 238 | left node 239 | right node 240 | small *node 241 | 242 | curKey []byte 243 | reverse bool 244 | } 245 | 246 | type node struct { 247 | valid bool 248 | entry *utils.Entry 249 | iter utils.Iterator 250 | 251 | // The two iterators are type asserted from `y.Iterator`, used to inline more function calls. 252 | // Calling functions on concrete types is much faster (about 25-30%) than calling the 253 | // interface's function. 254 | merge *MergeIterator 255 | concat *ConcatIterator 256 | } 257 | 258 | func (n *node) setIterator(iter utils.Iterator) { 259 | n.iter = iter 260 | // It's okay if the type assertion below fails and n.merge/n.concat are set to nil. 261 | // We handle the nil values of merge and concat in all the methods. 
262 | n.merge, _ = iter.(*MergeIterator) 263 | n.concat, _ = iter.(*ConcatIterator) 264 | } 265 | 266 | func (n *node) setKey() { 267 | switch { 268 | case n.merge != nil: 269 | n.valid = n.merge.small.valid 270 | if n.valid { 271 | n.entry = n.merge.small.entry 272 | } 273 | case n.concat != nil: 274 | n.valid = n.concat.Valid() 275 | if n.valid { 276 | n.entry = n.concat.Item().Entry() 277 | } 278 | default: 279 | n.valid = n.iter.Valid() 280 | if n.valid { 281 | n.entry = n.iter.Item().Entry() 282 | } 283 | } 284 | } 285 | 286 | func (n *node) next() { 287 | switch { 288 | case n.merge != nil: 289 | n.merge.Next() 290 | case n.concat != nil: 291 | n.concat.Next() 292 | default: 293 | n.iter.Next() 294 | } 295 | n.setKey() 296 | } 297 | 298 | func (n *node) rewind() { 299 | n.iter.Rewind() 300 | n.setKey() 301 | } 302 | 303 | func (n *node) seek(key []byte) { 304 | n.iter.Seek(key) 305 | n.setKey() 306 | } 307 | 308 | func (mi *MergeIterator) fix() { 309 | if !mi.bigger().valid { 310 | return 311 | } 312 | if !mi.small.valid { 313 | mi.swapSmall() 314 | return 315 | } 316 | cmp := utils.CompareKeys(mi.small.entry.Key, mi.bigger().entry.Key) 317 | switch { 318 | case cmp == 0: // Both the keys are equal. 319 | // In case of same keys, move the right iterator ahead. 320 | mi.right.next() 321 | if &mi.right == mi.small { 322 | mi.swapSmall() 323 | } 324 | return 325 | case cmp < 0: // Small is less than bigger(). 326 | if mi.reverse { 327 | mi.swapSmall() 328 | } else { 329 | // we don't need to do anything. Small already points to the smallest. 330 | } 331 | return 332 | default: // bigger() is less than small. 333 | if mi.reverse { 334 | // Do nothing since we're iterating in reverse. Small currently points to 335 | // the bigger key and that's okay in reverse iteration. 
336 | } else { 337 | mi.swapSmall() 338 | } 339 | return 340 | } 341 | } 342 | 343 | func (mi *MergeIterator) bigger() *node { 344 | if mi.small == &mi.left { 345 | return &mi.right 346 | } 347 | return &mi.left 348 | } 349 | 350 | func (mi *MergeIterator) swapSmall() { 351 | if mi.small == &mi.left { 352 | mi.small = &mi.right 353 | return 354 | } 355 | if mi.small == &mi.right { 356 | mi.small = &mi.left 357 | return 358 | } 359 | } 360 | 361 | // Next returns the next element. If it is the same as the current key, ignore it. 362 | func (mi *MergeIterator) Next() { 363 | for mi.Valid() { 364 | if !bytes.Equal(mi.small.entry.Key, mi.curKey) { 365 | break 366 | } 367 | mi.small.next() 368 | mi.fix() 369 | } 370 | mi.setCurrent() 371 | } 372 | 373 | func (mi *MergeIterator) setCurrent() { 374 | utils.CondPanic(mi.small.entry == nil && mi.small.valid == true, fmt.Errorf("mi.small.entry is nil")) 375 | if mi.small.valid { 376 | mi.curKey = append(mi.curKey[:0], mi.small.entry.Key...) 377 | } 378 | } 379 | 380 | // Rewind seeks to first element (or last element for reverse iterator). 381 | func (mi *MergeIterator) Rewind() { 382 | mi.left.rewind() 383 | mi.right.rewind() 384 | mi.fix() 385 | mi.setCurrent() 386 | } 387 | 388 | // Seek brings us to element with key >= given key. 389 | func (mi *MergeIterator) Seek(key []byte) { 390 | mi.left.seek(key) 391 | mi.right.seek(key) 392 | mi.fix() 393 | mi.setCurrent() 394 | } 395 | 396 | // Valid returns whether the MergeIterator is at a valid element. 397 | func (mi *MergeIterator) Valid() bool { 398 | return mi.small.valid 399 | } 400 | 401 | // Key returns the key associated with the current iterator. 402 | func (mi *MergeIterator) Item() utils.Item { 403 | return mi.small.iter.Item() 404 | } 405 | 406 | // Close implements Iterator. 
407 | func (mi *MergeIterator) Close() error { 408 | err1 := mi.left.iter.Close() 409 | err2 := mi.right.iter.Close() 410 | if err1 != nil { 411 | return utils.WarpErr("MergeIterator", err1) 412 | } 413 | return utils.WarpErr("MergeIterator", err2) 414 | } 415 | 416 | // NewMergeIterator creates a merge iterator. 417 | func NewMergeIterator(iters []utils.Iterator, reverse bool) utils.Iterator { 418 | switch len(iters) { 419 | case 0: 420 | return &Iterator{} 421 | case 1: 422 | return iters[0] 423 | case 2: 424 | mi := &MergeIterator{ 425 | reverse: reverse, 426 | } 427 | mi.left.setIterator(iters[0]) 428 | mi.right.setIterator(iters[1]) 429 | // Assign left iterator randomly. This will be fixed when user calls rewind/seek. 430 | mi.small = &mi.left 431 | return mi 432 | } 433 | mid := len(iters) / 2 434 | return NewMergeIterator( 435 | []utils.Iterator{ 436 | NewMergeIterator(iters[:mid], reverse), 437 | NewMergeIterator(iters[mid:], reverse), 438 | }, reverse) 439 | } 440 | -------------------------------------------------------------------------------- /file/manifest.go: -------------------------------------------------------------------------------- 1 | // Copyright 2021 hardcore-os Project Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License") 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package file 16 | 17 | import ( 18 | "bufio" 19 | "bytes" 20 | "encoding/binary" 21 | "fmt" 22 | "hash/crc32" 23 | "io" 24 | "os" 25 | "path/filepath" 26 | "sync" 27 | 28 | "github.com/hardcore-os/corekv/pb" 29 | "github.com/hardcore-os/corekv/utils" 30 | "github.com/pkg/errors" 31 | ) 32 | 33 | // ManifestFile 维护sst文件元信息的文件 34 | // manifest 比较特殊,不能使用mmap,需要保证实时的写入 35 | type ManifestFile struct { 36 | opt *Options 37 | f *os.File 38 | lock sync.Mutex 39 | deletionsRewriteThreshold int 40 | manifest *Manifest 41 | } 42 | 43 | // Manifest corekv 元数据状态维护 44 | type Manifest struct { 45 | Levels []levelManifest 46 | Tables map[uint64]TableManifest 47 | Creations int 48 | Deletions int 49 | } 50 | 51 | // TableManifest 包含sst的基本信息 52 | type TableManifest struct { 53 | Level uint8 54 | Checksum []byte // 方便今后扩展 55 | } 56 | type levelManifest struct { 57 | Tables map[uint64]struct{} // Set of table id's 58 | } 59 | 60 | //TableMeta sst 的一些元信息 61 | type TableMeta struct { 62 | ID uint64 63 | Checksum []byte 64 | } 65 | 66 | // OpenManifestFile 打开manifest文件 67 | func OpenManifestFile(opt *Options) (*ManifestFile, error) { 68 | path := filepath.Join(opt.Dir, utils.ManifestFilename) 69 | mf := &ManifestFile{lock: sync.Mutex{}, opt: opt} 70 | f, err := os.OpenFile(path, os.O_RDWR, 0) 71 | // 如果打开失败 则尝试创建一个新的 manifest file 72 | if err != nil { 73 | if !os.IsNotExist(err) { 74 | return mf, err 75 | } 76 | m := createManifest() 77 | fp, netCreations, err := helpRewrite(opt.Dir, m) 78 | utils.CondPanic(netCreations == 0, errors.Wrap(err, utils.ErrReWriteFailure.Error())) 79 | if err != nil { 80 | return mf, err 81 | } 82 | mf.f = fp 83 | f = fp 84 | mf.manifest = m 85 | return mf, nil 86 | } 87 | 88 | // 如果打开 则对manifest文件重放 89 | manifest, truncOffset, err := ReplayManifestFile(f) 90 | if err != nil { 91 | _ = f.Close() 92 | return mf, err 93 | } 94 | // Truncate file so we don't have a half-written entry at the end. 
95 | if err := f.Truncate(truncOffset); err != nil { 96 | _ = f.Close() 97 | return mf, err 98 | } 99 | if _, err = f.Seek(0, io.SeekEnd); err != nil { 100 | _ = f.Close() 101 | return mf, err 102 | } 103 | mf.f = f 104 | mf.manifest = manifest 105 | return mf, nil 106 | } 107 | 108 | // ReplayManifestFile 对已经存在的manifest文件重新应用所有状态变更 109 | func ReplayManifestFile(fp *os.File) (ret *Manifest, truncOffset int64, err error) { 110 | r := &bufReader{reader: bufio.NewReader(fp)} 111 | var magicBuf [8]byte 112 | if _, err := io.ReadFull(r, magicBuf[:]); err != nil { 113 | return &Manifest{}, 0, utils.ErrBadMagic 114 | } 115 | if !bytes.Equal(magicBuf[0:4], utils.MagicText[:]) { 116 | return &Manifest{}, 0, utils.ErrBadMagic 117 | } 118 | version := binary.BigEndian.Uint32(magicBuf[4:8]) 119 | if version != uint32(utils.MagicVersion) { 120 | return &Manifest{}, 0, 121 | fmt.Errorf("manifest has unsupported version: %d (we support %d)", version, utils.MagicVersion) 122 | } 123 | 124 | build := createManifest() 125 | var offset int64 126 | for { 127 | offset = r.count 128 | var lenCrcBuf [8]byte 129 | _, err := io.ReadFull(r, lenCrcBuf[:]) 130 | if err != nil { 131 | if err == io.EOF || err == io.ErrUnexpectedEOF { 132 | break 133 | } 134 | return &Manifest{}, 0, err 135 | } 136 | length := binary.BigEndian.Uint32(lenCrcBuf[0:4]) 137 | var buf = make([]byte, length) 138 | if _, err := io.ReadFull(r, buf); err != nil { 139 | if err == io.EOF || err == io.ErrUnexpectedEOF { 140 | break 141 | } 142 | return &Manifest{}, 0, err 143 | } 144 | if crc32.Checksum(buf, utils.CastagnoliCrcTable) != binary.BigEndian.Uint32(lenCrcBuf[4:8]) { 145 | return &Manifest{}, 0, utils.ErrBadChecksum 146 | } 147 | 148 | var changeSet pb.ManifestChangeSet 149 | if err := changeSet.Unmarshal(buf); err != nil { 150 | return &Manifest{}, 0, err 151 | } 152 | 153 | if err := applyChangeSet(build, &changeSet); err != nil { 154 | return &Manifest{}, 0, err 155 | } 156 | } 157 | 158 | return build, 
offset, err 159 | } 160 | 161 | // This is not a "recoverable" error -- opening the KV store fails because the MANIFEST file is 162 | // just plain broken. 163 | func applyChangeSet(build *Manifest, changeSet *pb.ManifestChangeSet) error { 164 | for _, change := range changeSet.Changes { 165 | if err := applyManifestChange(build, change); err != nil { 166 | return err 167 | } 168 | } 169 | return nil 170 | } 171 | 172 | func applyManifestChange(build *Manifest, tc *pb.ManifestChange) error { 173 | switch tc.Op { 174 | case pb.ManifestChange_CREATE: 175 | if _, ok := build.Tables[tc.Id]; ok { 176 | return fmt.Errorf("MANIFEST invalid, table %d exists", tc.Id) 177 | } 178 | build.Tables[tc.Id] = TableManifest{ 179 | Level: uint8(tc.Level), 180 | Checksum: append([]byte{}, tc.Checksum...), 181 | } 182 | for len(build.Levels) <= int(tc.Level) { 183 | build.Levels = append(build.Levels, levelManifest{make(map[uint64]struct{})}) 184 | } 185 | build.Levels[tc.Level].Tables[tc.Id] = struct{}{} 186 | build.Creations++ 187 | case pb.ManifestChange_DELETE: 188 | tm, ok := build.Tables[tc.Id] 189 | if !ok { 190 | return fmt.Errorf("MANIFEST removes non-existing table %d", tc.Id) 191 | } 192 | delete(build.Levels[tm.Level].Tables, tc.Id) 193 | delete(build.Tables, tc.Id) 194 | build.Deletions++ 195 | default: 196 | return fmt.Errorf("MANIFEST file has invalid manifestChange op") 197 | } 198 | return nil 199 | } 200 | 201 | func createManifest() *Manifest { 202 | levels := make([]levelManifest, 0) 203 | return &Manifest{ 204 | Levels: levels, 205 | Tables: make(map[uint64]TableManifest), 206 | } 207 | } 208 | 209 | type bufReader struct { 210 | reader *bufio.Reader 211 | count int64 212 | } 213 | 214 | func (r *bufReader) Read(p []byte) (n int, err error) { 215 | n, err = r.reader.Read(p) 216 | r.count += int64(n) 217 | return 218 | } 219 | 220 | // asChanges returns a sequence of changes that could be used to recreate the Manifest in its 221 | // present state. 
222 | func (m *Manifest) asChanges() []*pb.ManifestChange { 223 | changes := make([]*pb.ManifestChange, 0, len(m.Tables)) 224 | for id, tm := range m.Tables { 225 | changes = append(changes, newCreateChange(id, int(tm.Level), tm.Checksum)) 226 | } 227 | return changes 228 | } 229 | func newCreateChange(id uint64, level int, checksum []byte) *pb.ManifestChange { 230 | return &pb.ManifestChange{ 231 | Id: id, 232 | Op: pb.ManifestChange_CREATE, 233 | Level: uint32(level), 234 | Checksum: checksum, 235 | } 236 | } 237 | 238 | // Must be called while appendLock is held. 239 | func (mf *ManifestFile) rewrite() error { 240 | // In Windows the files should be closed before doing a Rename. 241 | if err := mf.f.Close(); err != nil { 242 | return err 243 | } 244 | fp, nextCreations, err := helpRewrite(mf.opt.Dir, mf.manifest) 245 | if err != nil { 246 | return err 247 | } 248 | mf.manifest.Creations = nextCreations 249 | mf.manifest.Deletions = 0 250 | mf.f = fp 251 | return nil 252 | } 253 | 254 | func helpRewrite(dir string, m *Manifest) (*os.File, int, error) { 255 | rewritePath := filepath.Join(dir, utils.ManifestRewriteFilename) 256 | // We explicitly sync. 257 | fp, err := os.OpenFile(rewritePath, utils.DefaultFileFlag, utils.DefaultFileMode) 258 | if err != nil { 259 | return nil, 0, err 260 | } 261 | 262 | buf := make([]byte, 8) 263 | copy(buf[0:4], utils.MagicText[:]) 264 | binary.BigEndian.PutUint32(buf[4:8], uint32(utils.MagicVersion)) 265 | 266 | netCreations := len(m.Tables) 267 | changes := m.asChanges() 268 | set := pb.ManifestChangeSet{Changes: changes} 269 | 270 | changeBuf, err := set.Marshal() 271 | if err != nil { 272 | fp.Close() 273 | return nil, 0, err 274 | } 275 | var lenCrcBuf [8]byte 276 | binary.BigEndian.PutUint32(lenCrcBuf[0:4], uint32(len(changeBuf))) 277 | binary.BigEndian.PutUint32(lenCrcBuf[4:8], crc32.Checksum(changeBuf, utils.CastagnoliCrcTable)) 278 | buf = append(buf, lenCrcBuf[:]...) 279 | buf = append(buf, changeBuf...) 
280 | if _, err := fp.Write(buf); err != nil { 281 | fp.Close() 282 | return nil, 0, err 283 | } 284 | if err := fp.Sync(); err != nil { 285 | fp.Close() 286 | return nil, 0, err 287 | } 288 | 289 | // In Windows the files should be closed before doing a Rename. 290 | if err = fp.Close(); err != nil { 291 | return nil, 0, err 292 | } 293 | manifestPath := filepath.Join(dir, utils.ManifestFilename) 294 | if err := os.Rename(rewritePath, manifestPath); err != nil { 295 | return nil, 0, err 296 | } 297 | fp, err = os.OpenFile(manifestPath, utils.DefaultFileFlag, utils.DefaultFileMode) 298 | if err != nil { 299 | return nil, 0, err 300 | } 301 | if _, err := fp.Seek(0, io.SeekEnd); err != nil { 302 | fp.Close() 303 | return nil, 0, err 304 | } 305 | if err := utils.SyncDir(dir); err != nil { 306 | fp.Close() 307 | return nil, 0, err 308 | } 309 | 310 | return fp, netCreations, nil 311 | } 312 | 313 | // Close 关闭文件 314 | func (mf *ManifestFile) Close() error { 315 | if err := mf.f.Close(); err != nil { 316 | return err 317 | } 318 | return nil 319 | } 320 | 321 | // AddChanges 对外暴露的写比那更丰富 322 | func (mf *ManifestFile) AddChanges(changesParam []*pb.ManifestChange) error { 323 | return mf.addChanges(changesParam) 324 | } 325 | func (mf *ManifestFile) addChanges(changesParam []*pb.ManifestChange) error { 326 | changes := pb.ManifestChangeSet{Changes: changesParam} 327 | buf, err := changes.Marshal() 328 | if err != nil { 329 | return err 330 | } 331 | 332 | // TODO 锁粒度可以优化 333 | mf.lock.Lock() 334 | defer mf.lock.Unlock() 335 | if err := applyChangeSet(mf.manifest, &changes); err != nil { 336 | return err 337 | } 338 | // Rewrite manifest if it'd shrink by 1/10 and it's big enough to care 339 | if mf.manifest.Deletions > utils.ManifestDeletionsRewriteThreshold && 340 | mf.manifest.Deletions > utils.ManifestDeletionsRatio*(mf.manifest.Creations-mf.manifest.Deletions) { 341 | if err := mf.rewrite(); err != nil { 342 | return err 343 | } 344 | } else { 345 | var lenCrcBuf 
[8]byte 346 | binary.BigEndian.PutUint32(lenCrcBuf[0:4], uint32(len(buf))) 347 | binary.BigEndian.PutUint32(lenCrcBuf[4:8], crc32.Checksum(buf, utils.CastagnoliCrcTable)) 348 | buf = append(lenCrcBuf[:], buf...) 349 | if _, err := mf.f.Write(buf); err != nil { 350 | return err 351 | } 352 | } 353 | err = mf.f.Sync() 354 | return err 355 | } 356 | 357 | // AddTableMeta 存储level表到manifest的level中 358 | func (mf *ManifestFile) AddTableMeta(levelNum int, t *TableMeta) (err error) { 359 | mf.addChanges([]*pb.ManifestChange{ 360 | newCreateChange(t.ID, levelNum, t.Checksum), 361 | }) 362 | return err 363 | } 364 | 365 | // RevertToManifest checks that all necessary table files exist and removes all table files not 366 | // referenced by the manifest. idMap is a set of table file id's that were read from the directory 367 | // listing. 368 | func (mf *ManifestFile) RevertToManifest(idMap map[uint64]struct{}) error { 369 | // 1. Check all files in manifest exist. 370 | for id := range mf.manifest.Tables { 371 | if _, ok := idMap[id]; !ok { 372 | return fmt.Errorf("file does not exist for table %d", id) 373 | } 374 | } 375 | 376 | // 2. Delete files that shouldn't exist. 377 | for id := range idMap { 378 | if _, ok := mf.manifest.Tables[id]; !ok { 379 | utils.Err(fmt.Errorf("Table file %d not referenced in MANIFEST", id)) 380 | filename := utils.FileNameSSTable(mf.opt.Dir, id) 381 | if err := os.Remove(filename); err != nil { 382 | return errors.Wrapf(err, "While removing table %d", id) 383 | } 384 | } 385 | } 386 | return nil 387 | } 388 | 389 | // GetManifest manifest 390 | func (mf *ManifestFile) GetManifest() *Manifest { 391 | return mf.manifest 392 | } 393 | --------------------------------------------------------------------------------