// Repository layout. This listing merges every file of the repository into a
// single unit; each section below starts with a banner naming the file it came
// from, the package it declares, and the packages it imports.
//
//	├── README.md
//	├── disk_file_test.go
//	├── disk_file.go
//	├── binary_tree.go
//	├── binary_tree_test.go
//	├── lstm_test.go
//	└── lsmt.go
//
// -----------------------------------------------------------------------------
// /README.md
// -----------------------------------------------------------------------------
//
// # LSM-Tree
//
// A simplified Go implementation of a log-structured merge tree (LSM-tree).
// `Put` and `Get` are supported, and so is compaction. Data is stored only in
// memory; disk files are simulated by in-memory byte buffers. See
// https://eileen-code4fun.medium.com/log-structured-merge-tree-lsm-tree-implementations-a-demo-and-leveldb-d5e028257330
// for more elaborate documentation and a comparison with LevelDB.

// -----------------------------------------------------------------------------
// /disk_file_test.go (package lsmt; imports: reflect, testing)
// -----------------------------------------------------------------------------

func TestDiskFileConstruction(t *testing.T) {
	elems := []Element{
		{Key: "1", Value: "One"},
		{Key: "2", Value: "Two"},
		{Key: "3", Value: "Three"},
		{Key: "4", Value: "Four"},
		{Key: "5", Value: "Five"},
		{Key: "6", Value: "Six"},
		{Key: "7", Value: "Seven"},
	}
	d := NewDiskFile(elems)
	got := d.AllElements()
	if !reflect.DeepEqual(elems, got) {
		t.Errorf("all elements got %v; want %v", got, elems)
	}
	// Call it again to make sure it's idempotent.
	got = d.AllElements()
	if !reflect.DeepEqual(elems, got) {
		t.Errorf("all elements got %v; want %v", got, elems)
	}
}

func TestDiskFileSearch(t *testing.T) {
	elems := []Element{
		{Key: "1", Value: "One"},
		{Key: "2", Value: "Two"},
		{Key: "3", Value: "Three"},
		{Key: "4", Value: "Four"},
		{Key: "5", Value: "Five"},
		{Key: "6", Value: "Six"},
		{Key: "7", Value: "Seven"},
	}
	d := NewDiskFile(elems)
	for _, e := range elems {
		if got, err := d.Search(e.Key); err != nil || got.Key != e.Key {
			t.Errorf("search got key %s, %v; want %s, nil", got.Key, err, e.Key)
		}
	}
	if got, err := d.Search("0"); err == nil {
		t.Errorf("search 0 got key %s; want not found", got.Key)
	}
	if got, err := d.Search("8"); err == nil {
		t.Errorf("search 8 got key %s; want not found", got.Key)
	}
	if got, err := d.Search("3.5"); err == nil {
		t.Errorf("search 3.5 got key %s; want not found", got.Key)
	}
}

// -----------------------------------------------------------------------------
// /disk_file.go (package lsmt; imports: bytes, encoding/gob, fmt, log, io, strconv)
// -----------------------------------------------------------------------------

const (
	// maxFileLen is declared here but not referenced by any code in this listing.
	maxFileLen = 1024
	// indexSparseRatio is the number of consecutive elements that share one
	// sparse-index entry: NewDiskFile indexes every indexSparseRatio-th element.
	indexSparseRatio = 3
)

// DiskFile simulates an immutable, sorted on-disk file. Elements are
// gob-encoded back to back into buf, and index maps the key of every
// indexSparseRatio-th element to the byte offset in buf where that element's
// segment begins.
type DiskFile struct {
	// index is the sparse index. Each entry's Value is a decimal byte offset.
	index *TreeNode
	// data is not referenced by any code in this listing.
	data io.ReadSeeker
	// size is the number of elements stored in the file.
	size int
	// buf holds the encoded elements. It is only written by NewDiskFile and
	// only read (through Bytes and Len) afterwards.
	buf bytes.Buffer
}

// Empty reports whether the file holds no elements.
func (d DiskFile) Empty() bool {
	return d.size == 0
}

// NewDiskFile builds a disk file from elems, which must be sorted by key
// because the sparse index is built with NewTree.
//
// A fresh gob encoder is started at every indexed element, so each segment
// carries its own type description and can be decoded independently, starting
// from the offset recorded in the index.
func NewDiskFile(elems []Element) DiskFile {
	d := DiskFile{size: len(elems)}
	indexElems := make([]Element, 0, (len(elems)+indexSparseRatio-1)/indexSparseRatio)
	var enc *gob.Encoder
	for i, e := range elems {
		if i%indexSparseRatio == 0 {
			// Create sparse index.
			idx := Element{Key: e.Key, Value: strconv.Itoa(d.buf.Len())}
			log.Printf("created sparse index element %v", idx)
			indexElems = append(indexElems, idx)
			enc = gob.NewEncoder(&d.buf)
		}
		if err := enc.Encode(e); err != nil {
			// The constructor has no error return; surface the problem in the
			// log instead of dropping it silently.
			log.Printf("encoding element %v: %v", e, err)
		}
	}
	d.index = NewTree(indexElems)
	return d
}

// Search looks key up in the file. It uses the sparse index to find the one
// segment that could contain key and decodes only that segment. A non-nil
// error means the key is not present.
func (d DiskFile) Search(key string) (Element, error) {
	canErr := fmt.Errorf("key %s not found in disk file", key)
	if d.Empty() {
		return Element{}, canErr
	}
	start, err := JustSmallerOrEqual(d.index, key)
	if err != nil {
		// Key smaller than all.
		return Element{}, canErr
	}
	// Index values are written by NewDiskFile with strconv.Itoa, so the
	// conversions below cannot fail.
	si, _ := strconv.Atoi(start.Value)
	ei := d.buf.Len()
	if end, err := JustLarger(d.index, key); err == nil {
		// Otherwise the key is larger than all index keys or equal to the last
		// one, and the segment runs to the end of the buffer.
		ei, _ = strconv.Atoi(end.Value)
	}
	log.Printf("searching in range [%d,%d)", si, ei)
	dec := gob.NewDecoder(bytes.NewBuffer(d.buf.Bytes()[si:ei]))
	for {
		// Decode into a fresh value each time: gob omits zero-valued fields,
		// so a reused variable would keep stale data.
		var e Element
		if err := dec.Decode(&e); err != nil {
			if err != io.EOF {
				log.Printf("got err: %v", err)
			}
			break
		}
		if e.Key == key {
			return e, nil
		}
	}
	return Element{}, canErr
}

// AllElements decodes and returns every element in the file, in key order.
func (d DiskFile) AllElements() []Element {
	indexElems := Traverse(d.index)
	var elems []Element
	for i, idx := range indexElems {
		// Index values are written by NewDiskFile with strconv.Itoa, so the
		// conversions below cannot fail.
		start, _ := strconv.Atoi(idx.Value)
		end := d.buf.Len()
		if i < len(indexElems)-1 {
			end, _ = strconv.Atoi(indexElems[i+1].Value)
		}
		dec := gob.NewDecoder(bytes.NewBuffer(d.buf.Bytes()[start:end]))
		for {
			// A fresh Element per decode is required. gob does not transmit
			// zero-valued fields, so decoding into a reused variable would
			// let an empty Value inherit the previous element's value.
			var e Element
			if err := dec.Decode(&e); err != nil {
				if err != io.EOF {
					log.Printf("got err: %v", err)
				}
				break
			}
			elems = append(elems, e)
		}
	}
	return elems
}

// -----------------------------------------------------------------------------
// /binary_tree.go (package lsmt; imports: fmt)
// -----------------------------------------------------------------------------

// TreeNode is a node of a binary search tree ordered by Elem.Key.
type TreeNode struct {
	Elem  Element
	Left  *TreeNode
	Right *TreeNode
	// Size is the number of nodes in the subtree rooted at this node,
	// including the node itself.
	Size int
}

// NewTree accepts a sorted element slice and returns a binary tree
// representation. The middle element becomes the root and the two halves are
// built recursively. An empty slice yields nil.
func NewTree(elems []Element) *TreeNode {
	size := len(elems)
	if size == 0 {
		return nil
	}
	mid := size / 2
	return &TreeNode{
		Elem:  elems[mid],
		Left:  NewTree(elems[:mid]),
		Right: NewTree(elems[mid+1:]), // empty slice gives a nil right child
		Size:  size,
	}
}

// sizeOf returns the node count of the subtree rooted at n; nil counts as 0.
func sizeOf(n *TreeNode) int {
	if n == nil {
		return 0
	}
	return n.Size
}

// Upsert inserts elem into the tree, or overwrites the value if elem.Key is
// already present. *tree is replaced when the tree was empty.
//
// Size is recomputed from the children after each recursive step, so updating
// an existing key leaves every Size unchanged and only real insertions grow it.
func Upsert(tree **TreeNode, elem Element) {
	node := *tree
	if node == nil {
		*tree = &TreeNode{Elem: elem, Size: 1}
		return
	}
	switch {
	case elem.Key < node.Elem.Key:
		Upsert(&node.Left, elem)
	case elem.Key > node.Elem.Key:
		Upsert(&node.Right, elem)
	default:
		node.Elem.Value = elem.Value
		return
	}
	node.Size = 1 + sizeOf(node.Left) + sizeOf(node.Right)
}

// Find returns the element stored under key, or an error if the key is not in
// the tree.
func Find(tree *TreeNode, key string) (Element, error) {
	for n := tree; n != nil; {
		switch {
		case key == n.Elem.Key:
			return n.Elem, nil
		case key < n.Elem.Key:
			n = n.Left
		default:
			n = n.Right
		}
	}
	// Not found.
	return Element{}, fmt.Errorf("key %s not found", key)
}

// appendInOrder appends the elements of the subtree rooted at n to out in key
// order and returns the extended slice.
func appendInOrder(n *TreeNode, out []Element) []Element {
	if n == nil {
		return out
	}
	out = appendInOrder(n.Left, out)
	out = append(out, n.Elem)
	return appendInOrder(n.Right, out)
}

// Traverse returns all the elements in key order. A nil tree yields a nil
// slice.
func Traverse(tree *TreeNode) []Element {
	return appendInOrder(tree, nil)
}

// JustSmallerOrEqual returns the element with the largest key that is less
// than or equal to key. It returns an error when every key in the tree is
// larger than key, or the tree is empty.
func JustSmallerOrEqual(tree *TreeNode, key string) (Element, error) {
	var best *TreeNode
	for n := tree; n != nil; {
		if n.Elem.Key <= key {
			// n qualifies; anything better can only be to its right.
			best = n
			n = n.Right
		} else {
			n = n.Left
		}
	}
	if best == nil {
		return Element{}, fmt.Errorf("key %s is smaller than any key in the tree", key)
	}
	return best.Elem, nil
}

// JustLarger returns the element with the smallest key that is strictly
// greater than key. It returns an error when no key in the tree is larger
// than key, or the tree is empty.
func JustLarger(tree *TreeNode, key string) (Element, error) {
	var best *TreeNode
	for n := tree; n != nil; {
		if n.Elem.Key > key {
			// n qualifies; anything better can only be to its left.
			best = n
			n = n.Left
		} else {
			n = n.Right
		}
	}
	if best == nil {
		return Element{}, fmt.Errorf("key %s is larger than any key in the tree", key)
	}
	return best.Elem, nil
}

// -----------------------------------------------------------------------------
// /binary_tree_test.go (package lsmt; imports: reflect, testing)
// -----------------------------------------------------------------------------

func TestBinaryTree(t *testing.T) {
	elems := []Element{
		{Key: "1", Value: "One"},
		{Key: "2", Value: "Two"},
		{Key: "3", Value: "Three"},
		{Key: "4", Value: "Four"},
		{Key: "5", Value: "Five"},
		{Key: "6", Value: "Six"},
		{Key: "7", Value: "Seven"},
	}
	tree := NewTree(elems)
	if tree.Size != len(elems) {
		t.Errorf("got tree size %d; want %d", tree.Size, len(elems))
	}
	e, err := Find(tree, "5")
	if err != nil {
		t.Errorf("find 5 got error: %v", err)
	}
	if e.Key != "5" || e.Value != "Five" {
		t.Errorf("got key %s value %s; want 5 Five", e.Key, e.Value)
	}
	e, err = Find(tree, "2")
	if err != nil {
		t.Errorf("find 2 got error: %v", err)
	}
	if e.Key != "2" || e.Value != "Two" {
		t.Errorf("got key %s value %s; want 2 Two", e.Key, e.Value)
	}
	if e, err := Find(tree, "1.5"); err == nil {
		t.Errorf("got elem %v; want not found", e)
	}
	newElem := Element{Key: "6.5", Value: "Six Point Five"}
	Upsert(&tree, newElem)
	got := Traverse(tree)
	var expected []Element
	expected = append(expected, elems[0:6]...)
	expected = append(expected, newElem)
	expected = append(expected, elems[6:]...)
	if tree.Size != len(expected) {
		t.Errorf("got tree size %d; want %d", tree.Size, len(expected))
	}
	if !reflect.DeepEqual(expected, got) {
		t.Errorf("traverse got %v; want %v", got, expected)
	}
	// Update a key.
	Upsert(&tree, Element{Key: "1", Value: "ONE"})
	if e, err := Find(tree, "1"); err != nil || e.Value != "ONE" {
		t.Errorf("got key 1 value %s error %v; want ONE", e.Value, err)
	}
}

func TestTreeKeyComparison(t *testing.T) {
	elems := []Element{
		{Key: "1", Value: "One"},
		{Key: "2", Value: "Two"},
		{Key: "3", Value: "Three"},
		{Key: "4", Value: "Four"},
		{Key: "5", Value: "Five"},
		{Key: "6", Value: "Six"},
		{Key: "7", Value: "Seven"},
	}
	tree := NewTree(elems)
	if e, err := JustSmallerOrEqual(tree, "1"); err != nil || e.Key != "1" {
		t.Errorf("got %v, %v; want key 1, nil", e, err)
	}
	if e, err := JustSmallerOrEqual(tree, "0"); err == nil {
		t.Errorf("got %v, %v; want not found", e, err)
	}
	if e, err := JustSmallerOrEqual(tree, "2.5"); err != nil || e.Key != "2" {
		t.Errorf("got %v, %v; want key 2, nil", e, err)
	}
	if e, err := JustSmallerOrEqual(tree, "8"); err != nil || e.Key != "7" {
		t.Errorf("got %v, %v; want key 7, nil", e, err)
	}
	if e, err := JustLarger(tree, "6"); err != nil || e.Key != "7" {
		t.Errorf("got %v, %v; want key 7, nil", e, err)
	}
	if e, err := JustLarger(tree, "7"); err == nil {
		t.Errorf("got %v, %v; want not found", e, err)
	}
	if e, err := JustLarger(tree, "0"); err != nil || e.Key != "1" {
		t.Errorf("got %v, %v; want key 1, nil", e, err)
	}
}

// -----------------------------------------------------------------------------
// /lstm_test.go (package lsmt; imports: fmt, reflect, sync, testing, time)
// -----------------------------------------------------------------------------

func TestInMemoryOnly(t *testing.T) {
	var wg sync.WaitGroup
	var expected []Element
	total := 10
	tree := NewLSMTree(total + 1 /* flush threshold larger than total */)
	for i := 0; i < total; i++ {
		e := Element{Key: fmt.Sprintf("%d", i), Value: fmt.Sprintf("%d", i)}
		expected = append(expected, e)
		wg.Add(1)
		go func() {
			tree.Put(e.Key, e.Value)
			wg.Done()
		}()
	}
	wg.Wait()
	if tree.tree.Size != total {
		t.Errorf("got tree size %d; want %d", tree.tree.Size, total)
	}
	for i := 0; i < total; i++ {
		wg.Add(1)
		e := fmt.Sprintf("%d", i)
		go func() {
			v, err := tree.Get(e)
			if err != nil {
				t.Errorf("key %s not found", e)
			}
			if v != e {
				t.Errorf("got %s for key %s; want %s", v, e, e)
			}
			wg.Done()
		}()
	}
	wg.Wait()
	got := Traverse(tree.tree)
	if !reflect.DeepEqual(expected, got) {
		t.Errorf("got result %v; want %v", got, expected)
	}
}

func TestFlushedToDisk(t *testing.T) {
	t.Parallel()
	tree := NewLSMTree(2)
	tree.Put("1", "One")
	tree.Put("2", "Two")
	// Wait for flush.
	time.Sleep(1 * time.Second)
	if tree.tree != nil {
		t.Errorf("got tree size %d; want empty", tree.tree.Size)
	}
	if len(tree.diskFiles) != 1 {
		t.Errorf("got disk file size %d; want 1", len(tree.diskFiles))
	}
	if _, err := tree.Get("1"); err != nil {
		t.Error("key 1 not found")
	}
	if _, err := tree.Get("2"); err != nil {
		t.Error("key 2 not found")
	}
	tree.Put("3", "Three")
	if _, err := tree.Get("3"); err != nil {
		t.Error("key 3 not found")
	}
	tree.Put("4", "Four")
	// Wait for flush and compaction.
	time.Sleep(3 * time.Second)
	if len(tree.diskFiles) != 1 {
		t.Errorf("got disk file size %d; want 1", len(tree.diskFiles))
	}
	if len(tree.diskFiles) == 1 {
		got := tree.diskFiles[0].AllElements()
		want := []Element{{Key: "1", Value: "One"}, {Key: "2", Value: "Two"}, {Key: "3", Value: "Three"}, {Key: "4", Value: "Four"}}
		if !reflect.DeepEqual(want, got) {
			t.Errorf("got result %v; want %v", got, want)
		}
	}
}

func TestCompactionCollapse(t *testing.T) {
	t.Parallel()
	tree := NewLSMTree(1)
	tree.Put("1", "One")
	time.Sleep(time.Second)
	tree.Put("1", "ONE")
	// Wait for flush and compaction.
	time.Sleep(3 * time.Second)
	if len(tree.diskFiles) != 1 {
		t.Errorf("got disk file size %d; want 1", len(tree.diskFiles))
	}
	if len(tree.diskFiles) == 1 {
		got := tree.diskFiles[0].AllElements()
		want := []Element{{Key: "1", Value: "ONE"}}
		if !reflect.DeepEqual(want, got) {
			t.Errorf("got result %v; want %v", got, want)
		}
	}
}

// -----------------------------------------------------------------------------
// /lsmt.go (package lsmt; imports: fmt, log, sync, time)
// -----------------------------------------------------------------------------

// Element is a single key/value pair stored in the tree.
type Element struct {
	Key, Value string
}

// LSMTree is a log-structured merge tree. Writes go to an in-memory binary
// tree. Once that tree holds at least flushThreshold elements it is handed to
// a background flush that turns it into a DiskFile, and a background service
// periodically merges the two oldest disk files.
type LSMTree struct {
	// Read write lock to control access to the in-memory tree.
	rwm sync.RWMutex
	// tree receives new writes.
	tree *TreeNode
	// treeInFlush is the tree currently being written out by flush. It is nil
	// when no flush is in progress.
	treeInFlush    *TreeNode
	flushThreshold int
	// Read write lock to control access to the disk files.
	drwm sync.RWMutex
	// diskFiles is ordered newest first: flush prepends, and compactService
	// replaces the last two entries with their merge.
	diskFiles []DiskFile
}

// NewLSMTree returns an empty tree that flushes its in-memory part once it
// holds flushThreshold elements. It also starts the compaction goroutine,
// which has no stop mechanism and keeps running after the tree is abandoned.
func NewLSMTree(flushThreshold int) *LSMTree {
	t := &LSMTree{flushThreshold: flushThreshold}
	go t.compactService()
	return t
}

// Put stores value under key in the in-memory tree. It triggers an
// asynchronous flush when the tree has reached the threshold and no other
// flush is running.
func (t *LSMTree) Put(key, value string) {
	t.rwm.Lock()
	defer t.rwm.Unlock()
	Upsert(&(t.tree), Element{Key: key, Value: value})
	if t.tree.Size >= t.flushThreshold && t.treeInFlush == nil {
		// Trigger flush.
		log.Printf("triggering flush %v", Traverse(t.tree))
		t.treeInFlush = t.tree
		t.tree = nil
		go t.flush()
	}
}

// getFromMemory looks key up in the live tree first and then in the tree
// being flushed, holding the in-memory read lock throughout.
func (t *LSMTree) getFromMemory(key string) (string, bool) {
	t.rwm.RLock()
	defer t.rwm.RUnlock()
	if e, err := Find(t.tree, key); err == nil {
		return e.Value, true
	}
	if e, err := Find(t.treeInFlush, key); err == nil {
		return e.Value, true
	}
	return "", false
}

// Get returns the value stored under key. Sources are consulted from newest
// to oldest: the live tree, the tree in flush, then the disk files in slice
// order. It returns an error when the key is found nowhere.
func (t *LSMTree) Get(key string) (string, error) {
	if v, ok := t.getFromMemory(key); ok {
		return v, nil
	}
	// The key is not in memory. Search in disk files.
	t.drwm.RLock()
	defer t.drwm.RUnlock()
	for _, d := range t.diskFiles {
		if e, err := d.Search(key); err == nil {
			// Found in disk
			return e.Value, nil
		}
	}
	return "", fmt.Errorf("key %s not found", key)
}

// flush writes treeInFlush to a new disk file and then clears treeInFlush.
// The file is published before the tree is cleared, so a concurrent Get can
// always find the data in one of the two places.
func (t *LSMTree) flush() {
	// Create a new disk file.
	d := []DiskFile{NewDiskFile(Traverse(t.treeInFlush))}
	// Put the disk file in the list.
	t.drwm.Lock()
	t.diskFiles = append(d, t.diskFiles...)
	t.drwm.Unlock()
	// Remove the tree in flush.
	t.rwm.Lock()
	t.treeInFlush = nil
	t.rwm.Unlock()
}

// compactService loops forever. Once a second it merges the two oldest disk
// files (the last two in the list) into one. The merge itself runs without
// the disk lock; this is safe because flush only prepends to the list, so the
// last two entries are still the same files when the lock is taken again.
func (t *LSMTree) compactService() {
	for {
		time.Sleep(time.Second)
		var d1, d2 DiskFile
		t.drwm.RLock()
		if len(t.diskFiles) >= 2 {
			d1 = t.diskFiles[len(t.diskFiles)-1] // oldest
			d2 = t.diskFiles[len(t.diskFiles)-2] // second oldest
		}
		t.drwm.RUnlock()
		if d1.Empty() || d2.Empty() {
			continue
		}
		// Create a new compacted disk file.
		d := compact(d1, d2)
		// Replace the two old files.
		t.drwm.Lock()
		t.diskFiles = t.diskFiles[0 : len(t.diskFiles)-2]
		t.diskFiles = append(t.diskFiles, d)
		t.drwm.Unlock()
	}
}

// compact merges two disk files into a new one that is sorted by key. When
// both files contain the same key, the value from d2 wins, because d1 is
// assumed to be older than d2.
func compact(d1, d2 DiskFile) DiskFile {
	elems1 := d1.AllElements()
	elems2 := d2.AllElements()
	log.Printf("compacting d1: %v; d2: %v", elems1, elems2)
	newElems := make([]Element, 0, len(elems1)+len(elems2))
	var i1, i2 int
	// Each cursor is bounded by its own slice. Bounding both by the shorter
	// length would end the merge early and leave the output unsorted.
	for i1 < len(elems1) && i2 < len(elems2) {
		e1, e2 := elems1[i1], elems2[i2]
		switch {
		case e1.Key < e2.Key:
			newElems = append(newElems, e1)
			i1++
		case e1.Key > e2.Key:
			newElems = append(newElems, e2)
			i2++
		default:
			// d1 is assumed to be older than d2.
			newElems = append(newElems, e2)
			i1++
			i2++
		}
	}
	// At most one of the two tails is non-empty.
	newElems = append(newElems, elems1[i1:]...)
	newElems = append(newElems, elems2[i2:]...)
	return NewDiskFile(newElems)
}

// min returns the smaller of i and j. compact no longer calls it; it is kept
// so that any other code in the package that uses it still compiles.
func min(i, j int) int {
	if i < j {
		return i
	}
	return j
}