├── .github └── workflows │ └── test.yml ├── .gitignore ├── Dockerfile ├── LICENSE ├── README.md ├── go.mod ├── go.sum ├── mkv ├── requirements.txt ├── src ├── lib.go ├── lib_test.go ├── main.go ├── rebalance.go ├── rebuild.go ├── s3api.go └── server.go ├── tools ├── bringup.sh ├── kill.sh ├── leveldb_compare.go ├── rtest.sh ├── s3test.py ├── test.py └── thrasher.go └── volume /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | unit: 7 | name: Tests 8 | runs-on: ubuntu-latest 9 | steps: 10 | - name: Checkout Code 11 | uses: actions/checkout@v2 12 | - name: Install Go/nginx 13 | run: | 14 | sudo apt-get update 15 | sudo apt-get -y --no-install-recommends install golang nginx build-essential 16 | - name: Set log permissions 17 | run: | 18 | sudo mkdir -p /var/log/nginx 19 | sudo chmod 777 /var/log/nginx /var/log/nginx/* 20 | - name: Unit Tests 21 | run: go test -v src/lib_test.go src/lib.go 22 | - name: Install Python Requirements 23 | run: pip3 install --no-cache-dir -r requirements.txt 24 | - name: Thrasher Test 25 | run: | 26 | (./tools/bringup.sh &) 27 | ./tools/test.py 28 | ./tools/s3test.py 29 | go run tools/thrasher.go 30 | ./tools/rtest.sh 31 | 32 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | .*.swp 3 | src/mkv 4 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:20.04 2 | 3 | ENV DEBIAN_FRONTEND noninteractive 4 | 5 | # system basics 6 | RUN apt-get update && \ 7 | apt-get -y --no-install-recommends install \ 8 | build-essential \ 9 | curl \ 10 | python3 \ 11 | python3-dev \ 12 | python3-setuptools \ 13 | python3-pip \ 14 | libffi-dev \ 15 | nginx 
\ 16 | golang \ 17 | git && \ 18 | apt-get clean && \ 19 | rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* 20 | 21 | WORKDIR / 22 | ENV GOPATH /go 23 | ENV PATH ${PATH}:/mkv 24 | 25 | COPY requirements.txt mkv/requirements.txt 26 | RUN pip3 install --no-cache-dir -r mkv/requirements.txt 27 | 28 | COPY mkv volume mkv/ 29 | COPY src/*.go mkv/src/ 30 | COPY tools/* mkv/tools/ 31 | WORKDIR /mkv 32 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2019 George Hotz 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 8 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # minikeyvalue 2 | 3 | ![Tests](https://github.com/geohot/minikeyvalue/workflows/Tests/badge.svg) 4 | 5 | Fed up with the complexity of distributed filesystems? 
6 | 7 | minikeyvalue is a ~1000 line distributed key value store, with support for replication, multiple machines, and multiple drives per machine. Optimized for values between 1MB and 1GB. Inspired by SeaweedFS, but simple. Should scale to billions of files and petabytes of data. Used in production at [comma.ai](https://comma.ai/). 8 | 9 | A key part of minikeyvalue's simplicity is using stock nginx as the volume server. 10 | 11 | Even if this code is crap, the on disk format is super simple! We rely on a filesystem for blob storage and a LevelDB for indexing. The index can be reconstructed with rebuild. Volumes can be added or removed with rebalance. 12 | 13 | ### API 14 | 15 | - GET /key 16 | - 302 redirect to nginx volume server. 17 | - PUT /key 18 | - Blocks. 201 = written, anything else = probably not written. 19 | - DELETE /key 20 | - Blocks. 204 = deleted, anything else = probably not deleted. 21 | 22 | It also now supports a subset of S3 requests, so some S3 libraries will be somewhat compatible. 
23 | 24 | ### Start Volume Servers (default port 3001) 25 | 26 | ``` 27 | # this is just nginx under the hood 28 | PORT=3001 ./volume /tmp/volume1/ &; 29 | PORT=3002 ./volume /tmp/volume2/ &; 30 | PORT=3003 ./volume /tmp/volume3/ &; 31 | ``` 32 | 33 | ### Start Master Server (default port 3000) 34 | 35 | ``` 36 | ./mkv -volumes localhost:3001,localhost:3002,localhost:3003 -db /tmp/indexdb/ server 37 | ``` 38 | 39 | 40 | ### Usage 41 | 42 | ``` 43 | # put "bigswag" in key "wehave" (will 403 if it already exists) 44 | curl -v -L -X PUT -d bigswag localhost:3000/wehave 45 | 46 | # get key "wehave" (should be "bigswag") 47 | curl -v -L localhost:3000/wehave 48 | 49 | # delete key "wehave" 50 | curl -v -L -X DELETE localhost:3000/wehave 51 | 52 | # unlink key "wehave", this is a virtual delete 53 | curl -v -L -X UNLINK localhost:3000/wehave 54 | 55 | # list keys starting with "we" 56 | curl -v -L localhost:3000/we?list 57 | 58 | # list unlinked keys ripe for DELETE 59 | curl -v -L localhost:3000/?unlinked 60 | 61 | # put file in key "file.txt" 62 | curl -v -L -X PUT -T /path/to/local/file.txt localhost:3000/file.txt 63 | 64 | # get file in key "file.txt" 65 | curl -v -L -o /path/to/local/file.txt localhost:3000/file.txt 66 | ``` 67 | 68 | ### ./mkv Usage 69 | 70 | ``` 71 | Usage: ./mkv 72 | 73 | -db string 74 | Path to leveldb 75 | -fallback string 76 | Fallback server for missing keys 77 | -port int 78 | Port for the server to listen on (default 3000) 79 | -protect 80 | Force UNLINK before DELETE 81 | -replicas int 82 | Amount of replicas to make of the data (default 3) 83 | -subvolumes int 84 | Amount of subvolumes, disks per machine (default 10) 85 | -volumes string 86 | Volumes to use for storage, comma separated 87 | ``` 88 | 89 | ### Rebalancing (to change the amount of volume servers) 90 | 91 | ``` 92 | # must shut down master first, since LevelDB can only be accessed by one process 93 | ./mkv -volumes localhost:3001,localhost:3002,localhost:3003 -db 
/tmp/indexdb/ rebalance 94 | ``` 95 | 96 | ### Rebuilding (to regenerate the LevelDB) 97 | 98 | ``` 99 | ./mkv -volumes localhost:3001,localhost:3002,localhost:3003 -db /tmp/indexdbalt/ rebuild 100 | ``` 101 | 102 | ### Performance 103 | 104 | ``` 105 | # Fetching non-existent key: 116338 req/sec 106 | wrk -t2 -c100 -d10s http://localhost:3000/key 107 | 108 | # go run thrasher.go 109 | starting thrasher 110 | 10000 write/read/delete in 2.620922675s 111 | thats 3815.40/sec 112 | ``` 113 | 114 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/geohot/minikeyvalue 2 | 3 | go 1.17 4 | 5 | require github.com/syndtr/goleveldb v1.0.0 6 | 7 | require ( 8 | github.com/golang/snappy v0.0.0-20180518054509-2e65f85255db // indirect 9 | github.com/google/uuid v1.3.0 // indirect 10 | ) 11 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMoQvtojpjFo= 2 | github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= 3 | github.com/golang/snappy v0.0.0-20180518054509-2e65f85255db h1:woRePGFeVFfLKN/pOkfl+p/TAqKOfFu+7KPlMVpok/w= 4 | github.com/golang/snappy v0.0.0-20180518054509-2e65f85255db/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= 5 | github.com/google/uuid v1.3.0 h1:t6JiXgmwXMjEs8VusXIJk2BXHsn+wx8BZdTaoZ5fu7I= 6 | github.com/google/uuid v1.3.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= 7 | github.com/hpcloud/tail v1.0.0 h1:nfCOvKYfkgYP8hkirhJocXT2+zOD8yUNjXaWfTlyFKI= 8 | github.com/hpcloud/tail v1.0.0/go.mod h1:ab1qPbhIpdTxEkNHXyeSf5vhxWSCs/tWer42PpOxQnU= 9 | github.com/onsi/ginkgo v1.6.0/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE= 10 | github.com/onsi/ginkgo v1.7.0 
h1:WSHQ+IS43OoUrWtD1/bbclrwK8TTH5hzp+umCiuxHgs= 11 | github.com/onsi/ginkgo v1.7.0/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE= 12 | github.com/onsi/gomega v1.4.3 h1:RE1xgDvH7imwFD45h+u2SgIfERHlS2yNG4DObb5BSKU= 13 | github.com/onsi/gomega v1.4.3/go.mod h1:ex+gbHU/CVuBBDIJjb2X0qEXbFg53c61hWP/1CpauHY= 14 | github.com/syndtr/goleveldb v1.0.0 h1:fBdIW9lB4Iz0n9khmH8w27SJ3QEJ7+IgjPEwGSZiFdE= 15 | github.com/syndtr/goleveldb v1.0.0/go.mod h1:ZVVdQEZoIme9iO1Ch2Jdy24qqXrMMOU6lpPAyBWyWuQ= 16 | golang.org/x/net v0.0.0-20180906233101-161cd47e91fd h1:nTDtHvHSdCn1m6ITfMRqtOd/9+7a3s8RBNOZ3eYZzJA= 17 | golang.org/x/net v0.0.0-20180906233101-161cd47e91fd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= 18 | golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 19 | golang.org/x/sys v0.0.0-20180909124046-d0be0721c37e h1:o3PsSEY8E4eXWkXrIP9YJALUkVZqzHJT5DOasTyn8Vs= 20 | golang.org/x/sys v0.0.0-20180909124046-d0be0721c37e/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= 21 | golang.org/x/text v0.3.0 h1:g61tztE5qeGQ89tm6NTjjM9VPIm088od1l6aSorWRWg= 22 | golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= 23 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= 24 | gopkg.in/fsnotify.v1 v1.4.7 h1:xOHLXZwVvI9hhs+cLKq5+I5onOuwQLhQwiu63xxlHs4= 25 | gopkg.in/fsnotify.v1 v1.4.7/go.mod h1:Tz8NjZHkW78fSQdbUxIjBTcgA1z1m8ZHf0WmKUhAMys= 26 | gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7 h1:uRGJdciOHaEIrze2W8Q3AKkepLTh2hOroT7a+7czfdQ= 27 | gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7/go.mod h1:dt/ZhP58zS4L8KSrWDmTeBkI65Dw0HsyUHuEVlX15mw= 28 | gopkg.in/yaml.v2 v2.2.1 h1:mUhvW9EsL+naU5Q3cakzfE91YhliOondGd6ZrsDBHQE= 29 | gopkg.in/yaml.v2 v2.2.1/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= 30 | -------------------------------------------------------------------------------- /mkv: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash -e 2 | cd src 3 | go build -o mkv 4 | ./mkv $@ 5 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | requests 2 | pyarrow 3 | boto3 4 | -------------------------------------------------------------------------------- /src/lib.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bytes" 5 | "context" 6 | "crypto/md5" 7 | "encoding/base64" 8 | "errors" 9 | "fmt" 10 | "io" 11 | "io/ioutil" 12 | "net/http" 13 | "sort" 14 | "strings" 15 | "time" 16 | ) 17 | 18 | // *** DB Type *** 19 | type Deleted int 20 | 21 | const ( 22 | NO Deleted = 0 23 | SOFT Deleted = 1 24 | HARD Deleted = 2 25 | ) 26 | 27 | type Record struct { 28 | rvolumes []string 29 | deleted Deleted 30 | hash string 31 | } 32 | 33 | func toRecord(data []byte) Record { 34 | var rec Record 35 | ss := string(data) 36 | rec.deleted = NO 37 | if strings.HasPrefix(ss, "DELETED") { 38 | rec.deleted = SOFT 39 | ss = ss[7:] 40 | } 41 | if strings.HasPrefix(ss, "HASH") { 42 | rec.hash = ss[4:36] 43 | ss = ss[36:] 44 | } 45 | rec.rvolumes = strings.Split(ss, ",") 46 | return rec 47 | } 48 | 49 | func fromRecord(rec Record) []byte { 50 | cc := "" 51 | if rec.deleted == HARD { 52 | panic("Can't put HARD delete in the database") 53 | } 54 | if rec.deleted == SOFT { 55 | cc = "DELETED" 56 | } 57 | if len(rec.hash) == 32 { 58 | cc += "HASH" + rec.hash 59 | } 60 | return []byte(cc + strings.Join(rec.rvolumes, ",")) 61 | } 62 | 63 | // *** Hash Functions *** 64 | 65 | func key2path(key []byte) string { 66 | mkey := md5.Sum(key) 67 | b64key := base64.StdEncoding.EncodeToString(key) 68 | 69 | // 2 byte layers deep, meaning a fanout of 256 70 | // optimized for 2^24 = 16M files per volume server 71 | return 
fmt.Sprintf("/%02x/%02x/%s", mkey[0], mkey[1], b64key) 72 | } 73 | 74 | type sortvol struct { 75 | score []byte 76 | volume string 77 | } 78 | type byScore []sortvol 79 | 80 | func (s byScore) Len() int { return len(s) } 81 | func (s byScore) Swap(i, j int) { s[i], s[j] = s[j], s[i] } 82 | func (s byScore) Less(i, j int) bool { 83 | return bytes.Compare(s[i].score, s[j].score) == 1 84 | } 85 | 86 | func key2volume(key []byte, volumes []string, count int, svcount int) []string { 87 | // this is an intelligent way to pick the volume server for a file 88 | // stable in the volume server name (not position!) 89 | // and if more are added the correct portion will move (yay md5!) 90 | var sortvols []sortvol 91 | for _, v := range volumes { 92 | hash := md5.New() 93 | hash.Write(key) 94 | hash.Write([]byte(v)) 95 | score := hash.Sum(nil) 96 | sortvols = append(sortvols, sortvol{score, v}) 97 | } 98 | sort.Stable(byScore(sortvols)) 99 | // go should have a map function 100 | // this adds the subvolumes 101 | var ret []string 102 | for i := 0; i < count; i++ { 103 | sv := sortvols[i] 104 | var volume string 105 | if svcount == 1 { 106 | // if it's one, don't use the path structure for it 107 | volume = sv.volume 108 | } else { 109 | // use the least significant compare dword for the subvolume 110 | // using only a byte would cause potential imbalance 111 | svhash := uint(sv.score[12])<<24 + uint(sv.score[13])<<16 + 112 | uint(sv.score[14])<<8 + uint(sv.score[15]) 113 | volume = fmt.Sprintf("%s/sv%02X", sv.volume, svhash%uint(svcount)) 114 | } 115 | ret = append(ret, volume) 116 | } 117 | //fmt.Println(string(key), ret[0]) 118 | return ret 119 | } 120 | 121 | func needs_rebalance(volumes []string, kvolumes []string) bool { 122 | if len(volumes) != len(kvolumes) { 123 | return true 124 | } 125 | for i := 0; i < len(volumes); i++ { 126 | if volumes[i] != kvolumes[i] { 127 | return true 128 | } 129 | } 130 | return false 131 | } 132 | 133 | // *** Remote Access Functions *** 
134 | 135 | func remote_delete(remote string) error { 136 | req, err := http.NewRequest("DELETE", remote, nil) 137 | if err != nil { 138 | return err 139 | } 140 | resp, err := http.DefaultClient.Do(req) 141 | if err != nil { 142 | return err 143 | } 144 | defer resp.Body.Close() 145 | if resp.StatusCode != 204 && resp.StatusCode != 404 { 146 | return fmt.Errorf("remote_delete: wrong status code %d", resp.StatusCode) 147 | } 148 | return nil 149 | } 150 | 151 | func remote_put(remote string, length int64, body io.Reader) error { 152 | req, err := http.NewRequest("PUT", remote, body) 153 | if err != nil { 154 | return err 155 | } 156 | req.ContentLength = length 157 | resp, err := http.DefaultClient.Do(req) 158 | if err != nil { 159 | return err 160 | } 161 | defer resp.Body.Close() 162 | if resp.StatusCode != 201 && resp.StatusCode != 204 { 163 | return fmt.Errorf("remote_put: wrong status code %d", resp.StatusCode) 164 | } 165 | return nil 166 | } 167 | 168 | func remote_get(remote string) (string, error) { 169 | resp, err := http.Get(remote) 170 | if err != nil { 171 | return "", err 172 | } 173 | defer resp.Body.Close() 174 | if resp.StatusCode != 200 { 175 | return "", errors.New(fmt.Sprintf("remote_get: wrong status code %d", resp.StatusCode)) 176 | } 177 | body, err := ioutil.ReadAll(resp.Body) 178 | if err != nil { 179 | return "", err 180 | } 181 | return string(body), nil 182 | } 183 | 184 | func remote_head(remote string, timeout time.Duration) (bool, error) { 185 | ctx, cancel := context.WithTimeout(context.Background(), timeout) 186 | defer cancel() 187 | req, err := http.NewRequestWithContext(ctx, "HEAD", remote, nil) 188 | if err != nil { 189 | return false, err 190 | } 191 | resp, err := http.DefaultClient.Do(req) 192 | if err != nil { 193 | return false, err 194 | } 195 | defer resp.Body.Close() 196 | return resp.StatusCode == 200, nil 197 | } 198 | -------------------------------------------------------------------------------- /src/lib_test.go: 
--------------------------------------------------------------------------------
package main

import (
	"fmt"
	"reflect"
	"strings"
	"testing"
)

// ensure the path hashing function doesn't change
func Test_key2path(t *testing.T) {
	// golden values: /xx/yy/ comes from md5(key), filename is base64(key)
	tests := map[string]string{
		"hello":      "/5d/41/aGVsbG8=",
		"helloworld": "/fc/5e/aGVsbG93b3JsZA==",
	}
	for k, v := range tests {
		ret := key2path([]byte(k))
		if ret != v {
			t.Fatal("key2path function broke", k, ret, v)
		}
	}
}

// ensure the volume hashing function doesn't change
func Test_key2volume(t *testing.T) {
	volumes := []string{"larry", "moe", "curly"}
	// expected winner for each key under rendezvous (highest-score) hashing
	tests := map[string]string{
		"hello":      "larry",
		"helloworld": "curly",
		"world":      "moe",
		"blah":       "curly",
	}
	for k, v := range tests {
		ret := key2volume([]byte(k), volumes, 1, 3)
		// strip any "/svXX" subvolume suffix before comparing
		if strings.Split(ret[0], "/")[0] != v {
			t.Fatal("key2volume function broke", k, ret, v)
		}
	}
}

// fromToRecordExample checks that rec serializes to exactly val and that
// deserializing it again round-trips to an equal Record.
func fromToRecordExample(t *testing.T, rec Record, val string) {
	recs := fromRecord(rec)
	if val != string(recs) {
		t.Fatal("record string didn't match")
	}
	reca := toRecord(recs)
	if !reflect.DeepEqual(rec, reca) {
		t.Fatal("toRecord(fromRecord(rec)) failed")
	}
	fmt.Println(val)
}

// exercises all combinations of the DELETED marker and optional 32-char HASH
func Test_fromToRecord(t *testing.T) {
	fromToRecordExample(t, Record{[]string{"hello", "world"}, SOFT, ""}, "DELETEDhello,world")
	fromToRecordExample(t, Record{[]string{"hello", "world"}, NO, ""}, "hello,world")
	fromToRecordExample(t, Record{[]string{"hello"}, NO, ""}, "hello")
	fromToRecordExample(t, Record{[]string{"hello"}, SOFT, ""}, "DELETEDhello")
	fromToRecordExample(t, Record{[]string{"hello"}, SOFT, "5d41402abc4b2a76b9719d911017c592"}, "DELETEDHASH5d41402abc4b2a76b9719d911017c592hello")
	fromToRecordExample(t, Record{[]string{"hello"}, NO, "5d41402abc4b2a76b9719d911017c592"}, "HASH5d41402abc4b2a76b9719d911017c592hello")
}
-------------------------------------------------------------------------------- /src/main.go: --------------------------------------------------------------------------------
package main

import (
	"flag"
	"fmt"
	"io/ioutil"
	"log"
	"math/rand"
	"net/http"
	"strings"
	"sync"
	"time"

	"github.com/syndtr/goleveldb/leveldb"
)

// *** App struct and methods ***

// App is the master server: a LevelDB index plus per-key write locks and
// the configuration shared by the server/rebuild/rebalance commands.
type App struct {
	db    *leveldb.DB
	mlock sync.Mutex          // guards lock
	lock  map[string]struct{} // set of keys currently being written

	// params
	uploadids  map[string]bool // outstanding S3 multipart upload ids
	volumes    []string        // volume server host:port list
	fallback   string          // optional server queried for missing keys
	replicas   int             // copies made of each value
	subvolumes int             // disks per volume server
	protect    bool            // require UNLINK before DELETE
	md5sum     bool            // store an MD5 of each value
	voltimeout time.Duration   // GET/HEAD deadline before a volume is considered down
}

// UnlockKey releases the write lock taken by LockKey.
func (a *App) UnlockKey(key []byte) {
	a.mlock.Lock()
	delete(a.lock, string(key))
	a.mlock.Unlock()
}

// LockKey attempts to take the per-key write lock; it returns false
// (without blocking) if another request already holds it.
func (a *App) LockKey(key []byte) bool {
	a.mlock.Lock()
	defer a.mlock.Unlock()
	if _, prs := a.lock[string(key)]; prs {
		return false
	}
	a.lock[string(key)] = struct{}{}
	return true
}

// GetRecord loads the Record for key; a missing key is reported as a
// HARD-deleted record with no volumes.
func (a *App) GetRecord(key []byte) Record {
	data, err := a.db.Get(key, nil)
	rec := Record{[]string{}, HARD, ""}
	if err != leveldb.ErrNotFound {
		rec = toRecord(data)
	}
	return rec
}

// PutRecord serializes rec into the index; returns true on success.
func (a *App) PutRecord(key []byte, rec Record) bool {
	return a.db.Put(key, fromRecord(rec), nil) == nil
}

// *** Entry Point ***

func main() {
	// the master fans out to many volume servers; allow plenty of keep-alives
	http.DefaultTransport.(*http.Transport).MaxIdleConnsPerHost = 100
	rand.Seed(time.Now().Unix())

	port := flag.Int("port", 3000, "Port for the server to listen on")
	pdb := flag.String("db", "", "Path to leveldb")
	fallback := flag.String("fallback", "", "Fallback server for missing keys")
	replicas := flag.Int("replicas", 3, "Amount of replicas to make of the data")
	subvolumes := flag.Int("subvolumes", 10, "Amount of subvolumes, disks per machine")
	pvolumes := flag.String("volumes", "", "Volumes to use for storage, comma separated")
	protect := flag.Bool("protect", false, "Force UNLINK before DELETE")
	verbose := flag.Bool("v", false, "Verbose output")
	md5sum := flag.Bool("md5sum", true, "Calculate and store MD5 checksum of values")
	voltimeout := flag.Duration("voltimeout", 1*time.Second, "Volume servers must respond to GET/HEAD requests in this amount of time or they are considered down, as duration")
	flag.Parse()

	volumes := strings.Split(*pvolumes, ",")
	// the single positional argument selects the mode
	command := flag.Arg(0)

	if command != "server" && command != "rebuild" && command != "rebalance" {
		fmt.Println("Usage: ./mkv ")
		flag.PrintDefaults()
		return
	}

	if !*verbose {
		log.SetOutput(ioutil.Discard)
	} else {
		log.SetFlags(log.LstdFlags | log.Lmicroseconds)
	}

	if *pdb == "" {
		panic("Need a path to the database")
	}

	if len(volumes) < *replicas {
		panic("Need at least as many volumes as replicas")
	}

	// LevelDB allows only one process; this also serializes server/rebalance runs
	db, err := leveldb.OpenFile(*pdb, nil)
	if err != nil {
		panic(fmt.Sprintf("LevelDB open failed: %s", err))
	}
	defer db.Close()

	fmt.Printf("volume servers: %s\n", volumes)
	a := App{db: db,
		lock:       make(map[string]struct{}),
		uploadids:  make(map[string]bool),
		volumes:    volumes,
		fallback:   *fallback,
		replicas:   *replicas,
		subvolumes: *subvolumes,
		protect:    *protect,
		md5sum:     *md5sum,
		voltimeout: *voltimeout,
	}

	if command == "server" {
		// NOTE(review): ListenAndServe's error is discarded, so a bind
		// failure exits silently — confirm whether that is intended.
		http.ListenAndServe(fmt.Sprintf(":%d", *port), &a)
	} else if command == "rebuild" {
		a.Rebuild()
	} else if command == "rebalance" {
		a.Rebalance()
	}
}
-------------------------------------------------------------------------------- /src/rebalance.go:
--------------------------------------------------------------------------------
package main

import (
	"fmt"
	"strings"
	"sync"
	"time"
)

// RebalanceRequest asks that key be moved from the volumes recorded in the
// index (volumes) onto the volumes the hash now selects (kvolumes).
type RebalanceRequest struct {
	key      []byte
	volumes  []string
	kvolumes []string
}

// rebalance moves one key onto its desired volume set: verify which replicas
// actually exist, copy to the new volumes, update the index, then delete the
// stale replicas. Returns false if any step failed.
func rebalance(a *App, req RebalanceRequest) bool {
	kp := key2path(req.key)

	// find the volumes that are real
	rvolumes := make([]string, 0)
	for _, rv := range req.volumes {
		remote_test := fmt.Sprintf("http://%s%s", rv, kp)
		// generous timeout: volume servers may be busy during a rebalance
		found, err := remote_head(remote_test, 1*time.Minute)
		if err != nil {
			fmt.Println("rebalance head error", err, remote_test)
			return false
		}
		if found {
			rvolumes = append(rvolumes, rv)
		}
	}

	if len(rvolumes) == 0 {
		// no surviving replica anywhere; the data is gone
		fmt.Printf("rebalance impossible, %s is missing!\n", string(req.key))
		return false
	}

	if !needs_rebalance(rvolumes, req.kvolumes) {
		return true
	}

	// debug
	fmt.Println("rebalancing", string(req.key), "from", rvolumes, "to", req.kvolumes)

	// find a good rvolume: try each surviving replica until one read succeeds
	var err error = nil
	var ss string
	for _, v := range rvolumes {
		remote_from := fmt.Sprintf("http://%s%s", v, kp)

		// read
		ss, err = remote_get(remote_from)
		if err != nil {
			fmt.Println("rebalance get error", err, remote_from)
		} else {
			break
		}
	}
	if err != nil {
		return false
	}

	// write to the kvolumes
	rebalance_error := false
	for _, v := range req.kvolumes {
		needs_write := true
		// see if it's already there
		for _, v2 := range rvolumes {
			if v == v2 {
				needs_write = false
				break
			}
		}
		if needs_write {
			remote_to := fmt.Sprintf("http://%s%s", v, kp)
			// write
			if err := remote_put(remote_to, int64(len(ss)), strings.NewReader(ss)); err != nil {
				fmt.Println("rebalance put error", err, remote_to)
				rebalance_error = true
			}
		}
	}
	if rebalance_error {
		return false
	}

	// update db; only after all new replicas are written, so a crash here
	// leaves extra copies rather than a dangling index entry
	if !a.PutRecord(req.key, Record{req.kvolumes, NO, ""}) {
		fmt.Println("rebalance put db error", err)
		return false
	}

	// delete from the volumes that now aren't kvolumes
	delete_error := false
	for _, v2 := range rvolumes {
		needs_delete := true
		for _, v := range req.kvolumes {
			if v == v2 {
				needs_delete = false
				break
			}
		}
		if needs_delete {
			remote_del := fmt.Sprintf("http://%s%s", v2, kp)
			if err := remote_delete(remote_del); err != nil {
				fmt.Println("rebalance delete error", err, remote_del)
				delete_error = true
			}
		}
	}
	if delete_error {
		return false
	}
	return true
}

// Rebalance walks the whole index and moves every key onto the volume set
// chosen by the current hash, using 16 worker goroutines fed from a channel.
func (a *App) Rebalance() {
	fmt.Println("rebalancing to", a.volumes)

	var wg sync.WaitGroup
	reqs := make(chan RebalanceRequest, 20000)

	for i := 0; i < 16; i++ {
		go func() {
			for req := range reqs {
				rebalance(a, req)
				wg.Done()
			}
		}()
	}

	iter := a.db.NewIterator(nil, nil)
	defer iter.Release()
	for iter.Next() {
		// iter.Key() is reused by the iterator; copy before handing it off
		key := make([]byte, len(iter.Key()))
		copy(key, iter.Key())
		rec := toRecord(iter.Value())
		kvolumes := key2volume(key, a.volumes, a.replicas, a.subvolumes)
		wg.Add(1)
		reqs <- RebalanceRequest{
			key:      key,
			volumes:  rec.rvolumes,
			kvolumes: kvolumes}
	}
	close(reqs)

	wg.Wait()
}
-------------------------------------------------------------------------------- /src/rebuild.go: --------------------------------------------------------------------------------
package main

import (
	"encoding/base64"
	"encoding/hex"
	"encoding/json"
	"fmt"
	"strings"
	"sync"

	"github.com/syndtr/goleveldb/leveldb"
)

// File is one entry from a volume server's JSON directory listing.
type File struct {
	Name
string 16 | Type string 17 | Mtime string 18 | } 19 | 20 | type RebuildRequest struct { 21 | vol string 22 | url string 23 | } 24 | 25 | func get_files(url string) []File { 26 | //fmt.Println(url) 27 | var files []File 28 | dat, err := remote_get(url) 29 | if err != nil { 30 | fmt.Println("ugh", err) 31 | return files 32 | } 33 | json.Unmarshal([]byte(dat), &files) 34 | return files 35 | } 36 | 37 | func rebuild(a *App, vol string, name string) bool { 38 | key, err := base64.StdEncoding.DecodeString(name) 39 | if err != nil { 40 | fmt.Println("base64 decode error", err) 41 | return false 42 | } 43 | 44 | kvolumes := key2volume(key, a.volumes, a.replicas, a.subvolumes) 45 | 46 | if !a.LockKey(key) { 47 | fmt.Println("lockKey issue", key) 48 | return false 49 | } 50 | defer a.UnlockKey(key) 51 | 52 | data, err := a.db.Get(key, nil) 53 | var rec Record 54 | if err != leveldb.ErrNotFound { 55 | rec = toRecord(data) 56 | rec.rvolumes = append(rec.rvolumes, vol) 57 | } else { 58 | rec = Record{[]string{vol}, NO, ""} 59 | } 60 | 61 | // sort by order in kvolumes (sorry it's n^2 but n is small) 62 | pvalues := make([]string, 0) 63 | for _, v := range kvolumes { 64 | for _, v2 := range rec.rvolumes { 65 | if v == v2 { 66 | pvalues = append(pvalues, v) 67 | } 68 | } 69 | } 70 | // insert the ones that aren't there at the end 71 | for _, v2 := range rec.rvolumes { 72 | insert := true 73 | for _, v := range kvolumes { 74 | if v == v2 { 75 | insert = false 76 | break 77 | } 78 | } 79 | if insert { 80 | pvalues = append(pvalues, v2) 81 | } 82 | } 83 | 84 | if !a.PutRecord(key, Record{pvalues, NO, ""}) { 85 | fmt.Println("put error", err) 86 | return false 87 | } 88 | 89 | fmt.Println(string(key), pvalues) 90 | return true 91 | } 92 | 93 | func valid(a File) bool { 94 | if len(a.Name) != 2 || a.Type != "directory" { 95 | return false 96 | } 97 | decoded, err := hex.DecodeString(a.Name) 98 | if err != nil { 99 | return false 100 | } 101 | if len(decoded) != 1 { 102 | return false 
103 | } 104 | return true 105 | } 106 | 107 | func (a *App) Rebuild() { 108 | fmt.Println("rebuilding on", a.volumes) 109 | 110 | // empty db 111 | iter := a.db.NewIterator(nil, nil) 112 | for iter.Next() { 113 | a.db.Delete(iter.Key(), nil) 114 | } 115 | 116 | var wg sync.WaitGroup 117 | reqs := make(chan RebuildRequest, 20000) 118 | 119 | for i := 0; i < 128; i++ { 120 | go func() { 121 | for req := range reqs { 122 | files := get_files(req.url) 123 | for _, f := range files { 124 | rebuild(a, req.vol, f.Name) 125 | } 126 | wg.Done() 127 | } 128 | }() 129 | } 130 | 131 | parse_volume := func(tvol string) { 132 | for _, i := range get_files(fmt.Sprintf("http://%s/", tvol)) { 133 | if valid(i) { 134 | for _, j := range get_files(fmt.Sprintf("http://%s/%s/", tvol, i.Name)) { 135 | if valid(j) { 136 | wg.Add(1) 137 | url := fmt.Sprintf("http://%s/%s/%s/", tvol, i.Name, j.Name) 138 | reqs <- RebuildRequest{tvol, url} 139 | } 140 | } 141 | } 142 | } 143 | } 144 | 145 | for _, vol := range a.volumes { 146 | has_subvolumes := false 147 | for _, f := range get_files(fmt.Sprintf("http://%s/", vol)) { 148 | if len(f.Name) == 4 && strings.HasPrefix(f.Name, "sv") && f.Type == "directory" { 149 | parse_volume(fmt.Sprintf("%s/%s", vol, f.Name)) 150 | has_subvolumes = true 151 | } 152 | } 153 | if !has_subvolumes { 154 | parse_volume(vol) 155 | } 156 | } 157 | 158 | close(reqs) 159 | wg.Wait() 160 | } 161 | -------------------------------------------------------------------------------- /src/s3api.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "encoding/xml" 5 | "io" 6 | "io/ioutil" 7 | ) 8 | 9 | type CompleteMultipartUpload struct { 10 | XMLName xml.Name `xml:"CompleteMultipartUpload"` 11 | PartNumbers []int `xml:"Part>PartNumber"` 12 | } 13 | 14 | type Delete struct { 15 | XMLName xml.Name `xml:"Delete"` 16 | Keys []string `xml:"Object>Key"` 17 | } 18 | 19 | func parseXML(r io.Reader, dat interface{}) 
error { 20 | out, err := ioutil.ReadAll(r) 21 | if err != nil { 22 | return err 23 | } 24 | if err := xml.Unmarshal(out, &dat); err != nil { 25 | return err 26 | } 27 | return nil 28 | } 29 | 30 | func parseCompleteMultipartUpload(r io.Reader) (*CompleteMultipartUpload, error) { 31 | var cmu CompleteMultipartUpload 32 | err := parseXML(r, &cmu) 33 | if err != nil { 34 | return nil, err 35 | } 36 | return &cmu, nil 37 | } 38 | 39 | func parseDelete(r io.Reader) (*Delete, error) { 40 | var del Delete 41 | err := parseXML(r, &del) 42 | if err != nil { 43 | return nil, err 44 | } 45 | return &del, nil 46 | } 47 | -------------------------------------------------------------------------------- /src/server.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bytes" 5 | "crypto/md5" 6 | "encoding/json" 7 | "fmt" 8 | "io" 9 | "log" 10 | "math/rand" 11 | "net/http" 12 | "os" 13 | "strconv" 14 | "strings" 15 | 16 | "github.com/google/uuid" 17 | "github.com/syndtr/goleveldb/leveldb/util" 18 | ) 19 | 20 | // *** Master Server *** 21 | 22 | type ListResponse struct { 23 | Next string `json:"next"` 24 | Keys []string `json:"keys"` 25 | } 26 | 27 | func (a *App) QueryHandler(key []byte, w http.ResponseWriter, r *http.Request) { 28 | if r.URL.Query().Get("list-type") == "2" { 29 | // this is an S3 style query 30 | // TODO: this is very incomplete 31 | key = []byte(string(key) + "/" + r.URL.Query().Get("prefix")) 32 | iter := a.db.NewIterator(util.BytesPrefix(key), nil) 33 | defer iter.Release() 34 | 35 | ret := "" 36 | for iter.Next() { 37 | rec := toRecord(iter.Value()) 38 | if rec.deleted != NO { 39 | continue 40 | } 41 | ret += "" + string(iter.Key()[len(key):]) + "" 42 | } 43 | ret += "" 44 | w.WriteHeader(200) 45 | w.Write([]byte(ret)) 46 | return 47 | } 48 | 49 | // operation is first query parameter (e.g. 
// Delete removes a key from the store. With unlink=true only the index
// entry is soft-deleted (replica blobs are left in place on the volume
// servers); with unlink=false the key is hard-deleted: the blob is removed
// from every replica and the record is dropped from leveldb entirely.
//
// Returns an HTTP-style status code: 204 on success, 404 if the key is
// already deleted (or already unlinked when unlinking), 403 when protect
// mode forbids deleting a live key without unlinking it first, and 500 on
// a database or replica failure.
func (a *App) Delete(key []byte, unlink bool) int {
	// delete the key, first locally
	rec := a.GetRecord(key)
	if rec.deleted == HARD || (unlink && rec.deleted == SOFT) {
		return 404
	}

	// in protect mode a live (NO) key must be unlinked before it can be
	// hard-deleted
	if !unlink && a.protect && rec.deleted == NO {
		return 403
	}

	// mark as deleted
	if !a.PutRecord(key, Record{rec.rvolumes, SOFT, rec.hash}) {
		return 500
	}

	if !unlink {
		// then remotely, if this is not an unlink
		delete_error := false
		for _, volume := range rec.rvolumes {
			remote := fmt.Sprintf("http://%s%s", volume, key2path(key))
			if remote_delete(remote) != nil {
				// if this fails, it's possible to get an orphan file
				// but i'm not really sure what else to do?
				delete_error = true
			}
		}

		if delete_error {
			// NOTE(review): on this path the key stays SOFT-deleted in the
			// index while some replicas may still hold the blob
			return 500
		}

		// this is a hard delete in the database, aka nothing
		a.db.Delete(key, nil)
	}

	// 204, all good
	return 204
}
r.ContentLength, r.Header["Range"]) 190 | 191 | // this is a list query 192 | if len(r.URL.RawQuery) > 0 && r.Method == "GET" { 193 | a.QueryHandler(key, w, r) 194 | return 195 | } 196 | 197 | // lock the key while a PUT or DELETE is in progress 198 | if r.Method == "POST" || r.Method == "PUT" || r.Method == "DELETE" || r.Method == "UNLINK" || r.Method == "REBALANCE" { 199 | if !a.LockKey(lkey) { 200 | // Conflict, retry later 201 | w.WriteHeader(409) 202 | return 203 | } 204 | defer a.UnlockKey(lkey) 205 | } 206 | 207 | switch r.Method { 208 | case "GET", "HEAD": 209 | rec := a.GetRecord(key) 210 | var remote string 211 | if len(rec.hash) != 0 { 212 | // note that the hash is always of the whole file, not the content requested 213 | w.Header().Set("Content-Md5", rec.hash) 214 | } 215 | if rec.deleted == SOFT || rec.deleted == HARD { 216 | if a.fallback == "" { 217 | w.Header().Set("Content-Length", "0") 218 | w.WriteHeader(404) 219 | return 220 | } 221 | // fall through to fallback 222 | remote = fmt.Sprintf("http://%s%s", a.fallback, key) 223 | } else { 224 | kvolumes := key2volume(key, a.volumes, a.replicas, a.subvolumes) 225 | if needs_rebalance(rec.rvolumes, kvolumes) { 226 | w.Header().Set("Key-Balance", "unbalanced") 227 | fmt.Println("on wrong volumes, needs rebalance") 228 | } else { 229 | w.Header().Set("Key-Balance", "balanced") 230 | } 231 | w.Header().Set("Key-Volumes", strings.Join(rec.rvolumes, ",")) 232 | 233 | // check the volume servers in a random order 234 | good := false 235 | for _, vn := range rand.Perm(len(rec.rvolumes)) { 236 | remote = fmt.Sprintf("http://%s%s", rec.rvolumes[vn], key2path(key)) 237 | found, _ := remote_head(remote, a.voltimeout) 238 | if found { 239 | good = true 240 | break 241 | } 242 | } 243 | // if not found on any volume servers, fail before the redirect 244 | if !good { 245 | w.Header().Set("Content-Length", "0") 246 | w.WriteHeader(404) 247 | return 248 | } 249 | // note: this can race and fail, but in that case the 
client will handle the retry 250 | } 251 | w.Header().Set("Location", remote) 252 | w.Header().Set("Content-Length", "0") 253 | w.WriteHeader(302) 254 | case "POST": 255 | // check if we already have the key, and it's not deleted 256 | rec := a.GetRecord(key) 257 | if rec.deleted == NO { 258 | // Forbidden to overwrite with POST 259 | w.WriteHeader(403) 260 | return 261 | } 262 | 263 | // this will handle multipart uploads in "S3" 264 | if r.URL.RawQuery == "uploads" { 265 | uploadid := uuid.New().String() 266 | a.uploadids[uploadid] = true 267 | 268 | // init multipart upload 269 | w.WriteHeader(200) 270 | w.Write([]byte(` 271 | ` + uploadid + ` 272 | `)) 273 | } else if r.URL.RawQuery == "delete" { 274 | del, err := parseDelete(r.Body) 275 | if err != nil { 276 | log.Println(err) 277 | w.WriteHeader(500) 278 | return 279 | } 280 | 281 | for _, subkey := range del.Keys { 282 | fullkey := fmt.Sprintf("%s/%s", key, subkey) 283 | status := a.Delete([]byte(fullkey), false) 284 | if status != 204 { 285 | w.WriteHeader(status) 286 | return 287 | } 288 | } 289 | w.WriteHeader(204) 290 | } else if uploadid := r.URL.Query().Get("uploadId"); uploadid != "" { 291 | if a.uploadids[uploadid] != true { 292 | w.WriteHeader(403) 293 | return 294 | } 295 | delete(a.uploadids, uploadid) 296 | 297 | // finish multipart upload 298 | cmu, err := parseCompleteMultipartUpload(r.Body) 299 | if err != nil { 300 | log.Println(err) 301 | w.WriteHeader(500) 302 | return 303 | } 304 | 305 | // open all the part files 306 | var fs []io.Reader 307 | sz := int64(0) 308 | for _, part := range cmu.PartNumbers { 309 | fn := fmt.Sprintf("/tmp/%s-%d", uploadid, part) 310 | f, err := os.Open(fn) 311 | os.Remove(fn) 312 | if err != nil { 313 | w.WriteHeader(403) 314 | return 315 | } 316 | defer f.Close() 317 | fi, _ := f.Stat() 318 | sz += fi.Size() 319 | fs = append(fs, f) 320 | } 321 | 322 | status := a.WriteToReplicas(key, io.MultiReader(fs...), sz) 323 | w.WriteHeader(status) 324 | 
w.Write([]byte("")) 325 | return 326 | } 327 | case "PUT": 328 | // no empty values 329 | if r.ContentLength == 0 { 330 | w.WriteHeader(411) 331 | return 332 | } 333 | 334 | // check if we already have the key, and it's not deleted 335 | rec := a.GetRecord(key) 336 | if rec.deleted == NO { 337 | // Forbidden to overwrite with PUT 338 | w.WriteHeader(403) 339 | return 340 | } 341 | 342 | if pn := r.URL.Query().Get("partNumber"); pn != "" { 343 | uploadid := r.URL.Query().Get("uploadId") 344 | if a.uploadids[uploadid] != true { 345 | w.WriteHeader(403) 346 | return 347 | } 348 | 349 | pnnum, _ := strconv.Atoi(pn) 350 | f, err := os.OpenFile(fmt.Sprintf("/tmp/%s-%d", uploadid, pnnum), os.O_RDWR|os.O_CREATE, 0600) 351 | if err != nil { 352 | w.WriteHeader(403) 353 | return 354 | } 355 | defer f.Close() 356 | io.Copy(f, r.Body) 357 | w.WriteHeader(200) 358 | } else { 359 | status := a.WriteToReplicas(key, r.Body, r.ContentLength) 360 | w.WriteHeader(status) 361 | } 362 | case "DELETE", "UNLINK": 363 | status := a.Delete(key, r.Method == "UNLINK") 364 | w.WriteHeader(status) 365 | case "REBALANCE": 366 | rec := a.GetRecord(key) 367 | if rec.deleted != NO { 368 | w.WriteHeader(404) 369 | return 370 | } 371 | 372 | kvolumes := key2volume(key, a.volumes, a.replicas, a.subvolumes) 373 | rbreq := RebalanceRequest{key: key, volumes: rec.rvolumes, kvolumes: kvolumes} 374 | if !rebalance(a, rbreq) { 375 | w.WriteHeader(400) 376 | return 377 | } 378 | 379 | // 204, all good 380 | w.WriteHeader(204) 381 | } 382 | } 383 | -------------------------------------------------------------------------------- /tools/bringup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #trap "trap - SIGTERM && kill -- -$$" SIGINT SIGTERM EXIT 3 | kill $(pgrep -f nginx) 4 | 5 | PORT=3001 ./volume /tmp/volume1/ & 6 | PORT=3002 ./volume /tmp/volume2/ & 7 | PORT=3003 ./volume /tmp/volume3/ & 8 | PORT=3004 ./volume /tmp/volume4/ & 9 | PORT=3005 ./volume 
/tmp/volume5/ & 10 | 11 | ./mkv -port 3000 -volumes localhost:3001,localhost:3002,localhost:3003,localhost:3004,localhost:3005 -db /tmp/indexdb/ server 12 | 13 | -------------------------------------------------------------------------------- /tools/kill.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | kill $(pgrep -f nginx) 3 | 4 | -------------------------------------------------------------------------------- /tools/leveldb_compare.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "os" 6 | "strings" 7 | 8 | "github.com/syndtr/goleveldb/leveldb" 9 | "github.com/syndtr/goleveldb/leveldb/opt" 10 | ) 11 | 12 | func main() { 13 | opts := &opt.Options{ErrorIfMissing: true, ReadOnly: true} 14 | db1, err1 := leveldb.OpenFile(os.Args[1], opts) 15 | if err1 != nil { 16 | panic(fmt.Sprintf("db1 open failed: %s", err1)) 17 | } 18 | db2, err2 := leveldb.OpenFile(os.Args[2], opts) 19 | if err2 != nil { 20 | panic(fmt.Sprintf("db2 open failed: %s", err2)) 21 | } 22 | 23 | iter1 := db1.NewIterator(nil, nil) 24 | iter2 := db2.NewIterator(nil, nil) 25 | bad := false 26 | for iter1.Next() { 27 | iter2.Next() 28 | k1 := string(iter1.Key()) 29 | v1 := string(iter1.Value()) 30 | k2 := string(iter2.Key()) 31 | v2 := string(iter2.Value()) 32 | if k1 != k2 { 33 | panic(fmt.Sprintf("key mismatch %s != %s", k1, k2)) 34 | } 35 | // remove the hashes for compare 36 | if strings.HasPrefix(v1, "HASH") { 37 | v1 = v1[36:] 38 | } 39 | if strings.HasPrefix(v2, "HASH") { 40 | v2 = v2[36:] 41 | } 42 | if v1 != v2 { 43 | // we can continue with a value mismatch 44 | fmt.Printf("%s: %s != %s\n", k1, v1, v2) 45 | bad = true 46 | } 47 | } 48 | if bad { 49 | panic("not all values matched") 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /tools/rtest.sh: 
#!/bin/bash
# rtest.sh -- end-to-end test of the mkv "rebuild" and "rebalance" subcommands.
# Assumes bringup.sh left the five volume servers (ports 3001-3005) running
# and that the live index database is at /tmp/indexdb/.
ALTDB=/tmp/indexdbalt/
ALTDB2=/tmp/indexdbalt2/
echo "rebuild and rebalance test"

# take down main server (now leaves nginx running)
kill $(pgrep -f "indexdb")
set -e

# rebuild and compare the database
# (a rebuild into a fresh db must reproduce the live index exactly)
./mkv -volumes localhost:3001,localhost:3002,localhost:3003,localhost:3004,localhost:3005 -db $ALTDB rebuild
go run tools/leveldb_compare.go /tmp/indexdb/ $ALTDB

# do a rebalance, then put it back
# (shrink to 3 volumes, expand back to 5; the index should round-trip)
./mkv -volumes localhost:3001,localhost:3002,localhost:3003 -db $ALTDB rebalance
./mkv -volumes localhost:3001,localhost:3002,localhost:3003,localhost:3004,localhost:3005 -db $ALTDB rebalance
go run tools/leveldb_compare.go /tmp/indexdb/ $ALTDB

# rebuild and compare the database
# (a fresh rebuild after the rebalance round-trip must still match)
./mkv -volumes localhost:3001,localhost:3002,localhost:3003,localhost:3004,localhost:3005 -db $ALTDB2 rebuild
go run tools/leveldb_compare.go /tmp/indexdb/ $ALTDB2
class TestS3PyArrow(unittest.TestCase):
    """Integration tests driving the server's S3-compatible API through
    PyArrow's S3FileSystem.

    Requires the cluster started by tools/bringup.sh to be listening on
    127.0.0.1:3000; all keys are created under the "bucket/" prefix.
    """

    @classmethod
    def setUpClass(cls):
        # this prevents stupid requests to 169.254.169.254 which take a while
        os.environ["AWS_EC2_METADATA_DISABLED"] = "true"
        cls.s3 = fs.S3FileSystem(endpoint_override="127.0.0.1:3000", scheme="http", anonymous=True)

    def get_fresh_key(self):
        # random key under bucket/ so concurrent runs never collide
        return "bucket/swag-" + binascii.hexlify(os.urandom(10)).decode('utf-8')

    def write_file(self, fn, dat):
        # helper: write raw bytes to key fn through the S3 output stream
        with self.s3.open_output_stream(fn) as f:
            f.write(dat)

    def test_fileinfo(self):
        # HEAD-style stat: size of a freshly written object must match
        fn = self.get_fresh_key()+"-fileinfo"
        self.write_file(fn, b"hello1")
        inf = self.s3.get_file_info(fn)
        self.assertEqual(inf.size, 6)

    def test_fileinfo_list(self):
        # recursive listing of bucket/ must include the new key
        fn = self.get_fresh_key()+"-listdir"
        self.write_file(fn, b"hello1")
        infs = self.s3.get_file_info(fs.FileSelector("bucket/", recursive=True))
        fns = [x.path for x in infs]
        self.assertIn(fn, fns)

    # need to support file delete with POST
    def test_deletedir(self):
        fn = self.get_fresh_key()+"-deltest"
        self.write_file(fn, b"hello1")
        self.s3.delete_dir_contents('bucket')
        inf = self.s3.get_file_info(fn)
        self.assertEqual(inf.type, fs.FileType.NotFound)

    def test_deletefile(self):
        # write, stat, delete, stat again: the key must be gone afterwards
        fn = self.get_fresh_key()+"-delftest"
        self.write_file(fn, b"hello1")
        inf = self.s3.get_file_info(fn)
        self.assertEqual(inf.size, 6)
        self.s3.delete_file(fn)
        inf = self.s3.get_file_info(fn)
        self.assertEqual(inf.type, fs.FileType.NotFound)

    # this needs multipart uploads to work
    def test_largerw(self):
        # 2M-row parquet round-trip; large enough that pyarrow uploads in parts
        tbl = pa.table([pa.array(range(2000000)), pa.array(range(2000000))], ['a', 'b'])

        key = self.get_fresh_key()
        pq.write_table(tbl, key, filesystem=self.s3)
        tbl2 = pq.read_table(key, filesystem=self.s3)
        self.assertEqual(tbl, tbl2) # unclear what sort of equality this checks
        self.s3.delete_file(key)

    def test_smallrw(self):
        # small parquet round-trip (single PUT path)
        tbl = pa.table([pa.array([0,1,2,3])], ['a'])

        key = self.get_fresh_key()
        pq.write_table(tbl, key, filesystem=self.s3)
        tbl2 = pq.read_table(key, filesystem=self.s3)
        self.assertEqual(tbl, tbl2) # unclear what sort of equality this checks
        self.s3.delete_file(key)
requests.delete(key) 44 | self.assertEqual(r.status_code, 204) 45 | 46 | r = requests.get(key) 47 | self.assertEqual(r.status_code, 404) 48 | 49 | def test_doubledelete(self): 50 | key = self.get_fresh_key() 51 | r = requests.put(key, data="onyou") 52 | self.assertEqual(r.status_code, 201) 53 | 54 | r = requests.delete(key) 55 | self.assertEqual(r.status_code, 204) 56 | 57 | r = requests.delete(key) 58 | self.assertNotEqual(r.status_code, 204) 59 | 60 | def test_doubleput(self): 61 | key = self.get_fresh_key() 62 | r = requests.put(key, data="onyou") 63 | self.assertEqual(r.status_code, 201) 64 | 65 | r = requests.put(key, data="onyou") 66 | self.assertNotEqual(r.status_code, 201) 67 | 68 | def test_doubleputwdelete(self): 69 | key = self.get_fresh_key() 70 | r = requests.put(key, data="onyou") 71 | self.assertEqual(r.status_code, 201) 72 | 73 | r = requests.delete(key) 74 | self.assertEqual(r.status_code, 204) 75 | 76 | r = requests.put(key, data="onyou") 77 | self.assertEqual(r.status_code, 201) 78 | 79 | def test_10keys(self): 80 | keys = [self.get_fresh_key() for i in range(10)] 81 | 82 | for k in keys: 83 | r = requests.put(k, data=hashlib.md5(k).hexdigest()) 84 | self.assertEqual(r.status_code, 201) 85 | 86 | for k in keys: 87 | r = requests.get(k) 88 | self.assertEqual(r.status_code, 200) 89 | self.assertEqual(r.text, hashlib.md5(k).hexdigest()) 90 | 91 | for k in keys: 92 | r = requests.delete(k) 93 | self.assertEqual(r.status_code, 204) 94 | 95 | def test_range_request(self): 96 | key = self.get_fresh_key() 97 | r = requests.put(key, data="onyou") 98 | self.assertEqual(r.status_code, 201) 99 | 100 | r = requests.get(key, headers={"Range": "bytes=2-5"}) 101 | self.assertEqual(r.status_code, 206) 102 | self.assertEqual(r.text, "you") 103 | 104 | def test_nonexistent_key(self): 105 | key = self.get_fresh_key() 106 | r = requests.get(key) 107 | self.assertEqual(r.status_code, 404) 108 | 109 | def test_head_request(self): 110 | # head not exist 111 | key = 
self.get_fresh_key() 112 | r = requests.head(key, allow_redirects=True) 113 | self.assertEqual(r.status_code, 404) 114 | # no redirect, content length should be zero 115 | self.assertEqual(int(r.headers['content-length']), 0) 116 | 117 | # head exist 118 | key = self.get_fresh_key() 119 | data = "onyou" 120 | r = requests.put(key, data=data) 121 | self.assertEqual(r.status_code, 201) 122 | r = requests.head(key, allow_redirects=True) 123 | self.assertEqual(r.status_code, 200) 124 | # redirect, content length should be size of data 125 | self.assertEqual(int(r.headers['content-length']), len(data)) 126 | 127 | def test_large_key(self): 128 | key = self.get_fresh_key() 129 | 130 | data = b"a"*(16*1024*1024) 131 | 132 | r = requests.put(key, data=data) 133 | self.assertEqual(r.status_code, 201) 134 | 135 | r = requests.get(key) 136 | self.assertEqual(r.status_code, 200) 137 | self.assertEqual(r.content, data) 138 | 139 | r = requests.delete(key) 140 | self.assertEqual(r.status_code, 204) 141 | 142 | def test_json_list(self): 143 | key = self.get_fresh_key() 144 | data = "eh" 145 | r = requests.put(key+b"1", data=data) 146 | self.assertEqual(r.status_code, 201) 147 | r = requests.put(key+b"2", data=data) 148 | self.assertEqual(r.status_code, 201) 149 | 150 | r = requests.get(key+b"?list") 151 | self.assertEqual(r.status_code, 200) 152 | bkey = key.decode('utf-8') 153 | bkey = "/"+bkey.split("/")[-1] 154 | self.assertEqual(r.json(), {"next": "", "keys": [bkey+"1", bkey+"2"]}) 155 | 156 | def test_json_list_null(self): 157 | r = requests.get(self.get_fresh_key()+b"/DOES_NOT_EXIST?list") 158 | self.assertEqual(r.status_code, 200) 159 | self.assertEqual(r.json(), {"next": "", "keys": []}) 160 | 161 | def test_json_list_limit(self): 162 | prefix = self.get_fresh_key() 163 | keys = [] 164 | data = "0" 165 | limit = 10 166 | for i in range(limit+2): 167 | key = prefix+str(i).encode() 168 | r = requests.put(key, data=data) 169 | self.assertEqual(r.status_code, 201) 170 | 
keys.append("/"+key.decode().split("/")[-1]) 171 | # leveldb is sorted alphabetically 172 | keys = sorted(keys) 173 | # should return first page 174 | r = requests.get(prefix+b"?list&limit="+str(limit).encode()) 175 | self.assertEqual(r.status_code, 200) 176 | self.assertEqual(r.json(), {"next": keys[limit], "keys": keys[:limit]}) 177 | start = quote_plus(r.json()["next"]).encode() 178 | # should return last page 179 | r = requests.get(prefix+b"?list&limit="+str(limit).encode()+b"&start="+start) 180 | self.assertEqual(r.status_code, 200) 181 | self.assertEqual(r.json(), {"next": "", "keys": keys[limit:]}) 182 | 183 | def test_noemptykey(self): 184 | key = self.get_fresh_key() 185 | r = requests.put(key, data="") 186 | self.assertEqual(r.status_code, 411) 187 | 188 | def test_content_hash(self): 189 | for i in range(100): 190 | key = self.get_fresh_key() 191 | r = requests.put(key, data=key) 192 | self.assertEqual(r.status_code, 201) 193 | 194 | r = requests.head(key, allow_redirects=False) 195 | self.assertEqual(r.headers['Content-Md5'], hashlib.md5(key).hexdigest()) 196 | 197 | if __name__ == '__main__': 198 | # wait for servers 199 | for port in range(3000,3006): 200 | print("check port %d" % port) 201 | while 1: 202 | try: 203 | s = socket.create_connection(("localhost", port), timeout=0.5) 204 | s.close() 205 | break 206 | except (ConnectionRefusedError, OSError): 207 | time.sleep(0.5) 208 | continue 209 | print("waiting for servers") 210 | 211 | unittest.main() 212 | 213 | -------------------------------------------------------------------------------- /tools/thrasher.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "errors" 5 | "fmt" 6 | "io" 7 | "io/ioutil" 8 | "math/rand" 9 | "net/http" 10 | "os" 11 | "strings" 12 | "time" 13 | ) 14 | 15 | // copied from lib.go 16 | func remote_delete(remote string) error { 17 | req, err := http.NewRequest("DELETE", remote, nil) 18 | if err != nil { 
19 | return err 20 | } 21 | resp, err := http.DefaultClient.Do(req) 22 | if err != nil { 23 | return err 24 | } 25 | defer resp.Body.Close() 26 | if resp.StatusCode != 204 { 27 | return fmt.Errorf("remote_delete: wrong status code %d", resp.StatusCode) 28 | } 29 | return nil 30 | } 31 | 32 | func remote_put(remote string, length int64, body io.Reader) error { 33 | req, err := http.NewRequest("PUT", remote, body) 34 | if err != nil { 35 | return err 36 | } 37 | req.ContentLength = length 38 | resp, err := http.DefaultClient.Do(req) 39 | if err != nil { 40 | return err 41 | } 42 | defer resp.Body.Close() 43 | if resp.StatusCode != 201 && resp.StatusCode != 204 { 44 | return fmt.Errorf("remote_put: wrong status code %d", resp.StatusCode) 45 | } 46 | return nil 47 | } 48 | 49 | func remote_get(remote string) (string, error) { 50 | resp, err := http.Get(remote) 51 | if err != nil { 52 | return "", err 53 | } 54 | defer resp.Body.Close() 55 | if resp.StatusCode != 200 { 56 | return "", errors.New(fmt.Sprintf("remote_get: wrong status code %d", resp.StatusCode)) 57 | } 58 | body, err := ioutil.ReadAll(resp.Body) 59 | if err != nil { 60 | return "", err 61 | } 62 | return string(body), nil 63 | } 64 | 65 | func main() { 66 | rand.Seed(time.Now().UTC().UnixNano()) 67 | 68 | reqs := make(chan string, 20000) 69 | resp := make(chan bool, 20000) 70 | fmt.Println("starting thrasher") 71 | 72 | http.DefaultTransport.(*http.Transport).MaxIdleConnsPerHost = 100 73 | 74 | // 16 concurrent processes 75 | for i := 0; i < 16; i++ { 76 | go func() { 77 | for { 78 | key := <-reqs 79 | value := fmt.Sprintf("value-%d", rand.Int()) 80 | if err := remote_put("http://localhost:3000/"+key, int64(len(value)), strings.NewReader(value)); err != nil { 81 | fmt.Println("PUT FAILED", err) 82 | resp <- false 83 | continue 84 | } 85 | 86 | ss, err := remote_get("http://localhost:3000/" + key) 87 | if err != nil || ss != value { 88 | fmt.Println("GET FAILED", err, ss, value) 89 | resp <- false 90 | 
#!/bin/bash -e
# volume -- start one nginx-backed volume server.
# Usage: [PORT=300x] ./volume [/path/to/volume/]
# nginx serves the volume directory with WebDAV (PUT/DELETE) plus JSON
# autoindex, which is the interface the master server and rebuild consume.
export VOLUME=${1:-/tmp/volume1/}
export TYPE=volume
export PORT=${PORT:-3001}

mkdir -p $VOLUME
chmod 777 $VOLUME

# the generated nginx config lives in a throwaway temp file
CONF=$(mktemp)
echo "
daemon off; # docker
#worker_rlimit_nofile 100000;
worker_processes auto;
pcre_jit on;

error_log /dev/stderr;
pid $VOLUME/nginx.pid;

events {
  #use epoll;
  multi_accept on;
  accept_mutex off;
  worker_connections 4096;
}

http {
  sendfile on;
  sendfile_max_chunk 1024k;

  tcp_nopush on;
  tcp_nodelay on;

  open_file_cache off;
  types_hash_max_size 2048;

  server_tokens off;

  default_type application/octet-stream;

  error_log /dev/stderr error; # docker

  server {
    listen $PORT default_server backlog=4096;
    location / {
      root $VOLUME;
      disable_symlinks off;

      client_body_temp_path $VOLUME/body_temp;
      client_max_body_size 0;

      # this causes tests to fail
      #client_body_buffer_size 0;

      dav_methods PUT DELETE;
      dav_access group:rw all:r;
      create_full_put_path on;

      autoindex on;
      autoindex_format json;
    }
  }
}
" > $CONF
echo "starting nginx on $PORT"
# -p sets the prefix so nginx's relative paths stay inside the volume dir
nginx -c $CONF -p $VOLUME/tmp