├── .gitignore ├── LICENSE.md ├── README.md ├── go.mod ├── main.go └── main_test.go /.gitignore: -------------------------------------------------------------------------------- 1 | otf 2 | *~ -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | Copyright 2024 Phil Eaton 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 8 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # otf: A little Delta Lake/Iceberg inspired database implementation in Go 2 | 3 | Only supports CREATE TABLE, INSERTs and SELECTs at the moment. Take a 4 | look at the tests for examples of usage and concurrency control. 5 | 6 | See the [blog post](https://notes.eatonphil.com/2024-09-29-build-a-serverless-acid-database-with-this-one-neat-trick.html) walking through this project. 7 | 8 | ``` 9 | $ go test 10 | ``` 11 | 12 | See also: 13 | 14 | * [The Delta Lake Paper](https://www.vldb.org/pvldb/vol13/p3411-armbrust.pdf) 15 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/eatonphil/otf 2 | 3 | go 1.22.3 4 | -------------------------------------------------------------------------------- /main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "encoding/json" 5 | "fmt" 6 | "io" 7 | "os" 8 | "path" 9 | "slices" 10 | "strings" 11 | ) 12 | 13 | func assert(b bool, msg string) { 14 | if !b { 15 | panic(msg) 16 | } 17 | } 18 | 19 | func assertEq[C comparable](a C, b C, prefix string) { 20 | if a != b { 21 | panic(fmt.Sprintf("%s '%v' != '%v'", prefix, a, b)) 22 | } 23 | } 24 | 25 | var DEBUG = slices.Contains(os.Args, "--debug") 26 | 27 | func debug(a ...any) { 28 | if !DEBUG { 29 | return 30 | } 31 | 32 | args := append([]any{"[DEBUG]"}, a...) 33 | fmt.Println(args...) 34 | } 35 | 36 | // https://datatracker.ietf.org/doc/html/rfc4122#section-4.4 37 | func uuidv4() string { 38 | f, err := os.Open("/dev/random") 39 | assert(err == nil, fmt.Sprintf("could not open /dev/random: %s", err)) 40 | defer f.Close() 41 | 42 | buf := make([]byte, 16) 43 | n, err := f.Read(buf) 44 | assert(err == nil, fmt.Sprintf("could not read 16 bytes from /dev/random: %s", err)) 45 | assert(n == len(buf), "expected 16 bytes from /dev/random") 46 | 47 | // Set bit 6 to 0 48 | buf[8] &= ^(byte(1) << 6) 49 | // Set bit 7 to 1 50 | buf[8] |= 1 << 7 51 | 52 | // Set version 53 | buf[6] &= ^(byte(1) << 4) 54 | buf[6] &= ^(byte(1) << 5) 55 | buf[6] |= 1 << 6 56 | buf[6] &= ^(byte(1) << 7) 57 | 58 | return fmt.Sprintf("%x-%x-%x-%x-%x", 59 | buf[:4], 60 | buf[4:6], 61 | buf[6:8], 62 | buf[8:10], 63 | buf[10:16]) 64 | } 65 | 66 | type objectStorage interface { 67 | // Must be atomic 68 | putIfAbsent(name string, bytes []byte) error 69 | listPrefix(prefix string) ([]string, error) 70 | read(name string) ([]byte, error) 71 | } 72 | 73 | type fileObjectStorage struct { 74 | basedir string 75 | } 76 | 77 | func newFileObjectStorage(basedir string) *fileObjectStorage { 78 | return &fileObjectStorage{basedir} 79 | } 80 | 81 | func (fos *fileObjectStorage) putIfAbsent(name string, bytes []byte) error { 82 | tmpfilename := path.Join(fos.basedir, uuidv4()) 83 | f, err := os.OpenFile(tmpfilename, os.O_WRONLY|os.O_CREATE, 0644) 84 | if err != nil { 85 | return err 86 | } 87 | 88 | written := 0 89 | bufSize := 1024 * 16 90 | for written < len(bytes) { 91 | toWrite := min(written+bufSize, len(bytes)) 92 | n, err := f.Write(bytes[written:toWrite]) 93 | if err != nil { 94 | removeErr := os.Remove(tmpfilename) 95 | assert(removeErr == nil, "could not remove") 96 | return err 97 | } 98 | 99 | written += n 100 | } 101 | 102 | err = f.Sync() 103 | if err != nil { 104 | removeErr := os.Remove(tmpfilename) 105 | assert(removeErr == nil, "could not remove") 106 | return err 107 | } 108 | 109 | err = f.Close() 110 | if err != nil { 111 | removeErr := os.Remove(tmpfilename) 112 | assert(removeErr == nil, "could not remove") 113 | return err 114 | } 115 | 116 | filename := path.Join(fos.basedir, name) 117 | err = os.Link(tmpfilename, filename) 118 | if err != nil { 119 | removeErr := os.Remove(tmpfilename) 120 | assert(removeErr == nil, "could not remove") 121 | return err 122 | } 123 | 124 | return nil 125 | } 126 | 127 | func (fos *fileObjectStorage) listPrefix(prefix string) ([]string, error) { 128 | dir := path.Join(fos.basedir) 129 | f, err := os.Open(dir) 130 | if err != nil { 131 | return nil, err 132 | } 133 | 134 | var files []string 135 | for err != io.EOF { 136 | var names []string 137 | names, err = f.Readdirnames(100) 138 | if err != nil && err != io.EOF { 139 | return nil, err 140 | } 141 | 142 | for _, n := range names { 143 | if prefix == "" || strings.HasPrefix(n, prefix) { 144 | files = append(files, n) 145 | } 146 | } 147 | } 148 | err = f.Close() 149 | return files, err 150 | } 151 | 152 | func (fos *fileObjectStorage) read(name string) ([]byte, error) { 153 | filename := path.Join(fos.basedir, name) 154 | return os.ReadFile(filename) 155 | } 156 | 157 | type DataobjectAction struct { 158 | Name string 159 | Table string 160 | } 161 | 162 | type ChangeMetadataAction struct { 163 | Table string 164 | Columns []string 165 | } 166 | 167 | // an enum, only one field will be non-nil 168 | type Action struct { 169 | AddDataobject *DataobjectAction 170 | ChangeMetadata *ChangeMetadataAction 171 | // TODO: Support object removal. 172 | // DeleteDataobject *DataobjectAction 173 | } 174 | 175 | const DATAOBJECT_SIZE int = 64 * 1024 176 | 177 | type transaction struct { 178 | Id int 179 | 180 | // Both are mapping table name to a list of actions on the table. 181 | previousActions map[string][]Action 182 | Actions map[string][]Action 183 | 184 | // Mapping tables to column names. 185 | tables map[string][]string 186 | 187 | // Mapping table name to unflushed/in-memory rows. When rows 188 | // are flushed, the dataobject that contains them is added to 189 | // `tx.actions` above and `tx.unflushedDataPointer[table]` is 190 | // reset to `0`. 191 | unflushedData map[string]*[DATAOBJECT_SIZE][]any 192 | unflushedDataPointer map[string]int 193 | } 194 | 195 | type client struct { 196 | os objectStorage 197 | // Current transaction, if any. Only one transaction per 198 | // client at a time. All reads and writes must be within a 199 | // transaction. 200 | tx *transaction 201 | } 202 | 203 | func newClient(os objectStorage) client { 204 | return client{os, nil} 205 | } 206 | 207 | var ( 208 | errExistingTx = fmt.Errorf("Existing Transaction") 209 | errNoTx = fmt.Errorf("No Transaction") 210 | errTableExists = fmt.Errorf("Table Exists") 211 | errNoTable = fmt.Errorf("No Such Table") 212 | ) 213 | 214 | func (d *client) newTx() error { 215 | if d.tx != nil { 216 | return errExistingTx 217 | } 218 | 219 | logPrefix := "_log_" 220 | txLogFilenames, err := d.os.listPrefix(logPrefix) 221 | if err != nil { 222 | return err 223 | } 224 | 225 | tx := &transaction{} 226 | tx.previousActions = map[string][]Action{} 227 | tx.Actions = map[string][]Action{} 228 | tx.tables = map[string][]string{} 229 | tx.unflushedData = map[string]*[DATAOBJECT_SIZE][]any{} 230 | tx.unflushedDataPointer = map[string]int{} 231 | 232 | for _, txLogFilename := range txLogFilenames { 233 | bytes, err := d.os.read(txLogFilename) 234 | if err != nil { 235 | return err 236 | } 237 | 238 | var oldTx transaction 239 | err = json.Unmarshal(bytes, &oldTx) 240 | if err != nil { 241 | return err 242 | } 243 | // Transaction metadata files are sorted 244 | // lexicographically so that the most recent 245 | // transaction (i.e. the one with the largest 246 | // transaction id) will be last and tx.Id will end up 247 | // 1 greater than the most recent transaction ID we 248 | // see on disk. 249 | tx.Id = oldTx.Id + 1 250 | 251 | for table, actions := range oldTx.Actions { 252 | for _, action := range actions { 253 | if action.AddDataobject != nil { 254 | tx.previousActions[table] = append(tx.previousActions[table], action) 255 | } else if action.ChangeMetadata != nil { 256 | // Store the latest version of 257 | // each table in memory for 258 | // easy lookup. 259 | mtd := action.ChangeMetadata 260 | tx.tables[table] = mtd.Columns 261 | } else { 262 | panic(fmt.Sprintf("unsupported action: %v", action)) 263 | } 264 | } 265 | } 266 | } 267 | 268 | d.tx = tx 269 | return nil 270 | } 271 | 272 | func (d *client) createTable(table string, columns []string) error { 273 | if d.tx == nil { 274 | return errNoTx 275 | } 276 | 277 | if _, exists := d.tx.tables[table]; exists { 278 | return errTableExists 279 | } 280 | 281 | // Store it in the in-memory mapping. 282 | d.tx.tables[table] = columns 283 | 284 | // And also add it to the action history for future transactions. 285 | d.tx.Actions[table] = append(d.tx.Actions[table], Action{ 286 | ChangeMetadata: &ChangeMetadataAction{ 287 | Table: table, 288 | Columns: columns, 289 | }, 290 | }) 291 | 292 | return nil 293 | } 294 | 295 | func (d *client) writeRow(table string, row []any) error { 296 | if d.tx == nil { 297 | return errNoTx 298 | } 299 | 300 | if _, ok := d.tx.tables[table]; !ok { 301 | return errNoTable 302 | } 303 | 304 | // Try to find an unflushed/in-memory dataobject for this table 305 | pointer, ok := d.tx.unflushedDataPointer[table] 306 | if !ok { 307 | d.tx.unflushedDataPointer[table] = 0 308 | d.tx.unflushedData[table] = &[DATAOBJECT_SIZE][]any{} 309 | } 310 | 311 | if pointer == DATAOBJECT_SIZE { 312 | d.flushRows(table) 313 | pointer = 0 314 | } 315 | 316 | d.tx.unflushedData[table][pointer] = row 317 | d.tx.unflushedDataPointer[table]++ 318 | return nil 319 | } 320 | 321 | type dataobject struct { 322 | Table string 323 | Name string 324 | Data [DATAOBJECT_SIZE][]any 325 | Len int 326 | } 327 | 328 | func (d *client) flushRows(table string) error { 329 | if d.tx == nil { 330 | return errNoTx 331 | } 332 | 333 | // First write out dataobject if there is anything to write out. 334 | pointer, exists := d.tx.unflushedDataPointer[table] 335 | if !exists || pointer == 0 { 336 | return nil 337 | } 338 | 339 | df := dataobject{ 340 | Table: table, 341 | Name: uuidv4(), 342 | Data: *d.tx.unflushedData[table], 343 | Len: pointer, 344 | } 345 | bytes, err := json.Marshal(df) 346 | if err != nil { 347 | return err 348 | } 349 | 350 | err = d.os.putIfAbsent(fmt.Sprintf("_table_%s_%s", table, df.Name), bytes) 351 | if err != nil { 352 | return err 353 | } 354 | 355 | // Record the newly written data file. 356 | d.tx.Actions[table] = append(d.tx.Actions[table], Action{ 357 | AddDataobject: &DataobjectAction{ 358 | Table: table, 359 | Name: df.Name, 360 | }, 361 | }) 362 | 363 | // Reset in-memory pointer. 364 | d.tx.unflushedDataPointer[table] = 0 365 | return nil 366 | } 367 | 368 | func (d *client) scan(table string) (*scanIterator, error) { 369 | if d.tx == nil { 370 | return nil, errNoTx 371 | } 372 | 373 | var dataobjects []string 374 | allActions := append(d.tx.previousActions[table], d.tx.Actions[table]...) 375 | for _, action := range allActions { 376 | if action.AddDataobject != nil { 377 | dataobjects = append(dataobjects, action.AddDataobject.Name) 378 | } 379 | } 380 | 381 | var unflushedRows [DATAOBJECT_SIZE][]any 382 | if data, ok := d.tx.unflushedData[table]; ok { 383 | unflushedRows = *data 384 | } 385 | 386 | return &scanIterator{ 387 | unflushedRows: unflushedRows, 388 | unflushedRowsLen: d.tx.unflushedDataPointer[table], 389 | d: d, 390 | table: table, 391 | dataobjects: dataobjects, 392 | }, nil 393 | } 394 | 395 | type scanIterator struct { 396 | d *client 397 | table string 398 | 399 | // First we iterate through unflushed rows. 400 | unflushedRows [DATAOBJECT_SIZE][]any 401 | unflushedRowsLen int 402 | unflushedRowPointer int 403 | 404 | // Then we move through each dataobject. 405 | dataobjects []string 406 | dataobjectsPointer int 407 | 408 | // And within each dataobject we iterate through rows. 409 | dataobject *dataobject 410 | dataobjectRowPointer int 411 | } 412 | 413 | func (d *client) readDataobject(table, name string) (*dataobject, error) { 414 | bytes, err := d.os.read(fmt.Sprintf("_table_%s_%s", table, name)) 415 | if err != nil { 416 | return nil, err 417 | } 418 | 419 | var do dataobject 420 | err = json.Unmarshal(bytes, &do) 421 | return &do, err 422 | } 423 | 424 | // returns (nil, nil) when done 425 | func (si *scanIterator) next() ([]any, error) { 426 | // Iterate through in-memory rows first. 427 | if si.unflushedRowPointer < si.unflushedRowsLen { 428 | row := si.unflushedRows[si.unflushedRowPointer] 429 | si.unflushedRowPointer++ 430 | return row, nil 431 | } 432 | 433 | // If we've gotten through all dataobjects on disk we're done. 434 | if si.dataobjectsPointer == len(si.dataobjects) { 435 | return nil, nil 436 | } 437 | 438 | if si.dataobject == nil { 439 | name := si.dataobjects[si.dataobjectsPointer] 440 | o, err := si.d.readDataobject(si.table, name) 441 | if err != nil { 442 | return nil, err 443 | } 444 | 445 | si.dataobject = o 446 | } 447 | 448 | if si.dataobjectRowPointer > si.dataobject.Len { 449 | si.dataobjectsPointer++ 450 | si.dataobject = nil 451 | si.dataobjectRowPointer = 0 452 | return si.next() 453 | } 454 | 455 | row := si.dataobject.Data[si.dataobjectRowPointer] 456 | si.dataobjectRowPointer++ 457 | return row, nil 458 | } 459 | 460 | func (d *client) commitTx() error { 461 | if d.tx == nil { 462 | return errNoTx 463 | } 464 | 465 | // Flush any outstanding data 466 | for table := range d.tx.tables { 467 | err := d.flushRows(table) 468 | if err != nil { 469 | d.tx = nil 470 | return err 471 | } 472 | } 473 | 474 | wrote := false 475 | for _, actions := range d.tx.Actions { 476 | if len(actions) > 0 { 477 | wrote = true 478 | break 479 | } 480 | } 481 | // Read-only transaction, no need to do a concurrency check. 482 | if !wrote { 483 | d.tx = nil 484 | return nil 485 | } 486 | 487 | filename := fmt.Sprintf("_log_%020d", d.tx.Id) 488 | // We won't store previous actions, they will be recovered on 489 | // new transactions. So unset them. Honestly not totally 490 | // clear why. 491 | d.tx.previousActions = nil 492 | bytes, err := json.Marshal(d.tx) 493 | if err != nil { 494 | d.tx = nil 495 | return err 496 | } 497 | 498 | err = d.os.putIfAbsent(filename, bytes) 499 | d.tx = nil 500 | return err 501 | } 502 | 503 | func main() { 504 | panic("unimplemented") 505 | } 506 | -------------------------------------------------------------------------------- /main_test.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "os" 5 | "testing" 6 | ) 7 | 8 | func TestConcurrentTableWriters(t *testing.T) { 9 | dir, err := os.MkdirTemp("", "test-database") 10 | 11 | if err != nil { 12 | panic(err) 13 | } 14 | 15 | defer os.Remove(dir) 16 | 17 | fos := newFileObjectStorage(dir) 18 | c1Writer := newClient(fos) 19 | c2Writer := newClient(fos) 20 | 21 | // Have c2Writer start up a transaction. 22 | err = c2Writer.newTx() 23 | assertEq(err, nil, "could not start first c2 tx") 24 | debug("[c2] new tx") 25 | 26 | // But then have c1Writer start a transaction and commit it first. 27 | err = c1Writer.newTx() 28 | assertEq(err, nil, "could not start first c1 tx") 29 | debug("[c1] new tx") 30 | err = c1Writer.createTable("x", []string{"a", "b"}) 31 | assertEq(err, nil, "could not create x") 32 | debug("[c1] Created table") 33 | err = c1Writer.writeRow("x", []any{"Joey", 1}) 34 | assertEq(err, nil, "could not write first row") 35 | debug("[c1] Wrote row") 36 | err = c1Writer.writeRow("x", []any{"Yue", 2}) 37 | assertEq(err, nil, "could not write second row") 38 | debug("[c1] Wrote row") 39 | err = c1Writer.commitTx() 40 | assertEq(err, nil, "could not commit tx") 41 | debug("[c1] Committed tx") 42 | 43 | // Now go back to c2 and write data. 44 | err = c2Writer.createTable("x", []string{"a", "b"}) 45 | assertEq(err, nil, "could not create x") 46 | debug("[c2] Created table") 47 | err = c2Writer.writeRow("x", []any{"Holly", 1}) 48 | assertEq(err, nil, "could not write first row") 49 | debug("[c2] Wrote row") 50 | 51 | err = c2Writer.commitTx() 52 | assert(err != nil, "concurrent commit must fail") 53 | debug("[c2] tx not committed") 54 | } 55 | 56 | func TestConcurrentReaderWithWriterReadsSnapshot(t *testing.T) { 57 | dir, err := os.MkdirTemp("", "test-database") 58 | 59 | if err != nil { 60 | panic(err) 61 | } 62 | 63 | defer os.Remove(dir) 64 | 65 | fos := newFileObjectStorage(dir) 66 | c1Writer := newClient(fos) 67 | c2Reader := newClient(fos) 68 | 69 | // First create some data and commit the transaction. 70 | err = c1Writer.newTx() 71 | assertEq(err, nil, "could not start first c1 tx") 72 | debug("[c1Writer] Started tx") 73 | err = c1Writer.createTable("x", []string{"a", "b"}) 74 | assertEq(err, nil, "could not create x") 75 | debug("[c1Writer] Created table") 76 | err = c1Writer.writeRow("x", []any{"Joey", 1}) 77 | assertEq(err, nil, "could not write first row") 78 | debug("[c1Writer] Wrote row") 79 | err = c1Writer.writeRow("x", []any{"Yue", 2}) 80 | assertEq(err, nil, "could not write second row") 81 | debug("[c1Writer] Wrote row") 82 | err = c1Writer.commitTx() 83 | assertEq(err, nil, "could not commit tx") 84 | debug("[c1Writer] Committed tx") 85 | 86 | // Now start a new transaction for more edits. 87 | err = c1Writer.newTx() 88 | assertEq(err, nil, "could not start second c1 tx") 89 | debug("[c1Writer] Starting new write tx") 90 | 91 | // Before we commit this second write-transaction, start a 92 | // read transaction. 93 | err = c2Reader.newTx() 94 | assertEq(err, nil, "could not start c2 tx") 95 | debug("[c2Reader] Started tx") 96 | 97 | // Write and commit rows in c1. 98 | err = c1Writer.writeRow("x", []any{"Ada", 3}) 99 | assertEq(err, nil, "could not write third row") 100 | debug("[c1Writer] Wrote third row") 101 | 102 | // Scan x in read-only transaction 103 | it, err := c2Reader.scan("x") 104 | assertEq(err, nil, "could not scan x") 105 | debug("[c2Reader] Started scanning") 106 | seen := 0 107 | for { 108 | row, err := it.next() 109 | assertEq(err, nil, "could not iterate x scan") 110 | 111 | if row == nil { 112 | debug("[c2Reader] Done scanning") 113 | break 114 | } 115 | 116 | debug("[c2Reader] Got row in reader tx", row) 117 | if seen == 0 { 118 | assertEq(row[0], "Joey", "row mismatch in c1") 119 | assertEq(row[1], 1.0, "row mismatch in c1") 120 | } else { 121 | assertEq(row[0], "Yue", "row mismatch in c1") 122 | assertEq(row[1], 2.0, "row mismatch in c1") 123 | } 124 | 125 | seen++ 126 | } 127 | assertEq(seen, 2, "expected two rows") 128 | 129 | // Scan x in c1 write transaction 130 | it, err = c1Writer.scan("x") 131 | assertEq(err, nil, "could not scan x in c1") 132 | debug("[c1Writer] Started scanning") 133 | seen = 0 134 | for { 135 | row, err := it.next() 136 | assertEq(err, nil, "could not iterate x scan in c1") 137 | 138 | if row == nil { 139 | debug("[c1Writer] Done scanning") 140 | break 141 | } 142 | 143 | debug("[c1Writer] Got row in tx", row) 144 | 145 | if seen == 0 { 146 | assertEq(row[0], "Ada", "row mismatch in c1") 147 | // Since this hasn't been serialized to JSON, it's still an int not a float. 148 | assertEq(row[1], 3, "row mismatch in c1") 149 | } else if seen == 1 { 150 | assertEq(row[0], "Joey", "row mismatch in c1") 151 | assertEq(row[1], 1.0, "row mismatch in c1") 152 | } else { 153 | assertEq(row[0], "Yue", "row mismatch in c1") 154 | assertEq(row[1], 2.0, "row mismatch in c1") 155 | } 156 | 157 | seen++ 158 | } 159 | assertEq(seen, 3, "expected three rows") 160 | 161 | // Writer committing should succeed. 162 | err = c1Writer.commitTx() 163 | assertEq(err, nil, "could not commit second tx") 164 | debug("[c1Writer] Committed tx") 165 | 166 | // Reader committing should succeed. 167 | err = c2Reader.commitTx() 168 | assertEq(err, nil, "could not commit read-only tx") 169 | debug("[c2Reader] Committed tx") 170 | } 171 | --------------------------------------------------------------------------------